From 6cb05b09dfe764242516ffc0f747d732c65655df Mon Sep 17 00:00:00 2001 From: "github-classroom[bot]" <66690702+github-classroom[bot]@users.noreply.github.com> Date: Fri, 29 Dec 2023 10:01:41 +0000 Subject: [PATCH 01/65] Setting up GitHub Classroom Feedback From 703fd7f3e7b410d54daae7311e2ee643ff625869 Mon Sep 17 00:00:00 2001 From: soyoonjeong <76814748+soyoonjeong@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:45:03 +0900 Subject: [PATCH 02/65] Feat : Add gitignore and templates - Add gitignore file - Add Github Issue Template - Add Github Pull Request Template #1 --- .github/ISSUE_TEMPLATE/issue_template.md | 21 ++ .../pull_request_template.md | 15 + .gitignore | 344 ++++++++++++++++++ 3 files changed, 380 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/issue_template.md create mode 100644 .github/PULL_REQUEST_TEMPLATE/pull_request_template.md create mode 100644 .gitignore diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md new file mode 100644 index 0000000..602c4c4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -0,0 +1,21 @@ +⭐ Description +--- +- + +📷 Screenshots +--- +- + +📁 Files +--- +- + +📈 To Reproduce +--- +- + +✔️ Tasks +--- +- [ ] Task 1 +- [ ] Task 2 +- [ ] Task 3 \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md new file mode 100644 index 0000000..85b325c --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md @@ -0,0 +1,15 @@ +📌 개요 +--- +- + +💻 작업사항 +--- +- + +✅ 변경로직 +--- +- + +💡Issue 번호 +--- +- \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fc17431 --- /dev/null +++ b/.gitignore @@ -0,0 +1,344 @@ +# exclude input data +dataset/ + +# exclude baseline +baseline/ + +# exclude custom config files (only include base.yml) +!config/ +config/* +!config/base.yml +!config/final.yml + +# exclude custom dataset files (only include dataset.py) +# !data/ +# data/* +# !data/datasets.py +data/__pycache__ +# exclude custom model files (only include model.py) +!model/ +model/* +!model/model.py + +# exclude uitls cache +utils/__pycache__ + +# exclude EDA folder (temporarily) +EDA/ +eda.py + +# exclude results +results/ +output/ +.vscode/ + +# exclude etc +*.txt +*.ipynb +*.tar +*.gz + +# exclude wandb +wandb/ + +image_log.txt +*.sh + +# exclude test code +tests/ + +# +.vscode/* +.vscode/ + +# exlude xml format +*.xml + +requirements.txt + +# exlude csv format +*.csv + +# exclude sh format +*.sh + +# Created by https://www.toptal.com/developers/gitignore/api/macos,linux,windows,visualstudiocode,python,jupyternotebooks +# Edit at https://www.toptal.com/developers/gitignore?templates=macos,linux,windows,visualstudiocode,python,jupyternotebooks + +### JupyterNotebooks ### +# gitignore template for Jupyter Notebooks +# website: http://jupyter.org/ + +.ipynb_checkpoints +*/.ipynb_checkpoints/* + +# IPython +profile_default/ +ipython_config.py + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r 
+Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook + +# IPython + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. 
For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# End of https://www.toptal.com/developers/gitignore/api/macos,linux,windows,visualstudiocode,python,jupyternotebooks From b1c7d94e85999391d148d26dc02f37444a98b3a1 Mon Sep 17 00:00:00 2001 From: soyoonjeong <76814748+soyoonjeong@users.noreply.github.com> Date: Thu, 4 Jan 2024 14:59:39 +0900 Subject: [PATCH 03/65] Rename : Rename github templates #2 --- .github/{ISSUE_TEMPLATE/issue_template.md => ISSUE_TEMPLATE.md} | 0 .../pull_request_template.md => PULL_REQUEST_TEMPLATE.md} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename .github/{ISSUE_TEMPLATE/issue_template.md => ISSUE_TEMPLATE.md} (100%) rename .github/{PULL_REQUEST_TEMPLATE/pull_request_template.md => PULL_REQUEST_TEMPLATE.md} (100%) diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE.md similarity index 100% rename from .github/ISSUE_TEMPLATE/issue_template.md rename to .github/ISSUE_TEMPLATE.md diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE.md similarity index 100% rename from .github/PULL_REQUEST_TEMPLATE/pull_request_template.md rename to .github/PULL_REQUEST_TEMPLATE.md From df2c46099fddde671f46d0fbd8f2b71250715211 Mon Sep 17 00:00:00 2001 From: Jouhy Date: Fri, 5 Jan 2024 11:58:07 +0900 Subject: [PATCH 04/65] modify gitignore --- .gitignore | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.gitignore b/.gitignore index fc17431..c95397e 100644 --- a/.gitignore +++ b/.gitignore @@ -342,3 +342,17 @@ $RECYCLE.BIN/ *.lnk # End of https://www.toptal.com/developers/gitignore/api/macos,linux,windows,visualstudiocode,python,jupyternotebooks + +# mmdetection +./mmdetection/work_dirs/ +./mmdetection/data/ +./mmdetection/*.pkl +./mmdetection/*.pkl.json +./mmdetection/*.log.json +./mmdetection/docs/modelzoo_statistics.md +./mmdetection/mmdet/.mim +./mmdetection/work_dirs/ +./mmdetection/.vscode +*.jpg +*.jpeg +*.png \ No newline at end of file From aa27db72914d87a7f02d645bef33b8502540f9bc Mon Sep 17 00:00:00 2001 From: Jouhy Date: Fri, 5 Jan 2024 12:00:00 +0900 Subject: [PATCH 05/65] modify gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c95397e..8f40a54 100644 --- a/.gitignore +++ b/.gitignore @@ -355,4 +355,6 @@ $RECYCLE.BIN/ ./mmdetection/.vscode *.jpg *.jpeg -*.png \ No newline at end of file +*.png +*.pth +*.log \ No newline at end of file From b40768a23ea460605949c040ebff5f803506bf9a Mon Sep 17 00:00:00 2001 From: Jouhy Date: Fri, 5 Jan 2024 12:03:54 +0900 
Subject: [PATCH 06/65] modify gitignore --- .gitignore | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 8f40a54..92525df 100644 --- a/.gitignore +++ b/.gitignore @@ -344,14 +344,16 @@ $RECYCLE.BIN/ # End of https://www.toptal.com/developers/gitignore/api/macos,linux,windows,visualstudiocode,python,jupyternotebooks # mmdetection -./mmdetection/work_dirs/ -./mmdetection/data/ +./mmdetection/work_dirs/* +./mmdetection/work_dirs +./mmdetection/data/* +./mmdetection/data ./mmdetection/*.pkl ./mmdetection/*.pkl.json ./mmdetection/*.log.json ./mmdetection/docs/modelzoo_statistics.md ./mmdetection/mmdet/.mim -./mmdetection/work_dirs/ +./mmdetection/.vscode/* ./mmdetection/.vscode *.jpg *.jpeg From 0c467c5cc080f2cef6bcbbcd2eb7228737c6e98b Mon Sep 17 00:00:00 2001 From: Jouhy Date: Fri, 5 Jan 2024 12:07:16 +0900 Subject: [PATCH 07/65] modify gitignore --- .gitignore | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 92525df..82db057 100644 --- a/.gitignore +++ b/.gitignore @@ -344,19 +344,20 @@ $RECYCLE.BIN/ # End of https://www.toptal.com/developers/gitignore/api/macos,linux,windows,visualstudiocode,python,jupyternotebooks # mmdetection -./mmdetection/work_dirs/* -./mmdetection/work_dirs -./mmdetection/data/* -./mmdetection/data -./mmdetection/*.pkl -./mmdetection/*.pkl.json -./mmdetection/*.log.json -./mmdetection/docs/modelzoo_statistics.md -./mmdetection/mmdet/.mim -./mmdetection/.vscode/* -./mmdetection/.vscode +mmdetection/work_dirs/* +mmdetection/work_dirs +mmdetection/data/* +mmdetection/data +mmdetection/*.pkl +mmdetection/*.pkl.json +mmdetection/*.log.json +mmdetection/docs/modelzoo_statistics.md +mmdetection/mmdet/.mim +mmdetection/.vscode/* +mmdetection/.vscode *.jpg *.jpeg *.png *.pth -*.log \ No newline at end of file +*.log +*.mp4 \ No newline at end of file From d2884fd1fa9dc6520682a4bb8456d4750495ce9c Mon Sep 17 00:00:00 2001 From: Jouhy Date: Fri, 5 Jan 2024 12:09:09 +0900 Subject: [PATCH 08/65] add mmdetection --- mmdetection/.circleci/config.yml | 34 + mmdetection/.circleci/docker/Dockerfile | 11 + mmdetection/.circleci/test.yml | 210 + mmdetection/.dev_scripts/batch_test_list.py | 545 +++ mmdetection/.dev_scripts/benchmark_filter.py | 167 + .../.dev_scripts/benchmark_inference_fps.py | 171 + mmdetection/.dev_scripts/benchmark_options.py | 16 + mmdetection/.dev_scripts/benchmark_test.py | 115 + .../.dev_scripts/benchmark_test_image.py | 134 + mmdetection/.dev_scripts/benchmark_train.py | 178 + .../.dev_scripts/benchmark_valid_flops.py | 295 ++ mmdetection/.dev_scripts/check_links.py | 157 + .../convert_test_benchmark_script.py | 114 + .../convert_train_benchmark_script.py | 104 + mmdetection/.dev_scripts/covignore.cfg | 5 + .../.dev_scripts/download_checkpoints.py | 83 + mmdetection/.dev_scripts/gather_models.py | 308 ++ .../gather_test_benchmark_metric.py | 96 + .../gather_train_benchmark_metric.py | 151 + .../.dev_scripts/test_init_backbone.py | 178 + mmdetection/.owners.yml | 14 + mmdetection/.pre-commit-config-zh-cn.yaml | 61 + mmdetection/.pre-commit-config.yaml | 50 + mmdetection/.readthedocs.yml | 14 + mmdetection/CITATION.cff | 8 + mmdetection/LICENSE | 203 + mmdetection/MANIFEST.in | 7 + mmdetection/README.md | 487 +++ mmdetection/README_zh-CN.md | 507 +++ .../_base_/datasets/ade20k_instance.py | 53 + .../_base_/datasets/ade20k_panoptic.py | 38 + .../_base_/datasets/ade20k_semantic.py | 48 + .../_base_/datasets/cityscapes_detection.py | 84 + 
.../_base_/datasets/cityscapes_instance.py | 113 + .../configs/_base_/datasets/coco_caption.py | 60 + .../configs/_base_/datasets/coco_detection.py | 95 + .../configs/_base_/datasets/coco_instance.py | 95 + .../_base_/datasets/coco_instance_semantic.py | 78 + .../configs/_base_/datasets/coco_panoptic.py | 94 + .../configs/_base_/datasets/coco_semantic.py | 78 + .../configs/_base_/datasets/deepfashion.py | 95 + mmdetection/configs/_base_/datasets/dsdl.py | 62 + .../configs/_base_/datasets/isaid_instance.py | 59 + .../_base_/datasets/lvis_v0.5_instance.py | 79 + .../_base_/datasets/lvis_v1_instance.py | 22 + .../configs/_base_/datasets/mot_challenge.py | 90 + .../_base_/datasets/mot_challenge_det.py | 66 + .../_base_/datasets/mot_challenge_reid.py | 61 + .../_base_/datasets/objects365v1_detection.py | 74 + .../_base_/datasets/objects365v2_detection.py | 73 + .../_base_/datasets/openimages_detection.py | 81 + .../configs/_base_/datasets/refcoco+.py | 55 + .../configs/_base_/datasets/refcoco.py | 55 + .../configs/_base_/datasets/refcocog.py | 55 + .../_base_/datasets/semi_coco_detection.py | 178 + mmdetection/configs/_base_/datasets/v3det.py | 69 + .../configs/_base_/datasets/voc0712.py | 92 + .../configs/_base_/datasets/wider_face.py | 73 + .../configs/_base_/datasets/youtube_vis.py | 66 + mmdetection/configs/_base_/default_runtime.py | 24 + .../models/cascade-mask-rcnn_r50_fpn.py | 203 + .../_base_/models/cascade-rcnn_r50_fpn.py | 185 + .../_base_/models/fast-rcnn_r50_fpn.py | 68 + .../_base_/models/faster-rcnn_r50-caffe-c4.py | 123 + .../models/faster-rcnn_r50-caffe-dc5.py | 111 + .../_base_/models/faster-rcnn_r50_fpn.py | 114 + .../_base_/models/mask-rcnn_r50-caffe-c4.py | 132 + .../_base_/models/mask-rcnn_r50_fpn.py | 127 + .../_base_/models/retinanet_r50_fpn.py | 68 + .../configs/_base_/models/rpn_r50-caffe-c4.py | 64 + .../configs/_base_/models/rpn_r50_fpn.py | 64 + mmdetection/configs/_base_/models/ssd300.py | 63 + .../configs/_base_/schedules/schedule_1x.py | 28 + .../configs/_base_/schedules/schedule_20e.py | 28 + .../configs/_base_/schedules/schedule_2x.py | 28 + mmdetection/configs/albu_example/README.md | 31 + .../mask-rcnn_r50_fpn_albu-1x_coco.py | 66 + mmdetection/configs/albu_example/metafile.yml | 17 + mmdetection/configs/atss/README.md | 31 + .../configs/atss/atss_r101_fpn_1x_coco.py | 6 + .../atss_r101_fpn_8xb8-amp-lsj-200e_coco.py | 7 + .../atss_r18_fpn_8xb8-amp-lsj-200e_coco.py | 7 + .../configs/atss/atss_r50_fpn_1x_coco.py | 71 + .../atss_r50_fpn_8xb8-amp-lsj-200e_coco.py | 81 + mmdetection/configs/atss/metafile.yml | 60 + mmdetection/configs/autoassign/README.md | 35 + .../autoassign_r50-caffe_fpn_1x_coco.py | 69 + mmdetection/configs/autoassign/metafile.yml | 33 + mmdetection/configs/boxinst/README.md | 32 + .../boxinst/boxinst_r101_fpn_ms-90k_coco.py | 8 + .../boxinst/boxinst_r50_fpn_ms-90k_coco.py | 93 + mmdetection/configs/boxinst/metafile.yml | 52 + mmdetection/configs/bytetrack/README.md | 132 + ...dhuman-mot17halftrain_test-mot17halfval.py | 249 ++ ...0e_crowdhuman-mot20train_test-mot20test.py | 127 + ...dhuman-mot17halftrain_test-mot17halfval.py | 9 + ...rowdhuman-mot17halftrain_test-mot17test.py | 17 + ...0e_crowdhuman-mot20train_test-mot20test.py | 8 + mmdetection/configs/bytetrack/metafile.yml | 53 + ...dhuman-mot17halftrain_test-mot17halfval.py | 6 + mmdetection/configs/carafe/README.md | 42 + .../faster-rcnn_r50_fpn-carafe_1x_coco.py | 20 + .../mask-rcnn_r50_fpn-carafe_1x_coco.py | 30 + mmdetection/configs/carafe/metafile.yml | 55 + 
mmdetection/configs/cascade_rcnn/README.md | 79 + ...ascade-mask-rcnn_r101-caffe_fpn_1x_coco.py | 7 + ...ade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py | 7 + .../cascade-mask-rcnn_r101_fpn_1x_coco.py | 6 + .../cascade-mask-rcnn_r101_fpn_20e_coco.py | 6 + .../cascade-mask-rcnn_r101_fpn_ms-3x_coco.py | 6 + ...cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py | 14 + ...cade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py | 18 + .../cascade-mask-rcnn_r50_fpn_1x_coco.py | 5 + .../cascade-mask-rcnn_r50_fpn_20e_coco.py | 5 + .../cascade-mask-rcnn_r50_fpn_ms-3x_coco.py | 4 + ...ascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py | 14 + ...scade-mask-rcnn_x101-32x4d_fpn_20e_coco.py | 14 + ...ade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py | 14 + ...ade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py | 24 + ...ascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py | 14 + ...scade-mask-rcnn_x101-64x4d_fpn_20e_coco.py | 14 + ...ade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py | 14 + .../cascade-rcnn_r101-caffe_fpn_1x_coco.py | 7 + .../cascade-rcnn_r101_fpn_1x_coco.py | 6 + .../cascade-rcnn_r101_fpn_20e_coco.py | 6 + ...de-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py | 7 + ...ade-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py | 7 + .../cascade-rcnn_r50-caffe_fpn_1x_coco.py | 16 + .../cascade-rcnn_r50_fpn_1x_coco.py | 5 + .../cascade-rcnn_r50_fpn_20e_coco.py | 5 + ...ade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py | 23 + .../cascade-rcnn_x101-32x4d_fpn_1x_coco.py | 14 + .../cascade-rcnn_x101-32x4d_fpn_20e_coco.py | 14 + .../cascade-rcnn_x101-64x4d_fpn_1x_coco.py | 15 + .../cascade-rcnn_x101_64x4d_fpn_20e_coco.py | 15 + mmdetection/configs/cascade_rcnn/metafile.yml | 545 +++ mmdetection/configs/cascade_rpn/README.md | 41 + ...ade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py | 27 + ...e-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py | 89 + .../cascade-rpn_r50-caffe_fpn_1x_coco.py | 76 + mmdetection/configs/cascade_rpn/metafile.yml | 44 + mmdetection/configs/centernet/README.md | 58 + ...-update_r101_fpn_8xb8-amp-lsj-200e_coco.py | 7 + ...t-update_r18_fpn_8xb8-amp-lsj-200e_coco.py | 7 + ...nternet-update_r50-caffe_fpn_ms-1x_coco.py | 105 + ...t-update_r50_fpn_8xb8-amp-lsj-200e_coco.py | 83 + ...ernet_r18-dcnv2_8xb16-crop512-140e_coco.py | 136 + .../centernet_r18_8xb16-crop512-140e_coco.py | 3 + .../configs/centernet/centernet_tta.py | 39 + mmdetection/configs/centernet/metafile.yml | 60 + mmdetection/configs/centripetalnet/README.md | 36 + ...glass104_16xb6-crop511-210e-mstest_coco.py | 181 + .../configs/centripetalnet/metafile.yml | 39 + mmdetection/configs/cityscapes/README.md | 46 + .../faster-rcnn_r50_fpn_1x_cityscapes.py | 41 + .../mask-rcnn_r50_fpn_1x_cityscapes.py | 43 + .../configs/common/lsj-100e_coco-detection.py | 122 + .../configs/common/lsj-100e_coco-instance.py | 122 + .../configs/common/lsj-200e_coco-detection.py | 18 + .../configs/common/lsj-200e_coco-instance.py | 18 + mmdetection/configs/common/ms-90k_coco.py | 122 + .../common/ms-poly-90k_coco-instance.py | 130 + .../common/ms-poly_3x_coco-instance.py | 118 + .../configs/common/ms_3x_coco-instance.py | 108 + mmdetection/configs/common/ms_3x_coco.py | 108 + .../configs/common/ssj_270k_coco-instance.py | 125 + .../common/ssj_scp_270k_coco-instance.py | 60 + mmdetection/configs/condinst/README.md | 40 + ...dinst_r50_fpn_ms-poly-90k_coco_instance.py | 85 + mmdetection/configs/condinst/metafile.yml | 32 + .../configs/conditional_detr/README.md | 39 + .../conditional-detr_r50_8xb2-50e_coco.py | 42 + .../configs/conditional_detr/metafile.yml | 32 + mmdetection/configs/convnext/README.md | 42 + 
...7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py | 26 + ...7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py | 154 + ...onvnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py | 96 + mmdetection/configs/convnext/metafile.yml | 93 + mmdetection/configs/cornernet/README.md | 43 + ...glass104_10xb5-crop511-210e-mstest_coco.py | 8 + ...net_hourglass104_32xb3-210e-mstest_coco.py | 8 + ...rnet_hourglass104_8xb6-210e-mstest_coco.py | 183 + mmdetection/configs/cornernet/metafile.yml | 83 + mmdetection/configs/crowddet/README.md | 37 + ...owddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py | 227 + ...rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py | 3 + mmdetection/configs/crowddet/metafile.yml | 47 + mmdetection/configs/dab_detr/README.md | 40 + .../dab_detr/dab-detr_r50_8xb2-50e_coco.py | 159 + mmdetection/configs/dab_detr/metafile.yml | 32 + mmdetection/configs/dcn/README.md | 48 + ...-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py | 5 + ...e-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py | 5 + ...rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py | 5 + ...scade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py | 5 + ...ascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py | 5 + ...aster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py | 5 + ...faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py | 5 + .../dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py | 12 + ...rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py | 16 + .../mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py | 5 + .../mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py | 5 + ...sk-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py | 10 + mmdetection/configs/dcn/metafile.yml | 272 ++ mmdetection/configs/dcnv2/README.md | 37 + ...aster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py | 5 + ...cnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py | 5 + .../faster-rcnn_r50_fpn_mdpool_1x_coco.py | 12 + .../mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py | 5 + ...k-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py | 10 + mmdetection/configs/dcnv2/metafile.yml | 123 + mmdetection/configs/ddod/README.md | 31 + .../configs/ddod/ddod_r50_fpn_1x_coco.py | 72 + mmdetection/configs/ddod/metafile.yml | 33 + mmdetection/configs/ddq/README.md | 39 + .../ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py | 170 + .../ddq-detr-4scale_swinl_8xb2-30e_coco.py | 177 + .../ddq/ddq-detr-5scale_r50_8xb2-12e_coco.py | 171 + mmdetection/configs/ddq/metafile.yml | 56 + mmdetection/configs/deepfashion/README.md | 70 + .../mask-rcnn_r50_fpn_15e_deepfashion.py | 23 + mmdetection/configs/deepsort/README.md | 109 + ...xb2-4e_mot17halftrain_test-mot17halfval.py | 85 + ...0_fpn_8xb2-4e_mot17train_test-mot17test.py | 15 + mmdetection/configs/deepsort/metafile.yml | 37 + mmdetection/configs/deformable_detr/README.md | 41 + ...detr-refine-twostage_r50_16xb2-50e_coco.py | 2 + ...formable-detr-refine_r50_16xb2-50e_coco.py | 2 + .../deformable-detr_r50_16xb2-50e_coco.py | 156 + .../configs/deformable_detr/metafile.yml | 56 + mmdetection/configs/detectors/README.md | 69 + .../detectors/cascade-rcnn_r50-rfp_1x_coco.py | 28 + .../detectors/cascade-rcnn_r50-sac_1x_coco.py | 12 + .../detectors_cascade-rcnn_r50_1x_coco.py | 32 + .../detectors/detectors_htc-r101_20e_coco.py | 28 + .../detectors/detectors_htc-r50_1x_coco.py | 28 + .../configs/detectors/htc_r50-rfp_1x_coco.py | 24 + .../configs/detectors/htc_r50-sac_1x_coco.py | 8 + mmdetection/configs/detectors/metafile.yml | 114 + mmdetection/configs/detr/README.md | 37 + .../configs/detr/detr_r101_8xb2-500e_coco.py | 7 + .../configs/detr/detr_r18_8xb2-500e_coco.py | 7 + .../configs/detr/detr_r50_8xb2-150e_coco.py | 155 + .../configs/detr/detr_r50_8xb2-500e_coco.py | 24 + mmdetection/configs/detr/metafile.yml | 33 + 
mmdetection/configs/dino/README.md | 40 + .../dino/dino-4scale_r50_8xb2-12e_coco.py | 163 + .../dino/dino-4scale_r50_8xb2-24e_coco.py | 13 + .../dino/dino-4scale_r50_8xb2-36e_coco.py | 13 + .../dino-4scale_r50_improved_8xb2-12e_coco.py | 18 + .../dino/dino-5scale_swin-l_8xb2-12e_coco.py | 30 + .../dino/dino-5scale_swin-l_8xb2-36e_coco.py | 13 + mmdetection/configs/dino/metafile.yml | 85 + mmdetection/configs/double_heads/README.md | 32 + .../dh-faster-rcnn_r50_fpn_1x_coco.py | 23 + mmdetection/configs/double_heads/metafile.yml | 41 + mmdetection/configs/dsdl/README.md | 63 + mmdetection/configs/dsdl/coco.py | 33 + mmdetection/configs/dsdl/coco_instance.py | 62 + mmdetection/configs/dsdl/objects365v2.py | 54 + mmdetection/configs/dsdl/openimagesv6.py | 94 + mmdetection/configs/dsdl/voc07.py | 94 + mmdetection/configs/dsdl/voc0712.py | 132 + mmdetection/configs/dyhead/README.md | 52 + .../atss_r50-caffe_fpn_dyhead_1x_coco.py | 103 + .../dyhead/atss_r50_fpn_dyhead_1x_coco.py | 72 + ...tss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py | 140 + mmdetection/configs/dyhead/metafile.yml | 76 + mmdetection/configs/dynamic_rcnn/README.md | 30 + .../dynamic-rcnn_r50_fpn_1x_coco.py | 28 + mmdetection/configs/dynamic_rcnn/metafile.yml | 35 + mmdetection/configs/efficientnet/README.md | 30 + mmdetection/configs/efficientnet/metafile.yml | 19 + ...etinanet_effb3_fpn_8xb4-crop896-1x_coco.py | 94 + .../configs/empirical_attention/README.md | 33 + ...aster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py | 16 + .../faster-rcnn_r50-attn0010_fpn_1x_coco.py | 13 + ...aster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py | 16 + .../faster-rcnn_r50-attn1111_fpn_1x_coco.py | 13 + .../configs/empirical_attention/metafile.yml | 103 + mmdetection/configs/fast_rcnn/README.md | 121 + .../fast-rcnn_r101-caffe_fpn_1x_coco.py | 7 + .../fast_rcnn/fast-rcnn_r101_fpn_1x_coco.py | 6 + .../fast_rcnn/fast-rcnn_r101_fpn_2x_coco.py | 6 + .../fast-rcnn_r50-caffe_fpn_1x_coco.py | 16 + .../fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py | 39 + .../fast_rcnn/fast-rcnn_r50_fpn_2x_coco.py | 14 + mmdetection/configs/faster_rcnn/README.md | 88 + .../faster-rcnn_r101-caffe_fpn_1x_coco.py | 7 + .../faster-rcnn_r101-caffe_fpn_ms-3x_coco.py | 11 + .../faster-rcnn_r101_fpn_1x_coco.py | 6 + .../faster-rcnn_r101_fpn_2x_coco.py | 6 + ...er-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py | 7 + .../faster-rcnn_r101_fpn_ms-3x_coco.py | 7 + ...ter-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py | 7 + .../faster-rcnn_r50-caffe-c4_ms-1x_coco.py | 14 + .../faster-rcnn_r50-caffe-dc5_1x_coco.py | 5 + .../faster-rcnn_r50-caffe-dc5_ms-1x_coco.py | 14 + .../faster-rcnn_r50-caffe-dc5_ms-3x_coco.py | 18 + .../faster-rcnn_r50-caffe_c4-1x_coco.py | 5 + .../faster-rcnn_r50-caffe_fpn_1x_coco.py | 15 + .../faster-rcnn_r50-caffe_fpn_90k_coco.py | 22 + ...caffe_fpn_ms-1x_coco-person-bicycle-car.py | 16 + ...er-rcnn_r50-caffe_fpn_ms-1x_coco-person.py | 14 + .../faster-rcnn_r50-caffe_fpn_ms-1x_coco.py | 31 + .../faster-rcnn_r50-caffe_fpn_ms-2x_coco.py | 18 + .../faster-rcnn_r50-caffe_fpn_ms-3x_coco.py | 15 + .../faster-rcnn_r50-caffe_fpn_ms-90k_coco.py | 23 + .../faster-rcnn_r50-tnr-pre_fpn_1x_coco.py | 14 + .../faster-rcnn_r50_fpn_1x_coco.py | 5 + .../faster-rcnn_r50_fpn_2x_coco.py | 5 + ...ter-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py | 20 + .../faster-rcnn_r50_fpn_amp-1x_coco.py | 6 + ...faster-rcnn_r50_fpn_bounded-iou_1x_coco.py | 6 + .../faster-rcnn_r50_fpn_ciou_1x_coco.py | 6 + .../faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py | 48 + .../faster-rcnn_r50_fpn_giou_1x_coco.py | 6 + .../faster-rcnn_r50_fpn_iou_1x_coco.py | 6 + 
.../faster-rcnn_r50_fpn_ms-3x_coco.py | 1 + .../faster-rcnn_r50_fpn_ohem_1x_coco.py | 2 + .../faster-rcnn_r50_fpn_soft-nms_1x_coco.py | 12 + .../faster-rcnn_x101-32x4d_fpn_1x_coco.py | 14 + .../faster-rcnn_x101-32x4d_fpn_2x_coco.py | 14 + .../faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py | 14 + .../faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py | 23 + .../faster-rcnn_x101-64x4d_fpn_1x_coco.py | 14 + .../faster-rcnn_x101-64x4d_fpn_2x_coco.py | 14 + .../faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py | 14 + mmdetection/configs/faster_rcnn/metafile.yml | 451 ++ mmdetection/configs/fcos/README.md | 45 + .../fcos_r101-caffe_fpn_gn-head-1x_coco.py | 9 + ...01-caffe_fpn_gn-head_ms-640-800-2x_coco.py | 38 + ...centeronreg-giou_8xb8-amp-lsj-200e_coco.py | 7 + ...centeronreg-giou_8xb8-amp-lsj-200e_coco.py | 7 + ...enter-normbbox-centeronreg-giou_1x_coco.py | 43 + ...os_r50-caffe_fpn_gn-head-center_1x_coco.py | 4 + .../fcos_r50-caffe_fpn_gn-head_1x_coco.py | 75 + ...fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py | 5 + ...50-caffe_fpn_gn-head_ms-640-800-2x_coco.py | 30 + ...enter-normbbox-centeronreg-giou_1x_coco.py | 45 + ...centeronreg-giou_8xb8-amp-lsj-200e_coco.py | 75 + ...01-64x4d_fpn_gn-head_ms-640-800-2x_coco.py | 52 + mmdetection/configs/fcos/metafile.yml | 146 + mmdetection/configs/foveabox/README.md | 53 + .../foveabox/fovea_r101_fpn_4xb4-1x_coco.py | 6 + .../foveabox/fovea_r101_fpn_4xb4-2x_coco.py | 6 + ...vea_r101_fpn_gn-head-align_4xb4-2x_coco.py | 23 + ...n_gn-head-align_ms-640-800-4xb4-2x_coco.py | 34 + .../foveabox/fovea_r50_fpn_4xb4-1x_coco.py | 59 + .../foveabox/fovea_r50_fpn_4xb4-2x_coco.py | 15 + ...ovea_r50_fpn_gn-head-align_4xb4-2x_coco.py | 20 + ...n_gn-head-align_ms-640-800-4xb4-2x_coco.py | 30 + mmdetection/configs/foveabox/metafile.yml | 172 + mmdetection/configs/fpg/README.md | 43 + ...er-rcnn_r50_fpg-chn128_crop640-50e_coco.py | 9 + .../faster-rcnn_r50_fpg_crop640-50e_coco.py | 48 + .../faster-rcnn_r50_fpn_crop640-50e_coco.py | 73 + ...sk-rcnn_r50_fpg-chn128_crop640-50e_coco.py | 10 + .../fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py | 48 + .../fpg/mask-rcnn_r50_fpn_crop640-50e_coco.py | 79 + mmdetection/configs/fpg/metafile.yml | 104 + ...tinanet_r50_fpg-chn128_crop640_50e_coco.py | 5 + .../fpg/retinanet_r50_fpg_crop640_50e_coco.py | 53 + mmdetection/configs/free_anchor/README.md | 37 + .../freeanchor_r101_fpn_1x_coco.py | 6 + .../free_anchor/freeanchor_r50_fpn_1x_coco.py | 22 + .../freeanchor_x101-32x4d_fpn_1x_coco.py | 13 + mmdetection/configs/free_anchor/metafile.yml | 79 + mmdetection/configs/fsaf/README.md | 57 + .../configs/fsaf/fsaf_r101_fpn_1x_coco.py | 6 + .../configs/fsaf/fsaf_r50_fpn_1x_coco.py | 47 + .../fsaf/fsaf_x101-64x4d_fpn_1x_coco.py | 14 + mmdetection/configs/fsaf/metafile.yml | 80 + mmdetection/configs/gcnet/README.md | 69 + ...n-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py | 11 + ...bn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py | 11 + ...01-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py | 4 + ...-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py | 11 + ...1-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py | 11 + ...mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py | 4 + ...ask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py | 8 + ...mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py | 8 + ...n_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py | 11 + ...nn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py | 11 + .../mask-rcnn_r101-syncbn_fpn_1x_coco.py | 4 + ...mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py | 8 + .../mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py | 8 + ...nn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py | 11 + 
...cnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py | 11 + .../gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py | 4 + ...-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py | 11 + ...1-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py | 11 + ...mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py | 4 + mmdetection/configs/gcnet/metafile.yml | 440 ++ mmdetection/configs/gfl/README.md | 42 + .../gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py | 15 + .../configs/gfl/gfl_r101_fpn_ms-2x_coco.py | 13 + .../configs/gfl/gfl_r50_fpn_1x_coco.py | 66 + .../configs/gfl/gfl_r50_fpn_ms-2x_coco.py | 28 + ...l_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py | 18 + .../gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py | 16 + mmdetection/configs/gfl/metafile.yml | 134 + mmdetection/configs/ghm/README.md | 33 + mmdetection/configs/ghm/metafile.yml | 101 + .../ghm/retinanet_r101_fpn_ghm-1x_coco.py | 6 + .../ghm/retinanet_r50_fpn_ghm-1x_coco.py | 18 + .../retinanet_x101-32x4d_fpn_ghm-1x_coco.py | 14 + .../retinanet_x101-64x4d_fpn_ghm-1x_coco.py | 14 + mmdetection/configs/glip/README.md | 80 + ...n-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py | 14 + ...ss_swin-l_fpn_dyhead_pretrain_mixeddata.py | 12 + ...t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py | 155 + ...tss_swin-t_a_fpn_dyhead_pretrain_obj365.py | 90 + ...t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py | 9 + ...tss_swin-t_b_fpn_dyhead_pretrain_obj365.py | 3 + ...t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py | 3 + ...in-t_c_fpn_dyhead_pretrain_obj365-goldg.py | 1 + ...n-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py | 3 + ...n_dyhead_pretrain_obj365-goldg-cc3m-sub.py | 1 + mmdetection/configs/glip/metafile.yml | 111 + mmdetection/configs/gn+ws/README.md | 54 + .../faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py | 6 + .../faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py | 16 + ...r-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py | 18 + ...er-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py | 18 + ...-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py | 17 + .../mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py | 6 + ...k-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py | 17 + .../mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py | 33 + ...x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py | 17 + ...k-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py | 19 + ..._x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py | 17 + ...sk-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py | 19 + mmdetection/configs/gn+ws/metafile.yml | 263 ++ mmdetection/configs/gn/README.md | 41 + .../gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py | 7 + .../gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py | 18 + ...ask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py | 31 + ...ask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py | 18 + .../gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py | 36 + .../gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py | 18 + mmdetection/configs/gn/metafile.yml | 162 + mmdetection/configs/grid_rcnn/README.md | 47 + .../grid-rcnn_r101_fpn_gn-head_2x_coco.py | 7 + .../grid-rcnn_r50_fpn_gn-head_1x_coco.py | 19 + .../grid-rcnn_r50_fpn_gn-head_2x_coco.py | 160 + ...rid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py | 13 + ...rid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py | 13 + mmdetection/configs/grid_rcnn/metafile.yml | 101 + mmdetection/configs/groie/README.md | 72 + .../groie/faste-rcnn_r50_fpn_groie_1x_coco.py | 25 + ...grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py | 45 + ...1_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py | 45 + .../groie/mask-rcnn_r50_fpn_groie_1x_coco.py | 45 + ...0_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py | 45 + mmdetection/configs/groie/metafile.yml | 94 + mmdetection/configs/grounding_dino/README.md | 172 + ...grounding_dino_r50_scratch_8xb2_1x_coco.py | 208 + ...ding_dino_swin-b_finetune_16xb2_1x_coco.py 
| 17 + ...rounding_dino_swin-b_pretrain_mixeddata.py | 16 + ...ding_dino_swin-t_finetune_16xb2_1x_coco.py | 204 + ...nding_dino_swin-t_finetune_8xb2_20e_cat.py | 56 + ...dino_swin-t_pretrain_obj365_goldg_cap4m.py | 127 + .../configs/grounding_dino/metafile.yml | 67 + .../configs/guided_anchoring/README.md | 59 + .../ga-fast-rcnn_r50-caffe_fpn_1x_coco.py | 66 + .../ga-faster-rcnn_r101-caffe_fpn_1x_coco.py | 7 + .../ga-faster-rcnn_r50-caffe_fpn_1x_coco.py | 64 + .../ga-faster-rcnn_r50_fpn_1x_coco.py | 64 + .../ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py | 14 + .../ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py | 14 + .../ga-retinanet_r101-caffe_fpn_1x_coco.py | 7 + .../ga-retinanet_r101-caffe_fpn_ms-2x.py | 34 + .../ga-retinanet_r50-caffe_fpn_1x_coco.py | 61 + .../ga-retinanet_r50_fpn_1x_coco.py | 61 + .../ga-retinanet_x101-32x4d_fpn_1x_coco.py | 14 + .../ga-retinanet_x101-64x4d_fpn_1x_coco.py | 14 + .../ga-rpn_r101-caffe_fpn_1x_coco.py | 8 + .../ga-rpn_r50-caffe_fpn_1x_coco.py | 57 + .../ga-rpn_r50_fpn_1x_coco.py | 57 + .../ga-rpn_x101-32x4d_fpn_1x_coco.py | 14 + .../ga-rpn_x101-64x4d_fpn_1x_coco.py | 14 + .../configs/guided_anchoring/metafile.yml | 246 ++ mmdetection/configs/hrnet/README.md | 101 + ...cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py | 11 + ...cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py | 51 + ...cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py | 12 + .../cascade-rcnn_hrnetv2p-w18-20e_coco.py | 11 + .../cascade-rcnn_hrnetv2p-w32-20e_coco.py | 51 + .../cascade-rcnn_hrnetv2p-w40-20e_coco.py | 12 + .../hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py | 11 + .../hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py | 16 + .../hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py | 37 + .../hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py | 16 + .../hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py | 11 + .../hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py | 16 + .../fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py | 10 + .../fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py | 16 + ...v2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py | 10 + .../fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py | 43 + .../fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py | 16 + ...v2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py | 35 + ...v2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py | 11 + .../hrnet/htc_hrnetv2p-w18_20e_coco.py | 10 + .../hrnet/htc_hrnetv2p-w32_20e_coco.py | 37 + .../hrnet/htc_hrnetv2p-w40_20e_coco.py | 11 + .../hrnet/htc_hrnetv2p-w40_28e_coco.py | 16 + .../htc_x101-64x4d_fpn_16xb1-28e_coco.py | 16 + .../hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py | 10 + .../hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py | 16 + .../hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py | 37 + .../hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py | 16 + .../hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py | 16 + .../hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py | 11 + mmdetection/configs/hrnet/metafile.yml | 971 +++++ mmdetection/configs/htc/README.md | 67 + .../htc-without-semantic_r50_fpn_1x_coco.py | 223 + .../configs/htc/htc_r101_fpn_20e_coco.py | 6 + .../configs/htc/htc_r50_fpn_1x_coco.py | 33 + .../configs/htc/htc_r50_fpn_20e_coco.py | 16 + .../htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py | 32 + ...nv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py | 20 + .../htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py | 7 + mmdetection/configs/htc/metafile.yml | 165 + mmdetection/configs/instaboost/README.md | 58 + ...e-mask-rcnn_r101_fpn_instaboost-4x_coco.py | 7 + ...de-mask-rcnn_r50_fpn_instaboost-4x_coco.py | 40 + ...-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py | 14 + .../mask-rcnn_r101_fpn_instaboost-4x_coco.py | 6 + .../mask-rcnn_r50_fpn_instaboost-4x_coco.py | 40 + 
...-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py | 14 + mmdetection/configs/instaboost/metafile.yml | 99 + mmdetection/configs/lad/README.md | 45 + .../lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py | 127 + .../lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py | 126 + mmdetection/configs/lad/metafile.yml | 45 + mmdetection/configs/ld/README.md | 43 + .../ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py | 49 + .../ld/ld_r18-gflv1-r101_fpn_1x_coco.py | 70 + .../ld/ld_r34-gflv1-r101_fpn_1x_coco.py | 19 + .../ld/ld_r50-gflv1-r101_fpn_1x_coco.py | 19 + mmdetection/configs/ld/metafile.yml | 69 + mmdetection/configs/legacy_1.x/README.md | 54 + .../cascade-mask-rcnn_r50_fpn_1x_coco_v1.py | 78 + .../faster-rcnn_r50_fpn_1x_coco_v1.py | 38 + .../mask-rcnn_r50_fpn_1x_coco_v1.py | 34 + .../retinanet_r50-caffe_fpn_1x_coco_v1.py | 16 + .../retinanet_r50_fpn_1x_coco_v1.py | 17 + .../configs/legacy_1.x/ssd300_coco_v1.py | 20 + mmdetection/configs/libra_rcnn/README.md | 53 + .../libra-fast-rcnn_r50_fpn_1x_coco.py | 52 + .../libra-faster-rcnn_r101_fpn_1x_coco.py | 6 + .../libra-faster-rcnn_r50_fpn_1x_coco.py | 41 + ...ibra-faster-rcnn_x101-64x4d_fpn_1x_coco.py | 14 + .../libra-retinanet_r50_fpn_1x_coco.py | 26 + mmdetection/configs/libra_rcnn/metafile.yml | 99 + mmdetection/configs/lvis/README.md | 56 + ...-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py | 6 + ...cnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py | 6 + ...k-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py | 13 + ...rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py | 13 + ...x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py | 14 + ...01-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py | 14 + ...x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py | 14 + ...01-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py | 14 + mmdetection/configs/lvis/metafile.yml | 128 + mmdetection/configs/mask2former/README.md | 76 + ...2former_r101_8xb2-lsj-50e_coco-panoptic.py | 7 + .../mask2former_r101_8xb2-lsj-50e_coco.py | 7 + ...k2former_r50_8xb2-lsj-50e_coco-panoptic.py | 251 ++ .../mask2former_r50_8xb2-lsj-50e_coco.py | 100 + ...12-384-in21k_8xb2-lsj-50e_coco-panoptic.py | 5 + ...b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py | 42 + ...-384-in21k_16xb1-lsj-100e_coco-panoptic.py | 25 + ...-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py | 37 + ...rmer_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py | 37 + ...-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py | 58 + ...rmer_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py | 56 + mmdetection/configs/mask2former/metafile.yml | 223 + mmdetection/configs/mask2former_vis/README.md | 81 + ...mask2former_r101_8xb2-8e_youtubevis2019.py | 12 + ...mask2former_r101_8xb2-8e_youtubevis2021.py | 12 + .../mask2former_r50_8xb2-8e_youtubevis2019.py | 174 + .../mask2former_r50_8xb2-8e_youtubevis2021.py | 37 + ...p4-w12-384-in21k_8xb2-8e_youtubevis2021.py | 64 + .../configs/mask2former_vis/metafile.yml | 53 + mmdetection/configs/mask_rcnn/README.md | 59 + .../mask-rcnn_r101-caffe_fpn_1x_coco.py | 7 + ...ask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py | 19 + .../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py | 6 + .../mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py | 6 + ...sk-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py | 7 + .../mask-rcnn_r101_fpn_ms-poly-3x_coco.py | 10 + ...ask-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py | 7 + .../mask-rcnn_r50-caffe-c4_1x_coco.py | 5 + .../mask-rcnn_r50-caffe_fpn_1x_coco.py | 13 + .../mask-rcnn_r50-caffe_fpn_ms-1x_coco.py | 28 + ...mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py | 31 + ...mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py | 15 + ...mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py | 15 + ...mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py | 31 + 
.../mask-rcnn_r50_fpn_1x-wandb_coco.py | 16 + .../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py | 5 + .../mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py | 5 + ...ask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py | 22 + .../mask-rcnn_r50_fpn_amp-1x_coco.py | 4 + .../mask-rcnn_r50_fpn_ms-poly-3x_coco.py | 4 + .../mask-rcnn_r50_fpn_poly-1x_coco.py | 18 + .../mask-rcnn_x101-32x4d_fpn_1x_coco.py | 14 + .../mask-rcnn_x101-32x4d_fpn_2x_coco.py | 14 + ...ask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py | 18 + .../mask-rcnn_x101-32x8d_fpn_1x_coco.py | 22 + ...ask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py | 40 + ...ask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py | 25 + .../mask-rcnn_x101-64x4d_fpn_1x_coco.py | 14 + .../mask-rcnn_x101-64x4d_fpn_2x_coco.py | 14 + ...ask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py | 18 + mmdetection/configs/mask_rcnn/metafile.yml | 443 ++ mmdetection/configs/maskformer/README.md | 58 + .../maskformer_r50_ms-16xb1-75e_coco.py | 216 + ...former_swin-l-p4-w12_64xb1-ms-300e_coco.py | 73 + mmdetection/configs/maskformer/metafile.yml | 43 + mmdetection/configs/masktrack_rcnn/README.md | 93 + ...k-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py | 12 + ...k-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py | 28 + ...sk-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py | 130 + ...sk-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py | 17 + ...k-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py | 16 + ...k-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py | 32 + .../configs/masktrack_rcnn/metafile.yml | 91 + ...2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py | 75 + .../d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py | 83 + .../d2_retinanet_r50-caffe_fpn_ms-90k_coco.py | 48 + mmdetection/configs/ms_rcnn/README.md | 36 + mmdetection/configs/ms_rcnn/metafile.yml | 159 + .../ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py | 7 + .../ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py | 17 + .../ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py | 16 + .../ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py | 17 + .../ms_rcnn/ms-rcnn_r50_fpn_1x_coco.py | 16 + .../ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py | 14 + .../ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py | 14 + .../ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py | 17 + mmdetection/configs/nas_fcos/README.md | 35 + mmdetection/configs/nas_fcos/metafile.yml | 44 + ...caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py | 75 + ...-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py | 74 + mmdetection/configs/nas_fpn/README.md | 36 + mmdetection/configs/nas_fpn/metafile.yml | 59 + .../retinanet_r50_fpn_crop640-50e_coco.py | 78 + .../retinanet_r50_nasfpn_crop640-50e_coco.py | 16 + mmdetection/configs/objects365/README.md | 102 + ...-rcnn_r50-syncbn_fpn_1350k_objects365v1.py | 49 + ...ster-rcnn_r50_fpn_16xb4-1x_objects365v1.py | 39 + ...ster-rcnn_r50_fpn_16xb4-1x_objects365v2.py | 39 + mmdetection/configs/objects365/metafile.yml | 101 + ...nanet_r50-syncbn_fpn_1350k_objects365v1.py | 49 + .../retinanet_r50_fpn_1x_objects365v1.py | 35 + .../retinanet_r50_fpn_1x_objects365v2.py | 35 + mmdetection/configs/ocsort/README.md | 56 + mmdetection/configs/ocsort/metafile.yml | 27 + ...dhuman-mot17halftrain_test-mot17halfval.py | 18 + ...0e_crowdhuman-mot20train_test-mot20test.py | 18 + mmdetection/configs/openimages/README.md | 149 + ...n_r50_fpn_32xb2-1x_openimages-challenge.py | 39 + ...faster-rcnn_r50_fpn_32xb2-1x_openimages.py | 35 + ...0_fpn_32xb2-cas-1x_openimages-challenge.py | 5 + ...er-rcnn_r50_fpn_32xb2-cas-1x_openimages.py | 5 + mmdetection/configs/openimages/metafile.yml | 102 + .../retinanet_r50_fpn_32xb2-1x_openimages.py | 35 + .../openimages/ssd300_32xb8-36e_openimages.py | 88 + 
mmdetection/configs/paa/README.md | 47 + mmdetection/configs/paa/metafile.yml | 111 + .../configs/paa/paa_r101_fpn_1x_coco.py | 6 + .../configs/paa/paa_r101_fpn_2x_coco.py | 18 + .../configs/paa/paa_r101_fpn_ms-3x_coco.py | 6 + .../configs/paa/paa_r50_fpn_1.5x_coco.py | 18 + .../configs/paa/paa_r50_fpn_1x_coco.py | 80 + .../configs/paa/paa_r50_fpn_2x_coco.py | 18 + .../configs/paa/paa_r50_fpn_ms-3x_coco.py | 29 + mmdetection/configs/pafpn/README.md | 34 + .../pafpn/faster-rcnn_r50_pafpn_1x_coco.py | 8 + mmdetection/configs/pafpn/metafile.yml | 38 + mmdetection/configs/panoptic_fpn/README.md | 62 + mmdetection/configs/panoptic_fpn/metafile.yml | 70 + .../panoptic-fpn_r101_fpn_1x_coco.py | 6 + .../panoptic-fpn_r101_fpn_ms-3x_coco.py | 6 + .../panoptic-fpn_r50_fpn_1x_coco.py | 45 + .../panoptic-fpn_r50_fpn_ms-3x_coco.py | 35 + mmdetection/configs/pascal_voc/README.md | 40 + ...faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py | 86 + .../faster-rcnn_r50_fpn_1x_voc0712-cocofmt.py | 100 + .../faster-rcnn_r50_fpn_1x_voc0712.py | 35 + .../retinanet_r50_fpn_1x_voc0712.py | 34 + .../configs/pascal_voc/ssd300_voc0712.py | 102 + .../configs/pascal_voc/ssd512_voc0712.py | 82 + mmdetection/configs/pisa/README.md | 50 + .../pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py | 30 + ...faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py | 30 + .../pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py | 30 + .../mask-rcnn_x101-32x4d_fpn_pisa_1x_coco.py | 30 + mmdetection/configs/pisa/metafile.yml | 110 + .../pisa/retinanet-r50_fpn_pisa_1x_coco.py | 7 + .../retinanet_x101-32x4d_fpn_pisa_1x_coco.py | 7 + mmdetection/configs/pisa/ssd300_pisa_coco.py | 7 + mmdetection/configs/pisa/ssd512_pisa_coco.py | 7 + mmdetection/configs/point_rend/README.md | 33 + mmdetection/configs/point_rend/metafile.yml | 54 + .../point-rend_r50-caffe_fpn_ms-1x_coco.py | 44 + .../point-rend_r50-caffe_fpn_ms-3x_coco.py | 18 + mmdetection/configs/pvt/README.md | 57 + mmdetection/configs/pvt/metafile.yml | 243 ++ .../pvt/retinanet_pvt-l_fpn_1x_coco.py | 8 + .../pvt/retinanet_pvt-m_fpn_1x_coco.py | 6 + .../pvt/retinanet_pvt-s_fpn_1x_coco.py | 6 + .../pvt/retinanet_pvt-t_fpn_1x_coco.py | 18 + .../pvt/retinanet_pvtv2-b0_fpn_1x_coco.py | 19 + .../pvt/retinanet_pvtv2-b1_fpn_1x_coco.py | 7 + .../pvt/retinanet_pvtv2-b2_fpn_1x_coco.py | 8 + .../pvt/retinanet_pvtv2-b3_fpn_1x_coco.py | 8 + .../pvt/retinanet_pvtv2-b4_fpn_1x_coco.py | 20 + .../pvt/retinanet_pvtv2-b5_fpn_1x_coco.py | 21 + mmdetection/configs/qdtrack/README.md | 89 + mmdetection/configs/qdtrack/metafile.yml | 30 + .../qdtrack_faster-rcnn_r50_fpn_4e_base.py | 118 + ...xb2-4e_mot17halftrain_test-mot17halfval.py | 14 + mmdetection/configs/queryinst/README.md | 36 + mmdetection/configs/queryinst/metafile.yml | 100 + ...n_300-proposals_crop-ms-480-800-3x_coco.py | 7 + .../queryinst_r101_fpn_ms-480-800-3x_coco.py | 7 + .../queryinst/queryinst_r50_fpn_1x_coco.py | 155 + ...n_300-proposals_crop-ms-480-800-3x_coco.py | 45 + .../queryinst_r50_fpn_ms-480-800-3x_coco.py | 32 + .../recycle/detr_r50_8xb2-150e_recycle.py | 61 + .../recycle/faster-rcnn_r50_fpn_1x_recycle.py | 48 + mmdetection/configs/regnet/README.md | 121 + ...-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py | 17 + ...-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py | 28 + ...-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py | 17 + ...de-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py | 17 + ...-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py | 17 + ...aster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py | 17 + .../faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py | 30 + .../faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py | 16 + 
...aster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py | 25 + ...aster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py | 17 + .../faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py | 17 + ...aster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py | 17 + ...-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py | 26 + .../mask-rcnn_regnetx-12GF_fpn_1x_coco.py | 17 + ..._regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py | 7 + .../mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py | 30 + .../mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py | 60 + ...-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py | 26 + .../mask-rcnn_regnetx-4GF_fpn_1x_coco.py | 17 + ...sk-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py | 26 + .../mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py | 17 + ...-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py | 26 + .../mask-rcnn_regnetx-8GF_fpn_1x_coco.py | 17 + mmdetection/configs/regnet/metafile.yml | 797 ++++ .../retinanet_regnetx-1.6GF_fpn_1x_coco.py | 17 + .../retinanet_regnetx-3.2GF_fpn_1x_coco.py | 31 + .../retinanet_regnetx-800MF_fpn_1x_coco.py | 17 + mmdetection/configs/reid/README.md | 135 + ...0_8xb32-6e_mot15train80_test-mot15val20.py | 7 + ...0_8xb32-6e_mot16train80_test-mot16val20.py | 7 + ...0_8xb32-6e_mot17train80_test-mot17val20.py | 61 + ...0_8xb32-6e_mot20train80_test-mot20val20.py | 10 + mmdetection/configs/reppoints/README.md | 59 + mmdetection/configs/reppoints/metafile.yml | 181 + ..._r50-center_fpn-gn_head-gn-grid_1x_coco.py | 2 + ...ts-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py | 13 + ...oints-minmax_r50_fpn-gn_head-gn_1x_coco.py | 2 + ...r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py | 8 + ...ints-moment_r101_fpn-gn_head-gn_2x_coco.py | 6 + ...oints-moment_r50_fpn-gn_head-gn_1x_coco.py | 3 + ...oints-moment_r50_fpn-gn_head-gn_2x_coco.py | 17 + .../reppoints-moment_r50_fpn_1x_coco.py | 74 + ...x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py | 16 + ...rtial-minmax_r50_fpn-gn_head-gn_1x_coco.py | 2 + mmdetection/configs/res2net/README.md | 77 + ...cade-mask-rcnn_res2net-101_fpn_20e_coco.py | 10 + .../cascade-rcnn_res2net-101_fpn_20e_coco.py | 10 + .../faster-rcnn_res2net-101_fpn_2x_coco.py | 10 + .../res2net/htc_res2net-101_fpn_20e_coco.py | 10 + .../mask-rcnn_res2net-101_fpn_2x_coco.py | 10 + mmdetection/configs/res2net/metafile.yml | 146 + mmdetection/configs/resnest/README.md | 54 + ...101_fpn_syncbn-backbone+head_ms-1x_coco.py | 7 + ...s50_fpn_syncbn-backbone+head_ms-1x_coco.py | 101 + ...n_syncbn-backbone+head_ms-range-1x_coco.py | 7 + ...n_syncbn-backbone+head_ms-range-1x_coco.py | 93 + ...n_syncbn-backbone+head_ms-range-1x_coco.py | 7 + ...n_syncbn-backbone+head_ms-range-1x_coco.py | 39 + ...101_fpn_syncbn-backbone+head_ms-1x_coco.py | 7 + ...s50_fpn_syncbn-backbone+head_ms-1x_coco.py | 46 + mmdetection/configs/resnest/metafile.yml | 230 + .../configs/resnet_strikes_back/README.md | 40 + ...scade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py | 15 + .../faster-rcnn_r50-rsb-pre_fpn_1x_coco.py | 15 + .../mask-rcnn_r50-rsb-pre_fpn_1x_coco.py | 15 + .../configs/resnet_strikes_back/metafile.yml | 116 + .../retinanet_r50-rsb-pre_fpn_1x_coco.py | 15 + mmdetection/configs/retinanet/README.md | 53 + mmdetection/configs/retinanet/metafile.yml | 312 ++ .../retinanet_r101-caffe_fpn_1x_coco.py | 7 + .../retinanet_r101-caffe_fpn_ms-3x_coco.py | 8 + .../retinanet/retinanet_r101_fpn_1x_coco.py | 6 + .../retinanet/retinanet_r101_fpn_2x_coco.py | 6 + ...tinanet_r101_fpn_8xb8-amp-lsj-200e_coco.py | 7 + .../retinanet_r101_fpn_ms-640-800-3x_coco.py | 9 + .../retinanet/retinanet_r18_fpn_1x_coco.py | 20 + .../retinanet_r18_fpn_1xb8-1x_coco.py | 24 + ...etinanet_r18_fpn_8xb8-amp-lsj-200e_coco.py | 7 + 
.../retinanet_r50-caffe_fpn_1x_coco.py | 16 + .../retinanet_r50-caffe_fpn_ms-1x_coco.py | 15 + .../retinanet_r50-caffe_fpn_ms-2x_coco.py | 16 + .../retinanet_r50-caffe_fpn_ms-3x_coco.py | 17 + .../retinanet/retinanet_r50_fpn_1x_coco.py | 10 + .../retinanet/retinanet_r50_fpn_2x_coco.py | 25 + ...etinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py | 21 + .../retinanet/retinanet_r50_fpn_90k_coco.py | 24 + .../retinanet_r50_fpn_amp-1x_coco.py | 6 + .../retinanet_r50_fpn_ms-640-800-3x_coco.py | 4 + .../configs/retinanet/retinanet_tta.py | 23 + .../retinanet_x101-32x4d_fpn_1x_coco.py | 14 + .../retinanet_x101-32x4d_fpn_2x_coco.py | 14 + .../retinanet_x101-64x4d_fpn_1x_coco.py | 14 + .../retinanet_x101-64x4d_fpn_2x_coco.py | 14 + ...nanet_x101-64x4d_fpn_ms-640-800-3x_coco.py | 11 + mmdetection/configs/rpn/README.md | 39 + mmdetection/configs/rpn/metafile.yml | 127 + .../configs/rpn/rpn_r101-caffe_fpn_1x_coco.py | 7 + .../configs/rpn/rpn_r101_fpn_1x_coco.py | 6 + .../configs/rpn/rpn_r101_fpn_2x_coco.py | 6 + .../configs/rpn/rpn_r50-caffe-c4_1x_coco.py | 8 + .../configs/rpn/rpn_r50-caffe_fpn_1x_coco.py | 16 + .../configs/rpn/rpn_r50_fpn_1x_coco.py | 36 + .../configs/rpn/rpn_r50_fpn_2x_coco.py | 17 + .../configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py | 14 + .../configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py | 14 + .../configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py | 14 + .../configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py | 14 + mmdetection/configs/rtmdet/README.md | 454 ++ .../configs/rtmdet/classification/README.md | 56 + .../cspnext-l_8xb256-rsb-a1-600e_in1k.py | 5 + .../cspnext-m_8xb256-rsb-a1-600e_in1k.py | 5 + .../cspnext-s_8xb256-rsb-a1-600e_in1k.py | 64 + .../cspnext-tiny_8xb256-rsb-a1-600e_in1k.py | 5 + .../cspnext-x_8xb256-rsb-a1-600e_in1k.py | 5 + mmdetection/configs/rtmdet/metafile.yml | 200 + .../rtmdet/rtmdet-ins_l_8xb32-300e_coco.py | 104 + .../rtmdet/rtmdet-ins_m_8xb32-300e_coco.py | 6 + .../rtmdet/rtmdet-ins_s_8xb32-300e_coco.py | 80 + .../rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py | 48 + .../rtmdet/rtmdet-ins_x_8xb16-300e_coco.py | 31 + .../rtmdet/rtmdet_l_8xb32-300e_coco.py | 179 + .../rtmdet/rtmdet_m_8xb32-300e_coco.py | 6 + .../rtmdet/rtmdet_s_8xb32-300e_coco.py | 62 + .../rtmdet/rtmdet_tiny_8xb32-300e_coco.py | 43 + mmdetection/configs/rtmdet/rtmdet_tta.py | 36 + .../rtmdet/rtmdet_x_8xb32-300e_coco.py | 7 + .../rtmdet/rtmdet_x_p6_4xb8-300e_coco.py | 132 + mmdetection/configs/sabl/README.md | 47 + mmdetection/configs/sabl/metafile.yml | 140 + .../sabl-cascade-rcnn_r101_fpn_1x_coco.py | 90 + .../sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py | 86 + .../sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py | 38 + .../sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py | 34 + .../sabl-retinanet_r101-gn_fpn_1x_coco.py | 57 + ...etinanet_r101-gn_fpn_ms-480-960-2x_coco.py | 68 + ...etinanet_r101-gn_fpn_ms-640-800-2x_coco.py | 68 + .../sabl/sabl-retinanet_r101_fpn_1x_coco.py | 55 + .../sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py | 53 + .../sabl/sabl-retinanet_r50_fpn_1x_coco.py | 51 + mmdetection/configs/scnet/README.md | 63 + mmdetection/configs/scnet/metafile.yml | 116 + .../configs/scnet/scnet_r101_fpn_20e_coco.py | 6 + .../configs/scnet/scnet_r50_fpn_1x_coco.py | 138 + .../configs/scnet/scnet_r50_fpn_20e_coco.py | 15 + .../scnet/scnet_x101-64x4d_fpn_20e_coco.py | 15 + .../scnet_x101-64x4d_fpn_8xb1-20e_coco.py | 8 + mmdetection/configs/scratch/README.md | 35 + ...ter-rcnn_r50-scratch_fpn_gn-all_6x_coco.py | 39 + ...ask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py | 40 + mmdetection/configs/scratch/metafile.yml | 48 + 
mmdetection/configs/seesaw_loss/README.md | 47 + ...w-loss-normed-mask_random-ms-2x_lvis-v1.py | 5 + ...ss-normed-mask_sample1e-3-ms-2x_lvis-v1.py | 5 + ...01_fpn_seesaw-loss_random-ms-2x_lvis-v1.py | 116 + ...pn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py | 95 + ...w-loss-normed-mask_random-ms-2x_lvis-v1.py | 6 + ...ss-normed-mask_sample1e-3-ms-2x_lvis-v1.py | 6 + ...01_fpn_seesaw-loss_random-ms-2x_lvis-v1.py | 6 + ...pn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py | 6 + ...w-loss-normed-mask_random-ms-2x_lvis-v1.py | 5 + ...ss-normed-mask_sample1e-3-ms-2x_lvis-v1.py | 5 + ...50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py | 59 + ...pn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py | 38 + mmdetection/configs/seesaw_loss/metafile.yml | 203 + .../configs/selfsup_pretrain/README.md | 109 + .../mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py | 13 + ...mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py | 25 + .../mask-rcnn_r50-swav-pre_fpn_1x_coco.py | 13 + .../mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py | 25 + .../configs/simple_copy_paste/README.md | 38 + ...4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py | 31 + ..._4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py | 18 + ...v1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py | 31 + ...nv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py | 18 + .../configs/simple_copy_paste/metafile.yml | 92 + mmdetection/configs/soft_teacher/README.md | 33 + mmdetection/configs/soft_teacher/metafile.yml | 67 + ...-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py | 9 + ...-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py | 9 + ...-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py | 9 + ...r-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py | 84 + mmdetection/configs/solo/README.md | 54 + .../decoupled-solo-light_r50_fpn_3x_coco.py | 50 + .../solo/decoupled-solo_r50_fpn_1x_coco.py | 24 + .../solo/decoupled-solo_r50_fpn_3x_coco.py | 25 + mmdetection/configs/solo/metafile.yml | 115 + .../solo/solo_r101_fpn_8xb8-lsj-200e_coco.py | 7 + .../solo/solo_r18_fpn_8xb8-lsj-200e_coco.py | 7 + .../configs/solo/solo_r50_fpn_1x_coco.py | 62 + .../configs/solo/solo_r50_fpn_3x_coco.py | 35 + .../solo/solo_r50_fpn_8xb8-lsj-200e_coco.py | 71 + mmdetection/configs/solov2/README.md | 59 + mmdetection/configs/solov2/metafile.yml | 93 + .../solov2/solov2-light_r18_fpn_ms-3x_coco.py | 7 + .../solov2/solov2-light_r34_fpn_ms-3x_coco.py | 7 + .../solov2-light_r50-dcn_fpn_ms-3x_coco.py | 14 + .../solov2/solov2-light_r50_fpn_ms-3x_coco.py | 56 + .../solov2/solov2_r101-dcn_fpn_ms-3x_coco.py | 13 + .../solov2/solov2_r101_fpn_ms-3x_coco.py | 6 + .../configs/solov2/solov2_r50_fpn_1x_coco.py | 70 + .../solov2/solov2_r50_fpn_ms-3x_coco.py | 35 + .../solov2/solov2_x101-dcn_fpn_ms-3x_coco.py | 17 + mmdetection/configs/sort/README.md | 108 + ...xb2-4e_mot17halftrain_test-mot17halfval.py | 41 + ..._fpn_8xb2-4e_mot17train_test-mot17train.py | 11 + ...xb2-8e_mot20halftrain_test-mot20halfval.py | 29 + ..._fpn_8xb2-8e_mot20train_test-mot20train.py | 32 + mmdetection/configs/sort/metafile.yml | 35 + ...xb2-4e_mot17halftrain_test-mot17halfval.py | 54 + ...0_fpn_8xb2-4e_mot17train_test-mot17test.py | 15 + mmdetection/configs/sparse_rcnn/README.md | 38 + mmdetection/configs/sparse_rcnn/metafile.yml | 80 + ...n_300-proposals_crop-ms-480-800-3x_coco.py | 7 + ...sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py | 7 + .../sparse-rcnn_r50_fpn_1x_coco.py | 101 + ...n_300-proposals_crop-ms-480-800-3x_coco.py | 43 + .../sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py | 32 + mmdetection/configs/ssd/README.md | 62 + mmdetection/configs/ssd/metafile.yml | 78 + mmdetection/configs/ssd/ssd300_coco.py | 71 + 
mmdetection/configs/ssd/ssd512_coco.py | 60 + ...ite_mobilenetv2-scratch_8xb24-600e_coco.py | 158 + .../configs/strong_baselines/README.md | 20 + ...v_4conv1fc_syncbn-all_amp-lsj-100e_coco.py | 4 + ...2conv_4conv1fc_syncbn-all_lsj-100e_coco.py | 68 + ...2conv_4conv1fc_syncbn-all_lsj-400e_coco.py | 20 + ...v_4conv1fc_syncbn-all_amp-lsj-100e_coco.py | 4 + ...2conv_4conv1fc_syncbn-all_lsj-100e_coco.py | 30 + ...-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py | 5 + .../configs/strong_baselines/metafile.yml | 24 + mmdetection/configs/strongsort/README.md | 108 + mmdetection/configs/strongsort/metafile.yml | 48 + ...dhuman-mot17halftrain_test-mot17halfval.py | 130 + ...0e_crowdhuman-mot20train_test-mot20test.py | 44 + ...dhuman-mot17halftrain_test-mot17halfval.py | 188 + ...0e_crowdhuman-mot20train_test-mot20test.py | 108 + mmdetection/configs/swin/README.md | 41 + ...nn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py | 6 + .../mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py | 60 + ...nn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py | 3 + ...k-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py | 99 + mmdetection/configs/swin/metafile.yml | 120 + .../retinanet_swin-t-p4-w7_fpn_1x_coco.py | 31 + mmdetection/configs/timm_example/README.md | 62 + ...inanet_timm-efficientnet-b1_fpn_1x_coco.py | 23 + .../retinanet_timm-tv-resnet50_fpn_1x_coco.py | 22 + mmdetection/configs/tood/README.md | 40 + mmdetection/configs/tood/metafile.yml | 95 + .../tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py | 7 + .../configs/tood/tood_r101_fpn_ms-2x_coco.py | 7 + .../configs/tood/tood_r50_fpn_1x_coco.py | 80 + .../tood/tood_r50_fpn_anchor-based_1x_coco.py | 2 + .../configs/tood/tood_r50_fpn_ms-2x_coco.py | 30 + ...d_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py | 7 + .../tood/tood_x101-64x4d_fpn_ms-2x_coco.py | 16 + mmdetection/configs/tridentnet/README.md | 38 + mmdetection/configs/tridentnet/metafile.yml | 55 + .../tridentnet_r50-caffe_1x_coco.py | 22 + .../tridentnet_r50-caffe_ms-1x_coco.py | 15 + .../tridentnet_r50-caffe_ms-3x_coco.py | 18 + mmdetection/configs/v3det/README.md | 86 + ...r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py | 171 + ...inb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py | 27 + ...-twostage_r50_8xb4_sample1e-3_v3det_50e.py | 108 + ...wostage_swin_16xb2_sample1e-3_v3det_50e.py | 27 + ...no-4scale_r50_8xb2_sample1e-3_v3det_36e.py | 109 + ...-4scale_swin_16xb1_sample1e-3_v3det_36e.py | 27 + ...r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py | 72 + ...inb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py | 27 + ...r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py | 116 + ...inb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py | 27 + mmdetection/configs/vfnet/README.md | 48 + mmdetection/configs/vfnet/metafile.yml | 116 + .../vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py | 15 + .../configs/vfnet/vfnet_r101_fpn_1x_coco.py | 6 + .../configs/vfnet/vfnet_r101_fpn_2x_coco.py | 20 + .../vfnet/vfnet_r101_fpn_ms-2x_coco.py | 6 + .../vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py | 6 + .../configs/vfnet/vfnet_r50_fpn_1x_coco.py | 104 + .../configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py | 36 + .../vfnet/vfnet_res2net-101_fpn_ms-2x_coco.py | 16 + ..._res2net101-mdconv-c3-c5_fpn_ms-2x_coco.py | 18 + ..._x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py | 17 + .../vfnet/vfnet_x101-32x4d_fpn_ms-2x_coco.py | 15 + ..._x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py | 17 + .../vfnet/vfnet_x101-64x4d_fpn_ms-2x_coco.py | 15 + mmdetection/configs/wider_face/README.md | 57 + .../retinanet_r50_fpn_1x_widerface.py | 10 + .../wider_face/ssd300_8xb32-24e_widerface.py | 64 + mmdetection/configs/yolact/README.md | 75 + 
mmdetection/configs/yolact/metafile.yml | 81 + .../yolact/yolact_r101_1xb8-55e_coco.py | 7 + .../yolact/yolact_r50_1xb8-55e_coco.py | 170 + .../yolact/yolact_r50_8xb8-55e_coco.py | 23 + mmdetection/configs/yolo/README.md | 55 + mmdetection/configs/yolo/metafile.yml | 124 + .../yolo/yolov3_d53_8xb8-320-273e_coco.py | 29 + .../yolov3_d53_8xb8-amp-ms-608-273e_coco.py | 3 + .../yolo/yolov3_d53_8xb8-ms-416-273e_coco.py | 28 + .../yolo/yolov3_d53_8xb8-ms-608-273e_coco.py | 167 + .../yolov3_mobilenetv2_8xb24-320-300e_coco.py | 42 + ...lov3_mobilenetv2_8xb24-ms-416-300e_coco.py | 176 + mmdetection/configs/yolof/README.md | 35 + mmdetection/configs/yolof/metafile.yml | 32 + .../yolof/yolof_r50-c5_8xb8-1x_coco.py | 116 + .../yolof/yolof_r50-c5_8xb8-iter-1x_coco.py | 32 + mmdetection/configs/yolox/README.md | 39 + mmdetection/configs/yolox/metafile.yml | 70 + .../configs/yolox/yolox_l_8xb8-300e_coco.py | 8 + .../configs/yolox/yolox_m_8xb8-300e_coco.py | 8 + .../yolox/yolox_nano_8xb8-300e_coco.py | 11 + .../configs/yolox/yolox_s_8xb8-300e_coco.py | 250 ++ .../yolox/yolox_tiny_8xb8-300e_coco.py | 54 + mmdetection/configs/yolox/yolox_tta.py | 36 + .../configs/yolox/yolox_x_8xb8-300e_coco.py | 8 + mmdetection/dataset-index.yml | 18 + mmdetection/demo/create_result_gif.py | 165 + mmdetection/demo/demo_multi_model.py | 212 + mmdetection/demo/image_demo.py | 136 + mmdetection/demo/large_image_demo.py | 282 ++ mmdetection/demo/mot_demo.py | 130 + mmdetection/demo/video_demo.py | 84 + mmdetection/demo/video_gpuaccel_demo.py | 144 + mmdetection/demo/webcam_demo.py | 65 + mmdetection/docker/Dockerfile | 40 + mmdetection/docker/serve/Dockerfile | 62 + mmdetection/docker/serve/config.properties | 5 + mmdetection/docker/serve_cn/Dockerfile | 65 + mmdetection/docs/en/Makefile | 20 + .../docs/en/_static/css/readthedocs.css | 6 + .../docs/en/advanced_guides/conventions.md | 111 + .../en/advanced_guides/customize_dataset.md | 433 ++ .../en/advanced_guides/customize_losses.md | 126 + .../en/advanced_guides/customize_models.md | 412 ++ .../en/advanced_guides/customize_runtime.md | 391 ++ .../advanced_guides/customize_transforms.md | 49 + .../docs/en/advanced_guides/data_flow.md | 1 + .../docs/en/advanced_guides/datasets.md | 1 + mmdetection/docs/en/advanced_guides/engine.md | 1 + .../docs/en/advanced_guides/evaluation.md | 1 + mmdetection/docs/en/advanced_guides/how_to.md | 222 + mmdetection/docs/en/advanced_guides/index.rst | 34 + mmdetection/docs/en/advanced_guides/models.md | 1 + .../docs/en/advanced_guides/structures.md | 1 + .../docs/en/advanced_guides/transforms.md | 42 + mmdetection/docs/en/api.rst | 161 + mmdetection/docs/en/conf.py | 116 + mmdetection/docs/en/dataset_zoo.md | 1 + mmdetection/docs/en/get_started.md | 297 ++ mmdetection/docs/en/index.rst | 63 + mmdetection/docs/en/make.bat | 35 + mmdetection/docs/en/migration.md | 1 + .../migration/api_and_registry_migration.md | 1 + .../docs/en/migration/config_migration.md | 819 ++++ .../docs/en/migration/dataset_migration.md | 1 + mmdetection/docs/en/migration/migration.md | 12 + .../docs/en/migration/migration_faq.md | 1 + .../docs/en/migration/model_migration.md | 1 + mmdetection/docs/en/model_zoo.md | 358 ++ mmdetection/docs/en/notes/changelog.md | 603 +++ mmdetection/docs/en/notes/changelog_v2.x.md | 1681 +++++++ mmdetection/docs/en/notes/compatibility.md | 178 + .../docs/en/notes/contribution_guide.md | 1 + mmdetection/docs/en/notes/faq.md | 240 + mmdetection/docs/en/notes/projects.md | 57 + mmdetection/docs/en/overview.md | 54 + 
mmdetection/docs/en/stat.py | 64 + mmdetection/docs/en/switch_language.md | 3 + mmdetection/docs/en/user_guides/config.md | 612 +++ .../docs/en/user_guides/dataset_prepare.md | 310 ++ mmdetection/docs/en/user_guides/deploy.md | 173 + mmdetection/docs/en/user_guides/finetune.md | 96 + mmdetection/docs/en/user_guides/index.rst | 41 + mmdetection/docs/en/user_guides/inference.md | 440 ++ mmdetection/docs/en/user_guides/init_cfg.md | 161 + .../docs/en/user_guides/label_studio.md | 256 ++ mmdetection/docs/en/user_guides/new_model.md | 290 ++ .../en/user_guides/robustness_benchmarking.md | 110 + mmdetection/docs/en/user_guides/semi_det.md | 325 ++ .../en/user_guides/single_stage_as_rpn.md | 176 + mmdetection/docs/en/user_guides/test.md | 303 ++ .../en/user_guides/test_results_submission.md | 182 + .../en/user_guides/tracking_analysis_tools.md | 86 + .../docs/en/user_guides/tracking_config.md | 112 + .../user_guides/tracking_dataset_prepare.md | 247 ++ .../docs/en/user_guides/tracking_inference.md | 55 + .../en/user_guides/tracking_train_test.md | 229 + .../en/user_guides/tracking_visualization.md | 47 + mmdetection/docs/en/user_guides/train.md | 456 ++ .../docs/en/user_guides/useful_hooks.md | 105 + .../docs/en/user_guides/useful_tools.md | 660 +++ .../docs/en/user_guides/visualization.md | 91 + mmdetection/docs/zh_cn/Makefile | 20 + .../docs/zh_cn/_static/css/readthedocs.css | 6 + .../docs/zh_cn/advanced_guides/conventions.md | 109 + .../advanced_guides/customize_dataset.md | 425 ++ .../zh_cn/advanced_guides/customize_losses.md | 125 + .../zh_cn/advanced_guides/customize_models.md | 412 ++ .../advanced_guides/customize_runtime.md | 387 ++ .../advanced_guides/customize_transforms.md | 47 + .../docs/zh_cn/advanced_guides/data_flow.md | 1 + .../docs/zh_cn/advanced_guides/datasets.md | 1 + .../docs/zh_cn/advanced_guides/engine.md | 1 + .../docs/zh_cn/advanced_guides/evaluation.md | 1 + .../docs/zh_cn/advanced_guides/how_to.md | 220 + .../docs/zh_cn/advanced_guides/index.rst | 34 + .../docs/zh_cn/advanced_guides/models.md | 1 + .../docs/zh_cn/advanced_guides/structures.md | 1 + .../docs/zh_cn/advanced_guides/transforms.md | 43 + mmdetection/docs/zh_cn/api.rst | 161 + mmdetection/docs/zh_cn/article.md | 53 + mmdetection/docs/zh_cn/conf.py | 118 + mmdetection/docs/zh_cn/get_started.md | 230 + mmdetection/docs/zh_cn/index.rst | 67 + mmdetection/docs/zh_cn/make.bat | 35 + .../migration/api_and_registry_migration.md | 1 + .../docs/zh_cn/migration/config_migration.md | 814 ++++ .../docs/zh_cn/migration/dataset_migration.md | 1 + mmdetection/docs/zh_cn/migration/migration.md | 12 + .../docs/zh_cn/migration/migration_faq.md | 1 + .../docs/zh_cn/migration/model_migration.md | 1 + mmdetection/docs/zh_cn/model_zoo.md | 333 ++ mmdetection/docs/zh_cn/notes/compatibility.md | 177 + mmdetection/docs/zh_cn/notes/faq.md | 259 ++ mmdetection/docs/zh_cn/notes/projects.md | 48 + mmdetection/docs/zh_cn/overview.md | 54 + mmdetection/docs/zh_cn/stat.py | 64 + mmdetection/docs/zh_cn/switch_language.md | 3 + mmdetection/docs/zh_cn/user_guides/config.md | 589 +++ .../docs/zh_cn/user_guides/dataset_prepare.md | 307 ++ mmdetection/docs/zh_cn/user_guides/deploy.md | 174 + .../docs/zh_cn/user_guides/finetune.md | 96 + mmdetection/docs/zh_cn/user_guides/index.rst | 34 + .../docs/zh_cn/user_guides/inference.md | 438 ++ .../docs/zh_cn/user_guides/init_cfg.md | 161 + .../docs/zh_cn/user_guides/label_studio.md | 255 ++ .../docs/zh_cn/user_guides/new_model.md | 289 ++ .../user_guides/robustness_benchmarking.md | 109 + 
.../docs/zh_cn/user_guides/semi_det.md | 320 ++ .../zh_cn/user_guides/single_stage_as_rpn.md | 171 + mmdetection/docs/zh_cn/user_guides/test.md | 285 ++ .../user_guides/test_results_submission.md | 174 + .../user_guides/tracking_analysis_tools.md | 87 + .../docs/zh_cn/user_guides/tracking_config.md | 109 + .../user_guides/tracking_dataset_prepare.md | 245 ++ .../user_guides/tracking_interference.md | 55 + .../user_guides/tracking_train_test_zh_cn.md | 229 + .../user_guides/tracking_visualization.md | 51 + mmdetection/docs/zh_cn/user_guides/train.md | 451 ++ .../docs/zh_cn/user_guides/useful_hooks.md | 107 + .../docs/zh_cn/user_guides/useful_tools.md | 636 +++ .../docs/zh_cn/user_guides/visualization.md | 93 + mmdetection/mmdet/__init__.py | 27 + mmdetection/mmdet/apis/__init__.py | 9 + mmdetection/mmdet/apis/det_inferencer.py | 644 +++ mmdetection/mmdet/apis/inference.py | 372 ++ .../configs/_base_/datasets/coco_detection.py | 104 + .../configs/_base_/datasets/coco_instance.py | 106 + .../_base_/datasets/coco_instance_semantic.py | 87 + .../configs/_base_/datasets/coco_panoptic.py | 105 + .../configs/_base_/datasets/mot_challenge.py | 101 + .../mmdet/configs/_base_/default_runtime.py | 33 + .../models/cascade_mask_rcnn_r50_fpn.py | 220 + .../_base_/models/cascade_rcnn_r50_fpn.py | 201 + .../_base_/models/faster_rcnn_r50_fpn.py | 138 + .../_base_/models/mask_rcnn_r50_caffe_c4.py | 158 + .../_base_/models/mask_rcnn_r50_fpn.py | 154 + .../_base_/models/retinanet_r50_fpn.py | 77 + .../configs/_base_/schedules/schedule_1x.py | 33 + .../configs/_base_/schedules/schedule_2x.py | 33 + .../cascade_mask_rcnn_r50_fpn_1x_coco.py | 13 + .../cascade_rcnn_r50_fpn_1x_coco.py | 13 + .../configs/common/lsj_100e_coco_detection.py | 134 + .../configs/common/lsj_100e_coco_instance.py | 134 + .../configs/common/lsj_200e_coco_detection.py | 25 + .../configs/common/lsj_200e_coco_instance.py | 25 + .../mmdet/configs/common/ms_3x_coco.py | 130 + .../configs/common/ms_3x_coco_instance.py | 136 + .../mmdet/configs/common/ms_90k_coco.py | 151 + .../common/ms_poly_3x_coco_instance.py | 138 + .../common/ms_poly_90k_coco_instance.py | 153 + .../configs/common/ssj_270_coco_instance.py | 158 + .../common/ssj_scp_270k_coco_instance.py | 70 + .../deformable_detr_r50_16xb2_50e_coco.py | 186 + ...formable_detr_refine_r50_16xb2_50e_coco.py | 12 + ...detr_refine_twostage_r50_16xb2_50e_coco.py | 12 + .../configs/detr/detr_r101_8xb2_500e_coco.py | 13 + .../configs/detr/detr_r18_8xb2_500e_coco.py | 14 + .../configs/detr/detr_r50_8xb2_150e_coco.py | 182 + .../configs/detr/detr_r50_8xb2_500e_coco.py | 25 + .../dino/dino_4scale_r50_8xb2_12e_coco.py | 190 + .../dino/dino_4scale_r50_8xb2_24e_coco.py | 12 + .../dino/dino_4scale_r50_8xb2_36e_coco.py | 12 + .../dino_4scale_r50_improved_8xb2_12e_coco.py | 24 + .../dino/dino_5scale_swin_l_8xb2_12e_coco.py | 40 + .../dino/dino_5scale_swin_l_8xb2_36e_coco.py | 12 + .../faster_rcnn_r50_fpn_1x_coco.py | 13 + .../mask_rcnn_r101_caffe_fpn_1x_coco.py | 19 + ...ask_rcnn_r101_caffe_fpn_ms_poly_3x_coco.py | 28 + .../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py | 18 + .../mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py | 18 + ...sk_rcnn_r101_fpn_8xb8_amp_lsj_200e_coco.py | 18 + .../mask_rcnn_r101_fpn_ms_poly_3x_coco.py | 19 + ...ask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco.py | 19 + .../mask_rcnn_r50_caffe_c4_1x_coco.py | 13 + .../mask_rcnn_r50_caffe_fpn_1x_coco.py | 25 + .../mask_rcnn_r50_caffe_fpn_ms_1x_coco.py | 40 + ...mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco.py | 40 + ...mask_rcnn_r50_caffe_fpn_ms_poly_2x_coco.py | 
23 + ...mask_rcnn_r50_caffe_fpn_ms_poly_3x_coco.py | 23 + ...mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py | 40 + .../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py | 13 + .../mask_rcnn_r50_fpn_1x_wandb_coco.py | 31 + .../mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py | 13 + ...ask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco.py | 1 + .../mask_rcnn_r50_fpn_amp_1x_coco.py | 14 + .../mask_rcnn_r50_fpn_ms_poly_-3x_coco.py | 11 + .../mask_rcnn_r50_fpn_poly_1x_coco.py | 23 + .../mask_rcnn_x101_32x4d_fpn_1x_coco.py | 28 + .../mask_rcnn_x101_32x4d_fpn_2x_coco.py | 28 + ...ask_rcnn_x101_32x4d_fpn_ms_poly_3x_coco.py | 29 + .../mask_rcnn_x101_32x8d_fpn_1x_coco.py | 31 + ...ask_rcnn_x101_32x8d_fpn_ms_poly_1x_coco.py | 54 + ...ask_rcnn_x101_32x8d_fpn_ms_poly_3x_coco.py | 34 + .../mask_rcnn_x101_64_4d_fpn_1x_coco.py | 24 + .../mask_rcnn_x101_64x4d_fpn_2x_coco.py | 24 + ...ask_rcnn_x101_64x4d_fpn_ms_poly_3x_coco.py | 27 + .../maskformer_r50_ms_16xb1_75e_coco.py | 249 ++ ...former_swin_l_p4_w12_64xb1_ms_300e_coco.py | 82 + .../panoptic_fpn_r50_fpn_1x_coco.py | 64 + .../qdtrack_faster_rcnn_r50_fpn_4e_base.py | 141 + ...xb2-4e_mot17halftrain_test-mot17halfval.py | 14 + .../retinanet/retinanet_r50_fpn_1x_coco.py | 20 + .../mmdet/configs/retinanet/retinanet_tta.py | 31 + .../rtmdet/rtmdet_ins_l_8xb32_300e_coco.py | 134 + .../rtmdet/rtmdet_ins_m_8xb32_300e_coco.py | 17 + .../rtmdet/rtmdet_ins_s_8xb32_300e_coco.py | 101 + .../rtmdet/rtmdet_ins_tiny_8xb32_300e_coco.py | 67 + .../rtmdet/rtmdet_ins_x_8xb16_300e_coco.py | 38 + .../rtmdet/rtmdet_l_8xb32_300e_coco.py | 220 + .../rtmdet/rtmdet_m_8xb32_300e_coco.py | 17 + .../rtmdet/rtmdet_s_8xb32_300e_coco.py | 88 + .../rtmdet/rtmdet_tiny_8xb32_300e_coco.py | 64 + .../mmdet/configs/rtmdet/rtmdet_tta.py | 43 + .../rtmdet/rtmdet_x_8xb32_300e_coco.py | 17 + mmdetection/mmdet/datasets/__init__.py | 46 + mmdetection/mmdet/datasets/ade20k.py | 260 ++ .../mmdet/datasets/api_wrappers/__init__.py | 5 + .../mmdet/datasets/api_wrappers/coco_api.py | 137 + .../datasets/api_wrappers/cocoeval_mp.py | 296 ++ .../mmdet/datasets/base_det_dataset.py | 124 + .../mmdet/datasets/base_semseg_dataset.py | 265 ++ .../mmdet/datasets/base_video_dataset.py | 304 ++ mmdetection/mmdet/datasets/cityscapes.py | 61 + mmdetection/mmdet/datasets/coco.py | 200 + mmdetection/mmdet/datasets/coco_caption.py | 32 + mmdetection/mmdet/datasets/coco_panoptic.py | 292 ++ mmdetection/mmdet/datasets/coco_semantic.py | 90 + mmdetection/mmdet/datasets/crowdhuman.py | 159 + .../mmdet/datasets/dataset_wrappers.py | 252 ++ mmdetection/mmdet/datasets/deepfashion.py | 19 + mmdetection/mmdet/datasets/dsdl.py | 192 + mmdetection/mmdet/datasets/isaid.py | 25 + mmdetection/mmdet/datasets/lvis.py | 638 +++ .../mmdet/datasets/mot_challenge_dataset.py | 88 + mmdetection/mmdet/datasets/objects365.py | 284 ++ mmdetection/mmdet/datasets/openimages.py | 484 +++ mmdetection/mmdet/datasets/recycle.py | 171 + mmdetection/mmdet/datasets/refcoco.py | 163 + mmdetection/mmdet/datasets/reid_dataset.py | 127 + .../mmdet/datasets/samplers/__init__.py | 15 + .../mmdet/datasets/samplers/batch_sampler.py | 193 + .../datasets/samplers/class_aware_sampler.py | 192 + .../datasets/samplers/multi_data_sampler.py | 110 + .../datasets/samplers/multi_source_sampler.py | 214 + .../datasets/samplers/track_img_sampler.py | 146 + .../mmdet/datasets/transforms/__init__.py | 43 + .../datasets/transforms/augment_wrappers.py | 264 ++ .../mmdet/datasets/transforms/colorspace.py | 493 +++ .../mmdet/datasets/transforms/formatting.py | 512 +++ .../datasets/transforms/frame_sampling.py | 
177 + .../mmdet/datasets/transforms/geometric.py | 754 ++++ .../mmdet/datasets/transforms/instaboost.py | 150 + .../mmdet/datasets/transforms/loading.py | 1074 +++++ .../datasets/transforms/transformers_glip.py | 66 + .../mmdet/datasets/transforms/transforms.py | 3854 +++++++++++++++++ .../mmdet/datasets/transforms/wrappers.py | 277 ++ mmdetection/mmdet/datasets/utils.py | 48 + mmdetection/mmdet/datasets/v3det.py | 32 + mmdetection/mmdet/datasets/voc.py | 31 + mmdetection/mmdet/datasets/wider_face.py | 90 + mmdetection/mmdet/datasets/xml_style.py | 186 + .../mmdet/datasets/youtube_vis_dataset.py | 52 + mmdetection/mmdet/engine/__init__.py | 5 + mmdetection/mmdet/engine/hooks/__init__.py | 20 + .../mmdet/engine/hooks/checkloss_hook.py | 42 + .../mmdet/engine/hooks/mean_teacher_hook.py | 87 + .../engine/hooks/memory_profiler_hook.py | 121 + .../engine/hooks/num_class_check_hook.py | 68 + .../engine/hooks/pipeline_switch_hook.py | 43 + .../mmdet/engine/hooks/set_epoch_info_hook.py | 17 + .../mmdet/engine/hooks/submission_hook.py | 83 + .../mmdet/engine/hooks/sync_norm_hook.py | 37 + mmdetection/mmdet/engine/hooks/utils.py | 19 + .../mmdet/engine/hooks/visualization_hook.py | 312 ++ .../engine/hooks/yolox_mode_switch_hook.py | 66 + .../mmdet/engine/optimizers/__init__.py | 5 + .../layer_decay_optimizer_constructor.py | 158 + mmdetection/mmdet/engine/runner/__init__.py | 4 + mmdetection/mmdet/engine/runner/loops.py | 38 + .../mmdet/engine/schedulers/__init__.py | 8 + .../engine/schedulers/quadratic_warmup.py | 131 + mmdetection/mmdet/evaluation/__init__.py | 3 + .../mmdet/evaluation/functional/__init__.py | 26 + .../evaluation/functional/bbox_overlaps.py | 65 + .../evaluation/functional/cityscapes_utils.py | 302 ++ .../evaluation/functional/class_names.py | 517 +++ .../mmdet/evaluation/functional/mean_ap.py | 792 ++++ .../evaluation/functional/panoptic_utils.py | 228 + .../mmdet/evaluation/functional/recall.py | 199 + .../mmdet/evaluation/functional/ytvis.py | 305 ++ .../mmdet/evaluation/functional/ytviseval.py | 623 +++ .../mmdet/evaluation/metrics/__init__.py | 27 + .../evaluation/metrics/base_video_metric.py | 173 + .../evaluation/metrics/cityscapes_metric.py | 205 + .../evaluation/metrics/coco_caption_metric.py | 135 + .../mmdet/evaluation/metrics/coco_metric.py | 597 +++ .../metrics/coco_occluded_metric.py | 204 + .../metrics/coco_panoptic_metric.py | 618 +++ .../evaluation/metrics/coco_video_metric.py | 80 + .../evaluation/metrics/crowdhuman_metric.py | 824 ++++ .../evaluation/metrics/dump_det_results.py | 47 + .../metrics/dump_proposals_metric.py | 119 + .../mmdet/evaluation/metrics/lvis_metric.py | 364 ++ .../metrics/mot_challenge_metric.py | 443 ++ .../evaluation/metrics/openimages_metric.py | 237 + .../mmdet/evaluation/metrics/refseg_metric.py | 63 + .../mmdet/evaluation/metrics/reid_metric.py | 138 + .../mmdet/evaluation/metrics/semseg_metric.py | 279 ++ .../mmdet/evaluation/metrics/voc_metric.py | 176 + .../evaluation/metrics/youtube_vis_metric.py | 426 ++ mmdetection/mmdet/models/__init__.py | 18 + .../mmdet/models/backbones/__init__.py | 27 + .../mmdet/models/backbones/csp_darknet.py | 286 ++ mmdetection/mmdet/models/backbones/cspnext.py | 195 + mmdetection/mmdet/models/backbones/darknet.py | 213 + .../models/backbones/detectors_resnet.py | 353 ++ .../models/backbones/detectors_resnext.py | 123 + .../mmdet/models/backbones/efficientnet.py | 418 ++ .../mmdet/models/backbones/hourglass.py | 225 + mmdetection/mmdet/models/backbones/hrnet.py | 589 +++ 
.../mmdet/models/backbones/mobilenet_v2.py | 198 + mmdetection/mmdet/models/backbones/pvt.py | 665 +++ mmdetection/mmdet/models/backbones/regnet.py | 356 ++ mmdetection/mmdet/models/backbones/res2net.py | 327 ++ mmdetection/mmdet/models/backbones/resnest.py | 322 ++ mmdetection/mmdet/models/backbones/resnet.py | 672 +++ mmdetection/mmdet/models/backbones/resnext.py | 154 + mmdetection/mmdet/models/backbones/ssd_vgg.py | 128 + mmdetection/mmdet/models/backbones/swin.py | 819 ++++ .../mmdet/models/backbones/trident_resnet.py | 298 ++ .../models/data_preprocessors/__init__.py | 13 + .../data_preprocessors/data_preprocessor.py | 793 ++++ .../reid_data_preprocessor.py | 216 + .../track_data_preprocessor.py | 266 ++ .../mmdet/models/dense_heads/__init__.py | 72 + .../models/dense_heads/anchor_free_head.py | 317 ++ .../mmdet/models/dense_heads/anchor_head.py | 530 +++ .../mmdet/models/dense_heads/atss_head.py | 524 +++ .../models/dense_heads/atss_vlfusion_head.py | 949 ++++ .../models/dense_heads/autoassign_head.py | 524 +++ .../models/dense_heads/base_dense_head.py | 583 +++ .../models/dense_heads/base_mask_head.py | 128 + .../mmdet/models/dense_heads/boxinst_head.py | 252 ++ .../models/dense_heads/cascade_rpn_head.py | 1110 +++++ .../models/dense_heads/centernet_head.py | 447 ++ .../dense_heads/centernet_update_head.py | 624 +++ .../models/dense_heads/centripetal_head.py | 459 ++ .../mmdet/models/dense_heads/condinst_head.py | 1226 ++++++ .../dense_heads/conditional_detr_head.py | 168 + .../mmdet/models/dense_heads/corner_head.py | 1084 +++++ .../mmdet/models/dense_heads/dab_detr_head.py | 106 + .../mmdet/models/dense_heads/ddod_head.py | 794 ++++ .../mmdet/models/dense_heads/ddq_detr_head.py | 550 +++ .../dense_heads/deformable_detr_head.py | 329 ++ .../models/dense_heads/dense_test_mixins.py | 215 + .../mmdet/models/dense_heads/detr_head.py | 634 +++ .../mmdet/models/dense_heads/dino_head.py | 479 ++ .../models/dense_heads/embedding_rpn_head.py | 132 + .../mmdet/models/dense_heads/fcos_head.py | 476 ++ .../mmdet/models/dense_heads/fovea_head.py | 509 +++ .../dense_heads/free_anchor_retina_head.py | 312 ++ .../mmdet/models/dense_heads/fsaf_head.py | 458 ++ .../models/dense_heads/ga_retina_head.py | 120 + .../mmdet/models/dense_heads/ga_rpn_head.py | 222 + .../mmdet/models/dense_heads/gfl_head.py | 667 +++ .../models/dense_heads/grounding_dino_head.py | 767 ++++ .../models/dense_heads/guided_anchor_head.py | 994 +++++ .../mmdet/models/dense_heads/lad_head.py | 226 + .../mmdet/models/dense_heads/ld_head.py | 257 ++ .../models/dense_heads/mask2former_head.py | 459 ++ .../models/dense_heads/maskformer_head.py | 601 +++ .../mmdet/models/dense_heads/nasfcos_head.py | 114 + .../mmdet/models/dense_heads/paa_head.py | 730 ++++ .../models/dense_heads/pisa_retinanet_head.py | 154 + .../mmdet/models/dense_heads/pisa_ssd_head.py | 182 + .../models/dense_heads/reppoints_head.py | 885 ++++ .../mmdet/models/dense_heads/retina_head.py | 120 + .../models/dense_heads/retina_sepbn_head.py | 127 + .../mmdet/models/dense_heads/rpn_head.py | 302 ++ .../mmdet/models/dense_heads/rtmdet_head.py | 692 +++ .../models/dense_heads/rtmdet_ins_head.py | 1034 +++++ .../models/dense_heads/sabl_retina_head.py | 706 +++ .../mmdet/models/dense_heads/solo_head.py | 1263 ++++++ .../mmdet/models/dense_heads/solov2_head.py | 799 ++++ .../mmdet/models/dense_heads/ssd_head.py | 362 ++ .../mmdet/models/dense_heads/tood_head.py | 805 ++++ .../mmdet/models/dense_heads/vfnet_head.py | 722 +++ .../mmdet/models/dense_heads/yolact_head.py | 
1193 +++++ .../mmdet/models/dense_heads/yolo_head.py | 527 +++ .../mmdet/models/dense_heads/yolof_head.py | 399 ++ .../mmdet/models/dense_heads/yolox_head.py | 618 +++ .../mmdet/models/detectors/__init__.py | 75 + mmdetection/mmdet/models/detectors/atss.py | 41 + .../mmdet/models/detectors/autoassign.py | 43 + mmdetection/mmdet/models/detectors/base.py | 156 + .../mmdet/models/detectors/base_detr.py | 332 ++ mmdetection/mmdet/models/detectors/boxinst.py | 28 + .../mmdet/models/detectors/cascade_rcnn.py | 29 + .../mmdet/models/detectors/centernet.py | 29 + .../mmdet/models/detectors/condinst.py | 28 + .../models/detectors/conditional_detr.py | 74 + .../mmdet/models/detectors/cornernet.py | 30 + .../mmdet/models/detectors/crowddet.py | 45 + .../mmdet/models/detectors/d2_wrapper.py | 291 ++ .../mmdet/models/detectors/dab_detr.py | 139 + mmdetection/mmdet/models/detectors/ddod.py | 41 + .../mmdet/models/detectors/ddq_detr.py | 274 ++ .../mmdet/models/detectors/deformable_detr.py | 572 +++ mmdetection/mmdet/models/detectors/detr.py | 225 + mmdetection/mmdet/models/detectors/dino.py | 287 ++ .../mmdet/models/detectors/fast_rcnn.py | 26 + .../mmdet/models/detectors/faster_rcnn.py | 28 + mmdetection/mmdet/models/detectors/fcos.py | 42 + mmdetection/mmdet/models/detectors/fovea.py | 41 + mmdetection/mmdet/models/detectors/fsaf.py | 26 + mmdetection/mmdet/models/detectors/gfl.py | 41 + mmdetection/mmdet/models/detectors/glip.py | 403 ++ .../mmdet/models/detectors/grid_rcnn.py | 33 + .../mmdet/models/detectors/grounding_dino.py | 384 ++ mmdetection/mmdet/models/detectors/htc.py | 16 + .../mmdet/models/detectors/kd_one_stage.py | 122 + mmdetection/mmdet/models/detectors/lad.py | 93 + .../mmdet/models/detectors/mask2former.py | 30 + .../mmdet/models/detectors/mask_rcnn.py | 30 + .../models/detectors/mask_scoring_rcnn.py | 31 + .../mmdet/models/detectors/maskformer.py | 170 + mmdetection/mmdet/models/detectors/nasfcos.py | 43 + mmdetection/mmdet/models/detectors/paa.py | 41 + .../mmdet/models/detectors/panoptic_fpn.py | 35 + .../detectors/panoptic_two_stage_segmentor.py | 234 + .../mmdet/models/detectors/point_rend.py | 35 + .../mmdet/models/detectors/queryinst.py | 29 + .../models/detectors/reppoints_detector.py | 30 + .../mmdet/models/detectors/retinanet.py | 26 + mmdetection/mmdet/models/detectors/rpn.py | 81 + mmdetection/mmdet/models/detectors/rtmdet.py | 52 + mmdetection/mmdet/models/detectors/scnet.py | 11 + .../mmdet/models/detectors/semi_base.py | 266 ++ .../mmdet/models/detectors/single_stage.py | 149 + .../detectors/single_stage_instance_seg.py | 180 + .../mmdet/models/detectors/soft_teacher.py | 378 ++ mmdetection/mmdet/models/detectors/solo.py | 31 + mmdetection/mmdet/models/detectors/solov2.py | 31 + .../mmdet/models/detectors/sparse_rcnn.py | 31 + mmdetection/mmdet/models/detectors/tood.py | 42 + .../models/detectors/trident_faster_rcnn.py | 81 + .../mmdet/models/detectors/two_stage.py | 243 ++ mmdetection/mmdet/models/detectors/vfnet.py | 42 + mmdetection/mmdet/models/detectors/yolact.py | 28 + mmdetection/mmdet/models/detectors/yolo.py | 45 + mmdetection/mmdet/models/detectors/yolof.py | 43 + mmdetection/mmdet/models/detectors/yolox.py | 43 + .../mmdet/models/language_models/__init__.py | 4 + .../mmdet/models/language_models/bert.py | 231 + mmdetection/mmdet/models/layers/__init__.py | 65 + .../mmdet/models/layers/activations.py | 22 + mmdetection/mmdet/models/layers/bbox_nms.py | 184 + .../mmdet/models/layers/brick_wrappers.py | 138 + .../mmdet/models/layers/conv_upsample.py | 67 + 
mmdetection/mmdet/models/layers/csp_layer.py | 246 ++ mmdetection/mmdet/models/layers/dropblock.py | 86 + mmdetection/mmdet/models/layers/ema.py | 66 + .../mmdet/models/layers/inverted_residual.py | 130 + mmdetection/mmdet/models/layers/matrix_nms.py | 121 + .../layers/msdeformattn_pixel_decoder.py | 246 ++ .../mmdet/models/layers/normed_predictor.py | 99 + .../mmdet/models/layers/pixel_decoder.py | 249 ++ .../models/layers/positional_encoding.py | 269 ++ mmdetection/mmdet/models/layers/res_layer.py | 195 + mmdetection/mmdet/models/layers/se_layer.py | 162 + .../models/layers/transformer/__init__.py | 41 + .../transformer/conditional_detr_layers.py | 170 + .../layers/transformer/dab_detr_layers.py | 298 ++ .../layers/transformer/ddq_detr_layers.py | 223 + .../transformer/deformable_detr_layers.py | 265 ++ .../models/layers/transformer/detr_layers.py | 374 ++ .../models/layers/transformer/dino_layers.py | 562 +++ .../transformer/grounding_dino_layers.py | 270 ++ .../layers/transformer/mask2former_layers.py | 135 + .../mmdet/models/layers/transformer/utils.py | 915 ++++ mmdetection/mmdet/models/losses/__init__.py | 42 + mmdetection/mmdet/models/losses/accuracy.py | 77 + mmdetection/mmdet/models/losses/ae_loss.py | 101 + .../mmdet/models/losses/balanced_l1_loss.py | 122 + .../mmdet/models/losses/cross_entropy_loss.py | 401 ++ .../mmdet/models/losses/ddq_detr_aux_loss.py | 303 ++ mmdetection/mmdet/models/losses/dice_loss.py | 146 + mmdetection/mmdet/models/losses/eqlv2_loss.py | 173 + mmdetection/mmdet/models/losses/focal_loss.py | 371 ++ .../models/losses/gaussian_focal_loss.py | 186 + .../mmdet/models/losses/gfocal_loss.py | 295 ++ mmdetection/mmdet/models/losses/ghm_loss.py | 213 + mmdetection/mmdet/models/losses/iou_loss.py | 926 ++++ mmdetection/mmdet/models/losses/kd_loss.py | 95 + mmdetection/mmdet/models/losses/l2_loss.py | 139 + .../mmdet/models/losses/margin_loss.py | 152 + mmdetection/mmdet/models/losses/mse_loss.py | 69 + .../losses/multipos_cross_entropy_loss.py | 100 + mmdetection/mmdet/models/losses/pisa_loss.py | 187 + .../mmdet/models/losses/seesaw_loss.py | 278 ++ .../mmdet/models/losses/smooth_l1_loss.py | 165 + .../mmdet/models/losses/triplet_loss.py | 88 + mmdetection/mmdet/models/losses/utils.py | 125 + .../mmdet/models/losses/varifocal_loss.py | 141 + mmdetection/mmdet/models/mot/__init__.py | 11 + mmdetection/mmdet/models/mot/base.py | 147 + mmdetection/mmdet/models/mot/bytetrack.py | 94 + mmdetection/mmdet/models/mot/deep_sort.py | 110 + mmdetection/mmdet/models/mot/ocsort.py | 82 + mmdetection/mmdet/models/mot/qdtrack.py | 186 + mmdetection/mmdet/models/mot/strongsort.py | 129 + mmdetection/mmdet/models/necks/__init__.py | 27 + mmdetection/mmdet/models/necks/bfp.py | 111 + .../mmdet/models/necks/channel_mapper.py | 112 + .../mmdet/models/necks/cspnext_pafpn.py | 170 + .../mmdet/models/necks/ct_resnet_neck.py | 102 + .../mmdet/models/necks/dilated_encoder.py | 109 + mmdetection/mmdet/models/necks/dyhead.py | 173 + mmdetection/mmdet/models/necks/fpg.py | 406 ++ mmdetection/mmdet/models/necks/fpn.py | 221 + mmdetection/mmdet/models/necks/fpn_carafe.py | 275 ++ .../mmdet/models/necks/fpn_dropblock.py | 90 + mmdetection/mmdet/models/necks/hrfpn.py | 100 + mmdetection/mmdet/models/necks/nas_fpn.py | 171 + mmdetection/mmdet/models/necks/nasfcos_fpn.py | 170 + mmdetection/mmdet/models/necks/pafpn.py | 157 + mmdetection/mmdet/models/necks/rfp.py | 134 + mmdetection/mmdet/models/necks/ssd_neck.py | 129 + mmdetection/mmdet/models/necks/ssh.py | 216 + 
mmdetection/mmdet/models/necks/yolo_neck.py | 145 + mmdetection/mmdet/models/necks/yolox_pafpn.py | 156 + mmdetection/mmdet/models/reid/__init__.py | 7 + mmdetection/mmdet/models/reid/base_reid.py | 65 + mmdetection/mmdet/models/reid/fc_module.py | 71 + mmdetection/mmdet/models/reid/gap.py | 40 + .../mmdet/models/reid/linear_reid_head.py | 202 + .../mmdet/models/roi_heads/__init__.py | 38 + .../mmdet/models/roi_heads/base_roi_head.py | 129 + .../models/roi_heads/bbox_heads/__init__.py | 15 + .../models/roi_heads/bbox_heads/bbox_head.py | 708 +++ .../roi_heads/bbox_heads/convfc_bbox_head.py | 249 ++ .../models/roi_heads/bbox_heads/dii_head.py | 422 ++ .../roi_heads/bbox_heads/double_bbox_head.py | 199 + .../bbox_heads/multi_instance_bbox_head.py | 626 +++ .../models/roi_heads/bbox_heads/sabl_head.py | 684 +++ .../roi_heads/bbox_heads/scnet_bbox_head.py | 101 + .../models/roi_heads/cascade_roi_head.py | 568 +++ .../mmdet/models/roi_heads/double_roi_head.py | 53 + .../models/roi_heads/dynamic_roi_head.py | 163 + .../mmdet/models/roi_heads/grid_roi_head.py | 280 ++ .../mmdet/models/roi_heads/htc_roi_head.py | 581 +++ .../models/roi_heads/mask_heads/__init__.py | 20 + .../roi_heads/mask_heads/coarse_mask_head.py | 110 + .../roi_heads/mask_heads/dynamic_mask_head.py | 166 + .../roi_heads/mask_heads/fcn_mask_head.py | 474 ++ .../mask_heads/feature_relay_head.py | 68 + .../mask_heads/fused_semantic_head.py | 144 + .../mask_heads/global_context_head.py | 127 + .../models/roi_heads/mask_heads/grid_head.py | 490 +++ .../roi_heads/mask_heads/htc_mask_head.py | 65 + .../roi_heads/mask_heads/mask_point_head.py | 284 ++ .../roi_heads/mask_heads/maskiou_head.py | 277 ++ .../roi_heads/mask_heads/scnet_mask_head.py | 28 + .../mask_heads/scnet_semantic_head.py | 28 + .../models/roi_heads/mask_scoring_roi_head.py | 208 + .../roi_heads/multi_instance_roi_head.py | 226 + .../mmdet/models/roi_heads/pisa_roi_head.py | 148 + .../models/roi_heads/point_rend_roi_head.py | 236 + .../roi_heads/roi_extractors/__init__.py | 6 + .../roi_extractors/base_roi_extractor.py | 111 + .../roi_extractors/generic_roi_extractor.py | 102 + .../single_level_roi_extractor.py | 119 + .../mmdet/models/roi_heads/scnet_roi_head.py | 677 +++ .../models/roi_heads/shared_heads/__init__.py | 4 + .../roi_heads/shared_heads/res_layer.py | 79 + .../mmdet/models/roi_heads/sparse_roi_head.py | 601 +++ .../models/roi_heads/standard_roi_head.py | 419 ++ .../mmdet/models/roi_heads/test_mixins.py | 171 + .../models/roi_heads/trident_roi_head.py | 112 + .../mmdet/models/seg_heads/__init__.py | 3 + .../models/seg_heads/base_semantic_head.py | 113 + .../models/seg_heads/panoptic_fpn_head.py | 174 + .../panoptic_fusion_heads/__init__.py | 5 + .../base_panoptic_fusion_head.py | 43 + .../heuristic_fusion_head.py | 159 + .../maskformer_fusion_head.py | 266 ++ .../mmdet/models/task_modules/__init__.py | 18 + .../models/task_modules/assigners/__init__.py | 32 + .../assigners/approx_max_iou_assigner.py | 162 + .../task_modules/assigners/assign_result.py | 198 + .../task_modules/assigners/atss_assigner.py | 254 ++ .../task_modules/assigners/base_assigner.py | 17 + .../assigners/center_region_assigner.py | 366 ++ .../assigners/dynamic_soft_label_assigner.py | 227 + .../task_modules/assigners/grid_assigner.py | 177 + .../assigners/hungarian_assigner.py | 145 + .../assigners/iou2d_calculator.py | 88 + .../task_modules/assigners/match_cost.py | 525 +++ .../assigners/max_iou_assigner.py | 325 ++ .../assigners/multi_instance_assigner.py | 140 + 
.../task_modules/assigners/point_assigner.py | 155 + .../task_modules/assigners/region_assigner.py | 239 + .../assigners/sim_ota_assigner.py | 223 + .../assigners/task_aligned_assigner.py | 158 + .../assigners/topk_hungarian_assigner.py | 182 + .../assigners/uniform_assigner.py | 173 + .../mmdet/models/task_modules/builder.py | 62 + .../models/task_modules/coders/__init__.py | 16 + .../task_modules/coders/base_bbox_coder.py | 26 + .../coders/bucketing_bbox_coder.py | 366 ++ .../coders/delta_xywh_bbox_coder.py | 579 +++ .../coders/distance_point_bbox_coder.py | 85 + .../coders/legacy_delta_xywh_bbox_coder.py | 235 + .../task_modules/coders/pseudo_bbox_coder.py | 29 + .../task_modules/coders/tblr_bbox_coder.py | 228 + .../task_modules/coders/yolo_bbox_coder.py | 94 + .../task_modules/prior_generators/__init__.py | 11 + .../prior_generators/anchor_generator.py | 848 ++++ .../prior_generators/point_generator.py | 321 ++ .../task_modules/prior_generators/utils.py | 70 + .../models/task_modules/samplers/__init__.py | 22 + .../task_modules/samplers/base_sampler.py | 136 + .../task_modules/samplers/combined_sampler.py | 21 + .../samplers/instance_balanced_pos_sampler.py | 56 + .../samplers/iou_balanced_neg_sampler.py | 158 + .../samplers/mask_pseudo_sampler.py | 60 + .../samplers/mask_sampling_result.py | 68 + .../samplers/multi_instance_random_sampler.py | 130 + .../multi_instance_sampling_result.py | 56 + .../task_modules/samplers/ohem_sampler.py | 111 + .../task_modules/samplers/pseudo_sampler.py | 60 + .../task_modules/samplers/random_sampler.py | 109 + .../task_modules/samplers/sampling_result.py | 240 + .../samplers/score_hlr_sampler.py | 290 ++ .../models/task_modules/tracking/__init__.py | 11 + .../models/task_modules/tracking/aflink.py | 281 ++ .../tracking/camera_motion_compensation.py | 104 + .../task_modules/tracking/interpolation.py | 168 + .../task_modules/tracking/kalman_filter.py | 267 ++ .../task_modules/tracking/similarity.py | 34 + .../mmdet/models/test_time_augs/__init__.py | 10 + .../mmdet/models/test_time_augs/det_tta.py | 144 + .../mmdet/models/test_time_augs/merge_augs.py | 219 + mmdetection/mmdet/models/trackers/__init__.py | 13 + .../mmdet/models/trackers/base_tracker.py | 240 + .../mmdet/models/trackers/byte_tracker.py | 334 ++ .../models/trackers/masktrack_rcnn_tracker.py | 189 + .../mmdet/models/trackers/ocsort_tracker.py | 531 +++ .../models/trackers/quasi_dense_tracker.py | 316 ++ .../mmdet/models/trackers/sort_tracker.py | 268 ++ .../models/trackers/strongsort_tracker.py | 273 ++ .../mmdet/models/tracking_heads/__init__.py | 11 + .../tracking_heads/mask2former_track_head.py | 729 ++++ .../tracking_heads/quasi_dense_embed_head.py | 347 ++ .../tracking_heads/quasi_dense_track_head.py | 178 + .../models/tracking_heads/roi_embed_head.py | 391 ++ .../models/tracking_heads/roi_track_head.py | 178 + mmdetection/mmdet/models/utils/__init__.py | 37 + .../mmdet/models/utils/gaussian_target.py | 268 ++ mmdetection/mmdet/models/utils/image.py | 52 + .../mmdet/models/utils/make_divisible.py | 28 + mmdetection/mmdet/models/utils/misc.py | 697 +++ .../models/utils/panoptic_gt_processing.py | 70 + .../mmdet/models/utils/point_sample.py | 88 + .../mmdet/models/utils/vlfuse_helper.py | 773 ++++ mmdetection/mmdet/models/utils/wbf.py | 250 ++ mmdetection/mmdet/models/vis/__init__.py | 5 + .../mmdet/models/vis/mask2former_vis.py | 120 + .../mmdet/models/vis/masktrack_rcnn.py | 181 + mmdetection/mmdet/registry.py | 121 + mmdetection/mmdet/structures/__init__.py | 10 + 
mmdetection/mmdet/structures/bbox/__init__.py | 25 + .../mmdet/structures/bbox/base_boxes.py | 549 +++ .../mmdet/structures/bbox/bbox_overlaps.py | 199 + mmdetection/mmdet/structures/bbox/box_type.py | 296 ++ .../mmdet/structures/bbox/horizontal_boxes.py | 432 ++ .../mmdet/structures/bbox/transforms.py | 498 +++ .../mmdet/structures/det_data_sample.py | 237 + mmdetection/mmdet/structures/mask/__init__.py | 11 + .../mmdet/structures/mask/mask_target.py | 127 + .../mmdet/structures/mask/structures.py | 1193 +++++ mmdetection/mmdet/structures/mask/utils.py | 77 + .../mmdet/structures/reid_data_sample.py | 123 + .../mmdet/structures/track_data_sample.py | 273 ++ mmdetection/mmdet/testing/__init__.py | 12 + .../mmdet/testing/_fast_stop_training_hook.py | 27 + mmdetection/mmdet/testing/_utils.py | 469 ++ mmdetection/mmdet/utils/__init__.py | 28 + mmdetection/mmdet/utils/benchmark.py | 529 +++ mmdetection/mmdet/utils/collect_env.py | 17 + mmdetection/mmdet/utils/compat_config.py | 139 + mmdetection/mmdet/utils/contextmanagers.py | 122 + mmdetection/mmdet/utils/dist_utils.py | 184 + mmdetection/mmdet/utils/large_image.py | 104 + mmdetection/mmdet/utils/logger.py | 49 + mmdetection/mmdet/utils/memory.py | 212 + mmdetection/mmdet/utils/misc.py | 149 + .../mmdet/utils/mot_error_visualize.py | 273 ++ mmdetection/mmdet/utils/profiling.py | 40 + mmdetection/mmdet/utils/replace_cfg_vals.py | 70 + mmdetection/mmdet/utils/setup_env.py | 118 + mmdetection/mmdet/utils/split_batch.py | 45 + mmdetection/mmdet/utils/typing_utils.py | 22 + mmdetection/mmdet/utils/util_mixins.py | 105 + mmdetection/mmdet/utils/util_random.py | 34 + mmdetection/mmdet/version.py | 27 + mmdetection/mmdet/visualization/__init__.py | 8 + .../mmdet/visualization/local_visualizer.py | 699 +++ mmdetection/mmdet/visualization/palette.py | 108 + mmdetection/model-index.yml | 101 + mmdetection/projects/AlignDETR/README.md | 33 + .../projects/AlignDETR/align_detr/__init__.py | 5 + .../AlignDETR/align_detr/align_detr_head.py | 508 +++ .../align_detr/mixed_hungarian_assigner.py | 162 + .../projects/AlignDETR/align_detr/utils.py | 34 + .../align_detr-4scale_r50_8xb2-12e_coco.py | 185 + .../align_detr-4scale_r50_8xb2-24e_coco.py | 19 + mmdetection/projects/CO-DETR/README.md | 32 + .../projects/CO-DETR/codetr/__init__.py | 13 + .../projects/CO-DETR/codetr/co_atss_head.py | 153 + .../projects/CO-DETR/codetr/co_dino_head.py | 677 +++ .../projects/CO-DETR/codetr/co_roi_head.py | 108 + mmdetection/projects/CO-DETR/codetr/codetr.py | 320 ++ .../projects/CO-DETR/codetr/transformer.py | 1376 ++++++ .../codino/co_dino_5scale_r50_8xb2_1x_coco.py | 68 + .../co_dino_5scale_r50_lsj_8xb2_1x_coco.py | 359 ++ .../co_dino_5scale_r50_lsj_8xb2_3x_coco.py | 4 + ...dino_5scale_swin_l_16xb1_16e_o365tococo.py | 115 + .../co_dino_5scale_swin_l_16xb1_1x_coco.py | 31 + .../co_dino_5scale_swin_l_16xb1_3x_coco.py | 6 + ...co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py | 72 + ...co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py | 6 + mmdetection/projects/ConvNeXt-V2/README.md | 37 + ...cnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py | 92 + mmdetection/projects/Detic/README.md | 156 + ...enternet2_swin-b_fpn_4x_lvis-coco-in21k.py | 298 ++ mmdetection/projects/Detic/demo.py | 142 + mmdetection/projects/Detic/detic/__init__.py | 9 + .../Detic/detic/centernet_rpn_head.py | 196 + .../projects/Detic/detic/detic_bbox_head.py | 112 + .../projects/Detic/detic/detic_roi_head.py | 326 ++ .../projects/Detic/detic/text_encoder.py | 50 + mmdetection/projects/Detic/detic/utils.py | 78 + 
.../Detic/detic/zero_shot_classifier.py | 73 + mmdetection/projects/Detic_new/README.md | 248 ++ ..._centernet2_r50_fpn_4x_lvis-base_boxsup.py | 9 + ...ternet2_r50_fpn_4x_lvis-base_in21k-lvis.py | 93 + ...detic_centernet2_r50_fpn_4x_lvis_boxsup.py | 410 ++ ...c_centernet2_r50_fpn_4x_lvis_in21k-lvis.py | 91 + ...nternet2_swin-b_fpn_4x_lvis-base_boxsup.py | 9 + ...net2_swin-b_fpn_4x_lvis-base_in21k-lvis.py | 118 + ...ic_centernet2_swin-b_fpn_4x_lvis_boxsup.py | 78 + ...enternet2_swin-b_fpn_4x_lvis_coco_in21k.py | 2 + ...enternet2_swin-b_fpn_4x_lvis_in21k-lvis.py | 116 + .../projects/Detic_new/detic/__init__.py | 13 + .../Detic_new/detic/centernet_rpn_head.py | 573 +++ mmdetection/projects/Detic_new/detic/detic.py | 274 ++ .../Detic_new/detic/detic_bbox_head.py | 434 ++ .../Detic_new/detic/detic_roi_head.py | 440 ++ .../Detic_new/detic/heatmap_focal_loss.py | 131 + .../projects/Detic_new/detic/imagenet_lvis.py | 395 ++ .../projects/Detic_new/detic/iou_loss.py | 125 + .../Detic_new/detic/zero_shot_classifier.py | 73 + mmdetection/projects/DiffusionDet/README.md | 172 + ...posals_1-step_crop-ms-480-800-450k_coco.py | 185 + .../DiffusionDet/diffusiondet/__init__.py | 10 + .../DiffusionDet/diffusiondet/diffusiondet.py | 26 + .../DiffusionDet/diffusiondet/head.py | 1034 +++++ .../DiffusionDet/diffusiondet/loss.py | 341 ++ .../diffusiondet_resnet_to_mmdet.py | 88 + mmdetection/projects/EfficientDet/README.md | 154 + ...det_effb0_bifpn_8xb16-crop512-300e_coco.py | 171 + ...fb3_bifpn_8xb16-crop896-300e_coco-90cls.py | 171 + ...det_effb3_bifpn_8xb16-crop896-300e_coco.py | 171 + ..._effb0_bifpn_8xb16-crop512-300e_coco_tf.py | 171 + .../projects/EfficientDet/convert_tf_to_pt.py | 626 +++ .../EfficientDet/efficientdet/__init__.py | 16 + .../EfficientDet/efficientdet/bifpn.py | 306 ++ .../EfficientDet/efficientdet/efficientdet.py | 25 + .../efficientdet/efficientdet_head.py | 261 ++ .../EfficientDet/efficientdet/huber_loss.py | 91 + .../tensorflow/anchor_generator.py | 109 + .../tensorflow/api_wrappers/__init__.py | 4 + .../tensorflow/api_wrappers/coco_api.py | 145 + .../efficientdet/tensorflow/coco_90class.py | 198 + .../efficientdet/tensorflow/coco_90metric.py | 540 +++ .../tensorflow/trans_max_iou_assigner.py | 110 + .../tensorflow/yxyx_bbox_coder.py | 369 ++ .../EfficientDet/efficientdet/utils.py | 154 + mmdetection/projects/HDINO/README.md | 35 + mmdetection/projects/HDINO/__init__.py | 4 + .../HDINO/h-dino-4scale_r50_8xb2-12e_coco.py | 168 + mmdetection/projects/HDINO/h_dino.py | 149 + mmdetection/projects/HDINO/h_dino_head.py | 112 + .../LabelStudio/backend_template/_wsgi.py | 145 + .../backend_template/mmdetection.py | 148 + mmdetection/projects/LabelStudio/readme.md | 3 + .../projects/RF100-Benchmark/README.md | 215 + .../projects/RF100-Benchmark/README_zh-CN.md | 215 + .../projects/RF100-Benchmark/__init__.py | 4 + mmdetection/projects/RF100-Benchmark/coco.py | 213 + .../projects/RF100-Benchmark/coco_metric.py | 243 ++ .../dino_r50_fpn_ms_8xb8_tweeter-profile.py | 102 + ...er-rcnn_r50_fpn_ms_8xb8_tweeter-profile.py | 101 + .../tood_r50_fpn_ms_8xb8_tweeter-profile.py | 101 + .../scripts/create_new_config.py | 42 + .../scripts/download_dataset.py | 65 + .../RF100-Benchmark/scripts/labels_names.json | 882 ++++ .../RF100-Benchmark/scripts/log_extract.py | 286 ++ .../scripts/parse_dataset_link.py | 18 + mmdetection/projects/SparseInst/README.md | 131 + .../sparseinst_r50_iam_8xb8-ms-270k_coco.py | 146 + .../SparseInst/sparseinst/__init__.py | 10 + .../projects/SparseInst/sparseinst/decoder.py | 400 
++ .../projects/SparseInst/sparseinst/encoder.py | 102 + .../projects/SparseInst/sparseinst/loss.py | 249 ++ .../SparseInst/sparseinst/sparseinst.py | 206 + .../projects/VISION-Datasets/README.md | 103 + .../projects/VISION-Datasets/README_zh-CN.md | 103 + mmdetection/projects/ViTDet/README.md | 110 + .../ViTDet/configs/lsj-100e_coco-instance.py | 135 + .../vitdet_mask-rcnn_vit-b-mae_lsj-100e.py | 60 + .../projects/ViTDet/vitdet/__init__.py | 9 + .../ViTDet/vitdet/fp16_compression_hook.py | 25 + .../layer_decay_optimizer_constructor.py | 109 + .../projects/ViTDet/vitdet/simple_fpn.py | 102 + mmdetection/projects/ViTDet/vitdet/vit.py | 448 ++ mmdetection/projects/XDecoder/README.md | 245 ++ .../configs/_base_/xdecoder-tiny_caption.py | 3 + .../xdecoder-tiny_open-vocab-instance.py | 3 + .../xdecoder-tiny_open-vocab-panoptic.py | 4 + .../_base_/xdecoder-tiny_open-vocab-semseg.py | 29 + .../configs/_base_/xdecoder-tiny_ref-seg.py | 3 + ...xdecoder-tiny_zeroshot_caption_coco2014.py | 18 + ...iny_zeroshot_open-vocab-instance_ade20k.py | 20 + ...-tiny_zeroshot_open-vocab-instance_coco.py | 27 + ...iny_zeroshot_open-vocab-panoptic_ade20k.py | 51 + ...-tiny_zeroshot_open-vocab-panoptic_coco.py | 27 + ...ny_zeroshot_open-vocab-ref-seg_refcoco+.py | 3 + ...iny_zeroshot_open-vocab-ref-seg_refcoco.py | 3 + ...ny_zeroshot_open-vocab-ref-seg_refcocog.py | 3 + ...-tiny_zeroshot_open-vocab-semseg_ade20k.py | 50 + ...er-tiny_zeroshot_open-vocab-semseg_coco.py | 68 + .../xdecoder-tiny_zeroshot_ref-caption.py | 17 + ...oder-tiny_zeroshot_text-image-retrieval.py | 24 + mmdetection/projects/XDecoder/demo.py | 99 + .../projects/XDecoder/xdecoder/__init__.py | 10 + .../projects/XDecoder/xdecoder/focalnet.py | 522 +++ .../XDecoder/xdecoder/inference/__init__.py | 8 + .../xdecoder/inference/image_caption.py | 308 ++ .../texttoimage_regionretrieval_inferencer.py | 226 + .../XDecoder/xdecoder/language_model.py | 251 ++ .../XDecoder/xdecoder/pixel_decoder.py | 214 + .../XDecoder/xdecoder/transformer_blocks.py | 473 ++ .../XDecoder/xdecoder/transformer_decoder.py | 439 ++ .../XDecoder/xdecoder/unified_head.py | 363 ++ .../projects/XDecoder/xdecoder/utils.py | 215 + .../projects/XDecoder/xdecoder/xdecoder.py | 36 + .../projects/example_largemodel/README.md | 75 + .../example_largemodel/README_zh-CN.md | 75 + .../projects/example_largemodel/__init__.py | 3 + ...o-5scale_swin-l_deepspeed_8xb2-12e_coco.py | 44 + .../dino-5scale_swin-l_fsdp_8xb2-12e_coco.py | 18 + .../projects/example_largemodel/fsdp_utils.py | 38 + .../projects/example_project/README.md | 116 + .../faster-rcnn_dummy-resnet_fpn_1x_coco.py | 5 + .../example_project/dummy/__init__.py | 4 + .../example_project/dummy/dummy_resnet.py | 15 + mmdetection/projects/gradio_demo/README.md | 49 + mmdetection/projects/gradio_demo/launch.py | 623 +++ mmdetection/projects/iSAID/README.md | 85 + mmdetection/projects/iSAID/README_zh-CN.md | 85 + .../configs/mask_rcnn_r50_fpn_1x_isaid.py | 6 + mmdetection/projects/iSAID/isaid_json.py | 29 + mmdetection/pytest.ini | 7 + mmdetection/setup.cfg | 24 + mmdetection/setup.py | 224 + .../tools/analysis_tools/analyze_logs.py | 211 + .../tools/analysis_tools/analyze_results.py | 407 ++ mmdetection/tools/analysis_tools/benchmark.py | 133 + .../tools/analysis_tools/browse_dataset.py | 89 + .../analysis_tools/coco_error_analysis.py | 339 ++ .../coco_occluded_separated_recall.py | 48 + .../tools/analysis_tools/confusion_matrix.py | 273 ++ .../tools/analysis_tools/eval_metric.py | 50 + .../tools/analysis_tools/fuse_results.py | 142 + 
mmdetection/tools/analysis_tools/get_flops.py | 140 + .../analysis_tools/mot/browse_dataset.py | 85 + .../analysis_tools/mot/mot_error_visualize.py | 211 + .../analysis_tools/mot/mot_param_search.py | 155 + .../tools/analysis_tools/optimize_anchors.py | 382 ++ .../tools/analysis_tools/robustness_eval.py | 263 ++ .../tools/analysis_tools/test_robustness.py | 239 + .../tools/dataset_converters/ade20k2coco.py | 367 ++ .../tools/dataset_converters/cityscapes.py | 153 + .../dataset_converters/coco_stuff164k.py | 254 ++ .../dataset_converters/crowdhuman2coco.py | 100 + .../tools/dataset_converters/images2coco.py | 102 + .../tools/dataset_converters/mot2coco.py | 220 + .../tools/dataset_converters/mot2reid.py | 191 + .../tools/dataset_converters/pascal_voc.py | 238 + ...coco_semantic_annos_from_panoptic_annos.py | 899 ++++ .../dataset_converters/youtubevis2coco.py | 157 + .../tools/deployment/mmdet2torchserve.py | 112 + mmdetection/tools/deployment/mmdet_handler.py | 72 + .../tools/deployment/test_torchserver.py | 113 + mmdetection/tools/misc/download_dataset.py | 229 + .../tools/misc/gen_coco_panoptic_test_info.py | 33 + .../tools/misc/get_crowdhuman_id_hw.py | 87 + mmdetection/tools/misc/get_image_metas.py | 125 + mmdetection/tools/misc/print_config.py | 60 + mmdetection/tools/misc/split_coco.py | 110 + .../model_converters/detectron2_to_mmdet.py | 48 + .../model_converters/detectron2pytorch.py | 83 + .../tools/model_converters/detic_to_mmdet.py | 195 + .../tools/model_converters/glip_to_mmdet.py | 125 + .../groundingdino_to_mmdet.py | 213 + .../tools/model_converters/publish_model.py | 62 + .../tools/model_converters/regnet2mmdet.py | 90 + .../tools/model_converters/selfsup2mmdet.py | 42 + .../tools/model_converters/swinv1_to_mmdet.py | 86 + .../model_converters/upgrade_model_version.py | 210 + .../model_converters/upgrade_ssd_version.py | 59 + mmdetection/tools/test.py | 149 + mmdetection/tools/test_tracking.py | 101 + mmdetection/tools/train.py | 121 + 1993 files changed, 235461 insertions(+) create mode 100644 mmdetection/.circleci/config.yml create mode 100644 mmdetection/.circleci/docker/Dockerfile create mode 100644 mmdetection/.circleci/test.yml create mode 100644 mmdetection/.dev_scripts/batch_test_list.py create mode 100644 mmdetection/.dev_scripts/benchmark_filter.py create mode 100644 mmdetection/.dev_scripts/benchmark_inference_fps.py create mode 100644 mmdetection/.dev_scripts/benchmark_options.py create mode 100644 mmdetection/.dev_scripts/benchmark_test.py create mode 100644 mmdetection/.dev_scripts/benchmark_test_image.py create mode 100644 mmdetection/.dev_scripts/benchmark_train.py create mode 100644 mmdetection/.dev_scripts/benchmark_valid_flops.py create mode 100755 mmdetection/.dev_scripts/check_links.py create mode 100644 mmdetection/.dev_scripts/convert_test_benchmark_script.py create mode 100644 mmdetection/.dev_scripts/convert_train_benchmark_script.py create mode 100644 mmdetection/.dev_scripts/covignore.cfg create mode 100644 mmdetection/.dev_scripts/download_checkpoints.py create mode 100644 mmdetection/.dev_scripts/gather_models.py create mode 100644 mmdetection/.dev_scripts/gather_test_benchmark_metric.py create mode 100644 mmdetection/.dev_scripts/gather_train_benchmark_metric.py create mode 100644 mmdetection/.dev_scripts/test_init_backbone.py create mode 100644 mmdetection/.owners.yml create mode 100644 mmdetection/.pre-commit-config-zh-cn.yaml create mode 100644 mmdetection/.pre-commit-config.yaml create mode 100644 mmdetection/.readthedocs.yml create mode 
100644 mmdetection/CITATION.cff create mode 100644 mmdetection/LICENSE create mode 100644 mmdetection/MANIFEST.in create mode 100644 mmdetection/README.md create mode 100644 mmdetection/README_zh-CN.md create mode 100644 mmdetection/configs/_base_/datasets/ade20k_instance.py create mode 100644 mmdetection/configs/_base_/datasets/ade20k_panoptic.py create mode 100644 mmdetection/configs/_base_/datasets/ade20k_semantic.py create mode 100644 mmdetection/configs/_base_/datasets/cityscapes_detection.py create mode 100644 mmdetection/configs/_base_/datasets/cityscapes_instance.py create mode 100644 mmdetection/configs/_base_/datasets/coco_caption.py create mode 100644 mmdetection/configs/_base_/datasets/coco_detection.py create mode 100644 mmdetection/configs/_base_/datasets/coco_instance.py create mode 100644 mmdetection/configs/_base_/datasets/coco_instance_semantic.py create mode 100644 mmdetection/configs/_base_/datasets/coco_panoptic.py create mode 100644 mmdetection/configs/_base_/datasets/coco_semantic.py create mode 100644 mmdetection/configs/_base_/datasets/deepfashion.py create mode 100644 mmdetection/configs/_base_/datasets/dsdl.py create mode 100644 mmdetection/configs/_base_/datasets/isaid_instance.py create mode 100644 mmdetection/configs/_base_/datasets/lvis_v0.5_instance.py create mode 100644 mmdetection/configs/_base_/datasets/lvis_v1_instance.py create mode 100644 mmdetection/configs/_base_/datasets/mot_challenge.py create mode 100644 mmdetection/configs/_base_/datasets/mot_challenge_det.py create mode 100644 mmdetection/configs/_base_/datasets/mot_challenge_reid.py create mode 100644 mmdetection/configs/_base_/datasets/objects365v1_detection.py create mode 100644 mmdetection/configs/_base_/datasets/objects365v2_detection.py create mode 100644 mmdetection/configs/_base_/datasets/openimages_detection.py create mode 100644 mmdetection/configs/_base_/datasets/refcoco+.py create mode 100644 mmdetection/configs/_base_/datasets/refcoco.py create mode 100644 mmdetection/configs/_base_/datasets/refcocog.py create mode 100644 mmdetection/configs/_base_/datasets/semi_coco_detection.py create mode 100644 mmdetection/configs/_base_/datasets/v3det.py create mode 100644 mmdetection/configs/_base_/datasets/voc0712.py create mode 100644 mmdetection/configs/_base_/datasets/wider_face.py create mode 100644 mmdetection/configs/_base_/datasets/youtube_vis.py create mode 100644 mmdetection/configs/_base_/default_runtime.py create mode 100644 mmdetection/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py create mode 100644 mmdetection/configs/_base_/models/cascade-rcnn_r50_fpn.py create mode 100644 mmdetection/configs/_base_/models/fast-rcnn_r50_fpn.py create mode 100644 mmdetection/configs/_base_/models/faster-rcnn_r50-caffe-c4.py create mode 100644 mmdetection/configs/_base_/models/faster-rcnn_r50-caffe-dc5.py create mode 100644 mmdetection/configs/_base_/models/faster-rcnn_r50_fpn.py create mode 100644 mmdetection/configs/_base_/models/mask-rcnn_r50-caffe-c4.py create mode 100644 mmdetection/configs/_base_/models/mask-rcnn_r50_fpn.py create mode 100644 mmdetection/configs/_base_/models/retinanet_r50_fpn.py create mode 100644 mmdetection/configs/_base_/models/rpn_r50-caffe-c4.py create mode 100644 mmdetection/configs/_base_/models/rpn_r50_fpn.py create mode 100644 mmdetection/configs/_base_/models/ssd300.py create mode 100644 mmdetection/configs/_base_/schedules/schedule_1x.py create mode 100644 mmdetection/configs/_base_/schedules/schedule_20e.py create mode 100644 
mmdetection/configs/_base_/schedules/schedule_2x.py create mode 100644 mmdetection/configs/albu_example/README.md create mode 100644 mmdetection/configs/albu_example/mask-rcnn_r50_fpn_albu-1x_coco.py create mode 100644 mmdetection/configs/albu_example/metafile.yml create mode 100644 mmdetection/configs/atss/README.md create mode 100644 mmdetection/configs/atss/atss_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/atss/atss_r101_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/atss/atss_r18_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/atss/atss_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/atss/atss_r50_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/atss/metafile.yml create mode 100644 mmdetection/configs/autoassign/README.md create mode 100644 mmdetection/configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/autoassign/metafile.yml create mode 100644 mmdetection/configs/boxinst/README.md create mode 100644 mmdetection/configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py create mode 100644 mmdetection/configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py create mode 100644 mmdetection/configs/boxinst/metafile.yml create mode 100644 mmdetection/configs/bytetrack/README.md create mode 100644 mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py create mode 100644 mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py create mode 100644 mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py create mode 100644 mmdetection/configs/bytetrack/metafile.yml create mode 100644 mmdetection/configs/bytetrack/yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/carafe/README.md create mode 100644 mmdetection/configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py create mode 100644 mmdetection/configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py create mode 100644 mmdetection/configs/carafe/metafile.yml create mode 100644 mmdetection/configs/cascade_rcnn/README.md create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py create mode 100644 
mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py create mode 100644 mmdetection/configs/cascade_rcnn/metafile.yml create mode 100644 mmdetection/configs/cascade_rpn/README.md create mode 100644 mmdetection/configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rpn/cascade-rpn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/cascade_rpn/metafile.yml create mode 100644 mmdetection/configs/centernet/README.md create mode 100644 mmdetection/configs/centernet/centernet-update_r101_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/centernet/centernet-update_r18_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py create mode 100644 mmdetection/configs/centernet/centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py create mode 100644 mmdetection/configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py create mode 100644 mmdetection/configs/centernet/centernet_tta.py create mode 100644 mmdetection/configs/centernet/metafile.yml create mode 100644 mmdetection/configs/centripetalnet/README.md create mode 100644 mmdetection/configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py create mode 100644 mmdetection/configs/centripetalnet/metafile.yml create mode 100644 mmdetection/configs/cityscapes/README.md create mode 100644 mmdetection/configs/cityscapes/faster-rcnn_r50_fpn_1x_cityscapes.py create mode 100644 mmdetection/configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py create mode 100644 mmdetection/configs/common/lsj-100e_coco-detection.py 
create mode 100644 mmdetection/configs/common/lsj-100e_coco-instance.py create mode 100644 mmdetection/configs/common/lsj-200e_coco-detection.py create mode 100644 mmdetection/configs/common/lsj-200e_coco-instance.py create mode 100644 mmdetection/configs/common/ms-90k_coco.py create mode 100644 mmdetection/configs/common/ms-poly-90k_coco-instance.py create mode 100644 mmdetection/configs/common/ms-poly_3x_coco-instance.py create mode 100644 mmdetection/configs/common/ms_3x_coco-instance.py create mode 100644 mmdetection/configs/common/ms_3x_coco.py create mode 100644 mmdetection/configs/common/ssj_270k_coco-instance.py create mode 100644 mmdetection/configs/common/ssj_scp_270k_coco-instance.py create mode 100644 mmdetection/configs/condinst/README.md create mode 100644 mmdetection/configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py create mode 100644 mmdetection/configs/condinst/metafile.yml create mode 100644 mmdetection/configs/conditional_detr/README.md create mode 100644 mmdetection/configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py create mode 100644 mmdetection/configs/conditional_detr/metafile.yml create mode 100644 mmdetection/configs/convnext/README.md create mode 100644 mmdetection/configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py create mode 100644 mmdetection/configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py create mode 100644 mmdetection/configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py create mode 100644 mmdetection/configs/convnext/metafile.yml create mode 100644 mmdetection/configs/cornernet/README.md create mode 100644 mmdetection/configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py create mode 100644 mmdetection/configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py create mode 100644 mmdetection/configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py create mode 100644 mmdetection/configs/cornernet/metafile.yml create mode 100644 mmdetection/configs/crowddet/README.md create mode 100644 mmdetection/configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py create mode 100644 mmdetection/configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py create mode 100644 mmdetection/configs/crowddet/metafile.yml create mode 100644 mmdetection/configs/dab_detr/README.md create mode 100644 mmdetection/configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py create mode 100644 mmdetection/configs/dab_detr/metafile.yml create mode 100644 mmdetection/configs/dcn/README.md create mode 100644 mmdetection/configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py create mode 100644 mmdetection/configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py create mode 100644 
mmdetection/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py create mode 100644 mmdetection/configs/dcn/metafile.yml create mode 100644 mmdetection/configs/dcnv2/README.md create mode 100644 mmdetection/configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py create mode 100644 mmdetection/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py create mode 100644 mmdetection/configs/dcnv2/metafile.yml create mode 100644 mmdetection/configs/ddod/README.md create mode 100644 mmdetection/configs/ddod/ddod_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/ddod/metafile.yml create mode 100644 mmdetection/configs/ddq/README.md create mode 100644 mmdetection/configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py create mode 100644 mmdetection/configs/ddq/ddq-detr-4scale_swinl_8xb2-30e_coco.py create mode 100644 mmdetection/configs/ddq/ddq-detr-5scale_r50_8xb2-12e_coco.py create mode 100644 mmdetection/configs/ddq/metafile.yml create mode 100644 mmdetection/configs/deepfashion/README.md create mode 100644 mmdetection/configs/deepfashion/mask-rcnn_r50_fpn_15e_deepfashion.py create mode 100644 mmdetection/configs/deepsort/README.md create mode 100644 mmdetection/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py create mode 100644 mmdetection/configs/deepsort/metafile.yml create mode 100644 mmdetection/configs/deformable_detr/README.md create mode 100644 mmdetection/configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py create mode 100644 mmdetection/configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py create mode 100644 mmdetection/configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py create mode 100644 mmdetection/configs/deformable_detr/metafile.yml create mode 100644 mmdetection/configs/detectors/README.md create mode 100644 mmdetection/configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py create mode 100644 mmdetection/configs/detectors/cascade-rcnn_r50-sac_1x_coco.py create mode 100644 mmdetection/configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py create mode 100644 mmdetection/configs/detectors/detectors_htc-r101_20e_coco.py create mode 100644 mmdetection/configs/detectors/detectors_htc-r50_1x_coco.py create mode 100644 mmdetection/configs/detectors/htc_r50-rfp_1x_coco.py create mode 100644 mmdetection/configs/detectors/htc_r50-sac_1x_coco.py create mode 100644 mmdetection/configs/detectors/metafile.yml create mode 100644 mmdetection/configs/detr/README.md create mode 100644 mmdetection/configs/detr/detr_r101_8xb2-500e_coco.py create mode 100644 mmdetection/configs/detr/detr_r18_8xb2-500e_coco.py create mode 100644 mmdetection/configs/detr/detr_r50_8xb2-150e_coco.py create mode 100644 mmdetection/configs/detr/detr_r50_8xb2-500e_coco.py create mode 100644 mmdetection/configs/detr/metafile.yml create mode 100644 mmdetection/configs/dino/README.md create mode 100644 mmdetection/configs/dino/dino-4scale_r50_8xb2-12e_coco.py create mode 100644 mmdetection/configs/dino/dino-4scale_r50_8xb2-24e_coco.py create mode 100644 
mmdetection/configs/dino/dino-4scale_r50_8xb2-36e_coco.py create mode 100644 mmdetection/configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py create mode 100644 mmdetection/configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py create mode 100644 mmdetection/configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py create mode 100644 mmdetection/configs/dino/metafile.yml create mode 100644 mmdetection/configs/double_heads/README.md create mode 100644 mmdetection/configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/double_heads/metafile.yml create mode 100644 mmdetection/configs/dsdl/README.md create mode 100644 mmdetection/configs/dsdl/coco.py create mode 100644 mmdetection/configs/dsdl/coco_instance.py create mode 100644 mmdetection/configs/dsdl/objects365v2.py create mode 100644 mmdetection/configs/dsdl/openimagesv6.py create mode 100644 mmdetection/configs/dsdl/voc07.py create mode 100644 mmdetection/configs/dsdl/voc0712.py create mode 100644 mmdetection/configs/dyhead/README.md create mode 100644 mmdetection/configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py create mode 100644 mmdetection/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py create mode 100644 mmdetection/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py create mode 100644 mmdetection/configs/dyhead/metafile.yml create mode 100644 mmdetection/configs/dynamic_rcnn/README.md create mode 100644 mmdetection/configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/dynamic_rcnn/metafile.yml create mode 100644 mmdetection/configs/efficientnet/README.md create mode 100644 mmdetection/configs/efficientnet/metafile.yml create mode 100644 mmdetection/configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py create mode 100644 mmdetection/configs/empirical_attention/README.md create mode 100644 mmdetection/configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py create mode 100644 mmdetection/configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py create mode 100644 mmdetection/configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py create mode 100644 mmdetection/configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py create mode 100644 mmdetection/configs/empirical_attention/metafile.yml create mode 100644 mmdetection/configs/fast_rcnn/README.md create mode 100644 mmdetection/configs/fast_rcnn/fast-rcnn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/fast_rcnn/fast-rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/fast_rcnn/fast-rcnn_r101_fpn_2x_coco.py create mode 100644 mmdetection/configs/fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/fast_rcnn/fast-rcnn_r50_fpn_2x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/README.md create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py create 
mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_90k_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-90k_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ohem_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_soft-nms_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/faster_rcnn/metafile.yml create mode 100644 mmdetection/configs/fcos/README.md create mode 100644 mmdetection/configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r101_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r18_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py create mode 100644 
mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head-center_1x_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py create mode 100644 mmdetection/configs/fcos/fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py create mode 100644 mmdetection/configs/fcos/metafile.yml create mode 100644 mmdetection/configs/foveabox/README.md create mode 100644 mmdetection/configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py create mode 100644 mmdetection/configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py create mode 100644 mmdetection/configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py create mode 100644 mmdetection/configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py create mode 100644 mmdetection/configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py create mode 100644 mmdetection/configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py create mode 100644 mmdetection/configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py create mode 100644 mmdetection/configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py create mode 100644 mmdetection/configs/foveabox/metafile.yml create mode 100644 mmdetection/configs/fpg/README.md create mode 100644 mmdetection/configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py create mode 100644 mmdetection/configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py create mode 100644 mmdetection/configs/fpg/faster-rcnn_r50_fpn_crop640-50e_coco.py create mode 100644 mmdetection/configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py create mode 100644 mmdetection/configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py create mode 100644 mmdetection/configs/fpg/mask-rcnn_r50_fpn_crop640-50e_coco.py create mode 100644 mmdetection/configs/fpg/metafile.yml create mode 100644 mmdetection/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py create mode 100644 mmdetection/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py create mode 100644 mmdetection/configs/free_anchor/README.md create mode 100644 mmdetection/configs/free_anchor/freeanchor_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/free_anchor/freeanchor_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/free_anchor/metafile.yml create mode 100644 mmdetection/configs/fsaf/README.md create mode 100644 mmdetection/configs/fsaf/fsaf_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/fsaf/fsaf_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/fsaf/metafile.yml create mode 100644 mmdetection/configs/gcnet/README.md create mode 100644 mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py 
create mode 100644 mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py create mode 100644 mmdetection/configs/gcnet/metafile.yml create mode 100644 mmdetection/configs/gfl/README.md create mode 100644 mmdetection/configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/gfl/gfl_r101_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/gfl/gfl_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/gfl/gfl_r50_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/gfl/metafile.yml create mode 100644 mmdetection/configs/ghm/README.md create mode 100644 mmdetection/configs/ghm/metafile.yml create mode 100644 mmdetection/configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py create mode 100644 mmdetection/configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py create mode 100644 mmdetection/configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py create mode 100644 mmdetection/configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py create mode 100644 mmdetection/configs/glip/README.md create mode 100644 mmdetection/configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py create mode 100644 mmdetection/configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py create mode 100644 mmdetection/configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py create mode 100644 mmdetection/configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py create mode 100644 mmdetection/configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py create mode 100644 mmdetection/configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py create mode 100644 mmdetection/configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py create mode 100644 mmdetection/configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py create mode 100644 mmdetection/configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py create mode 100644 
mmdetection/configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py create mode 100644 mmdetection/configs/glip/metafile.yml create mode 100644 mmdetection/configs/gn+ws/README.md create mode 100644 mmdetection/configs/gn+ws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py create mode 100644 mmdetection/configs/gn+ws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py create mode 100644 mmdetection/configs/gn+ws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py create mode 100644 mmdetection/configs/gn+ws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py create mode 100644 mmdetection/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py create mode 100644 mmdetection/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py create mode 100644 mmdetection/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py create mode 100644 mmdetection/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py create mode 100644 mmdetection/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py create mode 100644 mmdetection/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py create mode 100644 mmdetection/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py create mode 100644 mmdetection/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py create mode 100644 mmdetection/configs/gn+ws/metafile.yml create mode 100644 mmdetection/configs/gn/README.md create mode 100644 mmdetection/configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py create mode 100644 mmdetection/configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py create mode 100644 mmdetection/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py create mode 100644 mmdetection/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py create mode 100644 mmdetection/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py create mode 100644 mmdetection/configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py create mode 100644 mmdetection/configs/gn/metafile.yml create mode 100644 mmdetection/configs/grid_rcnn/README.md create mode 100644 mmdetection/configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py create mode 100644 mmdetection/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py create mode 100644 mmdetection/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py create mode 100644 mmdetection/configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py create mode 100644 mmdetection/configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py create mode 100644 mmdetection/configs/grid_rcnn/metafile.yml create mode 100644 mmdetection/configs/groie/README.md create mode 100644 mmdetection/configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py create mode 100644 mmdetection/configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py create mode 100644 mmdetection/configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py create mode 100644 mmdetection/configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py create mode 100644 mmdetection/configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py create mode 100644 mmdetection/configs/groie/metafile.yml create mode 100644 mmdetection/configs/grounding_dino/README.md create mode 100644 mmdetection/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py create mode 100644 mmdetection/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py create mode 100644 mmdetection/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py create mode 100644 mmdetection/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py create mode 100644 
mmdetection/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py create mode 100644 mmdetection/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py create mode 100644 mmdetection/configs/grounding_dino/metafile.yml create mode 100644 mmdetection/configs/guided_anchoring/README.md create mode 100644 mmdetection/configs/guided_anchoring/ga-fast-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-faster-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_ms-2x.py create mode 100644 mmdetection/configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-retinanet_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-rpn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/guided_anchoring/metafile.yml create mode 100644 mmdetection/configs/hrnet/README.md create mode 100644 mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py create mode 100644 mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py create mode 100644 mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py create mode 100644 mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py create mode 100644 mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py create mode 100644 mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py create mode 100644 mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py create mode 100644 mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py create mode 100644 mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py create mode 100644 mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py create mode 100644 mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py create mode 100644 mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py create mode 100644 mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py create mode 100644 mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py create mode 100644 mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py create mode 100644 mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py create mode 100644 mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py create mode 100644 
mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py create mode 100644 mmdetection/configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py create mode 100644 mmdetection/configs/hrnet/htc_hrnetv2p-w18_20e_coco.py create mode 100644 mmdetection/configs/hrnet/htc_hrnetv2p-w32_20e_coco.py create mode 100644 mmdetection/configs/hrnet/htc_hrnetv2p-w40_20e_coco.py create mode 100644 mmdetection/configs/hrnet/htc_hrnetv2p-w40_28e_coco.py create mode 100644 mmdetection/configs/hrnet/htc_x101-64x4d_fpn_16xb1-28e_coco.py create mode 100644 mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py create mode 100644 mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py create mode 100644 mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py create mode 100644 mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py create mode 100644 mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py create mode 100644 mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py create mode 100644 mmdetection/configs/hrnet/metafile.yml create mode 100644 mmdetection/configs/htc/README.md create mode 100644 mmdetection/configs/htc/htc-without-semantic_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/htc/htc_r101_fpn_20e_coco.py create mode 100644 mmdetection/configs/htc/htc_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/htc/htc_r50_fpn_20e_coco.py create mode 100644 mmdetection/configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py create mode 100644 mmdetection/configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py create mode 100644 mmdetection/configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py create mode 100644 mmdetection/configs/htc/metafile.yml create mode 100644 mmdetection/configs/instaboost/README.md create mode 100644 mmdetection/configs/instaboost/cascade-mask-rcnn_r101_fpn_instaboost-4x_coco.py create mode 100644 mmdetection/configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py create mode 100644 mmdetection/configs/instaboost/cascade-mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py create mode 100644 mmdetection/configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py create mode 100644 mmdetection/configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py create mode 100644 mmdetection/configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py create mode 100644 mmdetection/configs/instaboost/metafile.yml create mode 100644 mmdetection/configs/lad/README.md create mode 100644 mmdetection/configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py create mode 100644 mmdetection/configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py create mode 100644 mmdetection/configs/lad/metafile.yml create mode 100644 mmdetection/configs/ld/README.md create mode 100644 mmdetection/configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py create mode 100644 mmdetection/configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/ld/metafile.yml create mode 100644 mmdetection/configs/legacy_1.x/README.md create mode 100644 mmdetection/configs/legacy_1.x/cascade-mask-rcnn_r50_fpn_1x_coco_v1.py create mode 100644 mmdetection/configs/legacy_1.x/faster-rcnn_r50_fpn_1x_coco_v1.py create mode 100644 mmdetection/configs/legacy_1.x/mask-rcnn_r50_fpn_1x_coco_v1.py create mode 100644 mmdetection/configs/legacy_1.x/retinanet_r50-caffe_fpn_1x_coco_v1.py create mode 100644 
mmdetection/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py create mode 100644 mmdetection/configs/legacy_1.x/ssd300_coco_v1.py create mode 100644 mmdetection/configs/libra_rcnn/README.md create mode 100644 mmdetection/configs/libra_rcnn/libra-fast-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/libra_rcnn/metafile.yml create mode 100644 mmdetection/configs/lvis/README.md create mode 100644 mmdetection/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py create mode 100644 mmdetection/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py create mode 100644 mmdetection/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py create mode 100644 mmdetection/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py create mode 100644 mmdetection/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py create mode 100644 mmdetection/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py create mode 100644 mmdetection/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py create mode 100644 mmdetection/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py create mode 100644 mmdetection/configs/lvis/metafile.yml create mode 100644 mmdetection/configs/mask2former/README.md create mode 100644 mmdetection/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py create mode 100644 mmdetection/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py create mode 100644 mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py create mode 100644 mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py create mode 100644 mmdetection/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py create mode 100644 mmdetection/configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py create mode 100644 mmdetection/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py create mode 100644 mmdetection/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py create mode 100644 mmdetection/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py create mode 100644 mmdetection/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py create mode 100644 mmdetection/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py create mode 100644 mmdetection/configs/mask2former/metafile.yml create mode 100644 mmdetection/configs/mask2former_vis/README.md create mode 100644 mmdetection/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2019.py create mode 100644 mmdetection/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py create mode 100644 mmdetection/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2019.py create mode 100644 mmdetection/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py create mode 100644 mmdetection/configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py create mode 100644 mmdetection/configs/mask2former_vis/metafile.yml create mode 100644 mmdetection/configs/mask_rcnn/README.md create mode 100644 
mmdetection/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_1x-wandb_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_poly-1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py create mode 100644 mmdetection/configs/mask_rcnn/metafile.yml create mode 100644 mmdetection/configs/maskformer/README.md create mode 100644 mmdetection/configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py create mode 100644 mmdetection/configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py create mode 100644 mmdetection/configs/maskformer/metafile.yml create mode 100644 mmdetection/configs/masktrack_rcnn/README.md create mode 100644 mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py create mode 100644 mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py create mode 100644 mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py create mode 100644 mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py create mode 100644 
mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py create mode 100644 mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py create mode 100644 mmdetection/configs/masktrack_rcnn/metafile.yml create mode 100644 mmdetection/configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py create mode 100644 mmdetection/configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py create mode 100644 mmdetection/configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py create mode 100644 mmdetection/configs/ms_rcnn/README.md create mode 100644 mmdetection/configs/ms_rcnn/metafile.yml create mode 100644 mmdetection/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py create mode 100644 mmdetection/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py create mode 100644 mmdetection/configs/ms_rcnn/ms-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/nas_fcos/README.md create mode 100644 mmdetection/configs/nas_fcos/metafile.yml create mode 100644 mmdetection/configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py create mode 100644 mmdetection/configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py create mode 100644 mmdetection/configs/nas_fpn/README.md create mode 100644 mmdetection/configs/nas_fpn/metafile.yml create mode 100644 mmdetection/configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py create mode 100644 mmdetection/configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py create mode 100644 mmdetection/configs/objects365/README.md create mode 100644 mmdetection/configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py create mode 100644 mmdetection/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py create mode 100644 mmdetection/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py create mode 100644 mmdetection/configs/objects365/metafile.yml create mode 100644 mmdetection/configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py create mode 100644 mmdetection/configs/objects365/retinanet_r50_fpn_1x_objects365v1.py create mode 100644 mmdetection/configs/objects365/retinanet_r50_fpn_1x_objects365v2.py create mode 100644 mmdetection/configs/ocsort/README.md create mode 100644 mmdetection/configs/ocsort/metafile.yml create mode 100644 mmdetection/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py create mode 100644 mmdetection/configs/openimages/README.md create mode 100644 mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py create mode 100644 mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py create mode 100644 mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py create mode 100644 mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py create mode 100644 mmdetection/configs/openimages/metafile.yml create mode 100644 
mmdetection/configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py create mode 100644 mmdetection/configs/openimages/ssd300_32xb8-36e_openimages.py create mode 100644 mmdetection/configs/paa/README.md create mode 100644 mmdetection/configs/paa/metafile.yml create mode 100644 mmdetection/configs/paa/paa_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/paa/paa_r101_fpn_2x_coco.py create mode 100644 mmdetection/configs/paa/paa_r101_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/paa/paa_r50_fpn_1.5x_coco.py create mode 100644 mmdetection/configs/paa/paa_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/paa/paa_r50_fpn_2x_coco.py create mode 100644 mmdetection/configs/paa/paa_r50_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/pafpn/README.md create mode 100644 mmdetection/configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py create mode 100644 mmdetection/configs/pafpn/metafile.yml create mode 100644 mmdetection/configs/panoptic_fpn/README.md create mode 100644 mmdetection/configs/panoptic_fpn/metafile.yml create mode 100644 mmdetection/configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/pascal_voc/README.md create mode 100644 mmdetection/configs/pascal_voc/faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py create mode 100644 mmdetection/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712-cocofmt.py create mode 100644 mmdetection/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py create mode 100644 mmdetection/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py create mode 100644 mmdetection/configs/pascal_voc/ssd300_voc0712.py create mode 100644 mmdetection/configs/pascal_voc/ssd512_voc0712.py create mode 100644 mmdetection/configs/pisa/README.md create mode 100644 mmdetection/configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py create mode 100644 mmdetection/configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py create mode 100644 mmdetection/configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py create mode 100644 mmdetection/configs/pisa/mask-rcnn_x101-32x4d_fpn_pisa_1x_coco.py create mode 100644 mmdetection/configs/pisa/metafile.yml create mode 100644 mmdetection/configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py create mode 100644 mmdetection/configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py create mode 100644 mmdetection/configs/pisa/ssd300_pisa_coco.py create mode 100644 mmdetection/configs/pisa/ssd512_pisa_coco.py create mode 100644 mmdetection/configs/point_rend/README.md create mode 100644 mmdetection/configs/point_rend/metafile.yml create mode 100644 mmdetection/configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py create mode 100644 mmdetection/configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/pvt/README.md create mode 100644 mmdetection/configs/pvt/metafile.yml create mode 100644 mmdetection/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py create mode 100644 mmdetection/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py create mode 100644 mmdetection/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py create mode 100644 mmdetection/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py create mode 100644 mmdetection/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py create mode 100644 mmdetection/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py create mode 100644 
mmdetection/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py create mode 100644 mmdetection/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py create mode 100644 mmdetection/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py create mode 100644 mmdetection/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py create mode 100644 mmdetection/configs/qdtrack/README.md create mode 100644 mmdetection/configs/qdtrack/metafile.yml create mode 100644 mmdetection/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py create mode 100644 mmdetection/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/queryinst/README.md create mode 100644 mmdetection/configs/queryinst/metafile.yml create mode 100644 mmdetection/configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py create mode 100644 mmdetection/configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py create mode 100644 mmdetection/configs/queryinst/queryinst_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py create mode 100644 mmdetection/configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py create mode 100644 mmdetection/configs/recycle/detr_r50_8xb2-150e_recycle.py create mode 100644 mmdetection/configs/recycle/faster-rcnn_r50_fpn_1x_recycle.py create mode 100644 mmdetection/configs/regnet/README.md create mode 100644 mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py create mode 100644 mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py create mode 100644 mmdetection/configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/metafile.yml 
create mode 100644 mmdetection/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py create mode 100644 mmdetection/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py create mode 100644 mmdetection/configs/reid/README.md create mode 100644 mmdetection/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py create mode 100644 mmdetection/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py create mode 100644 mmdetection/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py create mode 100644 mmdetection/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py create mode 100644 mmdetection/configs/reppoints/README.md create mode 100644 mmdetection/configs/reppoints/metafile.yml create mode 100644 mmdetection/configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-minmax_r50_fpn-gn_head-gn_1x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py create mode 100644 mmdetection/configs/reppoints/reppoints-partial-minmax_r50_fpn-gn_head-gn_1x_coco.py create mode 100644 mmdetection/configs/res2net/README.md create mode 100644 mmdetection/configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py create mode 100644 mmdetection/configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py create mode 100644 mmdetection/configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py create mode 100644 mmdetection/configs/res2net/htc_res2net-101_fpn_20e_coco.py create mode 100644 mmdetection/configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py create mode 100644 mmdetection/configs/res2net/metafile.yml create mode 100644 mmdetection/configs/resnest/README.md create mode 100644 mmdetection/configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py create mode 100644 mmdetection/configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py create mode 100644 mmdetection/configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py create mode 100644 mmdetection/configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py create mode 100644 mmdetection/configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py create mode 100644 mmdetection/configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py create mode 100644 mmdetection/configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py create mode 100644 mmdetection/configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py create mode 100644 mmdetection/configs/resnest/metafile.yml create mode 100644 mmdetection/configs/resnet_strikes_back/README.md create mode 100644 mmdetection/configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py create mode 100644 
mmdetection/configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py create mode 100644 mmdetection/configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py create mode 100644 mmdetection/configs/resnet_strikes_back/metafile.yml create mode 100644 mmdetection/configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py create mode 100644 mmdetection/configs/retinanet/README.md create mode 100644 mmdetection/configs/retinanet/metafile.yml create mode 100644 mmdetection/configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r101_fpn_2x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r101_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r18_fpn_1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r18_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50_fpn_2x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50_fpn_90k_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_tta.py create mode 100644 mmdetection/configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py create mode 100644 mmdetection/configs/rpn/README.md create mode 100644 mmdetection/configs/rpn/metafile.yml create mode 100644 mmdetection/configs/rpn/rpn_r101-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_r101_fpn_2x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_r50-caffe-c4_1x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_r50-caffe_fpn_1x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_r50_fpn_2x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py create mode 100644 mmdetection/configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py create mode 100644 mmdetection/configs/rtmdet/README.md create mode 100644 
mmdetection/configs/rtmdet/classification/README.md create mode 100644 mmdetection/configs/rtmdet/classification/cspnext-l_8xb256-rsb-a1-600e_in1k.py create mode 100644 mmdetection/configs/rtmdet/classification/cspnext-m_8xb256-rsb-a1-600e_in1k.py create mode 100644 mmdetection/configs/rtmdet/classification/cspnext-s_8xb256-rsb-a1-600e_in1k.py create mode 100644 mmdetection/configs/rtmdet/classification/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py create mode 100644 mmdetection/configs/rtmdet/classification/cspnext-x_8xb256-rsb-a1-600e_in1k.py create mode 100644 mmdetection/configs/rtmdet/metafile.yml create mode 100644 mmdetection/configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet_m_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet_tta.py create mode 100644 mmdetection/configs/rtmdet/rtmdet_x_8xb32-300e_coco.py create mode 100644 mmdetection/configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py create mode 100644 mmdetection/configs/sabl/README.md create mode 100644 mmdetection/configs/sabl/metafile.yml create mode 100644 mmdetection/configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py create mode 100644 mmdetection/configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/scnet/README.md create mode 100644 mmdetection/configs/scnet/metafile.yml create mode 100644 mmdetection/configs/scnet/scnet_r101_fpn_20e_coco.py create mode 100644 mmdetection/configs/scnet/scnet_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/scnet/scnet_r50_fpn_20e_coco.py create mode 100644 mmdetection/configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py create mode 100644 mmdetection/configs/scnet/scnet_x101-64x4d_fpn_8xb1-20e_coco.py create mode 100644 mmdetection/configs/scratch/README.md create mode 100644 mmdetection/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py create mode 100644 mmdetection/configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py create mode 100644 mmdetection/configs/scratch/metafile.yml create mode 100644 mmdetection/configs/seesaw_loss/README.md create mode 100644 mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py 
create mode 100644 mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py create mode 100644 mmdetection/configs/seesaw_loss/metafile.yml create mode 100644 mmdetection/configs/selfsup_pretrain/README.md create mode 100644 mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py create mode 100644 mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_1x_coco.py create mode 100644 mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/simple_copy_paste/README.md create mode 100644 mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py create mode 100644 mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py create mode 100644 mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py create mode 100644 mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py create mode 100644 mmdetection/configs/simple_copy_paste/metafile.yml create mode 100644 mmdetection/configs/soft_teacher/README.md create mode 100644 mmdetection/configs/soft_teacher/metafile.yml create mode 100644 mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py create mode 100644 mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py create mode 100644 mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py create mode 100644 mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py create mode 100644 mmdetection/configs/solo/README.md create mode 100644 mmdetection/configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py create mode 100644 mmdetection/configs/solo/decoupled-solo_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/solo/decoupled-solo_r50_fpn_3x_coco.py create mode 100644 mmdetection/configs/solo/metafile.yml create mode 100644 mmdetection/configs/solo/solo_r101_fpn_8xb8-lsj-200e_coco.py create mode 100644 mmdetection/configs/solo/solo_r18_fpn_8xb8-lsj-200e_coco.py create mode 100644 mmdetection/configs/solo/solo_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/solo/solo_r50_fpn_3x_coco.py create mode 
100644 mmdetection/configs/solo/solo_r50_fpn_8xb8-lsj-200e_coco.py create mode 100644 mmdetection/configs/solov2/README.md create mode 100644 mmdetection/configs/solov2/metafile.yml create mode 100644 mmdetection/configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/solov2/solov2-light_r34_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/solov2/solov2-light_r50-dcn_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/solov2/solov2_r101_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/solov2/solov2_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/solov2/solov2_r50_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py create mode 100644 mmdetection/configs/sort/README.md create mode 100644 mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py create mode 100644 mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py create mode 100644 mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py create mode 100644 mmdetection/configs/sort/metafile.yml create mode 100644 mmdetection/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py create mode 100644 mmdetection/configs/sparse_rcnn/README.md create mode 100644 mmdetection/configs/sparse_rcnn/metafile.yml create mode 100644 mmdetection/configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py create mode 100644 mmdetection/configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py create mode 100644 mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py create mode 100644 mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py create mode 100644 mmdetection/configs/ssd/README.md create mode 100644 mmdetection/configs/ssd/metafile.yml create mode 100644 mmdetection/configs/ssd/ssd300_coco.py create mode 100644 mmdetection/configs/ssd/ssd512_coco.py create mode 100644 mmdetection/configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py create mode 100644 mmdetection/configs/strong_baselines/README.md create mode 100644 mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py create mode 100644 mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py create mode 100644 mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-400e_coco.py create mode 100644 mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py create mode 100644 mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py create mode 100644 mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py create mode 100644 mmdetection/configs/strong_baselines/metafile.yml create mode 100644 mmdetection/configs/strongsort/README.md create mode 100644 
mmdetection/configs/strongsort/metafile.yml create mode 100644 mmdetection/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py create mode 100644 mmdetection/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py create mode 100644 mmdetection/configs/swin/README.md create mode 100644 mmdetection/configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py create mode 100644 mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py create mode 100644 mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py create mode 100644 mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py create mode 100644 mmdetection/configs/swin/metafile.yml create mode 100644 mmdetection/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py create mode 100644 mmdetection/configs/timm_example/README.md create mode 100644 mmdetection/configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py create mode 100644 mmdetection/configs/timm_example/retinanet_timm-tv-resnet50_fpn_1x_coco.py create mode 100644 mmdetection/configs/tood/README.md create mode 100644 mmdetection/configs/tood/metafile.yml create mode 100644 mmdetection/configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/tood/tood_r101_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/tood/tood_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/tood/tood_r50_fpn_anchor-based_1x_coco.py create mode 100644 mmdetection/configs/tood/tood_r50_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/tood/tood_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/tridentnet/README.md create mode 100644 mmdetection/configs/tridentnet/metafile.yml create mode 100644 mmdetection/configs/tridentnet/tridentnet_r50-caffe_1x_coco.py create mode 100644 mmdetection/configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py create mode 100644 mmdetection/configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py create mode 100644 mmdetection/configs/v3det/README.md create mode 100644 mmdetection/configs/v3det/cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py create mode 100644 mmdetection/configs/v3det/cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py create mode 100644 mmdetection/configs/v3det/deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py create mode 100644 mmdetection/configs/v3det/deformable-detr-refine-twostage_swin_16xb2_sample1e-3_v3det_50e.py create mode 100644 mmdetection/configs/v3det/dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py create mode 100644 mmdetection/configs/v3det/dino-4scale_swin_16xb1_sample1e-3_v3det_36e.py create mode 100644 mmdetection/configs/v3det/faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py create mode 100644 mmdetection/configs/v3det/faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py create mode 100644 mmdetection/configs/v3det/fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py create mode 100644 mmdetection/configs/v3det/fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py create mode 100644 mmdetection/configs/vfnet/README.md create mode 100644 mmdetection/configs/vfnet/metafile.yml create mode 100644 
mmdetection/configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_r101_fpn_1x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_r101_fpn_2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_r50_fpn_1x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_res2net-101_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_res2net101-mdconv-c3-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_x101-32x4d_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/vfnet/vfnet_x101-64x4d_fpn_ms-2x_coco.py create mode 100644 mmdetection/configs/wider_face/README.md create mode 100644 mmdetection/configs/wider_face/retinanet_r50_fpn_1x_widerface.py create mode 100644 mmdetection/configs/wider_face/ssd300_8xb32-24e_widerface.py create mode 100644 mmdetection/configs/yolact/README.md create mode 100644 mmdetection/configs/yolact/metafile.yml create mode 100644 mmdetection/configs/yolact/yolact_r101_1xb8-55e_coco.py create mode 100644 mmdetection/configs/yolact/yolact_r50_1xb8-55e_coco.py create mode 100644 mmdetection/configs/yolact/yolact_r50_8xb8-55e_coco.py create mode 100644 mmdetection/configs/yolo/README.md create mode 100644 mmdetection/configs/yolo/metafile.yml create mode 100644 mmdetection/configs/yolo/yolov3_d53_8xb8-320-273e_coco.py create mode 100644 mmdetection/configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py create mode 100644 mmdetection/configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py create mode 100644 mmdetection/configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py create mode 100644 mmdetection/configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py create mode 100644 mmdetection/configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py create mode 100644 mmdetection/configs/yolof/README.md create mode 100644 mmdetection/configs/yolof/metafile.yml create mode 100644 mmdetection/configs/yolof/yolof_r50-c5_8xb8-1x_coco.py create mode 100644 mmdetection/configs/yolof/yolof_r50-c5_8xb8-iter-1x_coco.py create mode 100644 mmdetection/configs/yolox/README.md create mode 100644 mmdetection/configs/yolox/metafile.yml create mode 100644 mmdetection/configs/yolox/yolox_l_8xb8-300e_coco.py create mode 100644 mmdetection/configs/yolox/yolox_m_8xb8-300e_coco.py create mode 100644 mmdetection/configs/yolox/yolox_nano_8xb8-300e_coco.py create mode 100644 mmdetection/configs/yolox/yolox_s_8xb8-300e_coco.py create mode 100644 mmdetection/configs/yolox/yolox_tiny_8xb8-300e_coco.py create mode 100644 mmdetection/configs/yolox/yolox_tta.py create mode 100644 mmdetection/configs/yolox/yolox_x_8xb8-300e_coco.py create mode 100644 mmdetection/dataset-index.yml create mode 100644 mmdetection/demo/create_result_gif.py create mode 100644 mmdetection/demo/demo_multi_model.py create mode 100644 mmdetection/demo/image_demo.py create mode 100644 mmdetection/demo/large_image_demo.py create mode 100644 mmdetection/demo/mot_demo.py create mode 100644 mmdetection/demo/video_demo.py create mode 100644 mmdetection/demo/video_gpuaccel_demo.py create mode 100644 mmdetection/demo/webcam_demo.py create 
mode 100644 mmdetection/docker/Dockerfile create mode 100644 mmdetection/docker/serve/Dockerfile create mode 100644 mmdetection/docker/serve/config.properties create mode 100644 mmdetection/docker/serve_cn/Dockerfile create mode 100644 mmdetection/docs/en/Makefile create mode 100644 mmdetection/docs/en/_static/css/readthedocs.css create mode 100644 mmdetection/docs/en/advanced_guides/conventions.md create mode 100644 mmdetection/docs/en/advanced_guides/customize_dataset.md create mode 100644 mmdetection/docs/en/advanced_guides/customize_losses.md create mode 100644 mmdetection/docs/en/advanced_guides/customize_models.md create mode 100644 mmdetection/docs/en/advanced_guides/customize_runtime.md create mode 100644 mmdetection/docs/en/advanced_guides/customize_transforms.md create mode 100644 mmdetection/docs/en/advanced_guides/data_flow.md create mode 100644 mmdetection/docs/en/advanced_guides/datasets.md create mode 100644 mmdetection/docs/en/advanced_guides/engine.md create mode 100644 mmdetection/docs/en/advanced_guides/evaluation.md create mode 100644 mmdetection/docs/en/advanced_guides/how_to.md create mode 100644 mmdetection/docs/en/advanced_guides/index.rst create mode 100644 mmdetection/docs/en/advanced_guides/models.md create mode 100644 mmdetection/docs/en/advanced_guides/structures.md create mode 100644 mmdetection/docs/en/advanced_guides/transforms.md create mode 100644 mmdetection/docs/en/api.rst create mode 100644 mmdetection/docs/en/conf.py create mode 100644 mmdetection/docs/en/dataset_zoo.md create mode 100644 mmdetection/docs/en/get_started.md create mode 100644 mmdetection/docs/en/index.rst create mode 100644 mmdetection/docs/en/make.bat create mode 100644 mmdetection/docs/en/migration.md create mode 100644 mmdetection/docs/en/migration/api_and_registry_migration.md create mode 100644 mmdetection/docs/en/migration/config_migration.md create mode 100644 mmdetection/docs/en/migration/dataset_migration.md create mode 100644 mmdetection/docs/en/migration/migration.md create mode 100644 mmdetection/docs/en/migration/migration_faq.md create mode 100644 mmdetection/docs/en/migration/model_migration.md create mode 100644 mmdetection/docs/en/model_zoo.md create mode 100644 mmdetection/docs/en/notes/changelog.md create mode 100644 mmdetection/docs/en/notes/changelog_v2.x.md create mode 100644 mmdetection/docs/en/notes/compatibility.md create mode 100644 mmdetection/docs/en/notes/contribution_guide.md create mode 100644 mmdetection/docs/en/notes/faq.md create mode 100644 mmdetection/docs/en/notes/projects.md create mode 100644 mmdetection/docs/en/overview.md create mode 100755 mmdetection/docs/en/stat.py create mode 100644 mmdetection/docs/en/switch_language.md create mode 100644 mmdetection/docs/en/user_guides/config.md create mode 100644 mmdetection/docs/en/user_guides/dataset_prepare.md create mode 100644 mmdetection/docs/en/user_guides/deploy.md create mode 100644 mmdetection/docs/en/user_guides/finetune.md create mode 100644 mmdetection/docs/en/user_guides/index.rst create mode 100644 mmdetection/docs/en/user_guides/inference.md create mode 100644 mmdetection/docs/en/user_guides/init_cfg.md create mode 100644 mmdetection/docs/en/user_guides/label_studio.md create mode 100644 mmdetection/docs/en/user_guides/new_model.md create mode 100644 mmdetection/docs/en/user_guides/robustness_benchmarking.md create mode 100644 mmdetection/docs/en/user_guides/semi_det.md create mode 100644 mmdetection/docs/en/user_guides/single_stage_as_rpn.md create mode 100644 
mmdetection/docs/en/user_guides/test.md create mode 100644 mmdetection/docs/en/user_guides/test_results_submission.md create mode 100644 mmdetection/docs/en/user_guides/tracking_analysis_tools.md create mode 100644 mmdetection/docs/en/user_guides/tracking_config.md create mode 100644 mmdetection/docs/en/user_guides/tracking_dataset_prepare.md create mode 100644 mmdetection/docs/en/user_guides/tracking_inference.md create mode 100644 mmdetection/docs/en/user_guides/tracking_train_test.md create mode 100644 mmdetection/docs/en/user_guides/tracking_visualization.md create mode 100644 mmdetection/docs/en/user_guides/train.md create mode 100644 mmdetection/docs/en/user_guides/useful_hooks.md create mode 100644 mmdetection/docs/en/user_guides/useful_tools.md create mode 100644 mmdetection/docs/en/user_guides/visualization.md create mode 100644 mmdetection/docs/zh_cn/Makefile create mode 100644 mmdetection/docs/zh_cn/_static/css/readthedocs.css create mode 100644 mmdetection/docs/zh_cn/advanced_guides/conventions.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/customize_dataset.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/customize_losses.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/customize_models.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/customize_runtime.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/customize_transforms.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/data_flow.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/datasets.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/engine.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/evaluation.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/how_to.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/index.rst create mode 100644 mmdetection/docs/zh_cn/advanced_guides/models.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/structures.md create mode 100644 mmdetection/docs/zh_cn/advanced_guides/transforms.md create mode 100644 mmdetection/docs/zh_cn/api.rst create mode 100644 mmdetection/docs/zh_cn/article.md create mode 100644 mmdetection/docs/zh_cn/conf.py create mode 100644 mmdetection/docs/zh_cn/get_started.md create mode 100644 mmdetection/docs/zh_cn/index.rst create mode 100644 mmdetection/docs/zh_cn/make.bat create mode 100644 mmdetection/docs/zh_cn/migration/api_and_registry_migration.md create mode 100644 mmdetection/docs/zh_cn/migration/config_migration.md create mode 100644 mmdetection/docs/zh_cn/migration/dataset_migration.md create mode 100644 mmdetection/docs/zh_cn/migration/migration.md create mode 100644 mmdetection/docs/zh_cn/migration/migration_faq.md create mode 100644 mmdetection/docs/zh_cn/migration/model_migration.md create mode 100644 mmdetection/docs/zh_cn/model_zoo.md create mode 100644 mmdetection/docs/zh_cn/notes/compatibility.md create mode 100644 mmdetection/docs/zh_cn/notes/faq.md create mode 100644 mmdetection/docs/zh_cn/notes/projects.md create mode 100644 mmdetection/docs/zh_cn/overview.md create mode 100755 mmdetection/docs/zh_cn/stat.py create mode 100644 mmdetection/docs/zh_cn/switch_language.md create mode 100644 mmdetection/docs/zh_cn/user_guides/config.md create mode 100644 mmdetection/docs/zh_cn/user_guides/dataset_prepare.md create mode 100644 mmdetection/docs/zh_cn/user_guides/deploy.md create mode 100644 mmdetection/docs/zh_cn/user_guides/finetune.md create mode 100644 mmdetection/docs/zh_cn/user_guides/index.rst create mode 100644 
mmdetection/docs/zh_cn/user_guides/inference.md create mode 100644 mmdetection/docs/zh_cn/user_guides/init_cfg.md create mode 100644 mmdetection/docs/zh_cn/user_guides/label_studio.md create mode 100644 mmdetection/docs/zh_cn/user_guides/new_model.md create mode 100644 mmdetection/docs/zh_cn/user_guides/robustness_benchmarking.md create mode 100644 mmdetection/docs/zh_cn/user_guides/semi_det.md create mode 100644 mmdetection/docs/zh_cn/user_guides/single_stage_as_rpn.md create mode 100644 mmdetection/docs/zh_cn/user_guides/test.md create mode 100644 mmdetection/docs/zh_cn/user_guides/test_results_submission.md create mode 100644 mmdetection/docs/zh_cn/user_guides/tracking_analysis_tools.md create mode 100644 mmdetection/docs/zh_cn/user_guides/tracking_config.md create mode 100644 mmdetection/docs/zh_cn/user_guides/tracking_dataset_prepare.md create mode 100644 mmdetection/docs/zh_cn/user_guides/tracking_interference.md create mode 100644 mmdetection/docs/zh_cn/user_guides/tracking_train_test_zh_cn.md create mode 100644 mmdetection/docs/zh_cn/user_guides/tracking_visualization.md create mode 100644 mmdetection/docs/zh_cn/user_guides/train.md create mode 100644 mmdetection/docs/zh_cn/user_guides/useful_hooks.md create mode 100644 mmdetection/docs/zh_cn/user_guides/useful_tools.md create mode 100644 mmdetection/docs/zh_cn/user_guides/visualization.md create mode 100644 mmdetection/mmdet/__init__.py create mode 100644 mmdetection/mmdet/apis/__init__.py create mode 100644 mmdetection/mmdet/apis/det_inferencer.py create mode 100644 mmdetection/mmdet/apis/inference.py create mode 100644 mmdetection/mmdet/configs/_base_/datasets/coco_detection.py create mode 100644 mmdetection/mmdet/configs/_base_/datasets/coco_instance.py create mode 100644 mmdetection/mmdet/configs/_base_/datasets/coco_instance_semantic.py create mode 100644 mmdetection/mmdet/configs/_base_/datasets/coco_panoptic.py create mode 100644 mmdetection/mmdet/configs/_base_/datasets/mot_challenge.py create mode 100644 mmdetection/mmdet/configs/_base_/default_runtime.py create mode 100644 mmdetection/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py create mode 100644 mmdetection/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py create mode 100644 mmdetection/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py create mode 100644 mmdetection/mmdet/configs/_base_/models/mask_rcnn_r50_caffe_c4.py create mode 100644 mmdetection/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py create mode 100644 mmdetection/mmdet/configs/_base_/models/retinanet_r50_fpn.py create mode 100644 mmdetection/mmdet/configs/_base_/schedules/schedule_1x.py create mode 100644 mmdetection/mmdet/configs/_base_/schedules/schedule_2x.py create mode 100644 mmdetection/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/common/lsj_100e_coco_detection.py create mode 100644 mmdetection/mmdet/configs/common/lsj_100e_coco_instance.py create mode 100644 mmdetection/mmdet/configs/common/lsj_200e_coco_detection.py create mode 100644 mmdetection/mmdet/configs/common/lsj_200e_coco_instance.py create mode 100644 mmdetection/mmdet/configs/common/ms_3x_coco.py create mode 100644 mmdetection/mmdet/configs/common/ms_3x_coco_instance.py create mode 100644 mmdetection/mmdet/configs/common/ms_90k_coco.py create mode 100644 mmdetection/mmdet/configs/common/ms_poly_3x_coco_instance.py create mode 100644 
mmdetection/mmdet/configs/common/ms_poly_90k_coco_instance.py create mode 100644 mmdetection/mmdet/configs/common/ssj_270_coco_instance.py create mode 100644 mmdetection/mmdet/configs/common/ssj_scp_270k_coco_instance.py create mode 100644 mmdetection/mmdet/configs/deformable_detr/deformable_detr_r50_16xb2_50e_coco.py create mode 100644 mmdetection/mmdet/configs/deformable_detr/deformable_detr_refine_r50_16xb2_50e_coco.py create mode 100644 mmdetection/mmdet/configs/deformable_detr/deformable_detr_refine_twostage_r50_16xb2_50e_coco.py create mode 100644 mmdetection/mmdet/configs/detr/detr_r101_8xb2_500e_coco.py create mode 100644 mmdetection/mmdet/configs/detr/detr_r18_8xb2_500e_coco.py create mode 100644 mmdetection/mmdet/configs/detr/detr_r50_8xb2_150e_coco.py create mode 100644 mmdetection/mmdet/configs/detr/detr_r50_8xb2_500e_coco.py create mode 100644 mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_12e_coco.py create mode 100644 mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_24e_coco.py create mode 100644 mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_36e_coco.py create mode 100644 mmdetection/mmdet/configs/dino/dino_4scale_r50_improved_8xb2_12e_coco.py create mode 100644 mmdetection/mmdet/configs/dino/dino_5scale_swin_l_8xb2_12e_coco.py create mode 100644 mmdetection/mmdet/configs/dino/dino_5scale_swin_l_8xb2_36e_coco.py create mode 100644 mmdetection/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_ms_poly_3x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_8xb8_amp_lsj_200e_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_ms_poly_3x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_2x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_3x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_amp_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_ms_poly_-3x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py create mode 100644 
mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_ms_poly_3x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_3x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64_4d_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py create mode 100644 mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_ms_poly_3x_coco.py create mode 100644 mmdetection/mmdet/configs/maskformer/maskformer_r50_ms_16xb1_75e_coco.py create mode 100644 mmdetection/mmdet/configs/maskformer/maskformer_swin_l_p4_w12_64xb1_ms_300e_coco.py create mode 100644 mmdetection/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_4e_base.py create mode 100644 mmdetection/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py create mode 100644 mmdetection/mmdet/configs/retinanet/retinanet_r50_fpn_1x_coco.py create mode 100644 mmdetection/mmdet/configs/retinanet/retinanet_tta.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_ins_l_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_ins_m_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_ins_s_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_ins_tiny_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_ins_x_8xb16_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_m_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_tiny_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_tta.py create mode 100644 mmdetection/mmdet/configs/rtmdet/rtmdet_x_8xb32_300e_coco.py create mode 100644 mmdetection/mmdet/datasets/__init__.py create mode 100644 mmdetection/mmdet/datasets/ade20k.py create mode 100644 mmdetection/mmdet/datasets/api_wrappers/__init__.py create mode 100644 mmdetection/mmdet/datasets/api_wrappers/coco_api.py create mode 100644 mmdetection/mmdet/datasets/api_wrappers/cocoeval_mp.py create mode 100644 mmdetection/mmdet/datasets/base_det_dataset.py create mode 100644 mmdetection/mmdet/datasets/base_semseg_dataset.py create mode 100644 mmdetection/mmdet/datasets/base_video_dataset.py create mode 100644 mmdetection/mmdet/datasets/cityscapes.py create mode 100644 mmdetection/mmdet/datasets/coco.py create mode 100644 mmdetection/mmdet/datasets/coco_caption.py create mode 100644 mmdetection/mmdet/datasets/coco_panoptic.py create mode 100644 mmdetection/mmdet/datasets/coco_semantic.py create mode 100644 mmdetection/mmdet/datasets/crowdhuman.py create mode 100644 mmdetection/mmdet/datasets/dataset_wrappers.py create mode 100644 mmdetection/mmdet/datasets/deepfashion.py create mode 100644 mmdetection/mmdet/datasets/dsdl.py create mode 100644 mmdetection/mmdet/datasets/isaid.py create mode 100644 mmdetection/mmdet/datasets/lvis.py create mode 100644 mmdetection/mmdet/datasets/mot_challenge_dataset.py create 
mode 100644 mmdetection/mmdet/datasets/objects365.py create mode 100644 mmdetection/mmdet/datasets/openimages.py create mode 100644 mmdetection/mmdet/datasets/recycle.py create mode 100644 mmdetection/mmdet/datasets/refcoco.py create mode 100644 mmdetection/mmdet/datasets/reid_dataset.py create mode 100644 mmdetection/mmdet/datasets/samplers/__init__.py create mode 100644 mmdetection/mmdet/datasets/samplers/batch_sampler.py create mode 100644 mmdetection/mmdet/datasets/samplers/class_aware_sampler.py create mode 100644 mmdetection/mmdet/datasets/samplers/multi_data_sampler.py create mode 100644 mmdetection/mmdet/datasets/samplers/multi_source_sampler.py create mode 100644 mmdetection/mmdet/datasets/samplers/track_img_sampler.py create mode 100644 mmdetection/mmdet/datasets/transforms/__init__.py create mode 100644 mmdetection/mmdet/datasets/transforms/augment_wrappers.py create mode 100644 mmdetection/mmdet/datasets/transforms/colorspace.py create mode 100644 mmdetection/mmdet/datasets/transforms/formatting.py create mode 100644 mmdetection/mmdet/datasets/transforms/frame_sampling.py create mode 100644 mmdetection/mmdet/datasets/transforms/geometric.py create mode 100644 mmdetection/mmdet/datasets/transforms/instaboost.py create mode 100644 mmdetection/mmdet/datasets/transforms/loading.py create mode 100644 mmdetection/mmdet/datasets/transforms/transformers_glip.py create mode 100644 mmdetection/mmdet/datasets/transforms/transforms.py create mode 100644 mmdetection/mmdet/datasets/transforms/wrappers.py create mode 100644 mmdetection/mmdet/datasets/utils.py create mode 100644 mmdetection/mmdet/datasets/v3det.py create mode 100644 mmdetection/mmdet/datasets/voc.py create mode 100644 mmdetection/mmdet/datasets/wider_face.py create mode 100644 mmdetection/mmdet/datasets/xml_style.py create mode 100644 mmdetection/mmdet/datasets/youtube_vis_dataset.py create mode 100644 mmdetection/mmdet/engine/__init__.py create mode 100644 mmdetection/mmdet/engine/hooks/__init__.py create mode 100644 mmdetection/mmdet/engine/hooks/checkloss_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/mean_teacher_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/memory_profiler_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/num_class_check_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/pipeline_switch_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/set_epoch_info_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/submission_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/sync_norm_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/utils.py create mode 100644 mmdetection/mmdet/engine/hooks/visualization_hook.py create mode 100644 mmdetection/mmdet/engine/hooks/yolox_mode_switch_hook.py create mode 100644 mmdetection/mmdet/engine/optimizers/__init__.py create mode 100644 mmdetection/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py create mode 100644 mmdetection/mmdet/engine/runner/__init__.py create mode 100644 mmdetection/mmdet/engine/runner/loops.py create mode 100644 mmdetection/mmdet/engine/schedulers/__init__.py create mode 100644 mmdetection/mmdet/engine/schedulers/quadratic_warmup.py create mode 100644 mmdetection/mmdet/evaluation/__init__.py create mode 100644 mmdetection/mmdet/evaluation/functional/__init__.py create mode 100644 mmdetection/mmdet/evaluation/functional/bbox_overlaps.py create mode 100644 mmdetection/mmdet/evaluation/functional/cityscapes_utils.py create mode 100644 
mmdetection/mmdet/evaluation/functional/class_names.py create mode 100644 mmdetection/mmdet/evaluation/functional/mean_ap.py create mode 100644 mmdetection/mmdet/evaluation/functional/panoptic_utils.py create mode 100644 mmdetection/mmdet/evaluation/functional/recall.py create mode 100644 mmdetection/mmdet/evaluation/functional/ytvis.py create mode 100644 mmdetection/mmdet/evaluation/functional/ytviseval.py create mode 100644 mmdetection/mmdet/evaluation/metrics/__init__.py create mode 100644 mmdetection/mmdet/evaluation/metrics/base_video_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/cityscapes_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/coco_caption_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/coco_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/coco_occluded_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/coco_panoptic_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/coco_video_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/crowdhuman_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/dump_det_results.py create mode 100644 mmdetection/mmdet/evaluation/metrics/dump_proposals_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/lvis_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/mot_challenge_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/openimages_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/refseg_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/reid_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/semseg_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/voc_metric.py create mode 100644 mmdetection/mmdet/evaluation/metrics/youtube_vis_metric.py create mode 100644 mmdetection/mmdet/models/__init__.py create mode 100644 mmdetection/mmdet/models/backbones/__init__.py create mode 100644 mmdetection/mmdet/models/backbones/csp_darknet.py create mode 100644 mmdetection/mmdet/models/backbones/cspnext.py create mode 100644 mmdetection/mmdet/models/backbones/darknet.py create mode 100644 mmdetection/mmdet/models/backbones/detectors_resnet.py create mode 100644 mmdetection/mmdet/models/backbones/detectors_resnext.py create mode 100644 mmdetection/mmdet/models/backbones/efficientnet.py create mode 100644 mmdetection/mmdet/models/backbones/hourglass.py create mode 100644 mmdetection/mmdet/models/backbones/hrnet.py create mode 100644 mmdetection/mmdet/models/backbones/mobilenet_v2.py create mode 100644 mmdetection/mmdet/models/backbones/pvt.py create mode 100644 mmdetection/mmdet/models/backbones/regnet.py create mode 100644 mmdetection/mmdet/models/backbones/res2net.py create mode 100644 mmdetection/mmdet/models/backbones/resnest.py create mode 100644 mmdetection/mmdet/models/backbones/resnet.py create mode 100644 mmdetection/mmdet/models/backbones/resnext.py create mode 100644 mmdetection/mmdet/models/backbones/ssd_vgg.py create mode 100644 mmdetection/mmdet/models/backbones/swin.py create mode 100644 mmdetection/mmdet/models/backbones/trident_resnet.py create mode 100644 mmdetection/mmdet/models/data_preprocessors/__init__.py create mode 100644 mmdetection/mmdet/models/data_preprocessors/data_preprocessor.py create mode 100644 mmdetection/mmdet/models/data_preprocessors/reid_data_preprocessor.py create mode 100644 mmdetection/mmdet/models/data_preprocessors/track_data_preprocessor.py create mode 100644 
mmdetection/mmdet/models/dense_heads/__init__.py create mode 100644 mmdetection/mmdet/models/dense_heads/anchor_free_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/anchor_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/atss_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/atss_vlfusion_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/autoassign_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/base_dense_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/base_mask_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/boxinst_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/cascade_rpn_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/centernet_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/centernet_update_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/centripetal_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/condinst_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/conditional_detr_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/corner_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/dab_detr_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/ddod_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/ddq_detr_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/deformable_detr_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/dense_test_mixins.py create mode 100644 mmdetection/mmdet/models/dense_heads/detr_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/dino_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/embedding_rpn_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/fcos_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/fovea_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/free_anchor_retina_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/fsaf_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/ga_retina_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/ga_rpn_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/gfl_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/grounding_dino_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/guided_anchor_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/lad_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/ld_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/mask2former_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/maskformer_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/nasfcos_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/paa_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/pisa_retinanet_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/pisa_ssd_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/reppoints_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/retina_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/retina_sepbn_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/rpn_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/rtmdet_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/rtmdet_ins_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/sabl_retina_head.py create mode 100644 
mmdetection/mmdet/models/dense_heads/solo_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/solov2_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/ssd_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/tood_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/vfnet_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/yolact_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/yolo_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/yolof_head.py create mode 100644 mmdetection/mmdet/models/dense_heads/yolox_head.py create mode 100644 mmdetection/mmdet/models/detectors/__init__.py create mode 100644 mmdetection/mmdet/models/detectors/atss.py create mode 100644 mmdetection/mmdet/models/detectors/autoassign.py create mode 100644 mmdetection/mmdet/models/detectors/base.py create mode 100644 mmdetection/mmdet/models/detectors/base_detr.py create mode 100644 mmdetection/mmdet/models/detectors/boxinst.py create mode 100644 mmdetection/mmdet/models/detectors/cascade_rcnn.py create mode 100644 mmdetection/mmdet/models/detectors/centernet.py create mode 100644 mmdetection/mmdet/models/detectors/condinst.py create mode 100644 mmdetection/mmdet/models/detectors/conditional_detr.py create mode 100644 mmdetection/mmdet/models/detectors/cornernet.py create mode 100644 mmdetection/mmdet/models/detectors/crowddet.py create mode 100644 mmdetection/mmdet/models/detectors/d2_wrapper.py create mode 100644 mmdetection/mmdet/models/detectors/dab_detr.py create mode 100644 mmdetection/mmdet/models/detectors/ddod.py create mode 100644 mmdetection/mmdet/models/detectors/ddq_detr.py create mode 100644 mmdetection/mmdet/models/detectors/deformable_detr.py create mode 100644 mmdetection/mmdet/models/detectors/detr.py create mode 100644 mmdetection/mmdet/models/detectors/dino.py create mode 100644 mmdetection/mmdet/models/detectors/fast_rcnn.py create mode 100644 mmdetection/mmdet/models/detectors/faster_rcnn.py create mode 100644 mmdetection/mmdet/models/detectors/fcos.py create mode 100644 mmdetection/mmdet/models/detectors/fovea.py create mode 100644 mmdetection/mmdet/models/detectors/fsaf.py create mode 100644 mmdetection/mmdet/models/detectors/gfl.py create mode 100644 mmdetection/mmdet/models/detectors/glip.py create mode 100644 mmdetection/mmdet/models/detectors/grid_rcnn.py create mode 100644 mmdetection/mmdet/models/detectors/grounding_dino.py create mode 100644 mmdetection/mmdet/models/detectors/htc.py create mode 100644 mmdetection/mmdet/models/detectors/kd_one_stage.py create mode 100644 mmdetection/mmdet/models/detectors/lad.py create mode 100644 mmdetection/mmdet/models/detectors/mask2former.py create mode 100644 mmdetection/mmdet/models/detectors/mask_rcnn.py create mode 100644 mmdetection/mmdet/models/detectors/mask_scoring_rcnn.py create mode 100644 mmdetection/mmdet/models/detectors/maskformer.py create mode 100644 mmdetection/mmdet/models/detectors/nasfcos.py create mode 100644 mmdetection/mmdet/models/detectors/paa.py create mode 100644 mmdetection/mmdet/models/detectors/panoptic_fpn.py create mode 100644 mmdetection/mmdet/models/detectors/panoptic_two_stage_segmentor.py create mode 100644 mmdetection/mmdet/models/detectors/point_rend.py create mode 100644 mmdetection/mmdet/models/detectors/queryinst.py create mode 100644 mmdetection/mmdet/models/detectors/reppoints_detector.py create mode 100644 mmdetection/mmdet/models/detectors/retinanet.py create mode 100644 mmdetection/mmdet/models/detectors/rpn.py create 
mode 100644 mmdetection/mmdet/models/detectors/rtmdet.py create mode 100644 mmdetection/mmdet/models/detectors/scnet.py create mode 100644 mmdetection/mmdet/models/detectors/semi_base.py create mode 100644 mmdetection/mmdet/models/detectors/single_stage.py create mode 100644 mmdetection/mmdet/models/detectors/single_stage_instance_seg.py create mode 100644 mmdetection/mmdet/models/detectors/soft_teacher.py create mode 100644 mmdetection/mmdet/models/detectors/solo.py create mode 100644 mmdetection/mmdet/models/detectors/solov2.py create mode 100644 mmdetection/mmdet/models/detectors/sparse_rcnn.py create mode 100644 mmdetection/mmdet/models/detectors/tood.py create mode 100644 mmdetection/mmdet/models/detectors/trident_faster_rcnn.py create mode 100644 mmdetection/mmdet/models/detectors/two_stage.py create mode 100644 mmdetection/mmdet/models/detectors/vfnet.py create mode 100644 mmdetection/mmdet/models/detectors/yolact.py create mode 100644 mmdetection/mmdet/models/detectors/yolo.py create mode 100644 mmdetection/mmdet/models/detectors/yolof.py create mode 100644 mmdetection/mmdet/models/detectors/yolox.py create mode 100644 mmdetection/mmdet/models/language_models/__init__.py create mode 100644 mmdetection/mmdet/models/language_models/bert.py create mode 100644 mmdetection/mmdet/models/layers/__init__.py create mode 100644 mmdetection/mmdet/models/layers/activations.py create mode 100644 mmdetection/mmdet/models/layers/bbox_nms.py create mode 100644 mmdetection/mmdet/models/layers/brick_wrappers.py create mode 100644 mmdetection/mmdet/models/layers/conv_upsample.py create mode 100644 mmdetection/mmdet/models/layers/csp_layer.py create mode 100644 mmdetection/mmdet/models/layers/dropblock.py create mode 100644 mmdetection/mmdet/models/layers/ema.py create mode 100644 mmdetection/mmdet/models/layers/inverted_residual.py create mode 100644 mmdetection/mmdet/models/layers/matrix_nms.py create mode 100644 mmdetection/mmdet/models/layers/msdeformattn_pixel_decoder.py create mode 100644 mmdetection/mmdet/models/layers/normed_predictor.py create mode 100644 mmdetection/mmdet/models/layers/pixel_decoder.py create mode 100644 mmdetection/mmdet/models/layers/positional_encoding.py create mode 100644 mmdetection/mmdet/models/layers/res_layer.py create mode 100644 mmdetection/mmdet/models/layers/se_layer.py create mode 100644 mmdetection/mmdet/models/layers/transformer/__init__.py create mode 100644 mmdetection/mmdet/models/layers/transformer/conditional_detr_layers.py create mode 100644 mmdetection/mmdet/models/layers/transformer/dab_detr_layers.py create mode 100644 mmdetection/mmdet/models/layers/transformer/ddq_detr_layers.py create mode 100644 mmdetection/mmdet/models/layers/transformer/deformable_detr_layers.py create mode 100644 mmdetection/mmdet/models/layers/transformer/detr_layers.py create mode 100644 mmdetection/mmdet/models/layers/transformer/dino_layers.py create mode 100644 mmdetection/mmdet/models/layers/transformer/grounding_dino_layers.py create mode 100644 mmdetection/mmdet/models/layers/transformer/mask2former_layers.py create mode 100644 mmdetection/mmdet/models/layers/transformer/utils.py create mode 100644 mmdetection/mmdet/models/losses/__init__.py create mode 100644 mmdetection/mmdet/models/losses/accuracy.py create mode 100644 mmdetection/mmdet/models/losses/ae_loss.py create mode 100644 mmdetection/mmdet/models/losses/balanced_l1_loss.py create mode 100644 mmdetection/mmdet/models/losses/cross_entropy_loss.py create mode 100644 
mmdetection/mmdet/models/losses/ddq_detr_aux_loss.py create mode 100644 mmdetection/mmdet/models/losses/dice_loss.py create mode 100644 mmdetection/mmdet/models/losses/eqlv2_loss.py create mode 100644 mmdetection/mmdet/models/losses/focal_loss.py create mode 100644 mmdetection/mmdet/models/losses/gaussian_focal_loss.py create mode 100644 mmdetection/mmdet/models/losses/gfocal_loss.py create mode 100644 mmdetection/mmdet/models/losses/ghm_loss.py create mode 100644 mmdetection/mmdet/models/losses/iou_loss.py create mode 100644 mmdetection/mmdet/models/losses/kd_loss.py create mode 100644 mmdetection/mmdet/models/losses/l2_loss.py create mode 100644 mmdetection/mmdet/models/losses/margin_loss.py create mode 100644 mmdetection/mmdet/models/losses/mse_loss.py create mode 100644 mmdetection/mmdet/models/losses/multipos_cross_entropy_loss.py create mode 100644 mmdetection/mmdet/models/losses/pisa_loss.py create mode 100644 mmdetection/mmdet/models/losses/seesaw_loss.py create mode 100644 mmdetection/mmdet/models/losses/smooth_l1_loss.py create mode 100644 mmdetection/mmdet/models/losses/triplet_loss.py create mode 100644 mmdetection/mmdet/models/losses/utils.py create mode 100644 mmdetection/mmdet/models/losses/varifocal_loss.py create mode 100644 mmdetection/mmdet/models/mot/__init__.py create mode 100644 mmdetection/mmdet/models/mot/base.py create mode 100644 mmdetection/mmdet/models/mot/bytetrack.py create mode 100644 mmdetection/mmdet/models/mot/deep_sort.py create mode 100644 mmdetection/mmdet/models/mot/ocsort.py create mode 100644 mmdetection/mmdet/models/mot/qdtrack.py create mode 100644 mmdetection/mmdet/models/mot/strongsort.py create mode 100644 mmdetection/mmdet/models/necks/__init__.py create mode 100644 mmdetection/mmdet/models/necks/bfp.py create mode 100644 mmdetection/mmdet/models/necks/channel_mapper.py create mode 100644 mmdetection/mmdet/models/necks/cspnext_pafpn.py create mode 100644 mmdetection/mmdet/models/necks/ct_resnet_neck.py create mode 100644 mmdetection/mmdet/models/necks/dilated_encoder.py create mode 100644 mmdetection/mmdet/models/necks/dyhead.py create mode 100644 mmdetection/mmdet/models/necks/fpg.py create mode 100644 mmdetection/mmdet/models/necks/fpn.py create mode 100644 mmdetection/mmdet/models/necks/fpn_carafe.py create mode 100644 mmdetection/mmdet/models/necks/fpn_dropblock.py create mode 100644 mmdetection/mmdet/models/necks/hrfpn.py create mode 100644 mmdetection/mmdet/models/necks/nas_fpn.py create mode 100644 mmdetection/mmdet/models/necks/nasfcos_fpn.py create mode 100644 mmdetection/mmdet/models/necks/pafpn.py create mode 100644 mmdetection/mmdet/models/necks/rfp.py create mode 100644 mmdetection/mmdet/models/necks/ssd_neck.py create mode 100644 mmdetection/mmdet/models/necks/ssh.py create mode 100644 mmdetection/mmdet/models/necks/yolo_neck.py create mode 100644 mmdetection/mmdet/models/necks/yolox_pafpn.py create mode 100644 mmdetection/mmdet/models/reid/__init__.py create mode 100644 mmdetection/mmdet/models/reid/base_reid.py create mode 100644 mmdetection/mmdet/models/reid/fc_module.py create mode 100644 mmdetection/mmdet/models/reid/gap.py create mode 100644 mmdetection/mmdet/models/reid/linear_reid_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/__init__.py create mode 100644 mmdetection/mmdet/models/roi_heads/base_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/bbox_heads/__init__.py create mode 100644 mmdetection/mmdet/models/roi_heads/bbox_heads/bbox_head.py create mode 100644 
mmdetection/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/bbox_heads/dii_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/bbox_heads/sabl_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/cascade_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/double_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/dynamic_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/grid_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/htc_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/__init__.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/feature_relay_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/global_context_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/grid_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/htc_mask_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/mask_point_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/maskiou_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/mask_scoring_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/multi_instance_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/pisa_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/point_rend_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/roi_extractors/__init__.py create mode 100644 mmdetection/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py create mode 100644 mmdetection/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py create mode 100644 mmdetection/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py create mode 100644 mmdetection/mmdet/models/roi_heads/scnet_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/shared_heads/__init__.py create mode 100644 mmdetection/mmdet/models/roi_heads/shared_heads/res_layer.py create mode 100644 mmdetection/mmdet/models/roi_heads/sparse_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/standard_roi_head.py create mode 100644 mmdetection/mmdet/models/roi_heads/test_mixins.py create mode 100644 mmdetection/mmdet/models/roi_heads/trident_roi_head.py create mode 100644 mmdetection/mmdet/models/seg_heads/__init__.py create mode 100644 mmdetection/mmdet/models/seg_heads/base_semantic_head.py create mode 100644 mmdetection/mmdet/models/seg_heads/panoptic_fpn_head.py create mode 100644 mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py create mode 100644 mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py create mode 
100644 mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py create mode 100644 mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py create mode 100644 mmdetection/mmdet/models/task_modules/__init__.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/__init__.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/assign_result.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/atss_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/base_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/center_region_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/grid_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/hungarian_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/iou2d_calculator.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/match_cost.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/max_iou_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/multi_instance_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/point_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/region_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/sim_ota_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/task_aligned_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/assigners/uniform_assigner.py create mode 100644 mmdetection/mmdet/models/task_modules/builder.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/__init__.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/base_bbox_coder.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/bucketing_bbox_coder.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/distance_point_bbox_coder.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/pseudo_bbox_coder.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/tblr_bbox_coder.py create mode 100644 mmdetection/mmdet/models/task_modules/coders/yolo_bbox_coder.py create mode 100644 mmdetection/mmdet/models/task_modules/prior_generators/__init__.py create mode 100644 mmdetection/mmdet/models/task_modules/prior_generators/anchor_generator.py create mode 100644 mmdetection/mmdet/models/task_modules/prior_generators/point_generator.py create mode 100644 mmdetection/mmdet/models/task_modules/prior_generators/utils.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/__init__.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/base_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/combined_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py create mode 100644 
mmdetection/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/mask_sampling_result.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/ohem_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/pseudo_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/random_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/sampling_result.py create mode 100644 mmdetection/mmdet/models/task_modules/samplers/score_hlr_sampler.py create mode 100644 mmdetection/mmdet/models/task_modules/tracking/__init__.py create mode 100644 mmdetection/mmdet/models/task_modules/tracking/aflink.py create mode 100644 mmdetection/mmdet/models/task_modules/tracking/camera_motion_compensation.py create mode 100644 mmdetection/mmdet/models/task_modules/tracking/interpolation.py create mode 100644 mmdetection/mmdet/models/task_modules/tracking/kalman_filter.py create mode 100644 mmdetection/mmdet/models/task_modules/tracking/similarity.py create mode 100644 mmdetection/mmdet/models/test_time_augs/__init__.py create mode 100644 mmdetection/mmdet/models/test_time_augs/det_tta.py create mode 100644 mmdetection/mmdet/models/test_time_augs/merge_augs.py create mode 100644 mmdetection/mmdet/models/trackers/__init__.py create mode 100644 mmdetection/mmdet/models/trackers/base_tracker.py create mode 100644 mmdetection/mmdet/models/trackers/byte_tracker.py create mode 100644 mmdetection/mmdet/models/trackers/masktrack_rcnn_tracker.py create mode 100644 mmdetection/mmdet/models/trackers/ocsort_tracker.py create mode 100644 mmdetection/mmdet/models/trackers/quasi_dense_tracker.py create mode 100644 mmdetection/mmdet/models/trackers/sort_tracker.py create mode 100644 mmdetection/mmdet/models/trackers/strongsort_tracker.py create mode 100644 mmdetection/mmdet/models/tracking_heads/__init__.py create mode 100644 mmdetection/mmdet/models/tracking_heads/mask2former_track_head.py create mode 100644 mmdetection/mmdet/models/tracking_heads/quasi_dense_embed_head.py create mode 100644 mmdetection/mmdet/models/tracking_heads/quasi_dense_track_head.py create mode 100644 mmdetection/mmdet/models/tracking_heads/roi_embed_head.py create mode 100644 mmdetection/mmdet/models/tracking_heads/roi_track_head.py create mode 100644 mmdetection/mmdet/models/utils/__init__.py create mode 100644 mmdetection/mmdet/models/utils/gaussian_target.py create mode 100644 mmdetection/mmdet/models/utils/image.py create mode 100644 mmdetection/mmdet/models/utils/make_divisible.py create mode 100644 mmdetection/mmdet/models/utils/misc.py create mode 100644 mmdetection/mmdet/models/utils/panoptic_gt_processing.py create mode 100644 mmdetection/mmdet/models/utils/point_sample.py create mode 100644 mmdetection/mmdet/models/utils/vlfuse_helper.py create mode 100644 mmdetection/mmdet/models/utils/wbf.py create mode 100644 mmdetection/mmdet/models/vis/__init__.py create mode 100644 mmdetection/mmdet/models/vis/mask2former_vis.py create mode 100644 mmdetection/mmdet/models/vis/masktrack_rcnn.py create mode 100644 mmdetection/mmdet/registry.py create mode 100644 mmdetection/mmdet/structures/__init__.py create mode 100644 
mmdetection/mmdet/structures/bbox/__init__.py create mode 100644 mmdetection/mmdet/structures/bbox/base_boxes.py create mode 100644 mmdetection/mmdet/structures/bbox/bbox_overlaps.py create mode 100644 mmdetection/mmdet/structures/bbox/box_type.py create mode 100644 mmdetection/mmdet/structures/bbox/horizontal_boxes.py create mode 100644 mmdetection/mmdet/structures/bbox/transforms.py create mode 100644 mmdetection/mmdet/structures/det_data_sample.py create mode 100644 mmdetection/mmdet/structures/mask/__init__.py create mode 100644 mmdetection/mmdet/structures/mask/mask_target.py create mode 100644 mmdetection/mmdet/structures/mask/structures.py create mode 100644 mmdetection/mmdet/structures/mask/utils.py create mode 100644 mmdetection/mmdet/structures/reid_data_sample.py create mode 100644 mmdetection/mmdet/structures/track_data_sample.py create mode 100644 mmdetection/mmdet/testing/__init__.py create mode 100644 mmdetection/mmdet/testing/_fast_stop_training_hook.py create mode 100644 mmdetection/mmdet/testing/_utils.py create mode 100644 mmdetection/mmdet/utils/__init__.py create mode 100644 mmdetection/mmdet/utils/benchmark.py create mode 100644 mmdetection/mmdet/utils/collect_env.py create mode 100644 mmdetection/mmdet/utils/compat_config.py create mode 100644 mmdetection/mmdet/utils/contextmanagers.py create mode 100644 mmdetection/mmdet/utils/dist_utils.py create mode 100644 mmdetection/mmdet/utils/large_image.py create mode 100644 mmdetection/mmdet/utils/logger.py create mode 100644 mmdetection/mmdet/utils/memory.py create mode 100644 mmdetection/mmdet/utils/misc.py create mode 100644 mmdetection/mmdet/utils/mot_error_visualize.py create mode 100644 mmdetection/mmdet/utils/profiling.py create mode 100644 mmdetection/mmdet/utils/replace_cfg_vals.py create mode 100644 mmdetection/mmdet/utils/setup_env.py create mode 100644 mmdetection/mmdet/utils/split_batch.py create mode 100644 mmdetection/mmdet/utils/typing_utils.py create mode 100644 mmdetection/mmdet/utils/util_mixins.py create mode 100644 mmdetection/mmdet/utils/util_random.py create mode 100644 mmdetection/mmdet/version.py create mode 100644 mmdetection/mmdet/visualization/__init__.py create mode 100644 mmdetection/mmdet/visualization/local_visualizer.py create mode 100644 mmdetection/mmdet/visualization/palette.py create mode 100644 mmdetection/model-index.yml create mode 100644 mmdetection/projects/AlignDETR/README.md create mode 100644 mmdetection/projects/AlignDETR/align_detr/__init__.py create mode 100644 mmdetection/projects/AlignDETR/align_detr/align_detr_head.py create mode 100644 mmdetection/projects/AlignDETR/align_detr/mixed_hungarian_assigner.py create mode 100644 mmdetection/projects/AlignDETR/align_detr/utils.py create mode 100644 mmdetection/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-12e_coco.py create mode 100644 mmdetection/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-24e_coco.py create mode 100644 mmdetection/projects/CO-DETR/README.md create mode 100644 mmdetection/projects/CO-DETR/codetr/__init__.py create mode 100644 mmdetection/projects/CO-DETR/codetr/co_atss_head.py create mode 100644 mmdetection/projects/CO-DETR/codetr/co_dino_head.py create mode 100644 mmdetection/projects/CO-DETR/codetr/co_roi_head.py create mode 100644 mmdetection/projects/CO-DETR/codetr/codetr.py create mode 100644 mmdetection/projects/CO-DETR/codetr/transformer.py create mode 100644 mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py create mode 100644 
mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py create mode 100644 mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py create mode 100644 mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py create mode 100644 mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py create mode 100644 mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py create mode 100644 mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py create mode 100644 mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py create mode 100644 mmdetection/projects/ConvNeXt-V2/README.md create mode 100644 mmdetection/projects/ConvNeXt-V2/configs/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py create mode 100644 mmdetection/projects/Detic/README.md create mode 100644 mmdetection/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py create mode 100644 mmdetection/projects/Detic/demo.py create mode 100644 mmdetection/projects/Detic/detic/__init__.py create mode 100644 mmdetection/projects/Detic/detic/centernet_rpn_head.py create mode 100644 mmdetection/projects/Detic/detic/detic_bbox_head.py create mode 100644 mmdetection/projects/Detic/detic/detic_roi_head.py create mode 100644 mmdetection/projects/Detic/detic/text_encoder.py create mode 100644 mmdetection/projects/Detic/detic/utils.py create mode 100644 mmdetection/projects/Detic/detic/zero_shot_classifier.py create mode 100644 mmdetection/projects/Detic_new/README.md create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py create mode 100644 mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py create mode 100644 mmdetection/projects/Detic_new/detic/__init__.py create mode 100644 mmdetection/projects/Detic_new/detic/centernet_rpn_head.py create mode 100644 mmdetection/projects/Detic_new/detic/detic.py create mode 100644 mmdetection/projects/Detic_new/detic/detic_bbox_head.py create mode 100644 mmdetection/projects/Detic_new/detic/detic_roi_head.py create mode 100644 mmdetection/projects/Detic_new/detic/heatmap_focal_loss.py create mode 100644 mmdetection/projects/Detic_new/detic/imagenet_lvis.py create mode 100644 mmdetection/projects/Detic_new/detic/iou_loss.py create mode 100644 mmdetection/projects/Detic_new/detic/zero_shot_classifier.py create mode 100644 mmdetection/projects/DiffusionDet/README.md create mode 100644 mmdetection/projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py create mode 100644 mmdetection/projects/DiffusionDet/diffusiondet/__init__.py create mode 100644 
mmdetection/projects/DiffusionDet/diffusiondet/diffusiondet.py create mode 100644 mmdetection/projects/DiffusionDet/diffusiondet/head.py create mode 100644 mmdetection/projects/DiffusionDet/diffusiondet/loss.py create mode 100644 mmdetection/projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py create mode 100644 mmdetection/projects/EfficientDet/README.md create mode 100644 mmdetection/projects/EfficientDet/configs/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco.py create mode 100644 mmdetection/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco-90cls.py create mode 100644 mmdetection/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py create mode 100644 mmdetection/projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py create mode 100644 mmdetection/projects/EfficientDet/convert_tf_to_pt.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/__init__.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/bifpn.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/efficientdet.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/efficientdet_head.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/huber_loss.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/tensorflow/anchor_generator.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/__init__.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/coco_api.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/tensorflow/coco_90class.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/tensorflow/coco_90metric.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/tensorflow/trans_max_iou_assigner.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/tensorflow/yxyx_bbox_coder.py create mode 100644 mmdetection/projects/EfficientDet/efficientdet/utils.py create mode 100644 mmdetection/projects/HDINO/README.md create mode 100644 mmdetection/projects/HDINO/__init__.py create mode 100644 mmdetection/projects/HDINO/h-dino-4scale_r50_8xb2-12e_coco.py create mode 100644 mmdetection/projects/HDINO/h_dino.py create mode 100644 mmdetection/projects/HDINO/h_dino_head.py create mode 100644 mmdetection/projects/LabelStudio/backend_template/_wsgi.py create mode 100644 mmdetection/projects/LabelStudio/backend_template/mmdetection.py create mode 100644 mmdetection/projects/LabelStudio/readme.md create mode 100644 mmdetection/projects/RF100-Benchmark/README.md create mode 100644 mmdetection/projects/RF100-Benchmark/README_zh-CN.md create mode 100644 mmdetection/projects/RF100-Benchmark/__init__.py create mode 100644 mmdetection/projects/RF100-Benchmark/coco.py create mode 100644 mmdetection/projects/RF100-Benchmark/coco_metric.py create mode 100644 mmdetection/projects/RF100-Benchmark/configs/dino_r50_fpn_ms_8xb8_tweeter-profile.py create mode 100644 mmdetection/projects/RF100-Benchmark/configs/faster-rcnn_r50_fpn_ms_8xb8_tweeter-profile.py create mode 100644 mmdetection/projects/RF100-Benchmark/configs/tood_r50_fpn_ms_8xb8_tweeter-profile.py create mode 100644 mmdetection/projects/RF100-Benchmark/scripts/create_new_config.py create mode 100644 mmdetection/projects/RF100-Benchmark/scripts/download_dataset.py create mode 100644 mmdetection/projects/RF100-Benchmark/scripts/labels_names.json create mode 100644 
mmdetection/projects/RF100-Benchmark/scripts/log_extract.py create mode 100644 mmdetection/projects/RF100-Benchmark/scripts/parse_dataset_link.py create mode 100644 mmdetection/projects/SparseInst/README.md create mode 100644 mmdetection/projects/SparseInst/configs/sparseinst_r50_iam_8xb8-ms-270k_coco.py create mode 100644 mmdetection/projects/SparseInst/sparseinst/__init__.py create mode 100644 mmdetection/projects/SparseInst/sparseinst/decoder.py create mode 100644 mmdetection/projects/SparseInst/sparseinst/encoder.py create mode 100644 mmdetection/projects/SparseInst/sparseinst/loss.py create mode 100644 mmdetection/projects/SparseInst/sparseinst/sparseinst.py create mode 100644 mmdetection/projects/VISION-Datasets/README.md create mode 100644 mmdetection/projects/VISION-Datasets/README_zh-CN.md create mode 100644 mmdetection/projects/ViTDet/README.md create mode 100644 mmdetection/projects/ViTDet/configs/lsj-100e_coco-instance.py create mode 100644 mmdetection/projects/ViTDet/configs/vitdet_mask-rcnn_vit-b-mae_lsj-100e.py create mode 100644 mmdetection/projects/ViTDet/vitdet/__init__.py create mode 100644 mmdetection/projects/ViTDet/vitdet/fp16_compression_hook.py create mode 100644 mmdetection/projects/ViTDet/vitdet/layer_decay_optimizer_constructor.py create mode 100644 mmdetection/projects/ViTDet/vitdet/simple_fpn.py create mode 100644 mmdetection/projects/ViTDet/vitdet/vit.py create mode 100644 mmdetection/projects/XDecoder/README.md create mode 100644 mmdetection/projects/XDecoder/configs/_base_/xdecoder-tiny_caption.py create mode 100644 mmdetection/projects/XDecoder/configs/_base_/xdecoder-tiny_open-vocab-instance.py create mode 100644 mmdetection/projects/XDecoder/configs/_base_/xdecoder-tiny_open-vocab-panoptic.py create mode 100644 mmdetection/projects/XDecoder/configs/_base_/xdecoder-tiny_open-vocab-semseg.py create mode 100644 mmdetection/projects/XDecoder/configs/_base_/xdecoder-tiny_ref-seg.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_caption_coco2014.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-instance_ade20k.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-instance_coco.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-panoptic_ade20k.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-panoptic_coco.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-ref-seg_refcoco+.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-ref-seg_refcoco.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-ref-seg_refcocog.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-semseg_ade20k.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_open-vocab-semseg_coco.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_ref-caption.py create mode 100644 mmdetection/projects/XDecoder/configs/xdecoder-tiny_zeroshot_text-image-retrieval.py create mode 100644 mmdetection/projects/XDecoder/demo.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/__init__.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/focalnet.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/inference/__init__.py create mode 100644 
mmdetection/projects/XDecoder/xdecoder/inference/image_caption.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/inference/texttoimage_regionretrieval_inferencer.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/language_model.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/pixel_decoder.py create mode 100755 mmdetection/projects/XDecoder/xdecoder/transformer_blocks.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/transformer_decoder.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/unified_head.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/utils.py create mode 100644 mmdetection/projects/XDecoder/xdecoder/xdecoder.py create mode 100644 mmdetection/projects/example_largemodel/README.md create mode 100644 mmdetection/projects/example_largemodel/README_zh-CN.md create mode 100644 mmdetection/projects/example_largemodel/__init__.py create mode 100644 mmdetection/projects/example_largemodel/dino-5scale_swin-l_deepspeed_8xb2-12e_coco.py create mode 100644 mmdetection/projects/example_largemodel/dino-5scale_swin-l_fsdp_8xb2-12e_coco.py create mode 100644 mmdetection/projects/example_largemodel/fsdp_utils.py create mode 100644 mmdetection/projects/example_project/README.md create mode 100644 mmdetection/projects/example_project/configs/faster-rcnn_dummy-resnet_fpn_1x_coco.py create mode 100644 mmdetection/projects/example_project/dummy/__init__.py create mode 100644 mmdetection/projects/example_project/dummy/dummy_resnet.py create mode 100644 mmdetection/projects/gradio_demo/README.md create mode 100644 mmdetection/projects/gradio_demo/launch.py create mode 100644 mmdetection/projects/iSAID/README.md create mode 100644 mmdetection/projects/iSAID/README_zh-CN.md create mode 100644 mmdetection/projects/iSAID/configs/mask_rcnn_r50_fpn_1x_isaid.py create mode 100644 mmdetection/projects/iSAID/isaid_json.py create mode 100644 mmdetection/pytest.ini create mode 100644 mmdetection/setup.cfg create mode 100755 mmdetection/setup.py create mode 100755 mmdetection/tools/analysis_tools/analyze_logs.py create mode 100644 mmdetection/tools/analysis_tools/analyze_results.py create mode 100644 mmdetection/tools/analysis_tools/benchmark.py create mode 100644 mmdetection/tools/analysis_tools/browse_dataset.py create mode 100644 mmdetection/tools/analysis_tools/coco_error_analysis.py create mode 100644 mmdetection/tools/analysis_tools/coco_occluded_separated_recall.py create mode 100644 mmdetection/tools/analysis_tools/confusion_matrix.py create mode 100644 mmdetection/tools/analysis_tools/eval_metric.py create mode 100644 mmdetection/tools/analysis_tools/fuse_results.py create mode 100644 mmdetection/tools/analysis_tools/get_flops.py create mode 100644 mmdetection/tools/analysis_tools/mot/browse_dataset.py create mode 100644 mmdetection/tools/analysis_tools/mot/mot_error_visualize.py create mode 100644 mmdetection/tools/analysis_tools/mot/mot_param_search.py create mode 100644 mmdetection/tools/analysis_tools/optimize_anchors.py create mode 100644 mmdetection/tools/analysis_tools/robustness_eval.py create mode 100644 mmdetection/tools/analysis_tools/test_robustness.py create mode 100644 mmdetection/tools/dataset_converters/ade20k2coco.py create mode 100644 mmdetection/tools/dataset_converters/cityscapes.py create mode 100644 mmdetection/tools/dataset_converters/coco_stuff164k.py create mode 100644 mmdetection/tools/dataset_converters/crowdhuman2coco.py create mode 100644 mmdetection/tools/dataset_converters/images2coco.py create mode 100644 
mmdetection/tools/dataset_converters/mot2coco.py
 create mode 100644 mmdetection/tools/dataset_converters/mot2reid.py
 create mode 100644 mmdetection/tools/dataset_converters/pascal_voc.py
 create mode 100644 mmdetection/tools/dataset_converters/prepare_coco_semantic_annos_from_panoptic_annos.py
 create mode 100644 mmdetection/tools/dataset_converters/youtubevis2coco.py
 create mode 100644 mmdetection/tools/deployment/mmdet2torchserve.py
 create mode 100644 mmdetection/tools/deployment/mmdet_handler.py
 create mode 100644 mmdetection/tools/deployment/test_torchserver.py
 create mode 100644 mmdetection/tools/misc/download_dataset.py
 create mode 100644 mmdetection/tools/misc/gen_coco_panoptic_test_info.py
 create mode 100644 mmdetection/tools/misc/get_crowdhuman_id_hw.py
 create mode 100644 mmdetection/tools/misc/get_image_metas.py
 create mode 100644 mmdetection/tools/misc/print_config.py
 create mode 100644 mmdetection/tools/misc/split_coco.py
 create mode 100644 mmdetection/tools/model_converters/detectron2_to_mmdet.py
 create mode 100644 mmdetection/tools/model_converters/detectron2pytorch.py
 create mode 100644 mmdetection/tools/model_converters/detic_to_mmdet.py
 create mode 100644 mmdetection/tools/model_converters/glip_to_mmdet.py
 create mode 100644 mmdetection/tools/model_converters/groundingdino_to_mmdet.py
 create mode 100644 mmdetection/tools/model_converters/publish_model.py
 create mode 100644 mmdetection/tools/model_converters/regnet2mmdet.py
 create mode 100644 mmdetection/tools/model_converters/selfsup2mmdet.py
 create mode 100644 mmdetection/tools/model_converters/swinv1_to_mmdet.py
 create mode 100644 mmdetection/tools/model_converters/upgrade_model_version.py
 create mode 100644 mmdetection/tools/model_converters/upgrade_ssd_version.py
 create mode 100644 mmdetection/tools/test.py
 create mode 100644 mmdetection/tools/test_tracking.py
 create mode 100644 mmdetection/tools/train.py

diff --git a/mmdetection/.circleci/config.yml b/mmdetection/.circleci/config.yml
new file mode 100644
index 0000000..1a24b82
--- /dev/null
+++ b/mmdetection/.circleci/config.yml
@@ -0,0 +1,34 @@
+version: 2.1
+
+# this allows you to use CircleCI's dynamic configuration feature
+setup: true
+
+# the path-filtering orb is required to continue a pipeline based on
+# the path of an updated fileset
+orbs:
+  path-filtering: circleci/path-filtering@0.1.2
+
+workflows:
+  # the always-run workflow is always triggered, regardless of the pipeline parameters.
+  always-run:
+    jobs:
+      # the path-filtering/filter job determines which pipeline
+      # parameters to update.
+      - path-filtering/filter:
+          name: check-updated-files
+          # 3-column, whitespace-delimited mapping. One mapping per
+          # line:
+          #
+          mapping: |
+            mmdet/.* lint_only false
+            requirements/.* lint_only false
+            tests/.* lint_only false
+            tools/.* lint_only false
+            configs/.* lint_only false
+            .circleci/.* lint_only false
+          base-revision: dev-3.x
+          # this is the path of the configuration we should trigger once
+          # path filtering and pipeline parameter value updates are
+          # complete. In this case, we are using the parent dynamic
+          # configuration itself.
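Aside (not part of the patch): the mapping above is what drives the dynamic pipeline. The circleci/path-filtering orb matches each changed file against the regex in the first column and, on a match, sets the named pipeline parameter to the value in the third column before handing off to the config referenced by config-path just below. A minimal Python sketch of that decision rule, assuming only the mapping shown here and not the orb's actual implementation:

import re

# Mirrors the mapping block above: <path regex> <parameter name> <value>.
MAPPING = [
    (r'mmdet/.*', 'lint_only', False),
    (r'requirements/.*', 'lint_only', False),
    (r'tests/.*', 'lint_only', False),
    (r'tools/.*', 'lint_only', False),
    (r'configs/.*', 'lint_only', False),
    (r'\.circleci/.*', 'lint_only', False),
]

def pipeline_parameters(changed_files):
    # lint_only defaults to true in .circleci/test.yml; any matching path overrides it.
    params = {'lint_only': True}
    for path in changed_files:
        for pattern, name, value in MAPPING:
            if re.fullmatch(pattern, path):
                params[name] = value
    return params

print(pipeline_parameters(['README.md']))                       # {'lint_only': True}
print(pipeline_parameters(['mmdet/models/detectors/dino.py']))  # {'lint_only': False}

With no matching change, lint_only keeps its default of true and test.yml runs only the lint workflow; any match flips it to false and the build jobs run as well.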
+ config-path: .circleci/test.yml diff --git a/mmdetection/.circleci/docker/Dockerfile b/mmdetection/.circleci/docker/Dockerfile new file mode 100644 index 0000000..d9cf8cc --- /dev/null +++ b/mmdetection/.circleci/docker/Dockerfile @@ -0,0 +1,11 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx diff --git a/mmdetection/.circleci/test.yml b/mmdetection/.circleci/test.yml new file mode 100644 index 0000000..e5e14d3 --- /dev/null +++ b/mmdetection/.circleci/test.yml @@ -0,0 +1,210 @@ +version: 2.1 + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + lint_only: + type: boolean + default: true + +jobs: + lint: + docker: + - image: cimg/python:3.7.4 + steps: + - checkout + - run: + name: Install pre-commit hook + command: | + pip install pre-commit + pre-commit install + - run: + name: Linting + command: pre-commit run --all-files + - run: + name: Check docstring coverage + command: | + pip install interrogate + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 85 mmdet + + build_cpu: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + torch: + type: string + torchvision: + type: string + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 + - run: + name: Configure Python & pip + command: | + pip install --upgrade pip + pip install wheel + - run: + name: Install PyTorch + command: | + python -V + python -m pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - when: + condition: + equal: ["3.9.0", << parameters.python >>] + steps: + - run: pip install "protobuf <= 3.20.1" && sudo apt-get update && sudo apt-get -y install libprotobuf-dev protobuf-compiler cmake + - run: pip install dsdl + - run: + name: Install mmdet dependencies + # numpy may be downgraded after building pycocotools, which causes `ImportError: numpy.core.multiarray failed to import` + # force reinstall pycocotools to ensure pycocotools being built under the currenct numpy + command: | + python -m pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main + pip install -U openmim + mim install 'mmcv >= 2.0.0rc4' + pip install -r requirements/tests.txt -r requirements/optional.txt + pip install --force-reinstall pycocotools + pip install albumentations>=0.3.2 --no-binary imgaug,albumentations + pip install -r requirements/tracking.txt + pip install git+https://github.com/cocodataset/panopticapi.git + pip install git+https://github.com/JonathonLuiten/TrackEval.git + - run: + name: Build and install + command: | + pip install -e 
. + - run: + name: Run unittests + command: | + python -m coverage run --branch --source mmdet -m pytest tests/ + python -m coverage xml + python -m coverage report -m + + build_cuda: + parameters: + torch: + type: string + cuda: + type: enum + enum: ["11.1", "11.7", "11.8"] + cudnn: + type: integer + default: 8 + machine: + image: linux-cuda-11:default + # docker_layer_caching: true + resource_class: gpu.nvidia.small.multi + steps: + - checkout + - run: + # CLoning repos in VM since Docker doesn't have access to the private key + name: Clone Repos + command: | + git clone -b main --depth 1 ssh://git@github.com/open-mmlab/mmengine.git /home/circleci/mmengine + - run: + name: Install nvidia-container-toolkit and Restart Docker + command: | + sudo apt-get update + sudo apt-get install -y nvidia-container-toolkit + sudo systemctl restart docker + - run: + name: Build Docker image + command: | + docker build .circleci/docker -t mmdetection:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> + docker run --gpus all -t -d -v /home/circleci/project:/mmdetection -v /home/circleci/mmengine:/mmengine -w /mmdetection --name mmdetection mmdetection:gpu + docker exec mmdetection apt-get install -y git + - run: + name: Install mmdet dependencies + command: | + docker exec mmdetection pip install -e /mmengine + docker exec mmdetection pip install -U openmim + docker exec mmdetection mim install 'mmcv >= 2.0.0rc4' + docker exec mmdetection pip install -r requirements/tests.txt -r requirements/optional.txt + docker exec mmdetection pip install pycocotools + docker exec mmdetection pip install albumentations>=0.3.2 --no-binary imgaug,albumentations + docker exec mmdetection pip install -r requirements/tracking.txt + docker exec mmdetection pip install git+https://github.com/cocodataset/panopticapi.git + docker exec mmdetection pip install git+https://github.com/JonathonLuiten/TrackEval.git + docker exec mmdetection python -c 'import mmcv; print(mmcv.__version__)' + - run: + name: Build and install + command: | + docker exec mmdetection pip install -e . 
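Aside (not part of the patch): the dependency step above already probes the environment with docker exec mmdetection python -c 'import mmcv; print(mmcv.__version__)' before the editable install. A hedged illustration of the same idea, bundled into one hypothetical helper script (not present in the repo) that fails fast if any of the packages these jobs install cannot be imported:

from importlib import metadata

def check_environment() -> None:
    # Import the packages the CI jobs install (mmengine from git, mmcv via mim,
    # and mmdet itself via `pip install -e .`) and report their versions.
    import mmcv
    import mmdet
    import mmengine
    print('mmengine   ', mmengine.__version__)
    print('mmcv       ', mmcv.__version__)
    print('mmdet      ', mmdet.__version__)
    # pycocotools is force-reinstalled so it is built against the numpy that is
    # actually present; confirm it resolves as an installed distribution.
    print('pycocotools', metadata.version('pycocotools'))

if __name__ == '__main__':
    check_environment()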
+ - run: + name: Run unittests + command: | + docker exec mmdetection python -m pytest tests/ + +workflows: + pr_stage_lint: + when: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-3.x + pr_stage_test: + when: + not: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-3.x + - build_cpu: + name: minimum_version_cpu + torch: 1.8.0 + torchvision: 0.9.0 + python: 3.7.16 + requires: + - lint + - build_cpu: + name: maximum_version_cpu + torch: 2.0.0 + torchvision: 0.15.1 + python: 3.9.0 + requires: + - minimum_version_cpu + - hold: + type: approval + requires: + - maximum_version_cpu + - build_cuda: + name: mainstream_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.1" + requires: + - hold + - build_cuda: + name: maximum_version_gpu + torch: 2.0.0 + cuda: "11.7" + cudnn: 8 + requires: + - hold + merge_stage_test: + when: + not: << pipeline.parameters.lint_only >> + jobs: + - build_cuda: + name: minimum_version_gpu + torch: 1.8.0 + cuda: "11.1" + filters: + branches: + only: + - dev-3.x diff --git a/mmdetection/.dev_scripts/batch_test_list.py b/mmdetection/.dev_scripts/batch_test_list.py new file mode 100644 index 0000000..b28d403 --- /dev/null +++ b/mmdetection/.dev_scripts/batch_test_list.py @@ -0,0 +1,545 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# missing wider_face/timm_example/strong_baselines/simple_copy_paste/ +# selfsup_pretrain/seesaw_loss/pascal_voc/openimages/lvis/ld/lad/cityscapes/deepfashion + +# yapf: disable +atss = dict( + config='configs/atss/atss_r50_fpn_1x_coco.py', + checkpoint='atss_r50_fpn_1x_coco_20200209-985f7bd0.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=39.4), +) +autoassign = dict( + config='configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py', + checkpoint='auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.4), +) +carafe = dict( + config='configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py', + checkpoint='faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=38.6), +) +cascade_rcnn = [ + dict( + config='configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py', + checkpoint='cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth', + eval='bbox', + url='https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth', # noqa + metric=dict(bbox_mAP=40.3), + ), + dict( + config='configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py', + checkpoint='cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=41.2, segm_mAP=35.9), + ), +] +cascade_rpn = dict( + 
config='configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py', # noqa + checkpoint='crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco/crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.4), +) +centernet = dict( + config='configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py', + checkpoint='centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=29.5), +) +centripetalnet = dict( + config='configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py', # noqa + checkpoint='centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=44.7), +) +convnext = dict( + config='configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py', # noqa + checkpoint='cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004-3d24f5a4.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004-3d24f5a4.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=51.8, segm_mAP=44.8), +) +cornernet = dict( + config='configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py', + checkpoint='cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=41.2), +) +dcn = dict( + config='configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py', + checkpoint='faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=41.3), +) +dcnv2 = dict( + config='configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py', + checkpoint='faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=38.7), +) +ddod = dict( + config='configs/ddod/ddod_r50_fpn_1x_coco.py', + checkpoint='ddod_r50_fpn_1x_coco_20220523_223737-29b2fc67.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/ddod/ddod_r50_fpn_1x_coco/ddod_r50_fpn_1x_coco_20220523_223737-29b2fc67.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=41.7), +) +deformable_detr = dict( + config='configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py', + checkpoint='deformable_detr_r50_16x2_50e_coco_20210419_220030-a12b9512.pth', # noqa + 
url='https://download.openmmlab.com/mmdetection/v2.0/deformable_detr/deformable_detr_r50_16x2_50e_coco/deformable_detr_r50_16x2_50e_coco_20210419_220030-a12b9512.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=44.5), +) +detectors = dict( + config='configs/detectors/detectors_htc-r50_1x_coco.py', + checkpoint='detectors_htc_r50_1x_coco-329b1453.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco-329b1453.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=49.1, segm_mAP=42.6), +) +detr = dict( + config='configs/detr/detr_r50_8xb2-150e_coco.py', + checkpoint='detr_r50_8x2_150e_coco_20201130_194835-2c4b8974.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/detr/detr_r50_8x2_150e_coco/detr_r50_8x2_150e_coco_20201130_194835-2c4b8974.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.1), +) +double_heads = dict( + config='configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py', + checkpoint='dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.0), +) +dyhead = dict( + config='configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py', + checkpoint='atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=43.3), +) +dynamic_rcnn = dict( + config='configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py', + checkpoint='dynamic_rcnn_r50_fpn_1x-62a3f276.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x-62a3f276.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=38.9), +) +efficientnet = dict( + config='configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py', + checkpoint='retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.5), +) +empirical_attention = dict( + config='configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py', # noqa + checkpoint='faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.0), +) +faster_rcnn = dict( + config='configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py', + checkpoint='faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.4), +) +fcos = dict( + config='configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py', # noqa + checkpoint='fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth', # noqa + 
url='https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=38.7), +) +foveabox = dict( + config='configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py', + checkpoint='fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.9), +) +fpg = dict( + config='configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py', + checkpoint='mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=43.0, segm_mAP=38.1), +) +free_anchor = dict( + config='configs/free_anchor/freeanchor_r50_fpn_1x_coco.py', + checkpoint='retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=38.7), +) +fsaf = dict( + config='configs/fsaf/fsaf_r50_fpn_1x_coco.py', + checkpoint='fsaf_r50_fpn_1x_coco-94ccc51f.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.4), +) +gcnet = dict( + config='configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py', # noqa + checkpoint='mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=40.4, segm_mAP=36.2), +) +gfl = dict( + config='configs/gfl/gfl_r50_fpn_1x_coco.py', + checkpoint='gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.2), +) +ghm = dict( + config='configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py', + checkpoint='retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.0), +) +gn = dict( + config='configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py', + checkpoint='mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=40.1, segm_mAP=36.4), +) +gn_ws = dict( + config='configs/gn+ws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py', + checkpoint='faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth', # noqa + 
eval='bbox', + metric=dict(bbox_mAP=39.7), +) +grid_rcnn = dict( + config='configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py', + checkpoint='grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.4), +) +groie = dict( + config='configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py', + checkpoint='faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=38.3), +) +guided_anchoring = dict( + config='configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py', # noqa + checkpoint='ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=36.9), + ) +hrnet = dict( + config='configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py', + checkpoint='faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=36.9), +) +htc = dict( + config='configs/htc/htc_r50_fpn_1x_coco.py', + checkpoint='htc_r50_fpn_1x_coco_20200317-7332cf16.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317-7332cf16.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=42.3, segm_mAP=37.4), +) +instaboost = dict( + config='configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py', + checkpoint='mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=40.6, segm_mAP=36.6), +) +libra_rcnn = dict( + config='configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py', + checkpoint='libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=38.3), +) +mask2former = dict( + config='configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py', + checkpoint='mask2former_r50_lsj_8x2_50e_coco-panoptic_20220326_224516-11a44721.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco-panoptic/mask2former_r50_lsj_8x2_50e_coco-panoptic_20220326_224516-11a44721.pth', # noqa + eval=['bbox', 'segm', 'PQ'], + metric=dict(PQ=51.9, bbox_mAP=44.8, segm_mAP=41.9), +) +mask_rcnn = dict( + config='configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py', + checkpoint='mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=38.2, segm_mAP=34.7), +) +maskformer = dict( + 
config='configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py', + checkpoint='maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_r50_mstrain_16x1_75e_coco/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth', # noqa + eval='PQ', + metric=dict(PQ=46.9), +) +ms_rcnn = dict( + config='configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py', + checkpoint='ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=38.2, segm_mAP=36.0), +) +nas_fcos = dict( + config='configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py', # noqa + checkpoint='nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=39.4), +) +nas_fpn = dict( + config='configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py', + checkpoint='retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.5), +) +paa = dict( + config='configs/paa/paa_r50_fpn_1x_coco.py', + checkpoint='paa_r50_fpn_1x_coco_20200821-936edec3.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.4), +) +pafpn = dict( + config='configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py', + checkpoint='faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.5), +) +panoptic_fpn = dict( + config='configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py', + checkpoint='panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth', # noqa + eval='PQ', + metric=dict(PQ=40.2), +) +pisa = dict( + config='configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py', + checkpoint='pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=38.4), +) +point_rend = dict( + config='configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py', + checkpoint='point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=38.4, segm_mAP=36.3), +) +pvt = dict( + config='configs/pvt/retinanet_pvt-s_fpn_1x_coco.py', + checkpoint='retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth', + 
url='https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=40.4), +) +queryinst = dict( + config='configs/queryinst/queryinst_r50_fpn_1x_coco.py', + checkpoint='queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=42.0, segm_mAP=37.5), +) +regnet = dict( + config='configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py', + checkpoint='mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=40.4, segm_mAP=36.7), +) +reppoints = dict( + config='configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py', + checkpoint='reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.0), +) +res2net = dict( + config='configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py', + checkpoint='faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=43.0), +) +resnest = dict( + config='configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py', # noqa + checkpoint='faster_rcnn_s50_fpn_syncbn-backbone+head_mstrain-range_1x_coco_20200926_125502-20289c16.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20200926_125502-20289c16.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=42.0), +) +resnet_strikes_back = dict( + config='configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py', # noqa + checkpoint='mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=41.2, segm_mAP=38.2), +) +retinanet = dict( + config='configs/retinanet/retinanet_r50_fpn_1x_coco.py', + checkpoint='retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=36.5), +) +rpn = dict( + config='configs/rpn/rpn_r50_fpn_1x_coco.py', + checkpoint='rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth', # noqa + eval='proposal_fast', + metric=dict(AR_1000=58.2), +) +sabl = [ + dict( + config='configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py', + checkpoint='sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth', + 
url='https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.7), + ), + dict( + config='configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py', + checkpoint='sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=39.9), + ), +] +scnet = dict( + config='configs/scnet/scnet_r50_fpn_1x_coco.py', + checkpoint='scnet_r50_fpn_1x_coco-c3f09857.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco-c3f09857.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=43.5), +) +scratch = dict( + config='configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py', + checkpoint='scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=41.2, segm_mAP=37.4), +) +solo = dict( + config='configs/solo/decoupled-solo_r50_fpn_1x_coco.py', + checkpoint='decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth', # noqa + eval='segm', + metric=dict(segm_mAP=33.9), +) +solov2 = dict( + config='configs/solov2/solov2_r50_fpn_1x_coco.py', + checkpoint='solov2_r50_fpn_1x_coco_20220512_125858-a357fa23.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_1x_coco/solov2_r50_fpn_1x_coco_20220512_125858-a357fa23.pth', # noqa + eval='segm', + metric=dict(segm_mAP=34.8), +) +sparse_rcnn = dict( + config='configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py', + checkpoint='sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.9), +) +ssd = [ + dict( + config='configs/ssd/ssd300_coco.py', + checkpoint='ssd300_coco_20210803_015428-d231a06e.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428-d231a06e.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=25.5), + ), + dict( + config='configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py', + checkpoint='ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=21.3), + ), +] +swin = dict( + config='configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py', + checkpoint='mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=42.7, segm_mAP=39.3), +) +tood = dict( + config='configs/tood/tood_r50_fpn_1x_coco.py', + 
checkpoint='tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=42.4), +) +tridentnet = dict( + config='configs/tridentnet/tridentnet_r50-caffe_1x_coco.py', + checkpoint='tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.6), +) +vfnet = dict( + config='configs/vfnet/vfnet_r50_fpn_1x_coco.py', + checkpoint='vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=41.6), +) +yolact = dict( + config='configs/yolact/yolact_r50_1xb8-55e_coco.py', + checkpoint='yolact_r50_1x8_coco_20200908-f38d58df.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_1x8_coco/yolact_r50_1x8_coco_20200908-f38d58df.pth', # noqa + eval=['bbox', 'segm'], + metric=dict(bbox_mAP=31.2, segm_mAP=29.0), +) +yolo = dict( + config='configs/yolo/yolov3_d53_8xb8-320-273e_coco.py', + checkpoint='yolov3_d53_320_273e_coco-421362b6.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-421362b6.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=27.9), +) +yolof = dict( + config='configs/yolof/yolof_r50-c5_8xb8-1x_coco.py', + checkpoint='yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth', + url='https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=37.5), +) +yolox = dict( + config='configs/yolox/yolox_tiny_8xb8-300e_coco.py', + checkpoint='yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth', # noqa + url='https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth', # noqa + eval='bbox', + metric=dict(bbox_mAP=31.8), +) +# yapf: enable diff --git a/mmdetection/.dev_scripts/benchmark_filter.py b/mmdetection/.dev_scripts/benchmark_filter.py new file mode 100644 index 0000000..178cd9c --- /dev/null +++ b/mmdetection/.dev_scripts/benchmark_filter.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + + +def parse_args(): + parser = argparse.ArgumentParser(description='Filter configs to train') + parser.add_argument( + '--basic-arch', + action='store_true', + help='to train models in basic arch') + parser.add_argument( + '--datasets', action='store_true', help='to train models in dataset') + parser.add_argument( + '--data-pipeline', + action='store_true', + help='to train models related to data pipeline, e.g. 
augmentations') + parser.add_argument( + '--nn-module', + action='store_true', + help='to train models related to neural network modules') + parser.add_argument( + '--model-options', + nargs='+', + help='custom options to special model benchmark') + parser.add_argument( + '--out', + type=str, + default='batch_train_list.txt', + help='output path of gathered metrics to be stored') + args = parser.parse_args() + return args + + +basic_arch_root = [ + 'atss', 'autoassign', 'cascade_rcnn', 'cascade_rpn', 'centripetalnet', + 'cornernet', 'detectors', 'deformable_detr', 'detr', 'double_heads', + 'dynamic_rcnn', 'faster_rcnn', 'fcos', 'foveabox', 'fp16', 'free_anchor', + 'fsaf', 'gfl', 'ghm', 'grid_rcnn', 'guided_anchoring', 'htc', 'ld', + 'libra_rcnn', 'mask_rcnn', 'ms_rcnn', 'nas_fcos', 'paa', 'pisa', + 'point_rend', 'reppoints', 'retinanet', 'rpn', 'sabl', 'ssd', 'tridentnet', + 'vfnet', 'yolact', 'yolo', 'sparse_rcnn', 'scnet', 'yolof', 'centernet' +] + +datasets_root = [ + 'wider_face', 'pascal_voc', 'cityscapes', 'lvis', 'deepfashion' +] + +data_pipeline_root = ['albu_example', 'instaboost'] + +nn_module_root = [ + 'carafe', 'dcn', 'empirical_attention', 'gcnet', 'gn', 'gn+ws', 'hrnet', + 'pafpn', 'nas_fpn', 'regnet', 'resnest', 'res2net', 'groie' +] + +benchmark_pool = [ + 'configs/albu_example/mask_rcnn_r50_fpn_albu_1x_coco.py', + 'configs/atss/atss_r50_fpn_1x_coco.py', + 'configs/autoassign/autoassign_r50_fpn_8x2_1x_coco.py', + 'configs/carafe/mask_rcnn_r50_fpn_carafe_1x_coco.py', + 'configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py', + 'configs/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco.py', + 'configs/centernet/centernet_resnet18_dcnv2_140e_coco.py', + 'configs/centripetalnet/' + 'centripetalnet_hourglass104_mstest_16x6_210e_coco.py', + 'configs/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes.py', + 'configs/cornernet/' + 'cornernet_hourglass104_mstest_8x6_210e_coco.py', + 'configs/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py', + 'configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py', + 'configs/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco.py', + 'configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py', + 'configs/deformable_detr/deformable_detr_r50_16x2_50e_coco.py', + 'configs/detectors/detectors_htc_r50_1x_coco.py', + 'configs/detr/detr_r50_8x2_150e_coco.py', + 'configs/double_heads/dh_faster_rcnn_r50_fpn_1x_coco.py', + 'configs/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x_coco.py', + 'configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py', # noqa + 'configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py', + 'configs/faster_rcnn/faster_rcnn_r50_fpn_ohem_1x_coco.py', + 'configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco.py', + 'configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco.py', + 'configs/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco.py', + 'configs/fcos/fcos_center_r50_caffe_fpn_gn-head_4x4_1x_coco.py', + 'configs/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco.py', + 'configs/retinanet/retinanet_r50_fpn_fp16_1x_coco.py', + 'configs/mask_rcnn/mask_rcnn_r50_fpn_fp16_1x_coco.py', + 'configs/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco.py', + 'configs/fsaf/fsaf_r50_fpn_1x_coco.py', + 'configs/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco.py', + 'configs/gfl/gfl_r50_fpn_1x_coco.py', + 'configs/ghm/retinanet_ghm_r50_fpn_1x_coco.py', + 'configs/gn/mask_rcnn_r50_fpn_gn-all_2x_coco.py', + 'configs/gn+ws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco.py', + 'configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py', + 
'configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py', + 'configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco.py', + 'configs/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco.py', + 'configs/htc/htc_r50_fpn_1x_coco.py', + 'configs/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco.py', + 'configs/ld/ld_r18_gflv1_r101_fpn_coco_1x.py', + 'configs/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco.py', + 'configs/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1.py', + 'configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py', + 'configs/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco.py', + 'configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py', + 'configs/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco.py', + 'configs/paa/paa_r50_fpn_1x_coco.py', + 'configs/pafpn/faster_rcnn_r50_pafpn_1x_coco.py', + 'configs/pisa/pisa_mask_rcnn_r50_fpn_1x_coco.py', + 'configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py', + 'configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco.py', + 'configs/reppoints/reppoints_moment_r50_fpn_gn-neck+head_1x_coco.py', + 'configs/res2net/faster_rcnn_r2_101_fpn_2x_coco.py', + 'configs/resnest/' + 'mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py', + 'configs/retinanet/retinanet_r50_caffe_fpn_1x_coco.py', + 'configs/rpn/rpn_r50_fpn_1x_coco.py', + 'configs/sabl/sabl_retinanet_r50_fpn_1x_coco.py', + 'configs/ssd/ssd300_coco.py', + 'configs/tridentnet/tridentnet_r50_caffe_1x_coco.py', + 'configs/vfnet/vfnet_r50_fpn_1x_coco.py', + 'configs/yolact/yolact_r50_1x8_coco.py', + 'configs/yolo/yolov3_d53_320_273e_coco.py', + 'configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py', + 'configs/scnet/scnet_r50_fpn_1x_coco.py', + 'configs/yolof/yolof_r50_c5_8x8_1x_coco.py', +] + + +def main(): + args = parse_args() + + benchmark_type = [] + if args.basic_arch: + benchmark_type += basic_arch_root + if args.datasets: + benchmark_type += datasets_root + if args.data_pipeline: + benchmark_type += data_pipeline_root + if args.nn_module: + benchmark_type += nn_module_root + + special_model = args.model_options + if special_model is not None: + benchmark_type += special_model + + config_dpath = 'configs/' + benchmark_configs = [] + for cfg_root in benchmark_type: + cfg_dir = osp.join(config_dpath, cfg_root) + configs = os.scandir(cfg_dir) + for cfg in configs: + config_path = osp.join(cfg_dir, cfg.name) + if (config_path in benchmark_pool + and config_path not in benchmark_configs): + benchmark_configs.append(config_path) + + print(f'Totally found {len(benchmark_configs)} configs to benchmark') + with open(args.out, 'w') as f: + for config in benchmark_configs: + f.write(config + '\n') + + +if __name__ == '__main__': + main() diff --git a/mmdetection/.dev_scripts/benchmark_inference_fps.py b/mmdetection/.dev_scripts/benchmark_inference_fps.py new file mode 100644 index 0000000..6099ed1 --- /dev/null +++ b/mmdetection/.dev_scripts/benchmark_inference_fps.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
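+# Benchmark inference speed: for every model listed in the given benchmark
+# config, load its checkpoint, time it with repeat_measure_inference_speed()
+# from tools/analysis_tools/benchmark.py, print a GitHub-flavored markdown
+# table of FPS / ms-per-image, and optionally dump the numbers to
+# batch_inference_fps.json. Must be launched in distributed mode.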
+import argparse +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.dist import init_dist +from mmengine.fileio import dump +from mmengine.utils import mkdir_or_exist +from terminaltables import GithubFlavoredMarkdownTable + +from tools.analysis_tools.benchmark import repeat_measure_inference_speed + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet benchmark a model of FPS') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint_root', help='Checkpoint file root path') + parser.add_argument( + '--round-num', + type=int, + default=1, + help='round a number to a given precision in decimal digits') + parser.add_argument( + '--repeat-num', + type=int, + default=1, + help='number of repeat times of measurement for averaging the results') + parser.add_argument( + '--out', type=str, help='output path of gathered fps to be stored') + parser.add_argument( + '--max-iter', type=int, default=2000, help='num of max iter') + parser.add_argument( + '--log-interval', type=int, default=50, help='interval of logging') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def results2markdown(result_dict): + table_data = [] + is_multiple_results = False + for cfg_name, value in result_dict.items(): + name = cfg_name.replace('configs/', '') + fps = value['fps'] + ms_times_pre_image = value['ms_times_pre_image'] + if isinstance(fps, list): + is_multiple_results = True + mean_fps = value['mean_fps'] + mean_times_pre_image = value['mean_times_pre_image'] + fps_str = ','.join([str(s) for s in fps]) + ms_times_pre_image_str = ','.join( + [str(s) for s in ms_times_pre_image]) + table_data.append([ + name, fps_str, mean_fps, ms_times_pre_image_str, + mean_times_pre_image + ]) + else: + table_data.append([name, fps, ms_times_pre_image]) + + if is_multiple_results: + table_data.insert(0, [ + 'model', 'fps', 'mean_fps', 'times_pre_image(ms)', + 'mean_times_pre_image(ms)' + ]) + + else: + table_data.insert(0, ['model', 'fps', 'times_pre_image(ms)']) + table = GithubFlavoredMarkdownTable(table_data) + print(table.table, flush=True) + + +if __name__ == '__main__': + args = parse_args() + assert args.round_num >= 0 + assert args.repeat_num >= 1 + + config = Config.fromfile(args.config) + + if args.launcher == 'none': + raise NotImplementedError('Only supports distributed mode') + else: + init_dist(args.launcher) + + result_dict = {} + for model_key in config: + model_infos = config[model_key] + if not isinstance(model_infos, list): + model_infos = [model_infos] + for model_info in model_infos: + record_metrics = model_info['metric'] + cfg_path = 
model_info['config'].strip() + cfg = Config.fromfile(cfg_path) + checkpoint = osp.join(args.checkpoint_root, + model_info['checkpoint'].strip()) + try: + fps = repeat_measure_inference_speed(cfg, checkpoint, + args.max_iter, + args.log_interval, + args.fuse_conv_bn, + args.repeat_num) + if args.repeat_num > 1: + fps_list = [round(fps_, args.round_num) for fps_ in fps] + times_pre_image_list = [ + round(1000 / fps_, args.round_num) for fps_ in fps + ] + mean_fps = round( + sum(fps_list) / len(fps_list), args.round_num) + mean_times_pre_image = round( + sum(times_pre_image_list) / len(times_pre_image_list), + args.round_num) + print( + f'{cfg_path} ' + f'Overall fps: {fps_list}[{mean_fps}] img / s, ' + f'times per image: ' + f'{times_pre_image_list}[{mean_times_pre_image}] ' + f'ms / img', + flush=True) + result_dict[cfg_path] = dict( + fps=fps_list, + mean_fps=mean_fps, + ms_times_pre_image=times_pre_image_list, + mean_times_pre_image=mean_times_pre_image) + else: + print( + f'{cfg_path} fps : {fps:.{args.round_num}f} img / s, ' + f'times per image: {1000 / fps:.{args.round_num}f} ' + f'ms / img', + flush=True) + result_dict[cfg_path] = dict( + fps=round(fps, args.round_num), + ms_times_pre_image=round(1000 / fps, args.round_num)) + except Exception as e: + print(f'{cfg_path} error: {repr(e)}') + if args.repeat_num > 1: + result_dict[cfg_path] = dict( + fps=[0], + mean_fps=0, + ms_times_pre_image=[0], + mean_times_pre_image=0) + else: + result_dict[cfg_path] = dict(fps=0, ms_times_pre_image=0) + + if args.out: + mkdir_or_exist(args.out) + dump(result_dict, osp.join(args.out, 'batch_inference_fps.json')) + + results2markdown(result_dict) diff --git a/mmdetection/.dev_scripts/benchmark_options.py b/mmdetection/.dev_scripts/benchmark_options.py new file mode 100644 index 0000000..cdb1f87 --- /dev/null +++ b/mmdetection/.dev_scripts/benchmark_options.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +third_part_libs = [ + 'pip install -r ../requirements/albu.txt', + 'pip install instaboostfast', + 'pip install git+https://github.com/cocodataset/panopticapi.git', + 'pip install timm', + 'pip install mmpretrain', + 'pip install git+https://github.com/lvis-dataset/lvis-api.git', + 'pip install -r ../requirements/multimodal.txt', + 'pip install -r ../requirements/tracking.txt', + 'pip install git+https://github.com/JonathonLuiten/TrackEval.git', +] + +default_floating_range = 0.5 +model_floating_ranges = {'atss/atss_r50_fpn_1x_coco.py': 0.3} diff --git a/mmdetection/.dev_scripts/benchmark_test.py b/mmdetection/.dev_scripts/benchmark_test.py new file mode 100644 index 0000000..dddfca1 --- /dev/null +++ b/mmdetection/.dev_scripts/benchmark_test.py @@ -0,0 +1,115 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
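+# Batch-test entry point: for every model listed in the benchmark config,
+# build an MMEngine Runner with the matching checkpoint from checkpoint_root
+# and call runner.test(); any failure is logged to benchmark_test.log so the
+# sweep continues with the next model.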
+import logging +import os +import os.path as osp +from argparse import ArgumentParser + +from mmengine.config import Config, DictAction +from mmengine.logging import MMLogger +from mmengine.registry import RUNNERS +from mmengine.runner import Runner + +from mmdet.testing import replace_to_ceph +from mmdet.utils import register_all_modules, replace_cfg_vals + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint_root', help='Checkpoint file root path') + parser.add_argument('--work-dir', help='the dir to save logs') + parser.add_argument('--ceph', action='store_true') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + args = parser.parse_args() + return args + + +# TODO: Need to refactor test.py so that it can be reused. +def fast_test_model(config_name, checkpoint, args, logger=None): + cfg = Config.fromfile(config_name) + cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = osp.join(args.work_dir, + osp.splitext(osp.basename(config_name))[0]) + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(config_name))[0]) + + if args.ceph: + replace_to_ceph(cfg) + + cfg.load_from = checkpoint + + # TODO: temporary plan + if 'visualizer' in cfg: + if 'name' in cfg.visualizer: + del cfg.visualizer.name + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + runner.test() + + +# Sample test whether the inference code is correct +def main(args): + # register all modules in mmdet into the registries + register_all_modules(init_default_scope=False) + + config = Config.fromfile(args.config) + + # test all model + logger = MMLogger.get_instance( + name='MMLogger', + log_file='benchmark_test.log', + log_level=logging.ERROR) + + for model_key in config: + model_infos = config[model_key] + if not isinstance(model_infos, list): + model_infos = [model_infos] + for model_info in model_infos: + print('processing: ', model_info['config'], flush=True) + config_name = model_info['config'].strip() + checkpoint = osp.join(args.checkpoint_root, + model_info['checkpoint'].strip()) + try: + fast_test_model(config_name, checkpoint, args, logger) + except Exception as e: + logger.error(f'{config_name} " : {repr(e)}') + + +if __name__ == '__main__': 
+ args = parse_args() + main(args) diff --git a/mmdetection/.dev_scripts/benchmark_test_image.py b/mmdetection/.dev_scripts/benchmark_test_image.py new file mode 100644 index 0000000..62fa57e --- /dev/null +++ b/mmdetection/.dev_scripts/benchmark_test_image.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +import os.path as osp +from argparse import ArgumentParser + +import mmcv +from mmengine.config import Config +from mmengine.logging import MMLogger +from mmengine.utils import mkdir_or_exist + +from mmdet.apis import inference_detector, init_detector +from mmdet.registry import VISUALIZERS +from mmdet.utils import register_all_modules + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint_root', help='Checkpoint file root path') + parser.add_argument('--img', default='demo/demo.jpg', help='Image file') + parser.add_argument('--aug', action='store_true', help='aug test') + parser.add_argument('--model-name', help='model name to inference') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument('--out-dir', default=None, help='Dir to output file') + parser.add_argument( + '--wait-time', + type=float, + default=1, + help='the interval of show (s), 0 is block') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--palette', + default='coco', + choices=['coco', 'voc', 'citys', 'random'], + help='Color palette used for visualization') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='bbox score threshold') + args = parser.parse_args() + return args + + +def inference_model(config_name, checkpoint, visualizer, args, logger=None): + cfg = Config.fromfile(config_name) + if args.aug: + raise NotImplementedError() + + model = init_detector( + cfg, checkpoint, palette=args.palette, device=args.device) + visualizer.dataset_meta = model.dataset_meta + + # test a single image + result = inference_detector(model, args.img) + + # show the results + if args.show or args.out_dir is not None: + img = mmcv.imread(args.img) + img = mmcv.imconvert(img, 'bgr', 'rgb') + out_file = None + if args.out_dir is not None: + out_dir = args.out_dir + mkdir_or_exist(out_dir) + + out_file = osp.join( + out_dir, + config_name.split('/')[-1].replace('py', 'jpg')) + + visualizer.add_datasample( + 'result', + img, + data_sample=result, + draw_gt=False, + show=args.show, + wait_time=args.wait_time, + out_file=out_file, + pred_score_thr=args.score_thr) + + return result + + +# Sample test whether the inference code is correct +def main(args): + # register all modules in mmdet into the registries + register_all_modules() + + config = Config.fromfile(args.config) + + # init visualizer + visualizer_cfg = dict(type='DetLocalVisualizer', name='visualizer') + visualizer = VISUALIZERS.build(visualizer_cfg) + + # test single model + if args.model_name: + if args.model_name in config: + model_infos = config[args.model_name] + if not isinstance(model_infos, list): + model_infos = [model_infos] + model_info = model_infos[0] + config_name = model_info['config'].strip() + print(f'processing: {config_name}', flush=True) + checkpoint = osp.join(args.checkpoint_root, + model_info['checkpoint'].strip()) + # build the model from a config file and a checkpoint file + inference_model(config_name, checkpoint, visualizer, args) + return + else: + raise RuntimeError('model name input error.') + + # 
test all model + logger = MMLogger.get_instance( + name='MMLogger', + log_file='benchmark_test_image.log', + log_level=logging.ERROR) + + for model_key in config: + model_infos = config[model_key] + if not isinstance(model_infos, list): + model_infos = [model_infos] + for model_info in model_infos: + print('processing: ', model_info['config'], flush=True) + config_name = model_info['config'].strip() + checkpoint = osp.join(args.checkpoint_root, + model_info['checkpoint'].strip()) + try: + # build the model from a config file and a checkpoint file + inference_model(config_name, checkpoint, visualizer, args, + logger) + except Exception as e: + logger.error(f'{config_name} " : {repr(e)}') + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/mmdetection/.dev_scripts/benchmark_train.py b/mmdetection/.dev_scripts/benchmark_train.py new file mode 100644 index 0000000..cd1e70c --- /dev/null +++ b/mmdetection/.dev_scripts/benchmark_train.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +import os +import os.path as osp +from argparse import ArgumentParser + +from mmengine.config import Config, DictAction +from mmengine.logging import MMLogger, print_log +from mmengine.registry import RUNNERS +from mmengine.runner import Runner + +from mmdet.testing import replace_to_ceph +from mmdet.utils import register_all_modules, replace_cfg_vals + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument('config', help='test config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument('--ceph', action='store_true') + parser.add_argument('--save-ckpt', action='store_true') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='enable automatic-mixed-precision training') + parser.add_argument( + '--auto-scale-lr', + action='store_true', + help='enable automatically scaling LR.') + parser.add_argument( + '--resume', + action='store_true', + help='resume from the latest checkpoint in the work_dir automatically') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + args = parser.parse_args() + return args + + +# TODO: Need to refactor train.py so that it can be reused. 
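+# Train each benchmark config for only a short run: a FastStopTrainingHook is
+# appended to custom_hooks so training stops after a couple of epochs or
+# iterations, with optional AMP, auto-scale-LR, Ceph storage and checkpoint
+# saving controlled by the CLI flags.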
+def fast_train_model(config_name, args, logger=None): + cfg = Config.fromfile(config_name) + cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = osp.join(args.work_dir, + osp.splitext(osp.basename(config_name))[0]) + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(config_name))[0]) + + ckpt_hook = cfg.default_hooks.checkpoint + by_epoch = ckpt_hook.get('by_epoch', True) + fast_stop_hook = dict(type='FastStopTrainingHook') + fast_stop_hook['by_epoch'] = by_epoch + if args.save_ckpt: + if by_epoch: + interval = 1 + stop_iter_or_epoch = 2 + else: + interval = 4 + stop_iter_or_epoch = 10 + fast_stop_hook['stop_iter_or_epoch'] = stop_iter_or_epoch + fast_stop_hook['save_ckpt'] = True + ckpt_hook.interval = interval + + if 'custom_hooks' in cfg: + cfg.custom_hooks.append(fast_stop_hook) + else: + custom_hooks = [fast_stop_hook] + cfg.custom_hooks = custom_hooks + + # TODO: temporary plan + if 'visualizer' in cfg: + if 'name' in cfg.visualizer: + del cfg.visualizer.name + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.type + if optim_wrapper == 'AmpOptimWrapper': + print_log( + 'AMP training is already enabled in your config.', + logger='current', + level=logging.WARNING) + else: + assert optim_wrapper == 'OptimWrapper', ( + '`--amp` is only supported when the optimizer wrapper type is ' + f'`OptimWrapper` but got {optim_wrapper}.') + cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.loss_scale = 'dynamic' + + # enable automatically scaling LR + if args.auto_scale_lr: + if 'auto_scale_lr' in cfg and \ + 'enable' in cfg.auto_scale_lr and \ + 'base_batch_size' in cfg.auto_scale_lr: + cfg.auto_scale_lr.enable = True + else: + raise RuntimeError('Can not find "auto_scale_lr" or ' + '"auto_scale_lr.enable" or ' + '"auto_scale_lr.base_batch_size" in your' + ' configuration file.') + + if args.ceph: + replace_to_ceph(cfg) + + cfg.resume = args.resume + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + runner.train() + + +# Sample test whether the train code is correct +def main(args): + # register all modules in mmdet into the registries + register_all_modules(init_default_scope=False) + + config = Config.fromfile(args.config) + + # test all model + logger = MMLogger.get_instance( + name='MMLogger', + log_file='benchmark_train.log', + log_level=logging.ERROR) + + for model_key in config: + model_infos = config[model_key] + if not isinstance(model_infos, list): + model_infos = [model_infos] + for model_info in model_infos: + print('processing: ', model_info['config'], flush=True) + config_name = model_info['config'].strip() + try: + fast_train_model(config_name, args, logger) + except RuntimeError as e: + # quick exit is the normal exit message + if 'quick exit' not in repr(e): + logger.error(f'{config_name} " : {repr(e)}') + except Exception as e: + logger.error(f'{config_name} " : {repr(e)}') + + +if __name__ == '__main__': + args 
= parse_args() + main(args) diff --git a/mmdetection/.dev_scripts/benchmark_valid_flops.py b/mmdetection/.dev_scripts/benchmark_valid_flops.py new file mode 100644 index 0000000..7dc81f6 --- /dev/null +++ b/mmdetection/.dev_scripts/benchmark_valid_flops.py @@ -0,0 +1,295 @@ +import logging +import re +import tempfile +from argparse import ArgumentParser +from collections import OrderedDict +from functools import partial +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +from mmengine import Config, DictAction +from mmengine.analysis import get_model_complexity_info +from mmengine.analysis.print_helper import _format_size +from mmengine.fileio import FileClient +from mmengine.logging import MMLogger +from mmengine.model import revert_sync_batchnorm +from mmengine.runner import Runner +from modelindex.load_model_index import load +from rich.console import Console +from rich.table import Table +from rich.text import Text +from tqdm import tqdm + +from mmdet.registry import MODELS +from mmdet.utils import register_all_modules + +console = Console() +MMDET_ROOT = Path(__file__).absolute().parents[1] + + +def parse_args(): + parser = ArgumentParser(description='Valid all models in model-index.yml') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[1280, 800], + help='input image size') + parser.add_argument( + '--checkpoint_root', + help='Checkpoint file root path. If set, load checkpoint before test.') + parser.add_argument('--img', default='demo/demo.jpg', help='Image file') + parser.add_argument('--models', nargs='+', help='models name to inference') + parser.add_argument( + '--batch-size', + type=int, + default=1, + help='The batch size during the inference.') + parser.add_argument( + '--flops', action='store_true', help='Get Flops and Params of models') + parser.add_argument( + '--flops-str', + action='store_true', + help='Output FLOPs and params counts in a string form.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--size_divisor', + type=int, + default=32, + help='Pad the input image, the minimum size that is divisible ' + 'by size_divisor, -1 means do not pad the image.') + args = parser.parse_args() + return args + + +def inference(config_file, checkpoint, work_dir, args, exp_name): + logger = MMLogger.get_instance(name='MMLogger') + logger.warning('if you want test flops, please make sure torch>=1.12') + cfg = Config.fromfile(config_file) + cfg.work_dir = work_dir + cfg.load_from = checkpoint + cfg.log_level = 'WARN' + cfg.experiment_name = exp_name + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # forward the model + result = {'model': config_file.stem} + + if args.flops: + + if len(args.shape) == 1: + h = w = args.shape[0] + elif len(args.shape) == 2: + h, w = args.shape + else: + raise ValueError('invalid input shape') + divisor = args.size_divisor + if divisor > 0: + h = int(np.ceil(h / divisor)) * divisor + w = int(np.ceil(w / divisor)) * divisor + + input_shape = (3, h, w) + result['resolution'] = input_shape + + try: + cfg = Config.fromfile(config_file) + if hasattr(cfg, 'head_norm_cfg'): + cfg['head_norm_cfg'] = dict(type='SyncBN', requires_grad=True) + cfg['model']['roi_head']['bbox_head']['norm_cfg'] = dict( + type='SyncBN', requires_grad=True) + cfg['model']['roi_head']['mask_head']['norm_cfg'] = dict( + type='SyncBN', requires_grad=True) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + model = MODELS.build(cfg.model) + input = torch.rand(1, *input_shape) + if torch.cuda.is_available(): + model.cuda() + input = input.cuda() + model = revert_sync_batchnorm(model) + inputs = (input, ) + model.eval() + outputs = get_model_complexity_info( + model, input_shape, inputs, show_table=False, show_arch=False) + flops = outputs['flops'] + params = outputs['params'] + activations = outputs['activations'] + result['Get Types'] = 'direct' + except: # noqa 772 + logger = MMLogger.get_instance(name='MMLogger') + logger.warning( + 'Direct get flops failed, try to get flops with data') + cfg = Config.fromfile(config_file) + if hasattr(cfg, 'head_norm_cfg'): + cfg['head_norm_cfg'] = dict(type='SyncBN', requires_grad=True) + cfg['model']['roi_head']['bbox_head']['norm_cfg'] = dict( + type='SyncBN', requires_grad=True) + cfg['model']['roi_head']['mask_head']['norm_cfg'] = dict( + type='SyncBN', requires_grad=True) + data_loader = Runner.build_dataloader(cfg.val_dataloader) + data_batch = next(iter(data_loader)) + model = MODELS.build(cfg.model) + if torch.cuda.is_available(): + model = model.cuda() + model = revert_sync_batchnorm(model) + model.eval() + _forward = model.forward + data = model.data_preprocessor(data_batch) + del data_loader + model.forward = partial( + _forward, data_samples=data['data_samples']) + outputs = get_model_complexity_info( + model, + input_shape, + data['inputs'], + show_table=False, + show_arch=False) + flops = outputs['flops'] + params = outputs['params'] + activations = outputs['activations'] + result['Get Types'] = 'dataloader' + + if args.flops_str: + flops = _format_size(flops) + params = _format_size(params) + activations = _format_size(activations) + + result['flops'] = flops + result['params'] = params + + return result + + +def show_summary(summary_data, args): + table = Table(title='Validation Benchmark Regression Summary') + table.add_column('Model') + 
table.add_column('Validation') + table.add_column('Resolution (c, h, w)') + if args.flops: + table.add_column('Flops', justify='right', width=11) + table.add_column('Params', justify='right') + + for model_name, summary in summary_data.items(): + row = [model_name] + valid = summary['valid'] + color = 'green' if valid == 'PASS' else 'red' + row.append(f'[{color}]{valid}[/{color}]') + if valid == 'PASS': + row.append(str(summary['resolution'])) + if args.flops: + row.append(str(summary['flops'])) + row.append(str(summary['params'])) + table.add_row(*row) + + console.print(table) + table_data = { + x.header: [Text.from_markup(y).plain for y in x.cells] + for x in table.columns + } + table_pd = pd.DataFrame(table_data) + table_pd.to_csv('./mmdetection_flops.csv') + + +# Sample test whether the inference code is correct +def main(args): + register_all_modules() + model_index_file = MMDET_ROOT / 'model-index.yml' + model_index = load(str(model_index_file)) + model_index.build_models_with_collections() + models = OrderedDict({model.name: model for model in model_index.models}) + + logger = MMLogger( + 'validation', + logger_name='validation', + log_file='benchmark_test_image.log', + log_level=logging.INFO) + + if args.models: + patterns = [ + re.compile(pattern.replace('+', '_')) for pattern in args.models + ] + filter_models = {} + for k, v in models.items(): + k = k.replace('+', '_') + if any([re.match(pattern, k) for pattern in patterns]): + filter_models[k] = v + if len(filter_models) == 0: + print('No model found, please specify models in:') + print('\n'.join(models.keys())) + return + models = filter_models + + summary_data = {} + tmpdir = tempfile.TemporaryDirectory() + for model_name, model_info in tqdm(models.items()): + + if model_info.config is None: + continue + + model_info.config = model_info.config.replace('%2B', '+') + config = Path(model_info.config) + + try: + config.exists() + except: # noqa 722 + logger.error(f'{model_name}: {config} not found.') + continue + + logger.info(f'Processing: {model_name}') + + http_prefix = 'https://download.openmmlab.com/mmdetection/' + if args.checkpoint_root is not None: + root = args.checkpoint_root + if 's3://' in args.checkpoint_root: + from petrel_client.common.exception import AccessDeniedError + file_client = FileClient.infer_client(uri=root) + checkpoint = file_client.join_path( + root, model_info.weights[len(http_prefix):]) + try: + exists = file_client.exists(checkpoint) + except AccessDeniedError: + exists = False + else: + checkpoint = Path(root) / model_info.weights[len(http_prefix):] + exists = checkpoint.exists() + if exists: + checkpoint = str(checkpoint) + else: + print(f'WARNING: {model_name}: {checkpoint} not found.') + checkpoint = None + else: + checkpoint = None + + try: + # build the model from a config file and a checkpoint file + result = inference(MMDET_ROOT / config, checkpoint, tmpdir.name, + args, model_name) + result['valid'] = 'PASS' + except Exception: # noqa 722 + import traceback + logger.error(f'"{config}" :\n{traceback.format_exc()}') + result = {'valid': 'FAIL'} + + summary_data[model_name] = result + + tmpdir.cleanup() + show_summary(summary_data, args) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/mmdetection/.dev_scripts/check_links.py b/mmdetection/.dev_scripts/check_links.py new file mode 100755 index 0000000..ccf4fad --- /dev/null +++ b/mmdetection/.dev_scripts/check_links.py @@ -0,0 +1,157 @@ +# Modified from: +# 
https://github.com/allenai/allennlp/blob/main/scripts/check_links.py + +import argparse +import logging +import os +import pathlib +import re +import sys +from multiprocessing.dummy import Pool +from typing import NamedTuple, Optional, Tuple + +import requests +from mmengine.logging import MMLogger + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Goes through all the inline-links ' + 'in markdown files and reports the breakages') + parser.add_argument( + '--num-threads', + type=int, + default=100, + help='Number of processes to confirm the link') + parser.add_argument('--https-proxy', type=str, help='https proxy') + parser.add_argument( + '--out', + type=str, + default='link_reports.txt', + help='output path of reports') + args = parser.parse_args() + return args + + +OK_STATUS_CODES = ( + 200, + 401, # the resource exists but may require some sort of login. + 403, # ^ same + 405, # HEAD method not allowed. + # the resource exists, but our default 'Accept-' header may not + # match what the server can provide. + 406, +) + + +class MatchTuple(NamedTuple): + source: str + name: str + link: str + + +def check_link( + match_tuple: MatchTuple, + http_session: requests.Session, + logger: logging = None) -> Tuple[MatchTuple, bool, Optional[str]]: + reason: Optional[str] = None + if match_tuple.link.startswith('http'): + result_ok, reason = check_url(match_tuple, http_session) + else: + result_ok = check_path(match_tuple) + if logger is None: + print(f" {'✓' if result_ok else '✗'} {match_tuple.link}") + else: + logger.info(f" {'✓' if result_ok else '✗'} {match_tuple.link}") + return match_tuple, result_ok, reason + + +def check_url(match_tuple: MatchTuple, + http_session: requests.Session) -> Tuple[bool, str]: + """Check if a URL is reachable.""" + try: + result = http_session.head( + match_tuple.link, timeout=5, allow_redirects=True) + return ( + result.ok or result.status_code in OK_STATUS_CODES, + f'status code = {result.status_code}', + ) + except (requests.ConnectionError, requests.Timeout): + return False, 'connection error' + + +def check_path(match_tuple: MatchTuple) -> bool: + """Check if a file in this repository exists.""" + relative_path = match_tuple.link.split('#')[0] + full_path = os.path.join( + os.path.dirname(str(match_tuple.source)), relative_path) + return os.path.exists(full_path) + + +def main(): + args = parse_args() + + # setup logger + logger = MMLogger.get_instance(name='mmdet', log_file=args.out) + + # setup https_proxy + if args.https_proxy: + os.environ['https_proxy'] = args.https_proxy + + # setup http_session + http_session = requests.Session() + for resource_prefix in ('http://', 'https://'): + http_session.mount( + resource_prefix, + requests.adapters.HTTPAdapter( + max_retries=5, + pool_connections=20, + pool_maxsize=args.num_threads), + ) + + logger.info('Finding all markdown files in the current directory...') + + project_root = (pathlib.Path(__file__).parent / '..').resolve() + markdown_files = project_root.glob('**/*.md') + + all_matches = set() + url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)') + for markdown_file in markdown_files: + with open(markdown_file) as handle: + for line in handle.readlines(): + matches = url_regex.findall(line) + for name, link in matches: + if 'localhost' not in link: + all_matches.add( + MatchTuple( + source=str(markdown_file), + name=name, + link=link)) + + logger.info(f' {len(all_matches)} markdown files found') + logger.info('Checking to make sure we can retrieve each link...') + + with 
Pool(processes=args.num_threads) as pool: + results = pool.starmap(check_link, [(match, http_session, logger) + for match in list(all_matches)]) + + # collect unreachable results + unreachable_results = [(match_tuple, reason) + for match_tuple, success, reason in results + if not success] + + if unreachable_results: + logger.info('================================================') + logger.info(f'Unreachable links ({len(unreachable_results)}):') + for match_tuple, reason in unreachable_results: + logger.info(' > Source: ' + match_tuple.source) + logger.info(' Name: ' + match_tuple.name) + logger.info(' Link: ' + match_tuple.link) + if reason is not None: + logger.info(' Reason: ' + reason) + sys.exit(1) + logger.info('No Unreachable link found.') + + +if __name__ == '__main__': + main() diff --git a/mmdetection/.dev_scripts/convert_test_benchmark_script.py b/mmdetection/.dev_scripts/convert_test_benchmark_script.py new file mode 100644 index 0000000..6d7ce8a --- /dev/null +++ b/mmdetection/.dev_scripts/convert_test_benchmark_script.py @@ -0,0 +1,114 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + +from mmengine import Config + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert benchmark model list to script') + parser.add_argument('config', help='test config file path') + parser.add_argument('--port', type=int, default=29666, help='dist port') + parser.add_argument( + '--run', action='store_true', help='run script directly') + parser.add_argument( + '--out', type=str, help='path to save model benchmark script') + + args = parser.parse_args() + return args + + +def process_model_info(model_info, work_dir): + config = model_info['config'].strip() + fname, _ = osp.splitext(osp.basename(config)) + job_name = fname + work_dir = '$WORK_DIR/' + fname + checkpoint = model_info['checkpoint'].strip() + return dict( + config=config, + job_name=job_name, + work_dir=work_dir, + checkpoint=checkpoint) + + +def create_test_bash_info(commands, model_test_dict, port, script_name, + partition): + config = model_test_dict['config'] + job_name = model_test_dict['job_name'] + checkpoint = model_test_dict['checkpoint'] + work_dir = model_test_dict['work_dir'] + + echo_info = f' \necho \'{config}\' &' + commands.append(echo_info) + commands.append('\n') + + command_info = f'GPUS=8 GPUS_PER_NODE=8 ' \ + f'CPUS_PER_TASK=$CPUS_PRE_TASK {script_name} ' + + command_info += f'{partition} ' + command_info += f'{job_name} ' + command_info += f'{config} ' + command_info += f'$CHECKPOINT_DIR/{checkpoint} ' + command_info += f'--work-dir {work_dir} ' + + command_info += f'--cfg-option env_cfg.dist_cfg.port={port} ' + command_info += ' &' + + commands.append(command_info) + + +def main(): + args = parse_args() + if args.out: + out_suffix = args.out.split('.')[-1] + assert args.out.endswith('.sh'), \ + f'Expected out file path suffix is .sh, but get .{out_suffix}' + assert args.out or args.run, \ + ('Please specify at least one operation (save/run/ the ' + 'script) with the argument "--out" or "--run"') + + commands = [] + partition_name = 'PARTITION=$1 ' + commands.append(partition_name) + commands.append('\n') + + checkpoint_root = 'CHECKPOINT_DIR=$2 ' + commands.append(checkpoint_root) + commands.append('\n') + + work_dir = 'WORK_DIR=$3 ' + commands.append(work_dir) + commands.append('\n') + + cpus_pre_task = 'CPUS_PER_TASK=${4:-2} ' + commands.append(cpus_pre_task) + commands.append('\n') + + script_name = osp.join('tools', 
'slurm_test.sh') + port = args.port + + cfg = Config.fromfile(args.config) + + for model_key in cfg: + model_infos = cfg[model_key] + if not isinstance(model_infos, list): + model_infos = [model_infos] + for model_info in model_infos: + print('processing: ', model_info['config']) + model_test_dict = process_model_info(model_info, work_dir) + create_test_bash_info(commands, model_test_dict, port, script_name, + '$PARTITION') + port += 1 + + command_str = ''.join(commands) + if args.out: + with open(args.out, 'w') as f: + f.write(command_str) + if args.run: + os.system(command_str) + + +if __name__ == '__main__': + main() diff --git a/mmdetection/.dev_scripts/convert_train_benchmark_script.py b/mmdetection/.dev_scripts/convert_train_benchmark_script.py new file mode 100644 index 0000000..278a76c --- /dev/null +++ b/mmdetection/.dev_scripts/convert_train_benchmark_script.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert benchmark model json to script') + parser.add_argument( + 'txt_path', type=str, help='txt path output by benchmark_filter') + parser.add_argument( + '--run', action='store_true', help='run script directly') + parser.add_argument( + '--out', type=str, help='path to save model benchmark script') + + args = parser.parse_args() + return args + + +def determine_gpus(cfg_name): + gpus = 8 + gpus_pre_node = 8 + + if cfg_name.find('16x') >= 0: + gpus = 16 + elif cfg_name.find('4xb4') >= 0: + gpus = 4 + gpus_pre_node = 4 + elif 'lad' in cfg_name: + gpus = 2 + gpus_pre_node = 2 + + return gpus, gpus_pre_node + + +def main(): + args = parse_args() + if args.out: + out_suffix = args.out.split('.')[-1] + assert args.out.endswith('.sh'), \ + f'Expected out file path suffix is .sh, but get .{out_suffix}' + assert args.out or args.run, \ + ('Please specify at least one operation (save/run/ the ' + 'script) with the argument "--out" or "--run"') + + root_name = './tools' + train_script_name = osp.join(root_name, 'slurm_train.sh') + + commands = [] + partition_name = 'PARTITION=$1 ' + commands.append(partition_name) + commands.append('\n') + + work_dir = 'WORK_DIR=$2 ' + commands.append(work_dir) + commands.append('\n') + + cpus_pre_task = 'CPUS_PER_TASK=${3:-4} ' + commands.append(cpus_pre_task) + commands.append('\n') + commands.append('\n') + + with open(args.txt_path, 'r') as f: + model_cfgs = f.readlines() + for i, cfg in enumerate(model_cfgs): + cfg = cfg.strip() + if len(cfg) == 0: + continue + # print cfg name + echo_info = f'echo \'{cfg}\' &' + commands.append(echo_info) + commands.append('\n') + + fname, _ = osp.splitext(osp.basename(cfg)) + out_fname = '$WORK_DIR/' + fname + + gpus, gpus_pre_node = determine_gpus(cfg) + command_info = f'GPUS={gpus} GPUS_PER_NODE={gpus_pre_node} ' \ + f'CPUS_PER_TASK=$CPUS_PRE_TASK {train_script_name} ' + command_info += '$PARTITION ' + command_info += f'{fname} ' + command_info += f'{cfg} ' + command_info += f'{out_fname} ' + + command_info += '--cfg-options default_hooks.checkpoint.' 
\ + 'max_keep_ckpts=1 ' + command_info += '&' + + commands.append(command_info) + + if i < len(model_cfgs): + commands.append('\n') + + command_str = ''.join(commands) + if args.out: + with open(args.out, 'w') as f: + f.write(command_str) + if args.run: + os.system(command_str) + + +if __name__ == '__main__': + main() diff --git a/mmdetection/.dev_scripts/covignore.cfg b/mmdetection/.dev_scripts/covignore.cfg new file mode 100644 index 0000000..a3de535 --- /dev/null +++ b/mmdetection/.dev_scripts/covignore.cfg @@ -0,0 +1,5 @@ +# Each line should be the relative path to the root directory +# of this repo. Support regular expression as well. +# For example: + +.*/__init__.py diff --git a/mmdetection/.dev_scripts/download_checkpoints.py b/mmdetection/.dev_scripts/download_checkpoints.py new file mode 100644 index 0000000..fa5ef9d --- /dev/null +++ b/mmdetection/.dev_scripts/download_checkpoints.py @@ -0,0 +1,83 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import argparse +import math +import os +import os.path as osp +from multiprocessing import Pool + +import torch +from mmengine.config import Config +from mmengine.utils import mkdir_or_exist + + +def download(url, out_file, min_bytes=math.pow(1024, 2), progress=True): + # math.pow(1024, 2) is mean 1 MB + assert_msg = f"Downloaded url '{url}' does not exist " \ + f'or size is < min_bytes={min_bytes}' + try: + print(f'Downloading {url} to {out_file}...') + torch.hub.download_url_to_file(url, str(out_file), progress=progress) + assert osp.exists( + out_file) and osp.getsize(out_file) > min_bytes, assert_msg + except Exception as e: + if osp.exists(out_file): + os.remove(out_file) + print(f'ERROR: {e}\nRe-attempting {url} to {out_file} ...') + os.system(f"curl -L '{url}' -o '{out_file}' --retry 3 -C -" + ) # curl download, retry and resume on fail + finally: + if osp.exists(out_file) and osp.getsize(out_file) < min_bytes: + os.remove(out_file) # remove partial downloads + + if not osp.exists(out_file): + print(f'ERROR: {assert_msg}\n') + print('=========================================\n') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Download checkpoints') + parser.add_argument('config', help='test config file path') + parser.add_argument( + 'out', type=str, help='output dir of checkpoints to be stored') + parser.add_argument( + '--nproc', type=int, default=16, help='num of Processes') + parser.add_argument( + '--intranet', + action='store_true', + help='switch to internal network url') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + mkdir_or_exist(args.out) + + cfg = Config.fromfile(args.config) + + checkpoint_url_list = [] + checkpoint_out_list = [] + + for model in cfg: + model_infos = cfg[model] + if not isinstance(model_infos, list): + model_infos = [model_infos] + for model_info in model_infos: + checkpoint = model_info['checkpoint'] + out_file = osp.join(args.out, checkpoint) + if not osp.exists(out_file): + + url = model_info['url'] + if args.intranet is True: + url = url.replace('.com', '.sensetime.com') + url = url.replace('https', 'http') + + checkpoint_url_list.append(url) + checkpoint_out_list.append(out_file) + + if len(checkpoint_url_list) > 0: + pool = Pool(min(os.cpu_count(), args.nproc)) + pool.starmap(download, zip(checkpoint_url_list, checkpoint_out_list)) + else: + print('No files to download!') diff --git a/mmdetection/.dev_scripts/gather_models.py b/mmdetection/.dev_scripts/gather_models.py new file mode 100644 index 0000000..52acdc3 
--- /dev/null +++ b/mmdetection/.dev_scripts/gather_models.py @@ -0,0 +1,308 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import shutil +import subprocess +import time +from collections import OrderedDict + +import torch +import yaml +from mmengine.config import Config +from mmengine.fileio import dump +from mmengine.utils import digit_version, mkdir_or_exist, scandir + + +def ordered_yaml_dump(data, stream=None, Dumper=yaml.SafeDumper, **kwds): + + class OrderedDumper(Dumper): + pass + + def _dict_representer(dumper, data): + return dumper.represent_mapping( + yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, data.items()) + + OrderedDumper.add_representer(OrderedDict, _dict_representer) + return yaml.dump(data, stream, OrderedDumper, **kwds) + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + if 'ema_state_dict' in checkpoint: + del checkpoint['ema_state_dict'] + + # remove ema state_dict + for key in list(checkpoint['state_dict']): + if key.startswith('ema_'): + checkpoint['state_dict'].pop(key) + elif key.startswith('data_preprocessor'): + checkpoint['state_dict'].pop(key) + + # if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code here. + if digit_version(torch.__version__) >= digit_version('1.6'): + torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False) + else: + torch.save(checkpoint, out_file) + sha = subprocess.check_output(['sha256sum', out_file]).decode() + final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8]) + subprocess.Popen(['mv', out_file, final_file]) + return final_file + + +def is_by_epoch(config): + cfg = Config.fromfile('./configs/' + config) + return cfg.train_cfg.type == 'EpochBasedTrainLoop' + + +def get_final_epoch_or_iter(config): + cfg = Config.fromfile('./configs/' + config) + if cfg.train_cfg.type == 'EpochBasedTrainLoop': + return cfg.train_cfg.max_epochs + else: + return cfg.train_cfg.max_iters + + +def get_best_epoch_or_iter(exp_dir): + best_epoch_iter_full_path = list( + sorted(glob.glob(osp.join(exp_dir, 'best_*.pth'))))[-1] + best_epoch_or_iter_model_path = best_epoch_iter_full_path.split('/')[-1] + best_epoch_or_iter = best_epoch_or_iter_model_path.\ + split('_')[-1].split('.')[0] + return best_epoch_or_iter_model_path, int(best_epoch_or_iter) + + +def get_real_epoch_or_iter(config): + cfg = Config.fromfile('./configs/' + config) + if cfg.train_cfg.type == 'EpochBasedTrainLoop': + epoch = cfg.train_cfg.max_epochs + return epoch + else: + return cfg.train_cfg.max_iters + + +def get_final_results(log_json_path, + epoch_or_iter, + results_lut='coco/bbox_mAP', + by_epoch=True): + result_dict = dict() + with open(log_json_path) as f: + r = f.readlines()[-1] + last_metric = r.split(',')[0].split(': ')[-1].strip() + result_dict[results_lut] = last_metric + return result_dict + + +def get_dataset_name(config): + # If there are more dataset, add here. 
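+    # name_map maps the dataset_type declared in each config to the display
+    # name written into the generated metafiles.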
+ name_map = dict( + CityscapesDataset='Cityscapes', + CocoDataset='COCO', + CocoPanopticDataset='COCO', + DeepFashionDataset='Deep Fashion', + LVISV05Dataset='LVIS v0.5', + LVISV1Dataset='LVIS v1', + VOCDataset='Pascal VOC', + WIDERFaceDataset='WIDER Face', + OpenImagesDataset='OpenImagesDataset', + OpenImagesChallengeDataset='OpenImagesChallengeDataset', + Objects365V1Dataset='Objects365 v1', + Objects365V2Dataset='Objects365 v2') + cfg = Config.fromfile('./configs/' + config) + return name_map[cfg.dataset_type] + + +def find_last_dir(model_dir): + dst_times = [] + for time_stamp in os.scandir(model_dir): + if osp.isdir(time_stamp): + dst_time = time.mktime( + time.strptime(time_stamp.name, '%Y%m%d_%H%M%S')) + dst_times.append([dst_time, time_stamp.name]) + return max(dst_times, key=lambda x: x[0])[1] + + +def convert_model_info_to_pwc(model_infos): + pwc_files = {} + for model in model_infos: + cfg_folder_name = osp.split(model['config'])[-2] + pwc_model_info = OrderedDict() + pwc_model_info['Name'] = osp.split(model['config'])[-1].split('.')[0] + pwc_model_info['In Collection'] = 'Please fill in Collection name' + pwc_model_info['Config'] = osp.join('configs', model['config']) + + # get metadata + meta_data = OrderedDict() + if 'epochs' in model: + meta_data['Epochs'] = get_real_epoch_or_iter(model['config']) + else: + meta_data['Iterations'] = get_real_epoch_or_iter(model['config']) + pwc_model_info['Metadata'] = meta_data + + # get dataset name + dataset_name = get_dataset_name(model['config']) + + # get results + results = [] + # if there are more metrics, add here. + if 'bbox_mAP' in model['results']: + metric = round(model['results']['bbox_mAP'] * 100, 1) + results.append( + OrderedDict( + Task='Object Detection', + Dataset=dataset_name, + Metrics={'box AP': metric})) + if 'segm_mAP' in model['results']: + metric = round(model['results']['segm_mAP'] * 100, 1) + results.append( + OrderedDict( + Task='Instance Segmentation', + Dataset=dataset_name, + Metrics={'mask AP': metric})) + if 'PQ' in model['results']: + metric = round(model['results']['PQ'], 1) + results.append( + OrderedDict( + Task='Panoptic Segmentation', + Dataset=dataset_name, + Metrics={'PQ': metric})) + pwc_model_info['Results'] = results + + link_string = 'https://download.openmmlab.com/mmdetection/v3.0/' + link_string += '{}/{}'.format(model['config'].rstrip('.py'), + osp.split(model['model_path'])[-1]) + pwc_model_info['Weights'] = link_string + if cfg_folder_name in pwc_files: + pwc_files[cfg_folder_name].append(pwc_model_info) + else: + pwc_files[cfg_folder_name] = [pwc_model_info] + return pwc_files + + +def parse_args(): + parser = argparse.ArgumentParser(description='Gather benchmarked models') + parser.add_argument( + 'root', + type=str, + default='work_dirs', + help='root path of benchmarked models to be gathered') + parser.add_argument( + '--out', + type=str, + default='gather', + help='output path of gathered models to be stored') + parser.add_argument( + '--best', + action='store_true', + help='whether to gather the best model.') + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + models_root = args.root + models_out = args.out + mkdir_or_exist(models_out) + + # find all models in the root directory to be gathered + raw_configs = list(scandir('./configs', '.py', recursive=True)) + + # filter configs that is not trained in the experiments dir + used_configs = [] + for raw_config in raw_configs: + if osp.exists(osp.join(models_root, raw_config)): + 
used_configs.append(raw_config) + print(f'Find {len(used_configs)} models to be gathered') + + # find final_ckpt and log file for trained each config + # and parse the best performance + model_infos = [] + for used_config in used_configs: + exp_dir = osp.join(models_root, used_config) + by_epoch = is_by_epoch(used_config) + # check whether the exps is finished + if args.best is True: + final_model, final_epoch_or_iter = get_best_epoch_or_iter(exp_dir) + else: + final_epoch_or_iter = get_final_epoch_or_iter(used_config) + final_model = '{}_{}.pth'.format('epoch' if by_epoch else 'iter', + final_epoch_or_iter) + + model_path = osp.join(exp_dir, final_model) + # skip if the model is still training + if not osp.exists(model_path): + continue + + # get the latest logs + latest_exp_name = find_last_dir(exp_dir) + latest_exp_json = osp.join(exp_dir, latest_exp_name, 'vis_data', + latest_exp_name + '.json') + + model_performance = get_final_results( + latest_exp_json, final_epoch_or_iter, by_epoch=by_epoch) + + if model_performance is None: + continue + + model_info = dict( + config=used_config, + results=model_performance, + final_model=final_model, + latest_exp_json=latest_exp_json, + latest_exp_name=latest_exp_name) + model_info['epochs' if by_epoch else 'iterations'] =\ + final_epoch_or_iter + model_infos.append(model_info) + + # publish model for each checkpoint + publish_model_infos = [] + for model in model_infos: + model_publish_dir = osp.join(models_out, model['config'].rstrip('.py')) + mkdir_or_exist(model_publish_dir) + + model_name = osp.split(model['config'])[-1].split('.')[0] + + model_name += '_' + model['latest_exp_name'] + publish_model_path = osp.join(model_publish_dir, model_name) + trained_model_path = osp.join(models_root, model['config'], + model['final_model']) + + # convert model + final_model_path = process_checkpoint(trained_model_path, + publish_model_path) + + # copy log + shutil.copy(model['latest_exp_json'], + osp.join(model_publish_dir, f'{model_name}.log.json')) + + # copy config to guarantee reproducibility + config_path = model['config'] + config_path = osp.join( + 'configs', + config_path) if 'configs' not in config_path else config_path + target_config_path = osp.split(config_path)[-1] + shutil.copy(config_path, osp.join(model_publish_dir, + target_config_path)) + + model['model_path'] = final_model_path + publish_model_infos.append(model) + + models = dict(models=publish_model_infos) + print(f'Totally gathered {len(publish_model_infos)} models') + dump(models, osp.join(models_out, 'model_info.json')) + + pwc_files = convert_model_info_to_pwc(publish_model_infos) + for name in pwc_files: + with open(osp.join(models_out, name + '_metafile.yml'), 'w') as f: + ordered_yaml_dump(pwc_files[name], f, encoding='utf-8') + + +if __name__ == '__main__': + main() diff --git a/mmdetection/.dev_scripts/gather_test_benchmark_metric.py b/mmdetection/.dev_scripts/gather_test_benchmark_metric.py new file mode 100644 index 0000000..951bfe6 --- /dev/null +++ b/mmdetection/.dev_scripts/gather_test_benchmark_metric.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
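+# Example invocation (file names below are hypothetical):
+#   python .dev_scripts/gather_test_benchmark_metric.py \
+#       .dev_scripts/benchmark_test_cfg.py work_dirs --out metrics --show-all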
+import argparse +import glob +import os.path as osp + +from mmengine.config import Config +from mmengine.fileio import dump, load +from mmengine.utils import mkdir_or_exist + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Gather benchmarked models metric') + parser.add_argument('config', help='test config file path') + parser.add_argument( + 'root', + type=str, + help='root path of benchmarked models to be gathered') + parser.add_argument( + '--out', type=str, help='output path of gathered metrics to be stored') + parser.add_argument( + '--not-show', action='store_true', help='not show metrics') + parser.add_argument( + '--show-all', action='store_true', help='show all model metrics') + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + root_path = args.root + metrics_out = args.out + result_dict = {} + + cfg = Config.fromfile(args.config) + + for model_key in cfg: + model_infos = cfg[model_key] + if not isinstance(model_infos, list): + model_infos = [model_infos] + for model_info in model_infos: + record_metrics = model_info['metric'] + config = model_info['config'].strip() + fname, _ = osp.splitext(osp.basename(config)) + metric_json_dir = osp.join(root_path, fname) + if osp.exists(metric_json_dir): + json_list = glob.glob(osp.join(metric_json_dir, '*.json')) + if len(json_list) > 0: + log_json_path = list(sorted(json_list))[-1] + + metric = load(log_json_path) + if config in metric.get('config', {}): + + new_metrics = dict() + for record_metric_key in record_metrics: + record_metric_key_bk = record_metric_key + old_metric = record_metrics[record_metric_key] + if record_metric_key == 'AR_1000': + record_metric_key = 'AR@1000' + if record_metric_key not in metric['metric']: + raise KeyError( + 'record_metric_key not exist, please ' + 'check your config') + new_metric = round( + metric['metric'][record_metric_key] * 100, 1) + new_metrics[record_metric_key_bk] = new_metric + + if args.show_all: + result_dict[config] = dict( + before=record_metrics, after=new_metrics) + else: + for record_metric_key in record_metrics: + old_metric = record_metrics[record_metric_key] + new_metric = new_metrics[record_metric_key] + if old_metric != new_metric: + result_dict[config] = dict( + before=record_metrics, + after=new_metrics) + break + else: + print(f'{config} not included in: {log_json_path}') + else: + print(f'{config} not exist file: {metric_json_dir}') + else: + print(f'{config} not exist dir: {metric_json_dir}') + + if metrics_out: + mkdir_or_exist(metrics_out) + dump(result_dict, osp.join(metrics_out, 'batch_test_metric_info.json')) + if not args.not_show: + print('===================================') + for config_name, metrics in result_dict.items(): + print(config_name, metrics) + print('===================================') diff --git a/mmdetection/.dev_scripts/gather_train_benchmark_metric.py b/mmdetection/.dev_scripts/gather_train_benchmark_metric.py new file mode 100644 index 0000000..3d4c9cf --- /dev/null +++ b/mmdetection/.dev_scripts/gather_train_benchmark_metric.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
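+# Example invocation (file names below are hypothetical):
+#   python .dev_scripts/gather_train_benchmark_metric.py \
+#       work_dirs benchmark_train_list.txt --out metrics \
+#       --excel benchmark.xls --ncol 5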
+import argparse +import glob +import os.path as osp + +from gather_models import get_final_results +from mmengine.config import Config +from mmengine.fileio import dump +from mmengine.utils import mkdir_or_exist + +try: + import xlrd +except ImportError: + xlrd = None +try: + import xlutils + from xlutils.copy import copy +except ImportError: + xlutils = None + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Gather benchmarked models metric') + parser.add_argument( + 'root', + type=str, + help='root path of benchmarked models to be gathered') + parser.add_argument( + 'txt_path', type=str, help='txt path output by benchmark_filter') + parser.add_argument( + '--out', type=str, help='output path of gathered metrics to be stored') + parser.add_argument( + '--not-show', action='store_true', help='not show metrics') + parser.add_argument( + '--excel', type=str, help='input path of excel to be recorded') + parser.add_argument( + '--ncol', type=int, help='Number of column to be modified or appended') + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + if args.excel: + assert args.ncol, 'Please specify "--excel" and "--ncol" ' \ + 'at the same time' + if xlrd is None: + raise RuntimeError( + 'xlrd is not installed,' + 'Please use “pip install xlrd==1.2.0” to install') + if xlutils is None: + raise RuntimeError( + 'xlutils is not installed,' + 'Please use “pip install xlutils==2.0.0” to install') + readbook = xlrd.open_workbook(args.excel) + sheet = readbook.sheet_by_name('Sheet1') + sheet_info = {} + total_nrows = sheet.nrows + for i in range(3, sheet.nrows): + sheet_info[sheet.row_values(i)[0]] = i + xlrw = copy(readbook) + table = xlrw.get_sheet(0) + + root_path = args.root + metrics_out = args.out + + result_dict = {} + with open(args.txt_path, 'r') as f: + model_cfgs = f.readlines() + for i, config in enumerate(model_cfgs): + config = config.strip() + if len(config) == 0: + continue + + config_name = osp.split(config)[-1] + config_name = osp.splitext(config_name)[0] + result_path = osp.join(root_path, config_name) + if osp.exists(result_path): + # 1 read config + cfg = Config.fromfile(config) + total_epochs = cfg.runner.max_epochs + final_results = cfg.evaluation.metric + if not isinstance(final_results, list): + final_results = [final_results] + final_results_out = [] + for key in final_results: + if 'proposal_fast' in key: + final_results_out.append('AR@1000') # RPN + elif 'mAP' not in key: + final_results_out.append(key + '_mAP') + + # 2 determine whether total_epochs ckpt exists + ckpt_path = f'epoch_{total_epochs}.pth' + if osp.exists(osp.join(result_path, ckpt_path)): + log_json_path = list( + sorted(glob.glob(osp.join(result_path, + '*.log.json'))))[-1] + + # 3 read metric + model_performance = get_final_results( + log_json_path, total_epochs, final_results_out) + if model_performance is None: + print(f'log file error: {log_json_path}') + continue + for performance in model_performance: + if performance in ['AR@1000', 'bbox_mAP', 'segm_mAP']: + metric = round( + model_performance[performance] * 100, 1) + model_performance[performance] = metric + result_dict[config] = model_performance + + # update and append excel content + if args.excel: + if 'AR@1000' in model_performance: + metrics = f'{model_performance["AR@1000"]}' \ + f'(AR@1000)' + elif 'segm_mAP' in model_performance: + metrics = f'{model_performance["bbox_mAP"]}/' \ + f'{model_performance["segm_mAP"]}' + else: + metrics = 
f'{model_performance["bbox_mAP"]}' + + row_num = sheet_info.get(config, None) + if row_num: + table.write(row_num, args.ncol, metrics) + else: + table.write(total_nrows, 0, config) + table.write(total_nrows, args.ncol, metrics) + total_nrows += 1 + + else: + print(f'{config} not exist: {ckpt_path}') + else: + print(f'not exist: {config}') + + # 4 save or print results + if metrics_out: + mkdir_or_exist(metrics_out) + dump(result_dict, osp.join(metrics_out, 'model_metric_info.json')) + if not args.not_show: + print('===================================') + for config_name, metrics in result_dict.items(): + print(config_name, metrics) + print('===================================') + if args.excel: + filename, sufflx = osp.splitext(args.excel) + xlrw.save(f'{filename}_o{sufflx}') + print(f'>>> Output {filename}_o{sufflx}') diff --git a/mmdetection/.dev_scripts/test_init_backbone.py b/mmdetection/.dev_scripts/test_init_backbone.py new file mode 100644 index 0000000..d38d180 --- /dev/null +++ b/mmdetection/.dev_scripts/test_init_backbone.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Check out backbone whether successfully load pretrained checkpoint.""" +import copy +import os +from os.path import dirname, exists, join + +import pytest +from mmengine.config import Config +from mmengine.runner import CheckpointLoader +from mmengine.utils import ProgressBar + +from mmdet.registry import MODELS + + +def _get_config_directory(): + """Find the predefined detector config directory.""" + try: + # Assume we are running in the source mmdetection repo + repo_dpath = dirname(dirname(__file__)) + except NameError: + # For IPython development when this __file__ is not defined + import mmdet + repo_dpath = dirname(dirname(mmdet.__file__)) + config_dpath = join(repo_dpath, 'configs') + if not exists(config_dpath): + raise Exception('Cannot find config path') + return config_dpath + + +def _get_config_module(fname): + """Load a configuration as a python module.""" + config_dpath = _get_config_directory() + config_fpath = join(config_dpath, fname) + config_mod = Config.fromfile(config_fpath) + return config_mod + + +def _get_detector_cfg(fname): + """Grab configs necessary to create a detector. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + return model + + +def _traversed_config_file(): + """We traversed all potential config files under the `config` file. If you + need to print details or debug code, you can use this function. + + If the `backbone.init_cfg` is None (do not use `Pretrained` init way), you + need add the folder name in `ignores_folder` (if the config files in this + folder all set backbone.init_cfg is None) or add config name in + `ignores_file` (if the config file set backbone.init_cfg is None) + """ + config_path = _get_config_directory() + check_cfg_names = [] + + # `base`, `legacy_1.x` and `common` ignored by default. + ignores_folder = ['_base_', 'legacy_1.x', 'common'] + # 'ld' need load teacher model, if want to check 'ld', + # please check teacher_config path first. + ignores_folder += ['ld'] + # `selfsup_pretrain` need convert model, if want to check this model, + # need to convert the model first. + ignores_folder += ['selfsup_pretrain'] + + # the `init_cfg` in 'centripetalnet', 'cornernet', 'cityscapes', + # 'scratch' is None. 
+ # the `init_cfg` in ssdlite(`ssdlite_mobilenetv2_scratch_600e_coco.py`) + # is None + # Please confirm `bockbone.init_cfg` is None first. + ignores_folder += ['centripetalnet', 'cornernet', 'cityscapes', 'scratch'] + ignores_file = ['ssdlite_mobilenetv2_scratch_600e_coco.py'] + + for config_file_name in os.listdir(config_path): + if config_file_name not in ignores_folder: + config_file = join(config_path, config_file_name) + if os.path.isdir(config_file): + for config_sub_file in os.listdir(config_file): + if config_sub_file.endswith('py') and \ + config_sub_file not in ignores_file: + name = join(config_file, config_sub_file) + check_cfg_names.append(name) + return check_cfg_names + + +def _check_backbone(config, print_cfg=True): + """Check out backbone whether successfully load pretrained model, by using + `backbone.init_cfg`. + + First, using `CheckpointLoader.load_checkpoint` to load the checkpoint + without loading models. + Then, using `MODELS.build` to build models, and using + `model.init_weights()` to initialize the parameters. + Finally, assert weights and bias of each layer loaded from pretrained + checkpoint are equal to the weights and bias of original checkpoint. + For the convenience of comparison, we sum up weights and bias of + each loaded layer separately. + + Args: + config (str): Config file path. + print_cfg (bool): Whether print logger and return the result. + + Returns: + results (str or None): If backbone successfully load pretrained + checkpoint, return None; else, return config file path. + """ + if print_cfg: + print('-' * 15 + 'loading ', config) + cfg = Config.fromfile(config) + init_cfg = None + try: + init_cfg = cfg.model.backbone.init_cfg + init_flag = True + except AttributeError: + init_flag = False + if init_cfg is None or init_cfg.get('type') != 'Pretrained': + init_flag = False + if init_flag: + checkpoint = CheckpointLoader.load_checkpoint(init_cfg.checkpoint) + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + model = MODELS.build(cfg.model) + model.init_weights() + + checkpoint_layers = state_dict.keys() + for name, value in model.backbone.state_dict().items(): + if name in checkpoint_layers: + assert value.equal(state_dict[name]) + + if print_cfg: + print('-' * 10 + 'Successfully load checkpoint' + '-' * 10 + + '\n', ) + return None + else: + if print_cfg: + print(config + '\n' + '-' * 10 + + 'config file do not have init_cfg' + '-' * 10 + '\n') + return config + + +@pytest.mark.parametrize('config', _traversed_config_file()) +def test_load_pretrained(config): + """Check out backbone whether successfully load pretrained model by using + `backbone.init_cfg`. + + Details please refer to `_check_backbone` + """ + _check_backbone(config, print_cfg=False) + + +def _test_load_pretrained(): + """We traversed all potential config files under the `config` file. If you + need to print details or debug code, you can use this function. + + Returns: + check_cfg_names (list[str]): Config files that backbone initialized + from pretrained checkpoint might be problematic. Need to recheck + the config file. 
The output including the config files that the + backbone.init_cfg is None + """ + check_cfg_names = _traversed_config_file() + need_check_cfg = [] + + prog_bar = ProgressBar(len(check_cfg_names)) + for config in check_cfg_names: + init_cfg_name = _check_backbone(config) + if init_cfg_name is not None: + need_check_cfg.append(init_cfg_name) + prog_bar.update() + print('These config files need to be checked again') + print(need_check_cfg) diff --git a/mmdetection/.owners.yml b/mmdetection/.owners.yml new file mode 100644 index 0000000..97296aa --- /dev/null +++ b/mmdetection/.owners.yml @@ -0,0 +1,14 @@ +assign: + strategy: + # random + daily-shift-based + scedule: + '*/1 * * * *' + assignees: + - Czm369 + - hhaAndroid + - jbwang1997 + - RangiLyu + - BIGWangYuDong + - chhluo + - ZwwWayne diff --git a/mmdetection/.pre-commit-config-zh-cn.yaml b/mmdetection/.pre-commit-config-zh-cn.yaml new file mode 100644 index 0000000..3859170 --- /dev/null +++ b/mmdetection/.pre-commit-config-zh-cn.yaml @@ -0,0 +1,61 @@ +exclude: ^tests/data/ +repos: + - repo: https://gitee.com/openmmlab/mirrors-flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://gitee.com/openmmlab/mirrors-isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://gitee.com/openmmlab/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://gitee.com/openmmlab/mirrors-mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://gitee.com/openmmlab/mirrors-codespell + rev: v2.2.1 + hooks: + - id: codespell + - repo: https://gitee.com/openmmlab/mirrors-docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://gitee.com/openmmlab/mirrors-pyupgrade + rev: v3.0.0 + hooks: + - id: pyupgrade + args: ["--py36-plus"] + - repo: https://gitee.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-algo-readme + - id: check-copyright + args: ["mmdet"] +# - repo: https://gitee.com/openmmlab/mirrors-mypy +# rev: v0.812 +# hooks: +# - id: mypy +# exclude: "docs" diff --git a/mmdetection/.pre-commit-config.yaml b/mmdetection/.pre-commit-config.yaml new file mode 100644 index 0000000..6ea250c --- /dev/null +++ b/mmdetection/.pre-commit-config.yaml @@ -0,0 +1,50 @@ +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/codespell-project/codespell + rev: v2.2.1 + hooks: + - id: codespell + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - 
mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://github.com/myint/docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 # Use the ref you want to point at + hooks: + - id: check-algo-readme + - id: check-copyright + args: ["mmdet"] # replace the dir_to_check with your expected directory to check diff --git a/mmdetection/.readthedocs.yml b/mmdetection/.readthedocs.yml new file mode 100644 index 0000000..9b59797 --- /dev/null +++ b/mmdetection/.readthedocs.yml @@ -0,0 +1,14 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.8" + +formats: + - epub + +python: + install: + - requirements: requirements/docs.txt + - requirements: requirements/readthedocs.txt diff --git a/mmdetection/CITATION.cff b/mmdetection/CITATION.cff new file mode 100644 index 0000000..aac9313 --- /dev/null +++ b/mmdetection/CITATION.cff @@ -0,0 +1,8 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +authors: + - name: "MMDetection Contributors" +title: "OpenMMLab Detection Toolbox and Benchmark" +date-released: 2018-08-22 +url: "https://github.com/open-mmlab/mmdetection" +license: Apache-2.0 diff --git a/mmdetection/LICENSE b/mmdetection/LICENSE new file mode 100644 index 0000000..1bfc23e --- /dev/null +++ b/mmdetection/LICENSE @@ -0,0 +1,203 @@ +Copyright 2018-2023 OpenMMLab. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018-2023 OpenMMLab. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/mmdetection/MANIFEST.in b/mmdetection/MANIFEST.in new file mode 100644 index 0000000..7398e6a --- /dev/null +++ b/mmdetection/MANIFEST.in @@ -0,0 +1,7 @@ +include requirements/*.txt +include mmdet/VERSION +include mmdet/.mim/model-index.yml +include mmdet/.mim/dataset-index.yml +include mmdet/.mim/demo/*/* +recursive-include mmdet/.mim/configs *.py *.yml +recursive-include mmdet/.mim/tools *.sh *.py diff --git a/mmdetection/README.md b/mmdetection/README.md new file mode 100644 index 0000000..09e20cf --- /dev/null +++ b/mmdetection/README.md @@ -0,0 +1,487 @@ +
+ +
 
+
+  OpenMMLab website (HOT) &nbsp;&nbsp;&nbsp;&nbsp; OpenMMLab platform (TRY IT OUT)
+
 
+ +[![PyPI](https://img.shields.io/pypi/v/mmdet)](https://pypi.org/project/mmdet) +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmdetection.readthedocs.io/en/latest/) +[![badge](https://github.com/open-mmlab/mmdetection/workflows/build/badge.svg)](https://github.com/open-mmlab/mmdetection/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmdetection/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmdetection) +[![license](https://img.shields.io/github/license/open-mmlab/mmdetection.svg)](https://github.com/open-mmlab/mmdetection/blob/main/LICENSE) +[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmdetection.svg)](https://github.com/open-mmlab/mmdetection/issues) +[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmdetection.svg)](https://github.com/open-mmlab/mmdetection/issues) +[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_demo.svg)](https://openxlab.org.cn/apps?search=mmdet) + +[📘Documentation](https://mmdetection.readthedocs.io/en/latest/) | +[🛠️Installation](https://mmdetection.readthedocs.io/en/latest/get_started.html) | +[👀Model Zoo](https://mmdetection.readthedocs.io/en/latest/model_zoo.html) | +[🆕Update News](https://mmdetection.readthedocs.io/en/latest/notes/changelog.html) | +[🚀Ongoing Projects](https://github.com/open-mmlab/mmdetection/projects) | +[🤔Reporting Issues](https://github.com/open-mmlab/mmdetection/issues/new/choose) + +
+ +
+ +English | [简体中文](README_zh-CN.md) + +
+ +
+ + + + + + + + + + + + + + + + + +
+ +
+ +
+ +## Introduction + +MMDetection is an open source object detection toolbox based on PyTorch. It is +a part of the [OpenMMLab](https://openmmlab.com/) project. + +The main branch works with **PyTorch 1.8+**. + + + +
+Major features + +- **Modular Design** + + We decompose the detection framework into different components and one can easily construct a customized object detection framework by combining different modules. + +- **Support of multiple tasks out of box** + + The toolbox directly supports multiple detection tasks such as **object detection**, **instance segmentation**, **panoptic segmentation**, and **semi-supervised object detection**. + +- **High efficiency** + + All basic bbox and mask operations run on GPUs. The training speed is faster than or comparable to other codebases, including [Detectron2](https://github.com/facebookresearch/detectron2), [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark) and [SimpleDet](https://github.com/TuSimple/simpledet). + +- **State of the art** + + The toolbox stems from the codebase developed by the *MMDet* team, who won [COCO Detection Challenge](http://cocodataset.org/#detection-leaderboard) in 2018, and we keep pushing it forward. + The newly released [RTMDet](configs/rtmdet) also obtains new state-of-the-art results on real-time instance segmentation and rotated object detection tasks and the best parameter-accuracy trade-off on object detection. + +
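As a small, simplified illustration of the modular design described above: a customized detector is normally assembled by inheriting a base config and overriding only the modules that change. The file path and base config name below are hypothetical and may differ between releases, so treat this as a sketch rather than a verbatim recipe.

```python
# configs/custom/faster-rcnn_r101_fpn_1x_coco.py  (hypothetical location)
# Reuse the entire R-50 Faster R-CNN pipeline and swap a single component.
_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'

model = dict(
    backbone=dict(
        depth=101,  # replace the ResNet-50 backbone with ResNet-101
        init_cfg=dict(type='Pretrained',
                      checkpoint='torchvision://resnet101')))
```

Every other component (neck, dense head, RoI head, training schedule) is inherited unchanged from the base config.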
+ +Apart from MMDetection, we also released [MMEngine](https://github.com/open-mmlab/mmengine) for model training and [MMCV](https://github.com/open-mmlab/mmcv) for computer vision research, which are heavily depended on by this toolbox. + +## What's New + +### Highlight + +**v3.2.0** was released in 12/10/2023: + +**1. Detection Transformer SOTA Model Collection** +(1) Supported four updated and stronger SOTA Transformer models: [DDQ](configs/ddq/README.md), [CO-DETR](projects/CO-DETR/README.md), [AlignDETR](projects/AlignDETR/README.md), and [H-DINO](projects/HDINO/README.md). +(2) Based on CO-DETR, MMDet released a model with a COCO performance of 64.1 mAP. +(3) Algorithms such as DINO support `AMP/Checkpoint/FrozenBN`, which can effectively reduce memory usage. + +**2. [Comprehensive Performance Comparison between CNN and Transformer](<(projects/RF100-Benchmark/README.md)>)** +RF100 consists of a dataset collection of 100 real-world datasets, including 7 domains. It can be used to assess the performance differences of Transformer models like DINO and CNN-based algorithms under different scenarios and data volumes. Users can utilize this benchmark to quickly evaluate the robustness of their algorithms in various scenarios. + +
+ +
+ +**3. Support for [GLIP](configs/glip/README.md) and [Grounding DINO](configs/grounding_dino/README.md) fine-tuning, the only algorithm library that supports Grounding DINO fine-tuning** +The Grounding DINO algorithm in MMDet is the only library that supports fine-tuning. Its performance is one point higher than the official version, and of course, GLIP also outperforms the official version. +We also provide a detailed process for training and evaluating Grounding DINO on custom datasets. Everyone is welcome to give it a try. + +| Model | Backbone | Style | COCO mAP | Official COCO mAP | +| :----------------: | :------: | :-------: | :--------: | :---------------: | +| Grounding DINO-T | Swin-T | Zero-shot | 48.5 | 48.4 | +| Grounding DINO-T | Swin-T | Finetune | 58.1(+0.9) | 57.2 | +| Grounding DINO-B | Swin-B | Zero-shot | 56.9 | 56.7 | +| Grounding DINO-B | Swin-B | Finetune | 59.7 | | +| Grounding DINO-R50 | R50 | Scratch | 48.9(+0.8) | 48.1 | + +**4. Support for the open-vocabulary detection algorithm [Detic](projects/Detic_new/README.md) and multi-dataset joint training.** +**5. Training detection models using [FSDP and DeepSpeed](<(projects/example_largemodel/README.md)>).** + +| ID | AMP | GC of Backbone | GC of Encoder | FSDP | Peak Mem (GB) | Iter Time (s) | +| :-: | :-: | :------------: | :-----------: | :--: | :-----------: | :-----------: | +| 1 | | | | | 49 (A100) | 0.9 | +| 2 | √ | | | | 39 (A100) | 1.2 | +| 3 | | √ | | | 33 (A100) | 1.1 | +| 4 | √ | √ | | | 25 (A100) | 1.3 | +| 5 | | √ | √ | | 18 | 2.2 | +| 6 | √ | √ | √ | | 13 | 1.6 | +| 7 | | √ | √ | √ | 14 | 2.9 | +| 8 | √ | √ | √ | √ | 8.5 | 2.4 | + +**6. Support for the [V3Det](configs/v3det/README.md) dataset, a large-scale detection dataset with over 13,000 categories.** + +
+ +
+ +We are excited to announce our latest work on real-time object recognition tasks, **RTMDet**, a family of fully convolutional single-stage detectors. RTMDet not only achieves the best parameter-accuracy trade-off on object detection from tiny to extra-large model sizes but also obtains new state-of-the-art performance on instance segmentation and rotated object detection tasks. Details can be found in the [technical report](https://arxiv.org/abs/2212.07784). Pre-trained models are [here](configs/rtmdet). + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + +| Task | Dataset | AP | FPS(TRT FP16 BS1 3090) | +| ------------------------ | ------- | ------------------------------------ | ---------------------- | +| Object Detection | COCO | 52.8 | 322 | +| Instance Segmentation | COCO | 44.6 | 188 | +| Rotated Object Detection | DOTA | 78.9(single-scale)/81.3(multi-scale) | 121 | + +
+ +
+ +## Installation + +Please refer to [Installation](https://mmdetection.readthedocs.io/en/latest/get_started.html) for installation instructions. + +## Getting Started + +Please see [Overview](https://mmdetection.readthedocs.io/en/latest/get_started.html) for the general introduction of MMDetection. + +For detailed user guides and advanced guides, please refer to our [documentation](https://mmdetection.readthedocs.io/en/latest/): + +- User Guides + +
+ + - [Train & Test](https://mmdetection.readthedocs.io/en/latest/user_guides/index.html#train-test) + - [Learn about Configs](https://mmdetection.readthedocs.io/en/latest/user_guides/config.html) + - [Inference with existing models](https://mmdetection.readthedocs.io/en/latest/user_guides/inference.html) + - [Dataset Prepare](https://mmdetection.readthedocs.io/en/latest/user_guides/dataset_prepare.html) + - [Test existing models on standard datasets](https://mmdetection.readthedocs.io/en/latest/user_guides/test.html) + - [Train predefined models on standard datasets](https://mmdetection.readthedocs.io/en/latest/user_guides/train.html) + - [Train with customized datasets](https://mmdetection.readthedocs.io/en/latest/user_guides/train.html#train-with-customized-datasets) + - [Train with customized models and standard datasets](https://mmdetection.readthedocs.io/en/latest/user_guides/new_model.html) + - [Finetuning Models](https://mmdetection.readthedocs.io/en/latest/user_guides/finetune.html) + - [Test Results Submission](https://mmdetection.readthedocs.io/en/latest/user_guides/test_results_submission.html) + - [Weight initialization](https://mmdetection.readthedocs.io/en/latest/user_guides/init_cfg.html) + - [Use a single stage detector as RPN](https://mmdetection.readthedocs.io/en/latest/user_guides/single_stage_as_rpn.html) + - [Semi-supervised Object Detection](https://mmdetection.readthedocs.io/en/latest/user_guides/semi_det.html) + - [Useful Tools](https://mmdetection.readthedocs.io/en/latest/user_guides/index.html#useful-tools) + +
+ +- Advanced Guides + +
+ + - [Basic Concepts](https://mmdetection.readthedocs.io/en/latest/advanced_guides/index.html#basic-concepts) + - [Component Customization](https://mmdetection.readthedocs.io/en/latest/advanced_guides/index.html#component-customization) + - [How to](https://mmdetection.readthedocs.io/en/latest/advanced_guides/index.html#how-to) + +
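To complement the guides above, the sketch below drives the high-level inference helpers in `mmdet.apis`; the config and checkpoint paths are placeholders and should be replaced with any real pair from the model zoo.

```python
from mmdet.apis import inference_detector, init_detector

# Placeholder paths: use any config/checkpoint pair downloaded from the model zoo.
config_file = 'configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py'
checkpoint_file = 'checkpoints/rtmdet_tiny_8xb32-300e_coco.pth'

model = init_detector(config_file, checkpoint_file, device='cpu')  # or 'cuda:0'
result = inference_detector(model, 'demo/demo.jpg')  # demo image shipped with the repo
print(result.pred_instances.scores[:5])  # predictions come back as a DetDataSample
```

The inference guide linked above covers batched inference, visualization, and the higher-level `DetInferencer` wrapper.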
+ +We also provide object detection colab tutorial [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](demo/MMDet_Tutorial.ipynb) and instance segmentation colab tutorial [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](demo/MMDet_InstanceSeg_Tutorial.ipynb). + +To migrate from MMDetection 2.x, please refer to [migration](https://mmdetection.readthedocs.io/en/latest/migration.html). + +## Overview of Benchmark and Model Zoo + +Results and models are available in the [model zoo](docs/en/model_zoo.md). + +
+  <b>Architectures</b>: supported algorithms grouped by task into Object Detection, Instance Segmentation, Panoptic Segmentation, and Other (Contrastive Learning, Distillation, Semi-Supervised Object Detection).
+
+  <b>Components</b>: supported modules grouped into Backbones, Necks, Loss, and Common.
+
+Some other methods are also supported in [projects using MMDetection](./docs/en/notes/projects.md).
+
+## FAQ
+
+Please refer to [FAQ](docs/en/notes/faq.md) for frequently asked questions.
+
+## Contributing
+
+We appreciate all contributions to improve MMDetection. Ongoing projects can be found in our [GitHub Projects](https://github.com/open-mmlab/mmdetection/projects). Community users are welcome to participate in these projects. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guidelines.
+
+## Acknowledgement
+
+MMDetection is an open source project contributed to by researchers and engineers from various colleges and companies. We appreciate all the contributors who implement their methods or add new features, as well as users who give valuable feedback.
+We hope the toolbox and benchmark serve the growing research community by providing a flexible toolkit to reimplement existing methods and develop their own new detectors.
+
+## Citation
+
+If you use this toolbox or benchmark in your research, please cite this project.
+
+```
+@article{mmdetection,
+  title   = {{MMDetection}: Open MMLab Detection Toolbox and Benchmark},
+  author  = {Chen, Kai and Wang, Jiaqi and Pang, Jiangmiao and Cao, Yuhang and
+             Xiong, Yu and Li, Xiaoxiao and Sun, Shuyang and Feng, Wansen and
+             Liu, Ziwei and Xu, Jiarui and Zhang, Zheng and Cheng, Dazhi and
+             Zhu, Chenchen and Cheng, Tianheng and Zhao, Qijie and Li, Buyu and
+             Lu, Xin and Zhu, Rui and Wu, Yue and Dai, Jifeng and Wang, Jingdong
+             and Shi, Jianping and Ouyang, Wanli and Loy, Chen Change and Lin, Dahua},
+  journal = {arXiv preprint arXiv:1906.07155},
+  year    = {2019}
+}
+```
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Projects in OpenMMLab
+
+- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
+- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark.
+- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox.
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMEval](https://github.com/open-mmlab/mmeval): A unified evaluation library for multiple machine learning libraries. +- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab. diff --git a/mmdetection/README_zh-CN.md b/mmdetection/README_zh-CN.md new file mode 100644 index 0000000..ccf1cbf --- /dev/null +++ b/mmdetection/README_zh-CN.md @@ -0,0 +1,507 @@ +
    + +
     
    +
+  OpenMMLab 官网 (HOT) &nbsp;&nbsp;&nbsp;&nbsp; OpenMMLab 开放平台 (TRY IT OUT)
    +
     
    + +[![PyPI](https://img.shields.io/pypi/v/mmdet)](https://pypi.org/project/mmdet) +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmdetection.readthedocs.io/en/latest/) +[![badge](https://github.com/open-mmlab/mmdetection/workflows/build/badge.svg)](https://github.com/open-mmlab/mmdetection/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmdetection/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmdetection) +[![license](https://img.shields.io/github/license/open-mmlab/mmdetection.svg)](https://github.com/open-mmlab/mmdetection/blob/main/LICENSE) +[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmdetection.svg)](https://github.com/open-mmlab/mmdetection/issues) +[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmdetection.svg)](https://github.com/open-mmlab/mmdetection/issues) +[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_demo.svg)](https://openxlab.org.cn/apps?search=mmdet) + +[📘使用文档](https://mmdetection.readthedocs.io/zh_CN/latest/) | +[🛠️安装教程](https://mmdetection.readthedocs.io/zh_CN/latest/get_started.html) | +[👀模型库](https://mmdetection.readthedocs.io/zh_CN/latest/model_zoo.html) | +[🆕更新日志](https://mmdetection.readthedocs.io/en/latest/notes/changelog.html) | +[🚀进行中的项目](https://github.com/open-mmlab/mmdetection/projects) | +[🤔报告问题](https://github.com/open-mmlab/mmdetection/issues/new/choose) + +
    + +
    + +[English](README.md) | 简体中文 + +
    + +
    + + + + + + + + + + + + + + + + + +
    + +
    + +
    + +## 简介 + +MMDetection 是一个基于 PyTorch 的目标检测开源工具箱。它是 [OpenMMLab](https://openmmlab.com/) 项目的一部分。 + +主分支代码目前支持 PyTorch 1.8 及其以上的版本。 + + + +
    +主要特性 + +- **模块化设计** + + MMDetection 将检测框架解耦成不同的模块组件,通过组合不同的模块组件,用户可以便捷地构建自定义的检测模型 + +- **支持多种检测任务** + + MMDetection 支持了各种不同的检测任务,包括**目标检测**,**实例分割**,**全景分割**,以及**半监督目标检测**。 + +- **速度快** + + 基本的框和 mask 操作都实现了 GPU 版本,训练速度比其他代码库更快或者相当,包括 [Detectron2](https://github.com/facebookresearch/detectron2), [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark) 和 [SimpleDet](https://github.com/TuSimple/simpledet)。 + +- **性能高** + + MMDetection 这个算法库源自于 COCO 2018 目标检测竞赛的冠军团队 *MMDet* 团队开发的代码,我们在之后持续进行了改进和提升。 + 新发布的 [RTMDet](configs/rtmdet) 还在实时实例分割和旋转目标检测任务中取得了最先进的成果,同时也在目标检测模型中取得了最佳的的参数量和精度平衡。 + +
    + +除了 MMDetection 之外,我们还开源了深度学习训练库 [MMEngine](https://github.com/open-mmlab/mmengine) 和计算机视觉基础库 [MMCV](https://github.com/open-mmlab/mmcv),它们是 MMDetection 的主要依赖。 + +## 最新进展 + +### 亮点 + +**v3.2.0** 版本已经在 2023.10.12 发布: + +**1. 检测 Transformer SOTA 模型大合集** +(1) 支持了 [DDQ](configs/ddq/README.md)、[CO-DETR](projects/CO-DETR/README.md)、[AlignDETR](projects/AlignDETR/README.md) 和 [H-DINO](projects/HDINO/README.md) 4 个更新更强的 SOTA Transformer 模型 +(2) 基于 CO-DETR, MMDet 中发布了 COCO 性能为 64.1 mAP 的模型 +(3) DINO 等算法支持 AMP/Checkpoint/FrozenBN,可以有效降低显存 + +**2. [提供了全面的 CNN 和 Transformer 的性能对比](projects/RF100-Benchmark/README_zh-CN.md)** +RF100 是由 100 个现实收集的数据集组成,包括 7 个域,可以验证 DINO 等 Transformer 模型和 CNN 类算法在不同场景不同数据量下的性能差异。用户可以用这个 Benchmark 快速验证自己的算法在不同场景下的鲁棒性。 + +
    + +
    + +**3. 支持了 [GLIP](configs/glip/README.md) 和 [Grounding DINO](configs/grounding_dino/README.md) 微调,全网唯一支持 Grounding DINO 微调** +MMDet 中的 Grounding DINO 是全网唯一支持微调的算法库,且性能高于官方 1 个点,当然 GLIP 也比官方高。 +我们还提供了详细的 Grounding DINO 在自定义数据集上训练评估的流程,欢迎大家试用。 + +| Model | Backbone | Style | COCO mAP | Official COCO mAP | +| :----------------: | :------: | :-------: | :--------: | :---------------: | +| Grounding DINO-T | Swin-T | Zero-shot | 48.5 | 48.4 | +| Grounding DINO-T | Swin-T | Finetune | 58.1(+0.9) | 57.2 | +| Grounding DINO-B | Swin-B | Zero-shot | 56.9 | 56.7 | +| Grounding DINO-B | Swin-B | Finetune | 59.7 | | +| Grounding DINO-R50 | R50 | Scratch | 48.9(+0.8) | 48.1 | + +**4. 支持开放词汇检测算法 [Detic](projects/Detic_new/README.md) 并提供多数据集联合训练可能** + +**5. 轻松使用 [FSDP 和 DeepSpeed 训练检测模型](projects/example_largemodel/README_zh-CN.md)** + +| ID | AMP | GC of Backbone | GC of Encoder | FSDP | Peak Mem (GB) | Iter Time (s) | +| :-: | :-: | :------------: | :-----------: | :--: | :-----------: | :-----------: | +| 1 | | | | | 49 (A100) | 0.9 | +| 2 | √ | | | | 39 (A100) | 1.2 | +| 3 | | √ | | | 33 (A100) | 1.1 | +| 4 | √ | √ | | | 25 (A100) | 1.3 | +| 5 | | √ | √ | | 18 | 2.2 | +| 6 | √ | √ | √ | | 13 | 1.6 | +| 7 | | √ | √ | √ | 14 | 2.9 | +| 8 | √ | √ | √ | √ | 8.5 | 2.4 | + +**6. 支持了 [V3Det](configs/v3det/README.md) 1.3w+ 类别的超大词汇检测数据集** + +
    + +
    + +我们很高兴向大家介绍我们在实时目标识别任务方面的最新成果 RTMDet,包含了一系列的全卷积单阶段检测模型。 RTMDet 不仅在从 tiny 到 extra-large 尺寸的目标检测模型上实现了最佳的参数量和精度的平衡,而且在实时实例分割和旋转目标检测任务上取得了最先进的成果。 更多细节请参阅[技术报告](https://arxiv.org/abs/2212.07784)。 预训练模型可以在[这里](configs/rtmdet)找到。 + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + +| Task | Dataset | AP | FPS(TRT FP16 BS1 3090) | +| ------------------------ | ------- | ------------------------------------ | ---------------------- | +| Object Detection | COCO | 52.8 | 322 | +| Instance Segmentation | COCO | 44.6 | 188 | +| Rotated Object Detection | DOTA | 78.9(single-scale)/81.3(multi-scale) | 121 | + +
+
+
+
+## Installation
+
+Please refer to the [get started guide](https://mmdetection.readthedocs.io/zh_CN/latest/get_started.html) for installation instructions.
+
+## Tutorials
+
+Please read the [overview](https://mmdetection.readthedocs.io/zh_CN/latest/get_started.html) to get a first impression of MMDetection.
+
+To help users go further with MMDetection, we provide both user guides and advanced guides; please read our [documentation](https://mmdetection.readthedocs.io/zh_CN/latest/):
+
+- User Guides
+
+
+  - [Train & Test](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/index.html#train-test)
+  - [Learn about Configs](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/config.html) (see the config sketch below the guide lists)
+  - [Inference with existing models on standard datasets](https://mmdetection.readthedocs.io/en/latest/user_guides/inference.html)
+  - [Dataset Preparation](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/dataset_prepare.html)
+  - [Test existing models](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/test.html)
+  - [Train predefined models on standard datasets](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html)
+  - [Train with customized datasets](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html#train-with-customized-datasets)
+  - [Train custom models on standard datasets](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/new_model.html)
+  - [Finetuning models](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/finetune.html)
+  - [Test results submission](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/test_results_submission.html)
+  - [Weight initialization](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/init_cfg.html)
+  - [Use a single-stage detector as RPN](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/single_stage_as_rpn.html)
+  - [Semi-supervised object detection](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/semi_det.html)
+  - [Useful tools](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/index.html#useful-tools)
+
+
+- Advanced Guides
+
+
+  - [Basic Concepts](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/index.html#basic-concepts)
+  - [Component Customization](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/index.html#component-customization)
+  - [How to](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/index.html#how-to)
+
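+
+As a quick orientation for the config system referenced in "Learn about Configs", the sketch below shows how a user config typically composes the `_base_` files shipped under `configs/_base_/` (including the dataset configs added in this patch) and overrides only what differs; the file name and the 3-class override are illustrative assumptions:
+
+```python
+# my_faster_rcnn_coco.py -- hypothetical user config
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',   # model definition
+    '../_base_/datasets/coco_detection.py',      # dataset and dataloader settings
+    '../_base_/schedules/schedule_1x.py',        # optimizer and LR schedule
+    '../_base_/default_runtime.py',              # logging, checkpointing, hooks
+]
+
+# Override only the fields that differ from the base configs,
+# e.g. the number of classes for a custom 3-class dataset.
+model = dict(roi_head=dict(bbox_head=dict(num_classes=3)))
+```
+
+Such a config can then be loaded and inspected with `mmengine.Config.fromfile()` before training.
+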
+
+We provide a Colab tutorial for object detection [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](demo/MMDet_Tutorial.ipynb) and a Colab tutorial for instance segmentation [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](demo/MMDet_Tutorial.ipynb).
+
+We also provide a [collection of Chinese articles explaining MMDetection](docs/zh_cn/article.md).
+
+To migrate code from the 2.x version to the new version, please refer to the [migration guide](https://mmdetection.readthedocs.io/en/latest/migration.html).
+
+## Benchmark and Model Zoo
+
+Results and models are available in the [model zoo](docs/zh_cn/model_zoo.md).
+
+**Algorithm architectures**, grouped by task: Object Detection, Instance Segmentation, Panoptic Segmentation, and Other (Contrastive Learning, Distillation, Semi-Supervised Object Detection).
+
+**Module components**: Backbones, Necks, Loss, and Common.
+
+
+Other supported algorithms are listed in [Projects based on MMDetection](./docs/zh_cn/notes/projects.md).
+
+## FAQ
+
+Please refer to the [FAQ](docs/zh_cn/notes/faq.md) for frequently asked questions from other users.
+
+## Contributing
+
+We appreciate all contributions made to improve MMDetection. Ongoing projects are tracked on our [GitHub Projects](https://github.com/open-mmlab/mmdetection/projects) page, and community users are very welcome to join them. Please refer to the [contributing guide](.github/CONTRIBUTING.md) for guidelines on how to contribute.
+
+## Acknowledgement
+
+MMDetection is an open-source project contributed by researchers and engineers from various universities and companies. We thank all contributors who implement algorithms and add new features, as well as the users who provide valuable feedback. We hope this toolbox and benchmark serve the community as a flexible toolkit for reimplementing existing methods and developing new models, and keep contributing to the open-source ecosystem.
+
+## Citation
+
+If you use this toolbox or benchmark in your research, please cite MMDetection with the following BibTeX entry.
+
+```
+@article{mmdetection,
+  title   = {{MMDetection}: Open MMLab Detection Toolbox and Benchmark},
+  author  = {Chen, Kai and Wang, Jiaqi and Pang, Jiangmiao and Cao, Yuhang and
+             Xiong, Yu and Li, Xiaoxiao and Sun, Shuyang and Feng, Wansen and
+             Liu, Ziwei and Xu, Jiarui and Zhang, Zheng and Cheng, Dazhi and
+             Zhu, Chenchen and Cheng, Tianheng and Zhao, Qijie and Li, Buyu and
+             Lu, Xin and Zhu, Rui and Wu, Yue and Dai, Jifeng and Wang, Jingdong
+             and Shi, Jianping and Ouyang, Wanli and Loy, Chen Change and Lin, Dahua},
+  journal= {arXiv preprint arXiv:1906.07155},
+  year={2019}
+}
+```
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Other OpenMMLab Projects
+
+- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision
+- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab deep learning pre-training toolbox
+- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab next-generation toolbox for AI-generated content (AIGC)
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab object detection toolbox
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab next-generation platform for general 3D object detection
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab full-pipeline toolkit for text detection, recognition, and understanding
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab few-shot learning toolbox and benchmark
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab next-generation video understanding toolbox
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab unified video perception platform
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow estimation toolbox and benchmark
+- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework
+- [MIM](https://github.com/open-mmlab/mim): the unified entry point for OpenMMLab projects, algorithms, and models
+- [MMEval](https://github.com/open-mmlab/mmeval): a unified and open cross-framework evaluation library
+- [Playground](https://github.com/open-mmlab/playground): a hub that collects and showcases cutting-edge and interesting community projects built on OpenMMLab
+
+## Welcome to the OpenMMLab Community
+
+Scan the QR code below to follow the OpenMMLab team's [official Zhihu account](https://www.zhihu.com/people/openmmlab), and scan the WeChat QR code to add our community assistant and join the MMDetection WeChat group. (Friend request format: research direction + region + school/company + name)
+
+
+
    + +我们会在 OpenMMLab 社区为大家 + +- 📢 分享 AI 框架的前沿核心技术 +- 💻 解读 PyTorch 常用模块源码 +- 📰 发布 OpenMMLab 的相关新闻 +- 🚀 介绍 OpenMMLab 开发的前沿算法 +- 🏃 获取更高效的问题答疑和意见反馈 +- 🔥 提供与各行各业开发者充分交流的平台 + +干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬 diff --git a/mmdetection/configs/_base_/datasets/ade20k_instance.py b/mmdetection/configs/_base_/datasets/ade20k_instance.py new file mode 100644 index 0000000..57f657a --- /dev/null +++ b/mmdetection/configs/_base_/datasets/ade20k_instance.py @@ -0,0 +1,53 @@ +# dataset settings +dataset_type = 'ADE20KInstanceDataset' +data_root = 'data/ADEChallengeData2016/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/ADEChallengeData2016/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(2560, 640), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='ade20k_instance_val.json', + data_prefix=dict(img='images/validation'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'ade20k_instance_val.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/ade20k_panoptic.py b/mmdetection/configs/_base_/datasets/ade20k_panoptic.py new file mode 100644 index 0000000..7be5ddd --- /dev/null +++ b/mmdetection/configs/_base_/datasets/ade20k_panoptic.py @@ -0,0 +1,38 @@ +# dataset settings +dataset_type = 'ADE20KPanopticDataset' +data_root = 'data/ADEChallengeData2016/' + +backend_args = None + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(2560, 640), keep_ratio=True), + dict(type='LoadPanopticAnnotations', backend_args=backend_args), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=0, + persistent_workers=False, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='ade20k_panoptic_val.json', + data_prefix=dict(img='images/validation/', seg='ade20k_panoptic_val/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoPanopticMetric', + ann_file=data_root + 'ade20k_panoptic_val.json', + seg_prefix=data_root + 'ade20k_panoptic_val/', + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/ade20k_semantic.py 
b/mmdetection/configs/_base_/datasets/ade20k_semantic.py new file mode 100644 index 0000000..522a775 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/ade20k_semantic.py @@ -0,0 +1,48 @@ +dataset_type = 'ADE20KSegDataset' +data_root = 'data/ADEChallengeData2016/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/ADEChallengeData2016/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + dict( + type='LoadAnnotations', + with_bbox=False, + with_mask=False, + with_seg=True, + reduce_zero_label=True), + dict( + type='PackDetInputs', meta_keys=('img_path', 'ori_shape', 'img_shape')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/validation', + seg_map_path='annotations/validation'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='SemSegMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/cityscapes_detection.py b/mmdetection/configs/_base_/datasets/cityscapes_detection.py new file mode 100644 index 0000000..caeba6b --- /dev/null +++ b/mmdetection/configs/_base_/datasets/cityscapes_detection.py @@ -0,0 +1,84 @@ +# dataset settings +dataset_type = 'CityscapesDataset' +data_root = 'data/cityscapes/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/segmentation/cityscapes/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/segmentation/', +# 'data/': 's3://openmmlab/datasets/segmentation/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=[(2048, 800), (2048, 1024)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='RepeatDataset', + times=8, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instancesonly_filtered_gtFine_train.json', + 
data_prefix=dict(img='leftImg8bit/train/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instancesonly_filtered_gtFine_val.json', + data_prefix=dict(img='leftImg8bit/val/'), + test_mode=True, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=test_pipeline, + backend_args=backend_args)) + +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instancesonly_filtered_gtFine_val.json', + metric='bbox', + backend_args=backend_args) + +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/cityscapes_instance.py b/mmdetection/configs/_base_/datasets/cityscapes_instance.py new file mode 100644 index 0000000..1364031 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/cityscapes_instance.py @@ -0,0 +1,113 @@ +# dataset settings +dataset_type = 'CityscapesDataset' +data_root = 'data/cityscapes/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/segmentation/cityscapes/' + +# Method 2: Use backend_args, file_client_args in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/segmentation/', +# 'data/': 's3://openmmlab/datasets/segmentation/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=[(2048, 800), (2048, 1024)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='RepeatDataset', + times=8, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instancesonly_filtered_gtFine_train.json', + data_prefix=dict(img='leftImg8bit/train/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instancesonly_filtered_gtFine_val.json', + data_prefix=dict(img='leftImg8bit/val/'), + test_mode=True, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=test_pipeline, + backend_args=backend_args)) + +test_dataloader = val_dataloader + +val_evaluator = [ + dict( + type='CocoMetric', + ann_file=data_root + + 
'annotations/instancesonly_filtered_gtFine_val.json', + metric=['bbox', 'segm'], + backend_args=backend_args), + dict( + type='CityScapesMetric', + seg_prefix=data_root + 'gtFine/val', + outfile_prefix='./work_dirs/cityscapes_metric/instance', + backend_args=backend_args) +] + +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. +# test_dataloader = dict( +# batch_size=1, +# num_workers=2, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# ann_file='annotations/instancesonly_filtered_gtFine_test.json', +# data_prefix=dict(img='leftImg8bit/test/'), +# test_mode=True, +# filter_cfg=dict(filter_empty_gt=True, min_size=32), +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type='CityScapesMetric', +# format_only=True, +# outfile_prefix='./work_dirs/cityscapes_metric/test') diff --git a/mmdetection/configs/_base_/datasets/coco_caption.py b/mmdetection/configs/_base_/datasets/coco_caption.py new file mode 100644 index 0000000..a1bd898 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/coco_caption.py @@ -0,0 +1,60 @@ +# data settings + +dataset_type = 'CocoCaptionDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +test_pipeline = [ + dict( + type='LoadImageFromFile', + imdecode_backend='pillow', + backend_args=backend_args), + dict( + type='Resize', + scale=(224, 224), + interpolation='bicubic', + backend='pillow'), + dict(type='PackInputs', meta_keys=['image_id']), +] + +# ann_file download from +# train dataset: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json # noqa +# val dataset: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json # noqa +# test dataset: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json # noqa +# val evaluator: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json # noqa +# test evaluator: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json # noqa +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/coco_karpathy_val.json', + pipeline=test_pipeline, + )) + +val_evaluator = dict( + type='COCOCaptionMetric', + ann_file=data_root + 'annotations/coco_karpathy_val_gt.json', +) + +# # If you want standard test, please manually configure the test dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/coco_detection.py b/mmdetection/configs/_base_/datasets/coco_detection.py new file mode 100644 index 0000000..fdf8dfa --- /dev/null +++ b/mmdetection/configs/_base_/datasets/coco_detection.py @@ -0,0 +1,95 @@ +# dataset settings +dataset_type = 
'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. 
+# test_dataloader = dict( +# batch_size=1, +# num_workers=2, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type='CocoMetric', +# metric='bbox', +# format_only=True, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_detection/test') diff --git a/mmdetection/configs/_base_/datasets/coco_instance.py b/mmdetection/configs/_base_/datasets/coco_instance.py new file mode 100644 index 0000000..e91cb35 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,95 @@ +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. 
+# test_dataloader = dict( +# batch_size=1, +# num_workers=2, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type='CocoMetric', +# metric=['bbox', 'segm'], +# format_only=True, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_instance/test') diff --git a/mmdetection/configs/_base_/datasets/coco_instance_semantic.py b/mmdetection/configs/_base_/datasets/coco_instance_semantic.py new file mode 100644 index 0000000..cc96186 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/coco_instance_semantic.py @@ -0,0 +1,78 @@ +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/', seg='stuffthingmaps/train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) + +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/coco_panoptic.py b/mmdetection/configs/_base_/datasets/coco_panoptic.py new file mode 100644 index 0000000..0b95b61 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/coco_panoptic.py @@ -0,0 +1,94 @@ +# dataset settings 
+dataset_type = 'CocoPanopticDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadPanopticAnnotations', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadPanopticAnnotations', backend_args=backend_args), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/panoptic_train2017.json', + data_prefix=dict( + img='train2017/', seg='annotations/panoptic_train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/panoptic_val2017.json', + data_prefix=dict(img='val2017/', seg='annotations/panoptic_val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoPanopticMetric', + ann_file=data_root + 'annotations/panoptic_val2017.json', + seg_prefix=data_root + 'annotations/panoptic_val2017/', + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. 
+# test_dataloader = dict( +# batch_size=1, +# num_workers=1, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# ann_file='annotations/panoptic_image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type='CocoPanopticMetric', +# format_only=True, +# ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_panoptic/test') diff --git a/mmdetection/configs/_base_/datasets/coco_semantic.py b/mmdetection/configs/_base_/datasets/coco_semantic.py new file mode 100644 index 0000000..944bbba --- /dev/null +++ b/mmdetection/configs/_base_/datasets/coco_semantic.py @@ -0,0 +1,78 @@ +# dataset settings +dataset_type = 'CocoSegDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict( + type='LoadAnnotations', + with_bbox=False, + with_label=False, + with_seg=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='LoadAnnotations', + with_bbox=False, + with_label=False, + with_seg=True), + dict( + type='PackDetInputs', + meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor')) +] + +# For stuffthingmaps_semseg, please refer to +# `docs/en/user_guides/dataset_prepare.md` +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='train2017/', + seg_map_path='stuffthingmaps_semseg/train2017/'), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='val2017/', + seg_map_path='stuffthingmaps_semseg/val2017/'), + pipeline=test_pipeline)) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='SemSegMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/deepfashion.py b/mmdetection/configs/_base_/datasets/deepfashion.py new file mode 100644 index 0000000..a93dc71 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/deepfashion.py @@ -0,0 +1,95 @@ +# dataset settings +dataset_type = 'DeepFashionDataset' +data_root = 'data/DeepFashion/In-shop/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# 
data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', scale=(750, 1101), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(750, 1101), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='Anno/segmentation/DeepFashion_segmentation_train.json', + data_prefix=dict(img='Img/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='Anno/segmentation/DeepFashion_segmentation_query.json', + data_prefix=dict(img='Img/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='Anno/segmentation/DeepFashion_segmentation_gallery.json', + data_prefix=dict(img='Img/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + + 'Anno/segmentation/DeepFashion_segmentation_query.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + + 'Anno/segmentation/DeepFashion_segmentation_gallery.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) diff --git a/mmdetection/configs/_base_/datasets/dsdl.py b/mmdetection/configs/_base_/datasets/dsdl.py new file mode 100644 index 0000000..1f19e5e --- /dev/null +++ b/mmdetection/configs/_base_/datasets/dsdl.py @@ -0,0 +1,62 @@ +dataset_type = 'DSDLDetDataset' +data_root = 'path to dataset folder' +train_ann = 'path to train yaml file' +val_ann = 'path to val yaml file' + +backend_args = None +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': "s3://open_data/", +# 'data/': "s3://open_data/" +# })) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), 
keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'instances')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann, + filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=val_ann, + test_mode=True, + pipeline=test_pipeline)) + +test_dataloader = val_dataloader + +val_evaluator = dict(type='CocoMetric', metric='bbox') +# val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/isaid_instance.py b/mmdetection/configs/_base_/datasets/isaid_instance.py new file mode 100644 index 0000000..09ddcab --- /dev/null +++ b/mmdetection/configs/_base_/datasets/isaid_instance.py @@ -0,0 +1,59 @@ +# dataset settings +dataset_type = 'iSAIDDataset' +data_root = 'data/iSAID/' +backend_args = None + +# Please see `projects/iSAID/README.md` for data preparation +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', scale=(800, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(800, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train/instancesonly_filtered_train.json', + data_prefix=dict(img='train/images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val/instancesonly_filtered_val.json', + data_prefix=dict(img='val/images/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'val/instancesonly_filtered_val.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/lvis_v0.5_instance.py b/mmdetection/configs/_base_/datasets/lvis_v0.5_instance.py new file mode 100644 index 0000000..d0ca44e --- /dev/null +++ b/mmdetection/configs/_base_/datasets/lvis_v0.5_instance.py @@ -0,0 +1,79 @@ +# dataset settings +dataset_type = 'LVISV05Dataset' +data_root = 
'data/lvis_v0.5/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/lvis_v0.5/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/lvis_v0.5_train.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/lvis_v0.5_val.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='LVISMetric', + ann_file=data_root + 'annotations/lvis_v0.5_val.json', + metric=['bbox', 'segm'], + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/lvis_v1_instance.py b/mmdetection/configs/_base_/datasets/lvis_v1_instance.py new file mode 100644 index 0000000..0413f37 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/lvis_v1_instance.py @@ -0,0 +1,22 @@ +# dataset settings +_base_ = 'lvis_v0.5_instance.py' +dataset_type = 'LVISV1Dataset' +data_root = 'data/lvis_v1/' + +train_dataloader = dict( + dataset=dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/lvis_v1_train.json', + data_prefix=dict(img='')))) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/lvis_v1_val.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/lvis_v1_val.json') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/mot_challenge.py b/mmdetection/configs/_base_/datasets/mot_challenge.py new file mode 100644 index 0000000..ce2828e --- /dev/null +++ b/mmdetection/configs/_base_/datasets/mot_challenge.py @@ -0,0 +1,90 @@ +# dataset settings +dataset_type = 'MOTChallengeDataset' +data_root = 'data/MOT17/' +img_scale = 
(1088, 1088) + +backend_args = None +# data pipeline +train_pipeline = [ + dict( + type='UniformRefFrameSample', + num_ref_imgs=1, + frame_range=10, + filter_key_img=True), + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadTrackAnnotations'), + dict( + type='RandomResize', + scale=img_scale, + ratio_range=(0.8, 1.2), + keep_ratio=True, + clip_object_border=False), + dict(type='PhotoMetricDistortion') + ]), + dict( + type='TransformBroadcaster', + # different cropped positions for different frames + share_random_params=False, + transforms=[ + dict( + type='RandomCrop', crop_size=img_scale, bbox_clip_border=False) + ]), + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='RandomFlip', prob=0.5), + ]), + dict(type='PackTrackInputs') +] + +test_pipeline = [ + dict( + type='TransformBroadcaster', + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='LoadTrackAnnotations') + ]), + dict(type='PackTrackInputs') +] + +# dataloader +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='TrackImgSampler'), # image-based sampling + dataset=dict( + type=dataset_type, + data_root=data_root, + visibility_thr=-1, + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img_path='train'), + metainfo=dict(classes=('pedestrian', )), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + # Now we support two ways to test, image_based and video_based + # if you want to use video_based sampling, you can use as follows + # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + sampler=dict(type='TrackImgSampler'), # image-based sampling + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type='MOTChallengeMetric', metric=['HOTA', 'CLEAR', 'Identity']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/mot_challenge_det.py b/mmdetection/configs/_base_/datasets/mot_challenge_det.py new file mode 100644 index 0000000..a988572 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/mot_challenge_det.py @@ -0,0 +1,66 @@ +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/MOT17/' + +backend_args = None +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args, to_float32=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(1088, 1088), + ratio_range=(0.8, 1.2), + keep_ratio=True, + clip_object_border=False), + dict(type='PhotoMetricDistortion'), + dict(type='RandomCrop', crop_size=(1088, 1088), bbox_clip_border=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1088, 1088), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img='train/'), + metainfo=dict(classes=('pedestrian', )), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img='train/'), + metainfo=dict(classes=('pedestrian', )), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/half-val_cocoformat.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/mot_challenge_reid.py b/mmdetection/configs/_base_/datasets/mot_challenge_reid.py new file mode 100644 index 0000000..57a95b5 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/mot_challenge_reid.py @@ -0,0 +1,61 @@ +# dataset settings +dataset_type = 'ReIDDataset' +data_root = 'data/MOT17/' + +backend_args = None +# data pipeline +train_pipeline = [ + dict( + type='TransformBroadcaster', + share_random_params=False, + transforms=[ + dict( + type='LoadImageFromFile', + backend_args=backend_args, + to_float32=True), + dict( + type='Resize', + scale=(128, 256), + keep_ratio=False, + clip_object_border=False), + dict(type='RandomFlip', prob=0.5, direction='horizontal'), + ]), + dict(type='PackReIDInputs', meta_keys=('flip', 'flip_direction')) +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args, to_float32=True), + dict(type='Resize', scale=(128, 256), keep_ratio=False), + dict(type='PackReIDInputs') +] + +# dataloader +train_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + triplet_sampler=dict(num_ids=8, ins_per_id=4), + data_prefix=dict(img_path='reid/imgs'), + ann_file='reid/meta/train_80.txt', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + triplet_sampler=None, + data_prefix=dict(img_path='reid/imgs'), + ann_file='reid/meta/val_20.txt', + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict(type='ReIDMetrics', metric=['mAP', 'CMC']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/objects365v1_detection.py b/mmdetection/configs/_base_/datasets/objects365v1_detection.py new file mode 100644 index 0000000..ee39869 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/objects365v1_detection.py @@ -0,0 +1,74 @@ +# dataset settings +dataset_type = 'Objects365V1Dataset' +data_root = 'data/Objects365/Obj365_v1/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# 
path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/objects365_val.json', + data_prefix=dict(img='val/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/objects365_val.json', + metric='bbox', + sort_categories=True, + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/objects365v2_detection.py b/mmdetection/configs/_base_/datasets/objects365v2_detection.py new file mode 100644 index 0000000..b25a7ba --- /dev/null +++ b/mmdetection/configs/_base_/datasets/objects365v2_detection.py @@ -0,0 +1,73 @@ +# dataset settings +dataset_type = 'Objects365V2Dataset' +data_root = 'data/Objects365/Obj365_v2/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + 
batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/zhiyuan_objv2_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/zhiyuan_objv2_val.json', + data_prefix=dict(img='val/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/zhiyuan_objv2_val.json', + metric='bbox', + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/openimages_detection.py b/mmdetection/configs/_base_/datasets/openimages_detection.py new file mode 100644 index 0000000..129661b --- /dev/null +++ b/mmdetection/configs/_base_/datasets/openimages_detection.py @@ -0,0 +1,81 @@ +# dataset settings +dataset_type = 'OpenImagesDataset' +data_root = 'data/OpenImages/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1024, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1024, 800), keep_ratio=True), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + # TODO: find a better way to collect image_level_labels + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'instances', 'image_level_labels')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=0, # workers_per_gpu > 0 may occur out of memory + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/oidv6-train-annotations-bbox.csv', + data_prefix=dict(img='OpenImages/train/'), + label_file='annotations/class-descriptions-boxable.csv', + hierarchy_file='annotations/bbox_labels_600_hierarchy.json', + meta_file='annotations/train-image-metas.pkl', + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=0, + persistent_workers=False, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/validation-annotations-bbox.csv', + data_prefix=dict(img='OpenImages/validation/'), + label_file='annotations/class-descriptions-boxable.csv', + 
hierarchy_file='annotations/bbox_labels_600_hierarchy.json', + meta_file='annotations/validation-image-metas.pkl', + image_level_ann_file='annotations/validation-' + 'annotations-human-imagelabels-boxable.csv', + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='OpenImagesMetric', + iou_thrs=0.5, + ioa_thrs=0.5, + use_group_of=True, + get_supercategory=True) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/refcoco+.py b/mmdetection/configs/_base_/datasets/refcoco+.py new file mode 100644 index 0000000..ae0278d --- /dev/null +++ b/mmdetection/configs/_base_/datasets/refcoco+.py @@ -0,0 +1,55 @@ +# dataset settings +dataset_type = 'RefCocoDataset' +data_root = 'data/coco/' + +backend_args = None + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='LoadAnnotations', + with_mask=True, + with_bbox=False, + with_seg=False, + with_label=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'gt_masks', 'text')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='train2014/'), + ann_file='refcoco+/instances.json', + split_file='refcoco+/refs(unc).p', + split='val', + text_mode='select_first', + pipeline=test_pipeline)) + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='train2014/'), + ann_file='refcoco+/instances.json', + split_file='refcoco+/refs(unc).p', + split='testA', # or 'testB' + text_mode='select_first', + pipeline=test_pipeline)) + +val_evaluator = dict(type='RefSegMetric', metric=['cIoU', 'mIoU']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/refcoco.py b/mmdetection/configs/_base_/datasets/refcoco.py new file mode 100644 index 0000000..7b6caef --- /dev/null +++ b/mmdetection/configs/_base_/datasets/refcoco.py @@ -0,0 +1,55 @@ +# dataset settings +dataset_type = 'RefCocoDataset' +data_root = 'data/coco/' + +backend_args = None + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='LoadAnnotations', + with_mask=True, + with_bbox=False, + with_seg=False, + with_label=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'gt_masks', 'text')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='train2014/'), + ann_file='refcoco/instances.json', + split_file='refcoco/refs(unc).p', + split='val', + text_mode='select_first', + pipeline=test_pipeline)) + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='train2014/'), + ann_file='refcoco/instances.json', 
+ split_file='refcoco/refs(unc).p', + split='testA', # or 'testB' + text_mode='select_first', + pipeline=test_pipeline)) + +val_evaluator = dict(type='RefSegMetric', metric=['cIoU', 'mIoU']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/refcocog.py b/mmdetection/configs/_base_/datasets/refcocog.py new file mode 100644 index 0000000..19dbeef --- /dev/null +++ b/mmdetection/configs/_base_/datasets/refcocog.py @@ -0,0 +1,55 @@ +# dataset settings +dataset_type = 'RefCocoDataset' +data_root = 'data/coco/' + +backend_args = None + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='LoadAnnotations', + with_mask=True, + with_bbox=False, + with_seg=False, + with_label=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'gt_masks', 'text')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='train2014/'), + ann_file='refcocog/instances.json', + split_file='refcocog/refs(umd).p', + split='val', + text_mode='select_first', + pipeline=test_pipeline)) + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='train2014/'), + ann_file='refcocog/instances.json', + split_file='refcocog/refs(umd).p', + split='test', + text_mode='select_first', + pipeline=test_pipeline)) + +val_evaluator = dict(type='RefSegMetric', metric=['cIoU', 'mIoU']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/semi_coco_detection.py b/mmdetection/configs/_base_/datasets/semi_coco_detection.py new file mode 100644 index 0000000..694f25f --- /dev/null +++ b/mmdetection/configs/_base_/datasets/semi_coco_detection.py @@ -0,0 +1,178 @@ +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +color_space = [ + [dict(type='ColorTransform')], + [dict(type='AutoContrast')], + [dict(type='Equalize')], + [dict(type='Sharpness')], + [dict(type='Posterize')], + [dict(type='Solarize')], + [dict(type='Color')], + [dict(type='Contrast')], + [dict(type='Brightness')], +] + +geometric = [ + [dict(type='Rotate')], + [dict(type='ShearX')], + [dict(type='ShearY')], + [dict(type='TranslateX')], + [dict(type='TranslateY')], +] + +scale = [(1333, 400), (1333, 1200)] + +branch_field = ['sup', 'unsup_teacher', 'unsup_student'] +# pipeline used to augment labeled data, +# which will be sent to student model for supervised training. 
+sup_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='RandAugment', aug_space=color_space, aug_num=1), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='MultiBranch', + branch_field=branch_field, + sup=dict(type='PackDetInputs')) +] + +# pipeline used to augment unlabeled data weakly, +# which will be sent to teacher model for predicting pseudo instances. +weak_pipeline = [ + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', + 'homography_matrix')), +] + +# pipeline used to augment unlabeled data strongly, +# which will be sent to student model for unsupervised training. +strong_pipeline = [ + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomOrder', + transforms=[ + dict(type='RandAugment', aug_space=color_space, aug_num=1), + dict(type='RandAugment', aug_space=geometric, aug_num=1), + ]), + dict(type='RandomErasing', n_patches=(1, 5), ratio=(0, 0.2)), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', + 'homography_matrix')), +] + +# pipeline used to augment unlabeled data into different views +unsup_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadEmptyAnnotations'), + dict( + type='MultiBranch', + branch_field=branch_field, + unsup_teacher=weak_pipeline, + unsup_student=strong_pipeline, + ) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +batch_size = 5 +num_workers = 5 +# There are two common semi-supervised learning settings on the coco dataset: +# (1) Divide the train2017 into labeled and unlabeled datasets +# by a fixed percentage, such as 1%, 2%, 5% and 10%. +# The format of labeled_ann_file and unlabeled_ann_file are +# instances_train2017.{fold}@{percent}.json, and +# instances_train2017.{fold}@{percent}-unlabeled.json +# `fold` is used for cross-validation, and `percent` represents +# the proportion of labeled data in the train2017. +# (2) Choose the train2017 as the labeled dataset +# and unlabeled2017 as the unlabeled dataset. +# The labeled_ann_file and unlabeled_ann_file are +# instances_train2017.json and image_info_unlabeled2017.json +# We use this configuration by default. 
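+# A hedged illustration (not part of the upstream default; exact paths are
+# assumptions): for setting (1) with fold 1 and 10% labeled data, the two
+# ann_file values defined below would instead follow the naming scheme
+# described above, e.g.
+#   labeled:   ann_file='instances_train2017.1@10.json'
+#   unlabeled: ann_file='instances_train2017.1@10-unlabeled.json'
+# (relative to whatever annotations directory holds the generated splits).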
+labeled_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=sup_pipeline, + backend_args=backend_args) + +unlabeled_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_unlabeled2017.json', + data_prefix=dict(img='unlabeled2017/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=unsup_pipeline, + backend_args=backend_args) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + sampler=dict( + type='GroupMultiSourceSampler', + batch_size=batch_size, + source_ratio=[1, 4]), + dataset=dict( + type='ConcatDataset', datasets=[labeled_dataset, unlabeled_dataset])) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) + +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/v3det.py b/mmdetection/configs/_base_/datasets/v3det.py new file mode 100644 index 0000000..38ccbf8 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/v3det.py @@ -0,0 +1,69 @@ +# dataset settings +dataset_type = 'V3DetDataset' +data_root = 'data/V3Det/' + +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/v3det_2023_v1_train.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=4), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/v3det_2023_v1_val.json', + data_prefix=dict(img=''), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/v3det_2023_v1_val.json', + metric='bbox', + format_only=False, + 
backend_args=backend_args, + use_mp_eval=True, + proposal_nums=[300]) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/voc0712.py b/mmdetection/configs/_base_/datasets/voc0712.py new file mode 100644 index 0000000..47f5e65 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/voc0712.py @@ -0,0 +1,92 @@ +# dataset settings +dataset_type = 'VOCDataset' +data_root = 'data/VOCdevkit/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically Infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/segmentation/VOCdevkit/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/segmentation/', +# 'data/': 's3://openmmlab/datasets/segmentation/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1000, 600), keep_ratio=True), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type='ConcatDataset', + # VOCDataset will add different `dataset_type` in dataset.metainfo, + # which will get error if using ConcatDataset. Adding + # `ignore_keys` can avoid this error. + ignore_keys=['dataset_type'], + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict( + filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline, + backend_args=backend_args), + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2012/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2012/'), + filter_cfg=dict( + filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline, + backend_args=backend_args) + ]))) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2007/ImageSets/Main/test.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +# Pascal VOC2007 uses `11points` as default evaluate mode, while PASCAL +# VOC2012 defaults to use 'area'. 
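+# A hedged example (assumption, not used below): for a VOC2012-style
+# evaluation the metric could be switched to the 'area' mode instead, e.g.
+# val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='area')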
+val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/wider_face.py b/mmdetection/configs/_base_/datasets/wider_face.py new file mode 100644 index 0000000..7042bc4 --- /dev/null +++ b/mmdetection/configs/_base_/datasets/wider_face.py @@ -0,0 +1,73 @@ +# dataset settings +dataset_type = 'WIDERFaceDataset' +data_root = 'data/WIDERFace/' +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/cityscapes/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +img_scale = (640, 640) # VGA resolution + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img='WIDER_train'), + filter_cfg=dict(filter_empty_gt=True, bbox_min_size=17, min_size=32), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img='WIDER_val'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + # TODO: support WiderFace-Evaluation for easy, medium, hard cases + type='VOCMetric', + metric='mAP', + eval_mode='11points') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/_base_/datasets/youtube_vis.py b/mmdetection/configs/_base_/datasets/youtube_vis.py new file mode 100644 index 0000000..ece07cc --- /dev/null +++ b/mmdetection/configs/_base_/datasets/youtube_vis.py @@ -0,0 +1,66 @@ +dataset_type = 'YouTubeVISDataset' +data_root = 'data/youtube_vis_2019/' +dataset_version = data_root[-5:-1] # 2019 or 2021 + +backend_args = None + +# dataset settings +train_pipeline = [ + dict( + type='UniformRefFrameSample', + num_ref_imgs=1, + frame_range=100, + filter_key_img=True), + dict( + type='TransformBroadcaster', + share_random_params=True, + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadTrackAnnotations', with_mask=True), + dict(type='Resize', scale=(640, 360), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + ]), + dict(type='PackTrackInputs') +] + +test_pipeline = [ + dict( + type='TransformBroadcaster', + transforms=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + 
dict(type='Resize', scale=(640, 360), keep_ratio=True), + dict(type='LoadTrackAnnotations', with_mask=True), + ]), + dict(type='PackTrackInputs') +] + +# dataloader +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + # sampler=dict(type='TrackImgSampler'), # image-based sampling + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='TrackAspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2019_train.json', + data_prefix=dict(img_path='train/JPEGImages'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2019_valid.json', + data_prefix=dict(img_path='valid/JPEGImages'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/_base_/default_runtime.py b/mmdetection/configs/_base_/default_runtime.py new file mode 100644 index 0000000..870e561 --- /dev/null +++ b/mmdetection/configs/_base_/default_runtime.py @@ -0,0 +1,24 @@ +default_scope = 'mmdet' + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='DetVisualizationHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/mmdetection/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py b/mmdetection/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py new file mode 100644 index 0000000..c5167f7 --- /dev/null +++ b/mmdetection/configs/_base_/models/cascade-mask-rcnn_r50_fpn.py @@ -0,0 +1,203 @@ +# model settings +model = dict( + type='CascadeRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + 
bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + 
mask_thr_binary=0.5))) diff --git a/mmdetection/configs/_base_/models/cascade-rcnn_r50_fpn.py b/mmdetection/configs/_base_/models/cascade-rcnn_r50_fpn.py new file mode 100644 index 0000000..50c57f0 --- /dev/null +++ b/mmdetection/configs/_base_/models/cascade-rcnn_r50_fpn.py @@ -0,0 +1,185 @@ +# model settings +model = dict( + type='CascadeRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + 
ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/mmdetection/configs/_base_/models/fast-rcnn_r50_fpn.py b/mmdetection/configs/_base_/models/fast-rcnn_r50_fpn.py new file mode 100644 index 0000000..2bd45e9 --- /dev/null +++ b/mmdetection/configs/_base_/models/fast-rcnn_r50_fpn.py @@ -0,0 +1,68 @@ +# model settings +model = dict( + type='FastRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/mmdetection/configs/_base_/models/faster-rcnn_r50-caffe-c4.py b/mmdetection/configs/_base_/models/faster-rcnn_r50-caffe-c4.py new file mode 100644 index 0000000..15d2db7 --- /dev/null +++ b/mmdetection/configs/_base_/models/faster-rcnn_r50-caffe-c4.py @@ -0,0 +1,123 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + 
depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + shared_head=dict( + type='ResLayer', + depth=50, + stage=3, + stride=2, + dilation=1, + style='caffe', + norm_cfg=norm_cfg, + norm_eval=True, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=1024, + featmap_strides=[16]), + bbox_head=dict( + type='BBoxHead', + with_avg_pool=True, + roi_feat_size=7, + in_channels=2048, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=6000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/mmdetection/configs/_base_/models/faster-rcnn_r50-caffe-dc5.py b/mmdetection/configs/_base_/models/faster-rcnn_r50-caffe-dc5.py new file mode 100644 index 0000000..189915e --- /dev/null +++ b/mmdetection/configs/_base_/models/faster-rcnn_r50-caffe-dc5.py @@ -0,0 +1,111 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='FasterRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + strides=(1, 2, 2, 1), + dilations=(1, 1, 1, 2), + out_indices=(3, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + type='RPNHead', + in_channels=2048, + feat_channels=2048, + 
anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=2048, + featmap_strides=[16]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=2048, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms=dict(type='nms', iou_threshold=0.7), + nms_pre=6000, + max_per_img=1000, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/mmdetection/configs/_base_/models/faster-rcnn_r50_fpn.py b/mmdetection/configs/_base_/models/faster-rcnn_r50_fpn.py new file mode 100644 index 0000000..31aa146 --- /dev/null +++ b/mmdetection/configs/_base_/models/faster-rcnn_r50_fpn.py @@ -0,0 +1,114 @@ +# model settings +model = dict( + type='FasterRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), 
+ bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) diff --git a/mmdetection/configs/_base_/models/mask-rcnn_r50-caffe-c4.py b/mmdetection/configs/_base_/models/mask-rcnn_r50-caffe-c4.py new file mode 100644 index 0000000..de1131b --- /dev/null +++ b/mmdetection/configs/_base_/models/mask-rcnn_r50-caffe-c4.py @@ -0,0 +1,132 @@ +# model settings +norm_cfg = dict(type='BN', requires_grad=False) +model = dict( + type='MaskRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + shared_head=dict( + type='ResLayer', + depth=50, + stage=3, + stride=2, + dilation=1, + style='caffe', + norm_cfg=norm_cfg, + norm_eval=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=1024, + featmap_strides=[16]), + bbox_head=dict( + type='BBoxHead', + with_avg_pool=True, + roi_feat_size=7, + in_channels=2048, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, 
loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=None, + mask_head=dict( + type='FCNMaskHead', + num_convs=0, + in_channels=2048, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=14, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=6000, + nms=dict(type='nms', iou_threshold=0.7), + max_per_img=1000, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/mmdetection/configs/_base_/models/mask-rcnn_r50_fpn.py b/mmdetection/configs/_base_/models/mask-rcnn_r50_fpn.py new file mode 100644 index 0000000..b4ff7a4 --- /dev/null +++ b/mmdetection/configs/_base_/models/mask-rcnn_r50_fpn.py @@ -0,0 +1,127 @@ +# model settings +model = dict( + type='MaskRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + 
in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/mmdetection/configs/_base_/models/retinanet_r50_fpn.py b/mmdetection/configs/_base_/models/retinanet_r50_fpn.py new file mode 100644 index 0000000..53662c9 --- /dev/null +++ b/mmdetection/configs/_base_/models/retinanet_r50_fpn.py @@ -0,0 +1,68 @@ +# model settings +model = dict( + type='RetinaNet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + bbox_head=dict( + type='RetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + sampler=dict( + type='PseudoSampler'), # Focal loss should use PseudoSampler + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git a/mmdetection/configs/_base_/models/rpn_r50-caffe-c4.py b/mmdetection/configs/_base_/models/rpn_r50-caffe-c4.py new file mode 100644 index 0000000..ed1dbe7 --- /dev/null +++ b/mmdetection/configs/_base_/models/rpn_r50-caffe-c4.py @@ -0,0 +1,64 @@ +# model settings +model = dict( + type='RPN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + 
pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=None, + rpn_head=dict( + type='RPNHead', + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type='AnchorGenerator', + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/mmdetection/configs/_base_/models/rpn_r50_fpn.py b/mmdetection/configs/_base_/models/rpn_r50_fpn.py new file mode 100644 index 0000000..6bc4790 --- /dev/null +++ b/mmdetection/configs/_base_/models/rpn_r50_fpn.py @@ -0,0 +1,64 @@ +# model settings +model = dict( + type='RPN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/mmdetection/configs/_base_/models/ssd300.py b/mmdetection/configs/_base_/models/ssd300.py new file mode 100644 index 0000000..fd113c7 --- /dev/null +++ b/mmdetection/configs/_base_/models/ssd300.py @@ -0,0 +1,63 @@ +# model settings +input_size = 300 +model = dict( + type='SingleStageDetector', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[1, 1, 1], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='SSDVGG', 
+ depth=16, + with_last_pool=False, + ceil_mode=True, + out_indices=(3, 4), + out_feature_indices=(22, 34), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://vgg16_caffe')), + neck=dict( + type='SSDNeck', + in_channels=(512, 1024), + out_channels=(512, 1024, 512, 256, 256, 256), + level_strides=(2, 2, 1, 1), + level_paddings=(1, 1, 0, 0), + l2_norm_scale=20), + bbox_head=dict( + type='SSDHead', + in_channels=(512, 1024, 512, 256, 256, 256), + num_classes=80, + anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + input_size=input_size, + basesize_ratio_range=(0.15, 0.9), + strides=[8, 16, 32, 64, 100, 300], + ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2])), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + sampler=dict(type='PseudoSampler'), + smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False), + test_cfg=dict( + nms_pre=1000, + nms=dict(type='nms', iou_threshold=0.45), + min_bbox_size=0, + score_thr=0.02, + max_per_img=200)) +cudnn_benchmark = True diff --git a/mmdetection/configs/_base_/schedules/schedule_1x.py b/mmdetection/configs/_base_/schedules/schedule_1x.py new file mode 100644 index 0000000..95f30be --- /dev/null +++ b/mmdetection/configs/_base_/schedules/schedule_1x.py @@ -0,0 +1,28 @@ +# training schedule for 1x +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/_base_/schedules/schedule_20e.py b/mmdetection/configs/_base_/schedules/schedule_20e.py new file mode 100644 index 0000000..75f958b --- /dev/null +++ b/mmdetection/configs/_base_/schedules/schedule_20e.py @@ -0,0 +1,28 @@ +# training schedule for 20e +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=20, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[16, 19], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
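+# A hedged example of the linear scaling rule this setting implements
+# (assumption: the runner scales lr by real_batch_size / base_batch_size
+# when enabled): training on 4 GPUs x 2 samples per GPU gives a batch size
+# of 8, so the SGD lr of 0.02 would be scaled by 8 / 16 to 0.01.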
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/_base_/schedules/schedule_2x.py b/mmdetection/configs/_base_/schedules/schedule_2x.py new file mode 100644 index 0000000..5b7b241 --- /dev/null +++ b/mmdetection/configs/_base_/schedules/schedule_2x.py @@ -0,0 +1,28 @@ +# training schedule for 2x +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/albu_example/README.md b/mmdetection/configs/albu_example/README.md new file mode 100644 index 0000000..fa362f9 --- /dev/null +++ b/mmdetection/configs/albu_example/README.md @@ -0,0 +1,31 @@ +# Albu Example + +> [Albumentations: fast and flexible image augmentations](https://arxiv.org/abs/1809.06839) + + + +## Abstract + +Data augmentation is a commonly used technique for increasing both the size and the diversity of labeled training sets by leveraging input transformations that preserve output labels. In computer vision domain, image augmentations have become a common implicit regularization technique to combat overfitting in deep convolutional neural networks and are ubiquitously used to improve performance. While most deep learning frameworks implement basic image transformations, the list is typically limited to some variations and combinations of flipping, rotating, scaling, and cropping. Moreover, the image processing speed varies in existing tools for image augmentation. We present Albumentations, a fast and flexible library for image augmentations with many various image transform operations available, that is also an easy-to-use wrapper around other augmentation libraries. We provide examples of image augmentations for different computer vision tasks and show that Albumentations is faster than other commonly used image augmentation tools on the most of commonly used image transformations. + +
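For orientation, here is a minimal stand-alone sketch of the library's API (plain Albumentations, independent of the MMDetection `Albu` wrapper used in the config below; the dummy image and boxes are only for illustration):

```python
# Minimal Albumentations sketch: compose a few transforms and apply them to an
# image with bounding boxes (pascal_voc format: [x_min, y_min, x_max, y_max]).
import albumentations as A
import numpy as np

transform = A.Compose(
    [
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.0,
                           rotate_limit=0, p=0.5),
    ],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']),
)

image = np.zeros((480, 640, 3), dtype=np.uint8)  # dummy image
out = transform(image=image, bboxes=[[10, 20, 100, 200]], labels=[0])
print(out['bboxes'])  # the boxes follow the image transforms
```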
    + +
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | 1x | 4.4 | 16.6 | 38.0 | 34.5 | [config](./mask-rcnn_r50_fpn_albu-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208-ab203bcd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208_225520.log.json) | + +## Citation + +```latex +@article{2018arXiv180906839B, + author = {A. Buslaev, A. Parinov, E. Khvedchenya, V.~I. Iglovikov and A.~A. Kalinin}, + title = "{Albumentations: fast and flexible image augmentations}", + journal = {ArXiv e-prints}, + eprint = {1809.06839}, + year = 2018 +} +``` diff --git a/mmdetection/configs/albu_example/mask-rcnn_r50_fpn_albu-1x_coco.py b/mmdetection/configs/albu_example/mask-rcnn_r50_fpn_albu-1x_coco.py new file mode 100644 index 0000000..b8a2780 --- /dev/null +++ b/mmdetection/configs/albu_example/mask-rcnn_r50_fpn_albu-1x_coco.py @@ -0,0 +1,66 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' + +albu_train_transforms = [ + dict( + type='ShiftScaleRotate', + shift_limit=0.0625, + scale_limit=0.0, + rotate_limit=0, + interpolation=1, + p=0.5), + dict( + type='RandomBrightnessContrast', + brightness_limit=[0.1, 0.3], + contrast_limit=[0.1, 0.3], + p=0.2), + dict( + type='OneOf', + transforms=[ + dict( + type='RGBShift', + r_shift_limit=10, + g_shift_limit=10, + b_shift_limit=10, + p=1.0), + dict( + type='HueSaturationValue', + hue_shift_limit=20, + sat_shift_limit=30, + val_shift_limit=20, + p=1.0) + ], + p=0.1), + dict(type='JpegCompression', quality_lower=85, quality_upper=95, p=0.2), + dict(type='ChannelShuffle', p=0.1), + dict( + type='OneOf', + transforms=[ + dict(type='Blur', blur_limit=3, p=1.0), + dict(type='MedianBlur', blur_limit=3, p=1.0) + ], + p=0.1), +] +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags'], + min_visibility=0.0, + filter_lost_elements=True), + keymap={ + 'img': 'image', + 'gt_masks': 'masks', + 'gt_bboxes': 'bboxes' + }, + skip_img_without_anno=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/albu_example/metafile.yml b/mmdetection/configs/albu_example/metafile.yml new file mode 100644 index 0000000..3b54bdf --- /dev/null +++ b/mmdetection/configs/albu_example/metafile.yml @@ -0,0 +1,17 @@ +Models: + - Name: mask-rcnn_r50_fpn_albu-1x_coco + In Collection: Mask R-CNN + Config: mask-rcnn_r50_fpn_albu-1x_coco.py + Metadata: + Training Memory (GB): 4.4 + Epochs: 12 + Results: + - Task: Object 
Detection + Dataset: COCO + Metrics: + box AP: 38.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/albu_example/mask_rcnn_r50_fpn_albu_1x_coco/mask_rcnn_r50_fpn_albu_1x_coco_20200208-ab203bcd.pth diff --git a/mmdetection/configs/atss/README.md b/mmdetection/configs/atss/README.md new file mode 100644 index 0000000..1411672 --- /dev/null +++ b/mmdetection/configs/atss/README.md @@ -0,0 +1,31 @@ +# ATSS + +> [Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection](https://arxiv.org/abs/1912.02424) + + + +## Abstract + +Object detection has been dominated by anchor-based detectors for several years. Recently, anchor-free detectors have become popular due to the proposal of FPN and Focal Loss. In this paper, we first point out that the essential difference between anchor-based and anchor-free detection is actually how to define positive and negative training samples, which leads to the performance gap between them. If they adopt the same definition of positive and negative samples during training, there is no obvious difference in the final performance, no matter regressing from a box or a point. This shows that how to select positive and negative training samples is important for current object detectors. Then, we propose an Adaptive Training Sample Selection (ATSS) to automatically select positive and negative samples according to statistical characteristics of object. It significantly improves the performance of anchor-based and anchor-free detectors and bridges the gap between them. Finally, we discuss the necessity of tiling multiple anchors per location on the image to detect objects. Extensive experiments conducted on MS COCO support our aforementioned analysis and conclusions. With the newly introduced ATSS, we improve state-of-the-art detectors by a large margin to 50.7% AP without introducing any overhead. + +
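The selection rule itself is compact; the sketch below (NumPy, not the mmdet `ATSSAssigner`) illustrates the adaptive threshold for a single ground-truth box:

```python
# Rough sketch of ATSS's core rule: take the top-k anchors per FPN level by
# center distance, then keep candidates whose IoU exceeds mean + std.
import numpy as np

def atss_select(ious, center_dists, levels, topk=9):
    """ious, center_dists, levels: (num_anchors,) arrays giving each anchor's
    IoU with the GT, center distance to the GT center, and FPN level."""
    candidates = []
    for lvl in np.unique(levels):
        idx = np.where(levels == lvl)[0]
        k = min(topk, idx.size)
        candidates.append(idx[np.argsort(center_dists[idx])[:k]])
    candidates = np.concatenate(candidates)
    thr = ious[candidates].mean() + ious[candidates].std()
    # the full implementation also requires the anchor center to lie inside the GT
    return candidates[ious[candidates] >= thr]
```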
    + +
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :-----: | :-----: | :------: | :------------: | :----: | :----------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | 1x | 3.7 | 19.7 | 39.4 | [config](./atss_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209_102539.log.json) | +| R-101 | pytorch | 1x | 5.6 | 12.3 | 41.5 | [config](./atss_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.log.json) | + +## Citation + +```latex +@article{zhang2019bridging, + title = {Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection}, + author = {Zhang, Shifeng and Chi, Cheng and Yao, Yongqiang and Lei, Zhen and Li, Stan Z.}, + journal = {arXiv preprint arXiv:1912.02424}, + year = {2019} +} +``` diff --git a/mmdetection/configs/atss/atss_r101_fpn_1x_coco.py b/mmdetection/configs/atss/atss_r101_fpn_1x_coco.py new file mode 100644 index 0000000..5225d2a --- /dev/null +++ b/mmdetection/configs/atss/atss_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './atss_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/atss/atss_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/atss/atss_r101_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..69999ce --- /dev/null +++ b/mmdetection/configs/atss/atss_r101_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './atss_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/atss/atss_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/atss/atss_r18_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..12d9f13 --- /dev/null +++ b/mmdetection/configs/atss/atss_r18_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './atss_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/atss/atss_r50_fpn_1x_coco.py b/mmdetection/configs/atss/atss_r50_fpn_1x_coco.py new file mode 100644 index 0000000..306435d --- /dev/null +++ b/mmdetection/configs/atss/atss_r50_fpn_1x_coco.py @@ -0,0 +1,71 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='ATSS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 
3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/atss/atss_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/atss/atss_r50_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..e3b3c46 --- /dev/null +++ b/mmdetection/configs/atss/atss_r50_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,81 @@ +_base_ = '../common/lsj-200e_coco-detection.py' + +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +model = dict( + type='ATSS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_dataloader = dict(batch_size=8, num_workers=4) + +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
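# (AmpOptimWrapper is expected to run the forward/backward pass under
# torch.cuda.amp autocast with dynamic loss scaling, whereas the plain
# OptimWrapper used in the 1x/2x schedules keeps full-precision training.)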
+optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/atss/metafile.yml b/mmdetection/configs/atss/metafile.yml new file mode 100644 index 0000000..f4c567e --- /dev/null +++ b/mmdetection/configs/atss/metafile.yml @@ -0,0 +1,60 @@ +Collections: + - Name: ATSS + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ATSS + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1912.02424 + Title: 'Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection' + README: configs/atss/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/atss.py#L6 + Version: v2.0.0 + +Models: + - Name: atss_r50_fpn_1x_coco + In Collection: ATSS + Config: configs/atss/atss_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.7 + inference time (ms/im): + - value: 50.76 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth + + - Name: atss_r101_fpn_1x_coco + In Collection: ATSS + Config: configs/atss/atss_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.6 + inference time (ms/im): + - value: 81.3 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r101_fpn_1x_coco/atss_r101_fpn_1x_20200825-dfcadd6f.pth diff --git a/mmdetection/configs/autoassign/README.md b/mmdetection/configs/autoassign/README.md new file mode 100644 index 0000000..f6b0573 --- /dev/null +++ b/mmdetection/configs/autoassign/README.md @@ -0,0 +1,35 @@ +# AutoAssign + +> [AutoAssign: Differentiable Label Assignment for Dense Object Detection](https://arxiv.org/abs/2007.03496) + + + +## Abstract + +Determining positive/negative samples for object detection is known as label assignment. Here we present an anchor-free detector named AutoAssign. It requires little human knowledge and achieves appearance-aware through a fully differentiable weighting mechanism. During training, to both satisfy the prior distribution of data and adapt to category characteristics, we present Center Weighting to adjust the category-specific prior distributions. To adapt to object appearances, Confidence Weighting is proposed to adjust the specific assign strategy of each instance. The two weighting modules are then combined to generate positive and negative weights to adjust each location's confidence. Extensive experiments on the MS COCO show that our method steadily surpasses other best sampling strategies by large margins with various backbones. Moreover, our best model achieves 52.1% AP, outperforming all existing one-stage detectors. Besides, experiments on other datasets, e.g., PASCAL VOC, Objects365, and WiderFace, demonstrate the broad applicability of AutoAssign. + +
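As a loose illustration of the weighting idea (not the paper's exact formulation, and not the mmdet `AutoAssignHead`), a center prior and a confidence term can be combined into soft positive weights for the locations inside a ground-truth box:

```python
# Sketch: soft positive weights from a Gaussian center prior times an
# exponential confidence weight, normalized over the candidate locations.
import torch

def soft_positive_weights(offsets, joint_conf, sigma=1.0, tau=1 / 3):
    """offsets: (N, 2) distances of candidate locations to the GT center
    (in units of stride); joint_conf: (N,) cls * loc confidence in [0, 1]."""
    center_prior = torch.exp(-(offsets ** 2).sum(dim=1) / (2 * sigma ** 2))
    confidence_weight = torch.exp(joint_conf / tau)
    w = center_prior * confidence_weight
    return w / w.sum().clamp(min=1e-6)
```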
    + +
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | box AP | Config | Download | +| :------: | :---: | :-----: | :------: | :----: | :---------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | caffe | 1x | 4.08 | 40.4 | [config](./autoassign_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.log.json) | + +**Note**: + +1. We find that the performance is unstable with 1x setting and may fluctuate by about 0.3 mAP. mAP 40.3 ~ 40.6 is acceptable. Such fluctuation can also be found in the original implementation. +2. You can get a more stable results ~ mAP 40.6 with a schedule total 13 epoch, and learning rate is divided by 10 at 10th and 13th epoch. + +## Citation + +```latex +@article{zhu2020autoassign, + title={AutoAssign: Differentiable Label Assignment for Dense Object Detection}, + author={Zhu, Benjin and Wang, Jianfeng and Jiang, Zhengkai and Zong, Fuhang and Liu, Songtao and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2007.03496}, + year={2020} +} +``` diff --git a/mmdetection/configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..76a3619 --- /dev/null +++ b/mmdetection/configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,69 @@ +# We follow the original implementation which +# adopts the Caffe pre-trained backbone. 
+_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='AutoAssign', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[102.9801, 115.9465, 122.7717], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + relu_before_extra_convs=True, + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')), + bbox_head=dict( + type='AutoAssignHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_bbox=dict(type='GIoULoss', loss_weight=5.0)), + train_cfg=None, + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.01), paramwise_cfg=dict(norm_decay_mult=0.)) diff --git a/mmdetection/configs/autoassign/metafile.yml b/mmdetection/configs/autoassign/metafile.yml new file mode 100644 index 0000000..ab7a4af --- /dev/null +++ b/mmdetection/configs/autoassign/metafile.yml @@ -0,0 +1,33 @@ +Collections: + - Name: AutoAssign + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - AutoAssign + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2007.03496 + Title: 'AutoAssign: Differentiable Label Assignment for Dense Object Detection' + README: configs/autoassign/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/autoassign.py#L6 + Version: v2.12.0 + +Models: + - Name: autoassign_r50-caffe_fpn_1x_coco + In Collection: AutoAssign + Config: configs/autoassign/autoassign_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.08 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/autoassign/auto_assign_r50_fpn_1x_coco/auto_assign_r50_fpn_1x_coco_20210413_115540-5e17991f.pth diff --git a/mmdetection/configs/boxinst/README.md b/mmdetection/configs/boxinst/README.md new file mode 100644 index 0000000..f6f01c5 --- /dev/null +++ b/mmdetection/configs/boxinst/README.md @@ -0,0 +1,32 @@ +# BoxInst + +> [BoxInst: High-Performance Instance Segmentation with Box Annotations](https://arxiv.org/pdf/2012.02310.pdf) + + + +## Abstract + +We present a high-performance method that can achieve mask-level instance segmentation with only bounding-box annotations for training. While this setting has been studied in the literature, here we show significantly stronger performance with a simple design (e.g., dramatically improving previous best reported mask AP of 21.1% to 31.6% on the COCO dataset). 
Our core idea is to redesign the loss +of learning masks in instance segmentation, with no modification to the segmentation network itself. The new loss functions can supervise the mask training without relying on mask annotations. This is made possible with two loss terms, namely, 1) a surrogate term that minimizes the discrepancy between the projections of the ground-truth box and the predicted mask; 2) a pairwise loss that can exploit the prior that proximal pixels with similar colors are very likely to have the same category label. Experiments demonstrate that the redesigned mask loss can yield surprisingly high-quality instance masks with only box annotations. For example, without using any mask annotations, with a ResNet-101 backbone and 3× training schedule, we achieve 33.2% mask AP on COCO test-dev split (vs. 39.1% of the fully supervised counterpart). Our excellent experiment results on COCO and Pascal VOC indicate that our method dramatically narrows the performance gap between weakly and fully supervised instance segmentation. + +
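A rough sketch of the box-projection term described above (assuming a dice loss on the x/y max-projections of the predicted soft mask against the box mask; the pairwise color-similarity term is omitted here):

```python
# Projection loss sketch: the predicted mask, projected onto each axis by a
# max, should match the projections of the ground-truth box mask.
import torch

def dice(a, b, eps=1e-5):
    inter = (a * b).sum()
    return 1 - 2 * inter / (a.pow(2).sum() + b.pow(2).sum() + eps)

def projection_loss(pred_mask, box_mask):
    """pred_mask: (H, W) sigmoid scores; box_mask: (H, W) binary box region."""
    loss_x = dice(pred_mask.max(dim=0).values, box_mask.max(dim=0).values)
    loss_y = dice(pred_mask.max(dim=1).values, box_mask.max(dim=1).values)
    return loss_x + loss_y
```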
    + +
    + +## Results and Models + +| Backbone | Style | MS train | Lr schd | bbox AP | mask AP | Config | Download | +| :------: | :-----: | :------: | :-----: | :-----: | :-----: | :-----------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | Y | 1x | 39.6 | 31.1 | [config](./boxinst_r50_fpn_ms-90k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/boxinst/boxinst_r50_fpn_ms-90k_coco/boxinst_r50_fpn_ms-90k_coco_20221228_163052-6add751a.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/boxinst/boxinst_r50_fpn_ms-90k_coco/boxinst_r50_fpn_ms-90k_coco_20221228_163052.log.json) | +| R-101 | pytorch | Y | 1x | 41.8 | 32.7 | [config](./boxinst_r101_fpn_ms-90k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/boxinst/boxinst_r101_fpn_ms-90k_coco/boxinst_r101_fpn_ms-90k_coco_20221229_145106-facf375b.pth) \|[log](https://download.openmmlab.com/mmdetection/v3.0/boxinst/boxinst_r101_fpn_ms-90k_coco/boxinst_r101_fpn_ms-90k_coco_20221229_145106.log.json) | + +## Citation + +```latex +@inproceedings{tian2020boxinst, + title = {{BoxInst}: High-Performance Instance Segmentation with Box Annotations}, + author = {Tian, Zhi and Shen, Chunhua and Wang, Xinlong and Chen, Hao}, + booktitle = {Proc. IEEE Conf. Computer Vision and Pattern Recognition (CVPR)}, + year = {2021} +} +``` diff --git a/mmdetection/configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py b/mmdetection/configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py new file mode 100644 index 0000000..ab2b116 --- /dev/null +++ b/mmdetection/configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py @@ -0,0 +1,8 @@ +_base_ = './boxinst_r50_fpn_ms-90k_coco.py' + +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py b/mmdetection/configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py new file mode 100644 index 0000000..371f252 --- /dev/null +++ b/mmdetection/configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py @@ -0,0 +1,93 @@ +_base_ = '../common/ms-90k_coco.py' + +# model settings +model = dict( + type='BoxInst', + data_preprocessor=dict( + type='BoxInstDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + mask_stride=4, + pairwise_size=3, + pairwise_dilation=2, + pairwise_color_thresh=0.3, + bottom_pixels_removed=10), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='BoxInstBboxHead', + num_params=593, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + norm_on_bbox=True, + centerness_on_reg=True, + dcn_on_last_conv=False, + center_sampling=True, + conv_bias=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + 
alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + mask_head=dict( + type='BoxInstMaskHead', + num_layers=3, + feat_channels=16, + size_of_interest=8, + mask_out_stride=4, + topk_masks_per_img=64, + mask_feature_head=dict( + in_channels=256, + feat_channels=128, + start_level=0, + end_level=2, + out_channels=16, + mask_stride=8, + num_stacked_convs=4, + norm_cfg=dict(type='BN', requires_grad=True)), + loss_mask=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + eps=5e-6, + loss_weight=1.0)), + # model training and testing settings + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100, + mask_thr=0.5)) + +# optimizer +optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# evaluator +val_evaluator = dict(metric=['bbox', 'segm']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/boxinst/metafile.yml b/mmdetection/configs/boxinst/metafile.yml new file mode 100644 index 0000000..c97fcdc --- /dev/null +++ b/mmdetection/configs/boxinst/metafile.yml @@ -0,0 +1,52 @@ +Collections: + - Name: BoxInst + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - ResNet + - FPN + - CondInst + Paper: + URL: https://arxiv.org/abs/2012.02310 + Title: 'BoxInst: High-Performance Instance Segmentation with Box Annotations' + README: configs/boxinst/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v3.0.0rc6/mmdet/models/detectors/boxinst.py#L8 + Version: v3.0.0rc6 + +Models: + - Name: boxinst_r50_fpn_ms-90k_coco + In Collection: BoxInst + Config: configs/boxinst/boxinst_r50_fpn_ms-90k_coco.py + Metadata: + Iterations: 90000 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 30.8 + Weights: https://download.openmmlab.com/mmdetection/v3.0/boxinst/boxinst_r50_fpn_ms-90k_coco/boxinst_r50_fpn_ms-90k_coco_20221228_163052-6add751a.pth + + - Name: boxinst_r101_fpn_ms-90k_coco + In Collection: BoxInst + Config: configs/boxinst/boxinst_r101_fpn_ms-90k_coco.py + Metadata: + Iterations: 90000 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.7 + Weights: https://download.openmmlab.com/mmdetection/v3.0/boxinst/boxinst_r101_fpn_ms-90k_coco/boxinst_r101_fpn_ms-90k_coco_20221229_145106-facf375b.pth diff --git a/mmdetection/configs/bytetrack/README.md b/mmdetection/configs/bytetrack/README.md new file mode 100644 index 0000000..30b96f0 --- /dev/null +++ b/mmdetection/configs/bytetrack/README.md @@ -0,0 +1,132 @@ +# ByteTrack: Multi-Object Tracking by Associating Every Detection Box + +## Abstract + + + +Multi-object tracking (MOT) aims at estimating bounding boxes and identities of objects in videos. Most methods obtain identities by associating detection boxes whose scores are higher than a threshold. The objects with low detection scores, e.g. occluded objects, are simply thrown away, which brings non-negligible true object missing and fragmented trajectories. To solve this problem, we present a simple, effective and generic association method, tracking by associating every detection box instead of only the high score ones. 
For the low score detection boxes, we utilize their similarities with tracklets to recover true objects and filter out the background detections. When applied to 9 different state-of-the-art trackers, our method achieves consistent improvement on IDF1 score ranging from 1 to 10 points. To put forwards the state-of-the-art performance of MOT, we design a simple and strong tracker, named ByteTrack. For the first time, we achieve 80.3 MOTA, 77.3 IDF1 and 63.1 HOTA on the test set of MOT17 with 30 FPS running speed on a single V100 GPU. + + + +
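A minimal, self-contained sketch of the two-stage association (simplified: tracks are represented only by their last boxes, SciPy's Hungarian solver stands in for the matcher, and the thresholds mirror `obj_score_thrs` in the configs below):

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def iou_matrix(boxes_a, boxes_b):
    """boxes_*: (N, 4) / (M, 4) arrays of [x1, y1, x2, y2]."""
    ious = np.zeros((len(boxes_a), len(boxes_b)))
    for i, a in enumerate(boxes_a):
        for j, b in enumerate(boxes_b):
            x1, y1 = max(a[0], b[0]), max(a[1], b[1])
            x2, y2 = min(a[2], b[2]), min(a[3], b[3])
            inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
            union = ((a[2] - a[0]) * (a[3] - a[1]) +
                     (b[2] - b[0]) * (b[3] - b[1]) - inter)
            ious[i, j] = inter / union if union > 0 else 0.0
    return ious

def associate(track_boxes, det_boxes, det_scores,
              high_thr=0.6, low_thr=0.1, iou_thr=0.3):
    """Return (first-stage matches, second-stage matches, unmatched tracks)."""
    det_boxes = np.asarray(det_boxes, dtype=float)
    det_scores = np.asarray(det_scores, dtype=float)
    high = det_boxes[det_scores >= high_thr]
    low = det_boxes[(det_scores >= low_thr) & (det_scores < high_thr)]

    def match(track_idx, dets):
        if len(track_idx) == 0 or len(dets) == 0:
            return [], list(track_idx)
        ious = iou_matrix(np.asarray(track_boxes, dtype=float)[track_idx], dets)
        rows, cols = linear_sum_assignment(-ious)  # maximize total IoU
        pairs = [(track_idx[r], c) for r, c in zip(rows, cols)
                 if ious[r, c] >= iou_thr]
        matched = {p[0] for p in pairs}
        return pairs, [t for t in track_idx if t not in matched]

    # stage 1: high-score detections; stage 2: low-score ones rescue leftovers
    first, remaining = match(list(range(len(track_boxes))), high)
    second, lost = match(remaining, low)
    return first, second, lost  # unmatched high-score boxes would start new tracks
```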
    + +
    + +## Citation + + + +```latex +@inproceedings{zhang2021bytetrack, + title={ByteTrack: Multi-Object Tracking by Associating Every Detection Box}, + author={Zhang, Yifu and Sun, Peize and Jiang, Yi and Yu, Dongdong and Yuan, Zehuan and Luo, Ping and Liu, Wenyu and Wang, Xinggang}, + journal={arXiv preprint arXiv:2110.06864}, + year={2021} +} +``` + +## Results and models on MOT17 + +Please note that the performance on `MOT17-half-val` is comparable with the performance reported in the manuscript, while the performance on `MOT17-test` is lower than the performance reported in the manuscript. + +The reason is that ByteTrack tunes customized hyper-parameters (e.g., image resolution and the high threshold of detection score) for each video in `MOT17-test` set, while we use unified parameters. + +| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :-------: | :------: | :---------------------------: | :------------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :-------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| ByteTrack | YOLOX-X | CrowdHuman + MOT17-half-train | MOT17-half-val | N | - | 67.5 | 78.6 | 78.5 | 12852 | 21060 | 672 | [config](bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500.log.json) | +| ByteTrack | YOLOX-X | CrowdHuman + MOT17-half-train | MOT17-test | N | - | 61.7 | 78.1 | 74.8 | 36705 | 85032 | 2049 | [config](bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500.log.json) | + +## Results and models on MOT20 + +Since there are only 4 videos in `MOT20-train`, ByteTrack is validated on `MOT17-train` rather than `MOT20-half-train`. + +Please note that the MOTA on `MOT20-test` is slightly lower than that reported in the manuscript, because we don't tune the threshold for each video. + +| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. 
| Config | Download | +| :-------: | :------: | :----------------------: | :---------: | :----: | :------------: | :--: | :--: | :--: | :----: | :----: | :---: | :------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| ByteTrack | YOLOX-X | CrowdHuman + MOT20-train | MOT17-train | N | - | 57.3 | 64.9 | 71.8 | 33,747 | 83,385 | 1,263 | [config](bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040-9ce38a60.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040.log.json) | +| ByteTrack | YOLOX-X | CrowdHuman + MOT20-train | MOT20-test | N | - | 61.5 | 77.0 | 75.4 | 33,083 | 84,433 | 1,345 | [config](bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py) | [model](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040-9ce38a60.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040.log.json) | + +## Get started + +### 1. Development Environment Setup + +Tracking Development Environment Setup can refer to this [document](../../docs/en/get_started.md). + +### 2. Dataset Prepare + +Tracking Dataset Prepare can refer to this [document](../../docs/en/user_guides/tracking_dataset_prepare.md). + +### 3. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +**3.1 Joint training and tracking** + +Some algorithm like ByteTrack, OCSORT don't need reid model, so we provide joint training and tracking for convenient. + +```shell +# Training Bytetrack on crowdhuman and mot17-half-train dataset with following command +# The number after config file represents the number of GPUs used. Here we use 8 GPUs +bash tools/dist_train.sh configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 +``` + +**3.2 Separate training and tracking** + +Of course, we provide train detector independently like SORT, DeepSORT, StrongSORT. Then use this detector to track. + +```shell +# Training Bytetrack on crowdhuman and mot17-half-train dataset with following command +# The number after config file represents the number of GPUs used. Here we use 8 GPUs +bash tools/dist_train.sh configs/bytetrack/yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 4. 
Testing and evaluation + +### 4.1 Example on MOTxx-halfval dataset + +**4.1.1 use joint trained detector to evaluating and testing** + +```shell +bash tools/dist_test_tracking.sh configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 --checkpoint ${CHECKPOINT_FILE} +``` + +**4.1.2 use separate trained detector to evaluating and testing** + +```shell +bash tools/dist_test_tracking.sh configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 --detector ${CHECKPOINT_FILE} +``` + +**4.1.3 use video_baesd to evaluating and testing** + +we also provide two_ways(img_based or video_based) to evaluating and testing. +if you want to use video_based to evaluating and testing, you can modify config as follows + +``` +val_dataloader = dict( + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False)) +``` + +#### 4.2 Example on MOTxx-test dataset + +If you want to get the results of the [MOT Challenge](https://motchallenge.net/) test set, please use the following command to generate result files that can be used for submission. It will be stored in `./mot_17_test_res`, you can modify the saved path in `test_evaluator` of the config. + +```shell +bash tools/dist_test_tracking.sh configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py 8 --checkpoint ${CHECKPOINT_FILE} +``` + +If you want to know about more detailed usage of `test_tracking.py/dist_test_tracking.sh/slurm_test_tracking.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 5.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/mot_demo.py demo/demo_mot.mp4 configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py --checkpoint ${CHECKPOINT_FILE} --out mot.mp4 +``` + +If you want to know about more detailed usage of `mot_demo.py`, please refer to this [document](../../docs/en/user_guides/tracking_inference.md). diff --git a/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..24b3f78 --- /dev/null +++ b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,249 @@ +_base_ = ['../yolox/yolox_x_8xb8-300e_coco.py'] + +dataset_type = 'MOTChallengeDataset' +data_root = 'data/MOT17/' + +img_scale = (1440, 800) # weight, height +batch_size = 4 + +detector = _base_.model +detector.pop('data_preprocessor') +detector.bbox_head.update(dict(num_classes=1)) +detector.test_cfg.nms.update(dict(iou_threshold=0.7)) +detector['init_cfg'] = dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth' # noqa: E501 +) +del _base_.model + +model = dict( + type='ByteTrack', + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + # in bytetrack, we provide joint train detector and evaluate tracking + # performance, use_det_processor means use independent detector + # data_preprocessor. 
of course, you can train detector independently + # like strongsort + use_det_processor=True, + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(576, 1024), + size_divisor=32, + interval=10) + ]), + detector=detector, + tracker=dict( + type='ByteTracker', + motion=dict(type='KalmanFilter'), + obj_score_thrs=dict(high=0.6, low=0.1), + init_track_thr=0.7, + weight_iou_with_det_scores=True, + match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3), + num_frames_retain=30)) + +train_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=False), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=False), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=False), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=False), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict( + type='TransformBroadcaster', + transforms=[ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadTrackAnnotations'), + ]), + dict(type='PackTrackInputs') +] +train_dataloader = dict( + _delete_=True, + batch_size=batch_size, + num_workers=4, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='MultiImageMixDataset', + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type='CocoDataset', + data_root='data/MOT17', + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + ]), + pipeline=train_pipeline)) + +val_dataloader = dict( + _delete_=True, + batch_size=1, + num_workers=2, + persistent_workers=True, + pin_memory=True, + drop_last=False, + # video_based + # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + sampler=dict(type='TrackImgSampler'), # image_based + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# 
optimizer +# default 8 gpu +base_lr = 0.001 / 8 * batch_size +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +# some hyper parameters +# training settings +max_epochs = 80 +num_last_epochs = 10 +interval = 5 + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=70, + val_interval=1) + +# learning policy +param_scheduler = [ + dict( + # use quadratic formula to warm up 1 epochs + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + # use cosine lr from 1 to 70 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=1, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last 10 epochs + type='ConstantLR', + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ) +] + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + priority=49) +] + +default_hooks = dict( + checkpoint=dict( + _delete_=True, type='CheckpointHook', interval=1, max_keep_ckpts=10), + visualization=dict(type='TrackVisualizationHook', draw=False)) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# evaluator +val_evaluator = dict( + _delete_=True, + type='MOTChallengeMetric', + metric=['HOTA', 'CLEAR', 'Identity'], + postprocess_tracklet_cfg=[ + dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20) + ]) +test_evaluator = val_evaluator + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (4 samples per GPU) +auto_scale_lr = dict(base_batch_size=32) + +del detector +del _base_.tta_model +del _base_.tta_pipeline +del _base_.train_dataset diff --git a/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 0000000..9202f5f --- /dev/null +++ b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,127 @@ +_base_ = [ + './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_' + 'test-mot17halfval.py' +] + +dataset_type = 'MOTChallengeDataset' + +img_scale = (1600, 896) # weight, height + +model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + use_det_processor=True, + pad_size_divisor=32, + batch_augments=[ + dict(type='BatchSyncRandomResize', random_size_range=(640, 1152)) + ]), + tracker=dict( + weight_iou_with_det_scores=False, + match_iou_thrs=dict(high=0.3), + )) + +train_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=True), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=True), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=True), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=True), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict( + type='TransformBroadcaster', + transforms=[ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadTrackAnnotations'), + ]), + dict(type='PackTrackInputs') +] +train_dataloader = dict( + dataset=dict( + type='MultiImageMixDataset', + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type='CocoDataset', + data_root='data/MOT20', + ann_file='annotations/train_cocoformat.json', + # TODO: mmdet use img as key, but img_path is needed + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + ]), + pipeline=train_pipeline)) +val_dataloader = dict( + 
dataset=dict(ann_file='annotations/train_cocoformat.json')) + +test_dataloader = dict( + dataset=dict( + data_root='data/MOT20', ann_file='annotations/test_cocoformat.json')) + +test_evaluator = dict( + type='MOTChallengeMetrics', + postprocess_tracklet_cfg=[ + dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20) + ], + format_only=True, + outfile_prefix='./mot_20_test_res') diff --git a/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..9c21192 --- /dev/null +++ b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,9 @@ +_base_ = [ + './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_' + 'test-mot17halfval.py' +] + +# fp16 settings +optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic') +val_cfg = dict(type='ValLoop', fp16=True) +test_cfg = dict(type='TestLoop', fp16=True) diff --git a/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py new file mode 100644 index 0000000..3f4427c --- /dev/null +++ b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py @@ -0,0 +1,17 @@ +_base_ = [ + './bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-' + 'mot17halftrain_test-mot17halfval.py' +] + +test_dataloader = dict( + dataset=dict( + data_root='data/MOT17/', + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'))) +test_evaluator = dict( + type='MOTChallengeMetrics', + postprocess_tracklet_cfg=[ + dict(type='InterpolateTracklets', min_num_frames=5, max_num_frames=20) + ], + format_only=True, + outfile_prefix='./mot_17_test_res') diff --git a/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 0000000..1016999 --- /dev/null +++ b/mmdetection/configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,8 @@ +_base_ = [ + './bytetrack_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py' +] + +# fp16 settings +optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic') +val_cfg = dict(type='ValLoop', fp16=True) +test_cfg = dict(type='TestLoop', fp16=True) diff --git a/mmdetection/configs/bytetrack/metafile.yml b/mmdetection/configs/bytetrack/metafile.yml new file mode 100644 index 0000000..8ed638c --- /dev/null +++ b/mmdetection/configs/bytetrack/metafile.yml @@ -0,0 +1,53 @@ +Collections: + - Name: ByteTrack + Metadata: + Training Techniques: + - SGD with Momentum + Training Resources: 8x V100 GPUs + Architecture: + - YOLOX + Paper: + URL: https://arxiv.org/abs/2110.06864 + Title: ByteTrack Multi-Object Tracking by Associating Every Detection Box + README: configs/bytetrack/README.md + +Models: + - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval + In Collection: ByteTrack + Config: configs/bytetrack/bytetrack_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py + Metadata: + Training Data: CrowdHuman + MOT17-half-train + Results: + - Task: Multiple Object Tracking + Dataset: MOT17-half-val + 
Metrics: + HOTA: 67.5 + MOTA: 78.6 + IDF1: 78.5 + Weights: https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth + + - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test + In Collection: ByteTrack + Config: configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17test.py + Metadata: + Training Data: CrowdHuman + MOT17-half-train + Results: + - Task: Multiple Object Tracking + Dataset: MOT17-test + Metrics: + MOTA: 78.1 + IDF1: 74.8 + Weights: https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth + + - Name: bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test + In Collection: ByteTrack + Config: configs/bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py + Metadata: + Training Data: CrowdHuman + MOT20-train + Results: + - Task: Multiple Object Tracking + Dataset: MOT20-test + Metrics: + MOTA: 77.0 + IDF1: 75.4 + Weights: https://download.openmmlab.com/mmtracking/mot/bytetrack/bytetrack_yolox_x/bytetrack_yolox_x_crowdhuman_mot20-private_20220506_101040-9ce38a60.pth diff --git a/mmdetection/configs/bytetrack/yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmdetection/configs/bytetrack/yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..8fc3acd --- /dev/null +++ b/mmdetection/configs/bytetrack/yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,6 @@ +_base_ = [ + '../strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py' # noqa: E501 +] + +# fp16 settings +optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic') diff --git a/mmdetection/configs/carafe/README.md b/mmdetection/configs/carafe/README.md new file mode 100644 index 0000000..61e1fa6 --- /dev/null +++ b/mmdetection/configs/carafe/README.md @@ -0,0 +1,42 @@ +# CARAFE + +> [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188) + + + +## Abstract + +Feature upsampling is a key operation in a number of modern convolutional network architectures, e.g. feature pyramids. Its design is critical for dense prediction tasks such as object detection and semantic/instance segmentation. In this work, we propose Content-Aware ReAssembly of FEatures (CARAFE), a universal, lightweight and highly effective operator to fulfill this goal. CARAFE has several appealing properties: (1) Large field of view. Unlike previous works (e.g. bilinear interpolation) that only exploit sub-pixel neighborhood, CARAFE can aggregate contextual information within a large receptive field. (2) Content-aware handling. Instead of using a fixed kernel for all samples (e.g. deconvolution), CARAFE enables instance-specific content-aware handling, which generates adaptive kernels on-the-fly. (3) Lightweight and fast to compute. CARAFE introduces little computational overhead and can be readily integrated into modern network architectures. We conduct comprehensive evaluations on standard benchmarks in object detection, instance/semantic segmentation and inpainting. CARAFE shows consistent and substantial gains across all the tasks (1.2%, 1.3%, 1.8%, 1.1db respectively) with negligible computational overhead. It has great potential to serve as a strong building block for future research. 
+
+
    + +
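To make the mechanism in the abstract concrete, below is a minimal, self-contained PyTorch sketch of content-aware reassembly at scale 2. It is only an illustration of the idea: the configs in this directory dispatch to mmcv's CUDA `carafe` op, not to anything like this, and the class and argument names here are ours (the defaults merely mirror the `up_kernel=5`, `encoder_kernel=3`, `compressed_channels=64` values used in the configs).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class NaiveCARAFE(nn.Module):
    """Toy content-aware reassembly upsampler: predict a per-location
    k_up x k_up kernel from the content, softmax-normalise it, and use it
    to reassemble the source neighbourhood (illustrative only)."""

    def __init__(self, channels, scale=2, up_kernel=5, encoder_kernel=3,
                 compressed=64):
        super().__init__()
        self.scale, self.k = scale, up_kernel
        self.compress = nn.Conv2d(channels, compressed, 1)
        self.encoder = nn.Conv2d(
            compressed, scale * scale * up_kernel * up_kernel,
            encoder_kernel, padding=encoder_kernel // 2)

    def forward(self, x):
        n, c, h, w = x.shape
        # 1) Kernel prediction: compress channels, encode content, spread the
        #    sub-pixel kernels over the upsampled grid, and normalise them.
        kernels = self.encoder(self.compress(x))        # (n, s*s*k*k, h, w)
        kernels = F.pixel_shuffle(kernels, self.scale)  # (n, k*k, s*h, s*w)
        kernels = F.softmax(kernels, dim=1)
        # 2) Reassembly: every output pixel is a weighted sum over the
        #    k x k neighbourhood of its source location.
        patches = F.unfold(x, self.k, padding=self.k // 2)   # (n, c*k*k, h*w)
        patches = patches.view(n, c, self.k * self.k, h, w)
        patches = F.interpolate(  # repeat each source neighbourhood s*s times
            patches.view(n, c * self.k * self.k, h, w),
            scale_factor=self.scale, mode='nearest'
        ).view(n, c, self.k * self.k, h * self.scale, w * self.scale)
        return (patches * kernels.unsqueeze(1)).sum(dim=2)   # (n, c, s*h, s*w)


feat = torch.randn(1, 16, 8, 8)
print(NaiveCARAFE(16)(feat).shape)   # torch.Size([1, 16, 16, 16])
```

Each output pixel is thus a softmax-weighted sum over a k×k source neighbourhood, with weights predicted from the content itself rather than fixed as in bilinear interpolation or deconvolution.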
    + +## Results and Models + +The results on COCO 2017 val is shown in the below table. + +| Method | Backbone | Style | Lr schd | Test Proposal Num | Inf time (fps) | Box AP | Mask AP | Config | Download | +| :--------------------: | :------: | :-----: | :-----: | :---------------: | :------------: | :----: | :-----: | :-----------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN w/ CARAFE | R-50-FPN | pytorch | 1x | 1000 | 16.5 | 38.6 | 38.6 | [config](./faster-rcnn_r50_fpn-carafe_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_20200504_175733.log.json) | +| - | - | - | - | 2000 | | | | | | +| Mask R-CNN w/ CARAFE | R-50-FPN | pytorch | 1x | 1000 | 14.0 | 39.3 | 35.8 | [config](./mask-rcnn_r50_fpn-carafe_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.393__segm_mAP-0.358_20200503_135957-8687f195.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_20200503_135957.log.json) | +| - | - | - | - | 2000 | | | | | | + +## Implementation + +The CUDA implementation of CARAFE can be find at https://github.com/myownskyW7/CARAFE. + +## Citation + +We provide config files to reproduce the object detection & instance segmentation results in the ICCV 2019 Oral paper for [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188). 
+ +```latex +@inproceedings{Wang_2019_ICCV, + title = {CARAFE: Content-Aware ReAssembly of FEatures}, + author = {Wang, Jiaqi and Chen, Kai and Xu, Rui and Liu, Ziwei and Loy, Chen Change and Lin, Dahua}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2019} +} +``` diff --git a/mmdetection/configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py b/mmdetection/configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py new file mode 100644 index 0000000..388305c --- /dev/null +++ b/mmdetection/configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py @@ -0,0 +1,20 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + data_preprocessor=dict(pad_size_divisor=64), + neck=dict( + type='FPN_CARAFE', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5, + start_level=0, + end_level=-1, + norm_cfg=None, + act_cfg=None, + order=('conv', 'norm', 'act'), + upsample_cfg=dict( + type='carafe', + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1, + compressed_channels=64))) diff --git a/mmdetection/configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py b/mmdetection/configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py new file mode 100644 index 0000000..6ce621d --- /dev/null +++ b/mmdetection/configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + data_preprocessor=dict(pad_size_divisor=64), + neck=dict( + type='FPN_CARAFE', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5, + start_level=0, + end_level=-1, + norm_cfg=None, + act_cfg=None, + order=('conv', 'norm', 'act'), + upsample_cfg=dict( + type='carafe', + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1, + compressed_channels=64)), + roi_head=dict( + mask_head=dict( + upsample_cfg=dict( + type='carafe', + scale_factor=2, + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1, + compressed_channels=64)))) diff --git a/mmdetection/configs/carafe/metafile.yml b/mmdetection/configs/carafe/metafile.yml new file mode 100644 index 0000000..863c0f4 --- /dev/null +++ b/mmdetection/configs/carafe/metafile.yml @@ -0,0 +1,55 @@ +Collections: + - Name: CARAFE + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RPN + - FPN_CARAFE + - ResNet + - RoIPool + Paper: + URL: https://arxiv.org/abs/1905.02188 + Title: 'CARAFE: Content-Aware ReAssembly of FEatures' + README: configs/carafe/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/necks/fpn_carafe.py#L11 + Version: v2.12.0 + +Models: + - Name: faster-rcnn_r50_fpn_carafe_1x_coco + In Collection: CARAFE + Config: configs/carafe/faster-rcnn_r50_fpn-carafe_1x_coco.py + Metadata: + Training Memory (GB): 4.26 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/carafe/faster_rcnn_r50_fpn_carafe_1x_coco/faster_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.386_20200504_175733-385a75b7.pth + + - Name: mask-rcnn_r50_fpn_carafe_1x_coco + In Collection: CARAFE + Config: configs/carafe/mask-rcnn_r50_fpn-carafe_1x_coco.py + Metadata: + Training Memory (GB): 4.31 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.3 + - Task: Instance Segmentation + 
Dataset: COCO + Metrics: + mask AP: 35.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/carafe/mask_rcnn_r50_fpn_carafe_1x_coco/mask_rcnn_r50_fpn_carafe_1x_coco_bbox_mAP-0.393__segm_mAP-0.358_20200503_135957-8687f195.pth diff --git a/mmdetection/configs/cascade_rcnn/README.md b/mmdetection/configs/cascade_rcnn/README.md new file mode 100644 index 0000000..81fce44 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/README.md @@ -0,0 +1,79 @@ +# Cascade R-CNN + +> [Cascade R-CNN: High Quality Object Detection and Instance Segmentation](https://arxiv.org/abs/1906.09756) + + + +## Abstract + +In object detection, the intersection over union (IoU) threshold is frequently used to define positives/negatives. The threshold used to train a detector defines its quality. While the commonly used threshold of 0.5 leads to noisy (low-quality) detections, detection performance frequently degrades for larger thresholds. This paradox of high-quality detection has two causes: 1) overfitting, due to vanishing positive samples for large thresholds, and 2) inference-time quality mismatch between detector and test hypotheses. A multi-stage object detection architecture, the Cascade R-CNN, composed of a sequence of detectors trained with increasing IoU thresholds, is proposed to address these problems. The detectors are trained sequentially, using the output of a detector as training set for the next. This resampling progressively improves hypotheses quality, guaranteeing a positive training set of equivalent size for all detectors and minimizing overfitting. The same cascade is applied at inference, to eliminate quality mismatches between hypotheses and detectors. An implementation of the Cascade R-CNN without bells or whistles achieves state-of-the-art performance on the COCO dataset, and significantly improves high-quality detection on generic and specific object detection datasets, including VOC, KITTI, CityPerson, and WiderFace. Finally, the Cascade R-CNN is generalized to instance segmentation, with nontrivial improvements over the Mask R-CNN. + +
    + +
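As a rough, self-contained illustration of the multi-stage idea described above (this is not mmdetection code), the sketch below re-labels proposals with an increasingly strict IoU threshold at each stage and refines the boxes before handing them to the next stage. The 0.5/0.6/0.7 thresholds and the toy `refine` function are assumptions for illustration only; in mmdetection the actual staging lives in the shared `../_base_/models/cascade-rcnn_r50_fpn.py` file referenced by the configs below.

```python
import torch


def iou(boxes1, boxes2):
    """Pairwise IoU for boxes given as (x1, y1, x2, y2)."""
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    lt = torch.max(boxes1[:, None, :2], boxes2[None, :, :2])
    rb = torch.min(boxes1[:, None, 2:], boxes2[None, :, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    return inter / (area1[:, None] + area2[None, :] - inter + 1e-6)


def cascade_stage(proposals, gt_boxes, pos_iou_thr, refine):
    """Label proposals at this stage's IoU threshold, then refine them."""
    max_iou, _ = iou(proposals, gt_boxes).max(dim=1)
    labels = (max_iou >= pos_iou_thr).long()   # stricter at later stages
    refined = refine(proposals)                # stand-in for the bbox head
    return refined, labels


# Increasing IoU thresholds, as described in the abstract.
thresholds = [0.5, 0.6, 0.7]
proposals = torch.rand(100, 2) * 50
proposals = torch.cat(
    [proposals, proposals + 50 + torch.rand(100, 2) * 50], dim=1)
gt_boxes = torch.tensor([[10., 10., 80., 80.], [120., 40., 200., 160.]])

for thr in thresholds:
    # A dummy "refinement" nudging boxes toward the first ground-truth box.
    proposals, labels = cascade_stage(
        proposals, gt_boxes, thr,
        refine=lambda p: p + 0.3 * (gt_boxes[0] - p))
    print(f'IoU threshold {thr}: {labels.sum().item()} positives')
```

Because each stage refines the boxes before the next, stricter stage assigns labels, later stages still see enough positive samples, which is the point the abstract makes about avoiding overfitting at high IoU thresholds.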
    + +## Results and Models + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 1x | 4.2 | | 40.4 | [config](./cascade-rcnn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.404_20200504_174853-b857be87.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_20200504_174853.log.json) | +| R-50-FPN | pytorch | 1x | 4.4 | 16.1 | 40.3 | [config](./cascade-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316_214748.log.json) | +| R-50-FPN | pytorch | 20e | - | - | 41.0 | [config](./cascade-rcnn_r50_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_bbox_mAP-0.41_20200504_175131-e9872a90.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_20200504_175131.log.json) | +| R-101-FPN | caffe | 1x | 6.2 | | 42.3 | [config](./cascade-rcnn_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.423_20200504_175649-cab8dbd5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_20200504_175649.log.json) | +| R-101-FPN | pytorch | 1x | 6.4 | 13.5 | 42.0 | [config](./cascade-rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317_101744.log.json) | +| R-101-FPN | pytorch | 20e | - | - | 42.5 | [config](./cascade-rcnn_r101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_bbox_mAP-0.425_20200504_231812-5057dcc5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_20200504_231812.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.6 | 10.9 | 43.7 | [config](./cascade-rcnn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316-95c2deb6.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316_055608.log.json) | +| X-101-32x4d-FPN | pytorch | 20e | 7.6 | | 43.7 | [config](./cascade-rcnn_x101-32x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608-9ae0a720.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.7 | | 44.7 | [config](./cascade-rcnn_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702.log.json) | +| X-101-64x4d-FPN | pytorch | 20e | 10.7 | | 44.5 | [config](./cascade-rcnn_x101_64x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357.log.json) | + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 1x | 5.9 | | 41.2 | 36.0 | [config](./cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.412__segm_mAP-0.36_20200504_174659-5004b251.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_20200504_174659.log.json) | +| R-50-FPN | pytorch | 1x | 6.0 | 11.2 | 41.2 | 35.9 | [config](./cascade-mask-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203_170449.log.json) | +| R-50-FPN | pytorch | 20e | - | - | 41.9 | 36.5 | [config](./cascade-mask-rcnn_r50_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_20200504_174711.log.json) | +| R-101-FPN | caffe | 1x | 7.8 | | 43.2 | 37.6 | [config](./cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.432__segm_mAP-0.376_20200504_174813-5c1e9599.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_20200504_174813.log.json) | +| R-101-FPN | pytorch | 1x | 7.9 | 9.8 | 42.9 | 37.3 | [config](./cascade-mask-rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203-befdf6ee.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203_092521.log.json) | +| R-101-FPN | pytorch | 20e | - | - | 43.4 | 37.8 | [config](./cascade-mask-rcnn_r101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_bbox_mAP-0.434__segm_mAP-0.378_20200504_174836-005947da.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_20200504_174836.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 9.2 | 8.6 | 44.3 | 38.3 | [config](./cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201-0f411b1f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201_052416.log.json) | +| X-101-32x4d-FPN | pytorch | 20e | 9.2 | - | 45.0 | 39.0 | [config](./cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917-ed1f4751.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 12.2 | 6.7 | 45.3 | 39.2 | [config](./cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203-9a2db89d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203_044059.log.json) | +| X-101-64x4d-FPN | pytorch | 20e | 12.2 | | 45.6 | 39.5 | [config](./cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033-bdb5126a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033.log.json) | + +**Notes:** + +- The `20e` schedule in Cascade (Mask) 
R-CNN indicates decreasing the lr at 16 and 19 epochs, with a total of 20 epochs. + +## Pre-trained Models + +We also train some models with longer schedules and multi-scale training for Cascade Mask R-CNN. The users could finetune them for downstream tasks. + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 3x | 5.7 | | 44.0 | 38.1 | [config](./cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651-6e29b3a6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651.log.json) | +| R-50-FPN | pytorch | 3x | 5.9 | | 44.3 | 38.5 | [config](./cascade-mask-rcnn_r50_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719-5bdc3824.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719.log.json) | +| R-101-FPN | caffe | 3x | 7.7 | | 45.4 | 39.5 | [config](./cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620-a5bd2389.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620.log.json) | +| R-101-FPN | pytorch | 3x | 7.8 | | 45.5 | 39.6 | [config](./cascade-mask-rcnn_r101_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236-51a2d363.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236.log.json) | +| X-101-32x4d-FPN | pytorch | 3x | 9.0 | | 46.3 | 40.1 | [config](./cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234-40773067.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234.log.json) | +| X-101-32x8d-FPN | pytorch | 3x | 12.1 | | 46.1 | 39.9 | [config](./cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640-9ff7e76f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640.log.json) | +| X-101-64x4d-FPN | pytorch | 3x | 12.0 | | 46.6 | 40.3 | [config](./cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311-d3e64ba0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311.log.json) | + +## Citation + +```latex +@article{Cai_2019, + title={Cascade R-CNN: High Quality Object Detection and Instance Segmentation}, + ISSN={1939-3539}, + url={http://dx.doi.org/10.1109/tpami.2019.2956516}, + DOI={10.1109/tpami.2019.2956516}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher={Institute of Electrical and Electronics Engineers (IEEE)}, + author={Cai, Zhaowei and Vasconcelos, Nuno}, + year={2019}, + pages={1–1} +} +``` diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..6d85340 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py new file mode 100644 index 0000000..a6855ee --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..c3d962c --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py new file mode 100644 index 0000000..497148f --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py 
b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py new file mode 100644 index 0000000..183b5c5 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..497f68c --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = ['./cascade-mask-rcnn_r50_fpn_1x_coco.py'] + +model = dict( + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py new file mode 100644 index 0000000..6677a9f --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../common/ms_3x_coco-instance.py', + '../_base_/models/cascade-mask-rcnn_r50_fpn.py' +] + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..f59bb94 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py new file mode 100644 index 0000000..35c8aa6 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_20e.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py new file mode 100644 index 0000000..b15006f --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py @@ -0,0 +1,4 @@ +_base_ = [ + '../common/ms_3x_coco-instance.py', + '../_base_/models/cascade-mask-rcnn_r50_fpn.py' +] diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..87a4cc3 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py 
@@ -0,0 +1,14 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py new file mode 100644 index 0000000..5e8dcaa --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py new file mode 100644 index 0000000..3a0f61b --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py new file mode 100644 index 0000000..8cf0830 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py @@ -0,0 +1,24 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_ms-3x_coco.py' + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. 
+ data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..fb2e6b6 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py new file mode 100644 index 0000000..cc20c17 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py new file mode 100644 index 0000000..f4ecc42 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..b6eaee2 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..1cdf510 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git 
a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py new file mode 100644 index 0000000..84c285f --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py @@ -0,0 +1,6 @@ +_base_ = './cascade-rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..1fc52e9 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..aa30a3d --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..ad90e25 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = './cascade-rcnn_r50_fpn_1x_coco.py' + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..1a07c8b --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/cascade-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py new file mode 100644 index 0000000..30f3ff1 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/cascade-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_20e.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..cd25f02 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,23 @@ +_base_ = [ + 
'../_base_/models/cascade-rcnn_r50_fpn.py', + '../common/lsj-200e_coco-detection.py' +] +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +# disable allowed_border to avoid potential errors. +model = dict( + data_preprocessor=dict(batch_augments=batch_augments), + train_cfg=dict(rpn=dict(allowed_border=-1))) + +train_dataloader = dict(batch_size=8, num_workers=4) +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.02 * 4, momentum=0.9, weight_decay=0.00004)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..50e0b95 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py new file mode 100644 index 0000000..6120189 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade-rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..29475e3 --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,15 @@ +_base_ = './cascade-rcnn_r50_fpn_1x_coco.py' +model = dict( + type='CascadeRCNN', + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py b/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py new file mode 100644 index 0000000..e2aa57e --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py @@ -0,0 +1,15 @@ +_base_ = './cascade-rcnn_r50_fpn_20e_coco.py' +model = dict( + type='CascadeRCNN', + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git 
a/mmdetection/configs/cascade_rcnn/metafile.yml b/mmdetection/configs/cascade_rcnn/metafile.yml new file mode 100644 index 0000000..7e0385d --- /dev/null +++ b/mmdetection/configs/cascade_rcnn/metafile.yml @@ -0,0 +1,545 @@ +Collections: + - Name: Cascade R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Cascade R-CNN + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: http://dx.doi.org/10.1109/tpami.2019.2956516 + Title: 'Cascade R-CNN: Delving into High Quality Object Detection' + README: configs/cascade_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/cascade_rcnn.py#L6 + Version: v2.0.0 + - Name: Cascade Mask R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Cascade R-CNN + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: http://dx.doi.org/10.1109/tpami.2019.2956516 + Title: 'Cascade R-CNN: Delving into High Quality Object Detection' + README: configs/cascade_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/cascade_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: cascade-rcnn_r50-caffe_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_caffe_fpn_1x_coco/cascade_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.404_20200504_174853-b857be87.pth + + - Name: cascade-rcnn_r50_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 62.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco/cascade_rcnn_r50_fpn_1x_coco_20200316-3dc56deb.pth + + - Name: cascade-rcnn_r50_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 62.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco/cascade_rcnn_r50_fpn_20e_coco_bbox_mAP-0.41_20200504_175131-e9872a90.pth + + - Name: cascade-rcnn_r101-caffe_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_caffe_fpn_1x_coco/cascade_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.423_20200504_175649-cab8dbd5.pth + + - Name: cascade-rcnn_r101_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 
6.4 + inference time (ms/im): + - value: 74.07 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco/cascade_rcnn_r101_fpn_1x_coco_20200317-0b6a2fbf.pth + + - Name: cascade-rcnn_r101_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_r101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 74.07 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_r101_fpn_20e_coco/cascade_rcnn_r101_fpn_20e_coco_bbox_mAP-0.425_20200504_231812-5057dcc5.pth + + - Name: cascade-rcnn_x101-32x4d_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 91.74 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco/cascade_rcnn_x101_32x4d_fpn_1x_coco_20200316-95c2deb6.pth + + - Name: cascade-rcnn_x101-32x4d_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_x101-32x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 7.6 + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_20e_coco/cascade_rcnn_x101_32x4d_fpn_20e_coco_20200906_134608-9ae0a720.pth + + - Name: cascade-rcnn_x101-64x4d_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_1x_coco/cascade_rcnn_x101_64x4d_fpn_1x_coco_20200515_075702-43ce6a30.pth + + - Name: cascade-rcnn_x101_64x4d_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-rcnn_x101_64x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_rcnn_x101_64x4d_fpn_20e_coco/cascade_rcnn_x101_64x4d_fpn_20e_coco_20200509_224357-051557b1.pth + + - Name: cascade-mask-rcnn_r50-caffe_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.9 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_1x_coco/cascade_mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.412__segm_mAP-0.36_20200504_174659-5004b251.pth + + - Name: cascade-mask-rcnn_r50_fpn_1x_coco + In 
Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 89.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth + + - Name: cascade-mask-rcnn_r50_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 89.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth + + - Name: cascade-mask-rcnn_r101-caffe_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_1x_coco/cascade_mask_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.432__segm_mAP-0.376_20200504_174813-5c1e9599.pth + + - Name: cascade-mask-rcnn_r101_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.9 + inference time (ms/im): + - value: 102.04 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco/cascade_mask_rcnn_r101_fpn_1x_coco_20200203-befdf6ee.pth + + - Name: cascade-mask-rcnn_r101_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 7.9 + inference time (ms/im): + - value: 102.04 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_20e_coco/cascade_mask_rcnn_r101_fpn_20e_coco_bbox_mAP-0.434__segm_mAP-0.378_20200504_174836-005947da.pth + + - Name: cascade-mask-rcnn_x101-32x4d_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.2 + inference time (ms/im): + - value: 116.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) 
+ Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco_20200201-0f411b1f.pth + + - Name: cascade-mask-rcnn_x101-32x4d_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 9.2 + inference time (ms/im): + - value: 116.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco/cascade_mask_rcnn_x101_32x4d_fpn_20e_coco_20200528_083917-ed1f4751.pth + + - Name: cascade-mask-rcnn_x101-64x4d_fpn_1x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 12.2 + inference time (ms/im): + - value: 149.25 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco/cascade_mask_rcnn_x101_64x4d_fpn_1x_coco_20200203-9a2db89d.pth + + - Name: cascade-mask-rcnn_x101-64x4d_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco/cascade_mask_rcnn_x101_64x4d_fpn_20e_coco_20200512_161033-bdb5126a.pth + + - Name: cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r50-caffe_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 5.7 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210707_002651-6e29b3a6.pth + + - Name: cascade-mask-rcnn_r50_fpn_mstrain_3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r50_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 5.9 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco/cascade_mask_rcnn_r50_fpn_mstrain_3x_coco_20210628_164719-5bdc3824.pth + + - Name: cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r101-caffe_fpn_ms-3x_coco.py + 
Metadata: + Training Memory (GB): 7.7 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210707_002620-a5bd2389.pth + + - Name: cascade-mask-rcnn_r101_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_r101_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco/cascade_mask_rcnn_r101_fpn_mstrain_3x_coco_20210628_165236-51a2d363.pth + + - Name: cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 9.0 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210706_225234-40773067.pth + + - Name: cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-32x8d_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 12.1 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210719_180640-9ff7e76f.pth + + - Name: cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/cascade_rcnn/cascade-mask-rcnn_x101-64x4d_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 12.0 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco/cascade_mask_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210719_210311-d3e64ba0.pth diff --git a/mmdetection/configs/cascade_rpn/README.md b/mmdetection/configs/cascade_rpn/README.md new file mode 100644 index 0000000..868a25e --- /dev/null +++ b/mmdetection/configs/cascade_rpn/README.md @@ -0,0 +1,41 @@ +# Cascade RPN + +> [Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution](https://arxiv.org/abs/1909.06720) + + + +## Abstract + +This paper considers an architecture referred to as Cascade Region Proposal Network (Cascade RPN) for improving the region-proposal quality and detection performance by systematically addressing the limitation of the conventional RPN that heuristically defines the anchors and aligns the features to the anchors. First, instead of using multiple anchors with predefined scales and aspect ratios, Cascade RPN relies on a single anchor per location and performs multi-stage refinement. 
Each stage is progressively more stringent in defining positive samples by starting out with an anchor-free metric followed by anchor-based metrics in the ensuing stages. Second, to attain alignment between the features and the anchors throughout the stages, adaptive convolution is proposed that takes the anchors in addition to the image features as its input and learns the sampled features guided by the anchors. A simple implementation of a two-stage Cascade RPN achieves AR 13.4 points higher than that of the conventional RPN, surpassing all existing region proposal methods. When adopted in Fast R-CNN and Faster R-CNN, Cascade RPN can improve the detection mAP by 3.1 and 3.5 points, respectively.
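To make the multi-stage refinement concrete, the sketch below shows the control flow a two-stage cascade implies: a single anchor per location is regressed by stage 1, and stage 2 re-aligns its features to the stage-1 boxes (the adaptive convolution) and regresses them again. The `Stage` callables, the decoding helper and the tensor shapes are illustrative assumptions only, not the `CascadeRPNHead` / `StageCascadeRPNHead` implementation configured later in this patch.

```python
# Minimal sketch of the cascade idea; the real logic lives in mmdet's
# CascadeRPNHead / StageCascadeRPNHead (see the configs added below).
import torch


def delta2bbox(boxes, deltas):
    """Standard (dx, dy, dw, dh) decoding in the spirit of DeltaXYWHBBoxCoder."""
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    cx = boxes[:, 0] + 0.5 * w + deltas[:, 0] * w
    cy = boxes[:, 1] + 0.5 * h + deltas[:, 1] * h
    nw = w * deltas[:, 2].exp()
    nh = h * deltas[:, 3].exp()
    return torch.stack([cx - 0.5 * nw, cy - 0.5 * nh,
                        cx + 0.5 * nw, cy + 0.5 * nh], dim=1)


def cascade_rpn(feats, anchors, stages):
    """Each stage maps (feats, boxes) -> (feats, deltas, scores); inside a real
    stage the convolution is re-aligned to the current boxes before predicting."""
    boxes, scores = anchors, None
    for stage in stages:
        feats, deltas, scores = stage(feats, boxes)
        boxes = delta2bbox(boxes, deltas)   # refined boxes feed the next stage
    return boxes, scores


# Dummy stand-in for a stage head, only to make the sketch runnable.
def dummy_stage(feats, boxes):
    return feats, torch.zeros(boxes.size(0), 4), torch.rand(boxes.size(0))


proposals, objectness = cascade_rpn(
    torch.randn(1, 256, 32, 32),
    torch.tensor([[0., 0., 32., 32.], [16., 16., 48., 48.]]),  # one anchor each
    [dummy_stage, dummy_stage])
```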
    + +## Results and Models + +### Region proposal performance + +| Method | Backbone | Style | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR 1000 | Config | Download | +| :----: | :------: | :---: | :------: | :-----------------: | :------------: | :-----: | :----------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------: | +| CRPN | R-50-FPN | caffe | - | - | - | 72.0 | [config](./cascade-rpn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_r50_caffe_fpn_1x_coco/cascade_rpn_r50_caffe_fpn_1x_coco-7aa93cef.pth) | + +### Detection performance + +| Method | Proposal | Backbone | Style | Schedule | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Config | Download | +| :----------: | :---------: | :------: | :---: | :------: | :------: | :-----------------: | :------------: | :----: | :----------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Fast R-CNN | Cascade RPN | R-50-FPN | caffe | 1x | - | - | - | 39.9 | [config](./cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco/crpn_fast_rcnn_r50_caffe_fpn_1x_coco-cb486e66.pth) | +| Faster R-CNN | Cascade RPN | R-50-FPN | caffe | 1x | - | - | - | 40.4 | [config](./cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco/crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth) | + +## Citation + +We provide the code for reproducing experiment results of [Cascade RPN](https://arxiv.org/abs/1909.06720). 
+ +```latex +@inproceedings{vu2019cascade, + title={Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution}, + author={Vu, Thang and Jang, Hyunjun and Pham, Trung X and Yoo, Chang D}, + booktitle={Conference on Neural Information Processing Systems (NeurIPS)}, + year={2019} +} +``` diff --git a/mmdetection/configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..ba23ce9 --- /dev/null +++ b/mmdetection/configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,27 @@ +_base_ = '../fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_head=dict( + bbox_coder=dict(target_stds=[0.04, 0.04, 0.08, 0.08]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.5), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + assigner=dict( + pos_iou_thr=0.65, neg_iou_thr=0.65, min_pos_iou=0.65), + sampler=dict(num=256))), + test_cfg=dict(rcnn=dict(score_thr=1e-3))) + +# MMEngine support the following two ways, users can choose +# according to convenience +# train_dataloader = dict(dataset=dict(proposal_file='proposals/crpn_r50_caffe_fpn_1x_train2017.pkl')) # noqa +_base_.train_dataloader.dataset.proposal_file = 'proposals/crpn_r50_caffe_fpn_1x_train2017.pkl' # noqa + +# val_dataloader = dict(dataset=dict(proposal_file='proposals/crpn_r50_caffe_fpn_1x_val2017.pkl')) # noqa +# test_dataloader = val_dataloader +_base_.val_dataloader.dataset.proposal_file = 'proposals/crpn_r50_caffe_fpn_1x_val2017.pkl' # noqa +test_dataloader = _base_.val_dataloader + +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..2f7eced --- /dev/null +++ b/mmdetection/configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,89 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py' +rpn_weight = 0.7 +model = dict( + rpn_head=dict( + _delete_=True, + type='CascadeRPNHead', + num_stages=2, + stages=[ + dict( + type='StageCascadeRPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[1.0], + strides=[4, 8, 16, 32, 64]), + adapt_cfg=dict(type='dilation', dilation=3), + bridged_feature=True, + with_cls=False, + reg_decoded_bbox=True, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=(.0, .0, .0, .0), + target_stds=(0.1, 0.1, 0.5, 0.5)), + loss_bbox=dict( + type='IoULoss', linear=True, + loss_weight=10.0 * rpn_weight)), + dict( + type='StageCascadeRPNHead', + in_channels=256, + feat_channels=256, + adapt_cfg=dict(type='offset'), + bridged_feature=False, + with_cls=True, + reg_decoded_bbox=True, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=(.0, .0, .0, .0), + target_stds=(0.05, 0.05, 0.1, 0.1)), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0 * rpn_weight), + loss_bbox=dict( + type='IoULoss', linear=True, + loss_weight=10.0 * rpn_weight)) + ]), + roi_head=dict( + bbox_head=dict( + bbox_coder=dict(target_stds=[0.04, 0.04, 0.08, 0.08]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.5), + 
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=[ + dict( + assigner=dict( + type='RegionAssigner', center_ratio=0.2, ignore_ratio=0.5), + allowed_border=-1, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False) + ], + rpn_proposal=dict(max_per_img=300, nms=dict(iou_threshold=0.8)), + rcnn=dict( + assigner=dict( + pos_iou_thr=0.65, neg_iou_thr=0.65, min_pos_iou=0.65), + sampler=dict(type='RandomSampler', num=256))), + test_cfg=dict( + rpn=dict(max_per_img=300, nms=dict(iou_threshold=0.8)), + rcnn=dict(score_thr=1e-3))) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/cascade_rpn/cascade-rpn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/cascade_rpn/cascade-rpn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..6eba24d --- /dev/null +++ b/mmdetection/configs/cascade_rpn/cascade-rpn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,76 @@ +_base_ = '../rpn/rpn_r50-caffe_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='CascadeRPNHead', + num_stages=2, + stages=[ + dict( + type='StageCascadeRPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[1.0], + strides=[4, 8, 16, 32, 64]), + adapt_cfg=dict(type='dilation', dilation=3), + bridged_feature=True, + sampling=False, + with_cls=False, + reg_decoded_bbox=True, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=(.0, .0, .0, .0), + target_stds=(0.1, 0.1, 0.5, 0.5)), + loss_bbox=dict(type='IoULoss', linear=True, loss_weight=10.0)), + dict( + type='StageCascadeRPNHead', + in_channels=256, + feat_channels=256, + adapt_cfg=dict(type='offset'), + bridged_feature=False, + sampling=True, + with_cls=True, + reg_decoded_bbox=True, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=(.0, .0, .0, .0), + target_stds=(0.05, 0.05, 0.1, 0.1)), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', linear=True, loss_weight=10.0)) + ]), + train_cfg=dict(rpn=[ + dict( + assigner=dict( + type='RegionAssigner', center_ratio=0.2, ignore_ratio=0.5), + allowed_border=-1, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.3, + ignore_iof_thr=-1, + iou_calculator=dict(type='BboxOverlaps2D')), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.8), + min_bbox_size=0))) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/cascade_rpn/metafile.yml b/mmdetection/configs/cascade_rpn/metafile.yml new file mode 100644 index 0000000..62a88c5 --- /dev/null +++ b/mmdetection/configs/cascade_rpn/metafile.yml @@ -0,0 +1,44 @@ +Collections: + - Name: Cascade RPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Cascade RPN + - FPN + - ResNet + Paper: + URL: 
https://arxiv.org/abs/1909.06720 + Title: 'Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution' + README: configs/cascade_rpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.8.0/mmdet/models/dense_heads/cascade_rpn_head.py#L538 + Version: v2.8.0 + +Models: + - Name: cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco + In Collection: Cascade RPN + Config: configs/cascade_rpn/cascade-rpn_fast-rcnn_r50-caffe_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_fast_rcnn_r50_caffe_fpn_1x_coco/crpn_fast_rcnn_r50_caffe_fpn_1x_coco-cb486e66.pth + + - Name: cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco + In Collection: Cascade RPN + Config: configs/cascade_rpn/cascade-rpn_faster-rcnn_r50-caffe_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cascade_rpn/crpn_faster_rcnn_r50_caffe_fpn_1x_coco/crpn_faster_rcnn_r50_caffe_fpn_1x_coco-c8283cca.pth diff --git a/mmdetection/configs/centernet/README.md b/mmdetection/configs/centernet/README.md new file mode 100644 index 0000000..81e229c --- /dev/null +++ b/mmdetection/configs/centernet/README.md @@ -0,0 +1,58 @@ +# CenterNet + +> [Objects as Points](https://arxiv.org/abs/1904.07850) + + + +## Abstract + +Detection identifies objects as axis-aligned boxes in an image. Most successful object detectors enumerate a nearly exhaustive list of potential object locations and classify each. This is wasteful, inefficient, and requires additional post-processing. In this paper, we take a different approach. We model an object as a single point --- the center point of its bounding box. Our detector uses keypoint estimation to find center points and regresses to all other object properties, such as size, 3D location, orientation, and even pose. Our center point based approach, CenterNet, is end-to-end differentiable, simpler, faster, and more accurate than corresponding bounding box based detectors. CenterNet achieves the best speed-accuracy trade-off on the MS COCO dataset, with 28.1% AP at 142 FPS, 37.4% AP at 52 FPS, and 45.1% AP with multi-scale testing at 1.4 FPS. We use the same approach to estimate 3D bounding box in the KITTI benchmark and human pose on the COCO keypoint dataset. Our method performs competitively with sophisticated multi-stage methods and runs in real-time. + +
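Since the key idea of the abstract is that detection reduces to finding peaks in a center heatmap, a small decoding sketch may help; it mirrors the `local_maximum_kernel=3` / `topk=100` test settings used in the configs below, but the tensor names and the omitted center-offset and output-stride handling are simplifying assumptions, not the mmdet `CenterNetHead` code.

```python
# Sketch of CenterNet-style decoding: a 3x3 max-pool keeps local maxima and
# the top-k peaks become detections; box sizes are read from a size map.
import torch
import torch.nn.functional as F


def decode_centers(heatmap, wh, k=100, kernel=3):
    """heatmap: (B, C, H, W) class heatmaps; wh: (B, 2, H, W) box widths/heights."""
    b, c, h, w = heatmap.shape
    peaks = F.max_pool2d(heatmap, kernel, stride=1, padding=(kernel - 1) // 2)
    heatmap = heatmap * (peaks == heatmap)               # keep local maxima only
    scores, idx = heatmap.view(b, -1).topk(k)            # over classes x positions
    classes = torch.div(idx, h * w, rounding_mode='floor')
    pos = idx % (h * w)
    ys = torch.div(pos, w, rounding_mode='floor').float()
    xs = (pos % w).float()
    ws = wh[:, 0].reshape(b, -1).gather(1, pos)
    hs = wh[:, 1].reshape(b, -1).gather(1, pos)
    boxes = torch.stack([xs - ws / 2, ys - hs / 2,
                         xs + ws / 2, ys + hs / 2], dim=-1)
    return boxes, scores, classes                        # feature-map coordinates


boxes, scores, classes = decode_centers(
    torch.rand(1, 80, 128, 128), torch.rand(1, 2, 128, 128) * 10)
```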
    + +## Results and Models + +| Backbone | DCN | Mem (GB) | Box AP | Flip box AP | Config | Download | +| :-------: | :-: | :------: | :----: | :---------: | :--------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| ResNet-18 | N | 3.45 | 25.9 | 27.3 | [config](./centernet_r18_8xb16-crop512-140e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630-bb5b3bf7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630.log.json) | +| ResNet-18 | Y | 3.47 | 29.5 | 30.9 | [config](./centernet_r18-dcnv2_8xb16-crop512-140e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131.log.json) | + +Note: + +- Flip box AP setting is single-scale and `flip=True`. +- Due to complex data enhancement, we find that the performance is unstable and may fluctuate by about 0.4 mAP. mAP 29.4 ~ 29.8 is acceptable in ResNet-18-DCNv2. +- Compared to the source code, we refer to [CenterNet-Better](https://github.com/FateScript/CenterNet-better), and make the following changes + - fix wrong image mean and variance in image normalization to be compatible with the pre-trained backbone. + - Use SGD rather than ADAM optimizer and add warmup and grad clip. + - Use DistributedDataParallel as other models in MMDetection rather than using DataParallel. + +## CenterNet Update + +| Backbone | Style | Lr schd | MS train | Mem (GB) | Box AP | Config | Download | +| :-------: | :---: | :-----: | :------: | :------: | :----: | :------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| ResNet-50 | caffe | 1x | True | 3.3 | 40.2 | [config](./centernet-update_r50-caffe_fpn_ms-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco/centernet-update_r50-caffe_fpn_ms-1x_coco_20230512_203845-8306baf2.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco/centernet-update_r50-caffe_fpn_ms-1x_coco_20230512_203845.log.json) | + +CenterNet Update from the paper of [Probabilistic two-stage detection](https://arxiv.org/abs/2103.07461). The author has updated CenterNet to greatly improve performance and convergence speed. 
+The [Details](https://github.com/xingyizhou/CenterNet2/blob/master/docs/MODEL_ZOO.md) are as follows: + +- Using top-left-right-bottom box encoding and GIoU Loss +- Adding regression loss to the center 3x3 region +- Adding more positive pixels for the heatmap loss whose regression loss is small and is within the center3x3 region +- Using RetinaNet-style optimizer (SGD), learning rate rule (0.01 for each batch size 16), and schedule (12 epochs) +- Added FPN neck layers, and assigns objects to FPN levels based on a fixed size range. +- Using standard NMS instead of max pooling + +Note: We found that the performance of the r50 model fluctuates greatly and sometimes it does not converge. If the model does not converge, you can try running it again or reduce the learning rate. + +## Citation + +```latex +@article{zhou2019objects, + title={Objects as Points}, + author={Zhou, Xingyi and Wang, Dequan and Kr{\"a}henb{\"u}hl, Philipp}, + booktitle={arXiv preprint arXiv:1904.07850}, + year={2019} +} +``` diff --git a/mmdetection/configs/centernet/centernet-update_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/centernet/centernet-update_r101_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..4fc65e0 --- /dev/null +++ b/mmdetection/configs/centernet/centernet-update_r101_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/centernet/centernet-update_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/centernet/centernet-update_r18_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..ab3ae32 --- /dev/null +++ b/mmdetection/configs/centernet/centernet-update_r18_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py b/mmdetection/configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py new file mode 100644 index 0000000..1f6e2b3 --- /dev/null +++ b/mmdetection/configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py @@ -0,0 +1,105 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='CenterNet', + # use caffe img_norm + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5, + # There is a chance to get 40.3 after switching init_cfg, + # otherwise it is about 39.9~40.1 + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), + relu_before_extra_convs=True), + bbox_head=dict( + type='CenterNetUpdateHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + 
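+        # one stride per FPN output (P3-P7): the neck above is built with
+        # start_level=1 and num_outs=5, so the head receives five feature levels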
hm_min_radius=4, + hm_min_overlap=0.8, + more_pos_thresh=0.2, + more_pos_topk=9, + soft_weight_on_reg=False, + loss_cls=dict( + type='GaussianFocalLoss', + pos_weight=0.25, + neg_weight=0.75, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + ), + train_cfg=None, + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# single-scale training is about 39.3 +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.00025, + by_epoch=False, + begin=0, + end=4000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(lr=0.01), + # Experiments show that there is no need to turn on clip_grad. + paramwise_cfg=dict(norm_decay_mult=0.)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/centernet/centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/centernet/centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..34e0c68 --- /dev/null +++ b/mmdetection/configs/centernet/centernet-update_r50_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,83 @@ +_base_ = '../common/lsj-200e_coco-detection.py' + +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +model = dict( + type='CenterNet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5, + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), + relu_before_extra_convs=True), + bbox_head=dict( + type='CenterNetUpdateHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='GaussianFocalLoss', + pos_weight=0.25, + neg_weight=0.75, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + ), + train_cfg=None, + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_dataloader = dict(batch_size=8, num_workers=4) +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
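+# lr below follows the 0.01-per-16-images rule noted in the README:
+# 8 GPUs x 8 samples per GPU = 64 images per iteration -> lr = 0.01 * 4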
+optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004), + paramwise_cfg=dict(norm_decay_mult=0.)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.00025, + by_epoch=False, + begin=0, + end=4000), + dict( + type='MultiStepLR', + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py b/mmdetection/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py new file mode 100644 index 0000000..732a55d --- /dev/null +++ b/mmdetection/configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py @@ -0,0 +1,136 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', + './centernet_tta.py' +] + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# model settings +model = dict( + type='CenterNet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNet', + depth=18, + norm_eval=False, + norm_cfg=dict(type='BN'), + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict( + type='CTResNetNeck', + in_channels=512, + num_deconv_filters=(256, 128, 64), + num_deconv_kernels=(4, 4, 4), + use_dcn=True), + bbox_head=dict( + type='CenterNetHead', + num_classes=80, + in_channels=64, + feat_channels=64, + loss_center_heatmap=dict(type='GaussianFocalLoss', loss_weight=1.0), + loss_wh=dict(type='L1Loss', loss_weight=0.1), + loss_offset=dict(type='L1Loss', loss_weight=1.0)), + train_cfg=None, + test_cfg=dict(topk=100, local_maximum_kernel=3, max_per_img=100)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='RandomCenterCropPad', + # The cropped images are padded into squares during training, + # but may be less than crop_size. + crop_size=(512, 512), + ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3), + mean=[0, 0, 0], + std=[1, 1, 1], + to_rgb=True, + test_pad_mode=None), + # Make sure the output is always crop_size. 
+ dict(type='Resize', scale=(512, 512), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args={{_base_.backend_args}}, + to_float32=True), + # don't need Resize + dict( + type='RandomCenterCropPad', + ratios=None, + border=None, + mean=[0, 0, 0], + std=[1, 1, 1], + to_rgb=True, + test_mode=True, + test_pad_mode=['logical_or', 31], + test_pad_add_pix=1), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'border')) +] + +# Use RepeatDataset to speed up training +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args={{_base_.backend_args}}, + ))) + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +# Based on the default settings of modern detectors, the SGD effect is better +# than the Adam in the source code, so we use SGD default settings and +# if you use adam+lr5e-4, the map is 29.1. +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) + +max_epochs = 28 +# learning policy +# Based on the default settings of modern detectors, we added warmup settings. +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[18, 24], # the real step is [18*5, 24*5] + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) # the real epoch is 28*5=140 + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/mmdetection/configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py b/mmdetection/configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py new file mode 100644 index 0000000..6094b64 --- /dev/null +++ b/mmdetection/configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py @@ -0,0 +1,3 @@ +_base_ = './centernet_r18-dcnv2_8xb16-crop512-140e_coco.py' + +model = dict(neck=dict(use_dcn=False)) diff --git a/mmdetection/configs/centernet/centernet_tta.py b/mmdetection/configs/centernet/centernet_tta.py new file mode 100644 index 0000000..edd7b03 --- /dev/null +++ b/mmdetection/configs/centernet/centernet_tta.py @@ -0,0 +1,39 @@ +# This is different from the TTA of official CenterNet. + +tta_model = dict( + type='DetTTAModel', + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.5), max_per_img=100)) + +tta_pipeline = [ + dict(type='LoadImageFromFile', to_float32=True, backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + # ``RandomFlip`` must be placed before ``RandomCenterCropPad``, + # otherwise bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) 
+ ], + [ + dict( + type='RandomCenterCropPad', + ratios=None, + border=None, + mean=[0, 0, 0], + std=[1, 1, 1], + to_rgb=True, + test_mode=True, + test_pad_mode=['logical_or', 31], + test_pad_add_pix=1), + ], + [dict(type='LoadAnnotations', with_bbox=True)], + [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'flip', 'flip_direction', 'border')) + ] + ]) +] diff --git a/mmdetection/configs/centernet/metafile.yml b/mmdetection/configs/centernet/metafile.yml new file mode 100644 index 0000000..496b8ea --- /dev/null +++ b/mmdetection/configs/centernet/metafile.yml @@ -0,0 +1,60 @@ +Collections: + - Name: CenterNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x TITANXP GPUs + Architecture: + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.07850 + Title: 'Objects as Points' + README: configs/centernet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.13.0/mmdet/models/detectors/centernet.py#L10 + Version: v2.13.0 + +Models: + - Name: centernet_r18-dcnv2_8xb16-crop512-140e_coco + In Collection: CenterNet + Config: configs/centernet/centernet_r18-dcnv2_8xb16-crop512-140e_coco.py + Metadata: + Batch Size: 128 + Training Memory (GB): 3.47 + Epochs: 140 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 29.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_dcnv2_140e_coco/centernet_resnet18_dcnv2_140e_coco_20210702_155131-c8cd631f.pth + + - Name: centernet_r18_8xb16-crop512-140e_coco + In Collection: CenterNet + Config: configs/centernet/centernet_r18_8xb16-crop512-140e_coco.py + Metadata: + Batch Size: 128 + Training Memory (GB): 3.45 + Epochs: 140 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 25.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/centernet/centernet_resnet18_140e_coco/centernet_resnet18_140e_coco_20210705_093630-bb5b3bf7.pth + + - Name: centernet-update_r50-caffe_fpn_ms-1x_coco + In Collection: CenterNet + Config: configs/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco.py + Metadata: + Batch Size: 16 + Training Memory (GB): 3.3 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v3.0/centernet/centernet-update_r50-caffe_fpn_ms-1x_coco/centernet-update_r50-caffe_fpn_ms-1x_coco_20230512_203845-8306baf2.pth diff --git a/mmdetection/configs/centripetalnet/README.md b/mmdetection/configs/centripetalnet/README.md new file mode 100644 index 0000000..21edbd2 --- /dev/null +++ b/mmdetection/configs/centripetalnet/README.md @@ -0,0 +1,36 @@ +# CentripetalNet + +> [CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection](https://arxiv.org/abs/2003.09119) + + + +## Abstract + +Keypoint-based detectors have achieved pretty-well performance. However, incorrect keypoint matching is still widespread and greatly affects the performance of the detector. In this paper, we propose CentripetalNet which uses centripetal shift to pair corner keypoints from the same instance. CentripetalNet predicts the position and the centripetal shift of the corner points and matches corners whose shifted results are aligned. Combining position information, our approach matches corner points more accurately than the conventional embedding approaches do. Corner pooling extracts information inside the bounding boxes onto the border. 
To make the corners more aware of this information, we design a cross-star deformable convolution network to conduct feature adaptation. Furthermore, we explore instance segmentation on anchor-free detectors by equipping our CentripetalNet with a mask prediction module. On MS-COCO test-dev, our CentripetalNet not only outperforms all existing anchor-free detectors with an AP of 48.0% but also achieves comparable performance to the state-of-the-art instance segmentation approaches with a 40.2% mask AP.
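The pairing step the abstract describes can be pictured with a small sketch: each top-left / bottom-right corner candidate predicts a centripetal shift pointing at its box centre, and a pair is kept only when both shifted positions agree. The tolerance, the normalisation by the pair diagonal and the function name are illustrative assumptions, not the `CentripetalHead` decoder used in the config below.

```python
# Illustrative corner pairing via centripetal shifts (not the mmdet decoder).
import torch


def match_corners(tl_xy, tl_shift, br_xy, br_shift, tol=0.2):
    """tl_xy: (N, 2), br_xy: (M, 2) corner positions; *_shift point to centres."""
    tl_ctr = tl_xy + tl_shift                                  # (N, 2)
    br_ctr = br_xy + br_shift                                  # (M, 2)
    pair_ctr = (tl_xy[:, None] + br_xy[None, :]) / 2           # (N, M, 2)
    # both shifted centres must land near the pair's geometric centre
    dist = torch.maximum((tl_ctr[:, None] - pair_ctr).norm(dim=-1),
                         (br_ctr[None, :] - pair_ctr).norm(dim=-1))
    diag = (br_xy[None, :] - tl_xy[:, None]).norm(dim=-1).clamp(min=1e-6)
    ordered = (br_xy[None, :, 0] > tl_xy[:, None, 0]) & \
              (br_xy[None, :, 1] > tl_xy[:, None, 1])
    return (dist / diag < tol) & ordered                       # (N, M) matches


tl = torch.tensor([[10., 10.]])
br = torch.tensor([[50., 60.], [300., 300.]])
# the true pair's shifts point at its centre (30, 35); the far corner's do not
print(match_corners(tl, torch.tensor([[20., 25.]]),
                    br, torch.tensor([[-20., -25.], [-10., -10.]])))
# -> tensor([[ True, False]])
```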
    + +## Results and Models + +| Backbone | Batch Size | Step/Total Epochs | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :--------------: | :-----------------------------------------------------------------------: | :---------------: | :------: | :------------: | :----: | :-----------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| HourglassNet-104 | [16 x 6](./centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py) | 190/210 | 16.7 | 3.7 | 44.8 | [config](./centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804.log.json) | + +Note: + +- TTA setting is single-scale and `flip=True`. If you want to reproduce the TTA performance, please add `--tta` in the test command. +- The model we released is the best checkpoint rather than the latest checkpoint (box AP 44.8 vs 44.6 in our experiment). + +## Citation + +```latex +@InProceedings{Dong_2020_CVPR, +author = {Dong, Zhiwei and Li, Guoxuan and Liao, Yue and Wang, Fei and Ren, Pengju and Qian, Chen}, +title = {CentripetalNet: Pursuing High-Quality Keypoint Pairs for Object Detection}, +booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, +month = {June}, +year = {2020} +} +``` diff --git a/mmdetection/configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py b/mmdetection/configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py new file mode 100644 index 0000000..b757ffd --- /dev/null +++ b/mmdetection/configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py @@ -0,0 +1,181 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py' +] + +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True) + +# model settings +model = dict( + type='CornerNet', + data_preprocessor=data_preprocessor, + backbone=dict( + type='HourglassNet', + downsample_times=5, + num_stacks=2, + stage_channels=[256, 256, 384, 384, 384, 512], + stage_blocks=[2, 2, 2, 2, 2, 4], + norm_cfg=dict(type='BN', requires_grad=True)), + neck=None, + bbox_head=dict( + type='CentripetalHead', + num_classes=80, + in_channels=256, + num_feat_levels=2, + corner_emb_channels=0, + loss_heatmap=dict( + type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1), + loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1), + loss_guiding_shift=dict( + type='SmoothL1Loss', beta=1.0, loss_weight=0.05), + loss_centripetal_shift=dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1)), + # training and testing settings + train_cfg=None, + test_cfg=dict( + corner_topk=100, + local_maximum_kernel=3, + distance_threshold=0.5, + 
score_thr=0.05, + max_per_img=100, + nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'))) + +# data settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + # The cropped images are padded into squares during training, + # but may be smaller than crop_size. + type='RandomCenterCropPad', + crop_size=(511, 511), + ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3), + test_mode=False, + test_pad_mode=None, + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + # Image data is not converted to rgb. + to_rgb=data_preprocessor['bgr_to_rgb']), + dict(type='Resize', scale=(511, 511), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs'), +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + to_float32=True, + backend_args=_base_.backend_args), + # don't need Resize + dict( + type='RandomCenterCropPad', + crop_size=None, + ratios=None, + border=None, + test_mode=True, + test_pad_mode=['logical_or', 127], + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + # Image data is not converted to rgb. + to_rgb=data_preprocessor['bgr_to_rgb']), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'border')) +] + +train_dataloader = dict( + batch_size=6, + num_workers=3, + batch_sampler=None, + dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='Adam', lr=0.0005), + clip_grad=dict(max_norm=35, norm_type=2)) + +max_epochs = 210 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[190], + gamma=0.1) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (6 samples per GPU) +auto_scale_lr = dict(base_batch_size=96) + +tta_model = dict( + type='DetTTAModel', + tta_cfg=dict( + nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'), + max_per_img=100)) + +tta_pipeline = [ + dict( + type='LoadImageFromFile', + to_float32=True, + backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + [ + # ``RandomFlip`` must be placed before ``RandomCenterCropPad``, + # otherwise bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) + ], + [ + dict( + type='RandomCenterCropPad', + crop_size=None, + ratios=None, + border=None, + test_mode=True, + test_pad_mode=['logical_or', 127], + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + # Image data is not converted to rgb. 
+ to_rgb=data_preprocessor['bgr_to_rgb']) + ], + [dict(type='LoadAnnotations', with_bbox=True)], + [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'flip', 'flip_direction', 'border')) + ] + ]) +] diff --git a/mmdetection/configs/centripetalnet/metafile.yml b/mmdetection/configs/centripetalnet/metafile.yml new file mode 100644 index 0000000..526572d --- /dev/null +++ b/mmdetection/configs/centripetalnet/metafile.yml @@ -0,0 +1,39 @@ +Collections: + - Name: CentripetalNet + Metadata: + Training Data: COCO + Training Techniques: + - Adam + Training Resources: 16x V100 GPUs + Architecture: + - Corner Pooling + - Stacked Hourglass Network + Paper: + URL: https://arxiv.org/abs/2003.09119 + Title: 'CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection' + README: configs/centripetalnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.5.0/mmdet/models/detectors/cornernet.py#L9 + Version: v2.5.0 + +Models: + - Name: centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco + In Collection: CentripetalNet + Config: configs/centripetalnet/centripetalnet_hourglass104_16xb6-crop511-210e-mstest_coco.py + Metadata: + Batch Size: 96 + Training Memory (GB): 16.7 + inference time (ms/im): + - value: 270.27 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 210 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/centripetalnet/centripetalnet_hourglass104_mstest_16x6_210e_coco/centripetalnet_hourglass104_mstest_16x6_210e_coco_20200915_204804-3ccc61e5.pth diff --git a/mmdetection/configs/cityscapes/README.md b/mmdetection/configs/cityscapes/README.md new file mode 100644 index 0000000..9e37b64 --- /dev/null +++ b/mmdetection/configs/cityscapes/README.md @@ -0,0 +1,46 @@ +# Cityscapes + +> [The Cityscapes Dataset for Semantic Urban Scene Understanding](https://arxiv.org/abs/1604.01685) + + + +## Abstract + +Visual understanding of complex urban street scenes is an enabling factor for a wide range of applications. Object detection has benefited enormously from large-scale datasets, especially in the context of deep learning. For semantic urban scene understanding, however, no current dataset adequately captures the complexity of real-world urban scenes. +To address this, we introduce Cityscapes, a benchmark suite and large-scale dataset to train and test approaches for pixel-level and instance-level semantic labeling. Cityscapes is comprised of a large, diverse set of stereo video sequences recorded in streets from 50 different cities. 5000 of these images have high quality pixel-level annotations; 20000 additional images have coarse annotations to enable methods that leverage large volumes of weakly-labeled data. Crucially, our effort exceeds previous attempts in terms of dataset size, annotation richness, scene variability, and complexity. Our accompanying empirical study provides an in-depth analysis of the dataset characteristics, as well as a performance evaluation of several state-of-the-art approaches based on our benchmark. + +
    + +## Common settings + +- All baselines were trained using 8 GPU with a batch size of 8 (1 images per GPU) using the [linear scaling rule](https://arxiv.org/abs/1706.02677) to scale the learning rate. +- All models were trained on `cityscapes_train`, and tested on `cityscapes_val`. +- 1x training schedule indicates 64 epochs which corresponds to slightly less than the 24k iterations reported in the original schedule from the [Mask R-CNN paper](https://arxiv.org/abs/1703.06870) +- COCO pre-trained weights are used to initialize. +- A conversion [script](../../tools/dataset_converters/cityscapes.py) is provided to convert Cityscapes into COCO format. Please refer to [install.md](../../docs/1_exist_data_model.md#prepare-datasets) for details. +- `CityscapesDataset` implemented three evaluation methods. `bbox` and `segm` are standard COCO bbox/mask AP. `cityscapes` is the cityscapes dataset official evaluation, which may be slightly higher than COCO. + +### Faster R-CNN + +| Backbone | Style | Lr schd | Scale | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :-----: | :-----: | :------: | :------: | :------------: | :----: | :----------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 800-1024 | 5.2 | - | 40.3 | [config](./faster-rcnn_r50_fpn_1x_cityscapes.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes_20200502-829424c0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/faster_rcnn_r50_fpn_1x_cityscapes_20200502_114915.log.json) | + +### Mask R-CNN + +| Backbone | Style | Lr schd | Scale | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :------: | :-----: | :-----: | :------: | :------: | :------------: | :----: | :-----: | :--------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 800-1024 | 5.3 | - | 40.9 | 36.4 | [config](./mask-rcnn_r50_fpn_1x_cityscapes.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes/mask_rcnn_r50_fpn_1x_cityscapes_20201211_133733-d2858245.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cityscapes/mask_rcnn_r50_fpn_1x_cityscapes/mask_rcnn_r50_fpn_1x_cityscapes_20201211_133733.log.json) | + +## Citation + +```latex +@inproceedings{Cordts2016Cityscapes, + title={The Cityscapes Dataset for Semantic Urban Scene Understanding}, + author={Cordts, Marius and Omran, Mohamed and Ramos, Sebastian and Rehfeld, Timo and Enzweiler, Markus and Benenson, Rodrigo and Franke, Uwe and Roth, Stefan and Schiele, Bernt}, + booktitle={Proc. 
of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2016} +} +``` diff --git a/mmdetection/configs/cityscapes/faster-rcnn_r50_fpn_1x_cityscapes.py b/mmdetection/configs/cityscapes/faster-rcnn_r50_fpn_1x_cityscapes.py new file mode 100644 index 0000000..ccd0de2 --- /dev/null +++ b/mmdetection/configs/cityscapes/faster-rcnn_r50_fpn_1x_cityscapes.py @@ -0,0 +1,41 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/cityscapes_detection.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py' +] +model = dict( + backbone=dict(init_cfg=None), + roi_head=dict( + bbox_head=dict( + num_classes=8, + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))) + +# optimizer +# lr is set for a batch size of 8 +optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + # [7] yields higher performance than [6] + milestones=[7], + gamma=0.1) +] + +# actual epoch = 8 * 8 = 64 +train_cfg = dict(max_epochs=8) + +# For better, more stable performance initialize from COCO +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth' # noqa + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +# TODO: support auto scaling lr +# auto_scale_lr = dict(base_batch_size=8) diff --git a/mmdetection/configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py b/mmdetection/configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py new file mode 100644 index 0000000..772268b --- /dev/null +++ b/mmdetection/configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py @@ -0,0 +1,43 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/cityscapes_instance.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py' +] +model = dict( + backbone=dict(init_cfg=None), + roi_head=dict( + bbox_head=dict( + type='Shared2FCBBoxHead', + num_classes=8, + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_head=dict(num_classes=8))) + +# optimizer +# lr is set for a batch size of 8 +optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + # [7] yields higher performance than [6] + milestones=[7], + gamma=0.1) +] + +# actual epoch = 8 * 8 = 64 +train_cfg = dict(max_epochs=8) + +# For better, more stable performance initialize from COCO +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth' # noqa + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (1 samples per GPU) +# TODO: support auto scaling lr +# auto_scale_lr = dict(base_batch_size=8) diff --git a/mmdetection/configs/common/lsj-100e_coco-detection.py b/mmdetection/configs/common/lsj-100e_coco-detection.py new file mode 100644 index 0000000..bb631e5 --- /dev/null +++ b/mmdetection/configs/common/lsj-100e_coco-detection.py @@ -0,0 +1,122 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +image_size = (1024, 1024) + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +# Use RepeatDataset to speed up training +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=4, # simply change this from 2 to 16 for 50e - 400e training. 
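+            # with max_epochs = 25 below, times=4 gives 4 x 25 = 100 effective
+            # epochs; the 200e variant (lsj-200e_coco-detection.py) sets times=8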
+ dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +max_epochs = 25 + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer assumes bs=64 +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks = dict(checkpoint=dict(max_keep_ckpts=2)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/common/lsj-100e_coco-instance.py b/mmdetection/configs/common/lsj-100e_coco-instance.py new file mode 100644 index 0000000..6e62729 --- /dev/null +++ b/mmdetection/configs/common/lsj-100e_coco-instance.py @@ -0,0 +1,122 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +image_size = (1024, 1024) + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +# Use RepeatDataset to speed up training +train_dataloader = dict( + 
batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=4, # simply change this from 2 to 16 for 50e - 400e training. + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +max_epochs = 25 + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer assumes bs=64 +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks = dict(checkpoint=dict(max_keep_ckpts=2)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/common/lsj-200e_coco-detection.py b/mmdetection/configs/common/lsj-200e_coco-detection.py new file mode 100644 index 0000000..83d1294 --- /dev/null +++ b/mmdetection/configs/common/lsj-200e_coco-detection.py @@ -0,0 +1,18 @@ +_base_ = './lsj-100e_coco-detection.py' + +# 8x25=200e +train_dataloader = dict(dataset=dict(times=8)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] diff --git a/mmdetection/configs/common/lsj-200e_coco-instance.py b/mmdetection/configs/common/lsj-200e_coco-instance.py new file mode 100644 index 0000000..af3e4bf --- /dev/null +++ b/mmdetection/configs/common/lsj-200e_coco-instance.py @@ -0,0 +1,18 @@ +_base_ = './lsj-100e_coco-instance.py' + +# 8x25=200e +train_dataloader = dict(dataset=dict(times=8)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] diff --git a/mmdetection/configs/common/ms-90k_coco.py b/mmdetection/configs/common/ms-90k_coco.py new file mode 100644 index 0000000..e2d6c3d --- /dev/null +++ b/mmdetection/configs/common/ms-90k_coco.py @@ -0,0 +1,122 @@ +_base_ = '../_base_/default_runtime.py' + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Align with Detectron2 +backend = 'pillow' +train_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=backend_args, + imdecode_backend=backend), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True, + backend=backend), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=backend_args, + imdecode_backend=backend), + dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + 
sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# training schedule for 90k +max_iter = 90000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iter, val_interval=10000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) + +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000)) +log_processor = dict(by_epoch=False) diff --git a/mmdetection/configs/common/ms-poly-90k_coco-instance.py b/mmdetection/configs/common/ms-poly-90k_coco-instance.py new file mode 100644 index 0000000..d5566b3 --- /dev/null +++ b/mmdetection/configs/common/ms-poly-90k_coco-instance.py @@ -0,0 +1,130 @@ +_base_ = '../_base_/default_runtime.py' + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Align with Detectron2 +backend = 'pillow' +train_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=backend_args, + imdecode_backend=backend), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True, + backend=backend), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=backend_args, + imdecode_backend=backend), + dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + 
data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# training schedule for 90k +max_iter = 90000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iter, val_interval=10000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) + +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000)) +log_processor = dict(by_epoch=False) diff --git a/mmdetection/configs/common/ms-poly_3x_coco-instance.py b/mmdetection/configs/common/ms-poly_3x_coco-instance.py new file mode 100644 index 0000000..04072f9 --- /dev/null +++ b/mmdetection/configs/common/ms-poly_3x_coco-instance.py @@ -0,0 +1,118 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs'), +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + 
batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + backend_args=backend_args) +test_evaluator = val_evaluator + +# training schedule for 3x with `RepeatDataset` +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +# Experiments show that using milestones=[9, 11] has higher performance +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[9, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/common/ms_3x_coco-instance.py b/mmdetection/configs/common/ms_3x_coco-instance.py new file mode 100644 index 0000000..f80cf88 --- /dev/null +++ b/mmdetection/configs/common/ms_3x_coco-instance.py @@ -0,0 +1,108 @@ +_base_ = '../_base_/default_runtime.py' + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +# training schedule for 3x with `RepeatDataset` +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +# Experiments show that using milestones=[9, 11] has higher performance +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[9, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/common/ms_3x_coco.py b/mmdetection/configs/common/ms_3x_coco.py new file mode 100644 index 0000000..facbb34 --- /dev/null +++ b/mmdetection/configs/common/ms_3x_coco.py @@ -0,0 +1,108 @@ +_base_ = '../_base_/default_runtime.py' + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +# training schedule for 3x with `RepeatDataset` +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +# Experiments show that using milestones=[9, 11] has higher performance +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[9, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/common/ssj_270k_coco-instance.py b/mmdetection/configs/common/ssj_270k_coco-instance.py new file mode 100644 index 0000000..7407644 --- /dev/null +++ b/mmdetection/configs/common/ssj_270k_coco-instance.py @@ -0,0 +1,125 @@ +_base_ = '../_base_/default_runtime.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +image_size = (1024, 1024) + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Standard Scale Jittering (SSJ) resizes and crops an image +# with a resize range of 0.8 to 1.25 of the original image size. +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.8, 1.25), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='InfiniteSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# The model is trained by 270k iterations with batch_size 64, +# which is roughly equivalent to 144 epochs. 
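+# (i.e. 270000 iters x 64 images per iter / ~120k train2017 images ≈ 144
+#  epochs, treating COCO train2017's ~118k images as roughly 120k)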
+ +max_iters = 270000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iters, val_interval=10000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# optimizer assumes bs=64 +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00004)) + +# learning rate policy +# lr steps at [0.9, 0.95, 0.975] of the maximum iterations +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=270000, + by_epoch=False, + milestones=[243000, 256500, 263250], + gamma=0.1) +] + +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000)) +log_processor = dict(by_epoch=False) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/common/ssj_scp_270k_coco-instance.py b/mmdetection/configs/common/ssj_scp_270k_coco-instance.py new file mode 100644 index 0000000..06159dd --- /dev/null +++ b/mmdetection/configs/common/ssj_scp_270k_coco-instance.py @@ -0,0 +1,60 @@ +_base_ = 'ssj_270k_coco-instance.py' +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +image_size = (1024, 1024) + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Standard Scale Jittering (SSJ) resizes and crops an image +# with a resize range of 0.8 to 1.25 of the original image size. +load_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.8, 1.25), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=image_size), +] +train_pipeline = [ + dict(type='CopyPaste', max_num_pasted=100), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='MultiImageMixDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=load_pipeline, + backend_args=backend_args), + pipeline=train_pipeline)) diff --git a/mmdetection/configs/condinst/README.md b/mmdetection/configs/condinst/README.md new file mode 100644 index 0000000..01deb0e --- /dev/null +++ b/mmdetection/configs/condinst/README.md @@ -0,0 +1,40 @@ +# CondInst + +> [CondInst: Conditional Convolutions for Instance +> Segmentation](https://arxiv.org/pdf/2003.05664.pdf) + + + +## Abstract + +We propose a simple yet effective instance segmentation framework, termed CondInst (conditional convolutions for instance segmentation). 
Top-performing instance segmentation methods such as Mask R-CNN rely on ROI operations (typically ROIPool or ROIAlign) to obtain the final instance masks. In contrast, we propose to solve instance segmentation from a new perspective. Instead of using instance-wise ROIs as inputs to a network of fixed weights, we employ dynamic instance-aware networks, conditioned on instances. CondInst enjoys two advantages: 1) instance segmentation is solved by a fully convolutional network, eliminating the need for ROI cropping and feature alignment; 2) due to the much improved capacity of dynamically generated conditional convolutions, the mask head can be very compact (e.g., 3 conv layers, each having only 8 channels), leading to significantly faster inference. We demonstrate a simpler instance segmentation method that can achieve improved performance in both accuracy and inference speed. On the COCO dataset, we outperform a few recent methods, including well-tuned Mask R-CNN baselines, without longer training schedules needed.
+
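+The gist of the dynamic mask head can be sketched as follows (an illustrative, per-instance example only, not mmdet's `CondInstMaskHead`): the bbox head predicts a flat vector of 169 filter parameters per instance, which is sliced into three 1x1 conv layers and applied to the 8-channel mask features concatenated with 2 relative-coordinate channels (10x8+8 + 8x8+8 + 8x1+1 = 169, matching `num_params=169` in the config below).
+
+```python
+import torch
+import torch.nn.functional as F
+
+def dynamic_mask_head(mask_feats, rel_coords, params):
+    """mask_feats: (8, H, W) mask-branch features; rel_coords: (2, H, W)
+    coordinates relative to the instance location; params: (169,) weights
+    and biases predicted for this instance by the bbox head."""
+    x = torch.cat([mask_feats, rel_coords], dim=0).unsqueeze(0)  # (1, 10, H, W)
+    layer_shapes = [(10, 8), (8, 8), (8, 1)]  # three 1x1 convs, 8 channels wide
+    idx = 0
+    for i, (cin, cout) in enumerate(layer_shapes):
+        w = params[idx:idx + cout * cin].view(cout, cin, 1, 1)
+        idx += cout * cin
+        b = params[idx:idx + cout]
+        idx += cout
+        x = F.conv2d(x, w, b)
+        if i < len(layer_shapes) - 1:
+            x = F.relu(x)
+    return x.sigmoid()  # (1, 1, H, W) soft mask for this instance
+```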
    + +## Results and Models + +| Backbone | Style | MS train | Lr schd | bbox AP | mask AP | Config | Download | +| :------: | :-----: | :------: | :-----: | :-----: | :-----: | :-------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | Y | 1x | 39.8 | 36.0 | [config](./condinst_r50_fpn_ms-poly-90k_coco_instance.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance/condinst_r50_fpn_ms-poly-90k_coco_instance_20221129_125223-4c186406.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance/condinst_r50_fpn_ms-poly-90k_coco_instance_20221129_125223.json) | + +## Citation + +```latex +@inproceedings{tian2020conditional, + title = {Conditional Convolutions for Instance Segmentation}, + author = {Tian, Zhi and Shen, Chunhua and Chen, Hao}, + booktitle = {Proc. Eur. Conf. Computer Vision (ECCV)}, + year = {2020} +} +``` diff --git a/mmdetection/configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py b/mmdetection/configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py new file mode 100644 index 0000000..39639d8 --- /dev/null +++ b/mmdetection/configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py @@ -0,0 +1,85 @@ +_base_ = '../common/ms-poly-90k_coco-instance.py' + +# model settings +model = dict( + type='CondInst', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='CondInstBboxHead', + num_params=169, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + norm_on_bbox=True, + centerness_on_reg=True, + dcn_on_last_conv=False, + center_sampling=True, + conv_bias=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + mask_head=dict( + type='CondInstMaskHead', + num_layers=3, + feat_channels=8, + size_of_interest=8, + mask_out_stride=4, + max_masks_to_train=300, + mask_feature_head=dict( + in_channels=256, + feat_channels=128, + start_level=0, + end_level=2, + out_channels=8, + mask_stride=8, + num_stacked_convs=4, + norm_cfg=dict(type='BN', requires_grad=True)), + loss_mask=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + eps=5e-6, + loss_weight=1.0)), + # model training and testing settings + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100, + 
mask_thr=0.5)) + +# optimizer +optim_wrapper = dict(optimizer=dict(lr=0.01)) diff --git a/mmdetection/configs/condinst/metafile.yml b/mmdetection/configs/condinst/metafile.yml new file mode 100644 index 0000000..1237b74 --- /dev/null +++ b/mmdetection/configs/condinst/metafile.yml @@ -0,0 +1,32 @@ +Collections: + - Name: CondInst + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - FPN + - FCOS + - ResNet + Paper: https://arxiv.org/abs/2003.05664 + README: configs/condinst/README.md + +Models: + - Name: condinst_r50_fpn_ms-poly-90k_coco_instance + In Collection: CondInst + Config: configs/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance.py + Metadata: + Training Memory (GB): 4.4 + Iterations: 90000 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v3.0/condinst/condinst_r50_fpn_ms-poly-90k_coco_instance/condinst_r50_fpn_ms-poly-90k_coco_instance_20221129_125223-4c186406.pth diff --git a/mmdetection/configs/conditional_detr/README.md b/mmdetection/configs/conditional_detr/README.md new file mode 100644 index 0000000..4043571 --- /dev/null +++ b/mmdetection/configs/conditional_detr/README.md @@ -0,0 +1,39 @@ +# Conditional DETR + +> [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) + + + +## Abstract + +The DETR approach applies the transformer encoder and decoder architecture to object detection and achieves promising performance. In this paper, we handle the critical issue, slow training convergence, and present a conditional cross-attention mechanism for fast DETR training. Our approach is motivated by that the cross-attention in DETR relies highly on the content embeddings and that the spatial embeddings make minor contributions, increasing the need for high-quality content embeddings and thus increasing the training difficulty. + +
    + +Our conditional DETR learns a conditional spatial query from the decoder embedding for decoder multi-head cross-attention. The benefit is that through the conditional spatial query, each cross-attention head is able to attend to a band containing a distinct region, e.g., one object extremity or a region inside the object box (Figure 1). This narrows down the spatial range for localizing the distinct regions for object classification and box regression, thus relaxing the dependence on the content embeddings and easing the training. Empirical results show that conditional DETR converges 6.7x faster for the backbones R50 and R101 and 10x faster for stronger backbones DC5-R50 and DC5-R101. + +
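+A rough sketch of the mechanism (illustrative only, not the mmdet `ConditionalDETR` modules; `to_scale` stands in for the small learned network that maps the decoder embedding to an element-wise spatial transformation):
+
+```python
+import torch
+
+def conditional_attn_logits(q_content, decoder_embed, ref_pos_embed,
+                            k_content, k_pos_embed, to_scale):
+    """All tensors share the feature dim; `to_scale` is a learned module
+    (e.g. a linear layer, assumed here) conditioned on the decoder embedding."""
+    # conditional spatial query: the reference-point positional embedding
+    # modulated by a transform predicted from the decoder embedding
+    q_spatial = to_scale(decoder_embed) * ref_pos_embed
+    q = torch.cat([q_content, q_spatial], dim=-1)
+    k = torch.cat([k_content, k_pos_embed], dim=-1)
+    # concatenating content and spatial parts makes the attention logit
+    # decompose into a content term plus a spatial term
+    return (q * k).sum(dim=-1)
+```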
    + +## Results and Models + +We provide the config files and models for Conditional DETR: [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152). + +| Backbone | Model | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :--------------: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | Conditional DETR | 50e | | | 41.1 | [config](./conditional-detr_r50_8xb2-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/conditional_detr/conditional-detr_r50_8xb2-50e_coco/conditional-detr_r50_8xb2-50e_coco_20221121_180202-c83a1dc0.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/conditional_detr/conditional-detr_r50_8xb2-50e_coco/conditional-detr_r50_8xb2-50e_coco_20221121_180202.log.json) | + +## Citation + +```latex +@inproceedings{meng2021-CondDETR, + title = {Conditional DETR for Fast Training Convergence}, + author = {Meng, Depu and Chen, Xiaokang and Fan, Zejia and Zeng, Gang and Li, Houqiang and Yuan, Yuhui and Sun, Lei and Wang, Jingdong}, + booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)}, + year = {2021} +} +``` diff --git a/mmdetection/configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py b/mmdetection/configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py new file mode 100644 index 0000000..a214764 --- /dev/null +++ b/mmdetection/configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py @@ -0,0 +1,42 @@ +_base_ = ['../detr/detr_r50_8xb2-150e_coco.py'] +model = dict( + type='ConditionalDETR', + num_queries=300, + decoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict( + _delete_=True, + embed_dims=256, + num_heads=8, + attn_drop=0.1, + cross_attn=False), + cross_attn_cfg=dict( + _delete_=True, + embed_dims=256, + num_heads=8, + attn_drop=0.1, + cross_attn=True))), + bbox_head=dict( + type='ConditionalDETRHead', + loss_cls=dict( + _delete_=True, + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ]))) + +# learning policy +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=50, val_interval=1) + +param_scheduler = [dict(type='MultiStepLR', end=50, milestones=[40])] diff --git a/mmdetection/configs/conditional_detr/metafile.yml b/mmdetection/configs/conditional_detr/metafile.yml new file mode 100644 index 0000000..83f5532 --- /dev/null +++ b/mmdetection/configs/conditional_detr/metafile.yml @@ -0,0 +1,32 @@ +Collections: + - Name: Conditional DETR + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 8x A100 GPUs + Architecture: + - ResNet + - Transformer + Paper: + URL: https://arxiv.org/abs/2108.06152 + Title: 'Conditional DETR for Fast Training Convergence' + README: configs/conditional_detr/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/f4112c9e5611468ffbd57cfba548fd1289264b52/mmdet/models/detectors/conditional_detr.py#L14 + Version: v3.0.0rc6 + +Models: + - Name: conditional-detr_r50_8xb2-50e_coco + In Collection: Conditional DETR + Config: configs/conditional_detr/conditional-detr_r50_8xb2-50e_coco.py + Metadata: + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/conditional_detr/conditional-detr_r50_8xb2-50e_coco/conditional-detr_r50_8xb2-50e_coco_20221121_180202-c83a1dc0.pth diff --git a/mmdetection/configs/convnext/README.md b/mmdetection/configs/convnext/README.md new file mode 100644 index 0000000..33497bb --- /dev/null +++ b/mmdetection/configs/convnext/README.md @@ -0,0 +1,42 @@ +# ConvNeXt + +> [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) + + + +## Abstract + +The "Roaring 20s" of visual recognition began with the introduction of Vision Transformers (ViTs), which quickly superseded ConvNets as the state-of-the-art image classification model. A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks. However, the effectiveness of such hybrid approaches is still largely credited to the intrinsic superiority of Transformers, rather than the inherent inductive biases of convolutions. In this work, we reexamine the design spaces and test the limits of what a pure ConvNet can achieve. We gradually "modernize" a standard ResNet toward the design of a vision Transformer, and discover several key components that contribute to the performance difference along the way. The outcome of this exploration is a family of pure ConvNet models dubbed ConvNeXt. Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets. + +
    + +## Results and models + +| Method | Backbone | Pretrain | Lr schd | Multi-scale crop | FP16 | Mem (GB) | box AP | mask AP | Config | Download | +| :----------------: | :--------: | :---------: | :-----: | :--------------: | :--: | :------: | :----: | :-----: | :-------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Mask R-CNN | ConvNeXt-T | ImageNet-1K | 3x | yes | yes | 7.3 | 46.2 | 41.7 | [config](./mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953-050731f4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953.log.json) | +| Cascade Mask R-CNN | ConvNeXt-T | ImageNet-1K | 3x | yes | yes | 9.0 | 50.3 | 43.6 | [config](./cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220509_204200-8f07c40b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220509_204200.log.json) | +| Cascade Mask R-CNN | ConvNeXt-S | ImageNet-1K | 3x | yes | yes | 12.3 | 51.8 | 44.8 | [config](./cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004-3d24f5a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004.log.json) | + +**Note**: + +- ConvNeXt backbone needs to install [MMPreTrain](https://github.com/open-mmlab/mmpretrain) first, which has abundant backbones for downstream tasks. + +```shell +pip install mmpretrain +``` + +- The performance is unstable. `Cascade Mask R-CNN` may fluctuate about 0.2 mAP. 
+ +## Citation + +```bibtex +@article{liu2022convnet, + title={A ConvNet for the 2020s}, + author={Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining}, + journal={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2022} +} +``` diff --git a/mmdetection/configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py b/mmdetection/configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py new file mode 100644 index 0000000..9a5fbed --- /dev/null +++ b/mmdetection/configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py @@ -0,0 +1,26 @@ +_base_ = './cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py' # noqa + +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict( + imports=['mmpretrain.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-small_3rdparty_32xb128-noema_in1k_20220301-303e75e3.pth' # noqa + +model = dict( + backbone=dict( + _delete_=True, + type='mmpretrain.ConvNeXt', + arch='small', + out_indices=[0, 1, 2, 3], + drop_path_rate=0.6, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.'))) + +optim_wrapper = dict(paramwise_cfg={ + 'decay_rate': 0.7, + 'decay_type': 'layer_wise', + 'num_layers': 12 +}) diff --git a/mmdetection/configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py b/mmdetection/configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py new file mode 100644 index 0000000..c92f868 --- /dev/null +++ b/mmdetection/configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py @@ -0,0 +1,154 @@ +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict( + imports=['mmpretrain.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth' # noqa + +model = dict( + backbone=dict( + _delete_=True, + type='mmpretrain.ConvNeXt', + arch='tiny', + out_indices=[0, 1, 2, 3], + drop_path_rate=0.4, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + neck=dict(in_channels=[96, 192, 384, 768]), + roi_head=dict(bbox_head=[ + dict( + type='ConvFCBBoxHead', + num_shared_convs=4, + num_shared_fcs=1, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + reg_decoded_bbox=True, + norm_cfg=dict(type='SyncBN', requires_grad=True), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=10.0)), + dict( + type='ConvFCBBoxHead', + num_shared_convs=4, + num_shared_fcs=1, + in_channels=256, + 
conv_out_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=False, + reg_decoded_bbox=True, + norm_cfg=dict(type='SyncBN', requires_grad=True), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=10.0)), + dict( + type='ConvFCBBoxHead', + num_shared_convs=4, + num_shared_fcs=1, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=False, + reg_decoded_bbox=True, + norm_cfg=dict(type='SyncBN', requires_grad=True), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=10.0)) + ])) + +# augmentation strategy originates from DETR / Sparse RCNN +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] + +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
+optim_wrapper = dict( + type='AmpOptimWrapper', + constructor='LearningRateDecayOptimizerConstructor', + paramwise_cfg={ + 'decay_rate': 0.7, + 'decay_type': 'layer_wise', + 'num_layers': 6 + }, + optimizer=dict( + _delete_=True, + type='AdamW', + lr=0.0002, + betas=(0.9, 0.999), + weight_decay=0.05)) diff --git a/mmdetection/configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py b/mmdetection/configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py new file mode 100644 index 0000000..5792b5b --- /dev/null +++ b/mmdetection/configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py @@ -0,0 +1,96 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict( + imports=['mmpretrain.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth' # noqa + +model = dict( + backbone=dict( + _delete_=True, + type='mmpretrain.ConvNeXt', + arch='tiny', + out_indices=[0, 1, 2, 3], + drop_path_rate=0.4, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + neck=dict(in_channels=[96, 192, 384, 768])) + +# augmentation strategy originates from DETR / Sparse RCNN +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] + +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
+optim_wrapper = dict( + type='AmpOptimWrapper', + constructor='LearningRateDecayOptimizerConstructor', + paramwise_cfg={ + 'decay_rate': 0.95, + 'decay_type': 'layer_wise', + 'num_layers': 6 + }, + optimizer=dict( + _delete_=True, + type='AdamW', + lr=0.0001, + betas=(0.9, 0.999), + weight_decay=0.05, + )) diff --git a/mmdetection/configs/convnext/metafile.yml b/mmdetection/configs/convnext/metafile.yml new file mode 100644 index 0000000..b9fd750 --- /dev/null +++ b/mmdetection/configs/convnext/metafile.yml @@ -0,0 +1,93 @@ +Models: + - Name: mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco + In Collection: Mask R-CNN + Config: configs/convnext/mask-rcnn_convnext-t-p4-w7_fpn_amp-ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 7.3 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + - Mixed Precision Training + Training Resources: 8x A100 GPUs + Architecture: + - ConvNeXt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco/mask_rcnn_convnext-t_p4_w7_fpn_fp16_ms-crop_3x_coco_20220426_154953-050731f4.pth + Paper: + URL: https://arxiv.org/abs/2201.03545 + Title: 'A ConvNet for the 2020s' + README: configs/convnext/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 + + - Name: cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco + In Collection: Cascade Mask R-CNN + Config: configs/convnext/cascade-mask-rcnn_convnext-t-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 9.0 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + - Mixed Precision Training + Training Resources: 8x A100 GPUs + Architecture: + - ConvNeXt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-t_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220509_204200-8f07c40b.pth + Paper: + URL: https://arxiv.org/abs/2201.03545 + Title: 'A ConvNet for the 2020s' + README: configs/convnext/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.25.0 + + - Name: cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco + In Collection: Cascade Mask R-CNN + Config: configs/convnext/cascade-mask-rcnn_convnext-s-p4-w7_fpn_4conv1fc-giou_amp-ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 12.3 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + - Mixed Precision Training + Training Resources: 8x A100 GPUs + Architecture: + - ConvNeXt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/convnext/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco/cascade_mask_rcnn_convnext-s_p4_w7_fpn_giou_4conv1f_fp16_ms-crop_3x_coco_20220510_201004-3d24f5a4.pth + Paper: + URL: https://arxiv.org/abs/2201.03545 + Title: 'A ConvNet for the 2020s' + README: configs/convnext/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.25.0 diff --git a/mmdetection/configs/cornernet/README.md b/mmdetection/configs/cornernet/README.md new file mode 100644 index 0000000..e44964d --- /dev/null +++ b/mmdetection/configs/cornernet/README.md @@ -0,0 +1,43 @@ +# CornerNet + +> [Cornernet: Detecting objects as paired keypoints](https://arxiv.org/abs/1808.01244) + + + +## Abstract + +We propose CornerNet, a new approach to object detection where we detect an object bounding box as a pair of keypoints, the top-left corner and the bottom-right corner, using a single convolution neural network. By detecting objects as paired keypoints, we eliminate the need for designing a set of anchor boxes commonly used in prior single-stage detectors. In addition to our novel formulation, we introduce corner pooling, a new type of pooling layer that helps the network better localize corners. Experiments show that CornerNet achieves a 42.2% AP on MS COCO, outperforming all existing one-stage detectors. + +
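+Corner pooling is easy to express directly: for the top-left heatmap, each location takes the maximum of one feature map over itself and everything below it, plus the maximum of a second feature map over itself and everything to its right. A minimal sketch (illustrative only, not the dedicated corner-pooling op used in the actual implementation):
+
+```python
+import torch
+
+def top_left_corner_pool(f_t, f_l):
+    """f_t, f_l: (N, C, H, W) feature maps.
+    output[..., i, j] = max(f_t[..., i:, j]) + max(f_l[..., i, j:])."""
+    # cumulative max scanned bottom-to-top: row i sees rows i..H-1
+    t = torch.flip(torch.cummax(torch.flip(f_t, dims=[2]), dim=2).values, dims=[2])
+    # cumulative max scanned right-to-left: column j sees columns j..W-1
+    l = torch.flip(torch.cummax(torch.flip(f_l, dims=[3]), dim=3).values, dims=[3])
+    return t + l
+```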
    + +## Results and Models + +| Backbone | Batch Size | Step/Total Epochs | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :--------------: | :------------------------------------------------------------------: | :---------------: | :------: | :------------: | :----: | :------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| HourglassNet-104 | [10 x 5](./cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py) | 180/210 | 13.9 | 4.2 | 41.2 | [config](./cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720-5fefbf1c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720.log.json) | +| HourglassNet-104 | [8 x 6](./cornernet_hourglass104_8xb6-210e-mstest_coco.py) | 180/210 | 15.9 | 4.2 | 41.2 | [config](./cornernet_hourglass104_8xb6-210e-mstest_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618.log.json) | +| HourglassNet-104 | [32 x 3](./cornernet_hourglass104_32xb3-210e-mstest_coco.py) | 180/210 | 9.5 | 3.9 | 40.4 | [config](./cornernet_hourglass104_32xb3-210e-mstest_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110-1efaea91.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110.log.json) | + +Note: + +- TTA setting is single-scale and `flip=True`. If you want to reproduce the TTA performance, please add `--tta` in the test command. +- Experiments with `images_per_gpu=6` are conducted on Tesla V100-SXM2-32GB, `images_per_gpu=3` are conducted on GeForce GTX 1080 Ti. +- Here are the descriptions of each experiment setting: + - 10 x 5: 10 GPUs with 5 images per gpu. This is the same setting as that reported in the original paper. + - 8 x 6: 8 GPUs with 6 images per gpu. The total batchsize is similar to paper and only need 1 node to train. + - 32 x 3: 32 GPUs with 3 images per gpu. The default setting for 1080TI and need 4 nodes to train. 
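+
+The settings above differ only in how the total batch is split across GPUs; each config below records its intended total batch size in `auto_scale_lr` (`base_batch_size` of 50, 48 and 96 respectively). When automatic LR scaling is enabled on a different setup, the learning rate is scaled linearly with the actual total batch size. A small sketch of that rule (assuming MMEngine's linear scaling behaviour; the 4-GPU setup is hypothetical):
+
+```python
+def scale_lr(base_lr, num_gpus, samples_per_gpu, base_batch_size):
+    """Linear scaling rule: keep lr / total_batch_size constant."""
+    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size
+
+# e.g. running the 8 x 6 config (Adam lr=0.0005, base_batch_size=48)
+# on 4 GPUs with 6 images per GPU:
+print(scale_lr(0.0005, 4, 6, 48))  # 0.00025
+```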
+ +## Citation + +```latex +@inproceedings{law2018cornernet, + title={Cornernet: Detecting objects as paired keypoints}, + author={Law, Hei and Deng, Jia}, + booktitle={15th European Conference on Computer Vision, ECCV 2018}, + pages={765--781}, + year={2018}, + organization={Springer Verlag} +} +``` diff --git a/mmdetection/configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py b/mmdetection/configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py new file mode 100644 index 0000000..7633916 --- /dev/null +++ b/mmdetection/configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py @@ -0,0 +1,8 @@ +_base_ = './cornernet_hourglass104_8xb6-210e-mstest_coco.py' + +train_dataloader = dict(batch_size=5) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (10 GPUs) x (5 samples per GPU) +auto_scale_lr = dict(base_batch_size=50) diff --git a/mmdetection/configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py b/mmdetection/configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py new file mode 100644 index 0000000..51a4740 --- /dev/null +++ b/mmdetection/configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py @@ -0,0 +1,8 @@ +_base_ = './cornernet_hourglass104_8xb6-210e-mstest_coco.py' + +train_dataloader = dict(batch_size=3) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (3 samples per GPU) +auto_scale_lr = dict(base_batch_size=96) diff --git a/mmdetection/configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py b/mmdetection/configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py new file mode 100644 index 0000000..bdb46ff --- /dev/null +++ b/mmdetection/configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py @@ -0,0 +1,183 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/datasets/coco_detection.py' +] + +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True) + +# model settings +model = dict( + type='CornerNet', + data_preprocessor=data_preprocessor, + backbone=dict( + type='HourglassNet', + downsample_times=5, + num_stacks=2, + stage_channels=[256, 256, 384, 384, 384, 512], + stage_blocks=[2, 2, 2, 2, 2, 4], + norm_cfg=dict(type='BN', requires_grad=True)), + neck=None, + bbox_head=dict( + type='CornerHead', + num_classes=80, + in_channels=256, + num_feat_levels=2, + corner_emb_channels=1, + loss_heatmap=dict( + type='GaussianFocalLoss', alpha=2.0, gamma=4.0, loss_weight=1), + loss_embedding=dict( + type='AssociativeEmbeddingLoss', + pull_weight=0.10, + push_weight=0.10), + loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1)), + # training and testing settings + train_cfg=None, + test_cfg=dict( + corner_topk=100, + local_maximum_kernel=3, + distance_threshold=0.5, + score_thr=0.05, + max_per_img=100, + nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'))) + +# data settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + # The cropped images are padded into squares during training, + # but may be smaller than crop_size. 
+ type='RandomCenterCropPad', + crop_size=(511, 511), + ratios=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3), + test_mode=False, + test_pad_mode=None, + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + # Image data is not converted to rgb. + to_rgb=data_preprocessor['bgr_to_rgb']), + # Make sure the output is always crop_size. + dict(type='Resize', scale=(511, 511), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs'), +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + to_float32=True, + backend_args=_base_.backend_args, + ), + # don't need Resize + dict( + type='RandomCenterCropPad', + crop_size=None, + ratios=None, + border=None, + test_mode=True, + test_pad_mode=['logical_or', 127], + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + # Image data is not converted to rgb. + to_rgb=data_preprocessor['bgr_to_rgb']), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'border')) +] + +train_dataloader = dict( + batch_size=6, + num_workers=3, + batch_sampler=None, + dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='Adam', lr=0.0005), + clip_grad=dict(max_norm=35, norm_type=2)) + +max_epochs = 210 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[180], + gamma=0.1) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (6 samples per GPU) +auto_scale_lr = dict(base_batch_size=48) + +tta_model = dict( + type='DetTTAModel', + tta_cfg=dict( + nms=dict(type='soft_nms', iou_threshold=0.5, method='gaussian'), + max_per_img=100)) + +tta_pipeline = [ + dict( + type='LoadImageFromFile', + to_float32=True, + backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + [ + # ``RandomFlip`` must be placed before ``RandomCenterCropPad``, + # otherwise bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) + ], + [ + dict( + type='RandomCenterCropPad', + crop_size=None, + ratios=None, + border=None, + test_mode=True, + test_pad_mode=['logical_or', 127], + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + # Image data is not converted to rgb. 
+ to_rgb=data_preprocessor['bgr_to_rgb']) + ], + [dict(type='LoadAnnotations', with_bbox=True)], + [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'flip', 'flip_direction', 'border')) + ] + ]) +] diff --git a/mmdetection/configs/cornernet/metafile.yml b/mmdetection/configs/cornernet/metafile.yml new file mode 100644 index 0000000..f915cf3 --- /dev/null +++ b/mmdetection/configs/cornernet/metafile.yml @@ -0,0 +1,83 @@ +Collections: + - Name: CornerNet + Metadata: + Training Data: COCO + Training Techniques: + - Adam + Training Resources: 8x V100 GPUs + Architecture: + - Corner Pooling + - Stacked Hourglass Network + Paper: + URL: https://arxiv.org/abs/1808.01244 + Title: 'CornerNet: Detecting Objects as Paired Keypoints' + README: configs/cornernet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.3.0/mmdet/models/detectors/cornernet.py#L9 + Version: v2.3.0 + +Models: + - Name: cornernet_hourglass104_10xb5-crop511-210e-mstest_coco + In Collection: CornerNet + Config: configs/cornernet/cornernet_hourglass104_10xb5-crop511-210e-mstest_coco.py + Metadata: + Training Resources: 10x V100 GPUs + Batch Size: 50 + Training Memory (GB): 13.9 + inference time (ms/im): + - value: 238.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 210 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_10x5_210e_coco/cornernet_hourglass104_mstest_10x5_210e_coco_20200824_185720-5fefbf1c.pth + + - Name: cornernet_hourglass104_8xb6-210e-mstest_coco + In Collection: CornerNet + Config: configs/cornernet/cornernet_hourglass104_8xb6-210e-mstest_coco.py + Metadata: + Batch Size: 48 + Training Memory (GB): 15.9 + inference time (ms/im): + - value: 238.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 210 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_8x6_210e_coco/cornernet_hourglass104_mstest_8x6_210e_coco_20200825_150618-79b44c30.pth + + - Name: cornernet_hourglass104_32xb3-210e-mstest_coco + In Collection: CornerNet + Config: configs/cornernet/cornernet_hourglass104_32xb3-210e-mstest_coco.py + Metadata: + Training Resources: 32x V100 GPUs + Batch Size: 96 + Training Memory (GB): 9.5 + inference time (ms/im): + - value: 256.41 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 210 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/cornernet/cornernet_hourglass104_mstest_32x3_210e_coco/cornernet_hourglass104_mstest_32x3_210e_coco_20200819_203110-1efaea91.pth diff --git a/mmdetection/configs/crowddet/README.md b/mmdetection/configs/crowddet/README.md new file mode 100644 index 0000000..abc0f2d --- /dev/null +++ b/mmdetection/configs/crowddet/README.md @@ -0,0 +1,37 @@ +# CrowdDet + +> [Detection in Crowded Scenes: One Proposal, Multiple Predictions](https://arxiv.org/abs/2003.09163) + + + +## Abstract + +We propose a simple yet effective proposal-based object detector, aiming at detecting highly-overlapped instances in crowded scenes. 
The key of our approach is to let each proposal predict a set of correlated instances rather than a single one as in previous proposal-based frameworks. Equipped with new techniques such as EMD Loss and Set NMS, our detector can effectively handle the difficulty of detecting highly overlapped objects. On an FPN-Res50 baseline, our detector obtains 4.9% AP gains on the challenging CrowdHuman dataset and 1.0% MR^−2 improvement on the CityPersons dataset, without bells and whistles. Moreover, on less crowded datasets like COCO, our approach can still achieve a moderate improvement, suggesting the proposed method is robust to crowdedness. Code and pre-trained models will be released at https://github.com/megvii-model/CrowdDetection.
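+
+As a reading aid for the Set NMS idea mentioned in the abstract, here is a minimal sketch. It assumes every predicted box carries the index of the proposal it came from (`proposal_ids`); the function name and signature are made up for illustration and this is not the code path used by this MMDetection port.
+
+```python
+import torch
+from torchvision.ops import box_iou
+
+def set_nms(boxes, scores, proposal_ids, iou_thr=0.5):
+    """Greedy NMS variant sketched from the paper: a kept box only suppresses
+    overlapping boxes that come from a *different* proposal, so the multiple
+    predictions emitted by one proposal can survive together.
+
+    boxes: (N, 4) xyxy tensor, scores: (N,), proposal_ids: (N,) long tensor.
+    """
+    order = scores.argsort(descending=True)
+    suppressed = torch.zeros(boxes.size(0), dtype=torch.bool)
+    keep = []
+    for idx in order.tolist():
+        if suppressed[idx]:
+            continue
+        keep.append(idx)
+        ious = box_iou(boxes[idx:idx + 1], boxes)[0]
+        # Standard NMS would suppress every box above the IoU threshold;
+        # Set NMS spares boxes predicted by the same proposal as the kept box.
+        suppressed |= (ious > iou_thr) & (proposal_ids != proposal_ids[idx])
+    return torch.as_tensor(keep, dtype=torch.long)
+```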
    + +
    + +## Results and Models + +| Backbone | RM | Style | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :---: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | False | pytorch | 4.4 | - | 90.0 | [config](./crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman_20221023_174954-dc319c2d.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman_20221023_174954.log.json) | +| R-50-FPN | True | pytorch | 4.8 | - | 90.32 | [config](./crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman_20221024_215917-45602806.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman_20221024_215917.log.json) | + +Note: + +- RM indicates whether to use the refine module. +- The dataset for training and testing this model is `CrowdHuman`, and the metric of `box AP` is calculated by `mmdet/evaluation/metrics/crowdhuman_metric.py`. + +## Citation + +```latex +@inproceedings{Chu_2020_CVPR, + title={Detection in Crowded Scenes: One Proposal, Multiple Predictions}, + author={Chu, Xuangeng and Zheng, Anlin and Zhang, Xiangyu and Sun, Jian}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2020} +} +``` diff --git a/mmdetection/configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py b/mmdetection/configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py new file mode 100644 index 0000000..8815be7 --- /dev/null +++ b/mmdetection/configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py @@ -0,0 +1,227 @@ +_base_ = ['../_base_/default_runtime.py'] + +model = dict( + type='CrowdDet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False, + pad_size_divisor=64, + # This option is set according to https://github.com/Purkialo/CrowdDet/ + # blob/master/lib/data/CrowdHuman.py The images in the entire batch are + # resize together. 
+ batch_augments=[ + dict(type='BatchResize', scale=(1400, 800), pad_size_divisor=64) + ]), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5, + upsample_cfg=dict(mode='bilinear', align_corners=False)), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[1.0, 2.0, 3.0], + strides=[4, 8, 16, 32, 64], + centers=[(8, 8), (8, 8), (8, 8), (8, 8), (8, 8)]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0], + clip_border=False), + loss_cls=dict(type='CrossEntropyLoss', loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='MultiInstanceRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=-1, + aligned=True, + use_torchvision=True), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='MultiInstanceBBoxHead', + with_refine=False, + num_shared_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', + loss_weight=1.0, + use_sigmoid=False, + reduction='none'), + loss_bbox=dict( + type='SmoothL1Loss', loss_weight=1.0, reduction='none'))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=(0.3, 0.7), + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2400, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=2), + rcnn=dict( + assigner=dict( + type='MultiInstanceAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.3, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='MultiInsRandomSampler', + num=512, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1200, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=2), + rcnn=dict( + nms=dict(type='nms', iou_threshold=0.5), + score_thr=0.01, + max_per_img=500))) + +dataset_type = 'CrowdHumanDataset' +data_root = 'data/CrowdHuman/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/tracking/CrowdHuman/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/tracking/', +# 'data/': 's3://openmmlab/datasets/tracking/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1400, 800), keep_ratio=True), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=None, # The 'batch_sampler' may decrease the precision + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotation_train.odgt', + data_prefix=dict(img='Images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotation_val.odgt', + data_prefix=dict(img='Images/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CrowdHumanMetric', + ann_file=data_root + 'annotation_val.odgt', + metric=['AP', 'MR', 'JI'], + backend_args=backend_args) +test_evaluator = val_evaluator + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=30, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=800), + dict( + type='MultiStepLR', + begin=0, + end=30, + by_epoch=True, + milestones=[24, 27], + gamma=0.1) +] + +# optimizer +auto_scale_lr = dict(base_batch_size=16) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.002, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py b/mmdetection/configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py new file mode 100644 index 0000000..80277ce --- /dev/null +++ b/mmdetection/configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py @@ -0,0 +1,3 @@ +_base_ = './crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py' + +model = dict(roi_head=dict(bbox_head=dict(with_refine=True))) diff --git a/mmdetection/configs/crowddet/metafile.yml b/mmdetection/configs/crowddet/metafile.yml new file mode 100644 index 0000000..4f191de --- /dev/null +++ b/mmdetection/configs/crowddet/metafile.yml @@ -0,0 +1,47 @@ +Collections: + - Name: CrowdDet + Metadata: + Training Data: CrowdHuman + Training Techniques: + - SGD + - EMD Loss + Training Resources: 8x A100 GPUs + Architecture: + - FPN + - RPN + - ResNet + - RoIPool + Paper: + URL: https://arxiv.org/abs/2003.09163 + Title: 'Detection in Crowded Scenes: One Proposal, Multiple Predictions' + README: configs/crowddet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v3.0.0rc3/mmdet/models/detectors/crowddet.py + Version: v3.0.0rc3 + +Models: + - Name: crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman + In Collection: CrowdDet + Config: configs/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 30 + Results: + - Task: Object 
Detection + Dataset: CrowdHuman + Metrics: + box AP: 90.32 + Weights: https://download.openmmlab.com/mmdetection/v3.0/crowddet/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman/crowddet-rcnn_refine_r50_fpn_8xb2-30e_crowdhuman_20221024_215917-45602806.pth + + - Name: crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman + In Collection: CrowdDet + Config: configs/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman.py + Metadata: + Training Memory (GB): 4.4 + Epochs: 30 + Results: + - Task: Object Detection + Dataset: CrowdHuman + Metrics: + box AP: 90.0 + Weights: https://download.openmmlab.com/mmdetection/v3.0/crowddet/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman/crowddet-rcnn_r50_fpn_8xb2-30e_crowdhuman_20221023_174954-dc319c2d.pth diff --git a/mmdetection/configs/dab_detr/README.md b/mmdetection/configs/dab_detr/README.md new file mode 100644 index 0000000..5661f27 --- /dev/null +++ b/mmdetection/configs/dab_detr/README.md @@ -0,0 +1,40 @@ +# DAB-DETR + +> [DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR](https://arxiv.org/abs/2201.12329) + + + +## Abstract + +We present in this paper a novel query formulation using dynamic anchor boxes for DETR (DEtection TRansformer) and offer a deeper understanding of the role of queries in DETR. This new formulation directly uses box coordinates as queries in Transformer decoders and dynamically updates them layer-by-layer. Using box coordinates not only helps using explicit positional priors to improve the query-to-feature similarity and eliminate the slow training convergence issue in DETR, but also allows us to modulate the positional attention map using the box width and height information. Such a design makes it clear that queries in DETR can be implemented as performing soft ROI pooling layer-by-layer in a cascade manner. As a result, it leads to the best performance on MS-COCO benchmark among the DETR-like detection models under the same setting, e.g., AP 45.7% using ResNet50-DC5 as backbone trained in 50 epochs. We also conducted extensive experiments to confirm our analysis and verify the effectiveness of our methods. + +
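+
+To make the "dynamic anchor boxes as queries" formulation concrete, here is a small sketch with assumed shapes and helper names (it is not the `DABDETR` implementation): each query is a normalized 4-d anchor (cx, cy, w, h), its positional query is a sinusoidal encoding of those four coordinates, and every decoder layer refines the anchor in inverse-sigmoid space before handing it to the next layer.
+
+```python
+import math
+import torch
+
+def sine_box_embed(anchors, num_feats=128, temperature=20):
+    """Sinusoidal embedding of normalized (cx, cy, w, h) anchors.
+    anchors: (num_queries, 4) in [0, 1] -> (num_queries, 4 * num_feats)."""
+    dim_t = torch.arange(num_feats, dtype=torch.float32)
+    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_feats)
+    pos = anchors[..., None] * 2 * math.pi / dim_t                # (nq, 4, num_feats)
+    pos = torch.stack((pos[..., 0::2].sin(), pos[..., 1::2].cos()), dim=-1)
+    return pos.flatten(-3)                                        # (nq, 4 * num_feats)
+
+def refine_anchors(anchors, deltas, eps=1e-5):
+    """Layer-by-layer update: add the offsets predicted by a decoder layer in
+    inverse-sigmoid space, so the refined anchors stay inside [0, 1]."""
+    anchors = anchors.clamp(eps, 1 - eps)
+    logits = torch.log(anchors / (1 - anchors))
+    return torch.sigmoid(logits + deltas)
+```
+
+The `positional_encoding=dict(num_feats=128, temperature=20, ...)` entry in the config below corresponds to the embedding parameters assumed here.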
+
    + +## Results and Models + +We provide the config files and models for DAB-DETR: [DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR](https://arxiv.org/abs/2201.12329). + +| Backbone | Model | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :------: | :-----: | :------: | :------------: | :----: | :---------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | DAB-DETR | 50e | | | 42.3 | [config](./dab-detr_r50_8xb2-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/dab_detr/dab-detr_r50_8xb2-50e_coco/dab-detr_r50_8xb2-50e_coco_20221122_120837-c1035c8c.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/dab_detr/dab-detr_r50_8xb2-50e_coco/dab-detr_r50_8xb2-50e_coco_20221122_120837.log.json) | + +## Citation + +```latex +@inproceedings{ + liu2022dabdetr, + title={{DAB}-{DETR}: Dynamic Anchor Boxes are Better Queries for {DETR}}, + author={Shilong Liu and Feng Li and Hao Zhang and Xiao Yang and Xianbiao Qi and Hang Su and Jun Zhu and Lei Zhang}, + booktitle={International Conference on Learning Representations}, + year={2022}, + url={https://openreview.net/forum?id=oMI9PjOb9Jl} +} +``` diff --git a/mmdetection/configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py b/mmdetection/configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py new file mode 100644 index 0000000..314ed97 --- /dev/null +++ b/mmdetection/configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py @@ -0,0 +1,159 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +model = dict( + type='DABDETR', + num_queries=300, + with_random_refpoints=False, + num_patterns=0, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=None, + num_outs=1), + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict( + embed_dims=256, num_heads=8, dropout=0., batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='PReLU')))), + decoder=dict( + num_layers=6, + query_dim=4, + query_scale_type='cond_elewise', + with_modulated_hw_attn=True, + layer_cfg=dict( + self_attn_cfg=dict( + embed_dims=256, + num_heads=8, + attn_drop=0., + proj_drop=0., + cross_attn=False), + cross_attn_cfg=dict( + embed_dims=256, + num_heads=8, + attn_drop=0., + proj_drop=0., + cross_attn=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='PReLU'))), + return_intermediate=True), + positional_encoding=dict(num_feats=128, temperature=20, normalize=True), + bbox_head=dict( + type='DABDETRHead', + num_classes=80, + embed_dims=256, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', 
loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2., eps=1e-8), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) + +# learning policy +max_epochs = 50 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[40], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16, enable=False) diff --git a/mmdetection/configs/dab_detr/metafile.yml b/mmdetection/configs/dab_detr/metafile.yml new file mode 100644 index 0000000..94383a0 --- /dev/null +++ b/mmdetection/configs/dab_detr/metafile.yml @@ -0,0 +1,32 @@ +Collections: + - Name: DAB-DETR + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 8x A100 GPUs + Architecture: + - ResNet + - Transformer + Paper: + URL: https://arxiv.org/abs/2201.12329 + Title: 'DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR' + README: configs/dab_detr/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/f4112c9e5611468ffbd57cfba548fd1289264b52/mmdet/models/detectors/dab_detr.py#L15 + Version: v3.0.0rc6 + +Models: + - Name: dab-detr_r50_8xb2-50e_coco + In Collection: DAB-DETR + Config: configs/dab_detr/dab-detr_r50_8xb2-50e_coco.py + Metadata: + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/dab_detr/dab-detr_r50_8xb2-50e_coco/dab-detr_r50_8xb2-50e_coco_20221122_120837-c1035c8c.pth diff --git a/mmdetection/configs/dcn/README.md b/mmdetection/configs/dcn/README.md new file mode 100644 index 0000000..e287e1d --- /dev/null +++ b/mmdetection/configs/dcn/README.md @@ -0,0 +1,48 @@ +# DCN + +> [Deformable Convolutional Networks](https://arxiv.org/abs/1703.06211) + + + +## Abstract + +Convolutional neural networks (CNNs) are inherently limited to model geometric transformations due to the fixed geometric structures in its building modules. In this work, we introduce two new modules to enhance the transformation modeling capacity of CNNs, namely, deformable convolution and deformable RoI pooling. Both are based on the idea of augmenting the spatial sampling locations in the modules with additional offsets and learning the offsets from target tasks, without additional supervision. The new modules can readily replace their plain counterparts in existing CNNs and can be easily trained end-to-end by standard back-propagation, giving rise to deformable convolutional networks. Extensive experiments validate the effectiveness of our approach on sophisticated vision tasks of object detection and semantic segmentation. + +
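+
+The `dcn=dict(type='DCN', ...)` entries in the configs in this folder replace regular convolutions in ResNet stages c3-c5 with deformable ones. As a rough illustration of the operation itself (sketched with `torchvision.ops`, not the `mmcv` op these configs actually use, and with a made-up module name), a small side branch predicts a 2-d offset per kernel sampling location and the main convolution samples the input at those shifted positions:
+
+```python
+import torch
+import torch.nn as nn
+from torchvision.ops import DeformConv2d
+
+class DeformableConvBlock(nn.Module):
+    """Illustrative deformable 3x3 conv: offsets are predicted from the input
+    by a plain conv (2 values per kernel position: dy, dx), then used to
+    deform the sampling grid of the main convolution."""
+
+    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1):
+        super().__init__()
+        offset_channels = 2 * kernel_size * kernel_size
+        # Offsets are usually initialized to zero so training starts from a
+        # regular convolution.
+        self.offset_conv = nn.Conv2d(in_channels, offset_channels,
+                                     kernel_size, padding=padding)
+        nn.init.zeros_(self.offset_conv.weight)
+        nn.init.zeros_(self.offset_conv.bias)
+        self.deform_conv = DeformConv2d(in_channels, out_channels,
+                                        kernel_size, padding=padding)
+
+    def forward(self, x):
+        offsets = self.offset_conv(x)
+        return self.deform_conv(x, offsets)
+
+if __name__ == '__main__':
+    x = torch.randn(1, 64, 32, 32)
+    print(DeformableConvBlock(64, 64)(x).shape)  # torch.Size([1, 64, 32, 32])
+```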
    + +
    + +## Results and Models + +| Backbone | Model | Style | Conv | Pool | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :----------: | :-----: | :----------: | :---: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 4.0 | 17.8 | 41.3 | | [config](./faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130_212941.log.json) | +| R-50-FPN | Faster | pytorch | - | dpool | 1x | 5.0 | 17.2 | 38.9 | | [config](./faster-rcnn_r50_fpn_dpool_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307-90d3c01d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307_203250.log.json) | +| R-101-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 6.0 | 12.5 | 42.7 | | [config](./faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-1377f13d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203_230019.log.json) | +| X-101-32x4d-FPN | Faster | pytorch | dconv(c3-c5) | - | 1x | 7.3 | 10.0 | 44.5 | | [config](./faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203-4f85c69c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203_001325.log.json) | +| R-50-FPN | Mask | pytorch | dconv(c3-c5) | - | 1x | 4.5 | 15.4 | 41.8 | 37.4 | [config](./mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203-4d9ad43b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203_061339.log.json) | +| R-101-FPN | Mask | pytorch | dconv(c3-c5) | - | 1x | 6.5 | 11.7 | 43.5 | 38.9 | [config](./mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216-a71f5bce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216_191601.log.json) | +| R-50-FPN | 
Cascade | pytorch | dconv(c3-c5) | - | 1x | 4.5 | 14.6 | 43.8 | | [config](./cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-2f1fca44.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130_220843.log.json) | +| R-101-FPN | Cascade | pytorch | dconv(c3-c5) | - | 1x | 6.4 | 11.0 | 45.0 | | [config](./cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-3b2f0594.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203_224829.log.json) | +| R-50-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 6.0 | 10.0 | 44.4 | 38.6 | [config](./cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202-42e767a2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202_010309.log.json) | +| R-101-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 8.0 | 8.6 | 45.8 | 39.7 | [config](./cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204-df0c5f10.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204_134006.log.json) | +| X-101-32x4d-FPN | Cascade Mask | pytorch | dconv(c3-c5) | - | 1x | 9.2 | | 47.3 | 41.1 | [config](./cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-e75f90c8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-20200606_183737.log.json) | +| R-50-FPN (FP16) | Mask | pytorch | dconv(c3-c5) | - | 1x | 3.0 | | 41.9 | 37.5 | [config](./mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247-c06429d2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247.log.json) | + +**Notes:** + +- `dconv` denotes deformable convolution, `c3-c5` means adding dconv in resnet stage 3 to 5. `dpool` denotes deformable roi pooling. +- The dcn ops are modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch, which should be more memory efficient and slightly faster. +- (\*) For R-50-FPN (dg=4), dg is short for deformable_group. This model is trained and tested on Amazon EC2 p3dn.24xlarge instance. 
+- **Memory, Train/Inf time is outdated.** + +## Citation + +```latex +@inproceedings{dai2017deformable, + title={Deformable Convolutional Networks}, + author={Dai, Jifeng and Qi, Haozhi and Xiong, Yuwen and Li, Yi and Zhang, Guodong and Hu, Han and Wei, Yichen}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + year={2017} +} +``` diff --git a/mmdetection/configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..8c0ff98 --- /dev/null +++ b/mmdetection/configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..cfcc5e7 --- /dev/null +++ b/mmdetection/configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..48b25f6 --- /dev/null +++ b/mmdetection/configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..8a942da --- /dev/null +++ b/mmdetection/configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..f6bf5b7 --- /dev/null +++ b/mmdetection/configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..db44e7e --- /dev/null +++ b/mmdetection/configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git 
a/mmdetection/configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..95f2046 --- /dev/null +++ b/mmdetection/configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py b/mmdetection/configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py new file mode 100644 index 0000000..c65ce5f --- /dev/null +++ b/mmdetection/configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py @@ -0,0 +1,12 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + _delete_=True, + type='DeformRoIPoolPack', + output_size=7, + output_channels=256), + out_channels=256, + featmap_strides=[4, 8, 16, 32]))) diff --git a/mmdetection/configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..e4ed832 --- /dev/null +++ b/mmdetection/configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..3f36714 --- /dev/null +++ b/mmdetection/configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..0b281d4 --- /dev/null +++ b/mmdetection/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py b/mmdetection/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py new file mode 100644 index 0000000..9d01594 --- /dev/null +++ b/mmdetection/configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py @@ -0,0 +1,10 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) + +# MMEngine support the following two ways, users can choose +# according to convenience +# optim_wrapper = dict(type='AmpOptimWrapper') +_base_.optim_wrapper.type = 'AmpOptimWrapper' diff --git 
a/mmdetection/configs/dcn/metafile.yml b/mmdetection/configs/dcn/metafile.yml new file mode 100644 index 0000000..4aa35b5 --- /dev/null +++ b/mmdetection/configs/dcn/metafile.yml @@ -0,0 +1,272 @@ +Collections: + - Name: Deformable Convolutional Networks + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Deformable Convolution + Paper: + URL: https://arxiv.org/abs/1703.06211 + Title: "Deformable Convolutional Networks" + README: configs/dcn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/dcn/deform_conv.py#L15 + Version: v2.0.0 + +Models: + - Name: faster-rcnn_r50_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/faster-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 56.18 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-d68aed1e.pth + + - Name: faster-rcnn_r50_fpn_dpool_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/faster-rcnn_r50_fpn_dpool_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + inference time (ms/im): + - value: 58.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_dpool_1x_coco/faster_rcnn_r50_fpn_dpool_1x_coco_20200307-90d3c01d.pth + + - Name: faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/faster-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 80 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-1377f13d.pth + + - Name: faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/faster-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.3 + inference time (ms/im): + - value: 100 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco_20200203-4f85c69c.pth + + - Name: mask-rcnn_r50_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + inference time (ms/im): + - value: 64.94 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.8 + - Task: Instance Segmentation + 
Dataset: COCO + Metrics: + mask AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200203-4d9ad43b.pth + + - Name: mask-rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/mask-rcnn_r50-dconv-c3-c5_fpn_amp-1x_coco.py + Metadata: + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + Training Memory (GB): 3.0 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_dconv_c3-c5_1x_coco_20210520_180247-c06429d2.pth + + - Name: mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.5 + inference time (ms/im): + - value: 85.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200216-a71f5bce.pth + + - Name: cascade-rcnn_r50_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + inference time (ms/im): + - value: 68.49 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200130-2f1fca44.pth + + - Name: cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200203-3b2f0594.pth + + - Name: cascade-mask-rcnn_r50_fpn_dconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade-mask-rcnn_r50-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 100 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco_20200202-42e767a2.pth + + - Name: cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco + In Collection: Deformable Convolutional Networks + Config: 
configs/dcn/cascade-mask-rcnn_r101-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.0 + inference time (ms/im): + - value: 116.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco_20200204-df0c5f10.pth + + - Name: cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco + In Collection: Deformable Convolutional Networks + Config: configs/dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco-e75f90c8.pth diff --git a/mmdetection/configs/dcnv2/README.md b/mmdetection/configs/dcnv2/README.md new file mode 100644 index 0000000..7f42c93 --- /dev/null +++ b/mmdetection/configs/dcnv2/README.md @@ -0,0 +1,37 @@ +# DCNv2 + +> [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168) + + + +## Abstract + +The superior performance of Deformable Convolutional Networks arises from its ability to adapt to the geometric variations of objects. Through an examination of its adaptive behavior, we observe that while the spatial support for its neural features conforms more closely than regular ConvNets to object structure, this support may nevertheless extend well beyond the region of interest, causing features to be influenced by irrelevant image content. To address this problem, we present a reformulation of Deformable ConvNets that improves its ability to focus on pertinent image regions, through increased modeling power and stronger training. The modeling power is enhanced through a more comprehensive integration of deformable convolution within the network, and by introducing a modulation mechanism that expands the scope of deformation modeling. To effectively harness this enriched modeling capability, we guide network training via a proposed feature mimicking scheme that helps the network to learn features that reflect the object focus and classification power of RCNN features. With the proposed contributions, this new version of Deformable ConvNets yields significant performance gains over the original model and produces leading results on the COCO benchmark for object detection and instance segmentation. 
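+
+Compared with the DCN sketch earlier, DCNv2 (`type='DCNv2'` in the configs below) additionally predicts a modulation scalar per sampling point, so the network can down-weight samples that land on irrelevant content. The following is a minimal illustration with an assumed module name, again built on `torchvision.ops` rather than the `mmcv` op MMDetection uses:
+
+```python
+import torch
+import torch.nn as nn
+from torchvision.ops import deform_conv2d
+
+class ModulatedDeformableConv(nn.Module):
+    """Illustrative DCNv2-style conv: besides (dy, dx) offsets, a sigmoid
+    modulation mask in [0, 1] scales the contribution of every sampling point."""
+
+    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.padding = padding
+        # 2 offset values + 1 modulation value per kernel position.
+        self.offset_mask_conv = nn.Conv2d(
+            in_channels, 3 * kernel_size * kernel_size,
+            kernel_size, padding=padding)
+        nn.init.zeros_(self.offset_mask_conv.weight)
+        nn.init.zeros_(self.offset_mask_conv.bias)
+        self.weight = nn.Parameter(
+            torch.randn(out_channels, in_channels, kernel_size, kernel_size) * 0.01)
+        self.bias = nn.Parameter(torch.zeros(out_channels))
+
+    def forward(self, x):
+        out = self.offset_mask_conv(x)
+        k2 = self.kernel_size * self.kernel_size
+        offsets, mask = out[:, :2 * k2], out[:, 2 * k2:]
+        mask = torch.sigmoid(mask)  # zero-init bias -> mask starts at 0.5 everywhere
+        return deform_conv2d(x, offsets, self.weight, self.bias,
+                             padding=self.padding, mask=mask)
+```
+
+In the actual configs the swap is a one-line change on the backbone, e.g. `dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False)`, as the files below show.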
+ +## Results and Models + +| Backbone | Model | Style | Conv | Pool | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :---------------: | :----: | :-----: | :-----------: | :----: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | Faster | pytorch | mdconv(c3-c5) | - | 1x | 4.1 | 17.6 | 41.4 | | [config](./faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130-d099253b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130_222144.log.json) | +| \*R-50-FPN (dg=4) | Faster | pytorch | mdconv(c3-c5) | - | 1x | 4.2 | 17.4 | 41.5 | | [config](./faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130-01262257.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130_222058.log.json) | +| R-50-FPN | Faster | pytorch | - | mdpool | 1x | 5.8 | 16.6 | 38.7 | | [config](./faster-rcnn_r50_fpn_mdpool_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307_203304.log.json) | +| R-50-FPN | Mask | pytorch | mdconv(c3-c5) | - | 1x | 4.5 | 15.1 | 41.5 | 37.1 | [config](./mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203-ad97591f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203_063443.log.json) | +| R-50-FPN (FP16) | Mask | pytorch | mdconv(c3-c5) | - | 1x | 3.1 | | 42.0 | 37.6 | [config](./mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434-cf8fefa5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434.log.json) | + +**Notes:** + +- `mdconv` denotes modulated deformable convolution, `c3-c5` means adding dconv in resnet stage 3 to 5. `mdpool` denotes modulated deformable roi pooling. +- The dcn ops are modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch, which should be more memory efficient and slightly faster. +- (\*) For R-50-FPN (dg=4), dg is short for deformable_group. 
This model is trained and tested on Amazon EC2 p3dn.24xlarge instance. +- **Memory, Train/Inf time is outdated.** + +## Citation + +```latex +@article{zhu2018deformable, + title={Deformable ConvNets v2: More Deformable, Better Results}, + author={Zhu, Xizhou and Hu, Han and Lin, Stephen and Dai, Jifeng}, + journal={arXiv preprint arXiv:1811.11168}, + year={2018} +} +``` diff --git a/mmdetection/configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..a7f7e4e --- /dev/null +++ b/mmdetection/configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..5c58dbe --- /dev/null +++ b/mmdetection/configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=4, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py b/mmdetection/configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py new file mode 100644 index 0000000..6198d6d --- /dev/null +++ b/mmdetection/configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py @@ -0,0 +1,12 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + _delete_=True, + type='ModulatedDeformRoIPoolPack', + output_size=7, + output_channels=256), + out_channels=256, + featmap_strides=[4, 8, 16, 32]))) diff --git a/mmdetection/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..f7a90bb --- /dev/null +++ b/mmdetection/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py b/mmdetection/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py new file mode 100644 index 0000000..3b3894c --- /dev/null +++ b/mmdetection/configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py @@ -0,0 +1,10 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) + +# MMEngine support the following two ways, users can choose +# according to convenience +# optim_wrapper = dict(type='AmpOptimWrapper') +_base_.optim_wrapper.type = 'AmpOptimWrapper' diff --git a/mmdetection/configs/dcnv2/metafile.yml b/mmdetection/configs/dcnv2/metafile.yml new file mode 100644 index 0000000..dea7bfa --- /dev/null +++ b/mmdetection/configs/dcnv2/metafile.yml @@ -0,0 +1,123 @@ +Collections: + - Name: Deformable Convolutional Networks v2 + Metadata: + Training Data: COCO + Training Techniques: + - SGD 
with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Deformable Convolution + Paper: + URL: https://arxiv.org/abs/1811.11168 + Title: "Deformable ConvNets v2: More Deformable, Better Results" + README: configs/dcnv2/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/dcn/deform_conv.py#L15 + Version: v2.0.0 + +Models: + - Name: faster-rcnn_r50_fpn_mdconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.1 + inference time (ms/im): + - value: 56.82 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200130-d099253b.pth + + - Name: faster-rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/faster-rcnn_r50-mdconv-group4-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + inference time (ms/im): + - value: 57.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco_20200130-01262257.pth + + - Name: faster-rcnn_r50_fpn_mdpool_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/faster-rcnn_r50_fpn_mdpool_1x_coco.py + Metadata: + Training Memory (GB): 5.8 + inference time (ms/im): + - value: 60.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco/faster_rcnn_r50_fpn_mdpool_1x_coco_20200307-c0df27ff.pth + + - Name: mask-rcnn_r50_fpn_mdconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + inference time (ms/im): + - value: 66.23 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco_20200203-ad97591f.pth + + - Name: mask-rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco + In Collection: Deformable Convolutional Networks v2 + Config: configs/dcnv2/mask-rcnn_r50-mdconv-c3-c5_fpn_amp-1x_coco.py + Metadata: + Training Memory (GB): 3.1 + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco/mask_rcnn_r50_fpn_fp16_mdconv_c3-c5_1x_coco_20210520_180434-cf8fefa5.pth 
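For readers mapping the table above onto the configs, the `mdconv(c3-c5)` notation corresponds to a single backbone setting. The excerpt below is copied from `configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py` in this patch; only the comment is added here as a gloss.

```python
# Copied from configs/dcnv2/faster-rcnn_r50-mdconv-c3-c5_fpn_1x_coco.py:
# DCNv2 ("mdconv") is enabled in the last three ResNet stages, i.e. c3-c5,
# via stage_with_dcn; the dg=4 variant only changes deform_groups to 4.
_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'
model = dict(
    backbone=dict(
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        stage_with_dcn=(False, True, True, True)))
```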
diff --git a/mmdetection/configs/ddod/README.md b/mmdetection/configs/ddod/README.md new file mode 100644 index 0000000..d5ea9cd --- /dev/null +++ b/mmdetection/configs/ddod/README.md @@ -0,0 +1,31 @@ +# DDOD + +> [Disentangle Your Dense Object Detector](https://arxiv.org/pdf/2107.02963.pdf) + + + +## Abstract + +Deep learning-based dense object detectors have achieved great success in the past few years and have been applied to numerous multimedia applications such as video understanding. However, the current training pipeline for dense detectors is compromised to lots of conjunctions that may not hold. In this paper, we investigate three such important conjunctions: 1) only samples assigned as positive in classification head are used to train the regression head; 2) classification and regression share the same input feature and computational fields defined by the parallel head architecture; and 3) samples distributed in different feature pyramid layers are treated equally when computing the loss. We first carry out a series of pilot experiments to show disentangling such conjunctions can lead to persistent performance improvement. Then, based on these findings, we propose Disentangled Dense Object Detector(DDOD), in which simple and effective disentanglement mechanisms are designed and integrated into the current state-of-the-art dense object detectors. Extensive experiments on MS COCO benchmark show that our approach can lead to 2.0 mAP, 2.4 mAP and 2.2 mAP absolute improvements on RetinaNet, FCOS, and ATSS baselines with negligible extra overhead. Notably, our best model reaches 55.0 mAP on the COCO test-dev set and 93.5 AP on the hard subset of WIDER FACE, achieving new state-of-the-art performance on these two competitive benchmarks. Code is available at https://github.com/zehuichen123/DDOD. + +
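The label-assignment disentanglement described above shows up directly in the training config: the classification and regression branches each get their own ATSS assigner. The excerpt below is copied from `configs/ddod/ddod_r50_fpn_1x_coco.py` later in this patch, with the comments expanded slightly.

```python
# Excerpt from configs/ddod/ddod_r50_fpn_1x_coco.py
train_cfg = dict(
    # assigner for the classification head
    assigner=dict(type='ATSSAssigner', topk=9, alpha=0.8),
    # separate assigner for the regression head
    reg_assigner=dict(type='ATSSAssigner', topk=9, alpha=0.5),
    allowed_border=-1,
    pos_weight=-1,
    debug=False)
```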
    + +## Results and Models + +| Model | Backbone | Style | Lr schd | Mem (GB) | box AP | Config | Download | +| :-------: | :------: | :-----: | :-----: | :------: | :----: | :---------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| DDOD-ATSS | R-50 | pytorch | 1x | 3.4 | 41.7 | [config](./ddod_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ddod/ddod_r50_fpn_1x_coco/ddod_r50_fpn_1x_coco_20220523_223737-29b2fc67.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ddod/ddod_r50_fpn_1x_coco/ddod_r50_fpn_1x_coco_20220523_223737.log.json) | + +## Citation + +```latex +@inproceedings{chen2021disentangle, +title={Disentangle Your Dense Object Detector}, +author={Chen, Zehui and Yang, Chenhongyi and Li, Qiaofei and Zhao, Feng and Zha, Zheng-Jun and Wu, Feng}, +booktitle={Proceedings of the 29th ACM International Conference on Multimedia}, +pages={4939--4948}, +year={2021} +} +``` diff --git a/mmdetection/configs/ddod/ddod_r50_fpn_1x_coco.py b/mmdetection/configs/ddod/ddod_r50_fpn_1x_coco.py new file mode 100644 index 0000000..fed1116 --- /dev/null +++ b/mmdetection/configs/ddod/ddod_r50_fpn_1x_coco.py @@ -0,0 +1,72 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='DDOD', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='DDODHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_iou=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + train_cfg=dict( + # assigner is mean cls_assigner + assigner=dict(type='ATSSAssigner', topk=9, alpha=0.8), + reg_assigner=dict(type='ATSSAssigner', topk=9, alpha=0.5), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/ddod/metafile.yml b/mmdetection/configs/ddod/metafile.yml new file mode 100644 index 0000000..c223950 --- /dev/null +++ b/mmdetection/configs/ddod/metafile.yml @@ -0,0 +1,33 @@ +Collections: + - Name: DDOD + Metadata: + Training Data: COCO + Training Techniques: + - SGD with 
Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - DDOD + - FPN + - ResNet + Paper: + URL: https://arxiv.org/pdf/2107.02963.pdf + Title: 'Disentangle Your Dense Object Detector' + README: configs/ddod/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.25.0/mmdet/models/detectors/ddod.py#L6 + Version: v2.25.0 + +Models: + - Name: ddod_r50_fpn_1x_coco + In Collection: DDOD + Config: configs/ddod/ddod_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.4 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ddod/ddod_r50_fpn_1x_coco/ddod_r50_fpn_1x_coco_20220523_223737-29b2fc67.pth diff --git a/mmdetection/configs/ddq/README.md b/mmdetection/configs/ddq/README.md new file mode 100644 index 0000000..3f6f459 --- /dev/null +++ b/mmdetection/configs/ddq/README.md @@ -0,0 +1,39 @@ +# DDQ + +> [Dense Distinct Query for End-to-End Object Detection](https://arxiv.org/abs/2303.12776) + + + +## Abstract + + + +One-to-one label assignment in object detection has successfully obviated the need for non-maximum suppression (NMS) as postprocessing and makes the pipeline end-to-end. However, it triggers a new dilemma as the widely used sparse queries cannot guarantee a high recall, while dense queries inevitably bring more similar queries and encounter optimization difficulties. As both sparse and dense queries are problematic, then what are the expected queries in end-to-end object detection? This paper shows that the solution should be Dense Distinct Queries (DDQ). Concretely, we first lay dense queries like traditional detectors and then select distinct ones for one-to-one assignments. DDQ blends the advantages of traditional and recent end-to-end detectors and significantly improves the performance of various detectors including FCN, R-CNN, and DETRs. Most impressively, DDQ-DETR achieves 52.1 AP on MS-COCO dataset within 12 epochs using a ResNet-50 backbone, outperforming all existing detectors in the same setting. DDQ also shares the benefit of end-to-end detectors in crowded scenes and achieves 93.8 AP on CrowdHuman. We hope DDQ can inspire researchers to consider the complementarity between traditional methods and end-to-end detectors. 
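The dense-then-distinct recipe above maps onto a handful of model settings. The excerpt below is copied from `configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py` later in this patch; the comments follow the ones in that file.

```python
# Excerpt from configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py
model = dict(
    type='DDQDETR',
    num_queries=900,  # num_matching_queries
    # ratio of num_dense queries to num_queries
    dense_topk_ratio=1.5,
    # NMS used for distinct query selection
    dqs_cfg=dict(type='nms', iou_threshold=0.8))
```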
+ +![ddq_arch](https://github.com/open-mmlab/mmdetection/assets/33146359/5ca9f11b-b6f3-454f-a2d1-3009ee337bbc) + +## Results and Models + +| Model | Backbone | Lr schd | Augmentation | box AP(val) | Config | Download | +| :---------------: | :------: | :-----: | :----------: | :---------: | :------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| DDQ DETR-4scale | R-50 | 12e | DETR | 51.4 | [config](./ddq-detr-4scale_r50_8xb2-12e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq-detr-4scale_r50_8xb2-12e_coco/ddq-detr-4scale_r50_8xb2-12e_coco_20230809_170711-42528127.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq-detr-4scale_r50_8xb2-12e_coco/ddq-detr-4scale_r50_8xb2-12e_coco_20230809_170711.log.json) | +| DDQ DETR-5scale\* | R-50 | 12e | DETR | 52.1 | [config](./ddq-detr-5scale_r50_8xb2-12e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq_detr_5scale_coco_1x.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq_detr_5scale_coco_1x_20230319_103307.log) | +| DDQ DETR-4scale\* | Swin-L | 30e | DETR | 58.7 | [config](./ddq-detr-4scale_swinl_8xb2-30e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq_detr_swinl_30e.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq_detr_swinl_30e_20230316_221721_20230318_143554.log) | + +**Note** + +- Models labeled * are not trained by us, but from [DDQ official website](https://github.com/jshilong/DDQ). +- We find that the performance is unstable and may fluctuate by about 0.2 mAP. 
+ +## Citation + +```latex +@InProceedings{Zhang_2023_CVPR, + author = {Zhang, Shilong and Wang, Xinjiang and Wang, Jiaqi and Pang, Jiangmiao and Lyu, Chengqi and Zhang, Wenwei and Luo, Ping and Chen, Kai}, + title = {Dense Distinct Query for End-to-End Object Detection}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2023}, + pages = {7329-7338} +} +``` diff --git a/mmdetection/configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py b/mmdetection/configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py new file mode 100644 index 0000000..5e64afc --- /dev/null +++ b/mmdetection/configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py @@ -0,0 +1,170 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +model = dict( + type='DDQDETR', + num_queries=900, # num_matching_queries + # ratio of num_dense queries to num_queries + dense_topk_ratio=1.5, + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + # encoder class name: DeformableDetrTransformerEncoder + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0))), # 0.1 for DeformDETR + # decoder class name: DDQTransformerDecoder + decoder=dict( + # `num_layers` >= 2, because attention masks of the last + # `num_layers` - 1 layers are used for distinct query selection + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, + dropout=0.0), # 0.1 for DeformDETR + cross_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0)), # 0.1 for DeformDETR + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, + normalize=True, + offset=0.0, # -0.5 for DeformDETR + temperature=20), # 10000 for DeformDETR + bbox_head=dict( + type='DDQDETRHead', + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)), + dqs_cfg=dict(type='nms', iou_threshold=0.8), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', 
with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.05), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)})) + +# learning policy +max_epochs = 12 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=False, + begin=0, + end=2000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/ddq/ddq-detr-4scale_swinl_8xb2-30e_coco.py b/mmdetection/configs/ddq/ddq-detr-4scale_swinl_8xb2-30e_coco.py new file mode 100644 index 0000000..d863649 --- /dev/null +++ b/mmdetection/configs/ddq/ddq-detr-4scale_swinl_8xb2-30e_coco.py @@ -0,0 +1,177 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa: E501 +model = dict( + type='DDQDETR', + num_queries=900, # num_matching_queries + # ratio of num_dense queries to num_queries + dense_topk_ratio=1.5, + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict( + type='ChannelMapper', + in_channels=[384, 768, 1536], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + # encoder class name: DeformableDetrTransformerEncoder + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, 
# 1024 for DeformDETR + ffn_drop=0.0))), # 0.1 for DeformDETR + # decoder class name: DDQTransformerDecoder + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, + dropout=0.0), # 0.1 for DeformDETR + cross_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0)), # 0.1 for DeformDETR + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, + normalize=True, + offset=0.0, # -0.5 for DeformDETR + temperature=20), # 10000 for DeformDETR + bbox_head=dict( + type='DDQDETRHead', + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)), + dqs_cfg=dict(type='nms', iou_threshold=0.8), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.05), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.05)})) + +# learning policy +max_epochs = 30 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=False, + begin=0, + end=2000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20, 26], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/ddq/ddq-detr-5scale_r50_8xb2-12e_coco.py b/mmdetection/configs/ddq/ddq-detr-5scale_r50_8xb2-12e_coco.py new file mode 100644 index 0000000..3c38f55 --- /dev/null +++ b/mmdetection/configs/ddq/ddq-detr-5scale_r50_8xb2-12e_coco.py @@ -0,0 +1,171 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +model = dict( + type='DDQDETR', + num_queries=900, # num_matching_queries + # ratio of num_dense queries to num_queries + dense_topk_ratio=1.5, + with_box_refine=True, + as_two_stage=True, + num_feature_levels=5, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[256, 512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=5), + # encoder class name: DeformableDetrTransformerEncoder + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=5, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0))), # 0.1 for DeformDETR + # decoder class name: DDQTransformerDecoder + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, + dropout=0.0), # 0.1 for DeformDETR + cross_attn_cfg=dict(embed_dims=256, num_levels=5, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0)), # 0.1 for DeformDETR + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, + normalize=True, + offset=0.0, # -0.5 for DeformDETR + temperature=20), # 10000 for DeformDETR + bbox_head=dict( + type='DDQDETRHead', + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)), + dqs_cfg=dict(type='nms', iou_threshold=0.8), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. 
+train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.05), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)})) + +# learning policy +max_epochs = 12 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=False, + begin=0, + end=2000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/ddq/metafile.yml b/mmdetection/configs/ddq/metafile.yml new file mode 100644 index 0000000..bd33abe --- /dev/null +++ b/mmdetection/configs/ddq/metafile.yml @@ -0,0 +1,56 @@ +Collections: + - Name: DDQ + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 8x A100 GPUs + Architecture: + - ResNet + - Transformer + Paper: + URL: https://arxiv.org/abs/2303.12776 + Title: 'Dense Distinct Query for End-to-End Object Detection' + README: configs/ddq/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/detectors/ddq_detr.py#L21 + Version: dev-3.x + +Models: + - Name: ddq-detr-4scale_r50_8xb2-12e_coco + In Collection: DDQ + Config: configs/ddq/ddq-detr-4scale_r50_8xb2-12e_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.4 + Weights: https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq-detr-4scale_r50_8xb2-12e_coco/ddq-detr-4scale_r50_8xb2-12e_coco_20230809_170711-42528127.pth + + - Name: ddq-detr-5scale_r50_8xb2-12e_coco + In Collection: DDQ + Config: configs/dino/ddq-detr-5scale_r50_8xb2-12e_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.1 + Weights: https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq_detr_5scale_coco_1x.pth + + - Name: ddq-detr-4scale_swinl_8xb2-30e_coco + In Collection: DDQ + Config: configs/dino/ddq-detr-4scale_swinl_8xb2-30e_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 58.7 + Weights: https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq_detr_swinl_30e.pth diff --git a/mmdetection/configs/deepfashion/README.md b/mmdetection/configs/deepfashion/README.md new file mode 100644 index 0000000..844e29d --- /dev/null +++ b/mmdetection/configs/deepfashion/README.md @@ -0,0 +1,70 @@ +# DeepFashion + +> [DeepFashion: Powering Robust Clothes Recognition and Retrieval With Rich Annotations](https://openaccess.thecvf.com/content_cvpr_2016/html/Liu_DeepFashion_Powering_Robust_CVPR_2016_paper.html) + + + +## Abstract + +Recent advances in clothes recognition have been driven by the construction of clothes datasets. Existing datasets are limited in the amount of annotations and are difficult to cope with the various challenges in real-world applications. In this work, we introduce DeepFashion, a large-scale clothes dataset with comprehensive annotations. It contains over 800,000 images, which are richly annotated with massive attributes, clothing landmarks, and correspondence of images taken under different scenarios including store, street snapshot, and consumer. Such rich annotations enable the development of powerful algorithms in clothes recognition and facilitating future researches. To demonstrate the advantages of DeepFashion, we propose a new deep model, namely FashionNet, which learns clothing features by jointly predicting clothing attributes and landmarks. The estimated landmarks are then employed to pool or gate the learned features. It is optimized in an iterative manner. Extensive experiments demonstrate the effectiveness of FashionNet and the usefulness of DeepFashion. + +
    + +## Introduction + +[MMFashion](https://github.com/open-mmlab/mmfashion) develops "fashion parsing and segmentation" module +based on the dataset +[DeepFashion-Inshop](https://drive.google.com/drive/folders/0B7EVK8r0v71pVDZFQXRsMDZCX1E?usp=sharing). +Its annotation follows COCO style. +To use it, you need to first download the data. Note that we only use "img_highres" in this task. +The file tree should be like this: + +```sh +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── DeepFashion +│ │ ├── In-shop +| │ │ ├── Anno +| │ │ │   ├── segmentation +| │ │ │   | ├── DeepFashion_segmentation_train.json +| │ │ │   | ├── DeepFashion_segmentation_query.json +| │ │ │   | ├── DeepFashion_segmentation_gallery.json +| │ │ │   ├── list_bbox_inshop.txt +| │ │ │   ├── list_description_inshop.json +| │ │ │   ├── list_item_inshop.txt +| │ │ │   └── list_landmarks_inshop.txt +| │ │ ├── Eval +| │ │ │ └── list_eval_partition.txt +| │ │ ├── Img +| │ │ │ ├── img +| │ │ │ │ ├──XXX.jpg +| │ │ │ ├── img_highres +| │ │ │ └── ├──XXX.jpg + +``` + +After that you can train the Mask RCNN r50 on DeepFashion-In-shop dataset by launching training with the `mask_rcnn_r50_fpn_1x.py` config +or creating your own config file. + +## Results and Models + +| Backbone | Model type | Dataset | bbox detection Average Precision | segmentation Average Precision | Config | Download (Google) | +| :------: | :--------: | :-----------------: | :------------------------------: | :----------------------------: | :----------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| ResNet50 | Mask RCNN | DeepFashion-In-shop | 0.599 | 0.584 | [config](./mask-rcnn_r50_fpn_15e_deepfashion.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion/mask_rcnn_r50_fpn_15e_deepfashion_20200329_192752.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/deepfashion/mask_rcnn_r50_fpn_15e_deepfashion/20200329_192752.log.json) | + +## Citation + +```latex +@inproceedings{liuLQWTcvpr16DeepFashion, + author = {Liu, Ziwei and Luo, Ping and Qiu, Shi and Wang, Xiaogang and Tang, Xiaoou}, + title = {DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations}, + booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + month = {June}, + year = {2016} +} +``` diff --git a/mmdetection/configs/deepfashion/mask-rcnn_r50_fpn_15e_deepfashion.py b/mmdetection/configs/deepfashion/mask-rcnn_r50_fpn_15e_deepfashion.py new file mode 100644 index 0000000..403b18a --- /dev/null +++ b/mmdetection/configs/deepfashion/mask-rcnn_r50_fpn_15e_deepfashion.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/deepfashion.py', '../_base_/schedules/schedule_1x.py', + '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict(num_classes=15), mask_head=dict(num_classes=15))) +# runtime settings +max_epochs = 15 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) 
+] diff --git a/mmdetection/configs/deepsort/README.md b/mmdetection/configs/deepsort/README.md new file mode 100644 index 0000000..e50ec17 --- /dev/null +++ b/mmdetection/configs/deepsort/README.md @@ -0,0 +1,109 @@ +# Simple online and realtime tracking with a deep association metric + +## Abstract + + + +Simple Online and Realtime Tracking (SORT) is a pragmatic approach to multiple object tracking with a focus on simple, effective algorithms. In this paper, we integrate appearance information to improve the performance of SORT. Due to this extension we are able to track objects through longer periods of occlusions, effectively reducing the number of identity switches. In spirit of the original framework we place much of the computational complexity into an offline pre-training stage where we learn a deep association metric on a largescale person re-identification dataset. During online application, we establish measurement-to-track associations using nearest neighbor queries in visual appearance space. Experimental evaluation shows that our extensions reduce the number of identity switches by 45%, achieving overall competitive performance at high frame rates. + + + +
    + +## Results and models on MOT17 + +Currently we do not support training ReID models for DeepSORT. +We directly use the ReID model from [Tracktor](https://github.com/phil-bergmann/tracking_wo_bnw). These missed features will be supported in the future. + +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :------: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :--------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| DeepSORT | R50-FasterRCNN-FPN | R50 | half-train | half-val | N | 13.8 | 57.0 | 63.7 | 69.5 | 15063 | 40323 | 3276 | [config](deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth) | + +## Get started + +### 1. Development Environment Setup + +Tracking Development Environment Setup can refer to this [document](../../docs/en/get_started.md). + +### 2. Dataset Prepare + +Tracking Dataset Prepare can refer to this [document](../../docs/en/user_guides/tracking_dataset_prepare.md). + +### 3. Training + +We implement DeepSORT with independent detector and ReID models. +Note that, due to the influence of parameters such as learning rate in default configuration file, +we recommend using 8 GPUs for training in order to reproduce accuracy. + +You can train the detector as follows. + +```shell script +# Training Faster R-CNN on mot17-half-train dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +bash tools/dist_train.sh configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 4. Testing and evaluation + +### 4.1 Example on MOTxx-halfval dataset + +**4.1.1 use separate trained detector and reid model to evaluating and testing** + +```shell +# Example 1: Test on motXX-half-val set. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +bash tools/dist_test_tracking.sh configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 --detector ${DETECTOR_CHECKPOINT_PATH} --reid ${REID_CHECKPOINT_PATH} +``` + +**4.1.2 use video_baesd to evaluating and testing** + +we also provide two_ways(img_based or video_based) to evaluating and testing. +if you want to use video_based to evaluating and testing, you can modify config as follows + +``` +val_dataloader = dict( + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False)) +``` + +### 4.2 Example on MOTxx-test dataset + +If you want to get the results of the [MOT Challenge](https://motchallenge.net/) test set, +please use the following command to generate result files that can be used for submission. +It will be stored in `./mot_17_test_res`, you can modify the saved path in `test_evaluator` of the config. 
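For example, to change where the submission files are written, override `outfile_prefix` in your config. The default used by this setup is copied below from `deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py` later in this patch.

```python
# evaluator: format_only=True only writes MOT Challenge result files
# (no metrics are computed locally); outfile_prefix controls the save path
test_evaluator = dict(format_only=True, outfile_prefix='./mot_17_test_res')
```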
+ +```shell script +# Example 2: Test on motxx-test set +# The number after config file represents the number of GPUs used +bash tools/dist_test_tracking.sh configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test 8 --detector ${DETECTOR_CHECKPOINT_PATH} --reid ${REID_CHECKPOINT_PATH} +``` + +If you want to know about more detailed usage of `test_tracking.py/dist_test_tracking.sh/slurm_test_tracking.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 5.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/mot_demo.py demo/demo_mot.mp4 configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test --detector ${DETECTOR_CHECKPOINT_PATH} --reid ${REID_CHECKPOINT_PATH} --out mot.mp4 +``` + +## Citation + + + +```latex +@inproceedings{wojke2017simple, + title={Simple online and realtime tracking with a deep association metric}, + author={Wojke, Nicolai and Bewley, Alex and Paulus, Dietrich}, + booktitle={2017 IEEE international conference on image processing (ICIP)}, + pages={3645--3649}, + year={2017}, + organization={IEEE} +} +``` diff --git a/mmdetection/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/mmdetection/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..70d3393 --- /dev/null +++ b/mmdetection/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,85 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/mot_challenge.py', '../_base_/default_runtime.py' +] + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=1), + visualization=dict(type='TrackVisualizationHook', draw=False)) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer') +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type='SyncBuffersHook') +] + +detector = _base_.model +detector.pop('data_preprocessor') +detector.rpn_head.bbox_coder.update(dict(clip_border=False)) +detector.roi_head.bbox_head.update(dict(num_classes=1)) +detector.roi_head.bbox_head.bbox_coder.update(dict(clip_border=False)) +detector['init_cfg'] = dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/faster_rcnn/' + 'faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth') +del _base_.model + +model = dict( + type='DeepSORT', + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + detector=detector, + reid=dict( + type='BaseReID', + data_preprocessor=dict(type='mmpretrain.ClsDataPreprocessor'), + backbone=dict( + type='mmpretrain.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmpretrain.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 
'https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth' # noqa: E501 + )), + tracker=dict( + type='SORTTracker', + motion=dict(type='KalmanFilter', center_only=False), + obj_score_thr=0.5, + reid=dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr=0.5, + momentums=None, + num_tentatives=2, + num_frames_retain=100)) + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/mmdetection/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py b/mmdetection/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py new file mode 100644 index 0000000..687ce7a --- /dev/null +++ b/mmdetection/configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py @@ -0,0 +1,15 @@ +_base_ = [ + './deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] + +# dataloader +val_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json')) +test_dataloader = dict( + dataset=dict( + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'))) + +# evaluator +test_evaluator = dict(format_only=True, outfile_prefix='./mot_17_test_res') diff --git a/mmdetection/configs/deepsort/metafile.yml b/mmdetection/configs/deepsort/metafile.yml new file mode 100644 index 0000000..2feb358 --- /dev/null +++ b/mmdetection/configs/deepsort/metafile.yml @@ -0,0 +1,37 @@ +Collections: + - Name: DeepSORT + Metadata: + Training Techniques: + - SGD with Momentum + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - FPN + Paper: + URL: https://arxiv.org/abs/1703.07402 + Title: Simple Online and Realtime Tracking with a Deep Association Metric + README: configs/deepsort/README.md + +Models: + - Name: deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval + In Collection: DeepSORT + Config: configs/deepsort/deepsort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py + Metadata: + Training Data: MOT17-half-train + inference time (ms/im): + - value: 72.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (640, 1088) + Results: + - Task: Multiple Object Tracking + Dataset: MOT17-half-val + Metrics: + MOTA: 63.7 + IDF1: 69.5 + HOTA: 57.0 + Weights: + - https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth + - https://download.openmmlab.com/mmtracking/mot/reid/tracktor_reid_r50_iter25245-a452f51f.pth diff --git a/mmdetection/configs/deformable_detr/README.md b/mmdetection/configs/deformable_detr/README.md new file mode 100644 index 0000000..ca897cd --- /dev/null +++ b/mmdetection/configs/deformable_detr/README.md @@ -0,0 +1,41 @@ +# Deformable DETR + +> [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) + + + +## Abstract + +DETR has been recently proposed to eliminate the need for many hand-designed components in object detection while demonstrating good performance. However, it suffers from slow convergence and limited feature spatial resolution, due to the limitation of Transformer attention modules in processing image feature maps. To mitigate these issues, we proposed Deformable DETR, whose attention modules only attend to a small set of key sampling points around a reference. 
Deformable DETR can achieve better performance than DETR (especially on small objects) with 10 times fewer training epochs. Extensive experiments on the COCO benchmark demonstrate the effectiveness of our approach. +
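The two stronger rows in the results table below are one-line additions on top of the base config; the excerpts are copied from the corresponding config files later in this patch.

```python
# deformable-detr-refine_r50_16xb2-50e_coco.py (iterative bounding box refinement)
_base_ = 'deformable-detr_r50_16xb2-50e_coco.py'
model = dict(with_box_refine=True)

# deformable-detr-refine-twostage_r50_16xb2-50e_coco.py (two-stage variant,
# which in turn builds on the refine config above)
_base_ = 'deformable-detr-refine_r50_16xb2-50e_coco.py'
model = dict(as_two_stage=True)
```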
    + +## Results and Models + +| Backbone | Model | Lr schd | box AP | Config | Download | +| :------: | :---------------------------------: | :-----: | :----: | :---------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | Deformable DETR | 50e | 44.3 | [config](./deformable-detr_r50_16xb2-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr_r50_16xb2-50e_coco/deformable-detr_r50_16xb2-50e_coco_20221029_210934-6bc7d21b.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr_r50_16xb2-50e_coco/deformable-detr_r50_16xb2-50e_coco_20221029_210934.log.json) | +| R-50 | + iterative bounding box refinement | 50e | 46.2 | [config](./deformable-detr-refine_r50_16xb2-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco/deformable-detr-refine_r50_16xb2-50e_coco_20221022_225303-844e0f93.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco/deformable-detr-refine_r50_16xb2-50e_coco_20221022_225303.log.json) | +| R-50 | ++ two-stage Deformable DETR | 50e | 47.0 | [config](./deformable-detr-refine-twostage_r50_16xb2-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco/deformable-detr-refine-twostage_r50_16xb2-50e_coco_20221021_184714-acc8a5ff.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco/deformable-detr-refine-twostage_r50_16xb2-50e_coco_20221021_184714.log.json) | + +### NOTE + +1. All models are trained with batch size 32. +2. The performance is unstable. `Deformable DETR` and `iterative bounding box refinement` may fluctuate about 0.3 mAP. `two-stage Deformable DETR` may fluctuate about 0.2 mAP. + +## Citation + +We provide the config files for Deformable DETR: [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159). 
+ +```latex +@inproceedings{ +zhu2021deformable, +title={Deformable DETR: Deformable Transformers for End-to-End Object Detection}, +author={Xizhou Zhu and Weijie Su and Lewei Lu and Bin Li and Xiaogang Wang and Jifeng Dai}, +booktitle={International Conference on Learning Representations}, +year={2021}, +url={https://openreview.net/forum?id=gZ9hCDWe6ke} +} +``` diff --git a/mmdetection/configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py b/mmdetection/configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py new file mode 100644 index 0000000..eeb67fc --- /dev/null +++ b/mmdetection/configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py @@ -0,0 +1,2 @@ +_base_ = 'deformable-detr-refine_r50_16xb2-50e_coco.py' +model = dict(as_two_stage=True) diff --git a/mmdetection/configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py b/mmdetection/configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py new file mode 100644 index 0000000..b968674 --- /dev/null +++ b/mmdetection/configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py @@ -0,0 +1,2 @@ +_base_ = 'deformable-detr_r50_16xb2-50e_coco.py' +model = dict(with_box_refine=True) diff --git a/mmdetection/configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py b/mmdetection/configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py new file mode 100644 index 0000000..e0dee41 --- /dev/null +++ b/mmdetection/configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py @@ -0,0 +1,156 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +model = dict( + type='DeformableDETR', + num_queries=300, + num_feature_levels=4, + with_box_refine=False, + as_two_stage=False, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.1))), + decoder=dict( # DeformableDetrTransformerDecoder + num_layers=6, + return_intermediate=True, + layer_cfg=dict( # DeformableDetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.1)), + post_norm_cfg=None), + positional_encoding=dict(num_feats=128, normalize=True, offset=-0.5), + bbox_head=dict( + type='DeformableDETRHead', + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + 
train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=100)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] +train_dataloader = dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1) + })) + +# learning policy +max_epochs = 50 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[40], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (16 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=32) diff --git a/mmdetection/configs/deformable_detr/metafile.yml b/mmdetection/configs/deformable_detr/metafile.yml new file mode 100644 index 0000000..a30c979 --- /dev/null +++ b/mmdetection/configs/deformable_detr/metafile.yml @@ -0,0 +1,56 @@ +Collections: + - Name: Deformable DETR + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - Transformer + Paper: + URL: https://openreview.net/forum?id=gZ9hCDWe6ke + Title: 'Deformable DETR: Deformable Transformers for End-to-End Object Detection' + README: configs/deformable_detr/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/deformable_detr.py#L6 + Version: v2.12.0 + +Models: + - Name: deformable-detr_r50_16xb2-50e_coco + In Collection: Deformable DETR + Config: configs/deformable_detr/deformable-detr_r50_16xb2-50e_coco.py + Metadata: + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr_r50_16xb2-50e_coco/deformable-detr_r50_16xb2-50e_coco_20221029_210934-6bc7d21b.pth + + - Name: deformable-detr-refine_r50_16xb2-50e_coco + In Collection: Deformable DETR + Config: configs/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco.py + Metadata: + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.2 + Weights: https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr-refine_r50_16xb2-50e_coco/deformable-detr-refine_r50_16xb2-50e_coco_20221022_225303-844e0f93.pth + + - Name: deformable-detr-refine-twostage_r50_16xb2-50e_coco + In Collection: Deformable DETR + Config: configs/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py + Metadata: + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.0 + Weights: https://download.openmmlab.com/mmdetection/v3.0/deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco/deformable-detr-refine-twostage_r50_16xb2-50e_coco_20221021_184714-acc8a5ff.pth diff --git a/mmdetection/configs/detectors/README.md b/mmdetection/configs/detectors/README.md new file mode 100644 index 0000000..2918d6e --- /dev/null +++ b/mmdetection/configs/detectors/README.md @@ -0,0 +1,69 @@ +# DetectoRS + +> [DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution](https://arxiv.org/abs/2006.02334) + + + +## Abstract + +Many modern object detectors demonstrate outstanding performances by using the mechanism of looking and thinking twice. In this paper, we explore this mechanism in the backbone design for object detection. At the macro level, we propose Recursive Feature Pyramid, which incorporates extra feedback connections from Feature Pyramid Networks into the bottom-up backbone layers. At the micro level, we propose Switchable Atrous Convolution, which convolves the features with different atrous rates and gathers the results using switch functions. Combining them results in DetectoRS, which significantly improves the performances of object detection. On COCO test-dev, DetectoRS achieves state-of-the-art 55.7% box AP for object detection, 48.5% mask AP for instance segmentation, and 50.0% PQ for panoptic segmentation. + +
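Both components correspond to a small number of config fields. The SAC excerpt below is copied from `configs/detectors/cascade-rcnn_r50-sac_1x_coco.py` later in this patch; RFP is enabled analogously through `neck=dict(type='RFP', ...)` with an extra `rfp_backbone`, as shown in `cascade-rcnn_r50-rfp_1x_coco.py`.

```python
# Excerpt from configs/detectors/cascade-rcnn_r50-sac_1x_coco.py:
# SAC is applied in the backbone and switched on per ResNet stage.
model = dict(
    backbone=dict(
        type='DetectoRS_ResNet',
        conv_cfg=dict(type='ConvAWS'),
        sac=dict(type='SAC', use_deform=True),
        stage_with_sac=(False, True, True, True)))
```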
    + +## Introduction + +DetectoRS requires COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) dataset for training. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +| | ├── stuffthingmaps +``` + +## Results and Models + +DetectoRS includes two major components: + +- Recursive Feature Pyramid (RFP). +- Switchable Atrous Convolution (SAC). + +They can be used independently. +Combining them together results in DetectoRS. +The results on COCO 2017 val are shown in the below table. + +| Method | Detector | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------: | :-----------------: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RFP | Cascade + ResNet-50 | 1x | 7.5 | - | 44.8 | | [config](./cascade-rcnn_r50-rfp_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco-8cf51bfd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco_20200624_104126.log.json) | +| SAC | Cascade + ResNet-50 | 1x | 5.6 | - | 45.0 | | [config](./cascade-rcnn_r50-sac_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco-24bfda62.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco_20200624_104402.log.json) | +| DetectoRS | Cascade + ResNet-50 | 1x | 9.9 | - | 47.4 | | [config](./detectors_cascade-rcnn_r50_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco-32a10ba0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco_20200706_001203.log.json) | +| RFP | HTC + ResNet-50 | 1x | 11.2 | - | 46.6 | 40.9 | [config](./htc_r50-rfp_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco-8ff87c51.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco_20200624_103053.log.json) | +| SAC | HTC + ResNet-50 | 1x | 9.3 | - | 46.4 | 40.9 | [config](./htc_r50-sac_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco-bfa60c54.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco_20200624_103111.log.json) | +| DetectoRS | HTC + ResNet-50 | 1x | 13.6 | - | 49.1 | 42.6 | [config](./detectors_htc-r50_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco-329b1453.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco_20200624_103659.log.json) | +| DetectoRS | HTC + ResNet-101 | 20e | 19.6 | | 50.5 | 43.9 | [config](./detectors_htc-r101_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r101_20e_coco/detectors_htc_r101_20e_coco_20210419_203638-348d533b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r101_20e_coco/detectors_htc_r101_20e_coco_20210419_203638.log.json) | + +*Note*: This is a re-implementation based on MMDetection-V2. +The original implementation is based on MMDetection-V1. + +## Citation + +We provide the config files for [DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution](https://arxiv.org/pdf/2006.02334.pdf). + +```latex +@article{qiao2020detectors, + title={DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution}, + author={Qiao, Siyuan and Chen, Liang-Chieh and Yuille, Alan}, + journal={arXiv preprint arXiv:2006.02334}, + year={2020} +} +``` diff --git a/mmdetection/configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py b/mmdetection/configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py new file mode 100644 index 0000000..c30c84d --- /dev/null +++ b/mmdetection/configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py @@ -0,0 +1,28 @@ +_base_ = [ + '../_base_/models/cascade-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + pretrained='torchvision://resnet50', + style='pytorch'))) diff --git a/mmdetection/configs/detectors/cascade-rcnn_r50-sac_1x_coco.py b/mmdetection/configs/detectors/cascade-rcnn_r50-sac_1x_coco.py new file mode 100644 index 0000000..24d6cd3 --- /dev/null +++ b/mmdetection/configs/detectors/cascade-rcnn_r50-sac_1x_coco.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/cascade-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True))) diff --git a/mmdetection/configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py b/mmdetection/configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py new file mode 100644 index 0000000..19d13d9 --- /dev/null +++ b/mmdetection/configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py @@ -0,0 +1,32 @@ +_base_ = [ + '../_base_/models/cascade-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, 
+ type='DetectoRS_ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + pretrained='torchvision://resnet50', + style='pytorch'))) diff --git a/mmdetection/configs/detectors/detectors_htc-r101_20e_coco.py b/mmdetection/configs/detectors/detectors_htc-r101_20e_coco.py new file mode 100644 index 0000000..93d7d2b --- /dev/null +++ b/mmdetection/configs/detectors/detectors_htc-r101_20e_coco.py @@ -0,0 +1,28 @@ +_base_ = '../htc/htc_r101_fpn_20e_coco.py' + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + pretrained='torchvision://resnet101', + style='pytorch'))) diff --git a/mmdetection/configs/detectors/detectors_htc-r50_1x_coco.py b/mmdetection/configs/detectors/detectors_htc-r50_1x_coco.py new file mode 100644 index 0000000..0d2fc4f --- /dev/null +++ b/mmdetection/configs/detectors/detectors_htc-r50_1x_coco.py @@ -0,0 +1,28 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True), + pretrained='torchvision://resnet50', + style='pytorch'))) diff --git a/mmdetection/configs/detectors/htc_r50-rfp_1x_coco.py b/mmdetection/configs/detectors/htc_r50-rfp_1x_coco.py new file mode 100644 index 0000000..496104e --- /dev/null +++ b/mmdetection/configs/detectors/htc_r50-rfp_1x_coco.py @@ -0,0 +1,24 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + output_img=True), + neck=dict( + type='RFP', + rfp_steps=2, + aspp_out_channels=64, + aspp_dilations=(1, 3, 6, 1), + rfp_backbone=dict( + rfp_inplanes=256, + type='DetectoRS_ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + conv_cfg=dict(type='ConvAWS'), + pretrained='torchvision://resnet50', + style='pytorch'))) diff --git a/mmdetection/configs/detectors/htc_r50-sac_1x_coco.py b/mmdetection/configs/detectors/htc_r50-sac_1x_coco.py new file mode 100644 index 0000000..72d4db9 --- /dev/null +++ b/mmdetection/configs/detectors/htc_r50-sac_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' + +model = dict( + backbone=dict( + type='DetectoRS_ResNet', + conv_cfg=dict(type='ConvAWS'), + 
sac=dict(type='SAC', use_deform=True), + stage_with_sac=(False, True, True, True))) diff --git a/mmdetection/configs/detectors/metafile.yml b/mmdetection/configs/detectors/metafile.yml new file mode 100644 index 0000000..196a1ce --- /dev/null +++ b/mmdetection/configs/detectors/metafile.yml @@ -0,0 +1,114 @@ +Collections: + - Name: DetectoRS + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ASPP + - FPN + - RFP + - RPN + - ResNet + - RoIAlign + - SAC + Paper: + URL: https://arxiv.org/abs/2006.02334 + Title: 'DetectoRS: Detecting Objects with Recursive Feature Pyramid and Switchable Atrous Convolution' + README: configs/detectors/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/backbones/detectors_resnet.py#L205 + Version: v2.2.0 + +Models: + - Name: cascade-rcnn_r50-rfp_1x_coco + In Collection: DetectoRS + Config: configs/detectors/cascade-rcnn_r50-rfp_1x_coco.py + Metadata: + Training Memory (GB): 7.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_rfp_1x_coco/cascade_rcnn_r50_rfp_1x_coco-8cf51bfd.pth + + - Name: cascade-rcnn_r50-sac_1x_coco + In Collection: DetectoRS + Config: configs/detectors/cascade-rcnn_r50-sac_1x_coco.py + Metadata: + Training Memory (GB): 5.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/cascade_rcnn_r50_sac_1x_coco/cascade_rcnn_r50_sac_1x_coco-24bfda62.pth + + - Name: detectors_cascade-rcnn_r50_1x_coco + In Collection: DetectoRS + Config: configs/detectors/detectors_cascade-rcnn_r50_1x_coco.py + Metadata: + Training Memory (GB): 9.9 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_cascade_rcnn_r50_1x_coco/detectors_cascade_rcnn_r50_1x_coco-32a10ba0.pth + + - Name: htc_r50-rfp_1x_coco + In Collection: DetectoRS + Config: configs/detectors/htc_r50-rfp_1x_coco.py + Metadata: + Training Memory (GB): 11.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_rfp_1x_coco/htc_r50_rfp_1x_coco-8ff87c51.pth + + - Name: htc_r50-sac_1x_coco + In Collection: DetectoRS + Config: configs/detectors/htc_r50-sac_1x_coco.py + Metadata: + Training Memory (GB): 9.3 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/htc_r50_sac_1x_coco/htc_r50_sac_1x_coco-bfa60c54.pth + + - Name: detectors_htc-r50_1x_coco + In Collection: DetectoRS + Config: configs/detectors/detectors_htc-r50_1x_coco.py + Metadata: + Training Memory (GB): 13.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/detectors/detectors_htc_r50_1x_coco/detectors_htc_r50_1x_coco-329b1453.pth diff --git a/mmdetection/configs/detr/README.md b/mmdetection/configs/detr/README.md new 
file mode 100644 index 0000000..8e843f3 --- /dev/null +++ b/mmdetection/configs/detr/README.md @@ -0,0 +1,37 @@ +# DETR + +> [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) + + + +## Abstract + +We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries, DETR reasons about the relations of the objects and the global image context to directly output the final set of predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive baselines. + +
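The "set-based global loss that forces unique predictions via bipartite matching" boils down to solving a linear assignment problem over a cost matrix built from classification and box costs, which is what the `HungarianAssigner` / `match_costs` entries in the config below encode. A minimal sketch is shown here, assuming SciPy is available; the function name `hungarian_match` and the omission of the GIoU cost term are simplifications, not mmdet's implementation.

```python
# Minimal sketch of bipartite matching between queries and ground truth
# (illustrative only, not mmdet's `HungarianAssigner`).
import numpy as np
from scipy.optimize import linear_sum_assignment


def hungarian_match(pred_probs, pred_boxes, gt_labels, gt_boxes,
                    cls_weight=1.0, l1_weight=5.0):
    """Return (query_idx, gt_idx) pairs that minimise the total matching cost.

    pred_probs: (num_queries, num_classes) class probabilities
    pred_boxes, gt_boxes: boxes in normalised cxcywh format
    """
    # Classification cost: negative probability of each ground-truth class.
    cls_cost = -pred_probs[:, gt_labels]                      # (num_queries, num_gt)
    # Box cost: L1 distance between every prediction / ground-truth pair.
    l1_cost = np.abs(pred_boxes[:, None] - gt_boxes[None]).sum(-1)
    cost = cls_weight * cls_cost + l1_weight * l1_cost
    return linear_sum_assignment(cost)                        # one-to-one matching


rng = np.random.default_rng(0)
probs = rng.random((100, 80))            # 100 queries, 80 classes
boxes = rng.random((100, 4))
gt_labels = np.array([3, 17])
gt_boxes = rng.random((2, 4))
print(hungarian_match(probs, boxes, gt_labels, gt_boxes))
```

Each ground-truth box is matched to exactly one query, and all unmatched queries are supervised towards the "no object" class.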
    + +## Results and Models + +| Backbone | Model | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :---: | :-----: | :------: | :------------: | :----: | :------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | DETR | 150e | 7.9 | | 39.9 | [config](./detr_r50_8xb2-150e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/detr/detr_r50_8xb2-150e_coco/detr_r50_8xb2-150e_coco_20221023_153551-436d03e8.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detr/detr_r50_8xb2-150e_coco/detr_r50_8xb2-150e_coco_20221023_153551.log.json) | + +## Citation + +We provide the config files for DETR: [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872). + +```latex +@inproceedings{detr, + author = {Nicolas Carion and + Francisco Massa and + Gabriel Synnaeve and + Nicolas Usunier and + Alexander Kirillov and + Sergey Zagoruyko}, + title = {End-to-End Object Detection with Transformers}, + booktitle = {ECCV}, + year = {2020} +} +``` diff --git a/mmdetection/configs/detr/detr_r101_8xb2-500e_coco.py b/mmdetection/configs/detr/detr_r101_8xb2-500e_coco.py new file mode 100644 index 0000000..6661aac --- /dev/null +++ b/mmdetection/configs/detr/detr_r101_8xb2-500e_coco.py @@ -0,0 +1,7 @@ +_base_ = './detr_r50_8xb2-500e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/detr/detr_r18_8xb2-500e_coco.py b/mmdetection/configs/detr/detr_r18_8xb2-500e_coco.py new file mode 100644 index 0000000..305b9d6 --- /dev/null +++ b/mmdetection/configs/detr/detr_r18_8xb2-500e_coco.py @@ -0,0 +1,7 @@ +_base_ = './detr_r50_8xb2-500e_coco.py' + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[512])) diff --git a/mmdetection/configs/detr/detr_r50_8xb2-150e_coco.py b/mmdetection/configs/detr/detr_r50_8xb2-150e_coco.py new file mode 100644 index 0000000..aaa1541 --- /dev/null +++ b/mmdetection/configs/detr/detr_r50_8xb2-150e_coco.py @@ -0,0 +1,155 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +model = dict( + type='DETR', + num_queries=100, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=None, + num_outs=1), + encoder=dict( # DetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True)))), + decoder=dict( # DetrTransformerDecoder + num_layers=6, + layer_cfg=dict( # 
DetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True))), + return_intermediate=True), + positional_encoding=dict(num_feats=128, normalize=True), + bbox_head=dict( + type='DETRHead', + num_classes=80, + embed_dims=256, + loss_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='ClassificationCost', weight=1.), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=100)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) + +# learning policy +max_epochs = 150 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[100], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/detr/detr_r50_8xb2-500e_coco.py b/mmdetection/configs/detr/detr_r50_8xb2-500e_coco.py new file mode 100644 index 0000000..f07d5dc --- /dev/null +++ b/mmdetection/configs/detr/detr_r50_8xb2-500e_coco.py @@ -0,0 +1,24 @@ +_base_ = './detr_r50_8xb2-150e_coco.py' + +# learning policy +max_epochs = 500 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=10) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[334], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks = dict(checkpoint=dict(max_keep_ckpts=2)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/detr/metafile.yml b/mmdetection/configs/detr/metafile.yml new file mode 100644 index 0000000..a9132df --- /dev/null +++ b/mmdetection/configs/detr/metafile.yml @@ -0,0 +1,33 @@ +Collections: + - Name: DETR + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - Transformer + Paper: + URL: https://arxiv.org/abs/2005.12872 + Title: 'End-to-End Object Detection with Transformers' + README: configs/detr/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/detectors/detr.py#L7 + Version: v2.7.0 + +Models: + - Name: detr_r50_8xb2-150e_coco + In Collection: DETR + Config: configs/detr/detr_r50_8xb2-150e_coco.py + Metadata: + Training Memory (GB): 7.9 + Epochs: 150 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/detr/detr_r50_8xb2-150e_coco/detr_r50_8xb2-150e_coco_20221023_153551-436d03e8.pth diff --git a/mmdetection/configs/dino/README.md b/mmdetection/configs/dino/README.md new file mode 100644 index 0000000..d8a01bd --- /dev/null +++ b/mmdetection/configs/dino/README.md @@ -0,0 +1,40 @@ +# DINO + +> [DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection](https://arxiv.org/abs/2203.03605) + + + +## Abstract + +We present DINO (DETR with Improved deNoising anchOr boxes), a state-of-the-art end-to-end object detector. DINO improves over previous DETR-like models in performance and efficiency by using a contrastive way for denoising training, a mixed query selection method for anchor initialization, and a look forward twice scheme for box prediction. DINO achieves 49.4AP in 12 epochs and 51.3AP in 24 epochs on COCO with a ResNet-50 backbone and multi-scale features, yielding a significant improvement of +6.0AP and +2.7AP, respectively, compared to DN-DETR, the previous best DETR-like model. DINO scales well in both model size and data size. Without bells and whistles, after pre-training on the Objects365 dataset with a SwinL backbone, DINO obtains the best results on both COCO val2017 (63.2AP) and test-dev (63.3AP). Compared to other models on the leaderboard, DINO significantly reduces its model size and pre-training data size while achieving better results. + +
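The contrastive denoising training mentioned above corresponds to the `dn_cfg` block in the configs below: noised copies of the ground-truth boxes are fed to the decoder as extra queries, with lightly noised copies treated as positives and heavily noised copies as negatives. The sketch below only illustrates that idea under assumed noise ranges; the helper name `make_denoising_boxes` and the 0.5 split between positive and negative jitter are illustrative, and mmdet's actual denoising query generator also noises labels and builds the corresponding attention masks.

```python
# Minimal sketch of contrastive denoising query construction
# (illustrative only, not mmdet's dn query generator).
import torch


def make_denoising_boxes(gt_boxes, box_noise_scale=1.0):
    """Build lightly noised (positive) and heavily noised (negative) gt copies.

    gt_boxes: (num_gt, 4) boxes in normalised cxcywh format.
    """
    wh = gt_boxes[:, 2:].repeat(1, 2)        # jitter proportional to box size
    # Positive queries: jitter each coordinate by at most half the box extent.
    pos_noise = (torch.rand_like(gt_boxes) * 2 - 1) * 0.5 * wh * box_noise_scale
    positives = (gt_boxes + pos_noise).clamp(0, 1)
    # Negative queries: jitter between 0.5x and 1.0x of the box extent, so they
    # stay near the object but are clearly worse than the positives.
    sign = torch.rand_like(gt_boxes).round() * 2 - 1
    neg_noise = sign * (0.5 + 0.5 * torch.rand_like(gt_boxes)) * wh * box_noise_scale
    negatives = (gt_boxes + neg_noise).clamp(0, 1)
    # Both sets are fed to the decoder as extra queries during training only.
    return positives, negatives


gt = torch.tensor([[0.50, 0.50, 0.20, 0.30]])
pos, neg = make_denoising_boxes(gt, box_noise_scale=1.0)
print(pos, neg)
```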
    + +## Results and Models + +| Backbone | Model | Lr schd | Better-Hyper | box AP | Config | Download | +| :------: | :---------: | :-----: | :----------: | :----: | :---------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | DINO-4scale | 12e | False | 49.0 | [config](./dino-4scale_r50_8xb2-12e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/dino/dino-4scale_r50_8xb2-12e_coco/dino-4scale_r50_8xb2-12e_coco_20221202_182705-55b2bba2.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/dino/dino-4scale_r50_8xb2-12e_coco/dino-4scale_r50_8xb2-12e_coco_20221202_182705.log.json) | +| R-50 | DINO-4scale | 12e | True | 50.1 | [config](./dino-4scale_r50_improved_8xb2-12e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/dino/dino-4scale_r50_improved_8xb2-12e_coco/dino-4scale_r50_improved_8xb2-12e_coco_20230818_162607-6f47a913.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/dino/dino-4scale_r50_improved_8xb2-12e_coco/dino-4scale_r50_improved_8xb2-12e_coco_20230818_162607.log.json) | +| Swin-L | DINO-5scale | 12e | False | 57.2 | [config](./dino-5scale_swin-l_8xb2-12e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/dino/dino-5scale_swin-l_8xb2-12e_coco/dino-5scale_swin-l_8xb2-12e_coco_20230228_072924-a654145f.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/dino/dino-5scale_swin-l_8xb2-12e_coco/dino-5scale_swin-l_8xb2-12e_coco_20230228_072924.log) | +| Swin-L | DINO-5scale | 36e | False | 58.4 | [config](./dino-5scale_swin-l_8xb2-36e_coco.py) | [model](https://github.com/RistoranteRist/mmlab-weights/releases/download/dino-swinl/dino-5scale_swin-l_8xb2-36e_coco-5486e051.pth) \| [log](https://github.com/RistoranteRist/mmlab-weights/releases/download/dino-swinl/20230307_032359.log) | + +### NOTE + +The performance is unstable. `DINO-4scale` with `R-50` may fluctuate about 0.4 mAP. + +## Citation + +We provide the config files for DINO: [DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection](https://arxiv.org/abs/2203.03605). + +```latex +@misc{zhang2022dino, + title={DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection}, + author={Hao Zhang and Feng Li and Shilong Liu and Lei Zhang and Hang Su and Jun Zhu and Lionel M. 
Ni and Heung-Yeung Shum}, + year={2022}, + eprint={2203.03605}, + archivePrefix={arXiv}, + primaryClass={cs.CV}} +``` diff --git a/mmdetection/configs/dino/dino-4scale_r50_8xb2-12e_coco.py b/mmdetection/configs/dino/dino-4scale_r50_8xb2-12e_coco.py new file mode 100644 index 0000000..5831f89 --- /dev/null +++ b/mmdetection/configs/dino/dino-4scale_r50_8xb2-12e_coco.py @@ -0,0 +1,163 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +model = dict( + type='DINO', + num_queries=900, # num_matching_queries + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0))), # 0.1 for DeformDETR + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, + dropout=0.0), # 0.1 for DeformDETR + cross_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0)), # 0.1 for DeformDETR + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, + normalize=True, + offset=0.0, # -0.5 for DeformDETR + temperature=20), # 10000 for DeformDETR + bbox_head=dict( + type='DINOHead', + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), # 2.0 in DeformDETR + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + dn_cfg=dict( # TODO: Move to model.train_cfg ? + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, + num_dn_queries=100)), # TODO: half num_dn_queries + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) # 100 for DeformDETR + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. 
+train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] +train_dataloader = dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001, # 0.0002 for DeformDETR + weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) +) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa + +# learning policy +max_epochs = 12 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/dino/dino-4scale_r50_8xb2-24e_coco.py b/mmdetection/configs/dino/dino-4scale_r50_8xb2-24e_coco.py new file mode 100644 index 0000000..8534ac6 --- /dev/null +++ b/mmdetection/configs/dino/dino-4scale_r50_8xb2-24e_coco.py @@ -0,0 +1,13 @@ +_base_ = './dino-4scale_r50_8xb2-12e_coco.py' +max_epochs = 24 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20], + gamma=0.1) +] diff --git a/mmdetection/configs/dino/dino-4scale_r50_8xb2-36e_coco.py b/mmdetection/configs/dino/dino-4scale_r50_8xb2-36e_coco.py new file mode 100644 index 0000000..1c2cf46 --- /dev/null +++ b/mmdetection/configs/dino/dino-4scale_r50_8xb2-36e_coco.py @@ -0,0 +1,13 @@ +_base_ = './dino-4scale_r50_8xb2-12e_coco.py' +max_epochs = 36 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[30], + gamma=0.1) +] diff --git a/mmdetection/configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py b/mmdetection/configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py new file mode 100644 index 0000000..6a4a82b --- /dev/null +++ b/mmdetection/configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py @@ -0,0 +1,18 @@ +_base_ = ['dino-4scale_r50_8xb2-12e_coco.py'] + +# from deformable detr hyper +model = dict( + backbone=dict(frozen_stages=-1), + bbox_head=dict(loss_cls=dict(loss_weight=2.0)), + positional_encoding=dict(offset=-0.5, temperature=10000), + dn_cfg=dict(group_cfg=dict(num_dn_queries=300))) + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.0002), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1) + })) diff --git a/mmdetection/configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py b/mmdetection/configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py new file mode 100644 index 0000000..3d39f22 --- /dev/null +++ b/mmdetection/configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py @@ -0,0 +1,30 @@ +_base_ = './dino-4scale_r50_8xb2-12e_coco.py' + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +num_levels = 5 +model = dict( + num_feature_levels=num_levels, + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=True, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels), + encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))), + decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels)))) diff --git a/mmdetection/configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py b/mmdetection/configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py new file mode 100644 index 0000000..d55a38e --- /dev/null +++ b/mmdetection/configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py @@ 
-0,0 +1,13 @@ +_base_ = './dino-5scale_swin-l_8xb2-12e_coco.py' +max_epochs = 36 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] diff --git a/mmdetection/configs/dino/metafile.yml b/mmdetection/configs/dino/metafile.yml new file mode 100644 index 0000000..f276a04 --- /dev/null +++ b/mmdetection/configs/dino/metafile.yml @@ -0,0 +1,85 @@ +Collections: + - Name: DINO + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 8x A100 GPUs + Architecture: + - ResNet + - Transformer + Paper: + URL: https://arxiv.org/abs/2203.03605 + Title: 'DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection' + README: configs/dino/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/f4112c9e5611468ffbd57cfba548fd1289264b52/mmdet/models/detectors/dino.py#L17 + Version: v3.0.0rc6 + +Models: + - Name: dino-4scale_r50_8xb2-12e_coco + In Collection: DINO + Config: configs/dino/dino-4scale_r50_8xb2-12e_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.0 + Weights: https://download.openmmlab.com/mmdetection/v3.0/dino/dino-4scale_r50_8xb2-12e_coco/dino-4scale_r50_8xb2-12e_coco_20221202_182705-55b2bba2.pth + + - Name: dino-4scale_r50_8xb2-24e_coco + In Collection: DINO + Config: configs/dino/dino-4scale_r50_8xb2-24e_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + + - Name: dino-4scale_r50_8xb2-36e_coco + In Collection: DINO + Config: configs/dino/dino-4scale_r50_8xb2-36e_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + + - Name: dino-5scale_swin-l_8xb2-12e_coco + In Collection: DINO + Config: configs/dino/dino-5scale_swin-l_8xb2-12e_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 57.2 + Weights: https://download.openmmlab.com/mmdetection/v3.0/dino/dino-5scale_swin-l_8xb2-12e_coco/dino-5scale_swin-l_8xb2-12e_coco_20230228_072924-a654145f.pth + + - Name: dino-5scale_swin-l_8xb2-36e_coco + In Collection: DINO + Config: configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 58.4 + Weights: https://github.com/RistoranteRist/mmlab-weights/releases/download/dino-swinl/dino-5scale_swin-l_8xb2-36e_coco-5486e051.pth + - Name: dino-4scale_r50_improved_8xb2-12e_coco + In Collection: DINO + Config: configs/dino/dino-4scale_r50_improved_8xb2-12e_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.1 + Weights: https://download.openmmlab.com/mmdetection/v3.0/dino/dino-4scale_r50_improved_8xb2-12e_coco/dino-4scale_r50_improved_8xb2-12e_coco_20230818_162607-6f47a913.pth diff --git a/mmdetection/configs/double_heads/README.md b/mmdetection/configs/double_heads/README.md new file mode 100644 index 0000000..1b97dbc --- /dev/null +++ b/mmdetection/configs/double_heads/README.md @@ -0,0 +1,32 @@ +# Double Heads + +> [Rethinking Classification and Localization for Object Detection](https://arxiv.org/abs/1904.06493) + + + +## Abstract + +Two head structures (i.e. fully connected head and convolution head) have been widely used in R-CNN based detectors for classification and localization tasks. 
However, there is a lack of understanding of how does these two head structures work for these two tasks. To address this issue, we perform a thorough analysis and find an interesting fact that the two head structures have opposite preferences towards the two tasks. Specifically, the fully connected head (fc-head) is more suitable for the classification task, while the convolution head (conv-head) is more suitable for the localization task. Furthermore, we examine the output feature maps of both heads and find that fc-head has more spatial sensitivity than conv-head. Thus, fc-head has more capability to distinguish a complete object from part of an object, but is not robust to regress the whole object. Based upon these findings, we propose a Double-Head method, which has a fully connected head focusing on classification and a convolution head for bounding box regression. Without bells and whistles, our method gains +3.5 and +2.8 AP on MS COCO dataset from Feature Pyramid Network (FPN) baselines with ResNet-50 and ResNet-101 backbones, respectively. + +
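The split described above is what `DoubleConvFCBBoxHead` in the config below wires up: a fully connected branch produces classification scores while a convolutional branch produces box deltas. A minimal sketch with assumed channel sizes follows; the class name `DoubleHead` and the plain conv stack are simplifications (the real head uses residual bottleneck blocks), so treat it as intuition rather than the mmdet module.

```python
# Minimal sketch of a double-head box head: fc branch for classification,
# conv branch for box regression (illustrative only).
import torch
import torch.nn as nn


class DoubleHead(nn.Module):
    def __init__(self, in_channels=256, roi_size=7, num_classes=80):
        super().__init__()
        # fc-head: better at telling complete objects apart -> classification.
        self.fc_branch = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_channels * roi_size * roi_size, 1024), nn.ReLU(inplace=True),
            nn.Linear(1024, 1024), nn.ReLU(inplace=True))
        self.cls_score = nn.Linear(1024, num_classes + 1)      # +1 background class
        # conv-head: keeps the spatial layout -> box regression.
        self.conv_branch = nn.Sequential(
            nn.Conv2d(in_channels, 1024, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(1024, 1024, 3, padding=1), nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d(1), nn.Flatten())
        self.bbox_pred = nn.Linear(1024, num_classes * 4)

    def forward(self, roi_feats):                              # (num_rois, C, 7, 7)
        cls = self.cls_score(self.fc_branch(roi_feats))
        bbox = self.bbox_pred(self.conv_branch(roi_feats))
        return cls, bbox


rois = torch.randn(8, 256, 7, 7)
cls, bbox = DoubleHead()(rois)
print(cls.shape, bbox.shape)   # torch.Size([8, 81]) torch.Size([8, 320])
```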
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 6.8 | 9.5 | 40.0 | [config](./dh-faster-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130_220238.log.json) | + +## Citation + +```latex +@article{wu2019rethinking, + title={Rethinking Classification and Localization for Object Detection}, + author={Yue Wu and Yinpeng Chen and Lu Yuan and Zicheng Liu and Lijuan Wang and Hongzhi Li and Yun Fu}, + year={2019}, + eprint={1904.06493}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/mmdetection/configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..6b9b6e6 --- /dev/null +++ b/mmdetection/configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,23 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + type='DoubleHeadRoIHead', + reg_roi_scale_factor=1.3, + bbox_head=dict( + _delete_=True, + type='DoubleConvFCBBoxHead', + num_convs=4, + num_fcs=2, + in_channels=256, + conv_out_channels=1024, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=2.0)))) diff --git a/mmdetection/configs/double_heads/metafile.yml b/mmdetection/configs/double_heads/metafile.yml new file mode 100644 index 0000000..bb14e79 --- /dev/null +++ b/mmdetection/configs/double_heads/metafile.yml @@ -0,0 +1,41 @@ +Collections: + - Name: Rethinking Classification and Localization for Object Detection + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/pdf/1904.06493 + Title: 'Rethinking Classification and Localization for Object Detection' + README: configs/double_heads/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/roi_heads/double_roi_head.py#L6 + Version: v2.0.0 + +Models: + - Name: dh-faster-rcnn_r50_fpn_1x_coco + In Collection: Rethinking Classification and Localization for Object Detection + Config: configs/double_heads/dh-faster-rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.8 + inference time (ms/im): + - value: 105.26 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/double_heads/dh_faster_rcnn_r50_fpn_1x_coco/dh_faster_rcnn_r50_fpn_1x_coco_20200130-586b67df.pth diff --git a/mmdetection/configs/dsdl/README.md b/mmdetection/configs/dsdl/README.md new file mode 100644 index 0000000..f38c3b6 --- /dev/null +++ b/mmdetection/configs/dsdl/README.md @@ -0,0 +1,63 @@ +# DSDL: Standard Description Language for DataSet + + + +## 1. Abstract + +Data is the cornerstone of artificial intelligence. The efficiency of data acquisition, exchange, and application directly impacts the advances in technologies and applications. Over the long history of AI, a vast quantity of data sets have been developed and distributed. However, these datasets are defined in very different forms, which incurs significant overhead when it comes to exchange, integration, and utilization -- it is often the case that one needs to develop a new customized tool or script in order to incorporate a new dataset into a workflow. + +To overcome such difficulties, we develop **Data Set Description Language (DSDL)**. More details please visit our [official documents](https://opendatalab.github.io/dsdl-docs/getting_started/overview/), dsdl datasets can be downloaded from our platform [OpenDataLab](https://opendatalab.com/). + +## 2. Steps + +- install dsdl: + + install by pip: + + ``` + pip install dsdl + ``` + + install by source code: + + ``` + git clone https://github.com/opendatalab/dsdl-sdk.git -b schema-dsdl + cd dsdl-sdk + python setup.py install + ``` + +- install mmdet and pytorch: + please refer this [installation documents](https://mmdetection.readthedocs.io/en/latest/get_started.html). + +- train: + + - using single gpu: + + ``` + python tools/train.py {config_file} + ``` + + - using slurm: + + ``` + ./tools/slurm_train.sh {partition} {job_name} {config_file} {work_dir} {gpu_nums} + ``` + +## 3. Test Results + +- detection task: + + | Datasets | Model | box AP | Config | + | :--------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----: | :-------------------------: | + | VOC07+12 | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/faster_rcnn_r50_fpn_1x_voc0712_20220320_192712-54bef0f3.pth) | 80.3\* | [config](./voc0712.py) | + | COCO | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) | 37.4 | [config](./coco.py) | + | Objects365 | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2/faster_rcnn_r50_fpn_16x4_1x_obj365v2_20221220_175040-5910b015.pth) | 19.8 | [config](./objects365v2.py) | + | OpenImages | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424-98c630e5.pth) | 59.9\* | [config](./openimagesv6.py) | + + \*: box AP in voc metric and openimages metric, actually means AP_50. 
+ +- instance segmentation task: + + | Datasets | Model | box AP | mask AP | Config | + | :------: | :------------------------------------------------------------------------------------------------------------------------------------------: | :----: | :-----: | :--------------------------: | + | COCO | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) | 38.1 | 34.7 | [config](./coco_instance.py) | diff --git a/mmdetection/configs/dsdl/coco.py b/mmdetection/configs/dsdl/coco.py new file mode 100644 index 0000000..3c9e895 --- /dev/null +++ b/mmdetection/configs/dsdl/coco.py @@ -0,0 +1,33 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', + '../_base_/datasets/dsdl.py' +] + +# dsdl dataset settings + +# please visit our platform [OpenDataLab](https://opendatalab.com/) +# to downloaded dsdl dataset. +data_root = 'data/COCO2017' +img_prefix = 'original' +train_ann = 'dsdl/set-train/train.yaml' +val_ann = 'dsdl/set-val/val.yaml' +specific_key_path = dict(ignore_flag='./annotations/*/iscrowd') + +train_dataloader = dict( + dataset=dict( + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=train_ann, + data_prefix=dict(img_path=img_prefix), + filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32), + )) + +val_dataloader = dict( + dataset=dict( + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=val_ann, + data_prefix=dict(img_path=img_prefix), + )) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/dsdl/coco_instance.py b/mmdetection/configs/dsdl/coco_instance.py new file mode 100644 index 0000000..e34f93c --- /dev/null +++ b/mmdetection/configs/dsdl/coco_instance.py @@ -0,0 +1,62 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', + '../_base_/datasets/dsdl.py' +] + +# dsdl dataset settings. + +# please visit our platform [OpenDataLab](https://opendatalab.com/) +# to downloaded dsdl dataset. 
+data_root = 'data/COCO2017' +img_prefix = 'original' +train_ann = 'dsdl/set-train/train.yaml' +val_ann = 'dsdl/set-val/val.yaml' +specific_key_path = dict(ignore_flag='./annotations/*/iscrowd') + +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'instances')) +] + +train_dataloader = dict( + dataset=dict( + with_polygon=True, + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=train_ann, + data_prefix=dict(img_path=img_prefix), + filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline, + )) + +val_dataloader = dict( + dataset=dict( + with_polygon=True, + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=val_ann, + data_prefix=dict(img_path=img_prefix), + pipeline=test_pipeline, + )) + +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', metric=['bbox', 'segm'], format_only=False) + +test_evaluator = val_evaluator diff --git a/mmdetection/configs/dsdl/objects365v2.py b/mmdetection/configs/dsdl/objects365v2.py new file mode 100644 index 0000000..d25a232 --- /dev/null +++ b/mmdetection/configs/dsdl/objects365v2.py @@ -0,0 +1,54 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', + '../_base_/datasets/dsdl.py' +] + +model = dict(roi_head=dict(bbox_head=dict(num_classes=365))) + +# dsdl dataset settings + +# please visit our platform [OpenDataLab](https://opendatalab.com/) +# to downloaded dsdl dataset. 
+data_root = 'data/Objects365' +img_prefix = 'original' +train_ann = 'dsdl/set-train/train.yaml' +val_ann = 'dsdl/set-val/val.yaml' +specific_key_path = dict(ignore_flag='./annotations/*/iscrowd') + +train_dataloader = dict( + dataset=dict( + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=train_ann, + data_prefix=dict(img_path=img_prefix), + filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32), + )) + +val_dataloader = dict( + dataset=dict( + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=val_ann, + data_prefix=dict(img_path=img_prefix), + test_mode=True, + )) +test_dataloader = val_dataloader + +default_hooks = dict(logger=dict(type='LoggerHook', interval=1000), ) +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=3, val_interval=1) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[1, 2], + gamma=0.1) +] +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/dsdl/openimagesv6.py b/mmdetection/configs/dsdl/openimagesv6.py new file mode 100644 index 0000000..a65f942 --- /dev/null +++ b/mmdetection/configs/dsdl/openimagesv6.py @@ -0,0 +1,94 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/schedules/schedule_1x.py', + '../_base_/default_runtime.py', +] + +model = dict(roi_head=dict(bbox_head=dict(num_classes=601))) + +# dsdl dataset settings + +# please visit our platform [OpenDataLab](https://opendatalab.com/) +# to downloaded dsdl dataset. +dataset_type = 'DSDLDetDataset' +data_root = 'data/OpenImages' +train_ann = 'dsdl/set-train/train.yaml' +val_ann = 'dsdl/set-val/val.yaml' +specific_key_path = dict( + image_level_labels='./image_labels/*/label', + Label='./objects/*/label', + is_group_of='./objects/*/isgroupof', +) + +backend_args = dict( + backend='petrel', + path_mapping=dict({'data/': 's3://open_dataset_original/'})) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1024, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1024, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'instances', 'image_level_labels')) +] + +train_dataloader = dict( + sampler=dict(type='ClassAwareSampler', num_sample_class=1), + dataset=dict( + type=dataset_type, + with_imagelevel_label=True, + with_hierarchy=True, + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=train_ann, + filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline)) + +val_dataloader = dict( + dataset=dict( + type=dataset_type, + with_imagelevel_label=True, + with_hierarchy=True, + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=val_ann, + test_mode=True, + pipeline=test_pipeline)) + +test_dataloader = val_dataloader + +default_hooks = dict(logger=dict(type='LoggerHook', interval=1000), ) +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=3, val_interval=1) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, 
by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[1, 2], + gamma=0.1) +] +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +val_evaluator = dict( + type='OpenImagesMetric', + iou_thrs=0.5, + ioa_thrs=0.5, + use_group_of=True, + get_supercategory=True) + +test_evaluator = val_evaluator diff --git a/mmdetection/configs/dsdl/voc07.py b/mmdetection/configs/dsdl/voc07.py new file mode 100644 index 0000000..b7b8647 --- /dev/null +++ b/mmdetection/configs/dsdl/voc07.py @@ -0,0 +1,94 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/default_runtime.py' +] + +# model setting +model = dict(roi_head=dict(bbox_head=dict(num_classes=20))) + +# dsdl dataset settings + +# please visit our platform [OpenDataLab](https://opendatalab.com/) +# to downloaded dsdl dataset. +dataset_type = 'DSDLDetDataset' +data_root = 'data/VOC07-det' +img_prefix = 'original' +train_ann = 'dsdl/set-train/train.yaml' +val_ann = 'dsdl/set-test/test.yaml' + +specific_key_path = dict(ignore_flag='./objects/*/difficult') + +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1000, 600), keep_ratio=True), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'instances')) +] +train_dataloader = dict( + dataset=dict( + type=dataset_type, + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=train_ann, + data_prefix=dict(img_path=img_prefix), + filter_cfg=dict(filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline)) + +val_dataloader = dict( + dataset=dict( + type=dataset_type, + specific_key_path=specific_key_path, + data_root=data_root, + ann_file=val_ann, + data_prefix=dict(img_path=img_prefix), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# Pascal VOC2007 uses `11points` as default evaluate mode, while PASCAL +# VOC2012 defaults to use 'area'. +val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points') +# val_evaluator = dict(type='CocoMetric', metric='bbox') +test_evaluator = val_evaluator + +# training schedule, voc dataset is repeated 3 times, in +# `_base_/datasets/voc0712.py`, so the actual epoch = 4 * 3 = 12 +max_epochs = 12 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=3) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[9], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/dsdl/voc0712.py b/mmdetection/configs/dsdl/voc0712.py new file mode 100644 index 0000000..9ec1bb8 --- /dev/null +++ b/mmdetection/configs/dsdl/voc0712.py @@ -0,0 +1,132 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/schedules/schedule_1x.py', + '../_base_/default_runtime.py', + # '../_base_/datasets/dsdl.py' +] + +# model setting +model = dict(roi_head=dict(bbox_head=dict(num_classes=20))) + +# dsdl dataset settings + +# please visit our platform [OpenDataLab](https://opendatalab.com/) +# to downloaded dsdl dataset. +dataset_type = 'DSDLDetDataset' +data_root_07 = 'data/VOC07-det' +data_root_12 = 'data/VOC12-det' +img_prefix = 'original' + +train_ann = 'dsdl/set-train/train.yaml' +val_ann = 'dsdl/set-val/val.yaml' +test_ann = 'dsdl/set-test/test.yaml' + +backend_args = None +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(1000, 600), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'instances')) +] + +specific_key_path = dict(ignore_flag='./objects/*/difficult', ) + +train_dataloader = dict( + dataset=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + specific_key_path=specific_key_path, + data_root=data_root_07, + ann_file=train_ann, + data_prefix=dict(img_path=img_prefix), + filter_cfg=dict( + filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline), + dict( + type=dataset_type, + specific_key_path=specific_key_path, + data_root=data_root_07, + ann_file=val_ann, + data_prefix=dict(img_path=img_prefix), + filter_cfg=dict( + filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline), + dict( + type=dataset_type, + specific_key_path=specific_key_path, + data_root=data_root_12, + ann_file=train_ann, + data_prefix=dict(img_path=img_prefix), + filter_cfg=dict( + filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline), + dict( + type=dataset_type, + specific_key_path=specific_key_path, + data_root=data_root_12, + ann_file=val_ann, + data_prefix=dict(img_path=img_prefix), + filter_cfg=dict( + filter_empty_gt=True, min_size=32, bbox_min_size=32), + pipeline=train_pipeline), + ]))) + +val_dataloader = dict( + dataset=dict( + type=dataset_type, + specific_key_path=specific_key_path, + data_root=data_root_07, + ann_file=test_ann, + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='CocoMetric', metric='bbox') +# val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points') +test_evaluator = val_evaluator + +# training schedule, voc dataset is repeated 3 times, in +# `_base_/datasets/voc0712.py`, so the actual epoch = 4 * 3 = 12 +max_epochs = 4 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + 
end=max_epochs, + by_epoch=True, + milestones=[3], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/dyhead/README.md b/mmdetection/configs/dyhead/README.md new file mode 100644 index 0000000..decd480 --- /dev/null +++ b/mmdetection/configs/dyhead/README.md @@ -0,0 +1,52 @@ +# DyHead + +> [Dynamic Head: Unifying Object Detection Heads with Attentions](https://arxiv.org/abs/2106.08322) + + + +## Abstract + +The complex nature of combining localization and classification in object detection has resulted in the flourished development of methods. Previous works tried to improve the performance in various object detection heads but failed to present a unified view. In this paper, we present a novel dynamic head framework to unify object detection heads with attentions. By coherently combining multiple self-attention mechanisms between feature levels for scale-awareness, among spatial locations for spatial-awareness, and within output channels for task-awareness, the proposed approach significantly improves the representation ability of object detection heads without any computational overhead. Further experiments demonstrate that the effectiveness and efficiency of the proposed dynamic head on the COCO benchmark. With a standard ResNeXt-101-DCN backbone, we largely improve the performance over popular object detectors and achieve a new state-of-the-art at 54.0 AP. Furthermore, with latest transformer backbone and extra data, we can push current best COCO result to a new record at 60.6 AP. + +
+## Results and Models
+
+| Method | Backbone | Style | Setting | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download |
+| :----: | :------: | :-----: | :----------: | :-----: | :------: | :------------: | :----: | :----------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| ATSS | R-50 | caffe | reproduction | 1x | 5.4 | 13.2 | 42.5 | [config](./atss_r50-caffe_fpn_dyhead_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939.log.json) |
+| ATSS | R-50 | pytorch | simple | 1x | 4.9 | 13.7 | 43.3 | [config](./atss_r50_fpn_dyhead_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314.log.json) |
+
+- We trained the above models with 4 GPUs and 4 `samples_per_gpu`.
+- The `reproduction` setting aims to reproduce the official implementation based on Detectron2.
+- The `simple` setting serves as a minimal example of using DyHead in MMDetection. Specifically,
+  - it adds `DyHead` to `neck` after `FPN`
+  - it sets `stacked_convs=0` in `bbox_head`
+- The `simple` setting achieves higher AP than the original implementation.
+  We have not conducted an ablation study between the two settings.
+  `dict(type='Pad', size_divisor=128)` may further improve AP by favoring spatial alignment across pyramid levels, although the larger padding reduces efficiency.
+
+We also trained the model with a Swin-L backbone. Results are shown below.
+ +| Method | Backbone | Style | Setting | Lr schd | mstrain | box AP | Config | Download | +| :----: | :------: | :---: | :----------: | :-----: | :------: | :----: | :-----------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| ATSS | Swin-L | caffe | reproduction | 2x | 480~1200 | 56.2 | [config](./atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco_20220509_100315-bc5b6516.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco_20220509_100315.log.json) | + +## Relation to Other Methods + +- DyHead can be regarded as an improved [SEPC](https://arxiv.org/abs/2005.03101) with [DyReLU modules](https://arxiv.org/abs/2003.10027) and simplified [SE blocks](https://arxiv.org/abs/1709.01507). +- Xiyang Dai et al., the author team of DyHead, adopt it for [Dynamic DETR](https://openaccess.thecvf.com/content/ICCV2021/html/Dai_Dynamic_DETR_End-to-End_Object_Detection_With_Dynamic_Attention_ICCV_2021_paper.html). + The description of Dynamic Encoder in Sec. 3.2 will help you understand DyHead. + +## Citation + +```latex +@inproceedings{DyHead_CVPR2021, + author = {Dai, Xiyang and Chen, Yinpeng and Xiao, Bin and Chen, Dongdong and Liu, Mengchen and Yuan, Lu and Zhang, Lei}, + title = {Dynamic Head: Unifying Object Detection Heads With Attentions}, + booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2021} +} +``` diff --git a/mmdetection/configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py b/mmdetection/configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py new file mode 100644 index 0000000..8716f12 --- /dev/null +++ b/mmdetection/configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py @@ -0,0 +1,103 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='ATSS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=128), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + dict( + type='DyHead', + in_channels=256, + out_channels=256, + num_blocks=6, + # disable zero_init_offset to follow official implementation + zero_init_offset=False) + ], + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + pred_kernel_size=1, # follow DyHead official implementation + stacked_convs=0, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128], + 
center_offset=0.5), # follow DyHead official implementation + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# optimizer +optim_wrapper = dict(optimizer=dict(lr=0.01)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend='pillow'), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py b/mmdetection/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py new file mode 100644 index 0000000..89e89b9 --- /dev/null +++ b/mmdetection/configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py @@ -0,0 +1,72 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='ATSS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + dict(type='DyHead', in_channels=256, out_channels=256, num_blocks=6) + ], + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + stacked_convs=0, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# 
optimizer +optim_wrapper = dict(optimizer=dict(lr=0.01)) diff --git a/mmdetection/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py b/mmdetection/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py new file mode 100644 index 0000000..f537b9d --- /dev/null +++ b/mmdetection/configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py @@ -0,0 +1,140 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +model = dict( + type='ATSS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=128), + backbone=dict( + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=[ + dict( + type='FPN', + in_channels=[384, 768, 1536], + out_channels=256, + start_level=0, + add_extra_convs='on_output', + num_outs=5), + dict( + type='DyHead', + in_channels=256, + out_channels=256, + num_blocks=6, + # disable zero_init_offset to follow official implementation + zero_init_offset=False) + ], + bbox_head=dict( + type='ATSSHead', + num_classes=80, + in_channels=256, + pred_kernel_size=1, # follow DyHead official implementation + stacked_convs=0, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128], + center_offset=0.5), # follow DyHead official implementation + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=[(2000, 480), (2000, 1200)], + keep_ratio=True, + backend='pillow'), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(2000, 1200), keep_ratio=True, backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=2, + dataset=dict( + type={{_base_.dataset_type}}, + data_root={{_base_.data_root}}, + 
ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args={{_base_.backend_args}}))) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00005, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + }), + clip_grad=None) diff --git a/mmdetection/configs/dyhead/metafile.yml b/mmdetection/configs/dyhead/metafile.yml new file mode 100644 index 0000000..28b5a58 --- /dev/null +++ b/mmdetection/configs/dyhead/metafile.yml @@ -0,0 +1,76 @@ +Collections: + - Name: DyHead + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 4x T4 GPUs + Architecture: + - ATSS + - DyHead + - FPN + - ResNet + - Deformable Convolution + - Pyramid Convolution + Paper: + URL: https://arxiv.org/abs/2106.08322 + Title: 'Dynamic Head: Unifying Object Detection Heads with Attentions' + README: configs/dyhead/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/necks/dyhead.py#L130 + Version: v2.22.0 + +Models: + - Name: atss_r50-caffe_fpn_dyhead_1x_coco + In Collection: DyHead + Config: configs/dyhead/atss_r50-caffe_fpn_dyhead_1x_coco.py + Metadata: + Training Memory (GB): 5.4 + inference time (ms/im): + - value: 75.7 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_for_reproduction_1x_coco/atss_r50_fpn_dyhead_for_reproduction_4x4_1x_coco_20220107_213939-162888e6.pth + + - Name: atss_r50_fpn_dyhead_1x_coco + In Collection: DyHead + Config: configs/dyhead/atss_r50_fpn_dyhead_1x_coco.py + Metadata: + Training Memory (GB): 4.9 + inference time (ms/im): + - value: 73.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_r50_fpn_dyhead_4x4_1x_coco/atss_r50_fpn_dyhead_4x4_1x_coco_20211219_023314-eaa620c6.pth + + - Name: atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco + In Collection: DyHead + Config: configs/dyhead/atss_swin-l-p4-w12_fpn_dyhead_ms-2x_coco.py + Metadata: + Training Memory (GB): 58.4 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 56.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dyhead/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco/atss_swin-l-p4-w12_fpn_dyhead_mstrain_2x_coco_20220509_100315-bc5b6516.pth diff --git a/mmdetection/configs/dynamic_rcnn/README.md b/mmdetection/configs/dynamic_rcnn/README.md new file mode 100644 index 0000000..b5e803a --- /dev/null +++ b/mmdetection/configs/dynamic_rcnn/README.md @@ -0,0 +1,30 @@ +# Dynamic R-CNN + +> [Dynamic R-CNN: Towards High Quality Object Detection via Dynamic Training](https://arxiv.org/abs/2004.06002) + + + +## Abstract + +Although two-stage object detectors have continuously advanced the state-of-the-art performance in recent years, the training process itself is 
far from crystal. In this work, we first point out the inconsistency problem between the fixed network settings and the dynamic training procedure, which greatly affects the performance. For example, the fixed label assignment strategy and regression loss function cannot fit the distribution change of proposals and thus are harmful to training high quality detectors. Consequently, we propose Dynamic R-CNN to adjust the label assignment criteria (IoU threshold) and the shape of regression loss function (parameters of SmoothL1 Loss) automatically based on the statistics of proposals during training. This dynamic design makes better use of the training samples and pushes the detector to fit more high quality samples. Specifically, our method improves upon ResNet-50-FPN baseline with 1.9% AP and 5.5% AP90 on the MS COCO dataset with no extra overhead. + +
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | 1x | 3.8 | | 38.9 | [config](./dynamic-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x-62a3f276.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x_20200618_095048.log.json) | + +## Citation + +```latex +@article{DynamicRCNN, + author = {Hongkai Zhang and Hong Chang and Bingpeng Ma and Naiyan Wang and Xilin Chen}, + title = {Dynamic {R-CNN}: Towards High Quality Object Detection via Dynamic Training}, + journal = {arXiv preprint arXiv:2004.06002}, + year = {2020} +} +``` diff --git a/mmdetection/configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..f64dfa0 --- /dev/null +++ b/mmdetection/configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,28 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + type='DynamicRoIHead', + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict(nms=dict(iou_threshold=0.85)), + rcnn=dict( + dynamic_rcnn=dict( + iou_topk=75, + beta_topk=10, + update_iter_interval=100, + initial_iou=0.4, + initial_beta=1.0))), + test_cfg=dict(rpn=dict(nms=dict(iou_threshold=0.85)))) diff --git a/mmdetection/configs/dynamic_rcnn/metafile.yml b/mmdetection/configs/dynamic_rcnn/metafile.yml new file mode 100644 index 0000000..64ab3b0 --- /dev/null +++ b/mmdetection/configs/dynamic_rcnn/metafile.yml @@ -0,0 +1,35 @@ +Collections: + - Name: Dynamic R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Dynamic R-CNN + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/pdf/2004.06002 + Title: 'Dynamic R-CNN: Towards High Quality Object Detection via Dynamic Training' + README: configs/dynamic_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/roi_heads/dynamic_roi_head.py#L11 + Version: v2.2.0 + +Models: + - Name: dynamic-rcnn_r50_fpn_1x_coco + In Collection: Dynamic R-CNN + Config: configs/dynamic_rcnn/dynamic-rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/dynamic_rcnn/dynamic_rcnn_r50_fpn_1x/dynamic_rcnn_r50_fpn_1x-62a3f276.pth diff --git a/mmdetection/configs/efficientnet/README.md 
b/mmdetection/configs/efficientnet/README.md new file mode 100644 index 0000000..941944d --- /dev/null +++ b/mmdetection/configs/efficientnet/README.md @@ -0,0 +1,30 @@ +# EfficientNet + +> [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946v5) + + + +## Introduction + +Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are available. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on scaling up MobileNets and ResNet. + +To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves state-of-the-art 84.3% top-1 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet. Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flowers (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters. + +## Results and Models + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Efficientnet-b3 | pytorch | 1x | - | - | 40.5 | [config](./retinanet_effb3_fpn_8xb4-crop896-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806.log.json) | + +## Citation + +```latex +@article{tan2019efficientnet, + title={Efficientnet: Rethinking model scaling for convolutional neural networks}, + author={Tan, Mingxing and Le, Quoc V}, + journal={arXiv preprint arXiv:1905.11946}, + year={2019} +} +``` diff --git a/mmdetection/configs/efficientnet/metafile.yml b/mmdetection/configs/efficientnet/metafile.yml new file mode 100644 index 0000000..6e220c8 --- /dev/null +++ b/mmdetection/configs/efficientnet/metafile.yml @@ -0,0 +1,19 @@ +Models: + - Name: retinanet_effb3_fpn_8xb4-crop896-1x_coco + In Collection: RetinaNet + Config: configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/efficientnet/retinanet_effb3_fpn_crop896_8x4_1x_coco/retinanet_effb3_fpn_crop896_8x4_1x_coco_20220322_234806-615a0dda.pth + 
Paper: + URL: https://arxiv.org/abs/1905.11946v5 + Title: 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks' + README: configs/efficientnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/backbones/efficientnet.py#L159 + Version: v2.23.0 diff --git a/mmdetection/configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py b/mmdetection/configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py new file mode 100644 index 0000000..2d0d9ce --- /dev/null +++ b/mmdetection/configs/efficientnet/retinanet_effb3_fpn_8xb4-crop896-1x_coco.py @@ -0,0 +1,94 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/schedules/schedule_1x.py', + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] + +image_size = (896, 896) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] +norm_cfg = dict(type='BN', requires_grad=True) +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth' # noqa +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + _delete_=True, + type='EfficientNet', + arch='b3', + drop_path_rate=0.2, + out_indices=(3, 4, 5), + frozen_stages=0, + norm_cfg=dict( + type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01), + norm_eval=False, + init_cfg=dict( + type='Pretrained', prefix='backbone', checkpoint=checkpoint)), + neck=dict( + in_channels=[48, 136, 384], + start_level=0, + out_channels=256, + relu_before_extra_convs=True, + no_norm_on_lateral=True, + norm_cfg=norm_cfg), + bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg), + # training and testing settings + train_cfg=dict(assigner=dict(neg_iou_thr=0.5))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomCrop', crop_size=image_size), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=image_size, keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=4, num_workers=4, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.04), + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True)) + +# learning policy +max_epochs = 12 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) + +# cudnn_benchmark=True can accelerate fix-size training +env_cfg = dict(cudnn_benchmark=True) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (4 samples per GPU) +auto_scale_lr = dict(base_batch_size=32) diff --git a/mmdetection/configs/empirical_attention/README.md b/mmdetection/configs/empirical_attention/README.md new file mode 100644 index 0000000..c0b4a68 --- /dev/null +++ b/mmdetection/configs/empirical_attention/README.md @@ -0,0 +1,33 @@ +# Empirical Attention + +> [An Empirical Study of Spatial Attention Mechanisms in Deep Networks](https://arxiv.org/abs/1904.05873) + + + +## Abstract + +Attention mechanisms have become a popular component in deep neural networks, yet there has been little examination of how different influencing factors and methods for computing attention from these factors affect performance. Toward a better general understanding of attention mechanisms, we present an empirical study that ablates various spatial attention elements within a generalized attention formulation, encompassing the dominant Transformer attention as well as the prevalent deformable convolution and dynamic convolution modules. Conducted on a variety of applications, the study yields significant findings about spatial attention in deep networks, some of which run counter to conventional understanding. For example, we find that the query and key content comparison in Transformer attention is negligible for self-attention, but vital for encoder-decoder attention. A proper combination of deformable convolution with key content only saliency achieves the best accuracy-efficiency tradeoff in self-attention. Our results suggest that there exists much room for improvement in the design of attention mechanisms. + +
    + +## Results and Models + +| Backbone | Attention Component | DCN | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :-----------------: | :-: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | 1111 | N | 1x | 8.0 | 13.8 | 40.0 | [config](./faster-rcnn_r50-attn1111_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130_210344.log.json) | +| R-50 | 0010 | N | 1x | 4.2 | 18.4 | 39.1 | [config](./faster-rcnn_r50-attn0010_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130-7cb0c14d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130_210125.log.json) | +| R-50 | 1111 | Y | 1x | 8.0 | 12.7 | 42.1 | [config](./faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130-8b2523a6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130_204442.log.json) | +| R-50 | 0010 | Y | 1x | 4.2 | 17.1 | 42.0 | [config](./faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130-1a2e831d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130_210410.log.json) | + +## Citation + +```latex +@article{zhu2019empirical, + title={An Empirical Study of Spatial Attention Mechanisms in Deep Networks}, + author={Zhu, Xizhou and Cheng, Dazhi and Zhang, Zheng and Lin, Stephen and Dai, Jifeng}, + journal={arXiv preprint arXiv:1904.05873}, + year={2019} +} +``` diff --git a/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py b/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py new file mode 100644 index 0000000..e1ae17a --- /dev/null +++ b/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='0010', + kv_stride=2), + stages=(False, False, True, True), + position='after_conv2') 
+ ], + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py b/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py new file mode 100644 index 0000000..7336d29 --- /dev/null +++ b/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='0010', + kv_stride=2), + stages=(False, False, True, True), + position='after_conv2') + ])) diff --git a/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py b/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py new file mode 100644 index 0000000..980e23d --- /dev/null +++ b/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='1111', + kv_stride=2), + stages=(False, False, True, True), + position='after_conv2') + ], + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) diff --git a/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py b/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py new file mode 100644 index 0000000..426bc09 --- /dev/null +++ b/mmdetection/configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='1111', + kv_stride=2), + stages=(False, False, True, True), + position='after_conv2') + ])) diff --git a/mmdetection/configs/empirical_attention/metafile.yml b/mmdetection/configs/empirical_attention/metafile.yml new file mode 100644 index 0000000..b488da7 --- /dev/null +++ b/mmdetection/configs/empirical_attention/metafile.yml @@ -0,0 +1,103 @@ +Collections: + - Name: Empirical Attention + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Deformable Convolution + - FPN + - RPN + - ResNet + - RoIAlign + - Spatial Attention + Paper: + URL: https://arxiv.org/pdf/1904.05873 + Title: 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' + README: configs/empirical_attention/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/generalized_attention.py#L10 + Version: v2.0.0 + +Models: + - Name: faster-rcnn_r50_fpn_attention_1111_1x_coco + In Collection: Empirical Attention + Config: configs/empirical_attention/faster-rcnn_r50-attn1111_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.0 + inference time (ms/im): + - value: 72.46 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_1x_coco/faster_rcnn_r50_fpn_attention_1111_1x_coco_20200130-403cccba.pth + + - Name: faster-rcnn_r50_fpn_attention_0010_1x_coco + In Collection: Empirical Attention + Config: configs/empirical_attention/faster-rcnn_r50-attn0010_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + inference time (ms/im): + - value: 54.35 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_1x_coco/faster_rcnn_r50_fpn_attention_0010_1x_coco_20200130-7cb0c14d.pth + + - Name: faster-rcnn_r50_fpn_attention_1111_dcn_1x_coco + In Collection: Empirical Attention + Config: configs/empirical_attention/faster-rcnn_r50-attn1111-dcn_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.0 + inference time (ms/im): + - value: 78.74 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco_20200130-8b2523a6.pth + + - Name: faster-rcnn_r50_fpn_attention_0010_dcn_1x_coco + In Collection: Empirical Attention + Config: configs/empirical_attention/faster-rcnn_r50-attn0010-dcn_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + inference time (ms/im): + - value: 58.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco_20200130-1a2e831d.pth diff --git a/mmdetection/configs/fast_rcnn/README.md b/mmdetection/configs/fast_rcnn/README.md new file mode 100644 index 0000000..0bdc935 --- /dev/null +++ b/mmdetection/configs/fast_rcnn/README.md @@ -0,0 +1,121 @@ +# Fast R-CNN + +> [Fast R-CNN](https://arxiv.org/abs/1504.08083) + + + +## Abstract + +This paper proposes a Fast Region-based Convolutional Network method (Fast R-CNN) for object detection. Fast R-CNN builds on previous work to efficiently classify object proposals using deep convolutional networks. Compared to previous work, Fast R-CNN employs several innovations to improve training and testing speed while also increasing detection accuracy. Fast R-CNN trains the very deep VGG16 network 9x faster than R-CNN, is 213x faster at test-time, and achieves a higher mAP on PASCAL VOC 2012. Compared to SPPnet, Fast R-CNN trains VGG16 3x faster, tests 10x faster, and is more accurate. + +
+## Introduction
+
+Before training the Fast R-CNN, users should first train an [RPN](../rpn/README.md) and use it to extract the region proposals.
+The region proposals can be obtained by setting the `DumpProposals` pseudo metric. The dumped result is a `dict(file_name: pred_instance)`.
+The `pred_instance` is an `InstanceData` containing the sorted boxes and scores predicted by the RPN. We provide an example of dumping proposals in the [RPN config](../rpn/rpn_r50_fpn_1x_coco.py).
+
+- First, obtain the region proposals for both the training and the validation (or testing) set:
+  change the type of `test_evaluator` to `DumpProposals` in the RPN config to get the region proposals, as below.
+
+  The config for dumping region proposals of the training images can be set as below:
+
+  ```python
+  # For training set
+  val_dataloader = dict(
+      dataset=dict(
+          ann_file='data/coco/annotations/instances_train2017.json',
+          data_prefix=dict(img='train2017/')))
+  val_evaluator = dict(
+      _delete_=True,
+      type='DumpProposals',
+      output_dir='data/coco/proposals/',
+      proposals_file='rpn_r50_fpn_1x_train2017.pkl')
+  test_dataloader = val_dataloader
+  test_evaluator = val_evaluator
+  ```
+
+  The config for dumping region proposals of the validation images can be set as below:
+
+  ```python
+  # For validation set
+  val_evaluator = dict(
+      _delete_=True,
+      type='DumpProposals',
+      output_dir='data/coco/proposals/',
+      proposals_file='rpn_r50_fpn_1x_val2017.pkl')
+  test_evaluator = val_evaluator
+  ```
+
+  The command to extract the region proposals can be run as below:
+
+  ```bash
+  ./tools/dist_test.sh \
+      configs/rpn/rpn_r50_fpn_1x_coco.py \
+      checkpoints/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth \
+      8
+  ```
+
+  Users can refer to the [test tutorial](https://mmdetection.readthedocs.io/en/latest/user_guides/test.html) for more details.
+
+- Then, modify the path of `proposal_file` in the dataset and use `ProposalBroadcaster` to process both the ground-truth bounding boxes and the region proposals in the pipelines.
+  An example of the important Fast R-CNN settings is shown below:
+
+  ```python
+  train_pipeline = [
+      dict(
+          type='LoadImageFromFile',
+          backend_args={{_base_.backend_args}}),
+      dict(type='LoadProposals', num_max_proposals=2000),
+      dict(type='LoadAnnotations', with_bbox=True),
+      dict(
+          type='ProposalBroadcaster',
+          transforms=[
+              dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+              dict(type='RandomFlip', prob=0.5),
+          ]),
+      dict(type='PackDetInputs')
+  ]
+  test_pipeline = [
+      dict(
+          type='LoadImageFromFile',
+          backend_args={{_base_.backend_args}}),
+      dict(type='LoadProposals', num_max_proposals=None),
+      dict(
+          type='ProposalBroadcaster',
+          transforms=[
+              dict(type='Resize', scale=(1333, 800), keep_ratio=True),
+          ]),
+      dict(
+          type='PackDetInputs',
+          meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                     'scale_factor'))
+  ]
+  train_dataloader = dict(
+      dataset=dict(
+          proposal_file='proposals/rpn_r50_fpn_1x_train2017.pkl',
+          pipeline=train_pipeline))
+  val_dataloader = dict(
+      dataset=dict(
+          proposal_file='proposals/rpn_r50_fpn_1x_val2017.pkl',
+          pipeline=test_pipeline))
+  test_dataloader = val_dataloader
+  ```
+
+- Finally, users can start training the Fast R-CNN; a sample command is sketched below.
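+  A minimal sketch of the training launch, assuming 8 GPUs and the standard `tools/dist_train.sh` launcher (mirroring the `dist_test.sh` usage above); adjust the config path and GPU count to your setup:
+
+  ```bash
+  # Launch distributed training of Fast R-CNN using the proposal files prepared above.
+  ./tools/dist_train.sh \
+      configs/fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py \
+      8
+  ```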
+ +## Results and Models + +## Citation + +```latex +@inproceedings{girshick2015fast, + title={Fast r-cnn}, + author={Girshick, Ross}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + year={2015} +} +``` diff --git a/mmdetection/configs/fast_rcnn/fast-rcnn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/fast_rcnn/fast-rcnn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..02c7029 --- /dev/null +++ b/mmdetection/configs/fast_rcnn/fast-rcnn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './fast-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/fast_rcnn/fast-rcnn_r101_fpn_1x_coco.py b/mmdetection/configs/fast_rcnn/fast-rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..5af6b22 --- /dev/null +++ b/mmdetection/configs/fast_rcnn/fast-rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fast-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/fast_rcnn/fast-rcnn_r101_fpn_2x_coco.py b/mmdetection/configs/fast_rcnn/fast-rcnn_r101_fpn_2x_coco.py new file mode 100644 index 0000000..73425cf --- /dev/null +++ b/mmdetection/configs/fast_rcnn/fast-rcnn_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fast-rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..3110f9f --- /dev/null +++ b/mmdetection/configs/fast_rcnn/fast-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = './fast-rcnn_r50_fpn_1x_coco.py' + +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + norm_cfg=dict(type='BN', requires_grad=False), + style='caffe', + norm_eval=True, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/configs/fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..daefe2d --- /dev/null +++ b/mmdetection/configs/fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/fast-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadProposals', num_max_proposals=2000), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='ProposalBroadcaster', + transforms=[ + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + ]), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadProposals', num_max_proposals=None), + dict( + type='ProposalBroadcaster', + transforms=[ + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + dataset=dict( 
+ proposal_file='proposals/rpn_r50_fpn_1x_train2017.pkl', + pipeline=train_pipeline)) +val_dataloader = dict( + dataset=dict( + proposal_file='proposals/rpn_r50_fpn_1x_val2017.pkl', + pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/fast_rcnn/fast-rcnn_r50_fpn_2x_coco.py b/mmdetection/configs/fast_rcnn/fast-rcnn_r50_fpn_2x_coco.py new file mode 100644 index 0000000..d609a7c --- /dev/null +++ b/mmdetection/configs/fast_rcnn/fast-rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './fast-rcnn_r50_fpn_1x_coco.py' + +train_cfg = dict(max_epochs=24) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/faster_rcnn/README.md b/mmdetection/configs/faster_rcnn/README.md new file mode 100644 index 0000000..0d9912d --- /dev/null +++ b/mmdetection/configs/faster_rcnn/README.md @@ -0,0 +1,88 @@ +# Faster R-CNN + +> [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497) + + + +## Abstract + +State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. + +
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C4 | caffe | 1x | - | - | 35.6 | [config](./faster-rcnn_r50-caffe_c4-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152-3f885b85.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152.log.json) | +| R-50-DC5 | caffe | 1x | - | - | 37.2 | [config](./faster-rcnn_r50-caffe-dc5_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909-531f0f43.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909.log.json) | +| R-50-FPN | caffe | 1x | 3.8 | | 37.8 | [config](./faster-rcnn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_20200504_180032.log.json) | +| R-50-FPN | pytorch | 1x | 4.0 | 21.4 | 37.4 | [config](./faster-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| R-50-FPN (FP16) | pytorch | 1x | 3.4 | 28.8 | 37.5 | [config](./faster-rcnn_r50_fpn_amp-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204-d4dc1471.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204_143530.log.json) | +| R-50-FPN | pytorch | 2x | - | - | 38.4 | [config](./faster-rcnn_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_20200504_210434.log.json) | +| R-101-FPN | caffe | 1x | 5.7 | | 39.8 | [config](./faster-rcnn_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.398_20200504_180057-b269e9dd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_20200504_180057.log.json) | +| R-101-FPN 
| pytorch | 1x | 6.0 | 15.6 | 39.4 | [config](./faster-rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130-f513f705.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130_204655.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 39.8 | [config](./faster-rcnn_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_20200504_210455.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.2 | 13.8 | 41.2 | [config](./faster-rcnn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203-cff10310.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203_000520.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 41.2 | [config](./faster-rcnn_x101-32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.412_20200506_041400-64a12c0b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_20200506_041400.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.3 | 9.4 | 42.1 | [config](./faster-rcnn_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204-833ee192.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204_134340.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 41.6 | [config](./faster-rcnn_x101-64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033-5961fa95.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033.log.json) | + +## Different regression loss + +We trained the R-50-FPN PyTorch-style backbone with a 1x schedule and compared different box regression losses.
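Each variant in the comparison below only overrides `loss_bbox` in the RoI box head of the base 1x config (IoU-based losses also need `reg_decoded_bbox=True`). As a minimal sketch, the GIoU entry corresponds to a child config like the following, mirroring `faster-rcnn_r50_fpn_giou_1x_coco.py` added later in this patch:

```python
_base_ = './faster-rcnn_r50_fpn_1x_coco.py'

model = dict(
    roi_head=dict(
        bbox_head=dict(
            # apply the loss on decoded boxes, as required by IoU-based losses
            reg_decoded_bbox=True,
            # swap the default L1Loss for GIoULoss
            loss_bbox=dict(type='GIoULoss', loss_weight=10.0))))
```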
+ +| Backbone | Loss type | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :------------: | :------: | :------------: | :----: | :----------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | L1Loss | 4.0 | 21.4 | 37.4 | [config](./faster-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| R-50-FPN | IoULoss | | | 37.9 | [config](./faster-rcnn_r50_fpn_iou_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco_20200506_095954-938e81f0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco_20200506_095954.log.json) | +| R-50-FPN | GIoULoss | | | 37.6 | [config](./faster-rcnn_r50_fpn_giou_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco-0eada910.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco_20200505_161120.log.json) | +| R-50-FPN | BoundedIoULoss | | | 37.4 | [config](./faster-rcnn_r50_fpn_bounded-iou_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco-98ad993b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco_20200505_160738.log.json) | + +## Pre-trained Models + +We also train some models with longer schedules and multi-scale training. The users could finetune them for downstream tasks. 
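Here "multi-scale training" means randomly picking the resize scale for every training image. A minimal sketch of the pipeline the `ms-*` configs share, mirroring the `RandomChoiceResize` pipeline defined in the config files added later in this patch:

```python
train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=_base_.backend_args),
    dict(type='LoadAnnotations', with_bbox=True),
    # randomly choose one of several short-side scales each iteration
    dict(
        type='RandomChoiceResize',
        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
                (1333, 768), (1333, 800)],
        keep_ratio=True),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PackDetInputs')
]
# override the training pipeline inherited from the base config
_base_.train_dataloader.dataset.pipeline = train_pipeline
```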
+ +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-----------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [R-50-C4](./faster-rcnn_r50-caffe-c4_ms-1x_coco.py) | caffe | 1x | - | | 35.9 | [config](./faster-rcnn_r50-caffe-c4_ms-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527-db276fed.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527.log.json) | +| [R-50-DC5](./faster-rcnn_r50-caffe-dc5_ms-1x_coco.py) | caffe | 1x | - | | 37.4 | [config](./faster-rcnn_r50-caffe-dc5_ms-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851-b33d21b9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851.log.json) | +| [R-50-DC5](./faster-rcnn_r50-caffe-dc5_ms-3x_coco.py) | caffe | 3x | - | | 38.7 | [config](./faster-rcnn_r50-caffe-dc5_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107-34a53b2c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107.log.json) | +| [R-50-FPN](./faster-rcnn_r50-caffe_fpn_ms-2x_coco.py) | caffe | 2x | 3.7 | | 39.7 | [config](./faster-rcnn_r50-caffe_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_bbox_mAP-0.397_20200504_231813-10b2de58.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_20200504_231813.log.json) | +| [R-50-FPN](./faster-rcnn_r50-caffe_fpn_ms-3x_coco.py) | caffe | 3x | 3.7 | | 39.9 | [config](./faster-rcnn_r50-caffe_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054-1f77628b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054.log.json) | +| [R-50-FPN](./faster-rcnn_r50_fpn_ms-3x_coco.py) | pytorch | 3x | 3.9 | | 40.3 | [config](./faster-rcnn_r50_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822-e10bd31c.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822.log.json) | +| [R-101-FPN](./faster-rcnn_r101-caffe_fpn_ms-3x_coco.py) | caffe | 3x | 5.6 | | 42.0 | [config](./faster-rcnn_r101-caffe_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742-a7ae426d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742.log.json) | +| [R-101-FPN](./faster-rcnn_r101_fpn_ms-3x_coco.py) | pytorch | 3x | 5.8 | | 41.8 | [config](./faster-rcnn_r101_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822-4d4d2ca8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822.log.json) | +| [X-101-32x4d-FPN](./faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py) | pytorch | 3x | 7.0 | | 42.5 | [config](./faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151-16b9b260.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151.log.json) | +| [X-101-32x8d-FPN](./faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py) | pytorch | 3x | 10.1 | | 42.4 | [config](./faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954-002e082a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954.log.json) | +| [X-101-64x4d-FPN](./faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py) | pytorch | 3x | 10.0 | | 43.1 | [config](./faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528-26c63de6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528.log.json) | + +We further fine-tune some pre-trained models on COCO subsets that contain only a few of the 80 categories.
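These subset configs only shrink the box head to the kept categories, restrict the dataset `metainfo`, and warm-start from the multi-scale 3x checkpoint via `load_from`. As a sketch, the person-only config added later in this patch reduces to roughly:

```python
_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py'

# a single foreground class instead of the 80 COCO categories
model = dict(roi_head=dict(bbox_head=dict(num_classes=1)))

metainfo = {
    'classes': ('person', ),
    'palette': [(220, 20, 60), ],
}
train_dataloader = dict(dataset=dict(metainfo=metainfo))
val_dataloader = dict(dataset=dict(metainfo=metainfo))
test_dataloader = dict(dataset=dict(metainfo=metainfo))

# initialise from the COCO-pre-trained 3x checkpoint rather than ImageNet weights
load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth'  # noqa
```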
| Backbone | Style | Class name | Pre-trained model | Mem (GB) | box AP | Config | Download | +| ------------------------------------------------------------------------ | ----- | ------------------ | -------------------------------------------------------------- | -------- | ------ | ---------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [R-50-FPN](./faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py) | caffe | person | [R-50-FPN-Caffe-3x](./faster-rcnn_r50-caffe_fpn_ms-3x_coco.py) | 3.7 | 55.8 | [config](./faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929.log.json) | +| [R-50-FPN](./faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py) | caffe | person-bicycle-car | [R-50-FPN-Caffe-3x](./faster-rcnn_r50-caffe_fpn_ms-3x_coco.py) | 3.7 | 44.1 | [config](./faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car_20201216_173117-6eda6d92.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car/faster_rcnn_r50_fpn_1x_coco-person-bicycle-car_20201216_173117.log.json) | + +## Torchvision New Recipe (TNR) + +Torchvision released its high-precision ResNet models. The training details can be found on the [PyTorch website](https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/). Here, we ran grid searches over the learning rate and weight decay and found the optimal hyper-parameters for the detection task.
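Concretely, the TNR variant points the backbone at the high-precision torchvision checkpoint and replaces the default SGD optimizer with the grid-searched AdamW setting, as in `faster-rcnn_r50-tnr-pre_fpn_1x_coco.py` added later in this patch:

```python
checkpoint = 'https://download.pytorch.org/models/resnet50-11ad3fa6.pth'
model = dict(
    backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint)))

# grid-searched optimum: AdamW with lr=0.0001 and weight_decay=0.1
optim_wrapper = dict(
    optimizer=dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.1),
    paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True))
```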
+ +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :--------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [R-50-TNR](./faster-rcnn_r50-tnr-pre_fpn_1x_coco.py) | pytorch | 1x | - | | 40.2 | [config](./faster-rcnn_r50-tnr-pre_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147.log.json) | + +## Citation + +```latex +@article{Ren_2017, + title={Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks}, + journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher={Institute of Electrical and Electronics Engineers (IEEE)}, + author={Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian}, + year={2017}, + month={Jun}, +} +``` diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..a18f1ad --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './faster-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py new file mode 100644 index 0000000..1cdb4d4 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py @@ -0,0 +1,11 @@ +_base_ = 'faster-rcnn_r50_fpn_ms-3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..d113ae6 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py new file mode 100644 index 0000000..b471fb3 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster-rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git 
a/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..a71d4af --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py new file mode 100644 index 0000000..8ef6d1f --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py @@ -0,0 +1,7 @@ +_base_ = 'faster-rcnn_r50_fpn_ms-3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..65515c9 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py new file mode 100644 index 0000000..7e231e8 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster-rcnn_r50-caffe_c4-1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +_base_.train_dataloader.dataset.pipeline = train_pipeline diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py new file mode 100644 index 0000000..8952a5c --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50-caffe-dc5.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py new file mode 100644 index 0000000..63a6885 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py @@ -0,0 +1,14 @@ +_base_ = 'faster-rcnn_r50-caffe-dc5_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +_base_.train_dataloader.dataset.pipeline = train_pipeline 
diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py new file mode 100644 index 0000000..2706346 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py @@ -0,0 +1,18 @@ +_base_ = './faster-rcnn_r50-caffe-dc5_ms-1x_coco.py' + +# MMEngine support the following two ways, users can choose +# according to convenience +# param_scheduler = [ +# dict( +# type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), # noqa +# dict( +# type='MultiStepLR', +# begin=0, +# end=12, +# by_epoch=True, +# milestones=[28, 34], +# gamma=0.1) +# ] +_base_.param_scheduler[1].milestones = [28, 34] + +train_cfg = dict(max_epochs=36) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py new file mode 100644 index 0000000..0888fc0 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50-caffe-c4.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..9129a95 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,15 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_90k_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_90k_coco.py new file mode 100644 index 0000000..27f4935 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_90k_coco.py @@ -0,0 +1,22 @@ +_base_ = 'faster-rcnn_r50-caffe_fpn_1x_coco.py' +max_iter = 90000 + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=10000) +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000)) +log_processor = dict(by_epoch=False) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py new file mode 100644 index 0000000..f36bb05 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person-bicycle-car.py @@ -0,0 +1,16 @@ +_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py' +model = dict(roi_head=dict(bbox_head=dict(num_classes=3))) +metainfo = { + 'classes': ('person', 'bicycle', 'car'), + 'palette': [ + (220, 20, 60), + (119, 11, 32), + (0, 0, 142), + ] +} + +train_dataloader = dict(dataset=dict(metainfo=metainfo)) +val_dataloader = dict(dataset=dict(metainfo=metainfo)) +test_dataloader = 
dict(dataset=dict(metainfo=metainfo)) + +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth' # noqa diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py new file mode 100644 index 0000000..9528b63 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py @@ -0,0 +1,14 @@ +_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py' +model = dict(roi_head=dict(bbox_head=dict(num_classes=1))) +metainfo = { + 'classes': ('person', ), + 'palette': [ + (220, 20, 60), + ] +} + +train_dataloader = dict(dataset=dict(metainfo=metainfo)) +val_dataloader = dict(dataset=dict(metainfo=metainfo)) +test_dataloader = dict(dataset=dict(metainfo=metainfo)) + +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_bbox_mAP-0.398_20200504_163323-30042637.pth' # noqa diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py new file mode 100644 index 0000000..59f1633 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py @@ -0,0 +1,31 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +# MMEngine support the following two ways, users can choose +# according to convenience +# train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.train_dataloader.dataset.pipeline = train_pipeline diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py new file mode 100644 index 0000000..44d320e --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py @@ -0,0 +1,18 @@ +_base_ = './faster-rcnn_r50-caffe_fpn_ms-1x_coco.py' + +# MMEngine support the following two ways, users can choose +# according to convenience +# param_scheduler = [ +# dict( +# type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), # noqa +# dict( +# type='MultiStepLR', +# begin=0, +# end=12, +# by_epoch=True, +# milestones=[16, 23], +# gamma=0.1) +# ] +_base_.param_scheduler[1].milestones = [16, 23] + +train_cfg = dict(max_epochs=24) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py new file mode 100644 index 0000000..365f643 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py @@ -0,0 +1,15 @@ +_base_ = 
'faster-rcnn_r50_fpn_ms-3x_coco.py' +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-90k_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-90k_coco.py new file mode 100644 index 0000000..6b9b3eb --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-90k_coco.py @@ -0,0 +1,23 @@ +_base_ = 'faster-rcnn_r50-caffe_fpn_ms-1x_coco.py' + +max_iter = 90000 + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=10000) +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000)) +log_processor = dict(by_epoch=False) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py new file mode 100644 index 0000000..7b3e5de --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.pytorch.org/models/resnet50-11ad3fa6.pth' +model = dict( + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint))) + +# `lr` and `weight_decay` have been searched to be optimal. 
+optim_wrapper = dict( + optimizer=dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.1), + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..8a45417 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py new file mode 100644 index 0000000..2981c6f --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..3d366f3 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../common/lsj-200e_coco-detection.py' +] +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +model = dict(data_preprocessor=dict(batch_augments=batch_augments)) + +train_dataloader = dict(batch_size=8, num_workers=4) +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.02 * 4, momentum=0.9, weight_decay=0.00004)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py new file mode 100644 index 0000000..f765dea --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' + +# MMEngine support the following two ways, users can choose +# according to convenience +# optim_wrapper = dict(type='AmpOptimWrapper') +_base_.optim_wrapper.type = 'AmpOptimWrapper' diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py new file mode 100644 index 0000000..7758ca8 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_head=dict( + reg_decoded_bbox=True, + loss_bbox=dict(type='BoundedIoULoss', loss_weight=10.0)))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_coco.py new file mode 100644 index 0000000..e8d8a30 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ciou_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_head=dict( + reg_decoded_bbox=True, + loss_bbox=dict(type='CIoULoss', loss_weight=12.0)))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py new file mode 100644 index 0000000..b5a34d9 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py @@ -0,0 +1,48 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + # copied from configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py + neck=dict( + start_level=1, + add_extra_convs='on_output', # use P5 + relu_before_extra_convs=True), + rpn_head=dict( + _delete_=True, # ignore the unused old settings + type='FCOSHead', + # num_classes = 1 for rpn, + # if num_classes > 1, it will be set to 1 in + # TwoStageDetector automatically + num_classes=1, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + roi_head=dict( # update featmap_strides + bbox_roi_extractor=dict(featmap_strides=[8, 16, 32, 64, 128]))) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), # Slowly increase lr, otherwise loss becomes NAN + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py new file mode 100644 index 0000000..82b71d7 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model 
= dict( + roi_head=dict( + bbox_head=dict( + reg_decoded_bbox=True, + loss_bbox=dict(type='GIoULoss', loss_weight=10.0)))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py new file mode 100644 index 0000000..e21c436 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + bbox_head=dict( + reg_decoded_bbox=True, + loss_bbox=dict(type='IoULoss', loss_weight=10.0)))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py new file mode 100644 index 0000000..75dcfeb --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py @@ -0,0 +1 @@ +_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py'] diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ohem_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ohem_1x_coco.py new file mode 100644 index 0000000..4f804b9 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_ohem_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict(train_cfg=dict(rcnn=dict(sampler=dict(type='OHEMSampler')))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_soft-nms_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_soft-nms_1x_coco.py new file mode 100644 index 0000000..3775d8e --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_soft-nms_1x_coco.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + test_cfg=dict( + rcnn=dict( + score_thr=0.05, + nms=dict(type='soft_nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..395c98c --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py new file mode 100644 index 0000000..6232d0e --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster-rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py new file mode 100644 index 0000000..88cb40f --- /dev/null +++ 
b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py @@ -0,0 +1,14 @@ +_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py'] +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py new file mode 100644 index 0000000..28d6290 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py @@ -0,0 +1,23 @@ +_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py'] +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..f39d632 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py new file mode 100644 index 0000000..97a3c13 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './faster-rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py new file mode 100644 index 0000000..eeaa218 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py @@ -0,0 +1,14 @@ +_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py'] +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/faster_rcnn/metafile.yml 
b/mmdetection/configs/faster_rcnn/metafile.yml new file mode 100644 index 0000000..6a201e1 --- /dev/null +++ b/mmdetection/configs/faster_rcnn/metafile.yml @@ -0,0 +1,451 @@ +Collections: + - Name: Faster R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - RPN + - ResNet + - RoIPool + Paper: + URL: https://arxiv.org/abs/1506.01497 + Title: "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks" + README: configs/faster_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/faster_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: faster-rcnn_r50-caffe-c4_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50-caffe_c4-1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_1x_coco/faster_rcnn_r50_caffe_c4_1x_coco_20220316_150152-3f885b85.pth + + - Name: faster-rcnn_r50-caffe-c4_mstrain_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50-caffe-c4_ms-1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_c4_mstrain_1x_coco/faster_rcnn_r50_caffe_c4_mstrain_1x_coco_20220316_150527-db276fed.pth + + - Name: faster-rcnn_r50-caffe-dc5_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_1x_coco/faster_rcnn_r50_caffe_dc5_1x_coco_20201030_151909-531f0f43.pth + + - Name: faster-rcnn_r50-caffe_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth + + - Name: faster-rcnn_r50_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 46.73 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth + + - Name: faster-rcnn_r50_fpn_fp16_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50_fpn_amp-1x_coco.py + Metadata: + Training Memory (GB): 3.4 + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + inference time (ms/im): + - value: 34.72 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP16 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.5 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/fp16/faster_rcnn_r50_fpn_fp16_1x_coco/faster_rcnn_r50_fpn_fp16_1x_coco_20200204-d4dc1471.pth + + - Name: faster-rcnn_r50_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 46.73 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth + + - Name: faster-rcnn_r101-caffe_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.7 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_1x_coco/faster_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.398_20200504_180057-b269e9dd.pth + + - Name: faster-rcnn_r101_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 64.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_1x_coco/faster_rcnn_r101_fpn_1x_coco_20200130-f513f705.pth + + - Name: faster-rcnn_r101_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 64.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth + + - Name: faster-rcnn_x101-32x4d_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.2 + inference time (ms/im): + - value: 72.46 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_1x_coco/faster_rcnn_x101_32x4d_fpn_1x_coco_20200203-cff10310.pth + + - Name: faster-rcnn_x101-32x4d_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.2 + inference time (ms/im): + - value: 72.46 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_2x_coco/faster_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.412_20200506_041400-64a12c0b.pth + + - Name: faster-rcnn_x101-64x4d_fpn_1x_coco + In Collection: Faster R-CNN + Config: 
configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 106.38 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_1x_coco/faster_rcnn_x101_64x4d_fpn_1x_coco_20200204-833ee192.pth + + - Name: faster-rcnn_x101-64x4d_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 106.38 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_2x_coco/faster_rcnn_x101_64x4d_fpn_2x_coco_20200512_161033-5961fa95.pth + + - Name: faster-rcnn_r50_fpn_iou_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50_fpn_iou_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_iou_1x_coco/faster_rcnn_r50_fpn_iou_1x_coco_20200506_095954-938e81f0.pth + + - Name: faster-rcnn_r50_fpn_giou_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50_fpn_giou_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_giou_1x_coco-0eada910.pth + + - Name: faster-rcnn_r50_fpn_bounded_iou_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50_fpn_bounded-iou_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_bounded_iou_1x_coco-98ad993b.pth + + - Name: faster-rcnn_r50-caffe-dc5_mstrain_1x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco/faster_rcnn_r50_caffe_dc5_mstrain_1x_coco_20201028_233851-b33d21b9.pth + + - Name: faster-rcnn_r50-caffe-dc5_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50-caffe-dc5_ms-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco/faster_rcnn_r50_caffe_dc5_mstrain_3x_coco_20201028_002107-34a53b2c.pth + + - Name: faster-rcnn_r50-caffe_fpn_ms-2x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-2x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.7 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco/faster_rcnn_r50_caffe_fpn_mstrain_2x_coco_bbox_mAP-0.397_20200504_231813-10b2de58.pth + + - Name: faster-rcnn_r50-caffe_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 3.7 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco/faster_rcnn_r50_caffe_fpn_mstrain_3x_coco_20210526_095054-1f77628b.pth + + - Name: faster-rcnn_r50_fpn_mstrain_3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r50_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 3.9 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_mstrain_3x_coco/faster_rcnn_r50_fpn_mstrain_3x_coco_20210524_110822-e10bd31c.pth + + - Name: faster-rcnn_r101-caffe_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r101-caffe_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 5.6 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco/faster_rcnn_r101_caffe_fpn_mstrain_3x_coco_20210526_095742-a7ae426d.pth + + - Name: faster-rcnn_r101_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_r101_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 5.8 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_mstrain_3x_coco/faster_rcnn_r101_fpn_mstrain_3x_coco_20210524_110822-4d4d2ca8.pth + + - Name: faster-rcnn_x101-32x4d_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_x101-32x4d_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 7.0 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x4d_fpn_mstrain_3x_coco_20210524_124151-16b9b260.pth + + - Name: faster-rcnn_x101-32x8d_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_x101-32x8d_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 10.1 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco/faster_rcnn_x101_32x8d_fpn_mstrain_3x_coco_20210604_182954-002e082a.pth + + - Name: faster-rcnn_x101-64x4d_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/faster_rcnn/faster-rcnn_x101-64x4d_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 10.0 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco/faster_rcnn_x101_64x4d_fpn_mstrain_3x_coco_20210524_124528-26c63de6.pth + + - Name: faster-rcnn_r50_fpn_tnr-pretrain_1x_coco + In Collection: Faster R-CNN + Config: 
configs/faster_rcnn/faster-rcnn_r50-tnr-pre_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 46.73 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco/faster_rcnn_r50_fpn_tnr-pretrain_1x_coco_20220320_085147-efedfda4.pth diff --git a/mmdetection/configs/fcos/README.md b/mmdetection/configs/fcos/README.md new file mode 100644 index 0000000..8d72237 --- /dev/null +++ b/mmdetection/configs/fcos/README.md @@ -0,0 +1,45 @@ +# FCOS + +> [FCOS: Fully Convolutional One-Stage Object Detection](https://arxiv.org/abs/1904.01355) + + + +## Abstract + +We propose a fully convolutional one-stage object detector (FCOS) to solve object detection in a per-pixel prediction fashion, analogue to semantic segmentation. Almost all state-of-the-art object detectors such as RetinaNet, SSD, YOLOv3, and Faster R-CNN rely on pre-defined anchor boxes. In contrast, our proposed detector FCOS is anchor box free, as well as proposal free. By eliminating the predefined set of anchor boxes, FCOS completely avoids the complicated computation related to anchor boxes such as calculating overlapping during training. More importantly, we also avoid all hyper-parameters related to anchor boxes, which are often very sensitive to the final detection performance. With the only post-processing non-maximum suppression (NMS), FCOS with ResNeXt-64x4d-101 achieves 44.7% in AP with single-model and single-scale testing, surpassing previous one-stage detectors with the advantage of being much simpler. For the first time, we demonstrate a much simpler and flexible detection framework achieving improved detection accuracy. We hope that the proposed FCOS framework can serve as a simple and strong alternative for many other instance-level tasks. + +
    + +
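The snippet below is a minimal usage sketch and is not part of the upstream configs: it shows how one of the FCOS configs in this folder could be loaded for inference with mmdetection's `init_detector` / `inference_detector` helpers. The checkpoint filename and test image are placeholders; substitute any of the weights listed in the tables below.

```python
# Minimal inference sketch for an FCOS config in this folder.
# Assumes mmdetection is installed and the checkpoint has been downloaded
# from the "Download" column below; the file names here are placeholders.
from mmdet.apis import init_detector, inference_detector

config_file = 'configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py'
checkpoint_file = 'fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth'

# Build the detector and load the pretrained weights.
model = init_detector(config_file, checkpoint_file, device='cuda:0')

# Single-image inference; the result holds per-class boxes and scores
# produced directly from the per-pixel predictions (no anchors involved).
result = inference_detector(model, 'demo.jpg')
print(result)
```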
    + +## Results and Models + +| Backbone | Style | GN | MS train | Tricks | DCN | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :---: | :-: | :------: | :----: | :-: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | caffe | Y | N | N | N | 1x | 3.6 | 22.7 | 36.6 | [config](./fcos_r50-caffe_fpn_gn-head_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/20201227_180009.log.json) | +| R-50 | caffe | Y | N | Y | N | 1x | 3.7 | - | 38.7 | [config](./fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/20210105_135818.log.json) | +| R-50 | caffe | Y | N | Y | Y | 1x | 3.8 | - | 42.3 | [config](./fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco-ae4d8b3d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/20210105_224556.log.json) | +| R-101 | caffe | Y | N | N | N | 1x | 5.5 | 17.3 | 39.1 | [config](./fcos_r101-caffe_fpn_gn-head-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/fcos_r101_caffe_fpn_gn-head_1x_coco-0e37b982.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/20210103_155046.log.json) | + +| Backbone | Style | GN | MS train | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :-----: | :-: | :------: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | caffe | Y | Y | 2x | 2.6 | 22.9 | 38.5 | [config](./fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco-d92ceeea.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/20201227_161900.log.json) | +| R-101 | caffe 
| Y | Y | 2x | 5.5 | 17.3 | 40.8 | [config](./fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco-511424d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/20210103_155046.log.json) | +| X-101 | pytorch | Y | Y | 2x | 10.0 | 9.7 | 42.6 | [config](./fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco-ede514a8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/20210114_133041.log.json) | + +**Notes:** + +- The X-101 backbone is X-101-64x4d. +- Tricks means setting `norm_on_bbox`, `centerness_on_reg`, `center_sampling` as `True`. +- DCN means using `DCNv2` in both backbone and head. + +## Citation + +```latex +@article{tian2019fcos, + title={FCOS: Fully Convolutional One-Stage Object Detection}, + author={Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong}, + journal={arXiv preprint arXiv:1904.01355}, + year={2019} +} +``` diff --git a/mmdetection/configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py b/mmdetection/configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py new file mode 100644 index 0000000..5380e87 --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py @@ -0,0 +1,9 @@ +_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py' + +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet101_caffe'))) diff --git a/mmdetection/configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py b/mmdetection/configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py new file mode 100644 index 0000000..286a07a --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py @@ -0,0 +1,38 @@ +_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py' + +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet101_caffe'))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# training schedule for 2x +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict(type='ConstantLR', factor=1.0 / 3, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/fcos/fcos_r101_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/fcos/fcos_r101_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..77250e6 --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r101_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py' 
# noqa + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/fcos/fcos_r18_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/fcos/fcos_r18_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..6f00102 --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r18_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py' # noqa + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py new file mode 100644 index 0000000..2a77641 --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py @@ -0,0 +1,43 @@ +_base_ = 'fcos_r50-caffe_fpn_gn-head_1x_coco.py' + +# model setting +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + bbox_head=dict( + norm_on_bbox=True, + centerness_on_reg=True, + dcn_on_last_conv=False, + center_sampling=True, + conv_bias=True, + loss_bbox=dict(type='GIoULoss', loss_weight=1.0)), + # training and testing settings + test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6))) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3.0, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict(clip_grad=None) diff --git a/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head-center_1x_coco.py b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head-center_1x_coco.py new file mode 100644 index 0000000..9e4eb1d --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head-center_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py' + +# model settings +model = dict(bbox_head=dict(center_sampling=True, center_sample_radius=1.5)) diff --git a/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py new file mode 100644 index 0000000..928a9b4 --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py @@ -0,0 +1,75 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='FCOS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[102.9801, 115.9465, 122.7717], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + 
out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # testing settings + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) + +# learning rate +param_scheduler = [ + dict(type='ConstantLR', factor=1.0 / 3, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.01), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.), + clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py new file mode 100644 index 0000000..32358cd --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py @@ -0,0 +1,5 @@ +# TODO: Remove this config after benchmarking all related configs +_base_ = 'fcos_r50-caffe_fpn_gn-head_1x_coco.py' + +# dataset settings +train_dataloader = dict(batch_size=4, num_workers=4) diff --git a/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py new file mode 100644 index 0000000..4d50b4e --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py @@ -0,0 +1,30 @@ +_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py' + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# training schedule for 2x +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict(type='ConstantLR', factor=1.0 / 3, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py b/mmdetection/configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py new file mode 100644 index 0000000..a6a6c44 --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = 'fcos_r50-caffe_fpn_gn-head_1x_coco.py' + +# model settings +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + bbox_head=dict( + norm_on_bbox=True, + 
centerness_on_reg=True, + dcn_on_last_conv=True, + center_sampling=True, + conv_bias=True, + loss_bbox=dict(type='GIoULoss', loss_weight=1.0)), + # training and testing settings + test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6))) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3.0, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict(clip_grad=None) diff --git a/mmdetection/configs/fcos/fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/fcos/fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..b51556b --- /dev/null +++ b/mmdetection/configs/fcos/fcos_r50_fpn_gn-head-center-normbbox-centeronreg-giou_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,75 @@ +_base_ = '../common/lsj-200e_coco-detection.py' + +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +# model settings +model = dict( + type='FCOS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + norm_on_bbox=True, + centerness_on_reg=True, + dcn_on_last_conv=False, + center_sampling=True, + conv_bias=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # testing settings + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +train_dataloader = dict(batch_size=8, num_workers=4) +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.), + clip_grad=dict(max_norm=35, norm_type=2)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py b/mmdetection/configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py new file mode 100644 index 0000000..503c0e1 --- /dev/null +++ b/mmdetection/configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py @@ -0,0 +1,52 @@ +_base_ = './fcos_r50-caffe_fpn_gn-head_1x_coco.py' + +# model settings +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# training schedule for 2x +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict(type='ConstantLR', factor=1.0 / 3, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/fcos/metafile.yml b/mmdetection/configs/fcos/metafile.yml new file mode 100644 index 0000000..fb6527c --- /dev/null +++ b/mmdetection/configs/fcos/metafile.yml @@ -0,0 +1,146 @@ +Collections: + - Name: FCOS + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - Group Normalization + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.01355 + Title: 'FCOS: Fully Convolutional One-Stage Object Detection' + README: configs/fcos/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/fcos.py#L6 + Version: v2.0.0 + +Models: + - Name: fcos_r50-caffe_fpn_gn-head_1x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py + Metadata: + Training Memory (GB): 3.6 + inference time (ms/im): + - value: 44.05 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth + + - Name: fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r50-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py + Metadata: + Training Memory (GB): 3.7 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth + + - Name: 
fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r50-dcn-caffe_fpn_gn-head-center-normbbox-centeronreg-giou_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_1x_coco-ae4d8b3d.pth + + - Name: fcos_r101-caffe_fpn_gn-head-1x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r101-caffe_fpn_gn-head-1x_coco.py + Metadata: + Training Memory (GB): 5.5 + inference time (ms/im): + - value: 57.8 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_1x_coco/fcos_r101_caffe_fpn_gn-head_1x_coco-0e37b982.pth + + - Name: fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r50-caffe_fpn_gn-head_ms-640-800-2x_coco.py + Metadata: + Training Memory (GB): 2.6 + inference time (ms/im): + - value: 43.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r50_caffe_fpn_gn-head_mstrain_640-800_2x_coco-d92ceeea.pth + + - Name: fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco + In Collection: FCOS + Config: configs/fcos/fcos_r101-caffe_fpn_gn-head_ms-640-800-2x_coco.py + Metadata: + Training Memory (GB): 5.5 + inference time (ms/im): + - value: 57.8 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco/fcos_r101_caffe_fpn_gn-head_mstrain_640-800_2x_coco-511424d6.pth + + - Name: fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco + In Collection: FCOS + Config: configs/fcos/fcos_x101-64x4d_fpn_gn-head_ms-640-800-2x_coco.py + Metadata: + Training Memory (GB): 10.0 + inference time (ms/im): + - value: 103.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco/fcos_x101_64x4d_fpn_gn-head_mstrain_640-800_2x_coco-ede514a8.pth diff --git a/mmdetection/configs/foveabox/README.md b/mmdetection/configs/foveabox/README.md new file mode 100644 index 0000000..96f1358 --- /dev/null +++ b/mmdetection/configs/foveabox/README.md @@ -0,0 +1,53 @@ +# FoveaBox + +> [FoveaBox: Beyond Anchor-based Object Detector](https://arxiv.org/abs/1904.03797) + + + +## Abstract + +We present FoveaBox, an accurate, flexible, and completely anchor-free framework for object detection. 
While almost all state-of-the-art object detectors utilize predefined anchors to enumerate possible locations, scales and aspect ratios for the search of the objects, their performance and generalization ability are also limited to the design of anchors. Instead, FoveaBox directly learns the object existing possibility and the bounding box coordinates without anchor reference. This is achieved by: (a) predicting category-sensitive semantic maps for the object existing possibility, and (b) producing category-agnostic bounding box for each position that potentially contains an object. The scales of target boxes are naturally associated with feature pyramid representations. In FoveaBox, an instance is assigned to adjacent feature levels to make the model more accurate. We demonstrate its effectiveness on standard benchmarks and report extensive experimental analysis. Without bells and whistles, FoveaBox achieves state-of-the-art single model performance on the standard COCO and Pascal VOC object detection benchmark. More importantly, FoveaBox avoids all computation and hyper-parameters related to anchor boxes, which are often sensitive to the final detection performance. We believe the simple and effective approach will serve as a solid baseline and help ease future research for object detection. +
    + +
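As a pointer before the details, here is a short sketch of the head settings that implement the scale assignment described above. The values are taken verbatim from `fovea_r50_fpn_4xb4-1x_coco.py` in this folder; the comments are explanatory additions rather than upstream documentation.

```python
# Excerpt (with added comments) of the FoveaHead configuration used by the
# baseline config in this folder. The overlapping scale_ranges are what let
# one instance be assigned to adjacent pyramid levels, as described above.
bbox_head = dict(
    type='FoveaHead',
    num_classes=80,
    in_channels=256,
    strides=[8, 16, 32, 64, 128],            # one prediction map per FPN level
    base_edge_list=[16, 32, 64, 128, 256],   # nominal box edge handled by each level
    scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)),
    sigma=0.4,          # shrink factor of the positive (fovea) region
    with_deform=False)  # set to True in the *gn-head-align* variants below
```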
    + +## Introduction + +FoveaBox is an accurate, flexible and completely anchor-free object detection system for object detection framework, as presented in our paper [https://arxiv.org/abs/1904.03797](https://arxiv.org/abs/1904.03797): +Different from previous anchor-based methods, FoveaBox directly learns the object existing possibility and the bounding box coordinates without anchor reference. This is achieved by: (a) predicting category-sensitive semantic maps for the object existing possibility, and (b) producing category-agnostic bounding box for each position that potentially contains an object. + +## Results and Models + +### Results on R50/101-FPN + +| Backbone | Style | align | ms-train | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :-----: | :---: | :------: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | N | N | 1x | 5.6 | 24.1 | 36.5 | [config](./fovea_r50_fpn_4xb4-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219-ee4d5303.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219_223025.log.json) | +| R-50 | pytorch | N | N | 2x | 5.6 | - | 37.2 | [config](./fovea_r50_fpn_4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203-2df792b1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203_112043.log.json) | +| R-50 | pytorch | Y | N | 2x | 8.1 | 19.4 | 37.9 | [config](./fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203_134252.log.json) | +| R-50 | pytorch | Y | Y | 2x | 8.1 | 18.3 | 40.4 | [config](./fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205-85ce26cb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205_112557.log.json) | +| R-101 | pytorch | N | N | 1x | 9.2 | 17.4 | 38.6 | [config](./fovea_r101_fpn_4xb4-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219-05e38f1c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219_011740.log.json) | +| R-101 | pytorch | N | N | 2x | 11.7 | - | 40.0 | 
[config](./fovea_r101_fpn_4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208-02320ea4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208_202059.log.json) | +| R-101 | pytorch | Y | N | 2x | 11.7 | 14.7 | 40.0 | [config](./fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208-c39a027a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208_203337.log.json) | +| R-101 | pytorch | Y | Y | 2x | 11.7 | 14.7 | 42.0 | [config](./fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208-649c5eb6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208_202124.log.json) | + +\[1\] *1x and 2x mean the model is trained for 12 and 24 epochs, respectively.* \ +\[2\] *Align means utilizing deformable convolution to align the cls branch.* \ +\[3\] *All results are obtained with a single model and without any test time data augmentation.*\ +\[4\] *We use 4 GPUs for training.* + +Any pull requests or issues are welcome. + +## Citation + +Please consider citing our paper in your publications if the project helps your research. BibTeX reference is as follows. 
+ +```latex +@article{kong2019foveabox, + title={FoveaBox: Beyond Anchor-based Object Detector}, + author={Kong, Tao and Sun, Fuchun and Liu, Huaping and Jiang, Yuning and Shi, Jianbo}, + journal={arXiv preprint arXiv:1904.03797}, + year={2019} +} +``` diff --git a/mmdetection/configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py b/mmdetection/configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py new file mode 100644 index 0000000..7e8ccf9 --- /dev/null +++ b/mmdetection/configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fovea_r50_fpn_4xb4-1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py b/mmdetection/configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py new file mode 100644 index 0000000..0dc9851 --- /dev/null +++ b/mmdetection/configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fovea_r50_fpn_4xb4-2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py b/mmdetection/configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py new file mode 100644 index 0000000..222671d --- /dev/null +++ b/mmdetection/configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py @@ -0,0 +1,23 @@ +_base_ = './fovea_r50_fpn_4xb4-1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + with_deform=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) +# learning policy +max_epochs = 24 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py b/mmdetection/configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py new file mode 100644 index 0000000..e1852d5 --- /dev/null +++ b/mmdetection/configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py @@ -0,0 +1,34 @@ +_base_ = './fovea_r50_fpn_4xb4-1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + with_deform=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +# learning policy +max_epochs = 24 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py b/mmdetection/configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py new file mode 100644 index 0000000..13cf3ae --- /dev/null +++ b/mmdetection/configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py 
@@ -0,0 +1,59 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='FOVEA', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + num_outs=5, + add_extra_convs='on_input'), + bbox_head=dict( + type='FoveaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + base_edge_list=[16, 32, 64, 128, 256], + scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)), + sigma=0.4, + with_deform=False, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=1.50, + alpha=0.4, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + nms_pre=1000, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) +train_dataloader = dict(batch_size=4, num_workers=4) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py b/mmdetection/configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py new file mode 100644 index 0000000..f9d06ef --- /dev/null +++ b/mmdetection/configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './fovea_r50_fpn_4xb4-1x_coco.py' +# learning policy +max_epochs = 24 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py b/mmdetection/configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py new file mode 100644 index 0000000..877bb4f --- /dev/null +++ b/mmdetection/configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py @@ -0,0 +1,20 @@ +_base_ = './fovea_r50_fpn_4xb4-1x_coco.py' +model = dict( + bbox_head=dict( + with_deform=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) +# learning policy +max_epochs = 24 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py b/mmdetection/configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py new file mode 100644 index 0000000..5690bca --- /dev/null +++ b/mmdetection/configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py @@ -0,0 +1,30 @@ +_base_ = './fovea_r50_fpn_4xb4-1x_coco.py' +model = dict( + bbox_head=dict( + with_deform=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) +train_pipeline = [ + 
dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +# learning policy +max_epochs = 24 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/foveabox/metafile.yml b/mmdetection/configs/foveabox/metafile.yml new file mode 100644 index 0000000..9ab2f54 --- /dev/null +++ b/mmdetection/configs/foveabox/metafile.yml @@ -0,0 +1,172 @@ +Collections: + - Name: FoveaBox + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 4x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.03797 + Title: 'FoveaBox: Beyond Anchor-based Object Detector' + README: configs/foveabox/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/fovea.py#L6 + Version: v2.0.0 + +Models: + - Name: fovea_r50_fpn_4xb4-1x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r50_fpn_4xb4-1x_coco.py + Metadata: + Training Memory (GB): 5.6 + inference time (ms/im): + - value: 41.49 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_1x_coco/fovea_r50_fpn_4x4_1x_coco_20200219-ee4d5303.pth + + - Name: fovea_r50_fpn_4xb4-2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r50_fpn_4xb4-2x_coco.py + Metadata: + Training Memory (GB): 5.6 + inference time (ms/im): + - value: 41.49 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r50_fpn_4x4_2x_coco/fovea_r50_fpn_4x4_2x_coco_20200203-2df792b1.pth + + - Name: fovea_r50_fpn_gn-head-align_4xb4-2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r50_fpn_gn-head-align_4xb4-2x_coco.py + Metadata: + Training Memory (GB): 8.1 + inference time (ms/im): + - value: 51.55 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_4x4_2x_coco/fovea_align_r50_fpn_gn-head_4x4_2x_coco_20200203-8987880d.pth + + - Name: fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r50_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py + Metadata: + Training Memory (GB): 8.1 + inference time (ms/im): + - value: 54.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r50_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200205-85ce26cb.pth + + - Name: fovea_r101_fpn_4xb4-1x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r101_fpn_4xb4-1x_coco.py + Metadata: + Training Memory (GB): 9.2 + inference time (ms/im): + - value: 57.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_1x_coco/fovea_r101_fpn_4x4_1x_coco_20200219-05e38f1c.pth + + - Name: fovea_r101_fpn_4xb4-2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r101_fpn_4xb4-2x_coco.py + Metadata: + Training Memory (GB): 11.7 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_r101_fpn_4x4_2x_coco/fovea_r101_fpn_4x4_2x_coco_20200208-02320ea4.pth + + - Name: fovea_r101_fpn_gn-head-align_4xb4-2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r101_fpn_gn-head-align_4xb4-2x_coco.py + Metadata: + Training Memory (GB): 11.7 + inference time (ms/im): + - value: 68.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_4x4_2x_coco/fovea_align_r101_fpn_gn-head_4x4_2x_coco_20200208-c39a027a.pth + + - Name: fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco + In Collection: FoveaBox + Config: configs/foveabox/fovea_r101_fpn_gn-head-align_ms-640-800-4xb4-2x_coco.py + Metadata: + Training Memory (GB): 11.7 + inference time (ms/im): + - value: 68.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/foveabox/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco/fovea_align_r101_fpn_gn-head_mstrain_640-800_4x4_2x_coco_20200208-649c5eb6.pth diff --git a/mmdetection/configs/fpg/README.md b/mmdetection/configs/fpg/README.md new file mode 100644 index 0000000..1e2fd40 --- /dev/null +++ b/mmdetection/configs/fpg/README.md @@ -0,0 +1,43 @@ +# FPG + +> [Feature Pyramid Grids](https://arxiv.org/abs/2004.03580) + + + +## Abstract + +Feature pyramid networks have been widely adopted in the object detection literature to improve feature representations for better handling of variations in scale. In this paper, we present Feature Pyramid Grids (FPG), a deep multi-pathway feature pyramid, that represents the feature scale-space as a regular grid of parallel bottom-up pathways which are fused by multi-directional lateral connections. FPG can improve single-pathway feature pyramid networks by significantly increasing its performance at similar computation cost, highlighting importance of deep pyramid representations. In addition to its general and uniform structure, over complicated structures that have been found with neural architecture search, it also compares favorably against such approaches without relying on search. 
We hope that FPG with its uniform and effective nature can serve as a strong component for future work in object recognition. + +
    + +
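Before the benchmark table, the sketch below illustrates the `chn128` variants referenced there: the override (mirroring `faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py` in this folder) narrows the FPG neck and the heads that consume its features from 256 to 128 channels, which mainly reduces memory.

```python
# Sketch of the "chn128" override used by the lighter FPG configs below:
# the neck and every head that consumes its features are narrowed from
# 256 to 128 channels, cutting memory with only a small AP drop.
_base_ = 'faster-rcnn_r50_fpg_crop640-50e_coco.py'

model = dict(
    neck=dict(out_channels=128, inter_channels=128),
    rpn_head=dict(in_channels=128),
    roi_head=dict(
        bbox_roi_extractor=dict(out_channels=128),
        bbox_head=dict(in_channels=128)))
```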
    + +## Results and Models + +We benchmark the new training schedule (crop training, large batch, unfrozen BN, 50 epochs) introduced in NAS-FPN. +All backbones are Resnet-50 in pytorch style. + +| Method | Neck | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------: | :--------: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN | FPG | 50e | 20.0 | - | 42.3 | - | [config](./faster-rcnn_r50_fpg_crop640-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856.log.json) | +| Faster R-CNN | FPG-chn128 | 50e | 11.9 | - | 41.2 | - | [config](./faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857.log.json) | +| Faster R-CNN | FPN | 50e | 20.0 | - | 38.9 | - | [config](./faster-rcnn_r50_fpn_crop640-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpn_crop640_50e_coco/faster_rcnn_r50_fpn_crop640_50e_coco_20220311_011857-be7c9f42.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpn_crop640_50e_coco/faster_rcnn_r50_fpn_crop640_50e_coco_20220311_011857.log.json) | +| Mask R-CNN | FPG | 50e | 23.2 | - | 43.0 | 38.1 | [config](./mask-rcnn_r50_fpg_crop640-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857.log.json) | +| Mask R-CNN | FPG-chn128 | 50e | 15.3 | - | 41.7 | 37.1 | [config](./mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859.log.json) | +| Mask R-CNN | FPN | 50e | 23.2 | - | 49.6 | 35.6 | [config](./mask-rcnn_r50_fpn_crop640-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpn_crop640_50e_coco/mask_rcnn_r50_fpn_crop640_50e_coco_20220311_011855-a756664a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpn_crop640_50e_coco/mask_rcnn_r50_fpn_crop640_50e_coco_20220311_011855.log.json) | +| RetinaNet | FPG | 50e | 20.8 | - | 40.5 | - | [config](./retinanet_r50_fpg_crop640_50e_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809.log.json) | +| RetinaNet | FPG-chn128 | 50e | 19.9 | - | 39.9 | - | [config](./retinanet_r50_fpg-chn128_crop640_50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829.log.json) | + +**Note**: Chn128 means to decrease the number of channels of features and convs from 256 (default) to 128 in +Neck and BBox Head, which can greatly decrease memory consumption without sacrificing much precision. + +## Citation + +```latex +@article{chen2020feature, + title={Feature pyramid grids}, + author={Chen, Kai and Cao, Yuhang and Loy, Chen Change and Lin, Dahua and Feichtenhofer, Christoph}, + journal={arXiv preprint arXiv:2004.03580}, + year={2020} +} +``` diff --git a/mmdetection/configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py b/mmdetection/configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py new file mode 100644 index 0000000..cb9160f --- /dev/null +++ b/mmdetection/configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py @@ -0,0 +1,9 @@ +_base_ = 'faster-rcnn_r50_fpg_crop640-50e_coco.py' + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + neck=dict(out_channels=128, inter_channels=128), + rpn_head=dict(in_channels=128), + roi_head=dict( + bbox_roi_extractor=dict(out_channels=128), + bbox_head=dict(in_channels=128))) diff --git a/mmdetection/configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py b/mmdetection/configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py new file mode 100644 index 0000000..d0d366f --- /dev/null +++ b/mmdetection/configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py @@ -0,0 +1,48 @@ +_base_ = 'faster-rcnn_r50_fpn_crop640-50e_coco.py' + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + neck=dict( + type='FPG', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + inter_channels=256, + num_outs=5, + stack_times=9, + paths=['bu'] * 9, + same_down_trans=None, + same_up_trans=dict( + type='conv', + kernel_size=3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_lateral_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_down_trans=dict( + type='interpolation_conv', + mode='nearest', + kernel_size=3, + norm_cfg=norm_cfg, + order=('act', 'conv', 'norm'), + inplace=False), + across_up_trans=None, + across_skip_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + output_trans=dict( + type='last_conv', + kernel_size=3, + order=('act', 'conv', 'norm'), + inplace=False), + norm_cfg=norm_cfg, + skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])) diff --git a/mmdetection/configs/fpg/faster-rcnn_r50_fpn_crop640-50e_coco.py b/mmdetection/configs/fpg/faster-rcnn_r50_fpn_crop640-50e_coco.py new file mode 100644 index 0000000..46211de --- /dev/null +++ b/mmdetection/configs/fpg/faster-rcnn_r50_fpn_crop640-50e_coco.py @@ -0,0 +1,73 @@ +_base_ = [ + 
'../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +norm_cfg = dict(type='BN', requires_grad=True) +image_size = (640, 640) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +model = dict( + data_preprocessor=dict(pad_size_divisor=64, batch_augments=batch_augments), + backbone=dict(norm_cfg=norm_cfg, norm_eval=False), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict(bbox_head=dict(norm_cfg=norm_cfg))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + allow_negative_crop=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=image_size, keep_ratio=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# learning policy +max_epochs = 50 +train_cfg = dict(max_epochs=max_epochs, val_interval=2) +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[30, 40], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001), + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True), + clip_grad=None) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py b/mmdetection/configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py new file mode 100644 index 0000000..8043939 --- /dev/null +++ b/mmdetection/configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py @@ -0,0 +1,10 @@ +_base_ = 'mask-rcnn_r50_fpg_crop640-50e_coco.py' + +model = dict( + neck=dict(out_channels=128, inter_channels=128), + rpn_head=dict(in_channels=128), + roi_head=dict( + bbox_roi_extractor=dict(out_channels=128), + bbox_head=dict(in_channels=128), + mask_roi_extractor=dict(out_channels=128), + mask_head=dict(in_channels=128))) diff --git a/mmdetection/configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py b/mmdetection/configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py new file mode 100644 index 0000000..135bb60 --- /dev/null +++ b/mmdetection/configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py @@ -0,0 +1,48 @@ +_base_ = 'mask-rcnn_r50_fpn_crop640-50e_coco.py' + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + neck=dict( + type='FPG', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + inter_channels=256, + num_outs=5, + stack_times=9, + paths=['bu'] * 9, + same_down_trans=None, + same_up_trans=dict( + type='conv', + kernel_size=3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_lateral_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_down_trans=dict( + type='interpolation_conv', + mode='nearest', + kernel_size=3, + norm_cfg=norm_cfg, + order=('act', 'conv', 'norm'), + inplace=False), + across_up_trans=None, + across_skip_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + output_trans=dict( + type='last_conv', + kernel_size=3, + order=('act', 'conv', 'norm'), + inplace=False), + norm_cfg=norm_cfg, + skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])) diff --git a/mmdetection/configs/fpg/mask-rcnn_r50_fpn_crop640-50e_coco.py b/mmdetection/configs/fpg/mask-rcnn_r50_fpn_crop640-50e_coco.py new file mode 100644 index 0000000..08ca5b6 --- /dev/null +++ b/mmdetection/configs/fpg/mask-rcnn_r50_fpn_crop640-50e_coco.py @@ -0,0 +1,79 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +norm_cfg = dict(type='BN', requires_grad=True) +image_size = (640, 640) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +model = dict( + data_preprocessor=dict(pad_size_divisor=64, batch_augments=batch_augments), + backbone=dict(norm_cfg=norm_cfg, norm_eval=False), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + norm_cfg=norm_cfg, + num_outs=5), + roi_head=dict( + bbox_head=dict(norm_cfg=norm_cfg), mask_head=dict(norm_cfg=norm_cfg))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + allow_negative_crop=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + 
+test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=image_size, keep_ratio=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# learning policy +max_epochs = 50 +train_cfg = dict(max_epochs=max_epochs, val_interval=2) +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[30, 40], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001), + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True), + clip_grad=None) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/fpg/metafile.yml b/mmdetection/configs/fpg/metafile.yml new file mode 100644 index 0000000..7d7634a --- /dev/null +++ b/mmdetection/configs/fpg/metafile.yml @@ -0,0 +1,104 @@ +Collections: + - Name: Feature Pyramid Grids + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Feature Pyramid Grids + Paper: + URL: https://arxiv.org/abs/2004.03580 + Title: 'Feature Pyramid Grids' + README: configs/fpg/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.10.0/mmdet/models/necks/fpg.py#L101 + Version: v2.10.0 + +Models: + - Name: faster-rcnn_r50_fpg_crop640-50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/faster-rcnn_r50_fpg_crop640-50e_coco.py + Metadata: + Training Memory (GB): 20.0 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg_crop640_50e_coco/faster_rcnn_r50_fpg_crop640_50e_coco_20220311_011856-74109f42.pth + + - Name: faster-rcnn_r50_fpg-chn128_crop640-50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/faster-rcnn_r50_fpg-chn128_crop640-50e_coco.py + Metadata: + Training Memory (GB): 11.9 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/faster_rcnn_r50_fpg-chn128_crop640_50e_coco/faster_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011857-9376aa9d.pth + + - Name: mask-rcnn_r50_fpg_crop640-50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/mask-rcnn_r50_fpg_crop640-50e_coco.py + Metadata: + Training Memory (GB): 23.2 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg_crop640_50e_coco/mask_rcnn_r50_fpg_crop640_50e_coco_20220311_011857-233b8334.pth + + - Name: mask-rcnn_r50_fpg-chn128_crop640-50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/mask-rcnn_r50_fpg-chn128_crop640-50e_coco.py + Metadata: + Training Memory (GB): 15.3 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: 
COCO + Metrics: + box AP: 41.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/mask_rcnn_r50_fpg-chn128_crop640_50e_coco/mask_rcnn_r50_fpg-chn128_crop640_50e_coco_20220311_011859-043c9b4e.pth + + - Name: retinanet_r50_fpg_crop640_50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py + Metadata: + Training Memory (GB): 20.8 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg_crop640_50e_coco/retinanet_r50_fpg_crop640_50e_coco_20220311_110809-b0bcf5f4.pth + + - Name: retinanet_r50_fpg-chn128_crop640_50e_coco + In Collection: Feature Pyramid Grids + Config: configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py + Metadata: + Training Memory (GB): 19.9 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco/retinanet_r50_fpg-chn128_crop640_50e_coco_20220313_104829-ee99a686.pth diff --git a/mmdetection/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py b/mmdetection/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py new file mode 100644 index 0000000..9a6cf7e --- /dev/null +++ b/mmdetection/configs/fpg/retinanet_r50_fpg-chn128_crop640_50e_coco.py @@ -0,0 +1,5 @@ +_base_ = 'retinanet_r50_fpg_crop640_50e_coco.py' + +model = dict( + neck=dict(out_channels=128, inter_channels=128), + bbox_head=dict(in_channels=128)) diff --git a/mmdetection/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py b/mmdetection/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py new file mode 100644 index 0000000..e2aac28 --- /dev/null +++ b/mmdetection/configs/fpg/retinanet_r50_fpg_crop640_50e_coco.py @@ -0,0 +1,53 @@ +_base_ = '../nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py' + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + neck=dict( + _delete_=True, + type='FPG', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + inter_channels=256, + num_outs=5, + add_extra_convs=True, + start_level=1, + stack_times=9, + paths=['bu'] * 9, + same_down_trans=None, + same_up_trans=dict( + type='conv', + kernel_size=3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_lateral_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + across_down_trans=dict( + type='interpolation_conv', + mode='nearest', + kernel_size=3, + norm_cfg=norm_cfg, + order=('act', 'conv', 'norm'), + inplace=False), + across_up_trans=None, + across_skip_trans=dict( + type='conv', + kernel_size=1, + norm_cfg=norm_cfg, + inplace=False, + order=('act', 'conv', 'norm')), + output_trans=dict( + type='last_conv', + kernel_size=3, + order=('act', 'conv', 'norm'), + inplace=False), + norm_cfg=norm_cfg, + skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])) + +train_cfg = dict(val_interval=2) diff --git a/mmdetection/configs/free_anchor/README.md b/mmdetection/configs/free_anchor/README.md new file mode 100644 index 0000000..03dc828 --- /dev/null +++ b/mmdetection/configs/free_anchor/README.md @@ -0,0 +1,37 @@ +# FreeAnchor + +> [FreeAnchor: Learning to Match Anchors for Visual Object Detection](https://arxiv.org/abs/1909.02466) + + + +## Abstract + +Modern CNN-based object detectors assign anchors for 
ground-truth objects under the restriction of object-anchor Intersection-over-Union (IoU). In this study, we propose a learning-to-match approach to break the IoU restriction, allowing objects to match anchors in a flexible manner. Our approach, referred to as FreeAnchor, updates hand-crafted anchor assignment to "free" anchor matching by formulating detector training as a maximum likelihood estimation (MLE) procedure. FreeAnchor aims to learn features that best explain a class of objects in terms of both classification and localization. FreeAnchor is implemented by optimizing a detection-customized likelihood and can be fused with CNN-based detectors in a plug-and-play manner. Experiments on COCO demonstrate that FreeAnchor consistently outperforms its counterparts by significant margins. + +
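+
+Schematically, and only as a reading aid for the abstract above (the paper's actual loss replaces the max with a Mean-max function and adds focal-loss weighting), the detection-customized likelihood couples a recall term over objects with a precision term over unmatched anchors:
+
+```latex
+P(\theta) = \prod_{i \in \text{objects}} \max_{a_j \in A_i}
+            \Big( P^{\text{cls}}_{ij}(\theta)\, P^{\text{loc}}_{ij}(\theta) \Big)
+            \times \prod_{j \in \text{unmatched}} \Big( 1 - P^{\text{fp}}_{j}(\theta) \Big)
+```
+
+Here `A_i` is the candidate anchor bag of object `i` and `P^fp_j` is the probability that an unmatched anchor produces a false positive; training minimizes the negative log of this likelihood.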
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------: | :-----: | :-----: | :------: | :------------: | :----: | :----------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | 1x | 4.9 | 18.4 | 38.7 | [config](./freeanchor_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130_095625.log.json) | +| R-101 | pytorch | 1x | 6.8 | 14.9 | 40.3 | [config](./freeanchor_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130-358324e6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130_100723.log.json) | +| X-101-32x4d | pytorch | 1x | 8.1 | 11.1 | 41.9 | [config](./freeanchor_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130-d4846968.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130_095627.log.json) | + +**Notes:** + +- We use 8 GPUs with 2 images/GPU. +- For more settings and models, please refer to the [official repo](https://github.com/zhangxiaosong18/FreeAnchor). 
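+
+As a quick way to try these checkpoints, the sketch below uses MMDetection's Python API (`mmdet.apis`); the config path matches this folder, while the checkpoint filename and `demo.jpg` are placeholders for whatever you downloaded from the table above.
+
+```python
+from mmdet.apis import inference_detector, init_detector
+
+# Config shipped in this folder; checkpoint downloaded from the table above
+# (the filename is illustrative -- keep whatever name the download gives you).
+config_file = 'configs/free_anchor/freeanchor_r50_fpn_1x_coco.py'
+checkpoint_file = 'retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth'
+
+model = init_detector(config_file, checkpoint_file, device='cuda:0')  # or 'cpu'
+result = inference_detector(model, 'demo.jpg')  # any test image path
+print(result)
+```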
+ +## Citation + +```latex +@inproceedings{zhang2019freeanchor, + title = {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection}, + author = {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang}, + booktitle = {Neural Information Processing Systems}, + year = {2019} +} +``` diff --git a/mmdetection/configs/free_anchor/freeanchor_r101_fpn_1x_coco.py b/mmdetection/configs/free_anchor/freeanchor_r101_fpn_1x_coco.py new file mode 100644 index 0000000..dc323d9 --- /dev/null +++ b/mmdetection/configs/free_anchor/freeanchor_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './freeanchor_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/free_anchor/freeanchor_r50_fpn_1x_coco.py b/mmdetection/configs/free_anchor/freeanchor_r50_fpn_1x_coco.py new file mode 100644 index 0000000..13f64d1 --- /dev/null +++ b/mmdetection/configs/free_anchor/freeanchor_r50_fpn_1x_coco.py @@ -0,0 +1,22 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='FreeAnchorRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.75))) + +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..8e448bc --- /dev/null +++ b/mmdetection/configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = './freeanchor_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/free_anchor/metafile.yml b/mmdetection/configs/free_anchor/metafile.yml new file mode 100644 index 0000000..cff19db --- /dev/null +++ b/mmdetection/configs/free_anchor/metafile.yml @@ -0,0 +1,79 @@ +Collections: + - Name: FreeAnchor + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FreeAnchor + - ResNet + Paper: + URL: https://arxiv.org/abs/1909.02466 + Title: 'FreeAnchor: Learning to Match Anchors for Visual Object Detection' + README: configs/free_anchor/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/dense_heads/free_anchor_retina_head.py#L10 + Version: v2.0.0 + +Models: + - Name: freeanchor_r50_fpn_1x_coco + In Collection: FreeAnchor + Config: configs/free_anchor/freeanchor_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.9 + inference time (ms/im): + - value: 54.35 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.7 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r50_fpn_1x_coco/retinanet_free_anchor_r50_fpn_1x_coco_20200130-0f67375f.pth + + - Name: freeanchor_r101_fpn_1x_coco + In Collection: FreeAnchor + Config: configs/free_anchor/freeanchor_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.8 + inference time (ms/im): + - value: 67.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_r101_fpn_1x_coco/retinanet_free_anchor_r101_fpn_1x_coco_20200130-358324e6.pth + + - Name: freeanchor_x101-32x4d_fpn_1x_coco + In Collection: FreeAnchor + Config: configs/free_anchor/freeanchor_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.1 + inference time (ms/im): + - value: 90.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/free_anchor/retinanet_free_anchor_x101_32x4d_fpn_1x_coco/retinanet_free_anchor_x101_32x4d_fpn_1x_coco_20200130-d4846968.pth diff --git a/mmdetection/configs/fsaf/README.md b/mmdetection/configs/fsaf/README.md new file mode 100644 index 0000000..46f6057 --- /dev/null +++ b/mmdetection/configs/fsaf/README.md @@ -0,0 +1,57 @@ +# FSAF + +> [Feature Selective Anchor-Free Module for Single-Shot Object Detection](https://arxiv.org/abs/1903.00621) + + + +## Abstract + +We motivate and present feature selective anchor-free (FSAF) module, a simple and effective building block for single-shot object detectors. It can be plugged into single-shot detectors with feature pyramid structure. The FSAF module addresses two limitations brought up by the conventional anchor-based detection: 1) heuristic-guided feature selection; 2) overlap-based anchor sampling. The general concept of the FSAF module is online feature selection applied to the training of multi-level anchor-free branches. Specifically, an anchor-free branch is attached to each level of the feature pyramid, allowing box encoding and decoding in the anchor-free manner at an arbitrary level. During training, we dynamically assign each instance to the most suitable feature level. At the time of inference, the FSAF module can work jointly with anchor-based branches by outputting predictions in parallel. We instantiate this concept with simple implementations of anchor-free branches and online feature selection strategy. Experimental results on the COCO detection track show that our FSAF module performs better than anchor-based counterparts while being faster. When working jointly with anchor-based branches, the FSAF module robustly improves the baseline RetinaNet by a large margin under various settings, while introducing nearly free inference overhead. And the resulting best model can achieve a state-of-the-art 44.6% mAP, outperforming all existing single-shot detectors on COCO. + +
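+
+The core of the module is the online feature selection step described above. The toy sketch below shows only that selection rule; it is not mmdetection's implementation, which realizes FSAF as a RetinaNet-style head with a single anchor per location, as the Introduction and the config in this folder explain.
+
+```python
+import torch
+
+def select_feature_levels(instance_losses: torch.Tensor) -> torch.Tensor:
+    """Assign each ground-truth instance to the FPN level where its
+    anchor-free loss (classification + regression) would be smallest.
+    `instance_losses` has shape (num_instances, num_levels)."""
+    return instance_losses.argmin(dim=1)
+
+# Made-up losses for 3 instances over 5 pyramid levels (P3-P7).
+losses = torch.tensor([[1.2, 0.8, 0.9, 1.5, 2.0],
+                       [2.1, 1.7, 1.1, 0.9, 1.3],
+                       [0.7, 1.0, 1.4, 1.9, 2.4]])
+print(select_feature_levels(losses))  # tensor([1, 3, 0])
+```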
    + +## Introduction + +FSAF is an anchor-free method published in CVPR2019 ([https://arxiv.org/pdf/1903.00621.pdf](https://arxiv.org/pdf/1903.00621.pdf)). +Actually it is equivalent to the anchor-based method with only one anchor at each feature map position in each FPN level. +And this is how we implemented it. +Only the anchor-free branch is released for its better compatibility with the current framework and less computational budget. + +In the original paper, feature maps within the central 0.2-0.5 area of a gt box are tagged as ignored. However, +it is empirically found that a hard threshold (0.2-0.2) gives a further gain on the performance. (see the table below) + +## Results and Models + +### Results on R50/R101/X101-FPN + +| Backbone | ignore range | ms-train | Lr schd | Train Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Config | Download | +| :------: | :----------: | :------: | :-----: | :------------: | :-----------------: | :------------: | :---------: | :----------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | 0.2-0.5 | N | 1x | 3.15 | 0.43 | 12.3 | 36.0 (35.9) | | [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco_20200715-b555b0e0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco/fsaf_pscale0.2_nscale0.5_r50_fpn_1x_coco_20200715_094657.log.json) | +| R-50 | 0.2-0.2 | N | 1x | 3.15 | 0.43 | 13.0 | 37.4 | [config](./fsaf_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco_20200428_072327.log.json) | +| R-101 | 0.2-0.2 | N | 1x | 5.08 | 0.58 | 10.8 | 39.3 (37.9) | [config](./fsaf_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco-9e71098f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco_20200428_160348.log.json) | +| X-101 | 0.2-0.2 | N | 1x | 9.38 | 1.23 | 5.6 | 42.4 (41.0) | [config](./fsaf_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco-e3f6e6fd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco_20200428_160424.log.json) | + +**Notes:** + +- *1x means the model is trained for 12 epochs.* +- *AP values in the brackets represent those reported in the original paper.* +- *All results are obtained with a single model and single-scale test.* +- *X-101 backbone represents ResNext-101-64x4d.* +- *All pretrained backbones use pytorch style.* +- *All models are trained on 8 Titan-XP gpus and tested on a single gpu.* + +## Citation + +BibTeX reference is as follows. 
+ +```latex +@inproceedings{zhu2019feature, + title={Feature Selective Anchor-Free Module for Single-Shot Object Detection}, + author={Zhu, Chenchen and He, Yihui and Savvides, Marios}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={840--849}, + year={2019} +} +``` diff --git a/mmdetection/configs/fsaf/fsaf_r101_fpn_1x_coco.py b/mmdetection/configs/fsaf/fsaf_r101_fpn_1x_coco.py new file mode 100644 index 0000000..12b49fe --- /dev/null +++ b/mmdetection/configs/fsaf/fsaf_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './fsaf_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/fsaf/fsaf_r50_fpn_1x_coco.py b/mmdetection/configs/fsaf/fsaf_r50_fpn_1x_coco.py new file mode 100644 index 0000000..e7165cd --- /dev/null +++ b/mmdetection/configs/fsaf/fsaf_r50_fpn_1x_coco.py @@ -0,0 +1,47 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +# model settings +model = dict( + type='FSAF', + bbox_head=dict( + type='FSAFHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + reg_decoded_bbox=True, + # Only anchor-free branch is implemented. The anchor generator only + # generates 1 anchor at each feature point, as a substitute of the + # grid of features. + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=1, + scales_per_octave=1, + ratios=[1.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict(_delete_=True, type='TBLRBBoxCoder', normalizer=4.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0, + reduction='none'), + loss_bbox=dict( + _delete_=True, + type='IoULoss', + eps=1e-6, + loss_weight=1.0, + reduction='none')), + # training and testing settings + train_cfg=dict( + assigner=dict( + _delete_=True, + type='CenterRegionAssigner', + pos_scale=0.2, + neg_scale=0.2, + min_pos_iof=0.01), + allowed_border=-1, + pos_weight=-1, + debug=False)) + +optim_wrapper = dict(clip_grad=dict(max_norm=10, norm_type=2)) diff --git a/mmdetection/configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..89c0c63 --- /dev/null +++ b/mmdetection/configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './fsaf_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/fsaf/metafile.yml b/mmdetection/configs/fsaf/metafile.yml new file mode 100644 index 0000000..daaad0d --- /dev/null +++ b/mmdetection/configs/fsaf/metafile.yml @@ -0,0 +1,80 @@ +Collections: + - Name: FSAF + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x Titan-XP GPUs + Architecture: + - FPN + - FSAF + - ResNet + Paper: + URL: https://arxiv.org/abs/1903.00621 + Title: 'Feature Selective Anchor-Free Module for Single-Shot Object Detection' + README: configs/fsaf/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/detectors/fsaf.py#L6 + Version: v2.1.0 + +Models: + - Name: fsaf_r50_fpn_1x_coco + In Collection: FSAF + Config: configs/fsaf/fsaf_r50_fpn_1x_coco.py + Metadata: + Training 
Memory (GB): 3.15 + inference time (ms/im): + - value: 76.92 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth + + - Name: fsaf_r101_fpn_1x_coco + In Collection: FSAF + Config: configs/fsaf/fsaf_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.08 + inference time (ms/im): + - value: 92.59 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r101_fpn_1x_coco/fsaf_r101_fpn_1x_coco-9e71098f.pth + + - Name: fsaf_x101-64x4d_fpn_1x_coco + In Collection: FSAF + Config: configs/fsaf/fsaf_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.38 + inference time (ms/im): + - value: 178.57 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_x101_64x4d_fpn_1x_coco/fsaf_x101_64x4d_fpn_1x_coco-e3f6e6fd.pth diff --git a/mmdetection/configs/gcnet/README.md b/mmdetection/configs/gcnet/README.md new file mode 100644 index 0000000..1ba6f6f --- /dev/null +++ b/mmdetection/configs/gcnet/README.md @@ -0,0 +1,69 @@ +# GCNet + +> [GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond](https://arxiv.org/abs/1904.11492) + + + +## Abstract + +The Non-Local Network (NLNet) presents a pioneering approach for capturing long-range dependencies, via aggregating query-specific global context to each query position. However, through a rigorous empirical analysis, we have found that the global contexts modeled by non-local network are almost the same for different query positions within an image. In this paper, we take advantage of this finding to create a simplified network based on a query-independent formulation, which maintains the accuracy of NLNet but with significantly less computation. We further observe that this simplified design shares similar structure with Squeeze-Excitation Network (SENet). Hence we unify them into a three-step general framework for global context modeling. Within the general framework, we design a better instantiation, called the global context (GC) block, which is lightweight and can effectively model the global context. The lightweight property allows us to apply it for multiple layers in a backbone network to construct a global context network (GCNet), which generally outperforms both simplified NLNet and SENet on major benchmarks for various recognition tasks. + +
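+
+Below is a minimal PyTorch sketch of the three-step GC block described above (context modeling, bottleneck transform, additive fusion). The configs in this folder use the real implementation via mmcv's `ContextBlock` plugin, so this sketch is illustrative only.
+
+```python
+import torch
+import torch.nn as nn
+
+class GCBlockSketch(nn.Module):
+    """Query-independent global context: one attention map shared by all
+    positions, a bottleneck transform, and additive fusion with the input."""
+
+    def __init__(self, channels: int, ratio: float = 1. / 16):
+        super().__init__()
+        hidden = max(1, int(channels * ratio))
+        self.attn = nn.Conv2d(channels, 1, kernel_size=1)   # context modeling
+        self.transform = nn.Sequential(                      # bottleneck transform
+            nn.Conv2d(channels, hidden, kernel_size=1),
+            nn.LayerNorm([hidden, 1, 1]),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(hidden, channels, kernel_size=1))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        n, c, h, w = x.shape
+        weights = self.attn(x).view(n, 1, h * w).softmax(dim=-1)            # (N, 1, HW)
+        context = torch.bmm(x.view(n, c, h * w), weights.transpose(1, 2))   # (N, C, 1)
+        return x + self.transform(context.view(n, c, 1, 1))                 # fusion
+
+print(GCBlockSketch(256)(torch.randn(2, 256, 32, 32)).shape)  # (2, 256, 32, 32)
+```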
    + +## Introduction + +By [Yue Cao](http://yue-cao.me), [Jiarui Xu](http://jerryxu.net), [Stephen Lin](https://scholar.google.com/citations?user=c3PYmxUAAAAJ&hl=en), Fangyun Wei, [Han Hu](https://sites.google.com/site/hanhushomepage/). + +We provide config files to reproduce the results in the paper for +["GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond"](https://arxiv.org/abs/1904.11492) on COCO object detection. + +**GCNet** is initially described in [arxiv](https://arxiv.org/abs/1904.11492). Via absorbing advantages of Non-Local Networks (NLNet) and Squeeze-Excitation Networks (SENet), GCNet provides a simple, fast and effective approach for global context modeling, which generally outperforms both NLNet and SENet on major benchmarks for various recognition tasks. + +## Results and Models + +The results on COCO 2017val are shown in the below table. + +| Backbone | Model | Context | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------: | :---: | :------------: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | Mask | GC(c3-c5, r16) | 1x | 5.0 | | 39.7 | 35.9 | [config](./mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915-187da160.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915.log.json) | +| R-50-FPN | Mask | GC(c3-c5, r4) | 1x | 5.1 | 15.0 | 39.9 | 36.0 | [config](./mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204-17235656.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204_024626.log.json) | +| R-101-FPN | Mask | GC(c3-c5, r16) | 1x | 7.6 | 11.4 | 41.3 | 37.2 | [config](./mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205-e58ae947.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205_192835.log.json) | +| R-101-FPN | Mask | GC(c3-c5, r4) | 1x | 7.8 | 11.6 | 42.2 | 37.8 | [config](./mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206-af22dc9d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206_112128.log.json) | + +| Backbone | Model | Context | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------: | :--------------: | :------------: | :-----: | :------: | :------------: | 
:----: | :-----: | :--------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | Mask | - | 1x | 4.4 | 16.6 | 38.4 | 34.6 | [config](./mask-rcnn_r50-syncbn_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202-bb3eb55c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202_214122.log.json) | +| R-50-FPN | Mask | GC(c3-c5, r16) | 1x | 5.0 | 15.5 | 40.4 | 36.2 | [config](./mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202_174907.log.json) | +| R-50-FPN | Mask | GC(c3-c5, r4) | 1x | 5.1 | 15.1 | 40.7 | 36.5 | [config](./mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202_085547.log.json) | +| R-101-FPN | Mask | - | 1x | 6.4 | 13.3 | 40.5 | 36.3 | [config](./mask-rcnn_r101-syncbn_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210-81658c8a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210_220422.log.json) | +| R-101-FPN | Mask | GC(c3-c5, r16) | 1x | 7.6 | 12.0 | 42.2 | 37.8 | [config](./mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207-945e77ca.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207_015330.log.json) | +| R-101-FPN | Mask | GC(c3-c5, r4) | 1x | 7.8 | 11.8 | 42.2 | 37.8 | [config](./mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206_142508.log.json) | +| X-101-FPN | Mask | - | 1x | 7.6 | 11.3 | 42.4 | 37.7 | [config](./mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211-7584841c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211_054326.log.json) | +| X-101-FPN | Mask | GC(c3-c5, r16) | 1x | 8.8 | 9.8 | 43.5 | 38.6 | [config](./mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-cbed3d2c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211_164715.log.json) | +| X-101-FPN | Mask | GC(c3-c5, r4) | 1x | 9.0 | 9.7 | 43.9 | 39.0 | [config](./mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212-68164964.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212_070942.log.json) | +| X-101-FPN | Cascade Mask | - | 1x | 9.2 | 8.4 | 44.7 | 38.6 | [config](./cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310-d5ad2a5e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310_115217.log.json) | +| X-101-FPN | Cascade Mask | GC(c3-c5, r16) | 1x | 10.3 | 7.7 | 46.2 | 39.7 | [config](./cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-10bf2463.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211_184154.log.json) | +| X-101-FPN | Cascade Mask | GC(c3-c5, r4) | 1x | 10.6 | | 46.4 | 40.1 | [config](./cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653-ed035291.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653.log.json) | +| X-101-FPN | DCN Cascade Mask | - | 1x | | | 47.5 | 40.9 | [config](./cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019-abbc39ea.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019.log.json) | +| X-101-FPN | DCN Cascade Mask | GC(c3-c5, r16) | 1x | | | 48.0 | 41.3 | [config](./cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648-44aa598a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648.log.json) | +| X-101-FPN | DCN Cascade Mask | GC(c3-c5, r4) | 1x | | | 47.9 | 41.1 | [config](./cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851-720338ec.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851.log.json) | + +**Notes:** + +- The `SyncBN` is added in the backbone for all models in **Table 2**. +- `GC` denotes Global Context (GC) block is inserted after 1x1 conv of backbone. +- `DCN` denotes replace 3x3 conv with 3x3 Deformable Convolution in `c3-c5` stages of backbone. +- `r4` and `r16` denote ratio 4 and ratio 16 in GC block respectively. + +## Citation + +```latex +@article{cao2019GCNet, + title={GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond}, + author={Cao, Yue and Xu, Jiarui and Lin, Stephen and Wei, Fangyun and Hu, Han}, + journal={arXiv preprint arXiv:1904.11492}, + year={2019} +} +``` diff --git a/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..6cf605b --- /dev/null +++ b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..95fc687 --- /dev/null +++ b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..9b77dc9 --- /dev/null +++ b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../dcn/cascade-mask-rcnn_x101-32x4d-dconv-c3-c5_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..8f97972 --- /dev/null +++ b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..8404cfd --- /dev/null +++ b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py new file mode 100644 index 0000000..87667de --- /dev/null +++ b/mmdetection/configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_x101-32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..447e2c6 --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..9c723a6 --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..6f9d03d --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..d07cb0d --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py new file mode 100644 index 0000000..957bdf5 --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..c9ec5ac --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..42474d5 --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict(plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..ac19280 --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..ae29f0c --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. 
/ 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py new file mode 100644 index 0000000..f8ef27b --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..1a2e2c9 --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 16), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..65d3f9a --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = '../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + plugins=[ + dict( + cfg=dict(type='ContextBlock', ratio=1. / 4), + stages=(False, True, True, True), + position='after_conv3') + ])) diff --git a/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py b/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py new file mode 100644 index 0000000..b5343a6 --- /dev/null +++ b/mmdetection/configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py @@ -0,0 +1,4 @@ +_base_ = '../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False)) diff --git a/mmdetection/configs/gcnet/metafile.yml b/mmdetection/configs/gcnet/metafile.yml new file mode 100644 index 0000000..075a94c --- /dev/null +++ b/mmdetection/configs/gcnet/metafile.yml @@ -0,0 +1,440 @@ +Collections: + - Name: GCNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Global Context Block + - FPN + - RPN + - ResNet + - ResNeXt + Paper: + URL: https://arxiv.org/abs/1904.11492 + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + README: configs/gcnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/ops/context_block.py#L13 + Version: v2.0.0 + +Models: + - Name: mask-rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r50-gcb-r16-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r16_gcb_c3-c5_1x_coco_20200515_211915-187da160.pth + + - Name: 
mask-rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r50-gcb-r4-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.1 + inference time (ms/im): + - value: 66.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_r4_gcb_c3-c5_1x_coco_20200204-17235656.pth + + - Name: mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r101-gcb-r16-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 87.72 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r16_gcb_c3-c5_1x_coco_20200205-e58ae947.pth + + - Name: mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r101-gcb-r4-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + inference time (ms/im): + - value: 86.21 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_r4_gcb_c3-c5_1x_coco_20200206-af22dc9d.pth + + - Name: mask-rcnn_r50_fpn_syncbn-backbone_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r50-syncbn_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 60.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_1x_coco_20200202-bb3eb55c.pth + + - Name: mask-rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r50-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + inference time (ms/im): + - value: 64.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200202-587b99aa.pth + + - Name: mask-rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.1 + inference time (ms/im): + - value: 66.23 + hardware: 
V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth + + - Name: mask-rcnn_r101-syncbn_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r101-syncbn_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 75.19 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_1x_coco_20200210-81658c8a.pth + + - Name: mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r101-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 83.33 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200207-945e77ca.pth + + - Name: mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + inference time (ms/im): + - value: 84.75 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth + + - Name: mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200211-7584841c.pth + + - Name: mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r16-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.8 + inference time (ms/im): + - value: 102.04 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + 
Dataset: COCO + Metrics: + box AP: 43.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-cbed3d2c.pth + + - Name: mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/mask-rcnn_x101-32x4d-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.0 + inference time (ms/im): + - value: 103.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200212-68164964.pth + + - Name: cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.2 + inference time (ms/im): + - value: 119.05 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_1x_coco_20200310-d5ad2a5e.pth + + - Name: cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r16-gcb-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 129.87 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r16_gcb_c3-c5_1x_coco_20200211-10bf2463.pth + + - Name: cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-r4-gcb-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200703_180653-ed035291.pth + + - Name: cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.9 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_1x_coco_20210615_211019-abbc39ea.pth + + - Name: cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r16-gcb-c3-c5_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r16_gcb_c3-c5_1x_coco_20210615_215648-44aa598a.pth + + - Name: cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco + In Collection: GCNet + Config: configs/gcnet/cascade-mask-rcnn_x101-32x4d-syncbn-dconv-c3-c5-r4-gcb-c3-c5_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco/cascade_mask_rcnn_x101_32x4d_fpn_syncbn-backbone_dconv_c3-c5_r4_gcb_c3-c5_1x_coco_20210615_161851-720338ec.pth diff --git a/mmdetection/configs/gfl/README.md b/mmdetection/configs/gfl/README.md new file mode 100644 index 0000000..123f303 --- /dev/null +++ b/mmdetection/configs/gfl/README.md @@ -0,0 +1,42 @@ +# GFL + +> [Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection](https://arxiv.org/abs/2006.04388) + + + +## Abstract + +One-stage detector basically formulates object detection as dense classification and localization. The classification is usually optimized by Focal Loss and the box location is commonly learned under Dirac delta distribution. A recent trend for one-stage detectors is to introduce an individual prediction branch to estimate the quality of localization, where the predicted quality facilitates the classification to improve detection performance. This paper delves into the representations of the above three fundamental elements: quality estimation, classification and localization. Two problems are discovered in existing practices, including (1) the inconsistent usage of the quality estimation and classification between training and inference and (2) the inflexible Dirac delta distribution for localization when there is ambiguity and uncertainty in complex scenes. To address the problems, we design new representations for these elements. Specifically, we merge the quality estimation into the class prediction vector to form a joint representation of localization quality and classification, and use a vector to represent arbitrary distribution of box locations. The improved representations eliminate the inconsistency risk and accurately depict the flexible distribution in real data, but contain continuous labels, which is beyond the scope of Focal Loss. We then propose Generalized Focal Loss (GFL) that generalizes Focal Loss from its discrete form to the continuous version for successful optimization. 
On COCO test-dev, GFL achieves 45.0% AP using ResNet-101 backbone, surpassing state-of-the-art SAPD (43.5%) and ATSS (43.6%) with higher or comparable inference speed, under the same backbone and training settings. Notably, our best model can achieve a single-model single-scale AP of 48.2%, at 10 FPS on a single 2080Ti GPU. + +
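To make the joint quality-classification representation above concrete, here is a minimal PyTorch sketch of a Quality Focal Loss-style term for positive samples, assuming sigmoid class scores and an IoU-like soft target in [0, 1]. The function and tensor names are illustrative only; this is not the `QualityFocalLoss` implementation that the configs in this patch register with `use_sigmoid=True` and `beta=2.0`.

```python
# Minimal Quality Focal Loss-style sketch (illustrative only, not the mmdet implementation).
# Assumes `pred` are raw class logits and `quality` is an IoU-like soft label in [0, 1].
import torch
import torch.nn.functional as F


def quality_focal_loss(pred: torch.Tensor,
                       quality: torch.Tensor,
                       beta: float = 2.0) -> torch.Tensor:
    """Focal loss generalized to continuous quality targets.

    pred:    (N,) raw logits for the matched class of each positive sample.
    quality: (N,) soft targets, e.g. IoU between predicted and GT box.
    """
    prob = pred.sigmoid()
    # Binary cross-entropy against the soft (continuous) target ...
    ce = F.binary_cross_entropy_with_logits(pred, quality, reduction='none')
    # ... modulated by how far the prediction is from the target, as in focal loss.
    modulating = (quality - prob).abs().pow(beta)
    return (modulating * ce).mean()


if __name__ == '__main__':
    logits = torch.randn(8)
    iou_targets = torch.rand(8)  # stand-in for predicted-vs-GT IoU
    print(quality_focal_loss(logits, iou_targets))
```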
    + +
    + +## Results and Models + +| Backbone | Style | Lr schd | Multi-scale Training | Inf time (fps) | box AP | Config | Download | +| :---------------: | :-----: | :-----: | :------------------: | :------------: | :----: | :------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | 1x | No | 19.5 | 40.2 | [config](./gfl_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244.log.json) | +| R-50 | pytorch | 2x | Yes | 19.5 | 42.9 | [config](./gfl_r50_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802-37bb1edc.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802.log.json) | +| R-101 | pytorch | 2x | Yes | 14.7 | 44.7 | [config](./gfl_r101_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126.log.json) | +| R-101-dcnv2 | pytorch | 2x | Yes | 12.9 | 47.1 | [config](./gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002.log.json) | +| X-101-32x4d | pytorch | 2x | Yes | 12.1 | 45.9 | [config](./gfl_x101-32x4d_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002-50c1ffdb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002.log.json) | +| X-101-32x4d-dcnv2 | pytorch | 2x | Yes | 10.7 | 48.1 | [config](./gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002-14a2bf25.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002.log.json) | + +\[1\] *1x and 2x mean the model is trained for 90K and 180K iterations, respectively.* \ +\[2\] *All results are obtained with a single model and without any test time data augmentation such as multi-scale, flipping and etc..* \ +\[3\] *`dcnv2` denotes deformable convolutional networks v2.* \ +\[4\] *FPS is tested with a single GeForce RTX 2080Ti GPU, using a batch size of 1.* + +## Citation + +We provide config files to reproduce the object 
detection results in the paper [Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection](https://arxiv.org/abs/2006.04388) + +```latex +@article{li2020generalized, + title={Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection}, + author={Li, Xiang and Wang, Wenhai and Wu, Lijun and Chen, Shuo and Hu, Xiaolin and Li, Jun and Tang, Jinhui and Yang, Jian}, + journal={arXiv preprint arXiv:2006.04388}, + year={2020} +} +``` diff --git a/mmdetection/configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py b/mmdetection/configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py new file mode 100644 index 0000000..7f74893 --- /dev/null +++ b/mmdetection/configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './gfl_r50_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/gfl/gfl_r101_fpn_ms-2x_coco.py b/mmdetection/configs/gfl/gfl_r101_fpn_ms-2x_coco.py new file mode 100644 index 0000000..10135f1 --- /dev/null +++ b/mmdetection/configs/gfl/gfl_r101_fpn_ms-2x_coco.py @@ -0,0 +1,13 @@ +_base_ = './gfl_r50_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/gfl/gfl_r50_fpn_1x_coco.py b/mmdetection/configs/gfl/gfl_r50_fpn_1x_coco.py new file mode 100644 index 0000000..9023825 --- /dev/null +++ b/mmdetection/configs/gfl/gfl_r50_fpn_1x_coco.py @@ -0,0 +1,66 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='GFL', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='GFLHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + reg_max=16, + loss_bbox=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + 
max_per_img=100)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/gfl/gfl_r50_fpn_ms-2x_coco.py b/mmdetection/configs/gfl/gfl_r50_fpn_ms-2x_coco.py new file mode 100644 index 0000000..22770eb --- /dev/null +++ b/mmdetection/configs/gfl/gfl_r50_fpn_ms-2x_coco.py @@ -0,0 +1,28 @@ +_base_ = './gfl_r50_fpn_1x_coco.py' +max_epochs = 24 + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) + +# multi-scale training +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 480), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py b/mmdetection/configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py new file mode 100644 index 0000000..6aa98ee --- /dev/null +++ b/mmdetection/configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py @@ -0,0 +1,18 @@ +_base_ = './gfl_r50_fpn_ms-2x_coco.py' +model = dict( + type='GFL', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py b/mmdetection/configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py new file mode 100644 index 0000000..ec629b1 --- /dev/null +++ b/mmdetection/configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './gfl_r50_fpn_ms-2x_coco.py' +model = dict( + type='GFL', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/gfl/metafile.yml b/mmdetection/configs/gfl/metafile.yml new file mode 100644 index 0000000..183fc14 --- /dev/null +++ b/mmdetection/configs/gfl/metafile.yml @@ -0,0 +1,134 @@ +Collections: + - Name: Generalized Focal Loss + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Generalized Focal Loss + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2006.04388 + Title: 'Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection' + README: configs/gfl/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/detectors/gfl.py#L6 + Version: v2.2.0 + +Models: + - Name: gfl_r50_fpn_1x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_r50_fpn_1x_coco.py + Metadata: + inference time (ms/im): + - value: 51.28 + hardware: V100 + backend: PyTorch + batch size: 1 + 
mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_1x_coco/gfl_r50_fpn_1x_coco_20200629_121244-25944287.pth + + - Name: gfl_r50_fpn_ms-2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_r50_fpn_ms-2x_coco.py + Metadata: + inference time (ms/im): + - value: 51.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r50_fpn_mstrain_2x_coco/gfl_r50_fpn_mstrain_2x_coco_20200629_213802-37bb1edc.pth + + - Name: gfl_r101_fpn_ms-2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_r101_fpn_ms-2x_coco.py + Metadata: + inference time (ms/im): + - value: 68.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth + + - Name: gfl_r101-dconv-c3-c5_fpn_ms-2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py + Metadata: + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth + + - Name: gfl_x101-32x4d_fpn_ms-2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_x101-32x4d_fpn_ms-2x_coco.py + Metadata: + inference time (ms/im): + - value: 82.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_mstrain_2x_coco/gfl_x101_32x4d_fpn_mstrain_2x_coco_20200630_102002-50c1ffdb.pth + + - Name: gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco + In Collection: Generalized Focal Loss + Config: configs/gfl/gfl_x101-32x4d-dconv-c4-c5_fpn_ms-2x_coco.py + Metadata: + inference time (ms/im): + - value: 93.46 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco_20200630_102002-14a2bf25.pth diff --git a/mmdetection/configs/ghm/README.md b/mmdetection/configs/ghm/README.md new file mode 100644 index 0000000..c245cea --- /dev/null +++ b/mmdetection/configs/ghm/README.md @@ -0,0 +1,33 @@ +# GHM + +> [Gradient Harmonized Single-stage Detector](https://arxiv.org/abs/1811.05181) + + + +## Abstract + +Despite the great success of two-stage detectors, single-stage detector is still a more elegant and efficient way, yet suffers from the two well-known disharmonies during training, i.e. 
the huge difference in quantity between positive and negative examples as well as between easy and hard examples. In this work, we first point out that the essential effect of these two disharmonies can be summarized in terms of the gradient. Further, we propose a novel gradient harmonizing mechanism (GHM) to act as a hedge against the disharmonies. The philosophy behind GHM can be easily embedded into both classification loss functions such as cross-entropy (CE) and regression loss functions such as smooth-L1 (SL1) loss. To this end, two novel loss functions called GHM-C and GHM-R are designed to balance the gradient flow for anchor classification and bounding box refinement, respectively. An ablation study on MS COCO demonstrates that, without laborious hyper-parameter tuning, both GHM-C and GHM-R bring substantial improvements to single-stage detectors. Without any whistles and bells, our model achieves 41.6 mAP on the COCO test-dev set, surpassing the state-of-the-art method, Focal Loss (FL) + SL1, by 0.8. +
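The gradient harmonizing idea can be sketched in a few lines: measure each example's gradient norm, bin examples by that norm, and down-weight examples that fall into densely populated bins. Below is a simplified, illustrative GHM-C-style classification loss in PyTorch; the names and binning code are a sketch of the idea, not the `GHMC` loss that the configs later in this patch enable (with `bins=30` and `momentum=0.75`), and it omits the momentum-smoothed bin statistics for brevity.

```python
# Simplified GHM-C-style classification loss (illustrative sketch only).
# Bins examples by gradient norm |sigmoid(x) - y| and reweights each example
# by the inverse of its bin's population ("gradient density").
import torch
import torch.nn.functional as F


def ghm_c_loss(pred: torch.Tensor, target: torch.Tensor, bins: int = 30) -> torch.Tensor:
    """pred: (N,) raw logits; target: (N,) binary labels in {0, 1}."""
    target = target.float()
    g = (pred.sigmoid() - target).abs().detach()      # gradient norm per example
    edges = torch.linspace(0, 1, bins + 1, device=pred.device)
    edges[-1] += 1e-6                                  # include g == 1 in the last bin
    weights = torch.zeros_like(pred)
    n = pred.numel()

    valid_bins = 0
    for i in range(bins):
        in_bin = (g >= edges[i]) & (g < edges[i + 1])
        count = in_bin.sum().item()
        if count > 0:
            # examples in a crowded bin get a small weight, rare ones a large weight
            weights[in_bin] = n / count
            valid_bins += 1
    if valid_bins > 0:
        weights = weights / valid_bins

    ce = F.binary_cross_entropy_with_logits(pred, target, reduction='none')
    return (weights * ce).sum() / n


if __name__ == '__main__':
    logits = torch.randn(256)
    labels = (torch.rand(256) < 0.1).float()  # sparse positives, as in dense detection
    print(ghm_c_loss(logits, labels))
```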
    + +
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 4.0 | 3.3 | 37.0 | [config](./retinanet_r50_fpn_ghm-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130_004213.log.json) | +| R-101-FPN | pytorch | 1x | 6.0 | 4.4 | 39.1 | [config](./retinanet_r101_fpn_ghm-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130-c148ee8f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130_145259.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.2 | 5.1 | 40.7 | [config](./retinanet_x101-32x4d_fpn_ghm-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131-e4333bd0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131_113653.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.3 | 5.2 | 41.4 | [config](./retinanet_x101-64x4d_fpn_ghm-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131-dd381cef.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131_113723.log.json) | + +## Citation + +```latex +@inproceedings{li2019gradient, + title={Gradient Harmonized Single-stage Detector}, + author={Li, Buyu and Liu, Yu and Wang, Xiaogang}, + booktitle={AAAI Conference on Artificial Intelligence}, + year={2019} +} +``` diff --git a/mmdetection/configs/ghm/metafile.yml b/mmdetection/configs/ghm/metafile.yml new file mode 100644 index 0000000..63cb48f --- /dev/null +++ b/mmdetection/configs/ghm/metafile.yml @@ -0,0 +1,101 @@ +Collections: + - Name: GHM + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - GHM-C + - GHM-R + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1811.05181 + Title: 'Gradient Harmonized Single-stage Detector' + README: configs/ghm/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/losses/ghm_loss.py#L21 + Version: v2.0.0 + +Models: + - Name: retinanet_r50_fpn_ghm-1x_coco + In Collection: GHM + Config: configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 303.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r50_fpn_1x_coco/retinanet_ghm_r50_fpn_1x_coco_20200130-a437fda3.pth + + - Name: retinanet_r101_fpn_ghm-1x_coco + In Collection: GHM + Config: configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 227.27 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_r101_fpn_1x_coco/retinanet_ghm_r101_fpn_1x_coco_20200130-c148ee8f.pth + + - Name: retinanet_x101-32x4d_fpn_ghm-1x_coco + In Collection: GHM + Config: configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py + Metadata: + Training Memory (GB): 7.2 + inference time (ms/im): + - value: 196.08 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_32x4d_fpn_1x_coco/retinanet_ghm_x101_32x4d_fpn_1x_coco_20200131-e4333bd0.pth + + - Name: retinanet_x101-64x4d_fpn_ghm-1x_coco + In Collection: GHM + Config: configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 192.31 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ghm/retinanet_ghm_x101_64x4d_fpn_1x_coco/retinanet_ghm_x101_64x4d_fpn_1x_coco_20200131-dd381cef.pth diff --git a/mmdetection/configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py b/mmdetection/configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py new file mode 100644 index 0000000..090221e --- /dev/null +++ b/mmdetection/configs/ghm/retinanet_r101_fpn_ghm-1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './retinanet_r50_fpn_ghm-1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py b/mmdetection/configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py new file mode 100644 index 0000000..42b9aa6 --- /dev/null +++ b/mmdetection/configs/ghm/retinanet_r50_fpn_ghm-1x_coco.py @@ -0,0 +1,18 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +model = dict( + bbox_head=dict( + loss_cls=dict( + _delete_=True, + type='GHMC', + bins=30, + momentum=0.75, + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + _delete_=True, + type='GHMR', + mu=0.02, + bins=10, + momentum=0.7, + loss_weight=10.0))) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py b/mmdetection/configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py new file mode 100644 index 0000000..1240545 --- /dev/null +++ b/mmdetection/configs/ghm/retinanet_x101-32x4d_fpn_ghm-1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_ghm-1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git 
a/mmdetection/configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py b/mmdetection/configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py new file mode 100644 index 0000000..689d2ed --- /dev/null +++ b/mmdetection/configs/ghm/retinanet_x101-64x4d_fpn_ghm-1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_ghm-1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/glip/README.md b/mmdetection/configs/glip/README.md new file mode 100644 index 0000000..1252d92 --- /dev/null +++ b/mmdetection/configs/glip/README.md @@ -0,0 +1,80 @@ +# GLIP: Grounded Language-Image Pre-training + +> [GLIP: Grounded Language-Image Pre-training](https://arxiv.org/abs/2112.03857) + + + +## Abstract + +This paper presents a grounded language-image pre-training (GLIP) model for learning object-level, language-aware, and semantic-rich visual representations. GLIP unifies object detection and phrase grounding for pre-training. The unification brings two benefits: 1) it allows GLIP to learn from both detection and grounding data to improve both tasks and bootstrap a good grounding model; 2) GLIP can leverage massive image-text pairs by generating grounding boxes in a self-training fashion, making the learned representation semantic-rich. In our experiments, we pre-train GLIP on 27M grounding data, including 3M human-annotated and 24M web-crawled image-text pairs. The learned representations demonstrate strong zero-shot and few-shot transferability to various object-level recognition tasks. 1) When directly evaluated on COCO and LVIS (without seeing any images in COCO during pre-training), GLIP achieves 49.8 AP and 26.9 AP, respectively, surpassing many supervised baselines. 2) After fine-tuned on COCO, GLIP achieves 60.8 AP on val and 61.5 AP on test-dev, surpassing prior SoTA. 3) When transferred to 13 downstream object detection tasks, a 1-shot GLIP rivals with a fully-supervised Dynamic Head. + +
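The unification of detection and grounding described above boils down to scoring image regions against the token embeddings of a text prompt, so that "classification" becomes word-region alignment. The toy sketch below only illustrates that shape of computation; the feature dimensions and the plain dot-product scoring are assumptions for illustration, not the fused DyHead/BERT pipeline used by the configs in this patch.

```python
# Toy illustration of detection-as-grounding: each region proposal is scored
# against every token of the text prompt, replacing a fixed class vector.
import torch

num_regions, num_tokens, dim = 100, 5, 256     # assumed sizes for illustration
region_feats = torch.randn(num_regions, dim)   # visual features per anchor/region
token_feats = torch.randn(num_tokens, dim)     # language-model embeddings of the prompt

alignment_logits = region_feats @ token_feats.t()   # (100, 5) region-word scores
scores = alignment_logits.sigmoid()                 # per-token detection scores
print(scores.shape)
```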
    + +
    + +## Installation + +```shell +cd $MMDETROOT + +# source installation +pip install -r requirements/multimodal.txt + +# or mim installation +mim install mmdet[multimodal] +``` + +```shell +cd $MMDETROOT + +wget https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_a_mmdet-b3654169.pth + +python demo/image_demo.py demo/demo.jpg \ +configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py \ +--weights glip_tiny_a_mmdet-b3654169.pth \ +--texts 'bench. car' +``` + +
    + +
    + +## NOTE + +GLIP utilizes BERT as the language model, which requires access to https://huggingface.co/. If you encounter connection errors due to network access, you can download the required files on a computer with internet access and save them locally. Finally, modify the `lang_model_name` field in the config to the local path. Please refer to the following code: + +```python +from transformers import BertConfig, BertModel +from transformers import AutoTokenizer + +config = BertConfig.from_pretrained("bert-base-uncased") +model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, config=config) +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + +config.save_pretrained("your path/bert-base-uncased") +model.save_pretrained("your path/bert-base-uncased") +tokenizer.save_pretrained("your path/bert-base-uncased") +``` + +## Results and Models + +| Model | Zero-shot or Finetune | COCO mAP | Official COCO mAP | Pre-Train Data | Config | Download | +| :--------: | :-------------------: | :------: | ----------------: | :------------------------: | :---------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| GLIP-T (A) | Zero-shot | 43.0 | 42.9 | O365 | [config](glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_a_mmdet-b3654169.pth) | +| GLIP-T (A) | Finetune | 53.3 | 52.9 | O365 | [config](glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_180419-e6addd96.pth)\| [log](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_180419.log.json) | +| GLIP-T (B) | Zero-shot | 44.9 | 44.9 | O365 | [config](glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_b_mmdet-6dfbd102.pth) | +| GLIP-T (B) | Finetune | 54.1 | 53.8 | O365 | [config](glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230916_163538-650323ba.pth)\| [log](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230916_163538.log.json) | +| GLIP-T (C) | Zero-shot | 46.7 | 46.7 | O365,GoldG | [config](glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_c_mmdet-2fc427dd.pth) | +| GLIP-T (C) | Finetune | 55.2 | 55.1 | O365,GoldG | [config](glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_182935-4ba3fc3b.pth)\| [log](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_182935.log.json) | +| GLIP-T | Zero-shot | 46.6 | 46.6 | O365,GoldG,CC3M,SBU | [config](glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_mmdet-c24ce662.pth) | +| GLIP-T | Finetune | 55.4 | 55.2 | O365,GoldG,CC3M,SBU | [config](glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_224410-ba97be24.pth)\| [log](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_224410.log.json) | +| GLIP-L | Zero-shot | 51.3 | 51.4 | FourODs,GoldG,CC3M+12M,SBU | [config](glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_l_mmdet-abfe026b.pth) | +| GLIP-L | Finetune | 59.4 | | FourODs,GoldG,CC3M+12M,SBU | [config](glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230910_100800-e9be4274.pth)\| [log](https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230910_100800.log.json) | + +Note: + +1. The weights corresponding to the zero-shot model are adopted from the official weights and converted using the [script](../../tools/model_converters/glip_to_mmdet.py). We have not retrained the model for the time being. +2. Finetune refers to fine-tuning on the COCO 2017 dataset. The L model is trained using 16 A100 GPUs, while the remaining models are trained using 16 NVIDIA GeForce 3090 GPUs. +3. Taking the GLIP-T(A) model as an example, I trained it twice using the official code, and the fine-tuning mAP were 52.5 and 52.6. Therefore, the mAP we achieved in our reproduction is higher than the official results. The main reason is that we modified the `weight_decay` parameter. +4. Our experiments revealed that training for 24 epochs leads to overfitting. Therefore, we chose the best-performing model. If users want to train on a custom dataset, it is advisable to shorten the number of epochs and save the best-performing model. +5. Due to the official absence of fine-tuning hyperparameters for the GLIP-L model, we have not yet reproduced the official accuracy. I have found that overfitting can also occur, so it may be necessary to consider custom modifications to data augmentation and model enhancement. Given the high cost of training, we have not conducted any research on this matter at the moment. 
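Following up on the NOTE section above: once the BERT files have been saved locally, the `lang_model_name` field can be redirected to that directory with a small override config. This is only a hypothetical sketch, with the local path as a placeholder; it mirrors how `lang_model_name` is threaded through `bbox_head` and `language_model` in the GLIP configs added later in this patch.

```python
# Hypothetical override: point GLIP at a locally saved bert-base-uncased
# instead of downloading it from huggingface.co at runtime.
_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py'

lang_model_name = 'your path/bert-base-uncased'  # directory written by save_pretrained above

model = dict(
    bbox_head=dict(lang_model_name=lang_model_name),
    language_model=dict(name=lang_model_name))
```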
diff --git a/mmdetection/configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmdetection/configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py new file mode 100644 index 0000000..92a85a1 --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py @@ -0,0 +1,14 @@ +_base_ = './glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py' + +model = dict( + backbone=dict( + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + drop_path_rate=0.4, + ), + neck=dict(in_channels=[384, 768, 1536]), + bbox_head=dict(early_fuse=True, num_dyhead_blocks=8, use_checkpoint=True)) + +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_l_mmdet-abfe026b.pth' # noqa diff --git a/mmdetection/configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py b/mmdetection/configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py new file mode 100644 index 0000000..546ecfe --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py @@ -0,0 +1,12 @@ +_base_ = './glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py' + +model = dict( + backbone=dict( + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + drop_path_rate=0.4, + ), + neck=dict(in_channels=[384, 768, 1536]), + bbox_head=dict(early_fuse=True, num_dyhead_blocks=8)) diff --git a/mmdetection/configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmdetection/configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py new file mode 100644 index 0000000..4b28065 --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py @@ -0,0 +1,155 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_a_mmdet-b3654169.pth' # noqa +lang_model_name = 'bert-base-uncased' + +model = dict( + type='GLIP', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=False), + neck=dict( + type='FPN_DropBlock', + in_channels=[192, 384, 768], + out_channels=256, + start_level=0, + relu_before_extra_convs=True, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='ATSSVLFusionHead', + lang_model_name=lang_model_name, + num_classes=80, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128], + center_offset=0.5), + bbox_coder=dict( + type='DeltaXYWHBBoxCoderForGLIP', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + language_model=dict(type='BertModel', name=lang_model_name), + train_cfg=dict( + assigner=dict( + type='ATSSAssigner', + topk=9, + 
iou_calculator=dict(type='BboxOverlaps2D_GLIP')), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# dataset settings +train_pipeline = [ + dict( + type='LoadImageFromFile', + imdecode_backend='pillow', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='GTBoxSubOne_GLIP'), + dict( + type='RandomChoiceResize', + scales=[(1333, 480), (1333, 560), (1333, 640), (1333, 720), + (1333, 800)], + keep_ratio=True, + resize_type='FixScaleResize', + backend='pillow'), + dict(type='RandomFlip_GLIP', prob=0.5), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities')) +] + +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=2, + dataset=dict( + type=_base_.dataset_type, + data_root=_base_.data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + return_classes=True, + backend_args=_base_.backend_args))) + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, return_classes=True)) +test_dataloader = val_dataloader + +# We did not adopt the official 24e optimizer strategy +# because the results indicate that the current strategy is superior. +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00002, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ }), + clip_grad=None) diff --git a/mmdetection/configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py b/mmdetection/configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py new file mode 100644 index 0000000..34a818c --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +lang_model_name = 'bert-base-uncased' + +model = dict( + type='GLIP', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=False), + neck=dict( + type='FPN', + in_channels=[192, 384, 768], + out_channels=256, + start_level=0, + relu_before_extra_convs=True, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='ATSSVLFusionHead', + lang_model_name=lang_model_name, + num_classes=80, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128], + center_offset=0.5), + bbox_coder=dict( + type='DeltaXYWHBBoxCoderForGLIP', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + ), + language_model=dict(type='BertModel', name=lang_model_name), + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, return_classes=True)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmdetection/configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py new file mode 100644 index 0000000..3487de3 --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py @@ -0,0 +1,9 @@ +_base_ = './glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py' + +model = dict(bbox_head=dict(early_fuse=True, use_checkpoint=True)) + +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_b_mmdet-6dfbd102.pth' # noqa + +optim_wrapper = dict( + optimizer=dict(lr=0.00001), + clip_grad=dict(_delete_=True, max_norm=1, norm_type=2)) diff --git a/mmdetection/configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py b/mmdetection/configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py new file mode 100644 index 0000000..6334e5e --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py @@ -0,0 +1,3 @@ +_base_ = 
'./glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py' + +model = dict(bbox_head=dict(early_fuse=True)) diff --git a/mmdetection/configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmdetection/configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py new file mode 100644 index 0000000..5c315e4 --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py @@ -0,0 +1,3 @@ +_base_ = './glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py' + +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_c_mmdet-2fc427dd.pth' # noqa diff --git a/mmdetection/configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py b/mmdetection/configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py new file mode 100644 index 0000000..24898f4 --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py @@ -0,0 +1 @@ +_base_ = './glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py' diff --git a/mmdetection/configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py b/mmdetection/configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py new file mode 100644 index 0000000..3391272 --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py @@ -0,0 +1,3 @@ +_base_ = './glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py' + +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_mmdet-c24ce662.pth' # noqa diff --git a/mmdetection/configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py b/mmdetection/configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py new file mode 100644 index 0000000..24898f4 --- /dev/null +++ b/mmdetection/configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py @@ -0,0 +1 @@ +_base_ = './glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py' diff --git a/mmdetection/configs/glip/metafile.yml b/mmdetection/configs/glip/metafile.yml new file mode 100644 index 0000000..fbbf718 --- /dev/null +++ b/mmdetection/configs/glip/metafile.yml @@ -0,0 +1,111 @@ +Collections: + - Name: GLIP + Metadata: + Training Data: Objects365, GoldG, CC3M, SBU and COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: A100 GPUs + Architecture: + - Swin Transformer + - DYHead + - BERT + Paper: + URL: https://arxiv.org/abs/2112.03857 + Title: 'GLIP: Grounded Language-Image Pre-training' + README: configs/glip/README.md + Code: + URL: + Version: v3.0.0 + +Models: + - Name: glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 + In Collection: GLIP + Config: configs/glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_a_mmdet-b3654169.pth + - Name: glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365 + In Collection: GLIP + Config: configs/glip/glip_atss_swin-t_b_fpn_dyhead_pretrain_obj365.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_b_mmdet-6dfbd102.pth + - Name: glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg + In Collection: GLIP + Config: configs/glip/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.7 + Weights: 
https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_c_mmdet-2fc427dd.pth + - Name: glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub + In Collection: GLIP + Config: configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365-goldg-cc3m-sub.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_mmdet-c24ce662.pth + - Name: glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata + In Collection: GLIP + Config: configs/glip/glip_atss_swin-l_fpn_dyhead_pretrain_mixeddata.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_l_mmdet-abfe026b.pth + - Name: glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco + In Collection: GLIP + Config: configs/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 53.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_a_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_180419-e6addd96.pth + - Name: glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco + In Collection: GLIP + Config: configs/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 54.1 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_b_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230916_163538-650323ba.pth + - Name: glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco + In Collection: GLIP + Config: configs/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 55.2 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_c_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_182935-4ba3fc3b.pth + - Name: glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco + In Collection: GLIP + Config: configs/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 55.4 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-t_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230914_224410-ba97be24.pth + - Name: glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco + In Collection: GLIP + Config: configs/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 59.4 + Weights: https://download.openmmlab.com/mmdetection/v3.0/glip/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco/glip_atss_swin-l_fpn_dyhead_16xb2_ms-2x_funtune_coco_20230910_100800-e9be4274.pth diff --git a/mmdetection/configs/gn+ws/README.md b/mmdetection/configs/gn+ws/README.md new file mode 100644 index 0000000..ef8cfc8 --- /dev/null +++ b/mmdetection/configs/gn+ws/README.md @@ -0,0 +1,54 @@ +# GN + WS + +> [Weight Standardization](https://arxiv.org/abs/1903.10520) + + + +## Abstract + +Batch Normalization (BN) has become an out-of-box technique to improve deep network training. 
However, its effectiveness is limited for micro-batch training, i.e., each GPU typically has only 1-2 images for training, which is inevitable for many computer vision tasks, e.g., object detection and semantic segmentation, constrained by memory consumption. To address this issue, we propose Weight Standardization (WS) and Batch-Channel Normalization (BCN) to bring two success factors of BN into micro-batch training: 1) the smoothing effects on the loss landscape and 2) the ability to avoid harmful elimination singularities along the training trajectory. WS standardizes the weights in convolutional layers to smooth the loss landscape by reducing the Lipschitz constants of the loss and the gradients; BCN combines batch and channel normalizations and leverages estimated statistics of the activations in convolutional layers to keep networks away from elimination singularities. We validate WS and BCN on comprehensive computer vision tasks, including image classification, object detection, instance segmentation, video recognition and semantic segmentation. All experimental results consistently show that WS and BCN improve micro-batch training significantly. Moreover, using WS and BCN with micro-batch training is even able to match or outperform the performances of BN with large-batch training. + +
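The mechanism itself is compact: before each convolution, the kernel is standardized to zero mean and unit variance per output filter. A minimal PyTorch sketch is shown below (the `WSConv2d` name is illustrative, not mmcv's `ConvWS`); the configs in this patch enable the equivalent behaviour through `conv_cfg=dict(type='ConvWS')` together with GN.

```python
# Minimal Weight Standardization conv (illustrative sketch, not mmcv's ConvWS).
# The kernel is standardized per output filter before every forward pass.
import torch
import torch.nn as nn
import torch.nn.functional as F


class WSConv2d(nn.Conv2d):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.weight
        # mean/std over the (in_channels, kH, kW) axes of each output filter
        mean = w.mean(dim=(1, 2, 3), keepdim=True)
        std = w.std(dim=(1, 2, 3), keepdim=True) + 1e-5
        w = (w - mean) / std
        return F.conv2d(x, w, self.bias, self.stride, self.padding,
                        self.dilation, self.groups)


if __name__ == '__main__':
    # Typical pairing in this folder: WS convolutions followed by GroupNorm.
    layer = nn.Sequential(WSConv2d(3, 32, 3, padding=1), nn.GroupNorm(32, 32), nn.ReLU())
    print(layer(torch.randn(2, 3, 64, 64)).shape)  # torch.Size([2, 32, 64, 64])
```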
    + +
    + +## Results and Models + +Faster R-CNN + +| Backbone | Style | Normalization | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----------: | :-----: | :------: | :------------: | :----: | :-----: | :---------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | GN+WS | 1x | 5.9 | 11.7 | 39.7 | - | [config](./faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130_210936.log.json) | +| R-101-FPN | pytorch | GN+WS | 1x | 8.9 | 9.0 | 41.7 | - | [config](./faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205-a93b0d75.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205_232146.log.json) | +| X-50-32x4d-FPN | pytorch | GN+WS | 1x | 7.0 | 10.3 | 40.7 | - | [config](./faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203-839c5d9d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203_220113.log.json) | +| X-101-32x4d-FPN | pytorch | GN+WS | 1x | 10.8 | 7.6 | 42.1 | - | [config](./faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212-27da1bc2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212_195302.log.json) | + +Mask R-CNN + +| Backbone | Style | Normalization | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----------: | :-------: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | GN+WS | 2x | 7.3 | 10.5 | 40.6 | 36.6 | [config](./mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226-16acb762.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226_062128.log.json) | +| R-101-FPN | pytorch | GN+WS | 2x | 10.3 | 8.6 | 42.0 | 37.7 | [config](./mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212-ea357cd9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212_213627.log.json) | +| X-50-32x4d-FPN | pytorch | GN+WS | 2x | 8.4 | 9.3 | 41.1 | 37.0 | [config](./mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216-649fdb6f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216_201500.log.json) | +| X-101-32x4d-FPN | pytorch | GN+WS | 2x | 12.2 | 7.1 | 42.1 | 37.9 | [config](./mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319-33fb95b5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319_104101.log.json) | +| R-50-FPN | pytorch | GN+WS | 20-23-24e | 7.3 | - | 41.1 | 37.1 | [config](./mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213-487d1283.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213_035123.log.json) | +| R-101-FPN | pytorch | GN+WS | 20-23-24e | 10.3 | - | 43.1 | 38.6 | [config](./mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213-57b5a50f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213_130142.log.json) | +| X-50-32x4d-FPN | pytorch | GN+WS | 20-23-24e | 8.4 | - | 42.1 | 38.0 | [config](./mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226-969bcb2c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226_093732.log.json) | +| X-101-32x4d-FPN | pytorch | GN+WS | 20-23-24e | 12.2 | - | 42.7 | 38.5 | [config](./mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316-e6cd35ef.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316_013741.log.json) | + +Note: + +- GN+WS requires about 5% more memory than GN, and it is only 5% slower than GN. +- In the paper, a 20-23-24e lr schedule is used instead of 2x. +- The X-50-GN and X-101-GN pretrained models are also shared by the authors. + +## Citation + +```latex +@article{weightstandardization, + author = {Siyuan Qiao and Huiyu Wang and Chenxi Liu and Wei Shen and Alan Yuille}, + title = {Weight Standardization}, + journal = {arXiv preprint arXiv:1903.10520}, + year = {2019}, +} +``` diff --git a/mmdetection/configs/gn+ws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py b/mmdetection/configs/gn+ws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py new file mode 100644 index 0000000..a4cb828 --- /dev/null +++ b/mmdetection/configs/gn+ws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://jhu/resnet101_gn_ws'))) diff --git a/mmdetection/configs/gn+ws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py b/mmdetection/configs/gn+ws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py new file mode 100644 index 0000000..1a044c9 --- /dev/null +++ b/mmdetection/configs/gn+ws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://jhu/resnet50_gn_ws')), + neck=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg))) diff --git a/mmdetection/configs/gn+ws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py b/mmdetection/configs/gn+ws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py new file mode 100644 index 0000000..b2a317d --- /dev/null +++ b/mmdetection/configs/gn+ws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = './faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py' +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://jhu/resnext101_32x4d_gn_ws'))) diff --git a/mmdetection/configs/gn+ws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py b/mmdetection/configs/gn+ws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py new file mode 100644 index 0000000..dd75a2c --- /dev/null +++ b/mmdetection/configs/gn+ws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = './faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py' +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + type='ResNeXt', + depth=50, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://jhu/resnext50_32x4d_gn_ws'))) diff --git 
a/mmdetection/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py b/mmdetection/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py new file mode 100644 index 0000000..1815e3f --- /dev/null +++ b/mmdetection/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py' +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20, 23], + gamma=0.1) +] diff --git a/mmdetection/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py b/mmdetection/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py new file mode 100644 index 0000000..5de37de --- /dev/null +++ b/mmdetection/configs/gn+ws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://jhu/resnet101_gn_ws'))) diff --git a/mmdetection/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py b/mmdetection/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py new file mode 100644 index 0000000..287c652 --- /dev/null +++ b/mmdetection/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py' +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20, 23], + gamma=0.1) +] diff --git a/mmdetection/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py b/mmdetection/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py new file mode 100644 index 0000000..ed8b1b7 --- /dev/null +++ b/mmdetection/configs/gn+ws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py @@ -0,0 +1,33 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://jhu/resnet50_gn_ws')), + neck=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg), + mask_head=dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg))) +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py b/mmdetection/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py new file mode 100644 index 0000000..8ce9193 --- /dev/null +++ b/mmdetection/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py' +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, 
by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20, 23], + gamma=0.1) +] diff --git a/mmdetection/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py b/mmdetection/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py new file mode 100644 index 0000000..bcfc371 --- /dev/null +++ b/mmdetection/configs/gn+ws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py @@ -0,0 +1,19 @@ +_base_ = './mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py' +# model settings +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://jhu/resnext101_32x4d_gn_ws'))) diff --git a/mmdetection/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py b/mmdetection/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py new file mode 100644 index 0000000..af9ea5a --- /dev/null +++ b/mmdetection/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py' +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20, 23], + gamma=0.1) +] diff --git a/mmdetection/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py b/mmdetection/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py new file mode 100644 index 0000000..ab2b140 --- /dev/null +++ b/mmdetection/configs/gn+ws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py @@ -0,0 +1,19 @@ +_base_ = './mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py' +# model settings +conv_cfg = dict(type='ConvWS') +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + type='ResNeXt', + depth=50, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://jhu/resnext50_32x4d_gn_ws'))) diff --git a/mmdetection/configs/gn+ws/metafile.yml b/mmdetection/configs/gn+ws/metafile.yml new file mode 100644 index 0000000..89b9107 --- /dev/null +++ b/mmdetection/configs/gn+ws/metafile.yml @@ -0,0 +1,263 @@ +Collections: + - Name: Weight Standardization + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Group Normalization + - Weight Standardization + Paper: + URL: https://arxiv.org/abs/1903.10520 + Title: 'Weight Standardization' + README: configs/gn+ws/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/gn%2Bws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py + Version: v2.0.0 + +Models: + - Name: faster-rcnn_r50_fpn_gn_ws-all_1x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/faster-rcnn_r50_fpn_gn-ws-all_1x_coco.py + Metadata: + Training Memory (GB): 5.9 + inference time (ms/im): + - value: 85.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + 
Metrics: + box AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r50_fpn_gn_ws-all_1x_coco/faster_rcnn_r50_fpn_gn_ws-all_1x_coco_20200130-613d9fe2.pth + + - Name: faster-rcnn_r101_fpn_gn-ws-all_1x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/faster-rcnn_r101_fpn_gn-ws-all_1x_coco.py + Metadata: + Training Memory (GB): 8.9 + inference time (ms/im): + - value: 111.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_r101_fpn_gn_ws-all_1x_coco/faster_rcnn_r101_fpn_gn_ws-all_1x_coco_20200205-a93b0d75.pth + + - Name: faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/faster-rcnn_x50-32x4d_fpn_gn-ws-all_1x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 97.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x50_32x4d_fpn_gn_ws-all_1x_coco_20200203-839c5d9d.pth + + - Name: faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/faster-rcnn_x101-32x4d_fpn_gn-ws-all_1x_coco.py + Metadata: + Training Memory (GB): 10.8 + inference time (ms/im): + - value: 131.58 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco/faster_rcnn_x101_32x4d_fpn_gn_ws-all_1x_coco_20200212-27da1bc2.pth + + - Name: mask-rcnn_r50_fpn_gn_ws-all_2x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask-rcnn_r50_fpn_gn-ws-all_2x_coco.py + Metadata: + Training Memory (GB): 7.3 + inference time (ms/im): + - value: 95.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_2x_coco/mask_rcnn_r50_fpn_gn_ws-all_2x_coco_20200226-16acb762.pth + + - Name: mask-rcnn_r101_fpn_gn-ws-all_2x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask-rcnn_r101_fpn_gn-ws-all_2x_coco.py + Metadata: + Training Memory (GB): 10.3 + inference time (ms/im): + - value: 116.28 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_2x_coco/mask_rcnn_r101_fpn_gn_ws-all_2x_coco_20200212-ea357cd9.pth + + - Name: mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_2x_coco.py + Metadata: + Training Memory (GB): 8.4 + inference time 
(ms/im): + - value: 107.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_2x_coco_20200216-649fdb6f.pth + + - Name: mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_2x_coco.py + Metadata: + Training Memory (GB): 12.2 + inference time (ms/im): + - value: 140.85 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_2x_coco_20200319-33fb95b5.pth + + - Name: mask-rcnn_r50_fpn_gn_ws-all_20_23_24e_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask-rcnn_r50_fpn_gn-ws-all_20-23-24e_coco.py + Metadata: + Training Memory (GB): 7.3 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r50_fpn_gn_ws-all_20_23_24e_coco_20200213-487d1283.pth + + - Name: mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask-rcnn_r101_fpn_gn-ws-all_20-23-24e_coco.py + Metadata: + Training Memory (GB): 10.3 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_r101_fpn_gn_ws-all_20_23_24e_coco_20200213-57b5a50f.pth + + - Name: mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask-rcnn_x50-32x4d_fpn_gn-ws-all_20-23-24e_coco.py + Metadata: + Training Memory (GB): 8.4 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x50_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200226-969bcb2c.pth + + - Name: mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco + In Collection: Weight Standardization + Config: configs/gn%2Bws/mask-rcnn_x101-32x4d_fpn_gn-ws-all_20-23-24e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn%2Bws/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco/mask_rcnn_x101_32x4d_fpn_gn_ws-all_20_23_24e_coco_20200316-e6cd35ef.pth diff --git a/mmdetection/configs/gn/README.md b/mmdetection/configs/gn/README.md new file mode 100644 index 0000000..1bc8192 --- /dev/null +++ 
b/mmdetection/configs/gn/README.md @@ -0,0 +1,41 @@ +# GN + +> [Group Normalization](https://arxiv.org/abs/1803.08494) + + + +## Abstract + +Batch Normalization (BN) is a milestone technique in the development of deep learning, enabling various networks to train. However, normalizing along the batch dimension introduces problems --- BN's error increases rapidly when the batch size becomes smaller, caused by inaccurate batch statistics estimation. This limits BN's usage for training larger models and transferring features to computer vision tasks including detection, segmentation, and video, which require small batches constrained by memory consumption. In this paper, we present Group Normalization (GN) as a simple alternative to BN. GN divides the channels into groups and computes within each group the mean and variance for normalization. GN's computation is independent of batch sizes, and its accuracy is stable in a wide range of batch sizes. On ResNet-50 trained in ImageNet, GN has 10.6% lower error than its BN counterpart when using a batch size of 2; when using typical batch sizes, GN is comparably good with BN and outperforms other normalization variants. Moreover, GN can be naturally transferred from pre-training to fine-tuning. GN can outperform its BN-based counterparts for object detection and segmentation in COCO, and for video classification in Kinetics, showing that GN can effectively replace the powerful BN in a variety of tasks. GN can be easily implemented by a few lines of code in modern libraries. + +
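Since the abstract notes that GN "can be easily implemented by a few lines of code", here is a minimal reference sketch for an NCHW tensor; in practice the configs in this folder simply select the built-in layer through `norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)`, so the snippet is for illustration only.

```python
import torch


def group_norm(x, num_groups=32, eps=1e-5):
    """Normalize each group of channels with its own mean/variance."""
    n, c, h, w = x.shape
    assert c % num_groups == 0
    g = x.view(n, num_groups, -1)          # (N, G, C//G * H * W)
    mean = g.mean(dim=2, keepdim=True)
    var = g.var(dim=2, keepdim=True, unbiased=False)
    g = (g - mean) / torch.sqrt(var + eps)
    return g.view(n, c, h, w)


# sanity check against torch.nn.GroupNorm (affine weight=1, bias=0 by default)
x = torch.randn(2, 64, 8, 8)
assert torch.allclose(group_norm(x), torch.nn.GroupNorm(32, 64)(x), atol=1e-4)
```

The statistics are computed per sample and per group, so the result is independent of the batch size, which is why GN stays stable in the small-batch detection and segmentation settings reported below.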
    + +## Results and Models + +| Backbone | model | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-----------: | :--------: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN (d) | Mask R-CNN | 2x | 7.1 | 11.0 | 40.2 | 36.4 | [config](./mask-rcnn_r50_fpn_gn-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206_050355.log.json) | +| R-50-FPN (d) | Mask R-CNN | 3x | 7.1 | - | 40.5 | 36.7 | [config](./mask-rcnn_r50_fpn_gn-all_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214-8b23b1e5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214_063512.log.json) | +| R-101-FPN (d) | Mask R-CNN | 2x | 9.9 | 9.0 | 41.9 | 37.6 | [config](./mask-rcnn_r101_fpn_gn-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205-d96b1b50.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205_234402.log.json) | +| R-101-FPN (d) | Mask R-CNN | 3x | 9.9 | | 42.1 | 38.0 | [config](./mask-rcnn_r101_fpn_gn-all_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609-0df864f4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609.log.json) | +| R-50-FPN (c) | Mask R-CNN | 2x | 7.1 | 10.9 | 40.0 | 36.1 | [config](./mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207-20d3e849.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207_225832.log.json) | +| R-50-FPN (c) | Mask R-CNN | 3x | 7.1 | - | 40.1 | 36.2 | [config](./mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225-542aefbc.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225_235135.log.json) | + +**Notes:** + +- (d) means pretrained model converted from Detectron, and (c) means the contributed model pretrained by [@thangvubk](https://github.com/thangvubk). +- The `3x` schedule is epoch \[28, 34, 36\]. 
+- **Memory, Train/Inf time is outdated.** + +## Citation + +```latex +@inproceedings{wu2018group, + title={Group Normalization}, + author={Wu, Yuxin and He, Kaiming}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, + year={2018} +} +``` diff --git a/mmdetection/configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py b/mmdetection/configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py new file mode 100644 index 0000000..54f57d8 --- /dev/null +++ b/mmdetection/configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask-rcnn_r50_fpn_gn-all_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet101_gn'))) diff --git a/mmdetection/configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py b/mmdetection/configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py new file mode 100644 index 0000000..a94e063 --- /dev/null +++ b/mmdetection/configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py @@ -0,0 +1,18 @@ +_base_ = './mask-rcnn_r101_fpn_gn-all_2x_coco.py' + +# learning policy +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git a/mmdetection/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py b/mmdetection/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py new file mode 100644 index 0000000..5515ec1 --- /dev/null +++ b/mmdetection/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py @@ -0,0 +1,31 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://contrib/resnet50_gn')), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg), + mask_head=dict(norm_cfg=norm_cfg))) + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py b/mmdetection/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py new file mode 100644 index 0000000..e6f7a97 --- /dev/null +++ b/mmdetection/configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py @@ -0,0 +1,18 @@ +_base_ = './mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py' + +# learning policy +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git a/mmdetection/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py b/mmdetection/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py new file mode 100644 index 0000000..1313b22 --- /dev/null +++ b/mmdetection/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py @@ -0,0 +1,36 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + data_preprocessor=dict( + mean=[103.530, 
116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=norm_cfg, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet50_gn')), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg), + mask_head=dict(norm_cfg=norm_cfg))) + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py b/mmdetection/configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py new file mode 100644 index 0000000..e425de9 --- /dev/null +++ b/mmdetection/configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py @@ -0,0 +1,18 @@ +_base_ = './mask-rcnn_r50_fpn_gn-all_2x_coco.py' + +# learning policy +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git a/mmdetection/configs/gn/metafile.yml b/mmdetection/configs/gn/metafile.yml new file mode 100644 index 0000000..9781dc9 --- /dev/null +++ b/mmdetection/configs/gn/metafile.yml @@ -0,0 +1,162 @@ +Collections: + - Name: Group Normalization + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Group Normalization + Paper: + URL: https://arxiv.org/abs/1803.08494 + Title: 'Group Normalization' + README: configs/gn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py + Version: v2.0.0 + +Models: + - Name: mask-rcnn_r50_fpn_gn-all_2x_coco + In Collection: Group Normalization + Config: configs/gn/mask-rcnn_r50_fpn_gn-all_2x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_2x_coco/mask_rcnn_r50_fpn_gn-all_2x_coco_20200206-8eee02a6.pth + + - Name: mask-rcnn_r50_fpn_gn-all_3x_coco + In Collection: Group Normalization + Config: configs/gn/mask-rcnn_r50_fpn_gn-all_3x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_3x_coco/mask_rcnn_r50_fpn_gn-all_3x_coco_20200214-8b23b1e5.pth + + - Name: mask-rcnn_r101_fpn_gn-all_2x_coco + In Collection: Group Normalization + Config: configs/gn/mask-rcnn_r101_fpn_gn-all_2x_coco.py + Metadata: + Training Memory (GB): 9.9 + inference time (ms/im): + - value: 111.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: 
FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_2x_coco/mask_rcnn_r101_fpn_gn-all_2x_coco_20200205-d96b1b50.pth + + - Name: mask-rcnn_r101_fpn_gn-all_3x_coco + In Collection: Group Normalization + Config: configs/gn/mask-rcnn_r101_fpn_gn-all_3x_coco.py + Metadata: + Training Memory (GB): 9.9 + inference time (ms/im): + - value: 111.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r101_fpn_gn-all_3x_coco/mask_rcnn_r101_fpn_gn-all_3x_coco_20200513_181609-0df864f4.pth + + - Name: mask-rcnn_r50_fpn_gn-all_contrib_2x_coco + In Collection: Group Normalization + Config: configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_2x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 91.74 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco/mask_rcnn_r50_fpn_gn-all_contrib_2x_coco_20200207-20d3e849.pth + + - Name: mask-rcnn_r50_fpn_gn-all_contrib_3x_coco + In Collection: Group Normalization + Config: configs/gn/mask-rcnn_r50-contrib_fpn_gn-all_3x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 91.74 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/gn/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco/mask_rcnn_r50_fpn_gn-all_contrib_3x_coco_20200225-542aefbc.pth diff --git a/mmdetection/configs/grid_rcnn/README.md b/mmdetection/configs/grid_rcnn/README.md new file mode 100644 index 0000000..3de810a --- /dev/null +++ b/mmdetection/configs/grid_rcnn/README.md @@ -0,0 +1,47 @@ +# Grid R-CNN + +> [Grid R-CNN](https://arxiv.org/abs/1811.12030) + + + +## Abstract + +This paper proposes a novel object detection framework named Grid R-CNN, which adopts a grid guided localization mechanism for accurate object detection. Different from the traditional regression based methods, the Grid R-CNN captures the spatial information explicitly and enjoys the position sensitive property of fully convolutional architecture. Instead of using only two independent points, we design a multi-point supervision formulation to encode more clues in order to reduce the impact of inaccurate prediction of specific points. To take the full advantage of the correlation of points in a grid, we propose a two-stage information fusion strategy to fuse feature maps of neighbor grid points. The grid guided localization approach is easy to be extended to different state-of-the-art detection frameworks. 
Grid R-CNN leads to high-quality object localization, and experiments demonstrate that it achieves a 4.1% AP gain at IoU=0.8 and a 10.0% AP gain at IoU=0.9 on the COCO benchmark compared to Faster R-CNN with a Res50 backbone and FPN architecture. + +Grid R-CNN is a well-performing object detection framework. It transforms the traditional box offset regression problem into a grid point estimation problem. With the guidance of the grid points, it can obtain high-quality localization results. However, the speed of Grid R-CNN is not satisfactory. In this technical report we present Grid R-CNN Plus, a better and faster version of Grid R-CNN. We have made several updates that significantly speed up the framework and simultaneously improve the accuracy. On the COCO dataset, the Res50-FPN based Grid R-CNN Plus detector achieves an mAP of 40.4%, outperforming the baseline on the same model by 3.0 points with similar inference time. +
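To make the grid-guided localization idea concrete, the sketch below shows one way a box could be decoded from a 3x3 grid of predicted point heatmaps: each heatmap votes for one point, and the outer columns/rows of points determine the four box edges. This is a rough illustration of the concept only, not the actual `GridHead` logic configured below, which additionally fuses neighboring grid points and weights the votes by their confidences.

```python
import torch


def decode_box_from_grid(heatmaps, roi):
    """Illustrative decoding of a box from 9 grid-point heatmaps.

    heatmaps: (9, S, S) scores for a 3x3 grid (row-major), predicted
              inside the RoI; roi: (x1, y1, x2, y2) in image coordinates.
    """
    _, s, _ = heatmaps.shape
    xs, ys = [], []
    for hm in heatmaps:                   # take each point's most likely location
        idx = int(torch.argmax(hm))
        ys.append(idx // s / (s - 1))     # normalized [0, 1] within the RoI
        xs.append(idx % s / (s - 1))
    xs, ys = torch.tensor(xs), torch.tensor(ys)
    x1, y1, x2, y2 = roi.tolist()
    w, h = x2 - x1, y2 - y1
    # row-major 3x3 layout: indices {0,3,6} left column, {2,5,8} right column, etc.
    left = x1 + w * xs[[0, 3, 6]].mean()
    right = x1 + w * xs[[2, 5, 8]].mean()
    top = y1 + h * ys[[0, 1, 2]].mean()
    bottom = y1 + h * ys[[6, 7, 8]].mean()
    return torch.stack([left, top, right, bottom])


# toy usage with random score maps
box = decode_box_from_grid(torch.rand(9, 56, 56), torch.tensor([10., 20., 110., 220.]))
```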
    + +## Results and Models + +| Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | 2x | 5.1 | 15.0 | 40.4 | [config](./grid-rcnn_r50_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130_221140.log.json) | +| R-101 | 2x | 7.0 | 12.6 | 41.5 | [config](./grid-rcnn_r101_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309-d6eca030.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309_164224.log.json) | +| X-101-32x4d | 2x | 8.3 | 10.8 | 42.9 | [config](./grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130-d8f0e3ff.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130_215413.log.json) | +| X-101-64x4d | 2x | 11.3 | 7.7 | 43.0 | [config](./grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204-ec76a754.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204_080641.log.json) | + +**Notes:** + +- All models are trained with 8 GPUs instead of 32 GPUs in the original paper. +- The warming up lasts for 1 epoch and `2x` here indicates 25 epochs. 
+ +## Citation + +```latex +@inproceedings{lu2019grid, + title={Grid r-cnn}, + author={Lu, Xin and Li, Buyu and Yue, Yuxin and Li, Quanquan and Yan, Junjie}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} + +@article{lu2019grid, + title={Grid R-CNN Plus: Faster and Better}, + author={Lu, Xin and Li, Buyu and Yue, Yuxin and Li, Quanquan and Yan, Junjie}, + journal={arXiv preprint arXiv:1906.05688}, + year={2019} +} +``` diff --git a/mmdetection/configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py b/mmdetection/configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py new file mode 100644 index 0000000..46d41ed --- /dev/null +++ b/mmdetection/configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './grid-rcnn_r50_fpn_gn-head_2x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py b/mmdetection/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py new file mode 100644 index 0000000..3582806 --- /dev/null +++ b/mmdetection/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py @@ -0,0 +1,19 @@ +_base_ = './grid-rcnn_r50_fpn_gn-head_2x_coco.py' + +# training schedule +max_epochs = 12 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.0001, by_epoch=False, begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] diff --git a/mmdetection/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py b/mmdetection/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py new file mode 100644 index 0000000..228fca2 --- /dev/null +++ b/mmdetection/configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py @@ -0,0 +1,160 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='GridRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='GridRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + with_reg=False, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + 
reg_class_agnostic=False), + grid_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + grid_head=dict( + type='GridHead', + grid_points=9, + num_convs=8, + in_channels=256, + point_feat_channels=64, + norm_cfg=dict(type='GN', num_groups=36), + loss_grid=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=15))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_radius=1, + pos_weight=-1, + max_num_grid=192, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.03, + nms=dict(type='nms', iou_threshold=0.3), + max_per_img=100))) +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# training schedule +max_epochs = 25 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 80, + by_epoch=False, + begin=0, + end=3665), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[17, 23], + gamma=0.1) +] + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py b/mmdetection/configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py new file mode 100644 index 0000000..dddf157 --- /dev/null +++ b/mmdetection/configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py @@ -0,0 +1,13 @@ +_base_ = './grid-rcnn_r50_fpn_gn-head_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py b/mmdetection/configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py new file mode 100644 index 0000000..e4ff50f --- /dev/null +++ b/mmdetection/configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py @@ -0,0 +1,13 @@ +_base_ = './grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/grid_rcnn/metafile.yml b/mmdetection/configs/grid_rcnn/metafile.yml new file mode 100644 index 0000000..cee91e3 --- /dev/null +++ b/mmdetection/configs/grid_rcnn/metafile.yml @@ -0,0 +1,101 @@ +Collections: + - Name: Grid R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RPN + - Dilated Convolution + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/1906.05688 + Title: 'Grid R-CNN' + README: configs/grid_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/grid_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: grid-rcnn_r50_fpn_gn-head_2x_coco + In Collection: Grid R-CNN + Config: configs/grid_rcnn/grid-rcnn_r50_fpn_gn-head_2x_coco.py + Metadata: + Training Memory (GB): 5.1 + inference time (ms/im): + - value: 66.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco/grid_rcnn_r50_fpn_gn-head_2x_coco_20200130-6cca8223.pth + + - Name: grid-rcnn_r101_fpn_gn-head_2x_coco + In Collection: Grid R-CNN + Config: configs/grid_rcnn/grid-rcnn_r101_fpn_gn-head_2x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 79.37 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_r101_fpn_gn-head_2x_coco/grid_rcnn_r101_fpn_gn-head_2x_coco_20200309-d6eca030.pth + + - Name: grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco + In Collection: Grid R-CNN + Config: configs/grid_rcnn/grid-rcnn_x101-32x4d_fpn_gn-head_2x_coco.py + Metadata: + Training Memory (GB): 8.3 + inference time (ms/im): + - value: 92.59 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + 
Metrics: + box AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_32x4d_fpn_gn-head_2x_coco_20200130-d8f0e3ff.pth + + - Name: grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco + In Collection: Grid R-CNN + Config: configs/grid_rcnn/grid-rcnn_x101-64x4d_fpn_gn-head_2x_coco.py + Metadata: + Training Memory (GB): 11.3 + inference time (ms/im): + - value: 129.87 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/grid_rcnn/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco/grid_rcnn_x101_64x4d_fpn_gn-head_2x_coco_20200204-ec76a754.pth diff --git a/mmdetection/configs/groie/README.md b/mmdetection/configs/groie/README.md new file mode 100644 index 0000000..9792df9 --- /dev/null +++ b/mmdetection/configs/groie/README.md @@ -0,0 +1,72 @@ +# GRoIE + +> [A novel Region of Interest Extraction Layer for Instance Segmentation](https://arxiv.org/abs/2004.13665) + + + +## Abstract + +Given the wide diffusion of deep neural network architectures for computer vision tasks, several new applications are nowadays more and more feasible. Among them, a particular attention has been recently given to instance segmentation, by exploiting the results achievable by two-stage networks (such as Mask R-CNN or Faster R-CNN), derived from R-CNN. In these complex architectures, a crucial role is played by the Region of Interest (RoI) extraction layer, devoted to extracting a coherent subset of features from a single Feature Pyramid Network (FPN) layer attached on top of a backbone. +This paper is motivated by the need to overcome the limitations of existing RoI extractors which select only one (the best) layer from FPN. Our intuition is that all the layers of FPN retain useful information. Therefore, the proposed layer (called Generic RoI Extractor - GRoIE) introduces non-local building blocks and attention mechanisms to boost the performance. +A comprehensive ablation study at component level is conducted to find the best set of algorithms and parameters for the GRoIE layer. Moreover, GRoIE can be integrated seamlessly with every two-stage architecture for both object detection and instance segmentation tasks. Therefore, the improvements brought about by the use of GRoIE in different state-of-the-art architectures are also evaluated. The proposed layer leads up to gain a 1.1% AP improvement on bounding box detection and 1.7% AP improvement on instance segmentation. + +
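The mechanism is easiest to see as three steps: pool the RoI from every FPN level (instead of a single "best" level), pre-process each pooled map, sum them, and post-process the aggregate. The sketch below is a minimal stand-in for mmdet's `GenericRoIExtractor` used in the configs that follow; plain convolutions replace the `ConvModule` pre-processing and `GeneralizedAttention` post-processing blocks those configs specify.

```python
import torch
import torch.nn as nn
from torchvision.ops import roi_align


class TinyGRoIE(nn.Module):
    """Illustrative sum-aggregating RoI extractor over all FPN levels."""

    def __init__(self, channels=256, out_size=7, strides=(4, 8, 16, 32)):
        super().__init__()
        self.out_size = out_size
        self.strides = strides
        self.pre = nn.Conv2d(channels, channels, 5, padding=2)   # per-level pre-processing
        self.post = nn.Conv2d(channels, channels, 3, padding=1)  # post-processing of the sum

    def forward(self, feats, rois):
        # feats: list of FPN maps (N, C, Hi, Wi); rois: (R, 5) as (batch_idx, x1, y1, x2, y2)
        acc = 0
        for feat, stride in zip(feats, self.strides):
            pooled = roi_align(feat, rois, self.out_size,
                               spatial_scale=1.0 / stride, sampling_ratio=2)
            acc = acc + self.pre(pooled)   # aggregation='sum' over levels
        return self.post(acc)              # (R, C, out_size, out_size)


# toy usage
feats = [torch.randn(1, 256, s, s) for s in (200, 100, 50, 25)]
rois = torch.tensor([[0., 10., 10., 100., 120.]])
out = TinyGRoIE()(feats, rois)             # -> (1, 256, 7, 7)
```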
    + +## Introduction + +By Leonardo Rossi, Akbar Karimi and Andrea Prati from +[IMPLab](http://implab.ce.unipr.it/). + +We provide configs to reproduce the results in the paper for +"*A novel Region of Interest Extraction Layer for Instance Segmentation*" +on COCO object detection. + +This paper is motivated by the need to overcome to the limitations of existing +RoI extractors which select only one (the best) layer from FPN. + +Our intuition is that all the layers of FPN retain useful information. + +Therefore, the proposed layer (called Generic RoI Extractor - **GRoIE**) +introduces non-local building blocks and attention mechanisms to boost the +performance. + +## Results and Models + +The results on COCO 2017 minival (5k images) are shown in the below table. + +### Application of GRoIE to different architectures + +| Backbone | Method | Lr schd | box AP | mask AP | Config | Download | +| :-------: | :-------------: | :-----: | :----: | :-----: | :------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | Faster Original | 1x | 37.4 | | [config](../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| R-50-FPN | + GRoIE | 1x | 38.3 | | [config](./faste-rcnn_r50_fpn_groie_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715.log.json) | +| R-50-FPN | Grid R-CNN | 1x | 39.1 | | [config](./grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco/grid_rcnn_r50_fpn_gn-head_groie_1x_coco_20200605_202059-4b75d86f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco/grid_rcnn_r50_fpn_gn-head_groie_1x_coco_20200605_202059.log.json) | +| R-50-FPN | + GRoIE | 1x | | | [config](./grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py) | | +| R-50-FPN | Mask R-CNN | 1x | 38.2 | 34.7 | [config](../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json) | +| R-50-FPN | + GRoIE | 1x | 39.0 | 36.0 | [config](./mask-rcnn_r50_fpn_groie_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715-50d90c74.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715.log.json) | +| R-50-FPN | GC-Net | 1x | 40.7 | 36.5 | [config](../gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202-50b90e5c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200202_085547.log.json) | +| R-50-FPN | + GRoIE | 1x | 41.0 | 37.8 | [config](./mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715-42eb79e1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715-42eb79e1.pth) | +| R-101-FPN | GC-Net | 1x | 42.2 | 37.8 | [config](../gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206-8407a3f0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/gcnet/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_1x_coco_20200206_142508.log.json) | +| R-101-FPN | + GRoIE | 1x | 42.6 | 38.7 | [config](./mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507-8daae01c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507.log.json) | + +## Citation + +If you use this work or benchmark in your research, please cite this project. + +```latex +@inproceedings{rossi2021novel, + title={A novel region of interest extraction layer for instance segmentation}, + author={Rossi, Leonardo and Karimi, Akbar and Prati, Andrea}, + booktitle={2020 25th International Conference on Pattern Recognition (ICPR)}, + pages={2203--2209}, + year={2021}, + organization={IEEE} +} +``` + +## Contact + +The implementation of GRoIE is currently maintained by +[Leonardo Rossi](https://github.com/hachreak/). 
diff --git a/mmdetection/configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py b/mmdetection/configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py new file mode 100644 index 0000000..0fbe8a3 --- /dev/null +++ b/mmdetection/configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py @@ -0,0 +1,25 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)))) diff --git a/mmdetection/configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py b/mmdetection/configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py new file mode 100644 index 0000000..dadccb7 --- /dev/null +++ b/mmdetection/configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = '../grid_rcnn/grid-rcnn_r50_fpn_gn-head_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)), + grid_roi_extractor=dict( + type='GenericRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)))) diff --git a/mmdetection/configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py b/mmdetection/configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py new file mode 100644 index 0000000..5699b42 --- /dev/null +++ b/mmdetection/configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = '../gcnet/mask-rcnn_r101-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)), + mask_roi_extractor=dict( + type='GenericRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + 
attention_type='0100', + kv_stride=2)))) diff --git a/mmdetection/configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py b/mmdetection/configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py new file mode 100644 index 0000000..4c9521e --- /dev/null +++ b/mmdetection/configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)), + mask_roi_extractor=dict( + type='GenericRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)))) diff --git a/mmdetection/configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py b/mmdetection/configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py new file mode 100644 index 0000000..22e97b6 --- /dev/null +++ b/mmdetection/configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = '../gcnet/mask-rcnn_r50-syncbn-gcb-r4-c3-c5_fpn_1x_coco.py' +# model settings +model = dict( + roi_head=dict( + bbox_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='sum', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)), + mask_roi_extractor=dict( + type='GenericRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32], + pre_cfg=dict( + type='ConvModule', + in_channels=256, + out_channels=256, + kernel_size=5, + padding=2, + inplace=False, + ), + post_cfg=dict( + type='GeneralizedAttention', + in_channels=256, + spatial_range=-1, + num_heads=6, + attention_type='0100', + kv_stride=2)))) diff --git a/mmdetection/configs/groie/metafile.yml b/mmdetection/configs/groie/metafile.yml new file mode 100644 index 0000000..ce95700 --- /dev/null +++ b/mmdetection/configs/groie/metafile.yml @@ -0,0 +1,94 @@ +Collections: + - Name: GRoIE + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Generic RoI Extractor + - FPN + - RPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/2004.13665 + Title: 'A novel Region of Interest Extraction Layer for Instance Segmentation' + README: configs/groie/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/roi_heads/roi_extractors/groie.py#L15 + Version: v2.1.0 + +Models: + - Name: faster-rcnn_r50_fpn_groie_1x_coco + In 
Collection: GRoIE + Config: configs/groie/faste-rcnn_r50_fpn_groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/faster_rcnn_r50_fpn_groie_1x_coco/faster_rcnn_r50_fpn_groie_1x_coco_20200604_211715-66ee9516.pth + + - Name: grid-rcnn_r50_fpn_gn-head-groie_1x_coco + In Collection: GRoIE + Config: configs/groie/grid-rcnn_r50_fpn_gn-head-groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco/grid_rcnn_r50_fpn_gn-head_groie_1x_coco_20200605_202059-4b75d86f.pth + + - Name: mask-rcnn_r50_fpn_groie_1x_coco + In Collection: GRoIE + Config: configs/groie/mask-rcnn_r50_fpn_groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_groie_1x_coco/mask_rcnn_r50_fpn_groie_1x_coco_20200604_211715-50d90c74.pth + + - Name: mask-rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco + In Collection: GRoIE + Config: configs/groie/mask-rcnn_r50_fpn_syncbn-r4-gcb-c3-c5-groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200604_211715-42eb79e1.pth + + - Name: mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco + In Collection: GRoIE + Config: configs/groie/mask-rcnn_r101_fpn_syncbn-r4-gcb_c3-c5-groie_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco_20200607_224507-8daae01c.pth diff --git a/mmdetection/configs/grounding_dino/README.md b/mmdetection/configs/grounding_dino/README.md new file mode 100644 index 0000000..715b630 --- /dev/null +++ b/mmdetection/configs/grounding_dino/README.md @@ -0,0 +1,172 @@ +# Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection + +[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) + + + +## Abstract + +In this paper, we present an open-set object detector, called Grounding DINO, by marrying Transformer-based detector DINO with grounded pre-training, which can detect arbitrary objects with human inputs such as category names or referring expressions. The key solution of open-set object detection is introducing language to a closed-set detector for open-set concept generalization. To effectively fuse language and vision modalities, we conceptually divide a closed-set detector into three phases and propose a tight fusion solution, which includes a feature enhancer, a language-guided query selection, and a cross-modality decoder for cross-modality fusion. 
While previous works mainly evaluate open-set object detection on novel categories, we propose to also perform evaluations on referring expression comprehension for objects specified with attributes. Grounding DINO performs remarkably well on all three settings, including benchmarks on COCO, LVIS, ODinW, and RefCOCO/+/g. Grounding DINO achieves a 52.5 AP on the COCO detection zero-shot transfer benchmark, i.e., without any training data from COCO. It sets a new record on the ODinW zero-shot benchmark with a mean 26.1 AP. + +
    + +## Installation + +```shell +cd $MMDETROOT + +# source installation +pip install -r requirements/multimodal.txt + +# or mim installation +mim install mmdet[multimodal] +``` + +## NOTE + +Grounding DINO utilizes BERT as the language model, which requires access to https://huggingface.co/. If you encounter connection errors due to network access, you can download the required files on a computer with internet access and save them locally. Finally, modify the `lang_model_name` field in the config to the local path. Please refer to the following code: + +```python +from transformers import BertConfig, BertModel +from transformers import AutoTokenizer + +config = BertConfig.from_pretrained("bert-base-uncased") +model = BertModel.from_pretrained("bert-base-uncased", add_pooling_layer=False, config=config) +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + +config.save_pretrained("your path/bert-base-uncased") +model.save_pretrained("your path/bert-base-uncased") +tokenizer.save_pretrained("your path/bert-base-uncased") +``` + +## Inference + +``` +cd $MMDETROOT + +wget https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth + +python demo/image_demo.py \ + demo/demo.jpg \ + configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py \ + --weights groundingdino_swint_ogc_mmdet-822d7e9d.pth \ + --texts 'bench . car .' +``` + +
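+Building on the NOTE above, once the BERT files have been saved locally it should be enough to point `lang_model_name` at that directory; a minimal sketch, assuming the placeholder path below is replaced with the actual save location:
+
+```python
+# Sketch: use a locally saved copy of bert-base-uncased instead of fetching it
+# from huggingface.co. The path below is a placeholder, not a real location.
+lang_model_name = '/your/local/path/bert-base-uncased'
+
+# When inheriting one of the Grounding DINO configs, overriding only the
+# language model name is enough; everything else is merged from _base_.
+model = dict(language_model=dict(name=lang_model_name))
+```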
    + +## Results and Models + +| Model | Backbone | Style | COCO mAP | Official COCO mAP | Pre-Train Data | Config | Download | +| :----------------: | :------: | :-------: | :--------: | :---------------: | :----------------------------------------------: | :------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Grounding DINO-T | Swin-T | Zero-shot | 48.5 | 48.4 | O365,GoldG,Cap4M | [config](grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth) | +| Grounding DINO-T | Swin-T | Finetune | 58.1(+0.9) | 57.2 | O365,GoldG,Cap4M | [config](grounding_dino_swin-t_finetune_16xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544-5f234b20.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544.log.json) | +| Grounding DINO-B | Swin-B | Zero-shot | 56.9 | 56.7 | COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO | [config](grounding_dino_swin-b_pretrain_mixeddata.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth) | +| Grounding DINO-B | Swin-B | Finetune | 59.7 | | COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO | [config](grounding_dino_swin-b_finetune_16xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco/grounding_dino_swin-b_finetune_16xb2_1x_coco_20230921_153201-f219e0c0.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco/grounding_dino_swin-b_finetune_16xb2_1x_coco_20230921_153201.log.json) | +| Grounding DINO-R50 | R50 | Scratch | 48.9(+0.8) | 48.1 | | [config](grounding_dino_r50_scratch_8xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco/grounding_dino_r50_scratch_1x_coco-fe0002f2.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco/20230922_114218.json) | + +Note: + +1. The weights corresponding to the zero-shot model are adopted from the official weights and converted using the [script](../../tools/model_converters/groundingdino_to_mmdet.py). We have not retrained the model for the time being. +2. Finetune refers to fine-tuning on the COCO 2017 dataset. The R50 model is trained using 8 NVIDIA GeForce 3090 GPUs, while the remaining models are trained using 16 NVIDIA GeForce 3090 GPUs. The GPU memory usage is approximately 8.5GB. +3. Our performance is higher than the official model due to two reasons: we modified the initialization strategy and introduced a log scaler. 
+ +## Custom Dataset + +To facilitate fine-tuning on custom datasets, we walk through a simple cat dataset as an example in the following steps. + +### 1. Dataset Preparation + +```shell +cd mmdetection +wget https://download.openmmlab.com/mmyolo/data/cat_dataset.zip +unzip cat_dataset.zip -d data/cat/ +``` + +The cat dataset is a single-category dataset with 144 images and has already been converted to COCO format. + +
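+Before editing any config, it can be worth sanity-checking the downloaded annotations. A small optional check, assuming `pycocotools` (already required by MMDetection) and the paths created above:
+
+```python
+# Optional sanity check of the cat dataset annotations (COCO format).
+from pycocotools.coco import COCO
+
+coco = COCO('data/cat/annotations/trainval.json')
+print('images in trainval split:', len(coco.getImgIds()))
+# Single-category dataset, so this should list only the 'cat' class.
+print('categories:', [c['name'] for c in coco.loadCats(coco.getCatIds())])
+```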
+[image: cat dataset]
+ +### 2. Config Preparation + +Because the cat dataset is small and simple, we train for 20 epochs on 8 GPUs, scale the learning rate accordingly, and train only the visual model while keeping the language model frozen. + +Details of the configuration can be found in [grounding_dino_swin-t_finetune_8xb2_20e_cat](grounding_dino_swin-t_finetune_8xb2_20e_cat.py). + +### 3. Visualization and Evaluation + +Because Grounding DINO is an open-set detection model, it can run detection on the cat dataset and be evaluated even without being trained on it. + +Single-image visualization is run as follows: + +```shell +cd mmdetection +python demo/image_demo.py data/cat/images/IMG_20211205_120756.jpg configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py --weights https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth --texts cat. +``` + +
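+The same visualization can also be driven from Python. A sketch, under the assumption that the mmdet 3.x `DetInferencer` (which `demo/image_demo.py` wraps) accepts the `texts` prompt in the same way as the command above:
+
+```python
+# Sketch of the single-image visualization above through the Python API.
+# Assumption: DetInferencer forwards `texts` the way demo/image_demo.py does.
+from mmdet.apis import DetInferencer
+
+inferencer = DetInferencer(
+    model='configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py',
+    weights='https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth')
+
+inferencer(
+    'data/cat/images/IMG_20211205_120756.jpg',
+    texts='cat.',
+    out_dir='outputs/')  # drawn predictions are written under outputs/vis/
+```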
+[image: cat dataset]
+ +Evaluation on the test set with a single GPU is run as follows: + +```shell +python tools/test.py configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth +``` + +```text + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.867 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = 1.000 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = 0.931 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.867 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.903 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=300 ] = 0.907 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=1000 ] = 0.907 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.907 +``` + +### 4. Model Training and Visualization + +```shell +./tools/dist_train.sh configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py 8 --work-dir cat_work_dir +``` + +The checkpoint with the best performance on the test set is saved automatically. The best model (at epoch 16) performs as follows: + +```text + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.905 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = 1.000 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = 0.923 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.905 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.927 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=300 ] = 0.937 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=1000 ] = 0.937 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.937 +``` + +After fine-tuning, the box AP on the cat dataset increases from 86.7 to 90.5. + +Running the single-image visualization again with the fine-tuned weights gives the following result: + +```shell +cd mmdetection +python demo/image_demo.py data/cat/images/IMG_20211205_120756.jpg configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py --weights cat_work_dir/best_coco_bbox_mAP_epoch_16.pth --texts cat. +``` + +
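+Note that `save_best='auto'` names the checkpoint after whichever epoch won, so the exact filename in the command above can vary between runs. A small helper to look it up, assuming the `cat_work_dir` layout produced by the training command in step 4:
+
+```python
+# Look up the best checkpoint written by `save_best='auto'` instead of
+# hard-coding the epoch number in the visualization command above.
+import glob
+
+best_ckpts = sorted(glob.glob('cat_work_dir/best_coco_bbox_mAP_epoch_*.pth'))
+print(best_ckpts)  # e.g. ['cat_work_dir/best_coco_bbox_mAP_epoch_16.pth']
+```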
+[image: cat dataset]
    diff --git a/mmdetection/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py b/mmdetection/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py new file mode 100644 index 0000000..623a29b --- /dev/null +++ b/mmdetection/configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py @@ -0,0 +1,208 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +lang_model_name = 'bert-base-uncased' + +model = dict( + type='GroundingDINO', + num_queries=900, + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=False, + ), + language_model=dict( + type='BertModel', + name=lang_model_name, + pad_to_max=False, + use_sub_sentence_represent=True, + special_tokens_list=['[CLS]', '[SEP]', '.', '?'], + add_pooling_layer=False, + ), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + bias=True, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + num_cp=6, + # visual layer config + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), + # text layer config + text_layer_cfg=dict( + self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)), + # fusion layer config + fusion_layer_cfg=dict( + v_dim=256, + l_dim=256, + embed_dim=1024, + num_heads=4, + init_values=1e-4), + ), + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + # query self attention layer + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to text + cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to image + cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, normalize=True, offset=0.0, temperature=20), + bbox_head=dict( + type='GroundingDINOHead', + num_classes=80, + sync_cls_avg_factor=True, + contrastive_cfg=dict(max_text_len=256, log_scale='auto', bias=True), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), # 2.0 in DeformDETR + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + dn_cfg=dict( # TODO: Move to model.train_cfg ? 
+ label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, + num_dn_queries=100)), # TODO: half num_dn_queries + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='BinaryFocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='FixScaleResize', scale=(800, 1333), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities')) +] + +train_dataloader = dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), + pipeline=train_pipeline, + return_classes=True)) +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, return_classes=True)) +test_dataloader = val_dataloader + +# We did not adopt the official 24e optimizer strategy +# because the results indicate that the current strategy is superior. +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001, # 0.0002 for DeformDETR + weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1) + })) +# learning policy +max_epochs = 12 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py b/mmdetection/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py new file mode 100644 index 0000000..3554ee2 --- /dev/null +++ b/mmdetection/configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = [ + './grounding_dino_swin-t_finetune_16xb2_1x_coco.py', +] + +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth' # noqa +model = dict( + type='GroundingDINO', + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + drop_path_rate=0.3, + patch_norm=True), + neck=dict(in_channels=[256, 512, 1024]), +) diff --git a/mmdetection/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py b/mmdetection/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py new file mode 100644 index 0000000..92f327f --- /dev/null +++ b/mmdetection/configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py @@ -0,0 +1,16 @@ +_base_ = [ + './grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py', +] + +model = dict( + type='GroundingDINO', + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + drop_path_rate=0.3, + patch_norm=True), + neck=dict(in_channels=[256, 512, 1024]), +) diff --git a/mmdetection/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py b/mmdetection/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py new file mode 100644 index 0000000..0c6403e --- /dev/null +++ b/mmdetection/configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py @@ -0,0 +1,204 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth' # noqa +lang_model_name = 'bert-base-uncased' + +model = dict( + type='GroundingDINO', + num_queries=900, + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=False, + ), + language_model=dict( + type='BertModel', + name=lang_model_name, + pad_to_max=False, + use_sub_sentence_represent=True, + special_tokens_list=['[CLS]', '[SEP]', '.', '?'], + add_pooling_layer=False, + ), + backbone=dict( + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=True, + convert_weights=False), + neck=dict( + type='ChannelMapper', + in_channels=[192, 384, 768], + kernel_size=1, + out_channels=256, + act_cfg=None, + bias=True, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + num_cp=6, + # visual layer config + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), + # text layer config + text_layer_cfg=dict( + self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0), + ffn_cfg=dict( + 
embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)), + # fusion layer config + fusion_layer_cfg=dict( + v_dim=256, + l_dim=256, + embed_dim=1024, + num_heads=4, + init_values=1e-4), + ), + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + # query self attention layer + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to text + cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to image + cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, normalize=True, offset=0.0, temperature=20), + bbox_head=dict( + type='GroundingDINOHead', + num_classes=80, + sync_cls_avg_factor=True, + contrastive_cfg=dict(max_text_len=256, log_scale=0.0, bias=False), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), # 2.0 in DeformDETR + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + dn_cfg=dict( # TODO: Move to model.train_cfg ? + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, + num_dn_queries=100)), # TODO: half num_dn_queries + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='BinaryFocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='FixScaleResize', scale=(800, 1333), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities')) +] + +train_dataloader = dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), + pipeline=train_pipeline, + return_classes=True)) +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, return_classes=True)) +test_dataloader = val_dataloader + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', 
lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1) + })) +# learning policy +max_epochs = 12 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=32) diff --git a/mmdetection/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py b/mmdetection/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py new file mode 100644 index 0000000..c2265e8 --- /dev/null +++ b/mmdetection/configs/grounding_dino/grounding_dino_swin-t_finetune_8xb2_20e_cat.py @@ -0,0 +1,56 @@ +_base_ = 'grounding_dino_swin-t_finetune_16xb2_1x_coco.py' + +data_root = 'data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(220, 20, 60)]) + +model = dict(bbox_head=dict(num_classes=num_classes)) + +train_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +max_epoch = 20 + +default_hooks = dict( + checkpoint=dict(interval=1, max_keep_ckpts=1, save_best='auto'), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epoch, val_interval=1) + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=30), + dict( + type='MultiStepLR', + begin=0, + end=max_epoch, + by_epoch=True, + milestones=[15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(lr=0.00005), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1), + 'language_model': dict(lr_mult=0), + })) + +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py b/mmdetection/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py new file mode 100644 index 0000000..1117cb0 --- /dev/null +++ b/mmdetection/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py @@ -0,0 +1,127 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +lang_model_name = 'bert-base-uncased' + +model = dict( + type='GroundingDINO', + num_queries=900, + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=False, + ), + language_model=dict( + type='BertModel', + name=lang_model_name, + pad_to_max=False, + use_sub_sentence_represent=True, + special_tokens_list=['[CLS]', '[SEP]', '.', '?'], + add_pooling_layer=True, + ), + backbone=dict( + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + 
patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=False), + neck=dict( + type='ChannelMapper', + in_channels=[192, 384, 768], + kernel_size=1, + out_channels=256, + act_cfg=None, + bias=True, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + # visual layer config + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), + # text layer config + text_layer_cfg=dict( + self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)), + # fusion layer config + fusion_layer_cfg=dict( + v_dim=256, + l_dim=256, + embed_dim=1024, + num_heads=4, + init_values=1e-4), + ), + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + # query self attention layer + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to text + cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to image + cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, normalize=True, offset=0.0, temperature=20), + bbox_head=dict( + type='GroundingDINOHead', + num_classes=80, + sync_cls_avg_factor=True, + contrastive_cfg=dict(max_text_len=256), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), # 2.0 in DeformDETR + loss_bbox=dict(type='L1Loss', loss_weight=5.0)), + dn_cfg=dict( # TODO: Move to model.train_cfg ? + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, + num_dn_queries=100)), # TODO: half num_dn_queries + # training and testing settings + train_cfg=None, + test_cfg=dict(max_per_img=300)) + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, return_classes=True)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/grounding_dino/metafile.yml b/mmdetection/configs/grounding_dino/metafile.yml new file mode 100644 index 0000000..dcb5ebf --- /dev/null +++ b/mmdetection/configs/grounding_dino/metafile.yml @@ -0,0 +1,67 @@ +Collections: + - Name: Grounding DINO + Metadata: + Training Data: Objects365, GoldG, CC3M and COCO + Training Techniques: + - AdamW + - Multi Scale Train + - Gradient Clip + Training Resources: 3090 GPUs + Architecture: + - Swin Transformer + - BERT + Paper: + URL: https://arxiv.org/abs/2303.05499 + Title: 'Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection +' + README: configs/grounding_dino/README.md + Code: + URL: + Version: v3.0.0 + +Models: + - Name: grounding_dino_swin-t_pretrain_obj365_goldg_cap4m + In Collection: Grounding DINO + Config: configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.5 + Weights: 
https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth + - Name: grounding_dino_swin-b_pretrain_mixeddata + In Collection: Grounding DINO + Config: configs/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 56.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth + - Name: grounding_dino_swin-t_finetune_16xb2_1x_coco + In Collection: Grounding DINO + Config: configs/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 58.1 + Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544-5f234b20.pth + - Name: grounding_dino_swin-b_finetune_16xb2_1x_coco + In Collection: Grounding DINO + Config: configs/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 59.7 + Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-b_finetune_16xb2_1x_coco/grounding_dino_swin-b_finetune_16xb2_1x_coco_20230921_153201-f219e0c0.pth + - Name: grounding_dino_r50_scratch_8xb2_1x_coco + In Collection: Grounding DINO + Config: configs/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco.py + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_r50_scratch_8xb2_1x_coco/grounding_dino_r50_scratch_1x_coco-fe0002f2.pth diff --git a/mmdetection/configs/guided_anchoring/README.md b/mmdetection/configs/guided_anchoring/README.md new file mode 100644 index 0000000..1a5e505 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/README.md @@ -0,0 +1,59 @@ +# Guided Anchoring + +> [Region Proposal by Guided Anchoring](https://arxiv.org/abs/1901.03278) + + + +## Abstract + +Region anchors are the cornerstone of modern object detection techniques. State-of-the-art detectors mostly rely on a dense anchoring scheme, where anchors are sampled uniformly over the spatial domain with a predefined set of scales and aspect ratios. In this paper, we revisit this foundational stage. Our study shows that it can be done much more effectively and efficiently. Specifically, we present an alternative scheme, named Guided Anchoring, which leverages semantic features to guide the anchoring. The proposed method jointly predicts the locations where the center of objects of interest are likely to exist as well as the scales and aspect ratios at different locations. On top of predicted anchor shapes, we mitigate the feature inconsistency with a feature adaption module. We also study the use of high-quality proposals to improve detection performance. The anchoring scheme can be seamlessly integrated into proposal methods and detectors. With Guided Anchoring, we achieve 9.1% higher recall on MS COCO with 90% fewer anchors than the RPN baseline. We also adopt Guided Anchoring in Fast R-CNN, Faster R-CNN and RetinaNet, respectively improving the detection mAP by 2.2%, 2.7% and 1.2%. + +
    + +## Results and Models + +The results on COCO 2017 val is shown in the below table. (results on test-dev are usually slightly higher than val). + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AR 1000 | Config | Download | +| :----: | :-------------: | :-----: | :-----: | :------: | :------------: | :-----: | :------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| GA-RPN | R-50-FPN | caffe | 1x | 5.3 | 15.8 | 68.4 | [config](./ga-rpn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531-899008a6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531_011819.log.json) | +| GA-RPN | R-101-FPN | caffe | 1x | 7.3 | 13.0 | 69.5 | [config](./ga-rpn_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531-ca9ba8fb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531_011812.log.json) | +| GA-RPN | X-101-32x4d-FPN | pytorch | 1x | 8.5 | 10.0 | 70.6 | [config](./ga-rpn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220-c28d1b18.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220_221326.log.json) | +| GA-RPN | X-101-64x4d-FPN | pytorch | 1x | 7.1 | 7.5 | 71.2 | [config](./ga-rpn_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225-3c6e1aa2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225_152704.log.json) | + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------------: | :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| GA-Faster RCNN | R-50-FPN | caffe | 1x | 5.5 | | 39.6 | [config](./ga-faster-rcnn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718-a11ccfe6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718.log.json) | +| GA-Faster RCNN | R-101-FPN | caffe | 1x | 7.5 | | 41.5 | 
[config](./ga-faster-rcnn_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_bbox_mAP-0.415_20200505_115528-fb82e499.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_20200505_115528.log.json) | +| GA-Faster RCNN | X-101-32x4d-FPN | pytorch | 1x | 8.7 | 9.7 | 43.0 | [config](./ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215-1ded9da3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215_184547.log.json) | +| GA-Faster RCNN | X-101-64x4d-FPN | pytorch | 1x | 11.8 | 7.3 | 43.9 | [config](./ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215-0fa7bde7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215_104455.log.json) | +| GA-RetinaNet | R-50-FPN | caffe | 1x | 3.5 | 16.8 | 36.9 | [config](./ga-retinanet_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020_225450.log.json) | +| GA-RetinaNet | R-101-FPN | caffe | 1x | 5.5 | 12.9 | 39.0 | [config](./ga-retinanet_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531-6266453c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531_012847.log.json) | +| GA-RetinaNet | X-101-32x4d-FPN | pytorch | 1x | 6.9 | 10.6 | 40.5 | [config](./ga-retinanet_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219-40c56caa.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219_223025.log.json) | +| GA-RetinaNet | X-101-64x4d-FPN | pytorch | 1x | 9.9 | 7.7 | 41.3 | [config](./ga-retinanet_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226-ef9f7f1f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226_221123.log.json) | + +- In the Guided Anchoring paper, `score_thr` is set to 0.001 in Fast/Faster RCNN and 0.05 in RetinaNet for both baselines and Guided Anchoring. + +- Performance on COCO test-dev benchmark are shown as follows. 
+ +| Method | Backbone | Style | Lr schd | Aug Train | Score thr | AP | AP_50 | AP_75 | AP_small | AP_medium | AP_large | Download | +| :------------: | :-------: | :---: | :-----: | :-------: | :-------: | :-: | :---: | :---: | :------: | :-------: | :------: | :------: | +| GA-Faster RCNN | R-101-FPN | caffe | 1x | F | 0.05 | | | | | | | | +| GA-Faster RCNN | R-101-FPN | caffe | 1x | F | 0.001 | | | | | | | | +| GA-RetinaNet | R-101-FPN | caffe | 1x | F | 0.05 | | | | | | | | +| GA-RetinaNet | R-101-FPN | caffe | 2x | T | 0.05 | | | | | | | | + +## Citation + +We provide config files to reproduce the results in the CVPR 2019 paper for [Region Proposal by Guided Anchoring](https://arxiv.org/abs/1901.03278). + +```latex +@inproceedings{wang2019region, + title={Region Proposal by Guided Anchoring}, + author={Jiaqi Wang and Kai Chen and Shuo Yang and Chen Change Loy and Dahua Lin}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} +``` diff --git a/mmdetection/configs/guided_anchoring/ga-fast-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-fast-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..2d0579c --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-fast-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,66 @@ +_base_ = '../fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6), + sampler=dict(num=256))), + test_cfg=dict(rcnn=dict(score_thr=1e-3))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=300), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'proposals', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadProposals', num_max_proposals=None), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img', 'proposals']), + ]) +] +# TODO: support loading proposals +data = dict( + train=dict( + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_train2017.pkl', + pipeline=train_pipeline), + val=dict( + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl', + pipeline=test_pipeline), + test=dict( + proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl', + pipeline=test_pipeline)) +optimizer_config = dict( + _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) diff --git 
a/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..f585dc3 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './ga-faster-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..6cd44de --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,64 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='GARPNHead', + in_channels=256, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.14, 0.14]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.11, 0.11]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + center_ratio=0.2, + ignore_ratio=0.5), + rpn_proposal=dict(nms_post=1000, max_per_img=300), + rcnn=dict( + assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6), + sampler=dict(type='RandomSampler', num=256))), + test_cfg=dict( + rpn=dict(nms_post=1000, max_per_img=300), rcnn=dict(score_thr=1e-3))) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..3007fbe --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,64 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='GARPNHead', + in_channels=256, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 
0.07, 0.14, 0.14]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.11, 0.11]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(target_stds=[0.05, 0.05, 0.1, 0.1]))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + center_ratio=0.2, + ignore_ratio=0.5), + rpn_proposal=dict(nms_post=1000, max_per_img=300), + rcnn=dict( + assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6), + sampler=dict(type='RandomSampler', num=256))), + test_cfg=dict( + rpn=dict(nms_post=1000, max_per_img=300), rcnn=dict(score_thr=1e-3))) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..8a22a1e --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga-faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..3d6aaea --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga-faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..9adbae5 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './ga-retinanet_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_ms-2x.py b/mmdetection/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_ms-2x.py new file mode 100644 index 0000000..012e89b --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_ms-2x.py @@ -0,0 +1,34 @@ +_base_ = './ga-retinanet_r101-caffe_fpn_1x_coco.py' + 
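+# Multiscale 2x schedule: `RandomResize` below samples a training scale
+# between (1333, 480) and (1333, 960) for every iteration, and training is
+# extended to 24 epochs with learning-rate drops at epochs 16 and 22.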
+train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 480), (1333, 960)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# learning policy +max_epochs = 24 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3.0, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..b62aba6 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,61 @@ +_base_ = '../retinanet/retinanet_r50-caffe_fpn_1x_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='GARetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict(neg_iou_thr=0.5, min_pos_iou=0.0), + center_ratio=0.2, + ignore_ratio=0.5)) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/guided_anchoring/ga-retinanet_r50_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-retinanet_r50_fpn_1x_coco.py new file mode 100644 index 0000000..da39c70 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,61 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='GARetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + bbox_coder=dict( + 
type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + assigner=dict(neg_iou_thr=0.5, min_pos_iou=0.0), + center_ratio=0.2, + ignore_ratio=0.5)) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..478a8e5 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga-retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..cb7721d --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga-retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..b375c87 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = './ga-rpn_r50-caffe_fpn_1x_coco.py' +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..aa58426 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,57 @@ +_base_ = '../rpn/rpn_r50-caffe_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='GARPNHead', + in_channels=256, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + 
type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.14, 0.14]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.11, 0.11]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + center_ratio=0.2, + ignore_ratio=0.5)), + test_cfg=dict(rpn=dict(nms_post=1000))) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/guided_anchoring/ga-rpn_r50_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-rpn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..2973f27 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-rpn_r50_fpn_1x_coco.py @@ -0,0 +1,57 @@ +_base_ = '../rpn/rpn_r50_fpn_1x_coco.py' +model = dict( + rpn_head=dict( + _delete_=True, + type='GARPNHead', + in_channels=256, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.14, 0.14]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.07, 0.07, 0.11, 0.11]), + loc_filter_thr=0.01, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + rpn=dict( + ga_assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ga_sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + center_ratio=0.2, + ignore_ratio=0.5)), + test_cfg=dict(rpn=dict(nms_post=1000))) +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..276d45d --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga-rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git 
a/mmdetection/configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..f29fe9a --- /dev/null +++ b/mmdetection/configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ga-rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/guided_anchoring/metafile.yml b/mmdetection/configs/guided_anchoring/metafile.yml new file mode 100644 index 0000000..516b3e9 --- /dev/null +++ b/mmdetection/configs/guided_anchoring/metafile.yml @@ -0,0 +1,246 @@ +Collections: + - Name: Guided Anchoring + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - Guided Anchoring + - ResNet + Paper: + URL: https://arxiv.org/abs/1901.03278 + Title: 'Region Proposal by Guided Anchoring' + README: configs/guided_anchoring/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/dense_heads/ga_retina_head.py#L10 + Version: v2.0.0 + +Models: + - Name: ga-rpn_r50-caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-rpn_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.3 + inference time (ms/im): + - value: 63.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Region Proposal + Dataset: COCO + Metrics: + AR@1000: 68.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_coco/ga_rpn_r50_caffe_fpn_1x_coco_20200531-899008a6.pth + + - Name: ga-rpn_r101-caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-rpn_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.3 + inference time (ms/im): + - value: 76.92 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Region Proposal + Dataset: COCO + Metrics: + AR@1000: 69.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_r101_caffe_fpn_1x_coco/ga_rpn_r101_caffe_fpn_1x_coco_20200531-ca9ba8fb.pth + + - Name: ga-rpn_x101-32x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-rpn_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.5 + inference time (ms/im): + - value: 100 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Region Proposal + Dataset: COCO + Metrics: + AR@1000: 70.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x_coco/ga_rpn_x101_32x4d_fpn_1x_coco_20200220-c28d1b18.pth + + - Name: ga-rpn_x101-64x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-rpn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 133.33 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Region Proposal + Dataset: COCO + Metrics: + AR@1000: 70.6 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_rpn_x101_64x4d_fpn_1x_coco/ga_rpn_x101_64x4d_fpn_1x_coco_20200225-3c6e1aa2.pth + + - Name: ga-faster-rcnn_r50-caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-faster-rcnn_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r50_caffe_fpn_1x_coco/ga_faster_r50_caffe_fpn_1x_coco_20200702_000718-a11ccfe6.pth + + - Name: ga-faster-rcnn_r101-caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-faster-rcnn_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_r101_caffe_fpn_1x_coco/ga_faster_r101_caffe_fpn_1x_coco_bbox_mAP-0.415_20200505_115528-fb82e499.pth + + - Name: ga-faster-rcnn_x101-32x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-faster-rcnn_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.7 + inference time (ms/im): + - value: 103.09 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_32x4d_fpn_1x_coco/ga_faster_x101_32x4d_fpn_1x_coco_20200215-1ded9da3.pth + + - Name: ga-faster-rcnn_x101-64x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-faster-rcnn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 11.8 + inference time (ms/im): + - value: 136.99 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_faster_x101_64x4d_fpn_1x_coco/ga_faster_x101_64x4d_fpn_1x_coco_20200215-0fa7bde7.pth + + - Name: ga-retinanet_r50-caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-retinanet_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.5 + inference time (ms/im): + - value: 59.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_coco/ga_retinanet_r50_caffe_fpn_1x_coco_20201020-39581c6f.pth + + - Name: ga-retinanet_r101-caffe_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-retinanet_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_r101_caffe_fpn_1x_coco/ga_retinanet_r101_caffe_fpn_1x_coco_20200531-6266453c.pth + + - Name: ga-retinanet_x101-32x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: 
configs/guided_anchoring/ga-retinanet_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.9 + inference time (ms/im): + - value: 94.34 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x_coco/ga_retinanet_x101_32x4d_fpn_1x_coco_20200219-40c56caa.pth + + - Name: ga-retinanet_x101-64x4d_fpn_1x_coco + In Collection: Guided Anchoring + Config: configs/guided_anchoring/ga-retinanet_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.9 + inference time (ms/im): + - value: 129.87 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/guided_anchoring/ga_retinanet_x101_64x4d_fpn_1x_coco/ga_retinanet_x101_64x4d_fpn_1x_coco_20200226-ef9f7f1f.pth diff --git a/mmdetection/configs/hrnet/README.md b/mmdetection/configs/hrnet/README.md new file mode 100644 index 0000000..fc1ed0c --- /dev/null +++ b/mmdetection/configs/hrnet/README.md @@ -0,0 +1,101 @@ +# HRNet + +> [Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1902.09212) + + + +## Abstract + +This is an official pytorch implementation of Deep High-Resolution Representation Learning for Human Pose Estimation. In this work, we are interested in the human pose estimation problem with a focus on learning reliable high-resolution representations. Most existing methods recover high-resolution representations from low-resolution representations produced by a high-to-low resolution network. Instead, our proposed network maintains high-resolution representations through the whole process. We start from a high-resolution subnetwork as the first stage, gradually add high-to-low resolution subnetworks one by one to form more stages, and connect the mutli-resolution subnetworks in parallel. We conduct repeated multi-scale fusions such that each of the high-to-low resolution representations receives information from other parallel representations over and over, leading to rich high-resolution representations. As a result, the predicted keypoint heatmap is potentially more accurate and spatially more precise. We empirically demonstrate the effectiveness of our network through the superior pose estimation results over two benchmark datasets: the COCO keypoint detection dataset and the MPII Human Pose dataset. + +High-resolution representation learning plays an essential role in many vision problems, e.g., pose estimation and semantic segmentation. The high-resolution network (HRNet), recently developed for human pose estimation, maintains high-resolution representations through the whole process by connecting high-to-low resolution convolutions in parallel and produces strong high-resolution representations by repeatedly conducting fusions across parallel convolutions. +In this paper, we conduct a further study on high-resolution representations by introducing a simple yet effective modification and apply it to a wide range of vision tasks. We augment the high-resolution representation by aggregating the (upsampled) representations from all the parallel convolutions rather than only the representation from the high-resolution convolution as done in HRNet. 
This simple modification leads to stronger representations, evidenced by superior results. We show top results in semantic segmentation on Cityscapes, LIP, and PASCAL Context, and facial landmark detection on AFLW, COFW, 300W, and WFLW. In addition, we build a multi-level representation from the high-resolution representation and apply it to the Faster R-CNN object detection framework and the extended frameworks. The proposed approach achieves superior results to existing single-model networks on COCO object detection. + +
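+The HRNetV2-style aggregation described above can be sketched in a few lines of PyTorch (a minimal illustration only, not the mmdetection implementation; tensor shapes and channel widths are assumed): every parallel branch is upsampled to the resolution of the highest-resolution branch and the results are concatenated along the channel dimension.
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def hrnetv2_aggregate(branch_feats):
+    """Upsample all parallel branches to the highest resolution and concat."""
+    target_size = branch_feats[0].shape[-2:]
+    upsampled = [branch_feats[0]] + [
+        F.interpolate(f, size=target_size, mode='bilinear', align_corners=False)
+        for f in branch_feats[1:]
+    ]
+    # The concatenated map is the augmented high-resolution representation.
+    return torch.cat(upsampled, dim=1)
+
+
+# Example with HRNetV2p-W32 channel widths (32, 64, 128, 256) at strides
+# 4, 8, 16 and 32 of an 800x1344 input.
+feats = [
+    torch.randn(1, 32, 200, 336),
+    torch.randn(1, 64, 100, 168),
+    torch.randn(1, 128, 50, 84),
+    torch.randn(1, 256, 25, 42),
+]
+out = hrnetv2_aggregate(feats)  # shape: (1, 480, 200, 336)
+```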
    + +## Results and Models + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :---------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| HRNetV2p-W18 | pytorch | 1x | 6.6 | 13.4 | 36.9 | [config](./faster-rcnn_hrnetv2p-w18-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130_211246.log.json) | +| HRNetV2p-W18 | pytorch | 2x | 6.6 | - | 38.9 | [config](./faster-rcnn_hrnetv2p-w18-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731-a4ec0611.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731.log.json) | +| HRNetV2p-W32 | pytorch | 1x | 9.0 | 12.4 | 40.2 | [config](./faster-rcnn_hrnetv2p-w32-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130-6e286425.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130_204442.log.json) | +| HRNetV2p-W32 | pytorch | 2x | 9.0 | - | 41.4 | [config](./faster-rcnn_hrnetv2p-w32_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927-976a9c15.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927.log.json) | +| HRNetV2p-W40 | pytorch | 1x | 10.4 | 10.5 | 41.2 | [config](./faster-rcnn_hrnetv2p-w40-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210-95c1f5ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210_125315.log.json) | +| HRNetV2p-W40 | pytorch | 2x | 10.4 | - | 42.1 | [config](./faster-rcnn_hrnetv2p-w40_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033-0f236ef4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033.log.json) | + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------: | 
:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| HRNetV2p-W18 | pytorch | 1x | 7.0 | 11.7 | 37.7 | 34.2 | [config](./mask-rcnn_hrnetv2p-w18-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205-1c3d78ed.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205_232523.log.json) | +| HRNetV2p-W18 | pytorch | 2x | 7.0 | - | 39.8 | 36.0 | [config](./mask-rcnn_hrnetv2p-w18-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212-b3c825b1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212_134222.log.json) | +| HRNetV2p-W32 | pytorch | 1x | 9.4 | 11.3 | 41.2 | 37.1 | [config](./mask-rcnn_hrnetv2p-w32-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207-b29f616e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207_055017.log.json) | +| HRNetV2p-W32 | pytorch | 2x | 9.4 | - | 42.5 | 37.8 | [config](./mask-rcnn_hrnetv2p-w32-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213-45b75b4d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213_150518.log.json) | +| HRNetV2p-W40 | pytorch | 1x | 10.9 | | 42.1 | 37.5 | [config](./mask-rcnn_hrnetv2p-w40_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646-66738b35.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646.log.json) | +| HRNetV2p-W40 | pytorch | 2x | 10.9 | | 42.8 | 38.2 | [config](./mask-rcnn_hrnetv2p-w40-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732-aed5e4ab.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732.log.json) | + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| HRNetV2p-W18 | pytorch | 20e | 7.0 | 11.0 | 41.2 | [config](./cascade-rcnn_hrnetv2p-w18-20e_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210-434be9d7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210_105632.log.json) | +| HRNetV2p-W32 | pytorch | 20e | 9.4 | 11.0 | 43.3 | [config](./cascade-rcnn_hrnetv2p-w32-20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208-928455a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208_160511.log.json) | +| HRNetV2p-W40 | pytorch | 20e | 10.8 | | 43.8 | [config](./cascade-rcnn_hrnetv2p-w40-20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112-75e47b04.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112.log.json) | + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| HRNetV2p-W18 | pytorch | 20e | 8.5 | 8.5 | 41.6 | 36.4 | [config](./cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210-b543cd2b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210_093149.log.json) | +| HRNetV2p-W32 | pytorch | 20e | | 8.3 | 44.3 | 38.6 | [config](./cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043-39d9cf7b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043.log.json) | +| HRNetV2p-W40 | pytorch | 20e | 12.5 | | 45.1 | 39.3 | [config](./cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922-969c4610.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922.log.json) | + +### Hybrid Task Cascade (HTC) + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------: | 
:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| HRNetV2p-W18 | pytorch | 20e | 10.8 | 4.7 | 42.8 | 37.9 | [config](./htc_hrnetv2p-w18_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210-b266988c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210_182735.log.json) | +| HRNetV2p-W32 | pytorch | 20e | 13.1 | 4.9 | 45.4 | 39.9 | [config](./htc_hrnetv2p-w32_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207-7639fa12.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207_193153.log.json) | +| HRNetV2p-W40 | pytorch | 20e | 14.6 | | 46.4 | 40.8 | [config](./htc_hrnetv2p-w40_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411-417c4d5b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411.log.json) | + +### FCOS + +| Backbone | Style | GN | MS train | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :----------: | :-----: | :-: | :------: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| HRNetV2p-W18 | pytorch | Y | N | 1x | 13.0 | 12.9 | 35.3 | [config](./fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710-4ad151de.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710.log.json) | +| HRNetV2p-W18 | pytorch | Y | N | 2x | 13.0 | - | 38.2 | [config](./fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110-5c575fa5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110.log.json) | +| HRNetV2p-W32 | pytorch | Y | N | 1x | 17.5 | 12.9 | 39.5 | [config](./fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730-cb8055c0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730.log.json) | +| HRNetV2p-W32 | pytorch | Y | N | 2x | 17.5 | - | 40.8 | 
[config](./fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133-77b6b9bb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133.log.json) | +| HRNetV2p-W18 | pytorch | Y | Y | 2x | 13.0 | 12.9 | 38.3 | [config](./fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651-441e9d9f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651.log.json) | +| HRNetV2p-W32 | pytorch | Y | Y | 2x | 17.5 | 12.4 | 41.9 | [config](./fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846-b6f2b49f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846.log.json) | +| HRNetV2p-W48 | pytorch | Y | Y | 2x | 20.3 | 10.8 | 42.7 | [config](./fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752-f22d2ce5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752.log.json) | + +**Note:** + +- The `28e` schedule in HTC indicates decreasing the lr at 24 and 27 epochs, with a total of 28 epochs. +- HRNetV2 ImageNet pretrained models are in [HRNets for Image Classification](https://github.com/HRNet/HRNet-Image-Classification). 
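+
+As a rough usage sketch (assuming the MMEngine `Config` API that these configs target), the `28e` schedule from the note above can also be applied programmatically by loading one of the configs and overriding `max_epochs` and the `MultiStepLR` milestones; the patch additionally ships a dedicated `htc_hrnetv2p-w40_28e_coco.py` that expresses the same schedule as a config file.
+
+```python
+from mmengine.config import Config
+
+# Path assumed relative to the repository root.
+cfg = Config.fromfile('mmdetection/configs/hrnet/htc_hrnetv2p-w40_20e_coco.py')
+
+# 28e schedule: 28 epochs in total, lr decreased at epochs 24 and 27.
+cfg.train_cfg.max_epochs = 28
+cfg.param_scheduler[1].end = 28
+cfg.param_scheduler[1].milestones = [24, 27]
+
+print(cfg.train_cfg.max_epochs, cfg.param_scheduler[1].milestones)
+```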
+ +## Citation + +```latex +@inproceedings{SunXLW19, + title={Deep High-Resolution Representation Learning for Human Pose Estimation}, + author={Ke Sun and Bin Xiao and Dong Liu and Jingdong Wang}, + booktitle={CVPR}, + year={2019} +} + +@article{SunZJCXLMWLW19, + title={High-Resolution Representations for Labeling Pixels and Regions}, + author={Ke Sun and Yang Zhao and Borui Jiang and Tianheng Cheng and Bin Xiao + and Dong Liu and Yadong Mu and Xinggang Wang and Wenyu Liu and Jingdong Wang}, + journal = {CoRR}, + volume = {abs/1904.04514}, + year={2019} +} +``` diff --git a/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py b/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py new file mode 100644 index 0000000..5ca0ebf --- /dev/null +++ b/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py @@ -0,0 +1,11 @@ +_base_ = './cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py' +# model settings +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py b/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py new file mode 100644 index 0000000..1ffedc3 --- /dev/null +++ b/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py @@ -0,0 +1,51 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) +# learning policy +max_epochs = 20 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 19], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py b/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py new file mode 100644 index 0000000..4a51a02 --- /dev/null +++ b/mmdetection/configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py @@ -0,0 +1,12 @@ +_base_ = './cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py' +# model settings +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py 
b/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py new file mode 100644 index 0000000..8834c1d --- /dev/null +++ b/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py @@ -0,0 +1,11 @@ +_base_ = './cascade-rcnn_hrnetv2p-w32-20e_coco.py' +# model settings +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py b/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py new file mode 100644 index 0000000..afeb75d --- /dev/null +++ b/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py @@ -0,0 +1,51 @@ +_base_ = '../cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) +# learning policy +max_epochs = 20 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 19], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py b/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py new file mode 100644 index 0000000..66f8882 --- /dev/null +++ b/mmdetection/configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py @@ -0,0 +1,12 @@ +_base_ = './cascade-rcnn_hrnetv2p-w32-20e_coco.py' +# model settings +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py new file mode 100644 index 0000000..ee9a698 --- /dev/null +++ b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py @@ -0,0 +1,11 @@ +_base_ = './faster-rcnn_hrnetv2p-w32-1x_coco.py' +# model settings +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py new file mode 
100644 index 0000000..0b72c68 --- /dev/null +++ b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './faster-rcnn_hrnetv2p-w18-1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py new file mode 100644 index 0000000..a27ad06 --- /dev/null +++ b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py @@ -0,0 +1,37 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) diff --git a/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py new file mode 100644 index 0000000..c9568ce --- /dev/null +++ b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './faster-rcnn_hrnetv2p-w32-1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py new file mode 100644 index 0000000..b362002 --- /dev/null +++ b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py @@ -0,0 +1,11 @@ +_base_ = './faster-rcnn_hrnetv2p-w32-1x_coco.py' +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py new file mode 100644 index 0000000..d1b4535 --- /dev/null +++ b/mmdetection/configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './faster-rcnn_hrnetv2p-w40-1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git 
a/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py b/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py new file mode 100644 index 0000000..c20ca77 --- /dev/null +++ b/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py @@ -0,0 +1,10 @@ +_base_ = './fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py' +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py b/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py new file mode 100644 index 0000000..f5b67f6 --- /dev/null +++ b/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py b/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py new file mode 100644 index 0000000..c5332d6 --- /dev/null +++ b/mmdetection/configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py @@ -0,0 +1,10 @@ +_base_ = './fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py' +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py b/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py new file mode 100644 index 0000000..159d96d --- /dev/null +++ b/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py @@ -0,0 +1,43 @@ +_base_ = '../fcos/fcos_r50-caffe_fpn_gn-head_4xb4-1x_coco.py' +model = dict( + data_preprocessor=dict( + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256, + stride=2, + num_outs=5)) diff --git a/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py b/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py new file mode 100644 index 0000000..73fd80e --- /dev/null +++ 
b/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py b/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py new file mode 100644 index 0000000..4c977bf --- /dev/null +++ b/mmdetection/configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py @@ -0,0 +1,35 @@ +_base_ = './fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py' + +model = dict( + data_preprocessor=dict( + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py b/mmdetection/configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py new file mode 100644 index 0000000..bb0ff6d --- /dev/null +++ b/mmdetection/configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py @@ -0,0 +1,11 @@ +_base_ = './fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py' +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/mmdetection/configs/hrnet/htc_hrnetv2p-w18_20e_coco.py b/mmdetection/configs/hrnet/htc_hrnetv2p-w18_20e_coco.py new file mode 100644 index 0000000..55255d5 --- /dev/null +++ b/mmdetection/configs/hrnet/htc_hrnetv2p-w18_20e_coco.py @@ -0,0 +1,10 @@ +_base_ = './htc_hrnetv2p-w32_20e_coco.py' +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/mmdetection/configs/hrnet/htc_hrnetv2p-w32_20e_coco.py b/mmdetection/configs/hrnet/htc_hrnetv2p-w32_20e_coco.py new file mode 100644 index 0000000..545cb83 --- /dev/null +++ b/mmdetection/configs/hrnet/htc_hrnetv2p-w32_20e_coco.py @@ -0,0 +1,37 @@ +_base_ = '../htc/htc_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + 
num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) diff --git a/mmdetection/configs/hrnet/htc_hrnetv2p-w40_20e_coco.py b/mmdetection/configs/hrnet/htc_hrnetv2p-w40_20e_coco.py new file mode 100644 index 0000000..b09256a --- /dev/null +++ b/mmdetection/configs/hrnet/htc_hrnetv2p-w40_20e_coco.py @@ -0,0 +1,11 @@ +_base_ = './htc_hrnetv2p-w32_20e_coco.py' +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/mmdetection/configs/hrnet/htc_hrnetv2p-w40_28e_coco.py b/mmdetection/configs/hrnet/htc_hrnetv2p-w40_28e_coco.py new file mode 100644 index 0000000..1c13b58 --- /dev/null +++ b/mmdetection/configs/hrnet/htc_hrnetv2p-w40_28e_coco.py @@ -0,0 +1,16 @@ +_base_ = './htc_hrnetv2p-w40_20e_coco.py' + +# learning policy +max_epochs = 28 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[24, 27], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/htc_x101-64x4d_fpn_16xb1-28e_coco.py b/mmdetection/configs/hrnet/htc_x101-64x4d_fpn_16xb1-28e_coco.py new file mode 100644 index 0000000..1f1304e --- /dev/null +++ b/mmdetection/configs/hrnet/htc_x101-64x4d_fpn_16xb1-28e_coco.py @@ -0,0 +1,16 @@ +_base_ = '../htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py' + +# learning policy +max_epochs = 28 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[24, 27], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py new file mode 100644 index 0000000..5d5a463 --- /dev/null +++ b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py @@ -0,0 +1,10 @@ +_base_ = './mask-rcnn_hrnetv2p-w32-1x_coco.py' +model = dict( + backbone=dict( + extra=dict( + stage2=dict(num_channels=(18, 36)), + stage3=dict(num_channels=(18, 36, 72)), + stage4=dict(num_channels=(18, 36, 72, 144))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w18')), + neck=dict(type='HRFPN', in_channels=[18, 36, 72, 144], out_channels=256)) diff --git a/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py new file mode 100644 index 0000000..8abc559 --- /dev/null +++ b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './mask-rcnn_hrnetv2p-w18-1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, 
by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py new file mode 100644 index 0000000..208b037 --- /dev/null +++ b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py @@ -0,0 +1,37 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict( + _delete_=True, + type='HRFPN', + in_channels=[32, 64, 128, 256], + out_channels=256)) diff --git a/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py new file mode 100644 index 0000000..d3741c8 --- /dev/null +++ b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './mask-rcnn_hrnetv2p-w32-1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py new file mode 100644 index 0000000..360420c --- /dev/null +++ b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './mask-rcnn_hrnetv2p-w40_1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py new file mode 100644 index 0000000..36e2305 --- /dev/null +++ b/mmdetection/configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py @@ -0,0 +1,11 @@ +_base_ = './mask-rcnn_hrnetv2p-w18-1x_coco.py' +model = dict( + backbone=dict( + type='HRNet', + extra=dict( + stage2=dict(num_channels=(40, 80)), + stage3=dict(num_channels=(40, 80, 160)), + stage4=dict(num_channels=(40, 80, 160, 320))), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w40')), + neck=dict(type='HRFPN', in_channels=[40, 80, 160, 320], out_channels=256)) diff --git a/mmdetection/configs/hrnet/metafile.yml b/mmdetection/configs/hrnet/metafile.yml new file mode 100644 index 0000000..54c6247 --- /dev/null +++ b/mmdetection/configs/hrnet/metafile.yml @@ -0,0 +1,971 @@ +Models: + - Name: faster-rcnn_hrnetv2p-w18-1x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster-rcnn_hrnetv2p-w18-1x_coco.py + Metadata: + Training Memory (GB): 6.6 + inference 
time (ms/im): + - value: 74.63 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_1x_coco/faster_rcnn_hrnetv2p_w18_1x_coco_20200130-56651a6d.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster-rcnn_hrnetv2p-w18-2x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster-rcnn_hrnetv2p-w18-2x_coco.py + Metadata: + Training Memory (GB): 6.6 + inference time (ms/im): + - value: 74.63 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w18_2x_coco/faster_rcnn_hrnetv2p_w18_2x_coco_20200702_085731-a4ec0611.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster-rcnn_hrnetv2p-w32-1x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster-rcnn_hrnetv2p-w32-1x_coco.py + Metadata: + Training Memory (GB): 9.0 + inference time (ms/im): + - value: 80.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_1x_coco/faster_rcnn_hrnetv2p_w32_1x_coco_20200130-6e286425.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster-rcnn_hrnetv2p-w32_2x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster-rcnn_hrnetv2p-w32_2x_coco.py + Metadata: + Training Memory (GB): 9.0 + inference time (ms/im): + - value: 80.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w32_2x_coco/faster_rcnn_hrnetv2p_w32_2x_coco_20200529_015927-976a9c15.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep 
High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster-rcnn_hrnetv2p-w40-1x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster-rcnn_hrnetv2p-w40-1x_coco.py + Metadata: + Training Memory (GB): 10.4 + inference time (ms/im): + - value: 95.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_1x_coco/faster_rcnn_hrnetv2p_w40_1x_coco_20200210-95c1f5ce.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: faster-rcnn_hrnetv2p-w40_2x_coco + In Collection: Faster R-CNN + Config: configs/hrnet/faster-rcnn_hrnetv2p-w40_2x_coco.py + Metadata: + Training Memory (GB): 10.4 + inference time (ms/im): + - value: 95.24 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/faster_rcnn_hrnetv2p_w40_2x_coco/faster_rcnn_hrnetv2p_w40_2x_coco_20200512_161033-0f236ef4.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask-rcnn_hrnetv2p-w18-1x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask-rcnn_hrnetv2p-w18-1x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 85.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_1x_coco/mask_rcnn_hrnetv2p_w18_1x_coco_20200205-1c3d78ed.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask-rcnn_hrnetv2p-w18-2x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask-rcnn_hrnetv2p-w18-2x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 85.47 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 
1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w18_2x_coco/mask_rcnn_hrnetv2p_w18_2x_coco_20200212-b3c825b1.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask-rcnn_hrnetv2p-w32-1x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask-rcnn_hrnetv2p-w32-1x_coco.py + Metadata: + Training Memory (GB): 9.4 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_1x_coco/mask_rcnn_hrnetv2p_w32_1x_coco_20200207-b29f616e.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask-rcnn_hrnetv2p-w32-2x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask-rcnn_hrnetv2p-w32-2x_coco.py + Metadata: + Training Memory (GB): 9.4 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w32_2x_coco/mask_rcnn_hrnetv2p_w32_2x_coco_20200213-45b75b4d.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask-rcnn_hrnetv2p-w40_1x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask-rcnn_hrnetv2p-w40_1x_coco.py + Metadata: + Training Memory (GB): 10.9 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_1x_coco/mask_rcnn_hrnetv2p_w40_1x_coco_20200511_015646-66738b35.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 
'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: mask-rcnn_hrnetv2p-w40-2x_coco + In Collection: Mask R-CNN + Config: configs/hrnet/mask-rcnn_hrnetv2p-w40-2x_coco.py + Metadata: + Training Memory (GB): 10.9 + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/mask_rcnn_hrnetv2p_w40_2x_coco/mask_rcnn_hrnetv2p_w40_2x_coco_20200512_163732-aed5e4ab.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade-rcnn_hrnetv2p-w18-20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade-rcnn_hrnetv2p-w18-20e_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w18_20e_coco/cascade_rcnn_hrnetv2p_w18_20e_coco_20200210-434be9d7.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade-rcnn_hrnetv2p-w32-20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade-rcnn_hrnetv2p-w32-20e_coco.py + Metadata: + Training Memory (GB): 9.4 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w32_20e_coco/cascade_rcnn_hrnetv2p_w32_20e_coco_20200208-928455a4.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade-rcnn_hrnetv2p-w40-20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade-rcnn_hrnetv2p-w40-20e_coco.py + Metadata: + Training Memory (GB): 10.8 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box 
AP: 43.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_rcnn_hrnetv2p_w40_20e_coco/cascade_rcnn_hrnetv2p_w40_20e_coco_20200512_161112-75e47b04.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade-mask-rcnn_hrnetv2p-w18_20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade-mask-rcnn_hrnetv2p-w18_20e_coco.py + Metadata: + Training Memory (GB): 8.5 + inference time (ms/im): + - value: 117.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w18_20e_coco/cascade_mask_rcnn_hrnetv2p_w18_20e_coco_20200210-b543cd2b.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade-mask-rcnn_hrnetv2p-w32_20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade-mask-rcnn_hrnetv2p-w32_20e_coco.py + Metadata: + inference time (ms/im): + - value: 120.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/cascade_mask_rcnn_hrnetv2p_w32_20e_coco_20200512_154043-39d9cf7b.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: cascade-mask-rcnn_hrnetv2p-w40-20e_coco + In Collection: Cascade R-CNN + Config: configs/hrnet/cascade-mask-rcnn_hrnetv2p-w40-20e_coco.py + Metadata: + Training Memory (GB): 12.5 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/cascade_mask_rcnn_hrnetv2p_w40_20e_coco/cascade_mask_rcnn_hrnetv2p_w40_20e_coco_20200527_204922-969c4610.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: htc_hrnetv2p-w18_20e_coco + In Collection: HTC + Config: configs/hrnet/htc_hrnetv2p-w18_20e_coco.py + Metadata: + Training Memory (GB): 10.8 + inference time (ms/im): + - value: 212.77 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w18_20e_coco/htc_hrnetv2p_w18_20e_coco_20200210-b266988c.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: htc_hrnetv2p-w32_20e_coco + In Collection: HTC + Config: configs/hrnet/htc_hrnetv2p-w32_20e_coco.py + Metadata: + Training Memory (GB): 13.1 + inference time (ms/im): + - value: 204.08 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w32_20e_coco/htc_hrnetv2p_w32_20e_coco_20200207-7639fa12.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: htc_hrnetv2p-w40_20e_coco + In Collection: HTC + Config: configs/hrnet/htc_hrnetv2p-w40_20e_coco.py + Metadata: + Training Memory (GB): 14.6 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/htc_hrnetv2p_w40_20e_coco/htc_hrnetv2p_w40_20e_coco_20200529_183411-417c4d5b.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-1x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 13.0 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + 
Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco/fcos_hrnetv2p_w18_gn-head_4x4_1x_coco_20201212_100710-4ad151de.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p-w18-gn-head_4xb4-2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 13.0 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_4x4_2x_coco_20201212_101110-5c575fa5.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-1x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 17.5 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco/fcos_hrnetv2p_w32_gn-head_4x4_1x_coco_20201211_134730-cb8055c0.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p-w32-gn-head_4xb4-2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 17.5 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_4x4_2x_coco_20201212_112133-77b6b9bb.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p-w18-gn-head_ms-640-800-4xb4-2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 13.0 + inference time (ms/im): + - value: 77.52 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w18_gn-head_mstrain_640-800_4x4_2x_coco_20201212_111651-441e9d9f.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p-w32-gn-head_ms-640-800-4xb4-2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 17.5 + inference time (ms/im): + - value: 80.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w32_gn-head_mstrain_640-800_4x4_2x_coco_20201212_090846-b6f2b49f.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 + + - Name: fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco + In Collection: FCOS + Config: configs/hrnet/fcos_hrnetv2p-w40-gn-head_ms-640-800-4xb4-2x_coco.py + Metadata: + Training Resources: 4x V100 GPUs + Batch Size: 16 + Training Memory (GB): 20.3 + inference time (ms/im): + - value: 92.59 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Architecture: + - HRNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/hrnet/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco/fcos_hrnetv2p_w40_gn-head_mstrain_640-800_4x4_2x_coco_20201212_124752-f22d2ce5.pth + Paper: + URL: https://arxiv.org/abs/1904.04514 + Title: 'Deep High-Resolution Representation Learning for Visual Recognition' + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/backbones/hrnet.py#L195 + Version: v2.0.0 diff --git a/mmdetection/configs/htc/README.md b/mmdetection/configs/htc/README.md new file mode 100644 index 0000000..a6b77ce --- /dev/null +++ b/mmdetection/configs/htc/README.md @@ -0,0 +1,67 @@ +# HTC + 
+> [Hybrid Task Cascade for Instance Segmentation](https://arxiv.org/abs/1901.07518) + + + +## Abstract + +Cascade is a classic yet powerful architecture that has boosted performance on various tasks. However, how to introduce cascade to instance segmentation remains an open question. A simple combination of Cascade R-CNN and Mask R-CNN only brings limited gain. In exploring a more effective approach, we find that the key to a successful instance segmentation cascade is to fully leverage the reciprocal relationship between detection and segmentation. In this work, we propose a new framework, Hybrid Task Cascade (HTC), which differs in two important aspects: (1) instead of performing cascaded refinement on these two tasks separately, it interweaves them for a joint multi-stage processing; (2) it adopts a fully convolutional branch to provide spatial context, which can help distinguishing hard foreground from cluttered background. Overall, this framework can learn more discriminative features progressively while integrating complementary features together in each stage. Without bells and whistles, a single HTC obtains 38.4 and 1.5 improvement over a strong Cascade Mask R-CNN baseline on MSCOCO dataset. Moreover, our overall system achieves 48.6 mask AP on the test-challenge split, ranking 1st in the COCO 2018 Challenge Object Detection Task. + +
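The two design choices called out in the abstract map directly onto fields of the HTC configs added later in this patch: interleaved multi-stage processing and mask information flow are flags on the RoI head, and the spatial-context branch is the fused semantic head trained on COCO-stuff. Below is a minimal sketch of a user config that touches only those knobs, assuming the stock `htc_r50_fpn_1x_coco.py` base from this directory; the values mirror the defaults in the configs that follow, so it is illustrative rather than prescriptive.

```python
# Sketch only: inherit the HTC R-50 1x config and surface the fields that
# implement the two ideas from the abstract. All field names appear in the
# configs added in this patch; nothing here is a new API.
_base_ = './htc_r50_fpn_1x_coco.py'

model = dict(
    roi_head=dict(
        interleaved=True,                   # interweave box and mask refinement in each stage
        mask_info_flow=True,                # feed stage-i mask features into stage i+1
        stage_loss_weights=[1, 0.5, 0.25],  # later (tighter) stages contribute less to the loss
        semantic_head=dict(                 # fully convolutional branch providing spatial context
            num_classes=183,                # COCO-stuff thing + stuff classes
            loss_seg=dict(
                type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2))))
```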
    + +## Introduction + +HTC requires COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) dataset for training. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +| | ├── stuffthingmaps +``` + +## Results and Models + +The results on COCO 2017val are shown in the below table. (results on test-dev are usually slightly higher than val) + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 8.2 | 5.8 | 42.3 | 37.4 | [config](./htc_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317-7332cf16.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317_070435.log.json) | +| R-50-FPN | pytorch | 20e | 8.2 | - | 43.3 | 38.3 | [config](./htc_r50_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319_070313.log.json) | +| R-101-FPN | pytorch | 20e | 10.2 | 5.5 | 44.8 | 39.6 | [config](./htc_r101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317-9b41b48f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317_153107.log.json) | +| X-101-32x4d-FPN | pytorch | 20e | 11.4 | 5.0 | 46.1 | 40.5 | [config](./htc_x101-32x4d_fpn_16xb1-20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318-de97ae01.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318_034519.log.json) | +| X-101-64x4d-FPN | pytorch | 20e | 14.5 | 4.4 | 47.0 | 41.4 | [config](./htc_x101-64x4d_fpn_16xb1-20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318-b181fd7a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318_081711.log.json) | + +- In the HTC paper and COCO 2018 Challenge, `score_thr` is set to 0.001 for both baselines and HTC. +- We use 8 GPUs with 2 images/GPU for R-50 and R-101 models, and 16 GPUs with 1 image/GPU for X-101 models. + If you would like to train X-101 HTC with 8 GPUs, you need to change the lr from 0.02 to 0.01. + +We also provide a powerful HTC with DCN and multi-scale training model. No testing augmentation is used. 
+ +| Backbone | Style | DCN | training scales | Lr schd | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :---: | :-------------: | :-----: | :----: | :-----: | :----------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| X-101-64x4d-FPN | pytorch | c3-c5 | 400~1400 | 20e | 50.4 | 43.8 | [config](./htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312_203410.log.json) | + +## Citation + +We provide config files to reproduce the results in the CVPR 2019 paper for [Hybrid Task Cascade](https://arxiv.org/abs/1901.07518). + +```latex +@inproceedings{chen2019hybrid, + title={Hybrid task cascade for instance segmentation}, + author={Chen, Kai and Pang, Jiangmiao and Wang, Jiaqi and Xiong, Yu and Li, Xiaoxiao and Sun, Shuyang and Feng, Wansen and Liu, Ziwei and Shi, Jianping and Ouyang, Wanli and Chen Change Loy and Dahua Lin}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} +``` diff --git a/mmdetection/configs/htc/htc-without-semantic_r50_fpn_1x_coco.py b/mmdetection/configs/htc/htc-without-semantic_r50_fpn_1x_coco.py new file mode 100644 index 0000000..791f4eb --- /dev/null +++ b/mmdetection/configs/htc/htc-without-semantic_r50_fpn_1x_coco.py @@ -0,0 +1,223 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='HybridTaskCascade', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='HybridTaskCascadeRoIHead', + interleaved=True, + mask_info_flow=True, + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + 
out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=[ + dict( + type='HTCMaskHead', + with_conv_res=False, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + 
nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/mmdetection/configs/htc/htc_r101_fpn_20e_coco.py b/mmdetection/configs/htc/htc_r101_fpn_20e_coco.py new file mode 100644 index 0000000..28091aa --- /dev/null +++ b/mmdetection/configs/htc/htc_r101_fpn_20e_coco.py @@ -0,0 +1,6 @@ +_base_ = './htc_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/htc/htc_r50_fpn_1x_coco.py b/mmdetection/configs/htc/htc_r50_fpn_1x_coco.py new file mode 100644 index 0000000..3573f1f --- /dev/null +++ b/mmdetection/configs/htc/htc_r50_fpn_1x_coco.py @@ -0,0 +1,33 @@ +_base_ = './htc-without-semantic_r50_fpn_1x_coco.py' +model = dict( + data_preprocessor=dict(pad_seg=True), + roi_head=dict( + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + seg_scale_factor=1 / 8, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + loss_seg=dict( + type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2)))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict( + dataset=dict( + data_prefix=dict(img='train2017/', seg='stuffthingmaps/train2017/'), + pipeline=train_pipeline)) diff --git a/mmdetection/configs/htc/htc_r50_fpn_20e_coco.py b/mmdetection/configs/htc/htc_r50_fpn_20e_coco.py new file mode 100644 index 0000000..9f510fa --- /dev/null +++ b/mmdetection/configs/htc/htc_r50_fpn_20e_coco.py @@ -0,0 +1,16 @@ +_base_ = './htc_r50_fpn_1x_coco.py' + +# learning policy +max_epochs = 20 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 19], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py b/mmdetection/configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py new file mode 100644 index 0000000..396d3a0 --- /dev/null +++ b/mmdetection/configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py @@ -0,0 +1,32 @@ +_base_ = './htc_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) + +train_dataloader = dict(batch_size=1, num_workers=1) + +# learning policy +max_epochs = 20 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 19], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py 
b/mmdetection/configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py new file mode 100644 index 0000000..26d68e7 --- /dev/null +++ b/mmdetection/configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py @@ -0,0 +1,20 @@ +_base_ = './htc_x101-64x4d_fpn_16xb1-20e_coco.py' + +model = dict( + backbone=dict( + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True), + dict( + type='RandomResize', + scale=[(1600, 400), (1600, 1400)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py b/mmdetection/configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py new file mode 100644 index 0000000..a600ddb --- /dev/null +++ b/mmdetection/configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py @@ -0,0 +1,7 @@ +_base_ = './htc_x101-32x4d_fpn_16xb1-20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + groups=64, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/htc/metafile.yml b/mmdetection/configs/htc/metafile.yml new file mode 100644 index 0000000..2f0f74d --- /dev/null +++ b/mmdetection/configs/htc/metafile.yml @@ -0,0 +1,165 @@ +Collections: + - Name: HTC + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - HTC + - RPN + - ResNet + - ResNeXt + - RoIAlign + Paper: + URL: https://arxiv.org/abs/1901.07518 + Title: 'Hybrid Task Cascade for Instance Segmentation' + README: configs/htc/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/htc.py#L6 + Version: v2.0.0 + +Models: + - Name: htc_r50_fpn_1x_coco + In Collection: HTC + Config: configs/htc/htc_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.2 + inference time (ms/im): + - value: 172.41 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_1x_coco/htc_r50_fpn_1x_coco_20200317-7332cf16.pth + + - Name: htc_r50_fpn_20e_coco + In Collection: HTC + Config: configs/htc/htc_r50_fpn_20e_coco.py + Metadata: + Training Memory (GB): 8.2 + inference time (ms/im): + - value: 172.41 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth + + - Name: htc_r101_fpn_20e_coco + In Collection: HTC + Config: configs/htc/htc_r101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 10.2 + inference time (ms/im): + - value: 181.82 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation 
+ Dataset: COCO + Metrics: + mask AP: 39.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_r101_fpn_20e_coco/htc_r101_fpn_20e_coco_20200317-9b41b48f.pth + + - Name: htc_x101-32x4d_fpn_16xb1-20e_coco + In Collection: HTC + Config: configs/htc/htc_x101-32x4d_fpn_16xb1-20e_coco.py + Metadata: + Training Resources: 16x V100 GPUs + Batch Size: 16 + Training Memory (GB): 11.4 + inference time (ms/im): + - value: 200 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_32x4d_fpn_16x1_20e_coco/htc_x101_32x4d_fpn_16x1_20e_coco_20200318-de97ae01.pth + + - Name: htc_x101-64x4d_fpn_16xb1-20e_coco + In Collection: HTC + Config: configs/htc/htc_x101-64x4d_fpn_16xb1-20e_coco.py + Metadata: + Training Resources: 16x V100 GPUs + Batch Size: 16 + Training Memory (GB): 14.5 + inference time (ms/im): + - value: 227.27 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_16x1_20e_coco/htc_x101_64x4d_fpn_16x1_20e_coco_20200318-b181fd7a.pth + + - Name: htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco + In Collection: HTC + Config: configs/htc/htc_x101-64x4d-dconv-c3-c5_fpn_ms-400-1400-16xb1-20e_coco.py + Metadata: + Training Resources: 16x V100 GPUs + Batch Size: 16 + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth diff --git a/mmdetection/configs/instaboost/README.md b/mmdetection/configs/instaboost/README.md new file mode 100644 index 0000000..3413234 --- /dev/null +++ b/mmdetection/configs/instaboost/README.md @@ -0,0 +1,58 @@ +# Instaboost + +> [Instaboost: Boosting instance segmentation via probability map guided copy-pasting](https://arxiv.org/abs/1908.07801) + + + +## Abstract + +Instance segmentation requires a large number of training samples to achieve satisfactory performance and benefits from proper data augmentation. To enlarge the training set and increase the diversity, previous methods have investigated using data annotation from other domain (e.g. bbox, point) in a weakly supervised mechanism. In this paper, we present a simple, efficient and effective method to augment the training set using the existing instance mask annotations. Exploiting the pixel redundancy of the background, we are able to improve the performance of Mask R-CNN for 1.7 mAP on COCO dataset and 3.3 mAP on Pascal VOC dataset by simply introducing random jittering to objects. Furthermore, we propose a location probability map based approach to explore the feasible locations that objects can be placed based on local appearance similarity. With the guidance of such map, we boost the performance of R101-Mask R-CNN on instance segmentation from 35.7 mAP to 37.9 mAP without modifying the backbone or network structure. 
Our method is simple to implement and does not increase the computational complexity. It can be integrated into the training pipeline of any instance segmentation model without affecting the training and inference efficiency. + +
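Before the implementation details, the snippet below gives a condensed view of how the configs added in this directory hook InstaBoost into the MMDetection data pipeline: the `InstaBoost` transform sits directly after `LoadImageFromFile` and before `LoadAnnotations`. It is a minimal sketch rather than one of the shipped configs; the parameter values mirror `cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py` below, and `backend_args=None` is assumed here so the snippet stands alone instead of inheriting `{{_base_.backend_args}}` from a `_base_` config.

```python
# Minimal sketch: InstaBoost as a pipeline step, mirroring the configs in this patch.
# backend_args=None is an assumption for a standalone snippet.
train_pipeline = [
    dict(type='LoadImageFromFile', backend_args=None),
    dict(
        type='InstaBoost',
        action_candidate=('normal', 'horizontal', 'skip'),
        action_prob=(1, 0, 0),   # probabilities matching action_candidate
        scale=(0.8, 1.2),
        dx=15,
        dy=15,
        theta=(-1, 1),
        color_prob=0.5,
        hflag=False,
        aug_ratio=0.5),          # fraction of images the augmentation is applied to
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PackDetInputs')
]
train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
```

With `aug_ratio=0.5`, roughly half of the training images are expected to go through the copy-paste augmentation while the rest follow the usual pipeline.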
+ +## Introduction + +Configs in this directory are the implementation of the ICCV 2019 paper "InstaBoost: Boosting Instance Segmentation Via Probability Map Guided Copy-Pasting" and are provided by the authors of the paper. InstaBoost is a data augmentation method for object detection and instance segmentation. The paper has been released on [`arXiv`](https://arxiv.org/abs/1908.07801). + +## Usage + +### Requirements + +You need to install `instaboostfast` before using it. + +```shell +pip install instaboostfast +``` + +The code and more details can be found [here](https://github.com/GothicAi/Instaboost). + +### Integration with MMDetection + +InstaBoost has already been integrated into the data pipeline, so all you need to do is add or change the **InstaBoost** configuration after **LoadImageFromFile**. We have provided examples like [this](mask_rcnn_r50_fpn_instaboost_4x#L121). You can refer to [`InstaBoostConfig`](https://github.com/GothicAi/InstaBoost-pypi#instaboostconfig) for more details. + +## Results and Models + +- All models were trained on `coco_2017_train` and tested on `coco_2017_val` for convenience of evaluation and comparison. In the paper, the results are obtained from `test-dev`. +- To balance accuracy and training time when using InstaBoost, models released on this page are all trained for 48 epochs. Other training and testing configs strictly follow the original framework. +- For results and models in MMDetection V1.x, please refer to [Instaboost](https://github.com/GothicAi/Instaboost). + +| Network | Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-----------: | :-------------: | :-----: | :------: | :------------: | :----: | :-----: | :---------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Mask R-CNN | R-50-FPN | 4x | 4.4 | 17.5 | 40.6 | 36.6 | [config](./mask-rcnn_r50_fpn_instaboost-4x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307_223635.log.json) | +| Mask R-CNN | R-101-FPN | 4x | 6.4 | | 42.5 | 38.0 | [config](./mask-rcnn_r101_fpn_instaboost-4x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738-f23f3a5f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738.log.json) | +| Mask R-CNN | X-101-64x4d-FPN | 4x | 10.7 | | 44.7 | 39.7 | [config](./mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947-8ed58c1b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947.log.json) | +| 
Cascade R-CNN | R-101-FPN | 4x | 6.0 | 12.0 | 43.7 | 38.0 | [config](./cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-c19d98d9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307_223646.log.json) | + +## Citation + +```latex +@inproceedings{fang2019instaboost, + title={Instaboost: Boosting instance segmentation via probability map guided copy-pasting}, + author={Fang, Hao-Shu and Sun, Jianhua and Wang, Runzhong and Gou, Minghao and Li, Yong-Lu and Lu, Cewu}, + booktitle={Proceedings of the IEEE International Conference on Computer Vision}, + pages={682--691}, + year={2019} +} +``` diff --git a/mmdetection/configs/instaboost/cascade-mask-rcnn_r101_fpn_instaboost-4x_coco.py b/mmdetection/configs/instaboost/cascade-mask-rcnn_r101_fpn_instaboost-4x_coco.py new file mode 100644 index 0000000..53e33b8 --- /dev/null +++ b/mmdetection/configs/instaboost/cascade-mask-rcnn_r101_fpn_instaboost-4x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py b/mmdetection/configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py new file mode 100644 index 0000000..f7736cf --- /dev/null +++ b/mmdetection/configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py @@ -0,0 +1,40 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='InstaBoost', + action_candidate=('normal', 'horizontal', 'skip'), + action_prob=(1, 0, 0), + scale=(0.8, 1.2), + dx=15, + dy=15, + theta=(-1, 1), + color_prob=0.5, + hflag=False, + aug_ratio=0.5), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +max_epochs = 48 + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[32, 44], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) + +# only keep latest 3 checkpoints +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) diff --git a/mmdetection/configs/instaboost/cascade-mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py b/mmdetection/configs/instaboost/cascade-mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py new file mode 100644 index 0000000..c7938d9 --- /dev/null +++ b/mmdetection/configs/instaboost/cascade-mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py @@ -0,0 +1,14 @@ +_base_ = './cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py 
b/mmdetection/configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py new file mode 100644 index 0000000..55bfa9f --- /dev/null +++ b/mmdetection/configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_instaboost-4x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py b/mmdetection/configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py new file mode 100644 index 0000000..0a8c9be --- /dev/null +++ b/mmdetection/configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py @@ -0,0 +1,40 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='InstaBoost', + action_candidate=('normal', 'horizontal', 'skip'), + action_prob=(1, 0, 0), + scale=(0.8, 1.2), + dx=15, + dy=15, + theta=(-1, 1), + color_prob=0.5, + hflag=False, + aug_ratio=0.5), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +max_epochs = 48 + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[32, 44], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) + +# only keep latest 3 checkpoints +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) diff --git a/mmdetection/configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py b/mmdetection/configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py new file mode 100644 index 0000000..9ba2ada --- /dev/null +++ b/mmdetection/configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_r50_fpn_instaboost-4x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/instaboost/metafile.yml b/mmdetection/configs/instaboost/metafile.yml new file mode 100644 index 0000000..228f31b --- /dev/null +++ b/mmdetection/configs/instaboost/metafile.yml @@ -0,0 +1,99 @@ +Collections: + - Name: InstaBoost + Metadata: + Training Data: COCO + Training Techniques: + - InstaBoost + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Paper: + URL: https://arxiv.org/abs/1908.07801 + Title: 'Instaboost: Boosting instance segmentation via probability map guided copy-pasting' + README: configs/instaboost/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/datasets/pipelines/instaboost.py#L7 + Version: v2.0.0 + +Models: + - Name: mask-rcnn_r50_fpn_instaboost_4x_coco + In Collection: InstaBoost + Config: configs/instaboost/mask-rcnn_r50_fpn_instaboost-4x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 57.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 48 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.6 + - Task: Instance 
Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r50_fpn_instaboost_4x_coco/mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-d025f83a.pth + + - Name: mask-rcnn_r101_fpn_instaboost-4x_coco + In Collection: InstaBoost + Config: configs/instaboost/mask-rcnn_r101_fpn_instaboost-4x_coco.py + Metadata: + Training Memory (GB): 6.4 + Epochs: 48 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_r101_fpn_instaboost_4x_coco/mask_rcnn_r101_fpn_instaboost_4x_coco_20200703_235738-f23f3a5f.pth + + - Name: mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco + In Collection: InstaBoost + Config: configs/instaboost/mask-rcnn_x101-64x4d_fpn_instaboost-4x_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 48 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco/mask_rcnn_x101_64x4d_fpn_instaboost_4x_coco_20200515_080947-8ed58c1b.pth + + - Name: cascade-mask-rcnn_r50_fpn_instaboost_4x_coco + In Collection: InstaBoost + Config: configs/instaboost/cascade-mask-rcnn_r50_fpn_instaboost-4x_coco.py + Metadata: + Training Memory (GB): 6.0 + inference time (ms/im): + - value: 83.33 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 48 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/instaboost/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco/cascade_mask_rcnn_r50_fpn_instaboost_4x_coco_20200307-c19d98d9.pth diff --git a/mmdetection/configs/lad/README.md b/mmdetection/configs/lad/README.md new file mode 100644 index 0000000..3c3b6b4 --- /dev/null +++ b/mmdetection/configs/lad/README.md @@ -0,0 +1,45 @@ +# LAD + +> [Improving Object Detection by Label Assignment Distillation](https://arxiv.org/abs/2108.10520) + + + +## Abstract + +Label assignment in object detection aims to assign targets, foreground or background, to sampled regions in an image. Unlike labeling for image classification, this problem is not well defined due to the object's bounding box. In this paper, we investigate the problem from a perspective of distillation, hence we call Label Assignment Distillation (LAD). Our initial motivation is very simple, we use a teacher network to generate labels for the student. This can be achieved in two ways: either using the teacher's prediction as the direct targets (soft label), or through the hard labels dynamically assigned by the teacher (LAD). Our experiments reveal that: (i) LAD is more effective than soft-label, but they are complementary. (ii) Using LAD, a smaller teacher can also improve a larger student significantly, while soft-label can't. We then introduce Co-learning LAD, in which two networks simultaneously learn from scratch and the role of teacher and student are dynamically interchanged. Using PAA-ResNet50 as a teacher, our LAD techniques can improve detectors PAA-ResNet101 and PAA-ResNeXt101 to 46AP and 47.5AP on the COCO test-dev set. 
With a stronger teacher PAA-SwinB, we improve the students PAA-ResNet50 to 43.7AP by only 1x schedule training and standard setting, and PAA-ResNet101 to 47.9AP, significantly surpassing the current methods. + +
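The configs added below express this scheme as a single detector that carries both networks. The following is a trimmed structural sketch of the `lad_r101-paa-r50` variant (sub-configs are reduced to the keys that define the student-teacher pairing, so it is not buildable as-is); the full configs in this directory spell out every field.

```python
# Structural sketch only: an R-101 student is trained while an R-50 PAA teacher,
# restored from teacher_ckpt, supplies the label assignment.
teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth'  # noqa

model = dict(
    type='LAD',
    # student branch (trained)
    backbone=dict(type='ResNet', depth=101),
    neck=dict(type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256,
              num_outs=5),
    bbox_head=dict(type='LADHead', num_classes=80, in_channels=256),
    # teacher branch (loaded from the PAA checkpoint, used for label assignment)
    teacher_ckpt=teacher_ckpt,
    teacher_backbone=dict(type='ResNet', depth=50),
    teacher_neck=dict(type='FPN', in_channels=[256, 512, 1024, 2048],
                      out_channels=256, num_outs=5),
    teacher_bbox_head=dict(type='LADHead', num_classes=80, in_channels=256))

# 2 GPUs x 8 samples per GPU, as noted in the README below
train_dataloader = dict(batch_size=8, num_workers=4)
```

The student branch is trained as usual, while the teacher branch only supplies the label assignment, which is what the config naming described in the note below encodes.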
    + +## Results and Models + +We provide config files to reproduce the object detection results in the +WACV 2022 paper for Improving Object Detection by Label Assignment +Distillation. + +### PAA with LAD + +| Teacher | Student | Training schedule | AP (val) | Config | Download | +| :-----: | :-----: | :---------------: | :------: | :----------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| -- | R-50 | 1x | 40.4 | [config](../paa/paa_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.log.json) | +| -- | R-101 | 1x | 42.6 | [config](../paa/paa_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.log.json) | +| R-101 | R-50 | 1x | 41.4 | [config](./lad_r50-paa-r101_fpn_2xb8_coco_1x.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r50_paa_r101_fpn_coco_1x/lad_r50_paa_r101_fpn_coco_1x_20220708_124246-74c76ff0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r50_paa_r101_fpn_coco_1x/lad_r50_paa_r101_fpn_coco_1x_20220708_124246.log.json) | +| R-50 | R-101 | 1x | 43.2 | [config](./lad_r101-paa-r50_fpn_2xb8_coco_1x.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r101_paa_r50_fpn_coco_1x/lad_r101_paa_r50_fpn_coco_1x_20220708_124357-9407ac54.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r101_paa_r50_fpn_coco_1x/lad_r101_paa_r50_fpn_coco_1x_20220708_124357.log.json) | + +## Note + +- Meaning of Config name: lad_r50(student model)\_paa(based on paa)\_r101(teacher model)\_fpn(neck)\_coco(dataset)\_1x(12 epoch).py +- Results may fluctuate by about 0.2 mAP. +- 2 GPUs are used, 8 samples per GPU. + +## Citation + +```latex +@inproceedings{nguyen2021improving, + title={Improving Object Detection by Label Assignment Distillation}, + author={Chuong H. Nguyen and Thuy C. Nguyen and Tuan N. Tang and Nam L. H. 
Phan}, + booktitle = {WACV}, + year={2022} +} +``` diff --git a/mmdetection/configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py b/mmdetection/configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py new file mode 100644 index 0000000..d61d086 --- /dev/null +++ b/mmdetection/configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py @@ -0,0 +1,127 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth' # noqa + +model = dict( + type='LAD', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + # student + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='LADHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # teacher + teacher_ckpt=teacher_ckpt, + teacher_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + teacher_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + teacher_bbox_head=dict( + type='LADHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.1, + neg_iou_thr=0.1, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + score_voting=True, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +train_dataloader = dict(batch_size=8, num_workers=4) +optim_wrapper = dict(type='AmpOptimWrapper', optimizer=dict(lr=0.01)) diff --git 
a/mmdetection/configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py b/mmdetection/configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py new file mode 100644 index 0000000..f7eaf2b --- /dev/null +++ b/mmdetection/configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py @@ -0,0 +1,126 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +teacher_ckpt = 'http://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth' # noqa + +model = dict( + type='LAD', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + # student + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='LADHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # teacher + teacher_ckpt=teacher_ckpt, + teacher_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + teacher_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + teacher_bbox_head=dict( + type='LADHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.1, + neg_iou_thr=0.1, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + score_voting=True, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +train_dataloader = dict(batch_size=8, num_workers=4) +optim_wrapper = dict(type='AmpOptimWrapper', optimizer=dict(lr=0.01)) diff --git a/mmdetection/configs/lad/metafile.yml b/mmdetection/configs/lad/metafile.yml new file mode 100644 index 
0000000..230132e --- /dev/null +++ b/mmdetection/configs/lad/metafile.yml @@ -0,0 +1,45 @@ +Collections: + - Name: Label Assignment Distillation + Metadata: + Training Data: COCO + Training Techniques: + - Label Assignment Distillation + - SGD with Momentum + - Weight Decay + Training Resources: 2x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2108.10520 + Title: 'Improving Object Detection by Label Assignment Distillation' + README: configs/lad/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.19.0/mmdet/models/detectors/lad.py#L10 + Version: v2.19.0 + +Models: + - Name: lad_r101-paa-r50_fpn_2xb8_coco_1x + In Collection: Label Assignment Distillation + Config: configs/lad/lad_r101-paa-r50_fpn_2xb8_coco_1x.py + Metadata: + Training Memory (GB): 12.4 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r101_paa_r50_fpn_coco_1x/lad_r101_paa_r50_fpn_coco_1x_20220708_124357-9407ac54.pth + - Name: lad_r50-paa-r101_fpn_2xb8_coco_1x + In Collection: Label Assignment Distillation + Config: configs/lad/lad_r50-paa-r101_fpn_2xb8_coco_1x.py + Metadata: + Training Memory (GB): 8.9 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lad/lad_r50_paa_r101_fpn_coco_1x/lad_r50_paa_r101_fpn_coco_1x_20220708_124246-74c76ff0.pth diff --git a/mmdetection/configs/ld/README.md b/mmdetection/configs/ld/README.md new file mode 100644 index 0000000..65e16c7 --- /dev/null +++ b/mmdetection/configs/ld/README.md @@ -0,0 +1,43 @@ +# LD + +> [Localization Distillation for Dense Object Detection](https://arxiv.org/abs/2102.12252) + + + +## Abstract + +Knowledge distillation (KD) has witnessed its powerful capability in learning compact models in object detection. Previous KD methods for object detection mostly focus on imitating deep features within the imitation regions instead of mimicking classification logits due to its inefficiency in distilling localization information. In this paper, by reformulating the knowledge distillation process on localization, we present a novel localization distillation (LD) method which can efficiently transfer the localization knowledge from the teacher to the student. Moreover, we also heuristically introduce the concept of valuable localization region that can aid to selectively distill the semantic and localization knowledge for a certain region. Combining these two new components, for the first time, we show that logit mimicking can outperform feature imitation and localization knowledge distillation is more important and efficient than semantic knowledge for distilling object detectors. Our distillation scheme is simple as well as effective and can be easily applied to different dense object detectors. Experiments show that our LD can boost the AP score of GFocal-ResNet-50 with a single-scale 1× training schedule from 40.1 to 42.1 on the COCO benchmark without any sacrifice on the inference speed. + +
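The configs added below implement this by wrapping the student in a `KnowledgeDistillationSingleStageDetector` and attaching an extra distillation loss to a GFocal-style head. The following is a trimmed structural sketch of `ld_r18-gflv1-r101_fpn_1x_coco.py` (neck, anchor generator, classification loss and train/test settings are omitted, so it is not buildable as-is):

```python
# Structural sketch: a GFocal R-101 teacher supervises the student's
# box-distribution logits through an extra KL-divergence term (loss_ld).
teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth'  # noqa

model = dict(
    type='KnowledgeDistillationSingleStageDetector',
    teacher_config='configs/gfl/gfl_r101_fpn_ms-2x_coco.py',
    teacher_ckpt=teacher_ckpt,
    backbone=dict(type='ResNet', depth=18),  # student backbone
    bbox_head=dict(
        type='LDHead',
        num_classes=80,
        in_channels=256,
        reg_max=16,  # upper bound of the discretized (GFL-style) box regression range
        loss_bbox=dict(type='GIoULoss', loss_weight=2.0),
        loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25),
        # the distillation term: temperature-scaled KL divergence between the
        # teacher's and the student's box-distribution logits
        loss_ld=dict(
            type='KnowledgeDistillationKLDivLoss', loss_weight=0.25, T=10)))
```

The `loss_ld` entry is the localization distillation itself; everything else follows the usual GFocal training recipe.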
    + +## Results and Models + +### GFocalV1 with LD + +| Teacher | Student | Training schedule | Mini-batch size | AP (val) | Config | Download | +| :-------: | :-----: | :---------------: | :-------------: | :------: | :-----------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| -- | R-18 | 1x | 6 | 35.8 | | | +| R-101 | R-18 | 1x | 6 | 36.5 | [config](./ld_r18-gflv1-r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r18_gflv1_r101_fpn_coco_1x/ld_r18_gflv1_r101_fpn_coco_1x_20220702_062206-330e6332.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r18_gflv1_r101_fpn_coco_1x/ld_r18_gflv1_r101_fpn_coco_1x_20220702_062206.log.json) | +| -- | R-34 | 1x | 6 | 38.9 | | | +| R-101 | R-34 | 1x | 6 | 39.9 | [config](./ld_r34-gflv1-r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r34_gflv1_r101_fpn_coco_1x/ld_r34_gflv1_r101_fpn_coco_1x_20220630_134007-9bc69413.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r34_gflv1_r101_fpn_coco_1x/ld_r34_gflv1_r101_fpn_coco_1x_20220630_134007.log.json) | +| -- | R-50 | 1x | 6 | 40.1 | | | +| R-101 | R-50 | 1x | 6 | 41.0 | [config](./ld_r50-gflv1-r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r50_gflv1_r101_fpn_coco_1x/ld_r50_gflv1_r101_fpn_coco_1x_20220629_145355-8dc5bad8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r50_gflv1_r101_fpn_coco_1x/ld_r50_gflv1_r101_fpn_coco_1x_20220629_145355.log.json) | +| -- | R-101 | 2x | 6 | 44.6 | | | +| R-101-DCN | R-101 | 2x | 6 | 45.5 | [config](./ld_r101-gflv1-r101-dcn_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x/ld_r101_gflv1_r101dcn_fpn_coco_2x_20220629_185920-9e658426.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x/ld_r101_gflv1_r101dcn_fpn_coco_2x_20220629_185920.log.json) | + +## Note + +- Meaning of Config name: ld_r18(student model)\_gflv1(based on gflv1)\_r101(teacher model)\_fpn(neck)\_coco(dataset)\_1x(12 epoch).py + +## Citation + +```latex +@Inproceedings{zheng2022LD, + title={Localization Distillation for Dense Object Detection}, + author= {Zheng, Zhaohui and Ye, Rongguang and Wang, Ping and Ren, Dongwei and Zuo, Wangmeng and Hou, Qibin and Cheng, Mingming}, + booktitle={CVPR}, + year={2022} +} +``` diff --git a/mmdetection/configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py b/mmdetection/configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py new file mode 100644 index 0000000..a7e928b --- /dev/null +++ b/mmdetection/configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py @@ -0,0 +1,49 @@ +_base_ = ['./ld_r18-gflv1-r101_fpn_1x_coco.py'] +teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20200630_102002-134b07df.pth' # noqa +model = dict( + teacher_config='configs/gfl/gfl_r101-dconv-c3-c5_fpn_ms-2x_coco.py', + teacher_ckpt=teacher_ckpt, + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + 
init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5)) + +max_epochs = 24 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) + +# multi-scale training +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 480), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py b/mmdetection/configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py new file mode 100644 index 0000000..f18bb1d --- /dev/null +++ b/mmdetection/configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py @@ -0,0 +1,70 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +teacher_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/gfl/gfl_r101_fpn_mstrain_2x_coco/gfl_r101_fpn_mstrain_2x_coco_20200629_200126-dd12f847.pth' # noqa +model = dict( + type='KnowledgeDistillationSingleStageDetector', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + teacher_config='configs/gfl/gfl_r101_fpn_ms-2x_coco.py', + teacher_ckpt=teacher_ckpt, + backbone=dict( + type='ResNet', + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict( + type='FPN', + in_channels=[64, 128, 256, 512], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='LDHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), + loss_ld=dict( + type='KnowledgeDistillationKLDivLoss', loss_weight=0.25, T=10), + reg_max=16, + loss_bbox=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py b/mmdetection/configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py new file mode 100644 index 0000000..2198adc --- /dev/null +++ b/mmdetection/configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py @@ -0,0 +1,19 @@ +_base_ = ['./ld_r18-gflv1-r101_fpn_1x_coco.py'] +model = dict( + backbone=dict( + type='ResNet', + depth=34, + num_stages=4, + out_indices=(0, 
1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet34')), + neck=dict( + type='FPN', + in_channels=[64, 128, 256, 512], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5)) diff --git a/mmdetection/configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py b/mmdetection/configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py new file mode 100644 index 0000000..89ab579 --- /dev/null +++ b/mmdetection/configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py @@ -0,0 +1,19 @@ +_base_ = ['./ld_r18-gflv1-r101_fpn_1x_coco.py'] +model = dict( + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5)) diff --git a/mmdetection/configs/ld/metafile.yml b/mmdetection/configs/ld/metafile.yml new file mode 100644 index 0000000..a807d1b --- /dev/null +++ b/mmdetection/configs/ld/metafile.yml @@ -0,0 +1,69 @@ +Collections: + - Name: Localization Distillation + Metadata: + Training Data: COCO + Training Techniques: + - Localization Distillation + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2102.12252 + Title: 'Localization Distillation for Dense Object Detection' + README: configs/ld/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.11.0/mmdet/models/dense_heads/ld_head.py#L11 + Version: v2.11.0 + +Models: + - Name: ld_r18-gflv1-r101_fpn_1x_coco + In Collection: Localization Distillation + Config: configs/ld/ld_r18-gflv1-r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 1.8 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r18_gflv1_r101_fpn_coco_1x/ld_r18_gflv1_r101_fpn_coco_1x_20220702_062206-330e6332.pth + - Name: ld_r34-gflv1-r101_fpn_1x_coco + In Collection: Localization Distillation + Config: configs/ld/ld_r34-gflv1-r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 2.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r34_gflv1_r101_fpn_coco_1x/ld_r34_gflv1_r101_fpn_coco_1x_20220630_134007-9bc69413.pth + - Name: ld_r50-gflv1-r101_fpn_1x_coco + In Collection: Localization Distillation + Config: configs/ld/ld_r50-gflv1-r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r50_gflv1_r101_fpn_coco_1x/ld_r50_gflv1_r101_fpn_coco_1x_20220629_145355-8dc5bad8.pth + - Name: ld_r101-gflv1-r101-dcn_fpn_2x_coco + In Collection: Localization Distillation + Config: configs/ld/ld_r101-gflv1-r101-dcn_fpn_2x_coco.py + Metadata: + Training Memory (GB): 5.5 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ld/ld_r101_gflv1_r101dcn_fpn_coco_2x/ld_r101_gflv1_r101dcn_fpn_coco_2x_20220629_185920-9e658426.pth diff --git 
a/mmdetection/configs/legacy_1.x/README.md b/mmdetection/configs/legacy_1.x/README.md new file mode 100644 index 0000000..443a0a7 --- /dev/null +++ b/mmdetection/configs/legacy_1.x/README.md @@ -0,0 +1,54 @@ +# Legacy Configs in MMDetection V1.x + + + +Configs in this directory implement the legacy configs used by MMDetection V1.x and its model zoos. + +To help users convert their models from V1.x to MMDetection V2.0, we provide V1.x configs for running inference with the converted V1.x models. +Due to the BC-breaking changes in MMDetection V2.0 from MMDetection V1.x, running inference with the same model weights in these two versions will produce different results. The difference is within 1% absolute AP, as shown in the following table. + +## Usage + +To upgrade the model version, users need to take the following steps. + +### 1. Convert model weights + +There are three main differences in the model weights between the V1.x and V2.0 codebases. + +1. Since the class order in every detector's classification branch is reordered, all legacy model weights need to go through the conversion process. +2. The regression and segmentation heads no longer contain the background channel. Weights in these background channels should be removed to match the current codebase. +3. For two-stage detectors, their weights need to be upgraded since MMDetection V2.0 refactors all two-stage detectors with `RoIHead`. + +Users can apply the same modifications to self-implemented +detectors. We provide a script `tools/model_converters/upgrade_model_version.py` to convert the model weights in the V1.x model zoo. + +```bash +python tools/model_converters/upgrade_model_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH} --num-classes ${NUM_CLASSES} + +``` + +- OLD_MODEL_PATH: the path to load the model weights in the 1.x version. +- NEW_MODEL_PATH: the path to save the converted model weights in the 2.0 version. +- NUM_CLASSES: number of classes of the original model weights. Usually it is 81 for the COCO dataset and 21 for the VOC dataset. + The number of classes in V2.0 models should equal that in V1.x models minus 1. + +### 2. Use configs with legacy settings + +After converting the model weights, check out the v1.2 release to find the corresponding config file that uses the legacy settings. +The V1.x models usually need these three legacy modules: `LegacyAnchorGenerator`, `LegacyDeltaXYWHBBoxCoder`, and `RoIAlign(align=False)`. +For models using ResNet Caffe backbones, they also need to change the pretrain name and the corresponding `img_norm_cfg`. +An example is in [`retinanet_r50-caffe_fpn_1x_coco_v1.py`](retinanet_r50-caffe_fpn_1x_coco_v1.py). +Then use the config to test the model weights. For most models, the obtained results should be close to those in V1.x. +We provide configs of some common structures in this directory. + +## Performance + +The performance changes after converting the models in this directory are listed below.
+ +| Method | Style | Lr schd | V1.x box AP | V1.x mask AP | V2.0 box AP | V2.0 mask AP | Config | Download | +| :-------------------------: | :-----: | :-----: | :---------: | :----------: | :---------: | :----------: | :-------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------: | +| Mask R-CNN R-50-FPN | pytorch | 1x | 37.3 | 34.2 | 36.8 | 33.9 | [config](./mask-rcnn_r50_fpn_1x_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/mask_rcnn_r50_fpn_1x_20181010-069fa190.pth) | +| RetinaNet R-50-FPN | caffe | 1x | 35.8 | - | 35.4 | - | [config](./retinanet_r50-caffe_fpn_1x_coco_v1.py) | | +| RetinaNet R-50-FPN | pytorch | 1x | 35.6 | - | 35.2 | - | [config](./retinanet_r50_fpn_1x_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/retinanet_r50_fpn_1x_20181125-7b0c2548.pth) | +| Cascade Mask R-CNN R-50-FPN | pytorch | 1x | 41.2 | 35.7 | 40.8 | 35.6 | [config](./cascade-mask-rcnn_r50_fpn_1x_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/cascade_mask_rcnn_r50_fpn_1x_20181123-88b170c9.pth) | +| SSD300-VGG16 | caffe | 120e | 25.7 | - | 25.4 | - | [config](./ssd300_coco_v1.py) | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/ssd300_coco_vgg16_caffe_120e_20181221-84d7110b.pth) | diff --git a/mmdetection/configs/legacy_1.x/cascade-mask-rcnn_r50_fpn_1x_coco_v1.py b/mmdetection/configs/legacy_1.x/cascade-mask-rcnn_r50_fpn_1x_coco_v1.py new file mode 100644 index 0000000..f948a7a --- /dev/null +++ b/mmdetection/configs/legacy_1.x/cascade-mask-rcnn_r50_fpn_1x_coco_v1.py @@ -0,0 +1,78 @@ +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='CascadeRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + anchor_generator=dict(type='LegacyAnchorGenerator', center_offset=0.5), + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0])), + roi_head=dict( + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + reg_class_agnostic=True, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2])), + dict( + type='Shared2FCBBoxHead', + reg_class_agnostic=True, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1])), + dict( + type='Shared2FCBBoxHead', + reg_class_agnostic=True, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 
0.067, 0.067])), + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=14, + sampling_ratio=2, + aligned=False)))) diff --git a/mmdetection/configs/legacy_1.x/faster-rcnn_r50_fpn_1x_coco_v1.py b/mmdetection/configs/legacy_1.x/faster-rcnn_r50_fpn_1x_coco_v1.py new file mode 100644 index 0000000..66bf971 --- /dev/null +++ b/mmdetection/configs/legacy_1.x/faster-rcnn_r50_fpn_1x_coco_v1.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='FasterRCNN', + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + rpn_head=dict( + type='RPNHead', + anchor_generator=dict( + type='LegacyAnchorGenerator', + center_offset=0.5, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn_proposal=dict(max_per_img=2000), + rcnn=dict(assigner=dict(match_low_quality=True)))) diff --git a/mmdetection/configs/legacy_1.x/mask-rcnn_r50_fpn_1x_coco_v1.py b/mmdetection/configs/legacy_1.x/mask-rcnn_r50_fpn_1x_coco_v1.py new file mode 100644 index 0000000..6908025 --- /dev/null +++ b/mmdetection/configs/legacy_1.x/mask-rcnn_r50_fpn_1x_coco_v1.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + rpn_head=dict( + anchor_generator=dict(type='LegacyAnchorGenerator', center_offset=0.5), + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=14, + sampling_ratio=2, + aligned=False)), + bbox_head=dict( + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + + # model training and testing settings + train_cfg=dict( + rpn_proposal=dict(max_per_img=2000), + rcnn=dict(assigner=dict(match_low_quality=True)))) diff --git a/mmdetection/configs/legacy_1.x/retinanet_r50-caffe_fpn_1x_coco_v1.py b/mmdetection/configs/legacy_1.x/retinanet_r50-caffe_fpn_1x_coco_v1.py new file mode 100644 index 0000000..49abc31 --- /dev/null +++ b/mmdetection/configs/legacy_1.x/retinanet_r50-caffe_fpn_1x_coco_v1.py @@ -0,0 +1,16 @@ +_base_ = './retinanet_r50_fpn_1x_coco_v1.py' +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + # use caffe img_norm + mean=[102.9801, 115.9465, 122.7717], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + 
init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet50_caffe'))) diff --git a/mmdetection/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py b/mmdetection/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py new file mode 100644 index 0000000..6198b97 --- /dev/null +++ b/mmdetection/configs/legacy_1.x/retinanet_r50_fpn_1x_coco_v1.py @@ -0,0 +1,17 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + bbox_head=dict( + type='RetinaHead', + anchor_generator=dict( + type='LegacyAnchorGenerator', + center_offset=0.5, + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0))) diff --git a/mmdetection/configs/legacy_1.x/ssd300_coco_v1.py b/mmdetection/configs/legacy_1.x/ssd300_coco_v1.py new file mode 100644 index 0000000..e5ffc63 --- /dev/null +++ b/mmdetection/configs/legacy_1.x/ssd300_coco_v1.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# model settings +input_size = 300 +model = dict( + bbox_head=dict( + type='SSDHead', + anchor_generator=dict( + type='LegacySSDAnchorGenerator', + scale_major=False, + input_size=input_size, + basesize_ratio_range=(0.15, 0.9), + strides=[8, 16, 32, 64, 100, 300], + ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]]), + bbox_coder=dict( + type='LegacyDeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]))) diff --git a/mmdetection/configs/libra_rcnn/README.md b/mmdetection/configs/libra_rcnn/README.md new file mode 100644 index 0000000..ee8015b --- /dev/null +++ b/mmdetection/configs/libra_rcnn/README.md @@ -0,0 +1,53 @@ +# Libra R-CNN + +> [Libra R-CNN: Towards Balanced Learning for Object Detection](https://arxiv.org/abs/1904.02701) + + + +## Abstract + +Compared with model architectures, the training process, which is also crucial to the success of detectors, has received relatively less attention in object detection. In this work, we carefully revisit the standard training practice of detectors, and find that the detection performance is often limited by the imbalance during the training process, which generally consists in three levels - sample level, feature level, and objective level. To mitigate the adverse effects caused thereby, we propose Libra R-CNN, a simple but effective framework towards balanced learning for object detection. It integrates three novel components: IoU-balanced sampling, balanced feature pyramid, and balanced L1 loss, respectively for reducing the imbalance at sample, feature, and objective level. Benefitted from the overall balanced design, Libra R-CNN significantly improves the detection performance. Without bells and whistles, it achieves 2.5 points and 2.0 points higher Average Precision (AP) than FPN Faster R-CNN and RetinaNet respectively on MSCOCO. + +Instance recognition is rapidly advanced along with the developments of various deep convolutional neural networks. Compared to the architectures of networks, the training process, which is also crucial to the success of detectors, has received relatively less attention. 
In this work, we carefully revisit the standard training practice of detectors, and find that the detection performance is often limited by the imbalance during the training process, which generally consists in three levels - sample level, feature level, and objective level. To mitigate the adverse effects caused thereby, we propose Libra R-CNN, a simple yet effective framework towards balanced learning for instance recognition. It integrates IoU-balanced sampling, balanced feature pyramid, and objective re-weighting, respectively for reducing the imbalance at sample, feature, and objective level. Extensive experiments conducted on MS COCO, LVIS and Pascal VOC datasets prove the effectiveness of the overall balanced design. + +
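The configs added below map the three components onto a standard Faster R-CNN config: the balanced feature pyramid is a `BFP` module appended to the FPN neck, the balanced L1 loss replaces the box-regression loss, and IoU-balanced sampling replaces the negative sampler of the R-CNN stage. The following is a trimmed sketch (keys unrelated to these components, and the `_delete_=True` inheritance markers used in the real configs, are omitted):

```python
# Trimmed sketch of the Libra R-CNN additions on top of Faster R-CNN.
model = dict(
    neck=[
        # standard FPN ...
        dict(type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256,
             num_outs=5),
        # ... followed by the Balanced Feature Pyramid refinement
        dict(type='BFP', in_channels=256, num_levels=5, refine_level=2,
             refine_type='non_local')
    ],
    roi_head=dict(
        bbox_head=dict(
            # balanced L1 loss for box regression
            loss_bbox=dict(type='BalancedL1Loss', alpha=0.5, gamma=1.5, beta=1.0,
                           loss_weight=1.0))),
    train_cfg=dict(
        rcnn=dict(
            sampler=dict(
                type='CombinedSampler',
                num=512,
                pos_fraction=0.25,
                add_gt_as_proposals=True,
                pos_sampler=dict(type='InstanceBalancedPosSampler'),
                # IoU-balanced sampling of negatives
                neg_sampler=dict(type='IoUBalancedNegSampler', floor_thr=-1,
                                 floor_fraction=0, num_bins=3)))))
```

The RetinaNet variant in this directory follows the same pattern in a single-stage setting, with `refine_level=1` in the BFP and `beta=0.11` in the balanced L1 loss.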
    + +## Results and Models + +The results on COCO 2017val are shown in the below table. (results on test-dev are usually slightly higher than val) + +| Architecture | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :----------: | :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN | R-50-FPN | pytorch | 1x | 4.6 | 19.0 | 38.3 | [config](./libra-faster-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| Fast R-CNN | R-50-FPN | pytorch | 1x | | | | | | +| Faster R-CNN | R-101-FPN | pytorch | 1x | 6.5 | 14.4 | 40.1 | [config](./libra-faster-rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203-8dba6a5a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203_001405.log.json) | +| Faster R-CNN | X-101-64x4d-FPN | pytorch | 1x | 10.8 | 8.5 | 42.7 | [config](./libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315-3a7d0488.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315_231625.log.json) | +| RetinaNet | R-50-FPN | pytorch | 1x | 4.2 | 17.7 | 37.6 | [config](./libra-retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205-804d94ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205_112757.log.json) | + +## Citation + +We provide config files to reproduce the results in the CVPR 2019 paper [Libra R-CNN](https://arxiv.org/pdf/1904.02701.pdf). + +The extended version of [Libra R-CNN](https://arxiv.org/pdf/2108.10175.pdf) is accpeted by IJCV. 
+ +```latex +@inproceedings{pang2019libra, + title={Libra R-CNN: Towards Balanced Learning for Object Detection}, + author={Pang, Jiangmiao and Chen, Kai and Shi, Jianping and Feng, Huajun and Ouyang, Wanli and Dahua Lin}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} + +@article{pang2021towards, + title={Towards Balanced Learning for Instance Recognition}, + author={Pang, Jiangmiao and Chen, Kai and Li, Qi and Xu, Zhihai and Feng, Huajun and Shi, Jianping and Ouyang, Wanli and Lin, Dahua}, + journal={International Journal of Computer Vision}, + volume={129}, + number={5}, + pages={1376--1393}, + year={2021}, + publisher={Springer} +} +``` diff --git a/mmdetection/configs/libra_rcnn/libra-fast-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/libra_rcnn/libra-fast-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..2efe440 --- /dev/null +++ b/mmdetection/configs/libra_rcnn/libra-fast-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,52 @@ +_base_ = '../fast_rcnn/fast-rcnn_r50_fpn_1x_coco.py' +# model settings +model = dict( + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + dict( + type='BFP', + in_channels=256, + num_levels=5, + refine_level=2, + refine_type='non_local') + ], + roi_head=dict( + bbox_head=dict( + loss_bbox=dict( + _delete_=True, + type='BalancedL1Loss', + alpha=0.5, + gamma=1.5, + beta=1.0, + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + sampler=dict( + _delete_=True, + type='CombinedSampler', + num=512, + pos_fraction=0.25, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict( + type='IoUBalancedNegSampler', + floor_thr=-1, + floor_fraction=0, + num_bins=3))))) + +# MMEngine support the following two ways, users can choose +# according to convenience +# _base_.train_dataloader.dataset.proposal_file = 'libra_proposals/rpn_r50_fpn_1x_train2017.pkl' # noqa +train_dataloader = dict( + dataset=dict(proposal_file='libra_proposals/rpn_r50_fpn_1x_train2017.pkl')) + +# _base_.val_dataloader.dataset.proposal_file = 'libra_proposals/rpn_r50_fpn_1x_val2017.pkl' # noqa +# test_dataloader = _base_.val_dataloader +val_dataloader = dict( + dataset=dict(proposal_file='libra_proposals/rpn_r50_fpn_1x_val2017.pkl')) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py b/mmdetection/configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..985df64 --- /dev/null +++ b/mmdetection/configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './libra-faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..f9ee507 --- /dev/null +++ b/mmdetection/configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,41 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +# model settings +model = dict( + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + dict( + type='BFP', + in_channels=256, + num_levels=5, + refine_level=2, + refine_type='non_local') + ], + roi_head=dict( + bbox_head=dict( + loss_bbox=dict( + _delete_=True, + type='BalancedL1Loss', + 
alpha=0.5, + gamma=1.5, + beta=1.0, + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict(sampler=dict(neg_pos_ub=5), allowed_border=-1), + rcnn=dict( + sampler=dict( + _delete_=True, + type='CombinedSampler', + num=512, + pos_fraction=0.25, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict( + type='IoUBalancedNegSampler', + floor_thr=-1, + floor_fraction=0, + num_bins=3))))) diff --git a/mmdetection/configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..158e238 --- /dev/null +++ b/mmdetection/configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './libra-faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py b/mmdetection/configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py new file mode 100644 index 0000000..be27420 --- /dev/null +++ b/mmdetection/configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,26 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' +# model settings +model = dict( + neck=[ + dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + dict( + type='BFP', + in_channels=256, + num_levels=5, + refine_level=1, + refine_type='non_local') + ], + bbox_head=dict( + loss_bbox=dict( + _delete_=True, + type='BalancedL1Loss', + alpha=0.5, + gamma=1.5, + beta=0.11, + loss_weight=1.0))) diff --git a/mmdetection/configs/libra_rcnn/metafile.yml b/mmdetection/configs/libra_rcnn/metafile.yml new file mode 100644 index 0000000..f01bd02 --- /dev/null +++ b/mmdetection/configs/libra_rcnn/metafile.yml @@ -0,0 +1,99 @@ +Collections: + - Name: Libra R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - IoU-Balanced Sampling + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Balanced Feature Pyramid + Paper: + URL: https://arxiv.org/abs/1904.02701 + Title: 'Libra R-CNN: Towards Balanced Learning for Object Detection' + README: configs/libra_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/bfp.py#L10 + Version: v2.0.0 + +Models: + - Name: libra-faster-rcnn_r50_fpn_1x_coco + In Collection: Libra R-CNN + Config: configs/libra_rcnn/libra-faster-rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.6 + inference time (ms/im): + - value: 52.63 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r50_fpn_1x_coco/libra_faster_rcnn_r50_fpn_1x_coco_20200130-3afee3a9.pth + + - Name: libra-faster-rcnn_r101_fpn_1x_coco + In Collection: Libra R-CNN + Config: configs/libra_rcnn/libra-faster-rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.5 + inference time (ms/im): + - value: 69.44 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 
1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_r101_fpn_1x_coco/libra_faster_rcnn_r101_fpn_1x_coco_20200203-8dba6a5a.pth + + - Name: libra-faster-rcnn_x101-64x4d_fpn_1x_coco + In Collection: Libra R-CNN + Config: configs/libra_rcnn/libra-faster-rcnn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.8 + inference time (ms/im): + - value: 117.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_faster_rcnn_x101_64x4d_fpn_1x_coco/libra_faster_rcnn_x101_64x4d_fpn_1x_coco_20200315-3a7d0488.pth + + - Name: libra-retinanet_r50_fpn_1x_coco + In Collection: Libra R-CNN + Config: configs/libra_rcnn/libra-retinanet_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + inference time (ms/im): + - value: 56.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/libra_rcnn/libra_retinanet_r50_fpn_1x_coco/libra_retinanet_r50_fpn_1x_coco_20200205-804d94ce.pth diff --git a/mmdetection/configs/lvis/README.md b/mmdetection/configs/lvis/README.md new file mode 100644 index 0000000..57aeda4 --- /dev/null +++ b/mmdetection/configs/lvis/README.md @@ -0,0 +1,56 @@ +# LVIS + +> [LVIS: A Dataset for Large Vocabulary Instance Segmentation](https://arxiv.org/abs/1908.03195) + + + +## Abstract + +Progress on object detection is enabled by datasets that focus the research community's attention on open challenges. This process led us from simple images to complex scenes and from bounding boxes to segmentation masks. In this work, we introduce LVIS (pronounced \`el-vis'): a new dataset for Large Vocabulary Instance Segmentation. We plan to collect ~2 million high-quality instance segmentation masks for over 1000 entry-level object categories in 164k images. Due to the Zipfian distribution of categories in natural images, LVIS naturally has a long tail of categories with few training samples. Given that state-of-the-art deep learning methods for object detection perform poorly in the low-sample regime, we believe that our dataset poses an important and exciting new scientific challenge. + +
    + +
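Because of this long tail of rare categories, the LVIS configs added below wrap the training set in a class-balanced (repeat-factor) sampler; the common settings that follow refer to its `1e-3` oversample threshold. A rough sketch of what such a wrapper looks like in an MMDetection-style config is shown here; the dataset class, data root, and annotation file names are illustrative assumptions rather than values copied from this patch.

```python
# Illustrative sketch of class-balanced oversampling for LVIS v1.
# Images containing categories rarer than oversample_thr (1e-3) are repeated.
train_dataloader = dict(
    dataset=dict(
        type='ClassBalancedDataset',
        oversample_thr=1e-3,
        dataset=dict(
            type='LVISV1Dataset',               # assumed dataset class name
            data_root='data/lvis_v1/',          # assumed directory layout
            ann_file='annotations/lvis_v1_train.json',
            data_prefix=dict(img=''))))
```

In this patch, the equivalent setting is expected to live in the `_base_` dataset configs (`lvis_v0.5_instance.py` and `lvis_v1_instance.py`) referenced by the files below.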
    + +## Common Setting + +- Please follow [install guide](../../docs/get_started.md#install-mmdetection) to install open-mmlab forked cocoapi first. + +- Run following scripts to install our forked lvis-api. + + ```shell + pip install git+https://github.com/lvis-dataset/lvis-api.git + ``` + +- All experiments use oversample strategy [here](../../docs/tutorials/customize_dataset.md#class-balanced-dataset) with oversample threshold `1e-3`. + +- The size of LVIS v0.5 is half of COCO, so schedule `2x` in LVIS is roughly the same iterations as `1x` in COCO. + +## Results and models of LVIS v0.5 + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 2x | - | - | 26.1 | 25.9 | [config](./mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis-dbd06831.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis_20200531_160435.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 27.1 | 27.0 | [config](./mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis-54582ee2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis_20200601_134748.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 26.7 | 26.9 | [config](./mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis-3cf55ea2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis_20200531_221749.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 26.4 | 26.0 | [config](./mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis-1c99a5ad.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis_20200601_194651.log.json) | + +## Results and models of LVIS v1 + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------: | 
:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 9.1 | - | 22.5 | 21.7 | [config](./mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1-aa78ac3d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_061305.log.json) | +| R-101-FPN | pytorch | 1x | 10.8 | - | 24.6 | 23.6 | [config](./mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1-ec55ce32.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_070959.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 11.8 | - | 26.7 | 25.5 | [config](./mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-ebbc5c81.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-20200829_071317.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 14.6 | - | 27.2 | 25.8 | [config](./mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-43d9edfe.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-20200830_060206.log.json) | + +## Citation + +```latex +@inproceedings{gupta2019lvis, + title={{LVIS}: A Dataset for Large Vocabulary Instance Segmentation}, + author={Gupta, Agrim and Dollar, Piotr and Girshick, Ross}, + booktitle={Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition}, + year={2019} +} +``` diff --git a/mmdetection/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py b/mmdetection/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py new file mode 100644 index 0000000..3994d75 --- /dev/null +++ b/mmdetection/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py b/mmdetection/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py new file mode 100644 index 0000000..ed8b363 --- /dev/null +++ b/mmdetection/configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py' +model = dict( 
+ backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py b/mmdetection/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py new file mode 100644 index 0000000..cdd3683 --- /dev/null +++ b/mmdetection/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py @@ -0,0 +1,13 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/lvis_v1_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict(num_classes=1203), mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) diff --git a/mmdetection/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py b/mmdetection/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py new file mode 100644 index 0000000..b36b6c1 --- /dev/null +++ b/mmdetection/configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py @@ -0,0 +1,13 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/lvis_v0.5_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict(num_classes=1230), mask_head=dict(num_classes=1230)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) diff --git a/mmdetection/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py b/mmdetection/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py new file mode 100644 index 0000000..9da3ab6 --- /dev/null +++ b/mmdetection/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py b/mmdetection/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py new file mode 100644 index 0000000..9a097c9 --- /dev/null +++ b/mmdetection/configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py b/mmdetection/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py new file mode 100644 index 0000000..b0819b3 --- /dev/null +++ b/mmdetection/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', 
checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py b/mmdetection/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py new file mode 100644 index 0000000..9d27200 --- /dev/null +++ b/mmdetection/configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/lvis/metafile.yml b/mmdetection/configs/lvis/metafile.yml new file mode 100644 index 0000000..f8def96 --- /dev/null +++ b/mmdetection/configs/lvis/metafile.yml @@ -0,0 +1,128 @@ +Models: + - Name: mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5 + In Collection: Mask R-CNN + Config: configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-2x_lvis-v0.5.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v0.5 + Metrics: + box AP: 26.1 + - Task: Instance Segmentation + Dataset: LVIS v0.5 + Metrics: + mask AP: 25.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_2x_lvis-dbd06831.pth + + - Name: mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5 + In Collection: Mask R-CNN + Config: configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-2x_lvis-v0.5.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v0.5 + Metrics: + box AP: 27.1 + - Task: Instance Segmentation + Dataset: LVIS v0.5 + Metrics: + mask AP: 27.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_2x_lvis-54582ee2.pth + + - Name: mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5 + In Collection: Mask R-CNN + Config: configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v0.5 + Metrics: + box AP: 26.7 + - Task: Instance Segmentation + Dataset: LVIS v0.5 + Metrics: + mask AP: 26.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_2x_lvis-3cf55ea2.pth + + - Name: mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5 + In Collection: Mask R-CNN + Config: configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-2x_lvis-v0.5.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v0.5 + Metrics: + box AP: 26.4 + - Task: Instance Segmentation + Dataset: LVIS v0.5 + Metrics: + mask AP: 26.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_2x_lvis-1c99a5ad.pth + + - Name: mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1 + In Collection: Mask R-CNN + Config: configs/lvis/mask-rcnn_r50_fpn_sample1e-3_ms-1x_lvis-v1.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 22.5 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 21.7 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r50_fpn_sample1e-3_mstrain_1x_lvis_v1-aa78ac3d.pth + + - Name: mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1 + In Collection: Mask R-CNN + Config: configs/lvis/mask-rcnn_r101_fpn_sample1e-3_ms-1x_lvis-v1.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 24.6 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 23.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_r101_fpn_sample1e-3_mstrain_1x_lvis_v1-ec55ce32.pth + + - Name: mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1 + In Collection: Mask R-CNN + Config: configs/lvis/mask-rcnn_x101-32x4d_fpn_sample1e-3_ms-1x_lvis-v1.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 26.7 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 25.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_32x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-ebbc5c81.pth + + - Name: mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1 + In Collection: Mask R-CNN + Config: configs/lvis/mask-rcnn_x101-64x4d_fpn_sample1e-3_ms-1x_lvis-v1.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 27.2 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 25.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/lvis/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1/mask_rcnn_x101_64x4d_fpn_sample1e-3_mstrain_1x_lvis_v1-43d9edfe.pth diff --git a/mmdetection/configs/mask2former/README.md b/mmdetection/configs/mask2former/README.md new file mode 100644 index 0000000..94b0821 --- /dev/null +++ b/mmdetection/configs/mask2former/README.md @@ -0,0 +1,76 @@ +# Mask2Former + +> [Masked-attention Mask Transformer for Universal Image Segmentation](http://arxiv.org/abs/2112.01527) + + + +## Abstract + +Image segmentation is about grouping pixels with different semantics, e.g., category or instance membership, where each choice of semantics defines a task. While only the semantics of each task differ, current research focuses on designing specialized architectures for each task. We present Masked-attention Mask Transformer (Mask2Former), a new architecture capable of addressing any image segmentation task (panoptic, instance or semantic). Its key components include masked attention, which extracts localized features by constraining cross-attention within predicted mask regions. In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K). + +
    + +
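The key mechanism described in the abstract, cross-attention restricted to each query's predicted mask region, can be illustrated with a short, self-contained sketch. This is a conceptual toy rather than code from this repository; the tensor shapes, the 0.5 binarization threshold, and the fallback for queries with empty masks follow the paper's description of masked attention.

```python
# Toy illustration of Mask2Former-style masked attention (not repo code).
import torch


def masked_cross_attention(queries, keys, values, mask_logits):
    """queries: (Q, C) object query embeddings.
    keys / values: (N, C) flattened pixel features, N = H * W.
    mask_logits: (Q, N) mask prediction from the previous decoder layer.
    """
    scale = queries.shape[-1] ** 0.5
    attn_logits = queries @ keys.t() / scale            # (Q, N)

    # Attend only inside each query's predicted foreground region.
    attn_mask = mask_logits.sigmoid() < 0.5             # True = blocked
    # Queries whose predicted mask is empty fall back to full attention,
    # mirroring the safeguard in the official implementation.
    attn_mask[attn_mask.all(dim=-1)] = False
    attn_logits = attn_logits.masked_fill(attn_mask, float('-inf'))

    attn = attn_logits.softmax(dim=-1)                  # (Q, N)
    return attn @ values                                 # (Q, C)


q = torch.randn(100, 256)
k = v = torch.randn(4096, 256)
out = masked_cross_attention(q, k, v, mask_logits=torch.randn(100, 4096))
print(out.shape)  # torch.Size([100, 256])
```

Restricting attention to the predicted foreground is what lets each query extract localized features instead of attending over the entire image.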
    + +## Introduction + +Mask2Former requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +| | | ├── instances_train2017.json +| | | ├── instances_val2017.json +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +## Results and Models + +### Panoptic segmentation + +| Backbone | style | Pretrain | Lr schd | Mem (GB) | Inf time (fps) | PQ | box mAP | mask mAP | Config | Download | +| -------- | ------- | ------------ | ------- | -------- | -------------- | ---- | ------- | -------- | ------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| R-50 | pytorch | ImageNet-1K | 50e | 13.9 | - | 52.0 | 44.5 | 41.8 | [config](./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic/mask2former_r50_8xb2-lsj-50e_coco-panoptic_20230118_125535-54df384a.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic/mask2former_r50_8xb2-lsj-50e_coco-panoptic_20230118_125535.log.json) | +| R-101 | pytorch | ImageNet-1K | 50e | 16.1 | - | 52.4 | 45.3 | 42.4 | [config](./mask2former_r101_8xb2-lsj-50e_coco-panoptic.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic/mask2former_r101_8xb2-lsj-50e_coco-panoptic_20220329_225104-c74d4d71.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco-panoptic/mask2former_r101_lsj_8x2_50e_coco-panoptic_20220329_225104.log.json) | +| Swin-T | - | ImageNet-1K | 50e | 15.9 | - | 53.4 | 46.3 | 43.4 | [config](./mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic_20220326_224553-3ec9e0ae.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco-panoptic_20220326_224553.log.json) | +| Swin-S | - | ImageNet-1K | 50e | 19.1 | - | 54.5 | 47.8 | 44.5 | [config](./mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic_20220329_225200-4a16ded7.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco-panoptic_20220329_225200.log.json) | +| Swin-B | - | ImageNet-1K | 50e | 26.0 | - | 55.1 | 48.2 | 44.9 | [config](./mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic_20220331_002244-8a651d82.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic/mask2former_swin-b-p4-w12-384_lsj_8x2_50e_coco-panoptic_20220331_002244.log.json) | +| Swin-B | - | ImageNet-21K | 50e | 25.8 | - | 56.3 | 50.0 | 46.3 | [config](./mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic_20220329_230021-05ec7315.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic/mask2former_swin-b-p4-w12-384-in21k_lsj_8x2_50e_coco-panoptic_20220329_230021.log.json) | +| Swin-L | - | ImageNet-21K | 100e | 21.1 | - | 57.6 | 52.2 | 48.5 | [config](./mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic_20220407_104949-82f8d28d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic/mask2former_swin-l-p4-w12-384-in21k_lsj_16x1_100e_coco-panoptic_20220407_104949.log.json) | + +### Instance segmentation + +| Backbone | style | Pretrain | Lr schd | Mem (GB) | Inf time (fps) | box mAP | mask mAP | Config | Download | +| -------- | ------- | ----------- | ------- | -------- | -------------- | ------- | -------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| R-50 | pytorch | ImageNet-1K | 50e | 13.7 | - | 45.7 | 42.9 | [config](./mask2former_r50_8xb2-lsj-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco/mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r50_lsj_8x2_50e_coco/mask2former_r50_lsj_8x2_50e_coco_20220506_191028.log.json) | +| R-101 | pytorch | ImageNet-1K | 50e | 15.5 | - | 46.7 | 44.0 | [config](./mask2former_r101_8xb2-lsj-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r101_8xb2-lsj-50e_coco/mask2former_r101_8xb2-lsj-50e_coco_20220426_100250-ecf181e2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_r101_lsj_8x2_50e_coco/mask2former_r101_lsj_8x2_50e_coco_20220426_100250.log.json) | +| 
Swin-T | - | ImageNet-1K | 50e | 15.3 | - | 47.7 | 44.7 | [config](./mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco_20220508_091649-01b0f990.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-t-p4-w7-224_lsj_8x2_50e_coco_20220508_091649.log.json) | +| Swin-S | - | ImageNet-1K | 50e | 18.8 | - | 49.3 | 46.1 | [config](./mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco_20220504_001756-c9d0c4f2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask2former/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco/mask2former_swin-s-p4-w7-224_lsj_8x2_50e_coco_20220504_001756.log.json) | + +### Note + +1. The performance is unstable. The `Mask2Former-R50-coco-panoptic` may fluctuate about 0.2 PQ. The models other than `Mask2Former-R50-coco-panoptic` were trained with mmdet 2.x and have been converted for mmdet 3.x. +2. We have trained the instance segmentation models many times (see more details in [PR 7571](https://github.com/open-mmlab/mmdetection/pull/7571)). The results of the trained models are relatively stable (+- 0.2), and have a certain gap (about 0.2 AP) in comparison with the results in the [paper](http://arxiv.org/abs/2112.01527). However, the performance of the model trained with the official code is unstable and may also be slightly lower than the reported results as mentioned in the [issue](https://github.com/facebookresearch/Mask2Former/issues/46). + +## Citation + +```latex +@article{cheng2021mask2former, + title={Masked-attention Mask Transformer for Universal Image Segmentation}, + author={Bowen Cheng and Ishan Misra and Alexander G. 
Schwing and Alexander Kirillov and Rohit Girdhar}, + journal={arXiv}, + year={2021} +} +``` diff --git a/mmdetection/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py b/mmdetection/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py new file mode 100644 index 0000000..66685a2 --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py @@ -0,0 +1,7 @@ +_base_ = './mask2former_r50_8xb2-lsj-50e_coco-panoptic.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py b/mmdetection/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py new file mode 100644 index 0000000..f4c2990 --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py @@ -0,0 +1,7 @@ +_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco.py'] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py b/mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py new file mode 100644 index 0000000..c53e981 --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py @@ -0,0 +1,251 @@ +_base_ = [ + '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' +] +image_size = (1024, 1024) +batch_augments = [ + dict( + type='BatchFixedSizePad', + size=image_size, + img_pad_value=0, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255) +] +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255, + batch_augments=batch_augments) + +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type='Mask2Former', + data_preprocessor=data_preprocessor, + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + dropout=0.0, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)))), + positional_encoding=dict(num_feats=128, normalize=True)), + enforce_decoder_input_project=False, + positional_encoding=dict(num_feats=128, normalize=True), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + 
num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True))), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='ClassificationCost', weight=2.0), + dict( + type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dict(type='DiceCost', weight=5.0, pred_act=True, eps=1.0) + ]), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=True, + # max_per_image is for instance segmentation. + max_per_image=100, + iou_thr=0.8, + # In Mask2Former's panoptic postprocessing, + # it will filter mask area where score is less than 0.5 . + filter_low_score=True), + init_cfg=None) + +# dataset settings +data_root = 'data/coco/' +train_pipeline = [ + dict( + type='LoadImageFromFile', + to_float32=True, + backend_args={{_base_.backend_args}}), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True, + backend_args={{_base_.backend_args}}), + dict(type='RandomFlip', prob=0.5), + # large scale jittering + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +val_evaluator = [ + dict( + type='CocoPanopticMetric', + ann_file=data_root + 'annotations/panoptic_val2017.json', + seg_prefix=data_root + 'annotations/panoptic_val2017/', + backend_args={{_base_.backend_args}}), + dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + backend_args={{_base_.backend_args}}) +] +test_evaluator = val_evaluator + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0), + clip_grad=dict(max_norm=0.01, norm_type=2)) + +# learning policy +max_iters = 368750 +param_scheduler = dict( + type='MultiStepLR', + begin=0, + end=max_iters, + by_epoch=False, + milestones=[327778, 355092], + gamma=0.1) + +# 
Before 365001th iteration, we do evaluation every 5000 iterations. +# After 365000th iteration, we do evaluation every 368750 iterations, +# which means that we do evaluation at the end of training. +interval = 5000 +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +train_cfg = dict( + type='IterBasedTrainLoop', + max_iters=max_iters, + val_interval=interval, + dynamic_intervals=dynamic_intervals) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + by_epoch=False, + save_last=True, + max_keep_ckpts=3, + interval=interval)) +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py b/mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py new file mode 100644 index 0000000..24a17f5 --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py @@ -0,0 +1,100 @@ +_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py'] + +num_things_classes = 80 +num_stuff_classes = 0 +num_classes = num_things_classes + num_stuff_classes +image_size = (1024, 1024) +batch_augments = [ + dict( + type='BatchFixedSizePad', + size=image_size, + img_pad_value=0, + pad_mask=True, + mask_pad_value=0, + pad_seg=False) +] +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=False, + batch_augments=batch_augments) +model = dict( + data_preprocessor=data_preprocessor, + panoptic_head=dict( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_cls=dict(class_weight=[1.0] * num_classes + [0.1])), + panoptic_fusion_head=dict( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes), + test_cfg=dict(panoptic_on=False)) + +# dataset settings +train_pipeline = [ + dict( + type='LoadImageFromFile', + to_float32=True, + backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', prob=0.5), + # large scale jittering + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + resize_type='Resize', + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=image_size, + crop_type='absolute', + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-5, 1e-5), by_mask=True), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + to_float32=True, + backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline)) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + 
ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args={{_base_.backend_args}}) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py b/mmdetection/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py new file mode 100644 index 0000000..b275f23 --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py @@ -0,0 +1,5 @@ +_base_ = ['./mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth' # noqa + +model = dict( + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/mmdetection/configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py b/mmdetection/configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py new file mode 100644 index 0000000..bd59400 --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py @@ -0,0 +1,42 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=depths, + num_heads=[4, 8, 16, 32], + window_size=12, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict(in_channels=[128, 256, 512, 1024])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/mmdetection/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py b/mmdetection/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py new file mode 100644 index 0000000..e203ffc --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py @@ -0,0 +1,25 @@ +_base_ = ['./mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py'] +pretrained = 
'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa + +model = dict( + backbone=dict( + embed_dims=192, + num_heads=[6, 12, 24, 48], + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict(num_queries=200, in_channels=[192, 384, 768, 1536])) + +train_dataloader = dict(batch_size=1, num_workers=1) + +# learning policy +max_iters = 737500 +param_scheduler = dict(end=max_iters, milestones=[655556, 710184]) + +# Before 735001th iteration, we do evaluation every 5000 iterations. +# After 735000th iteration, we do evaluation every 737500 iterations, +# which means that we do evaluation at the end of training.' +interval = 5000 +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +train_cfg = dict( + max_iters=max_iters, + val_interval=interval, + dynamic_intervals=dynamic_intervals) diff --git a/mmdetection/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py b/mmdetection/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py new file mode 100644 index 0000000..f9d081d --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/mmdetection/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py b/mmdetection/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py new file mode 100644 index 0000000..69d5e8c --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) 
+backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/mmdetection/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py b/mmdetection/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py new file mode 100644 index 0000000..1c00d7a --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py @@ -0,0 +1,58 @@ +_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco-panoptic.py'] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +depths = [2, 2, 6, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), + init_cfg=None) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) + +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/mmdetection/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py b/mmdetection/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py new file mode 100644 index 0000000..5bb9c21 --- /dev/null +++ b/mmdetection/configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py @@ -0,0 +1,56 @@ +_base_ = ['./mask2former_r50_8xb2-lsj-50e_coco.py'] +pretrained = 
'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa +depths = [2, 2, 6, 2] +model = dict( + type='Mask2Former', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + type='Mask2FormerHead', in_channels=[96, 192, 384, 768]), + init_cfg=None) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/mmdetection/configs/mask2former/metafile.yml b/mmdetection/configs/mask2former/metafile.yml new file mode 100644 index 0000000..3321239 --- /dev/null +++ b/mmdetection/configs/mask2former/metafile.yml @@ -0,0 +1,223 @@ +Collections: + - Name: Mask2Former + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - Mask2Former + Paper: + URL: https://arxiv.org/pdf/2112.01527 + Title: 'Masked-attention Mask Transformer for Universal Image Segmentation' + README: configs/mask2former/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.23.0/mmdet/models/detectors/mask2former.py#L7 + Version: v2.23.0 + +Models: +- Name: mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py + Metadata: + Training Memory (GB): 19.1 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.5 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 54.5 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco-panoptic_20220329_225200-4a16ded7.pth +- Name: mask2former_r101_8xb2-lsj-50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco.py + Metadata: + Training Memory (GB): 15.5 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.0 + Weights: 
https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r101_8xb2-lsj-50e_coco/mask2former_r101_8xb2-lsj-50e_coco_20220426_100250-ecf181e2.pth +- Name: mask2former_r101_8xb2-lsj-50e_coco-panoptic + In Collection: Mask2Former + Config: configs/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic.py + Metadata: + Training Memory (GB): 16.1 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.4 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 52.4 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r101_8xb2-lsj-50e_coco-panoptic/mask2former_r101_8xb2-lsj-50e_coco-panoptic_20220329_225104-c74d4d71.pth +- Name: mask2former_r50_8xb2-lsj-50e_coco-panoptic + In Collection: Mask2Former + Config: configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic.py + Metadata: + Training Memory (GB): 13.9 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.8 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 52.0 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco-panoptic/mask2former_r50_8xb2-lsj-50e_coco-panoptic_20230118_125535-54df384a.pth +- Name: mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic.py + Metadata: + Training Memory (GB): 15.9 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.4 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 53.4 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco-panoptic_20220326_224553-3ec9e0ae.pth +- Name: mask2former_r50_8xb2-lsj-50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_r50_8xb2-lsj-50e_coco.py + Metadata: + Training Memory (GB): 13.7 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_r50_8xb2-lsj-50e_coco/mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth +- Name: mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic.py + Metadata: + Training Memory (GB): 21.1 + Iterations: 737500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 48.5 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 57.6 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic/mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic_20220407_104949-82f8d28d.pth +- Name: mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic.py + 
Metadata: + Training Memory (GB): 25.8 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 46.3 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 56.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic/mask2former_swin-b-p4-w12-384-in21k_8xb2-lsj-50e_coco-panoptic_20220329_230021-05ec7315.pth +- Name: mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic.py + Metadata: + Training Memory (GB): 26.0 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.9 + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 55.1 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic/mask2former_swin-b-p4-w12-384_8xb2-lsj-50e_coco-panoptic_20220331_002244-8a651d82.pth +- Name: mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco.py + Metadata: + Training Memory (GB): 15.3 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.7 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco/mask2former_swin-t-p4-w7-224_8xb2-lsj-50e_coco_20220508_091649-01b0f990.pth +- Name: mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco + In Collection: Mask2Former + Config: configs/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco.py + Metadata: + Training Memory (GB): 18.8 + Iterations: 368750 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 46.1 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco/mask2former_swin-s-p4-w7-224_8xb2-lsj-50e_coco_20220504_001756-c9d0c4f2.pth diff --git a/mmdetection/configs/mask2former_vis/README.md b/mmdetection/configs/mask2former_vis/README.md new file mode 100644 index 0000000..6996572 --- /dev/null +++ b/mmdetection/configs/mask2former_vis/README.md @@ -0,0 +1,81 @@ +# Mask2Former for Video Instance Segmentation + +## Abstract + + + +We find Mask2Former also achieves state-of-the-art performance on video instance segmentation without modifying the architecture, the loss or even the training pipeline. In this report, we show universal image segmentation architectures trivially generalize to video segmentation by directly predicting 3D segmentation volumes. Specifically, Mask2Former sets a new state-of-the-art of 60.4 AP on YouTubeVIS-2019 and 52.6 AP on YouTubeVIS-2021. We believe Mask2Former is also capable of handling video semantic and panoptic segmentation, given its versatility in image segmentation. We hope this will make state-of-theart video segmentation research more accessible and bring more attention to designing universal image and video segmentation architectures. + + + +
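+
+The configs in this folder follow that recipe: the video models below essentially reuse the image architecture, swapping in a video detector and a tracking head and initializing from the corresponding COCO-trained image checkpoint. A rough sketch of the relevant fields (not a complete, runnable config on its own — see `mask2former_r50_8xb2-8e_youtubevis2019.py` below for the full version):
+
+```python
+# Sketch only: how the video variant maps onto the image model.
+model = dict(
+    type='Mask2FormerVideo',          # image configs use type='Mask2Former'
+    track_head=dict(                  # replaces the image model's panoptic_head
+        type='Mask2FormerTrackHead',
+        num_frames=2),
+    init_cfg=dict(
+        type='Pretrained',            # start from the COCO image checkpoint
+        checkpoint='https://download.openmmlab.com/mmdetection/v3.0/'
+        'mask2former/mask2former_r50_8xb2-lsj-50e_coco/'
+        'mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth'))
+```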
    + +## Citation + + + +```latex +@inproceedings{cheng2021mask2former, + title={Masked-attention Mask Transformer for Universal Image Segmentation}, + author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar}, + journal={CVPR}, + year={2022} +} +``` + +## Results and models of Mask2Former on YouTube-VIS 2021 validation dataset + +Note: Codalab has closed the evaluation portal of `YouTube-VIS 2019`, so we do not provide the results of `YouTube-VIS 2019` at present. If you want to evaluate the results of `YouTube-VIS 2021`, at present, you can submit the result to the evaluation portal of `YouTube-VIS 2022`. The value of `AP_S` is the result of `YouTube-VIS 2021`. + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AP | Config | Download | +| :----------------------: | :------: | :-----: | :-----: | :------: | :------------: | :--: | :---------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Mask2Former | R-50 | pytorch | 8e | 6.0 | - | 41.3 | [config](mask2former_r50_8xb2-8e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021/mask2former_r50_8xb2-8e_youtubevis2021_20230426_131833-5d215283.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021/mask2former_r50_8xb2-8e_youtubevis2021_20230426_131833.json) | +| Mask2Former | R-101 | pytorch | 8e | 7.5 | - | 42.3 | [config](mask2former_r101_8xb2-8e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021/mask2former_r101_8xb2-8e_youtubevis2021_20220823_092747-8077d115.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_r101_8xb2-8e_youtubevis2021_20220823_092747.json) | +| Mask2Former(200 queries) | Swin-L | pytorch | 8e | 18.5 | - | 52.3 | [config](mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021_20220907_124752-48252603.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/mask2former/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021_20220907_124752.json) | + +## Get started + +### 1. Development Environment Setup + +Tracking Development Environment Setup can refer to this [document](../../docs/en/get_started.md). + +### 2. Dataset Prepare + +Tracking Dataset Prepare can refer to this [document](../../docs/en/user_guides/tracking_dataset_prepare.md). + +### 3. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +```shell +# Training Mask2Former on YouTube-VIS-2021 dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. 
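+# (A single-GPU run such as: python tools/train.py <config-file>  also works for quick
+# smoke tests, but as noted above the default learning rate is tuned for 8 GPUs, so
+# accuracy may not be reproduced without adjusting the optimizer settings.)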
+bash tools/dist_train.sh configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 4. Testing and evaluation + +If you want to get the results of the [YouTube-VOS](https://youtube-vos.org/dataset/vis/) val/test set, please use the following command to generate result files that can be used for submission. It will be stored in `./youtube_vis_results.submission_file.zip`, you can modify the saved path in `test_evaluator` of the config. + +```shell +# The number after config file represents the number of GPUs used. +bash tools/dist_test_tracking.sh configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py --checkpoint ${CHECKPOINT_PATH} +``` + +If you want to know about more detailed usage of `test_tracking.py/dist_test_tracking.sh/slurm_test_tracking.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 5.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/mot_demo.py demo/demo_mot.mp4 configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py --checkpoint {CHECKPOINT_PATH} --out vis.mp4 +``` + +If you want to know about more detailed usage of `mot_demo.py`, please refer to this [document](../../docs/en/user_guides/tracking_inference.md). diff --git a/mmdetection/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2019.py b/mmdetection/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2019.py new file mode 100644 index 0000000..3ba4aea --- /dev/null +++ b/mmdetection/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2019.py @@ -0,0 +1,12 @@ +_base_ = './mask2former_r50_8xb2-8e_youtubevis2019.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'mask2former/mask2former_r101_8xb2-lsj-50e_coco/' + 'mask2former_r101_8xb2-lsj-50e_coco_20220426_100250-ecf181e2.pth')) diff --git a/mmdetection/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py b/mmdetection/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py new file mode 100644 index 0000000..95f9cee --- /dev/null +++ b/mmdetection/configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py @@ -0,0 +1,12 @@ +_base_ = './mask2former_r50_8xb2-8e_youtubevis2021.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'mask2former/mask2former_r101_8xb2-lsj-50e_coco/' + 'mask2former_r101_8xb2-lsj-50e_coco_20220426_100250-ecf181e2.pth')) diff --git a/mmdetection/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2019.py b/mmdetection/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2019.py new file mode 100644 index 0000000..8dc03bf --- /dev/null +++ b/mmdetection/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2019.py @@ -0,0 +1,174 @@ +_base_ = ['../_base_/datasets/youtube_vis.py', '../_base_/default_runtime.py'] + +num_classes = 40 +num_frames = 2 +model = dict( + type='Mask2FormerVideo', + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + 
bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + track_head=dict( + type='Mask2FormerTrackHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_frames=num_frames, + num_transformer_feat_level=3, + pixel_decoder=dict( + type='MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=128, + dropout=0.0, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True)))), + positional_encoding=dict(num_feats=128, normalize=True)), + enforce_decoder_input_project=False, + positional_encoding=dict( + type='SinePositionalEncoding3D', num_feats=128, normalize=True), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True))), + init_cfg=None), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='ClassificationCost', weight=2.0), + dict( + type='CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict(type='DiceCost', weight=5.0, pred_act=True, eps=1.0) + ]), + sampler=dict(type='MaskPseudoSampler'))), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmdetection/v3.0/' + 'mask2former/mask2former_r50_8xb2-lsj-50e_coco/' + 'mask2former_r50_8xb2-lsj-50e_coco_20220506_191028-41b088b6.pth')) + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0), + clip_grad=dict(max_norm=0.01, norm_type=2)) + +# learning policy +max_iters = 6000 +param_scheduler = dict( + type='MultiStepLR', + begin=0, + end=max_iters, + by_epoch=False, + milestones=[ + 4000, 
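+        # the learning rate is multiplied by gamma (0.1) once, after 4000 of the
+        # max_iters=6000 training iterations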
+ ], + gamma=0.1) +# runtime settings +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iters, val_interval=6001) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', by_epoch=False, save_last=True, interval=2000), + visualization=dict(type='TrackVisualizationHook', draw=False)) +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) + +# evaluator +val_evaluator = dict( + type='YouTubeVISMetric', + metric='youtube_vis_ap', + outfile_prefix='./youtube_vis_results', + format_only=True) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py b/mmdetection/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py new file mode 100644 index 0000000..158fe52 --- /dev/null +++ b/mmdetection/configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py @@ -0,0 +1,37 @@ +_base_ = './mask2former_r50_8xb2-8e_youtubevis2019.py' + +dataset_type = 'YouTubeVISDataset' +data_root = 'data/youtube_vis_2021/' +dataset_version = data_root[-5:-1] # 2019 or 2021 + +train_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_train.json')) + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_valid.json')) +test_dataloader = val_dataloader + +# learning policy +max_iters = 8000 +param_scheduler = dict( + type='MultiStepLR', + begin=0, + end=max_iters, + by_epoch=False, + milestones=[ + 5500, + ], + gamma=0.1) +# runtime settings +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iters, val_interval=8001) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', by_epoch=False, save_last=True, interval=500)) diff --git a/mmdetection/configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py b/mmdetection/configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py new file mode 100644 index 0000000..94dcccf --- /dev/null +++ b/mmdetection/configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py @@ -0,0 +1,64 @@ +_base_ = ['./mask2former_r50_8xb2-8e_youtubevis2021.py'] +depths = [2, 2, 18, 2] +model = dict( + type='Mask2FormerVideo', + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=depths, + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + frozen_stages=-1, + init_cfg=None), + track_head=dict( + type='Mask2FormerTrackHead', + in_channels=[192, 384, 768, 1536], + num_queries=200), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v3.0/mask2former/' + 'mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic/' + 'mask2former_swin-l-p4-w12-384-in21k_16xb1-lsj-100e_coco-panoptic_' + '20220407_104949-82f8d28d.pth')) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, 
decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/mmdetection/configs/mask2former_vis/metafile.yml b/mmdetection/configs/mask2former_vis/metafile.yml new file mode 100644 index 0000000..f5f4bd7 --- /dev/null +++ b/mmdetection/configs/mask2former_vis/metafile.yml @@ -0,0 +1,53 @@ +Collections: + - Name: Mask2Former + Metadata: + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - Mask2Former + Paper: + URL: https://arxiv.org/pdf/2112.10764.pdf + Title: Mask2Former for Video Instance Segmentation + README: configs/mask2former/README.md + +Models: + - Name: mask2former_r50_8xb2-8e_youtubevis2021 + In Collection: Mask2Former + Config: configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 6.0 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 41.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021/mask2former_r50_8xb2-8e_youtubevis2021_20230426_131833-5d215283.pth + + - Name: mask2former_r101_8xb2-8e_youtubevis2021 + In Collection: Mask2Former + Config: configs/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 7.5 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_r101_8xb2-8e_youtubevis2021/mask2former_r101_8xb2-8e_youtubevis2021_20220823_092747-8077d115.pth + + - Name: mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py + In Collection: Mask2Former + Config: configs/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 18.5 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 52.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/mask2former_vis/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021/mask2former_swin-l-p4-w12-384-in21k_8xb2-8e_youtubevis2021_20220907_124752-48252603.pth diff --git a/mmdetection/configs/mask_rcnn/README.md b/mmdetection/configs/mask_rcnn/README.md new file mode 100644 index 0000000..afc5c3c --- /dev/null +++ b/mmdetection/configs/mask_rcnn/README.md @@ -0,0 +1,59 @@ +# Mask R-CNN + +> [Mask R-CNN](https://arxiv.org/abs/1703.06870) + + + +## Abstract + +We present a conceptually simple, flexible, and general framework for object instance segmentation. 
Our approach efficiently detects objects in an image while simultaneously generating a high-quality segmentation mask for each instance. The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition. Mask R-CNN is simple to train and adds only a small overhead to Faster R-CNN, running at 5 fps. Moreover, Mask R-CNN is easy to generalize to other tasks, e.g., allowing us to estimate human poses in the same framework. We show top results in all three tracks of the COCO suite of challenges, including instance segmentation, bounding-box object detection, and person keypoint detection. Without bells and whistles, Mask R-CNN outperforms all existing, single-model entries on every task, including the COCO 2016 challenge winners. We hope our simple and effective approach will serve as a solid baseline and help ease future research in instance-level recognition. + +
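+
+As a quick sanity check, the sketch below runs one of the models from this folder through the high-level `mmdet.apis` interface (it assumes a working MMDetection installation; the demo image path is illustrative, and the config/checkpoint pair can be swapped for any row of the tables below):
+
+```python
+from mmdet.apis import init_detector, inference_detector
+
+# Build the detector from a config in this folder and the matching checkpoint
+# listed in the "Results and Models" table below.
+config_file = 'configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py'
+checkpoint_file = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth'
+model = init_detector(config_file, checkpoint_file, device='cuda:0')
+
+# Run inference on a single image; the result carries both boxes and instance masks.
+result = inference_detector(model, 'demo/demo.jpg')
+```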
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :---------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 1x | 4.3 | | 38.0 | 34.4 | [config](./mask-rcnn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.38__segm_mAP-0.344_20200504_231812-0ebd1859.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_20200504_231812.log.json) | +| R-50-FPN | pytorch | 1x | 4.4 | 16.1 | 38.2 | 34.7 | [config](./mask-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json) | +| R-50-FPN (FP16) | pytorch | 1x | 3.6 | 24.1 | 38.1 | 34.7 | [config](./mask-rcnn_r50_fpn_amp-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205-59faf7e4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205_130539.log.json) | +| R-50-FPN | pytorch | 2x | - | - | 39.2 | 35.4 | [config](./mask-rcnn_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_20200505_003907.log.json) | +| R-101-FPN | caffe | 1x | | | 40.4 | 36.4 | [config](./mask-rcnn_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758-805e06c1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758.log.json) | +| R-101-FPN | pytorch | 1x | 6.4 | 13.5 | 40.0 | 36.1 | [config](./mask-rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204_144809.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 40.8 | 36.6 | [config](./mask-rcnn_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_bbox_mAP-0.408__segm_mAP-0.366_20200505_071027-14b391c7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_20200505_071027.log.json) | +| X-101-32x4d-FPN | pytorch | 
1x | 7.6 | 11.3 | 41.9 | 37.5 | [config](./mask-rcnn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205_034906.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 42.2 | 37.8 | [config](./mask-rcnn_x101-32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.422__segm_mAP-0.378_20200506_004702-faef898c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_20200506_004702.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.7 | 8.0 | 42.8 | 38.4 | [config](./mask-rcnn_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201_124310.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 42.7 | 38.1 | [config](./mask-rcnn_x101-64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208-39d6f70c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208.log.json) | +| X-101-32x8d-FPN | pytorch | 1x | 10.6 | - | 42.8 | 38.3 | [config](./mask-rcnn_x101-32x8d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco/mask_rcnn_x101_32x8d_fpn_1x_coco_20220630_173841-0aaf329e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco/mask_rcnn_x101_32x8d_fpn_1x_coco_20220630_173841.log.json) | + +## Pre-trained Models + +We also train some models with longer schedules and multi-scale training. The users could finetune them for downstream tasks. 
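+
+For example, a fine-tuning config can simply inherit one of these configs and point `load_from` at the released checkpoint; the sketch below is illustrative only (the downstream dataset settings and the `num_classes` value are placeholders you would supply for your own data):
+
+```python
+_base_ = './mask-rcnn_r50_fpn_ms-poly-3x_coco.py'  # any config from the table below
+
+# Initialize from the released weights instead of training from scratch.
+load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth'
+
+# Adapt both heads to the number of classes in the downstream dataset (placeholder value).
+model = dict(
+    roi_head=dict(
+        bbox_head=dict(num_classes=10),
+        mask_head=dict(num_classes=10)))
+```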
+ +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :--------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [R-50-FPN](./mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py) | caffe | 2x | 4.3 | | 40.3 | 36.5 | [config](./mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_bbox_mAP-0.403__segm_mAP-0.365_20200504_231822-a75c98ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_20200504_231822.log.json) | +| [R-50-FPN](./mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py) | caffe | 3x | 4.3 | | 40.8 | 37.0 | [config](./mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_20200504_163245.log.json) | +| [R-50-FPN](./mask-rcnn_r50_fpn_ms-poly-3x_coco.py) | pytorch | 3x | 4.1 | | 40.9 | 37.1 | [config](./mask-rcnn_r50_fpn_ms-poly-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154.log.json) | +| [R-101-FPN](./mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py) | caffe | 3x | 5.9 | | 42.9 | 38.5 | [config](./mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339-3c33ce02.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339.log.json) | +| [R-101-FPN](./mask-rcnn_r101_fpn_ms-poly-3x_coco.py) | pytorch | 3x | 6.1 | | 42.7 | 38.5 | [config](./mask-rcnn_r101_fpn_ms-poly-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244-5675c317.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244.log.json) | +| [x101-32x4d-FPN](./mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py) | pytorch | 3x | 7.3 | | 43.6 | 39.0 | [config](./mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410-abcd7859.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410.log.json) | +| [X-101-32x8d-FPN](./mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py) | pytorch | 1x | 10.4 | | 43.4 | 39.0 | [config](./mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco_20220630_170346-b4637974.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco_20220630_170346.log.json) | +| [X-101-32x8d-FPN](./mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py) | pytorch | 3x | 10.3 | | 44.3 | 39.5 | [config](./mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042-8bd2c639.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042.log.json) | +| [X-101-64x4d-FPN](./mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py) | pytorch | 3x | 10.4 | | 44.5 | 39.7 | [config](./mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447-c376f129.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447.log.json) | + +## Citation + +```latex +@article{He_2017, + title={Mask R-CNN}, + journal={2017 IEEE International Conference on Computer Vision (ICCV)}, + publisher={IEEE}, + author={He, Kaiming and Gkioxari, Georgia and Dollar, Piotr and Girshick, Ross}, + year={2017}, + month={Oct} +} +``` diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..09808e4 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..e723aea --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py @@ -0,0 +1,19 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + depth=101, + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git 
a/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..af91ff0 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py new file mode 100644 index 0000000..a5599e7 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..4523510 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..384f6dc --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py @@ -0,0 +1,10 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..5b9219c --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r18_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py new file mode 100644 index 0000000..9919f11 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe-c4_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50-caffe-c4.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..4124f13 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git 
a/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py new file mode 100644 index 0000000..7702ae1 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py @@ -0,0 +1,28 @@ +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py new file mode 100644 index 0000000..94d94dd --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py @@ -0,0 +1,31 @@ +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py new file mode 100644 index 0000000..dbf87bb --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py' + +train_cfg = dict(max_epochs=24) +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..45260e2 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py @@ -0,0 +1,15 @@ +_base_ = './mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py' + +train_cfg = dict(max_epochs=36) +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git 
a/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py new file mode 100644 index 0000000..3baf001 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py @@ -0,0 +1,31 @@ +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + bbox_roi_extractor=dict( + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + roi_layer=dict( + type='RoIAlign', + output_size=14, + sampling_ratio=2, + aligned=False)))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_1x-wandb_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_1x-wandb_coco.py new file mode 100644 index 0000000..28b125c --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_1x-wandb_coco.py @@ -0,0 +1,16 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')] +visualizer = dict(vis_backends=vis_backends) + +# MMEngine support the following two ways, users can choose +# according to convenience +# default_hooks = dict(checkpoint=dict(interval=4)) +_base_.default_hooks.checkpoint.interval = 4 + +# train_cfg = dict(val_interval=2) +_base_.train_cfg.val_interval = 2 diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..0fc6b91 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py new file mode 100644 index 0000000..87cb8b4 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,5 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..7371b36 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,22 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../common/lsj-100e_coco-instance.py' +] +image_size = (1024, 1024) +batch_augments = [ + dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) +] + +model = dict(data_preprocessor=dict(batch_augments=batch_augments)) + +train_dataloader = dict(batch_size=8, num_workers=4) +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
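+# The base LR of 0.02 (used with a total batch size of 16 in the standard schedules) is
+# scaled 4x here because this LSJ recipe trains with 8 GPUs x 8 images = 64 images per
+# step; see `auto_scale_lr` below.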
+optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.02 * 4, momentum=0.9, weight_decay=0.00004)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py new file mode 100644 index 0000000..a139c48 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py @@ -0,0 +1,4 @@ +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' + +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict(type='AmpOptimWrapper') diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..417adc3 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py @@ -0,0 +1,4 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_poly-1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_poly-1x_coco.py new file mode 100644 index 0000000..826180c --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_r50_fpn_poly-1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs'), +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..921ade8 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_r101_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py new file mode 100644 index 0000000..db8157f --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_r101_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..83e5451 --- /dev/null +++ 
b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py new file mode 100644 index 0000000..3e9b1b6 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py @@ -0,0 +1,22 @@ +_base_ = './mask-rcnn_r101_fpn_1x_coco.py' + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py new file mode 100644 index 0000000..6ee204d --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py @@ -0,0 +1,40 @@ +_base_ = './mask-rcnn_r101_fpn_1x_coco.py' + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs'), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..999a30c --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py @@ -0,0 +1,25 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. 
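+    # (Caffe2-style preprocessing: pixel values stay in BGR order, hence
+    # bgr_to_rgb=False, and the mean/std differ from the torchvision RGB
+    # defaults used by the other configs in this folder.)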
+ data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..2cbb658 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_x101-32x4d_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py new file mode 100644 index 0000000..f21a55b --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './mask-rcnn_x101-32x4d_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py new file mode 100644 index 0000000..09b49d4 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/mask_rcnn/metafile.yml b/mmdetection/configs/mask_rcnn/metafile.yml new file mode 100644 index 0000000..ddf85c8 --- /dev/null +++ b/mmdetection/configs/mask_rcnn/metafile.yml @@ -0,0 +1,443 @@ +Collections: + - Name: Mask R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Softmax + - RPN + - Convolution + - Dense Connections + - FPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/1703.06870v3 + Title: "Mask R-CNN" + README: configs/mask_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: mask-rcnn_r50-caffe_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.4 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco/mask_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.38__segm_mAP-0.344_20200504_231812-0ebd1859.pth + + - Name: mask-rcnn_r50_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 62.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth + + - Name: mask-rcnn_r50_fpn_fp16_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r50_fpn_amp-1x_coco.py + Metadata: + Training Memory (GB): 3.6 + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + inference time (ms/im): + - value: 41.49 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP16 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/mask_rcnn_r50_fpn_fp16_1x_coco/mask_rcnn_r50_fpn_fp16_1x_coco_20200205-59faf7e4.pth + + - Name: mask-rcnn_r50_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py + Metadata: + Training Memory (GB): 4.4 + inference time (ms/im): + - value: 62.11 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth + + - Name: mask-rcnn_r101-caffe_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco/mask_rcnn_r101_caffe_fpn_1x_coco_20200601_095758-805e06c1.pth + + - Name: mask-rcnn_r101_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 74.07 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth + + - Name: mask-rcnn_r101_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 6.4 + inference time (ms/im): + - value: 74.07 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + 
Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_2x_coco/mask_rcnn_r101_fpn_2x_coco_bbox_mAP-0.408__segm_mAP-0.366_20200505_071027-14b391c7.pth + + - Name: mask-rcnn_x101-32x4d_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth + + - Name: mask-rcnn_x101-32x4d_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.6 + inference time (ms/im): + - value: 88.5 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco/mask_rcnn_x101_32x4d_fpn_2x_coco_bbox_mAP-0.422__segm_mAP-0.378_20200506_004702-faef898c.pth + + - Name: mask-rcnn_x101-64x4d_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.7 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth + + - Name: mask-rcnn_x101-64x4d_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 10.7 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco/mask_rcnn_x101_64x4d_fpn_2x_coco_20200509_224208-39d6f70c.pth + + - Name: mask-rcnn_x101-32x8d_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco/mask_rcnn_x101_32x8d_fpn_1x_coco_20220630_173841-0aaf329e.pth + + - Name: mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco + In Collection: Mask R-CNN + Config: 
configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-2x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_2x_coco_bbox_mAP-0.403__segm_mAP-0.365_20200504_231822-a75c98ce.pth + + - Name: mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth + + - Name: mask-rcnn_r50_fpn_mstrain-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r50_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 4.1 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_fpn_mstrain-poly_3x_coco_20210524_201154-21b550bb.pth + + - Name: mask-rcnn_r101_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r101_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 6.1 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_fpn_mstrain-poly_3x_coco_20210524_200244-5675c317.pth + + - Name: mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_r101-caffe_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 5.9 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r101_caffe_fpn_mstrain-poly_3x_coco_20210526_132339-3c33ce02.pth + + - Name: mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-32x4d_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 7.3 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x4d_fpn_mstrain-poly_3x_coco_20210524_201410-abcd7859.pth + + - Name: mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-1x_coco.py + Metadata: + Training Memory (GB): 10.4 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.4 + - Task: Instance Segmentation + Dataset: 
COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_1x_coco_20220630_170346-b4637974.pth + + - Name: mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-32x8d_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 10.3 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_32x8d_fpn_mstrain-poly_3x_coco_20210607_161042-8bd2c639.pth + + - Name: mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco + In Collection: Mask R-CNN + Config: configs/mask_rcnn/mask-rcnn_x101-64x4d_fpn_ms-poly_3x_coco.py + Metadata: + Epochs: 36 + Training Memory (GB): 10.4 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco/mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco_20210526_120447-c376f129.pth diff --git a/mmdetection/configs/maskformer/README.md b/mmdetection/configs/maskformer/README.md new file mode 100644 index 0000000..ca5ce32 --- /dev/null +++ b/mmdetection/configs/maskformer/README.md @@ -0,0 +1,58 @@ +# MaskFormer + +> [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) + + + +## Abstract + +Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models. + +
    + +
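To make the mask-classification idea in the abstract concrete, here is a minimal sketch (plain PyTorch with assumed shapes, not the mmdet `MaskFormerHead` API) of how per-query class probabilities and binary masks are combined into a semantic segmentation map; the class count matches the 80 thing + 53 stuff COCO panoptic setting used by the config in this diff:

```python
# Hedged sketch: mask classification -> semantic map, assuming random logits.
import torch

num_queries, num_classes, h, w = 100, 133, 64, 64        # 133 = 80 things + 53 stuff
cls_logits = torch.randn(num_queries, num_classes + 1)   # +1 "no object" class per query
mask_logits = torch.randn(num_queries, h, w)             # one binary mask per query

cls_prob = cls_logits.softmax(-1)[:, :-1]                # drop the "no object" column
mask_prob = mask_logits.sigmoid()

# Per-pixel class scores: marginalize over queries, then take the arg max.
semseg = torch.einsum('qc,qhw->chw', cls_prob, mask_prob)   # (num_classes, h, w)
semantic_map = semseg.argmax(dim=0)                         # (h, w) class indices
print(semantic_map.shape)
```

The same set of query predictions can instead be thresholded per query to produce instance or panoptic output, which is why a single model and loss covers all three tasks.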
    + +## Introduction + +MaskFormer requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | +| :------: | :-----: | :-----: | :------: | :------------: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :--------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | 75e | 16.2 | - | 46.757 | 80.297 | 57.176 | 50.829 | 81.125 | 61.798 | 40.610 | 79.048 | 50.199 | [config](./maskformer_r50_ms-16xb1-75e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/maskformer/maskformer_r50_ms-16xb1-75e_coco/maskformer_r50_ms-16xb1-75e_coco_20230116_095226-baacd858.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/maskformer/maskformer_r50_ms-16xb1-75e_coco/maskformer_r50_ms-16xb1-75e_coco_20230116_095226.log.json) | +| Swin-L | pytorch | 300e | 27.2 | - | 53.249 | 81.704 | 64.231 | 58.798 | 82.923 | 70.282 | 44.874 | 79.863 | 55.097 | [config](./maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco_20220326_221612-c63ab967.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/maskformer/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco/maskformer_swin-l-p4-w12_mstrain_64x1_300e_coco_20220326_221612.log.json) | + +### Note + +1. The `R-50` version was mentioned in Table XI, in paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527). +2. The models were trained with mmdet 2.x and have been converted for mmdet 3.x. + +## Citation + +```latex +@inproceedings{cheng2021maskformer, + title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, + author={Bowen Cheng and Alexander G. 
Schwing and Alexander Kirillov}, + journal={NeurIPS}, + year={2021} +} +``` diff --git a/mmdetection/configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py b/mmdetection/configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py new file mode 100644 index 0000000..784ee77 --- /dev/null +++ b/mmdetection/configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py @@ -0,0 +1,216 @@ +_base_ = [ + '../_base_/datasets/coco_panoptic.py', '../_base_/default_runtime.py' +] + +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255) + +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type='MaskFormer', + data_preprocessor=data_preprocessor, + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type='MaskFormerHead', + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + pixel_decoder=dict( + type='TransformerEncoderPixelDecoder', + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( # DetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True)))), + positional_encoding=dict(num_feats=128, normalize=True)), + enforce_decoder_input_project=False, + positional_encoding=dict(num_feats=128, normalize=True), + transformer_decoder=dict( # DetrTransformerDecoder + num_layers=6, + layer_cfg=dict( # DetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True))), + return_intermediate=True), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=20.0), + loss_dice=dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=1.0)), + panoptic_fusion_head=dict( + type='MaskFormerFusionHead', + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='ClassificationCost', weight=1.0), + dict(type='FocalLossCost', weight=20.0, binary_input=True), + dict(type='DiceCost', weight=1.0, pred_act=True, eps=1.0) + ]), + sampler=dict(type='MaskPseudoSampler')), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating 
semantic segmentation metric. + semantic_on=False, + instance_on=False, + # max_per_image is for instance segmentation. + max_per_image=100, + object_mask_thr=0.8, + iou_thr=0.8, + # In MaskFormer's panoptic postprocessing, + # it will not filter masks whose score is smaller than 0.5 . + filter_low_score=False), + init_cfg=None) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + batch_size=1, num_workers=1, dataset=dict(pipeline=train_pipeline)) + +val_dataloader = dict(batch_size=1, num_workers=1) + +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + eps=1e-8, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': dict(lr_mult=1.0, decay_mult=0.0) + }, + norm_decay_mult=0.0), + clip_grad=dict(max_norm=0.01, norm_type=2)) + +max_epochs = 75 + +# learning rate +param_scheduler = dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[50], + gamma=0.1) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (1 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py b/mmdetection/configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py new file mode 100644 index 0000000..9e4897f --- /dev/null +++ b/mmdetection/configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py @@ -0,0 +1,73 @@ +_base_ = './maskformer_r50_ms-16xb1-75e_coco.py' + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + patch_size=4, + window_size=12, + mlp_ratio=4, + depths=depths, + num_heads=[6, 12, 24, 48], + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + panoptic_head=dict( + in_channels=[192, 384, 768, 1536], # pass to pixel_decoder inside + pixel_decoder=dict( + _delete_=True, + type='PixelDecoder', + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU')), + enforce_decoder_input_project=True)) + +# optimizer + +# weight_decay = 0.01 +# norm_weight_decay = 0.0 +# embed_weight_decay = 0.0 +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +norm_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'norm': norm_multi, + 'absolute_pos_embed': embed_multi, + 'relative_position_bias_table': embed_multi, + 'query_embed': embed_multi +} + +optim_wrapper = dict( + optimizer=dict(lr=6e-5, weight_decay=0.01), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) + +max_epochs = 300 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[250], + gamma=0.1) +] + +train_cfg = dict(max_epochs=max_epochs) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (64 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/maskformer/metafile.yml b/mmdetection/configs/maskformer/metafile.yml new file mode 100644 index 0000000..fa58269 --- /dev/null +++ b/mmdetection/configs/maskformer/metafile.yml @@ -0,0 +1,43 @@ +Collections: + - Name: MaskFormer + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 16x V100 GPUs + Architecture: + - MaskFormer + Paper: + URL: https://arxiv.org/pdf/2107.06278 + Title: 'Per-Pixel Classification is Not All You Need for Semantic Segmentation' + README: configs/maskformer/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/mmdet/models/detectors/maskformer.py#L7 + Version: v2.22.0 + +Models: + - Name: maskformer_r50_ms-16xb1-75e_coco + In Collection: MaskFormer + Config: configs/maskformer/maskformer_r50_ms-16xb1-75e_coco.py + Metadata: + Training Memory (GB): 16.2 + Epochs: 75 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 46.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/maskformer/maskformer_r50_ms-16xb1-75e_coco/maskformer_r50_ms-16xb1-75e_coco_20230116_095226-baacd858.pth + - Name: maskformer_swin-l-p4-w12_64xb1-ms-300e_coco + In Collection: MaskFormer + Config: configs/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco.py + Metadata: + Training Memory (GB): 27.2 + Epochs: 300 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 53.2 + Weights: https://download.openmmlab.com/mmdetection/v3.0/maskformer/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco/maskformer_swin-l-p4-w12_64xb1-ms-300e_coco_20220326_221612-c63ab967.pth diff --git a/mmdetection/configs/masktrack_rcnn/README.md b/mmdetection/configs/masktrack_rcnn/README.md new file mode 100644 index 0000000..5cef692 --- /dev/null +++ b/mmdetection/configs/masktrack_rcnn/README.md @@ -0,0 +1,93 @@ +# Video Instance Segmentation + +## Abstract + + + +In this paper we present a new computer vision task, named video instance segmentation. The goal of this new task is simultaneous detection, segmentation and tracking of instances in videos. In words, it is the first time that the image instance segmentation problem is extended to the video domain. To facilitate research on this new task, we propose a large-scale benchmark called YouTube-VIS, which consists of 2883 high-resolution YouTube videos, a 40-category label set and 131k high-quality instance masks. In addition, we propose a novel algorithm called MaskTrack R-CNN for this task. Our new method introduces a new tracking branch to Mask R-CNN to jointly perform the detection, segmentation and tracking tasks simultaneously. Finally, we evaluate the proposed method and several strong baselines on our new dataset. Experimental results clearly demonstrate the advantages of the proposed algorithm and reveal insight for future improvement. We believe the video instance segmentation task will motivate the community along the line of research for video understanding. + + + +
    + +
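As a rough illustration of the tracking branch described above, the sketch below (plain PyTorch, hypothetical helper names, not the mmdet `MaskTrackRCNNTracker` implementation) associates each detection with an existing track by combining appearance similarity with detection score, box IoU, and label consistency, loosely mirroring the `det_score`/`iou`/`det_label` weighting that appears in the tracker config later in this diff:

```python
# Hedged sketch of detection-to-track association; weights are illustrative.
import torch

def match_detections_to_tracks(det_embeds, track_embeds, det_scores, ious, label_match,
                               w_score=1.0, w_iou=2.0, w_label=10.0):
    """det_embeds: (D, C), track_embeds: (T, C), ious/label_match: (D, T)."""
    sim = torch.einsum('dc,tc->dt', det_embeds, track_embeds)   # appearance similarity
    affinity = sim.log_softmax(dim=1)                            # normalize over tracks
    affinity = affinity + w_score * det_scores[:, None] \
                        + w_iou * ious + w_label * label_match
    return affinity.argmax(dim=1)                                # best track per detection

dets, tracks, dim = 4, 3, 8
ids = match_detections_to_tracks(torch.randn(dets, dim), torch.randn(tracks, dim),
                                 torch.rand(dets), torch.rand(dets, tracks),
                                 torch.randint(0, 2, (dets, tracks)).float())
print(ids)
```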
    + +## Citation + + + +```latex +@inproceedings{yang2019video, + title={Video instance segmentation}, + author={Yang, Linjie and Fan, Yuchen and Xu, Ning}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={5188--5197}, + year={2019} +} +``` + +## Results and models of MaskTrack R-CNN on YouTube-VIS 2019 validation dataset + +As mentioned in [Issues #6](https://github.com/youtubevos/MaskTrackRCNN/issues/6#issuecomment-502503505) in MaskTrack R-CNN, the result is kind of unstable for different trials, which ranges from 28 AP to 31 AP when using R-50-FPN as backbone. +The checkpoint provided below is the best one from two experiments. + +| Method | Base detector | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AP | Config | Download | +| :-------------: | :-----------: | :-------: | :-----: | :-----: | :------: | :------------: | :--: | :--------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| MaskTrack R-CNN | Mask R-CNN | R-50-FPN | pytorch | 12e | 1.61 | - | 30.2 | [config](masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830.log.json) | +| MaskTrack R-CNN | Mask R-CNN | R-101-FPN | pytorch | 12e | 2.27 | - | 32.2 | [config](masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038-454dc48b.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038.log.json) | +| MaskTrack R-CNN | Mask R-CNN | X-101-FPN | pytorch | 12e | 3.69 | - | 34.7 | [config](masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205-fff7a102.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205.log.json) | + +## Results and models of MaskTrack R-CNN on YouTube-VIS 2021 validation dataset + +The checkpoint provided below is the best one from two experiments. 
+ +| Method | Base detector | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AP | Config | Download | +| :-------------: | :-----------: | :-------: | :-----: | :-----: | :------: | :------------: | :--: | :--------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| MaskTrack R-CNN | Mask R-CNN | R-50-FPN | pytorch | 12e | 1.61 | - | 28.7 | [config](masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948-10da90d9.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948.log.json) | +| MaskTrack R-CNN | Mask R-CNN | R-101-FPN | pytorch | 12e | 2.27 | - | 31.3 | [config](masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509-3c49e4f3.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509.log.json) | +| MaskTrack R-CNN | Mask R-CNN | X-101-FPN | pytorch | 12e | 3.69 | - | 33.5 | [config](masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py) | [model](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021/masktrack_rcnn_x101_fpn_12e_youtubevis2021_20211026_095943-90831df4.pth) \| [log](https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021/masktrack_rcnn_x101_fpn_12e_youtubevis2021_20211026_095943.log.json) | + +## Get started + +### 1. Development Environment Setup + +Tracking Development Environment Setup can refer to this [document](../../docs/en/get_started.md). + +### 2. Dataset Prepare + +Tracking Dataset Prepare can refer to this [document](../../docs/en/user_guides/tracking_dataset_prepare.md). + +### 3. Training + +Due to the influence of parameters such as learning rate in default configuration file, we recommend using 8 GPUs for training in order to reproduce accuracy. You can use the following command to start the training. + +```shell +# Training MaskTrack R-CNN on YouTube-VIS-2021 dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +bash tools/dist_train.sh configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 4. Testing and evaluation + +If you want to get the results of the [YouTube-VOS](https://youtube-vos.org/dataset/vis/) val/test set, please use the following command to generate result files that can be used for submission. 
It will be stored in `./youtube_vis_results.submission_file.zip`, you can modify the saved path in `test_evaluator` of the config. + +```shell +# The number after config file represents the number of GPUs used. +bash tools/dist_test_tracking.sh configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py 8 --checkpoint ${CHECKPOINT_PATH} +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 5.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/mot_demo.py demo/demo_mot.mp4 configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py --checkpoint {CHECKPOINT_PATH} --out vis.mp4 +``` + +If you want to know about more detailed usage of `mot_demo.py`, please refer to this [document](../../docs/en/user_guides/tracking_inference.md). diff --git a/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py new file mode 100644 index 0000000..4be492d --- /dev/null +++ b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py @@ -0,0 +1,12 @@ +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] +model = dict( + detector=dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet101')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth' # noqa: E501 + ))) diff --git a/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py new file mode 100644 index 0000000..81bae4a --- /dev/null +++ b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py @@ -0,0 +1,28 @@ +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] +model = dict( + detector=dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet101')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth' # noqa: E501 + ))) + +data_root = 'data/youtube_vis_2021/' +dataset_version = data_root[-5:-1] + +# dataloader +train_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_train.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_valid.json')) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py new file mode 100644 index 0000000..db1be7b --- /dev/null +++ b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py @@ -0,0 +1,130 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/youtube_vis.py', '../_base_/default_runtime.py' +] + 
+detector = _base_.model +detector.pop('data_preprocessor') +detector.roi_head.bbox_head.update(dict(num_classes=40)) +detector.roi_head.mask_head.update(dict(num_classes=40)) +detector.train_cfg.rpn.sampler.update(dict(num=64)) +detector.train_cfg.rpn_proposal.update(dict(nms_pre=200, max_per_img=200)) +detector.train_cfg.rcnn.sampler.update(dict(num=128)) +detector.test_cfg.rpn.update(dict(nms_pre=200, max_per_img=200)) +detector.test_cfg.rcnn.update(dict(score_thr=0.01)) +detector['init_cfg'] = dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth' # noqa: E501 +) +del _base_.model + +model = dict( + type='MaskTrackRCNN', + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + detector=detector, + track_head=dict( + type='RoITrackHead', + roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + embed_head=dict( + type='RoIEmbedHead', + num_fcs=2, + roi_feat_size=7, + in_channels=256, + fc_out_channels=1024), + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=128, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + tracker=dict( + type='MaskTrackRCNNTracker', + match_weights=dict(det_score=1.0, iou=2.0, det_label=10.0), + num_frames_retain=20)) + +dataset_type = 'YouTubeVISDataset' +data_root = 'data/youtube_vis_2019/' +dataset_version = data_root[-5:-1] # 2019 or 2021 + +# train_dataloader +train_dataloader = dict( + _delete_=True, + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='TrackImgSampler'), # image-based sampling + batch_sampler=dict(type='TrackAspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2019_train.json', + data_prefix=dict(img_path='train/JPEGImages'), + pipeline=_base_.train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.00125, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3.0, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# visualizer +default_hooks = dict( + visualization=dict(type='TrackVisualizationHook', draw=False)) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# runtime settings +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=13) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# evaluator +val_evaluator = dict( + type='YouTubeVISMetric', + metric='youtube_vis_ap', + outfile_prefix='./youtube_vis_results', + format_only=True) +test_evaluator = val_evaluator + +del detector diff --git a/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py 
b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py new file mode 100644 index 0000000..47263d5 --- /dev/null +++ b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py @@ -0,0 +1,17 @@ +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] + +data_root = 'data/youtube_vis_2021/' +dataset_version = data_root[-5:-1] + +# dataloader +train_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_train.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_valid.json')) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py new file mode 100644 index 0000000..e7e3f11 --- /dev/null +++ b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py @@ -0,0 +1,16 @@ +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] +model = dict( + detector=dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://resnext101_64x4d')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth' # noqa: E501 + ))) diff --git a/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py new file mode 100644 index 0000000..ea4c8b9 --- /dev/null +++ b/mmdetection/configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py @@ -0,0 +1,32 @@ +_base_ = ['./masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py'] +model = dict( + detector=dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://resnext101_64x4d')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_64x4d_fpn_1x_coco/mask_rcnn_x101_64x4d_fpn_1x_coco_20200201-9352eb0d.pth' # noqa: E501 + ))) + +data_root = 'data/youtube_vis_2021/' +dataset_version = data_root[-5:-1] + +# dataloader +train_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_train.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + dataset_version=dataset_version, + ann_file='annotations/youtube_vis_2021_valid.json')) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/masktrack_rcnn/metafile.yml b/mmdetection/configs/masktrack_rcnn/metafile.yml new file mode 100644 index 0000000..7a1d71d --- /dev/null +++ b/mmdetection/configs/masktrack_rcnn/metafile.yml @@ -0,0 +1,91 @@ +Collections: + - Name: MaskTrack R-CNN + Metadata: + Training Techniques: + - SGD with Momentum + Training Resources: 8x TiTanXP GPUs + Architecture: + - ResNet + Paper: + URL: https://arxiv.org/pdf/1905.04804.pdf + Title: Video Instance Segmentation + README: configs/masktrack_rcnn/README.md + +Models: + - Name: 
masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019 + In Collection: MaskTrack R-CNN + Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2019.py + Metadata: + Training Data: YouTube-VIS 2019 + Training Memory (GB): 1.16 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2019 + Metrics: + AP: 30.2 + Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2019/masktrack_rcnn_r50_fpn_12e_youtubevis2019_20211022_194830-6ca6b91e.pth + + - Name: masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019 + In Collection: MaskTrack R-CNN + Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2019.py + Metadata: + Training Data: YouTube-VIS 2019 + Training Memory (GB): 2.27 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2019 + Metrics: + AP: 32.2 + Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2019/masktrack_rcnn_r101_fpn_12e_youtubevis2019_20211023_150038-454dc48b.pth + + - Name: masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019 + In Collection: MaskTrack R-CNN + Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2019.py + Metadata: + Training Data: YouTube-VIS 2019 + Training Memory (GB): 3.69 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2019 + Metrics: + AP: 34.7 + Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2019/masktrack_rcnn_x101_fpn_12e_youtubevis2019_20211023_153205-fff7a102.pth + + - Name: masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021 + In Collection: MaskTrack R-CNN + Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r50_fpn_8xb1-12e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 1.16 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 28.7 + Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r50_fpn_12e_youtubevis2021/masktrack_rcnn_r50_fpn_12e_youtubevis2021_20211026_044948-10da90d9.pth + + - Name: masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021 + In Collection: MaskTrack R-CNN + Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_r101_fpn_8xb1-12e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 2.27 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 31.3 + Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_r101_fpn_12e_youtubevis2021/masktrack_rcnn_r101_fpn_12e_youtubevis2021_20211026_045509-3c49e4f3.pth + + - Name: masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021 + In Collection: MaskTrack R-CNN + Config: configs/masktrack_rcnn/masktrack-rcnn_mask-rcnn_x101_fpn_8xb1-12e_youtubevis2021.py + Metadata: + Training Data: YouTube-VIS 2021 + Training Memory (GB): 3.69 + Results: + - Task: Video Instance Segmentation + Dataset: YouTube-VIS 2021 + Metrics: + AP: 33.5 + Weights: https://download.openmmlab.com/mmtracking/vis/masktrack_rcnn/masktrack_rcnn_x101_fpn_12e_youtubevis2021/masktrack_rcnn_x101_fpn_12e_youtubevis2021_20211026_095943-90831df4.pth diff --git a/mmdetection/configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py b/mmdetection/configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py new file mode 100644 index 0000000..d93e156 --- /dev/null +++ 
b/mmdetection/configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py @@ -0,0 +1,75 @@ +_base_ = '../common/ms-90k_coco.py' + +# model settings +model = dict( + type='Detectron2Wrapper', + bgr_to_rgb=False, + detector=dict( + # The settings in `d2_detector` will merged into default settings + # in detectron2. More details please refer to + # https://github.com/facebookresearch/detectron2/blob/main/detectron2/config/defaults.py # noqa + meta_architecture='GeneralizedRCNN', + # If you want to finetune the detector, you can use the + # checkpoint released by detectron2, for example: + # weights='detectron2://COCO-Detection/faster_rcnn_R_50_FPN_1x/137257794/model_final_b275ba.pkl' # noqa + weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl', + mask_on=False, + pixel_mean=[103.530, 116.280, 123.675], + pixel_std=[1.0, 1.0, 1.0], + backbone=dict(name='build_resnet_fpn_backbone', freeze_at=2), + resnets=dict( + depth=50, + out_features=['res2', 'res3', 'res4', 'res5'], + num_groups=1, + norm='FrozenBN'), + fpn=dict( + in_features=['res2', 'res3', 'res4', 'res5'], out_channels=256), + anchor_generator=dict( + name='DefaultAnchorGenerator', + sizes=[[32], [64], [128], [256], [512]], + aspect_ratios=[[0.5, 1.0, 2.0]], + angles=[[-90, 0, 90]]), + proposal_generator=dict(name='RPN'), + rpn=dict( + head_name='StandardRPNHead', + in_features=['p2', 'p3', 'p4', 'p5', 'p6'], + iou_thresholds=[0.3, 0.7], + iou_labels=[0, -1, 1], + batch_size_per_image=256, + positive_fraction=0.5, + bbox_reg_loss_type='smooth_l1', + bbox_reg_loss_weight=1.0, + bbox_reg_weights=(1.0, 1.0, 1.0, 1.0), + smooth_l1_beta=0.0, + loss_weight=1.0, + boundary_thresh=-1, + pre_nms_topk_train=2000, + post_nms_topk_train=1000, + pre_nms_topk_test=1000, + post_nms_topk_test=1000, + nms_thresh=0.7, + conv_dims=[-1]), + roi_heads=dict( + name='StandardROIHeads', + num_classes=80, + in_features=['p2', 'p3', 'p4', 'p5'], + iou_thresholds=[0.5], + iou_labels=[0, 1], + batch_size_per_image=512, + positive_fraction=0.25, + score_thresh_test=0.05, + nms_thresh_test=0.5, + proposal_append_gt=True), + roi_box_head=dict( + name='FastRCNNConvFCHead', + num_fc=2, + fc_dim=1024, + conv_dim=256, + pooler_type='ROIAlignV2', + pooler_resolution=7, + pooler_sampling_ratio=0, + bbox_reg_loss_type='smooth_l1', + bbox_reg_loss_weight=1.0, + bbox_reg_weights=(10.0, 10.0, 5.0, 5.0), + smooth_l1_beta=0.0, + cls_agnostic_bbox_reg=False))) diff --git a/mmdetection/configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py b/mmdetection/configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py new file mode 100644 index 0000000..c0919c4 --- /dev/null +++ b/mmdetection/configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py @@ -0,0 +1,83 @@ +_base_ = '../common/ms-poly-90k_coco-instance.py' + +# model settings +model = dict( + type='Detectron2Wrapper', + bgr_to_rgb=False, + detector=dict( + # The settings in `d2_detector` will merged into default settings + # in detectron2. 
More details please refer to + # https://github.com/facebookresearch/detectron2/blob/main/detectron2/config/defaults.py # noqa + meta_architecture='GeneralizedRCNN', + # If you want to finetune the detector, you can use the + # checkpoint released by detectron2, for example: + # weights='detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl' # noqa + weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl', + mask_on=True, + pixel_mean=[103.530, 116.280, 123.675], + pixel_std=[1.0, 1.0, 1.0], + backbone=dict(name='build_resnet_fpn_backbone', freeze_at=2), + resnets=dict( + depth=50, + out_features=['res2', 'res3', 'res4', 'res5'], + num_groups=1, + norm='FrozenBN'), + fpn=dict( + in_features=['res2', 'res3', 'res4', 'res5'], out_channels=256), + anchor_generator=dict( + name='DefaultAnchorGenerator', + sizes=[[32], [64], [128], [256], [512]], + aspect_ratios=[[0.5, 1.0, 2.0]], + angles=[[-90, 0, 90]]), + proposal_generator=dict(name='RPN'), + rpn=dict( + head_name='StandardRPNHead', + in_features=['p2', 'p3', 'p4', 'p5', 'p6'], + iou_thresholds=[0.3, 0.7], + iou_labels=[0, -1, 1], + batch_size_per_image=256, + positive_fraction=0.5, + bbox_reg_loss_type='smooth_l1', + bbox_reg_loss_weight=1.0, + bbox_reg_weights=(1.0, 1.0, 1.0, 1.0), + smooth_l1_beta=0.0, + loss_weight=1.0, + boundary_thresh=-1, + pre_nms_topk_train=2000, + post_nms_topk_train=1000, + pre_nms_topk_test=1000, + post_nms_topk_test=1000, + nms_thresh=0.7, + conv_dims=[-1]), + roi_heads=dict( + name='StandardROIHeads', + num_classes=80, + in_features=['p2', 'p3', 'p4', 'p5'], + iou_thresholds=[0.5], + iou_labels=[0, 1], + batch_size_per_image=512, + positive_fraction=0.25, + score_thresh_test=0.05, + nms_thresh_test=0.5, + proposal_append_gt=True), + roi_box_head=dict( + name='FastRCNNConvFCHead', + num_fc=2, + fc_dim=1024, + conv_dim=256, + pooler_type='ROIAlignV2', + pooler_resolution=7, + pooler_sampling_ratio=0, + bbox_reg_loss_type='smooth_l1', + bbox_reg_loss_weight=1.0, + bbox_reg_weights=(10.0, 10.0, 5.0, 5.0), + smooth_l1_beta=0.0, + cls_agnostic_bbox_reg=False), + roi_mask_head=dict( + name='MaskRCNNConvUpsampleHead', + conv_dim=256, + num_conv=4, + pooler_type='ROIAlignV2', + pooler_resolution=14, + pooler_sampling_ratio=0, + cls_agnostic_mask=False))) diff --git a/mmdetection/configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py b/mmdetection/configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py new file mode 100644 index 0000000..d3f7587 --- /dev/null +++ b/mmdetection/configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py @@ -0,0 +1,48 @@ +_base_ = '../common/ms-90k_coco.py' + +# model settings +model = dict( + type='Detectron2Wrapper', + bgr_to_rgb=False, + detector=dict( + # The settings in `d2_detector` will merged into default settings + # in detectron2. 
More details please refer to + # https://github.com/facebookresearch/detectron2/blob/main/detectron2/config/defaults.py # noqa + meta_architecture='RetinaNet', + # If you want to finetune the detector, you can use the + # checkpoint released by detectron2, for example: + # weights='detectron2://COCO-Detection/retinanet_R_50_FPN_1x/190397773/model_final_bfca0b.pkl' # noqa + weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl', + mask_on=False, + pixel_mean=[103.530, 116.280, 123.675], + pixel_std=[1.0, 1.0, 1.0], + backbone=dict(name='build_retinanet_resnet_fpn_backbone', freeze_at=2), + resnets=dict( + depth=50, + out_features=['res3', 'res4', 'res5'], + num_groups=1, + norm='FrozenBN'), + fpn=dict(in_features=['res3', 'res4', 'res5'], out_channels=256), + anchor_generator=dict( + name='DefaultAnchorGenerator', + sizes=[[x, x * 2**(1.0 / 3), x * 2**(2.0 / 3)] + for x in [32, 64, 128, 256, 512]], + aspect_ratios=[[0.5, 1.0, 2.0]], + angles=[[-90, 0, 90]]), + retinanet=dict( + num_classes=80, + in_features=['p3', 'p4', 'p5', 'p6', 'p7'], + num_convs=4, + iou_thresholds=[0.4, 0.5], + iou_labels=[0, -1, 1], + bbox_reg_weights=(1.0, 1.0, 1.0, 1.0), + bbox_reg_loss_type='smooth_l1', + smooth_l1_loss_beta=0.0, + focal_loss_gamma=2.0, + focal_loss_alpha=0.25, + prior_prob=0.01, + score_thresh_test=0.05, + topk_candidates_test=1000, + nms_thresh_test=0.5))) + +optim_wrapper = dict(optimizer=dict(lr=0.01)) diff --git a/mmdetection/configs/ms_rcnn/README.md b/mmdetection/configs/ms_rcnn/README.md new file mode 100644 index 0000000..abbec9b --- /dev/null +++ b/mmdetection/configs/ms_rcnn/README.md @@ -0,0 +1,36 @@ +# MS R-CNN + +> [Mask Scoring R-CNN](https://arxiv.org/abs/1903.00241) + + + +## Abstract + +Letting a deep network be aware of the quality of its own predictions is an interesting yet important problem. In the task of instance segmentation, the confidence of instance classification is used as mask quality score in most instance segmentation frameworks. However, the mask quality, quantified as the IoU between the instance mask and its ground truth, is usually not well correlated with classification score. In this paper, we study this problem and propose Mask Scoring R-CNN which contains a network block to learn the quality of the predicted instance masks. The proposed network block takes the instance feature and the corresponding predicted mask together to regress the mask IoU. The mask scoring strategy calibrates the misalignment between mask quality and mask score, and improves instance segmentation performance by prioritizing more accurate mask predictions during COCO AP evaluation. By extensive evaluations on the COCO dataset, Mask Scoring R-CNN brings consistent and noticeable gain with different models, and outperforms the state-of-the-art Mask R-CNN. We hope our simple and effective approach will provide a new direction for improving instance segmentation. + +
    + +
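For intuition, the following is a minimal sketch (plain PyTorch with simplified layer counts, not the mmdet `MaskIoUHead`) of the mask-scoring idea summarized above: a small head regresses the IoU of each predicted mask from the RoI feature and the mask itself, and the final mask score is the classification score multiplied by that predicted IoU rather than the classification score alone:

```python
# Hedged sketch of a mask-IoU regression head; shapes and layer sizes are assumptions.
import torch
import torch.nn as nn

class TinyMaskIoUHead(nn.Module):
    def __init__(self, in_channels=256, roi_size=14):
        super().__init__()
        self.convs = nn.Sequential(
            nn.Conv2d(in_channels + 1, in_channels, 3, padding=1), nn.ReLU(),
            nn.Conv2d(in_channels, in_channels, 3, stride=2, padding=1), nn.ReLU())
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_channels * (roi_size // 2) ** 2, 1024), nn.ReLU(),
            nn.Linear(1024, 1))

    def forward(self, roi_feat, mask_pred):
        # Concatenate the RoI feature with the predicted mask, then regress its IoU.
        x = torch.cat([roi_feat, mask_pred], dim=1)
        return self.fc(self.convs(x)).sigmoid().squeeze(-1)

head = TinyMaskIoUHead()
roi_feat = torch.randn(8, 256, 14, 14)       # pooled instance features
mask_pred = torch.rand(8, 1, 14, 14)         # predicted mask, resized to the RoI feature size
cls_score = torch.rand(8)                    # classification confidence
mask_score = cls_score * head(roi_feat, mask_pred)   # calibrated mask score
print(mask_score.shape)
```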
    + +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 1x | 4.5 | | 38.2 | 36.0 | [config](./ms-rcnn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848.log.json) | +| R-50-FPN | caffe | 2x | - | - | 38.8 | 36.3 | [config](./ms-rcnn_r50-caffe_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_bbox_mAP-0.388__segm_mAP-0.363_20200506_004738-ee87b137.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_20200506_004738.log.json) | +| R-101-FPN | caffe | 1x | 6.5 | | 40.4 | 37.6 | [config](./ms-rcnn_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.404__segm_mAP-0.376_20200506_004755-b9b12a37.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_20200506_004755.log.json) | +| R-101-FPN | caffe | 2x | - | - | 41.1 | 38.1 | [config](./ms-rcnn_r101-caffe_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_bbox_mAP-0.411__segm_mAP-0.381_20200506_011134-5f3cc74f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_20200506_011134.log.json) | +| R-X101-32x4d | pytorch | 2x | 7.9 | 11.0 | 41.8 | 38.7 | [config](./ms-rcnn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206-81fd1740.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206_100113.log.json) | +| R-X101-64x4d | pytorch | 1x | 11.0 | 8.0 | 43.0 | 39.5 | [config](./ms-rcnn_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206-86ba88d2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206_091744.log.json) | +| R-X101-64x4d | pytorch | 2x | 11.0 | 8.0 | 42.6 | 39.5 | [config](./ms-rcnn_x101-64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308-02a445e2.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308_012247.log.json) | + +## Citation + +```latex +@inproceedings{huang2019msrcnn, + title={Mask Scoring R-CNN}, + author={Zhaojin Huang and Lichao Huang and Yongchao Gong and Chang Huang and Xinggang Wang}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019}, +} +``` diff --git a/mmdetection/configs/ms_rcnn/metafile.yml b/mmdetection/configs/ms_rcnn/metafile.yml new file mode 100644 index 0000000..290f054 --- /dev/null +++ b/mmdetection/configs/ms_rcnn/metafile.yml @@ -0,0 +1,159 @@ +Collections: + - Name: Mask Scoring R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RPN + - FPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/1903.00241 + Title: 'Mask Scoring R-CNN' + README: configs/ms_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/mask_scoring_rcnn.py#L6 + Version: v2.0.0 + +Models: + - Name: ms-rcnn_r50-caffe_fpn_1x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_1x_coco/ms_rcnn_r50_caffe_fpn_1x_coco_20200702_180848-61c9355e.pth + + - Name: ms-rcnn_r50-caffe_fpn_2x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r50_caffe_fpn_2x_coco/ms_rcnn_r50_caffe_fpn_2x_coco_bbox_mAP-0.388__segm_mAP-0.363_20200506_004738-ee87b137.pth + + - Name: ms-rcnn_r101-caffe_fpn_1x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.5 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_1x_coco/ms_rcnn_r101_caffe_fpn_1x_coco_bbox_mAP-0.404__segm_mAP-0.376_20200506_004755-b9b12a37.pth + + - Name: ms-rcnn_r101-caffe_fpn_2x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_r101_caffe_fpn_2x_coco/ms_rcnn_r101_caffe_fpn_2x_coco_bbox_mAP-0.411__segm_mAP-0.381_20200506_011134-5f3cc74f.pth + + - Name: ms-rcnn_x101-32x4d_fpn_1x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.9 + inference time (ms/im): + - value: 90.91 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection 
+ Dataset: COCO + Metrics: + box AP: 41.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_32x4d_fpn_1x_coco/ms_rcnn_x101_32x4d_fpn_1x_coco_20200206-81fd1740.pth + + - Name: ms-rcnn_x101-64x4d_fpn_1x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 11.0 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_1x_coco/ms_rcnn_x101_64x4d_fpn_1x_coco_20200206-86ba88d2.pth + + - Name: ms-rcnn_x101-64x4d_fpn_2x_coco + In Collection: Mask Scoring R-CNN + Config: configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 11.0 + inference time (ms/im): + - value: 125 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ms_rcnn/ms_rcnn_x101_64x4d_fpn_2x_coco/ms_rcnn_x101_64x4d_fpn_2x_coco_20200308-02a445e2.pth diff --git a/mmdetection/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..2ff4f2d --- /dev/null +++ b/mmdetection/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './ms-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py b/mmdetection/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py new file mode 100644 index 0000000..54b29e4 --- /dev/null +++ b/mmdetection/configs/ms_rcnn/ms-rcnn_r101-caffe_fpn_2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './ms-rcnn_r101-caffe_fpn_1x_coco.py' +# learning policy +max_epochs = 24 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..e7fbc51 --- /dev/null +++ b/mmdetection/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50-caffe_fpn_1x_coco.py' +model = dict( + type='MaskScoringRCNN', + roi_head=dict( + type='MaskScoringRoIHead', + mask_iou_head=dict( + type='MaskIoUHead', + num_convs=4, + num_fcs=2, + roi_feat_size=14, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + num_classes=80)), + # model training and testing settings + train_cfg=dict(rcnn=dict(mask_thr_binary=0.5))) diff --git a/mmdetection/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py b/mmdetection/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py new file mode 100644 index 0000000..0334882 --- 
/dev/null +++ b/mmdetection/configs/ms_rcnn/ms-rcnn_r50-caffe_fpn_2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './ms-rcnn_r50-caffe_fpn_1x_coco.py' +# learning policy +max_epochs = 24 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/ms_rcnn/ms-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/ms_rcnn/ms-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..0ae47d1 --- /dev/null +++ b/mmdetection/configs/ms_rcnn/ms-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + type='MaskScoringRCNN', + roi_head=dict( + type='MaskScoringRoIHead', + mask_iou_head=dict( + type='MaskIoUHead', + num_convs=4, + num_fcs=2, + roi_feat_size=14, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + num_classes=80)), + # model training and testing settings + train_cfg=dict(rcnn=dict(mask_thr_binary=0.5))) diff --git a/mmdetection/configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..1a5d0d0 --- /dev/null +++ b/mmdetection/configs/ms_rcnn/ms-rcnn_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ms-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..1629007 --- /dev/null +++ b/mmdetection/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './ms-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py b/mmdetection/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py new file mode 100644 index 0000000..7aec187 --- /dev/null +++ b/mmdetection/configs/ms_rcnn/ms-rcnn_x101-64x4d_fpn_2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './ms-rcnn_x101-64x4d_fpn_1x_coco.py' +# learning policy +max_epochs = 24 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/nas_fcos/README.md b/mmdetection/configs/nas_fcos/README.md new file mode 100644 index 0000000..a0ec77c --- /dev/null +++ b/mmdetection/configs/nas_fcos/README.md @@ -0,0 +1,35 @@ +# NAS-FCOS + +> [NAS-FCOS: Fast Neural Architecture Search for Object Detection](https://arxiv.org/abs/1906.04423) + + + +## Abstract + +The success of deep neural networks relies on significant architecture engineering. 
Recently neural architecture search (NAS) has emerged as a promise to greatly reduce manual effort in network design by automatically searching for optimal architectures, although typically such algorithms need an excessive amount of computational resources, e.g., a few thousand GPU-days. To date, on challenging vision tasks such as object detection, NAS, especially fast versions of NAS, is less studied. Here we propose to search for the decoder structure of object detectors with search efficiency being taken into consideration. To be more specific, we aim to efficiently search for the feature pyramid network (FPN) as well as the prediction head of a simple anchor-free object detector, namely FCOS, using a tailored reinforcement learning paradigm. With carefully designed search space, search algorithms and strategies for evaluating network quality, we are able to efficiently search a top-performing detection architecture within 4 days using 8 V100 GPUs. The discovered architecture surpasses state-of-the-art object detection models (such as Faster R-CNN, RetinaNet and FCOS) by 1.5 to 3.5 points in AP on the COCO dataset, with comparable computation complexity and memory footprint, demonstrating the efficacy of the proposed NAS for object detection. + +
    + +## Results and Models + +| Head | Backbone | Style | GN-head | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :----------: | :------: | :---: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| NAS-FCOSHead | R-50 | caffe | Y | 1x | | | 39.4 | [config](./nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520.log.json) | +| FCOSHead | R-50 | caffe | Y | 1x | | | 38.5 | [config](./nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521-7fdcbce0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521.log.json) | + +**Notes:** + +- To be consistent with the author's implementation, we use 4 GPUs with 4 images/GPU. 
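The 4 GPUs x 4 images/GPU note above corresponds to an effective batch size of 16 per iteration. As a minimal sketch only (the child-config file name and the 8-GPU layout are assumptions, not part of this patch), the same effective batch size could be kept on a different GPU count by overriding `train_dataloader.batch_size` in the `_base_` style used throughout these configs:

```python
# Hypothetical child config, e.g. nas-fcos_r50-caffe_fpn_nashead-gn-head_8xb2-1x_coco.py
# (illustrative only; not part of this patch).
_base_ = './nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py'

# 8 GPUs x 2 images per GPU keeps the original 16-image effective batch,
# so the base learning rate does not need to be rescaled.
train_dataloader = dict(batch_size=2)

# Follows the `auto_scale_lr` convention used by other configs in this patch:
# the reference batch size against which the learning rate would be scaled.
auto_scale_lr = dict(base_batch_size=16)
```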
+ +## Citation + +```latex +@article{wang2019fcos, + title={Nas-fcos: Fast neural architecture search for object detection}, + author={Wang, Ning and Gao, Yang and Chen, Hao and Wang, Peng and Tian, Zhi and Shen, Chunhua}, + journal={arXiv preprint arXiv:1906.04423}, + year={2019} +} +``` diff --git a/mmdetection/configs/nas_fcos/metafile.yml b/mmdetection/configs/nas_fcos/metafile.yml new file mode 100644 index 0000000..02292a4 --- /dev/null +++ b/mmdetection/configs/nas_fcos/metafile.yml @@ -0,0 +1,44 @@ +Collections: + - Name: NAS-FCOS + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 4x V100 GPUs + Architecture: + - FPN + - NAS-FCOS + - ResNet + Paper: + URL: https://arxiv.org/abs/1906.04423 + Title: 'NAS-FCOS: Fast Neural Architecture Search for Object Detection' + README: configs/nas_fcos/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/detectors/nasfcos.py#L6 + Version: v2.1.0 + +Models: + - Name: nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco + In Collection: NAS-FCOS + Config: configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200520-1bdba3ce.pth + + - Name: nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco + In Collection: NAS-FCOS + Config: configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco_20200521-7fdcbce0.pth diff --git a/mmdetection/configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py b/mmdetection/configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py new file mode 100644 index 0000000..ba207c9 --- /dev/null +++ b/mmdetection/configs/nas_fcos/nas-fcos_r50-caffe_fpn_fcoshead-gn-head_4xb4-1x_coco.py @@ -0,0 +1,75 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='NASFCOS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False, eps=0), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='NASFCOS_FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=dict(type='BN'), + conv_cfg=dict(type='DCNv2', deform_groups=2)), + bbox_head=dict( + type='FCOSHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + norm_cfg=dict(type='GN', num_groups=32), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', 
use_sigmoid=True, loss_weight=1.0)), + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# dataset settings +train_dataloader = dict(batch_size=4, num_workers=2) + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.01), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) diff --git a/mmdetection/configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py b/mmdetection/configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py new file mode 100644 index 0000000..329f34c --- /dev/null +++ b/mmdetection/configs/nas_fcos/nas-fcos_r50-caffe_fpn_nashead-gn-head_4xb4-1x_coco.py @@ -0,0 +1,74 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='NASFCOS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False, eps=0), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='NASFCOS_FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=dict(type='BN'), + conv_cfg=dict(type='DCNv2', deform_groups=2)), + bbox_head=dict( + type='NASFCOSHead', + num_classes=80, + in_channels=256, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + norm_cfg=dict(type='GN', num_groups=32), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# dataset settings +train_dataloader = dict(batch_size=4, num_workers=2) + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.01), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.)) diff --git a/mmdetection/configs/nas_fpn/README.md b/mmdetection/configs/nas_fpn/README.md new file mode 100644 index 0000000..260ec47 --- /dev/null +++ b/mmdetection/configs/nas_fpn/README.md @@ -0,0 +1,36 @@ +# NAS-FPN + +> [NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection](https://arxiv.org/abs/1904.07392) + + + +## Abstract + +Current state-of-the-art convolutional architectures for object detection are manually designed. Here we aim to learn a better architecture of feature pyramid network for object detection. We adopt Neural Architecture Search and discover a new feature pyramid architecture in a novel scalable search space covering all cross-scale connections. The discovered architecture, named NAS-FPN, consists of a combination of top-down and bottom-up connections to fuse features across scales. 
NAS-FPN, combined with various backbone models in the RetinaNet framework, achieves better accuracy and latency tradeoff compared to state-of-the-art object detection models. NAS-FPN improves mobile detection accuracy by 2 AP compared to state-of-the-art SSDLite with MobileNetV2 model in \[32\] and achieves 48.3 AP which surpasses Mask R-CNN \[10\] detection accuracy with less computation time. + +
    + +## Results and Models + +We benchmark the new training schedule (crop training, large batch, unfrozen BN, 50 epochs) introduced in NAS-FPN. RetinaNet is used in the paper. + +| Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | 50e | 12.9 | 22.9 | 37.9 | [config](./retinanet_r50_fpn_crop640-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco-9b953d76.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco_20200529_095329.log.json) | +| R-50-NASFPN | 50e | 13.2 | 23.0 | 40.5 | [config](./retinanet_r50_nasfpn_crop640-50e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco_20200528_230008.log.json) | + +**Note**: We find that it is unstable to train NAS-FPN and there is a small chance that results can be 3% mAP lower. + +## Citation + +```latex +@inproceedings{ghiasi2019fpn, + title={Nas-fpn: Learning scalable feature pyramid architecture for object detection}, + author={Ghiasi, Golnaz and Lin, Tsung-Yi and Le, Quoc V}, + booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, + pages={7036--7045}, + year={2019} +} +``` diff --git a/mmdetection/configs/nas_fpn/metafile.yml b/mmdetection/configs/nas_fpn/metafile.yml new file mode 100644 index 0000000..aef0df6 --- /dev/null +++ b/mmdetection/configs/nas_fpn/metafile.yml @@ -0,0 +1,59 @@ +Collections: + - Name: NAS-FPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - NAS-FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.07392 + Title: 'NAS-FPN: Learning Scalable Feature Pyramid Architecture for Object Detection' + README: configs/nas_fpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/nas_fpn.py#L67 + Version: v2.0.0 + +Models: + - Name: retinanet_r50_fpn_crop640-50e_coco + In Collection: NAS-FPN + Config: configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py + Metadata: + Training Memory (GB): 12.9 + inference time (ms/im): + - value: 43.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_fpn_crop640_50e_coco/retinanet_r50_fpn_crop640_50e_coco-9b953d76.pth + + - Name: retinanet_r50_nasfpn_crop640-50e_coco + In Collection: NAS-FPN + Config: configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py + Metadata: + Training Memory (GB): 13.2 + inference time (ms/im): + - value: 43.48 + hardware: V100 + backend: PyTorch + batch size: 1 + 
mode: FP32 + resolution: (800, 1333) + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/nas_fpn/retinanet_r50_nasfpn_crop640_50e_coco/retinanet_r50_nasfpn_crop640_50e_coco-0ad1f644.pth diff --git a/mmdetection/configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py b/mmdetection/configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py new file mode 100644 index 0000000..11c34f6 --- /dev/null +++ b/mmdetection/configs/nas_fpn/retinanet_r50_fpn_crop640-50e_coco.py @@ -0,0 +1,78 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=64, + batch_augments=[dict(type='BatchFixedSizePad', size=(640, 640))]), + backbone=dict(norm_eval=False), + neck=dict( + relu_before_extra_convs=True, + no_norm_on_lateral=True, + norm_cfg=norm_cfg), + bbox_head=dict(type='RetinaSepBNHead', num_ins=5, norm_cfg=norm_cfg), + # training and testing settings + train_cfg=dict(assigner=dict(neg_iou_thr=0.5))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(640, 640), + ratio_range=(0.8, 1.2), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# training schedule for 50e +max_epochs = 50 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[30, 40], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001), + paramwise_cfg=dict(norm_decay_mult=0, bypass_duplicate=True)) + +env_cfg = dict(cudnn_benchmark=True) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py b/mmdetection/configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py new file mode 100644 index 0000000..a851b74 --- /dev/null +++ b/mmdetection/configs/nas_fpn/retinanet_r50_nasfpn_crop640-50e_coco.py @@ -0,0 +1,16 @@ +_base_ = './retinanet_r50_fpn_crop640-50e_coco.py' + +# model settings +model = dict( + # `pad_size_divisor=128` ensures the feature maps sizes + # in `NAS_FPN` won't mismatch. 
+ data_preprocessor=dict(pad_size_divisor=128), + neck=dict( + _delete_=True, + type='NASFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5, + stack_times=7, + start_level=1, + norm_cfg=dict(type='BN', requires_grad=True))) diff --git a/mmdetection/configs/objects365/README.md b/mmdetection/configs/objects365/README.md new file mode 100644 index 0000000..fca0dbf --- /dev/null +++ b/mmdetection/configs/objects365/README.md @@ -0,0 +1,102 @@ +# Objects365 Dataset + +> [Objects365 Dataset](https://openaccess.thecvf.com/content_ICCV_2019/papers/Shao_Objects365_A_Large-Scale_High-Quality_Dataset_for_Object_Detection_ICCV_2019_paper.pdf) + + + +## Abstract + + + +#### Objects365 Dataset V1 + +[Objects365 Dataset V1](http://www.objects365.org/overview.html) is a brand new dataset, +designed to spur object detection research with a focus on diverse objects in the Wild. +It has 365 object categories over 600K training images. More than 10 million, high-quality bounding boxes are manually labeled through a three-step, carefully designed annotation pipeline. It is the largest object detection dataset (with full annotation) so far and establishes a more challenging benchmark for the community. Objects365 can serve as a better feature learning dataset for localization-sensitive tasks like object detection +and semantic segmentation. + + + +
+
+#### Objects365 Dataset V2
+
+[Objects365 Dataset V2](http://www.objects365.org/overview.html) is based on the V1 release of the Objects365 dataset.
+Objects365 V2 annotates 365 object classes on over 1.8 million images, with more than 29 million bounding boxes in the training set, surpassing PASCAL VOC, ImageNet, and COCO in scale.
+Objects365 covers 11 super-categories (people, clothing, living room, bathroom, kitchen, office/medical, electrical appliances, transportation, food, animals, and sports/musical instruments), and each super-category contains dozens of subcategories.
+
+## Citation
+
+```
+@inproceedings{shao2019objects365,
+  title={Objects365: A large-scale, high-quality dataset for object detection},
+  author={Shao, Shuai and Li, Zeming and Zhang, Tianyuan and Peng, Chao and Yu, Gang and Zhang, Xiangyu and Li, Jing and Sun, Jian},
+  booktitle={Proceedings of the IEEE/CVF international conference on computer vision},
+  pages={8430--8439},
+  year={2019}
+}
+```
+
+## Prepare Dataset
+
+1. You need to download and extract the Objects365 dataset. Users can download Objects365 V2 by using `tools/misc/download_dataset.py`.
+
+   **Usage**
+
+   ```shell
+   python tools/misc/download_dataset.py --dataset-name objects365v2 \
+       --save-dir ${SAVING PATH} \
+       --unzip \
+       --delete  # Optional, delete the downloaded zip file
+   ```
+
+   **Note:** There is no download link for Objects365 V1 right now. If you would like to download Objects365 V1, please visit the [official website](http://www.objects365.org/) to contact the authors.
+
+2. The directory structure should look like this:
+
+   ```none
+   mmdetection
+   ├── mmdet
+   ├── tools
+   ├── configs
+   ├── data
+   │   ├── Objects365
+   │   │   ├── Obj365_v1
+   │   │   │   ├── annotations
+   │   │   │   │   ├── objects365_train.json
+   │   │   │   │   ├── objects365_val.json
+   │   │   │   ├── train        # training images
+   │   │   │   ├── val          # validation images
+   │   │   ├── Obj365_v2
+   │   │   │   ├── annotations
+   │   │   │   │   ├── zhiyuan_objv2_train.json
+   │   │   │   │   ├── zhiyuan_objv2_val.json
+   │   │   │   ├── train        # training images
+   │   │   │   │   ├── patch0
+   │   │   │   │   ├── patch1
+   │   │   │   │   ├── ...
+   │   │   │   ├── val          # validation images
+   │   │   │   │   ├── patch0
+   │   │   │   │   ├── patch1
+   │   │   │   │   ├── ...
+ ``` + +## Results and Models + +### Objects365 V1 + +| Architecture | Backbone | Style | Lr schd | Mem (GB) | box AP | Config | Download | +| :----------: | :------: | :-----: | :-----: | :------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN | R-50 | pytorch | 1x | - | 19.6 | [config](https://github.com/open-mmlab/mmdetection/tree/main/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1/faster_rcnn_r50_fpn_16x4_1x_obj365v1_20221219_181226-9ff10f95.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1/faster_rcnn_r50_fpn_16x4_1x_obj365v1_20221219_181226.log.json) | +| Faster R-CNN | R-50 | pytorch | 1350K | - | 22.3 | [config](https://github.com/open-mmlab/mmdetection/tree/main/configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1_20220510_142457-337d8965.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1_20220510_142457.log.json) | +| Retinanet | R-50 | pytorch | 1x | - | 14.8 | [config](https://github.com/open-mmlab/mmdetection/tree/main/configs/objects365/retinanet_r50_fpn_1x_objects365v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v1/retinanet_r50_fpn_1x_obj365v1_20221219_181859-ba3e3dd5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v1/retinanet_r50_fpn_1x_obj365v1_20221219_181859.log.json) | +| Retinanet | R-50 | pytorch | 1350K | - | 18.0 | [config](https://github.com/open-mmlab/mmdetection/tree/main/configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1/retinanet_r50_fpn_syncbn_1350k_obj365v1_20220513_111237-7517c576.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1/retinanet_r50_fpn_syncbn_1350k_obj365v1_20220513_111237.log.json) | + +### Objects365 V2 + +| Architecture | Backbone | Style | Lr schd | Mem (GB) | box AP | Config | Download | +| :----------: | :------: | :-----: | :-----: | :------: | :----: | :---------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN | R-50 | pytorch | 1x | - | 19.8 | 
[config](https://github.com/open-mmlab/mmdetection/tree/main/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2/faster_rcnn_r50_fpn_16x4_1x_obj365v2_20221220_175040-5910b015.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2/faster_rcnn_r50_fpn_16x4_1x_obj365v2_20221220_175040.log.json) | +| Retinanet | R-50 | pytorch | 1x | - | 16.7 | [config](https://github.com/open-mmlab/mmdetection/tree/main/configs/objects365/retinanet_r50_fpn_1x_objects365v2.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v2/retinanet_r50_fpn_1x_obj365v2_20221223_122105-d9b191f1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v2/retinanet_r50_fpn_1x_obj365v2_20221223_122105.log.json) | diff --git a/mmdetection/configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py b/mmdetection/configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py new file mode 100644 index 0000000..ff7d0a3 --- /dev/null +++ b/mmdetection/configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py @@ -0,0 +1,49 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/objects365v2_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict(norm_cfg=dict(type='SyncBN', requires_grad=True)), + roi_head=dict(bbox_head=dict(num_classes=365))) + +# training schedule for 1350K +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=1350000, # 36 epochs + val_interval=150000) + +# Using 8 GPUS while training +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=1350000, + by_epoch=False, + milestones=[900000, 1200000], + gamma=0.1) +] + +train_dataloader = dict(sampler=dict(type='InfiniteSampler')) +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=150000)) + +log_processor = dict(by_epoch=False) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py b/mmdetection/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py new file mode 100644 index 0000000..bc0d96f --- /dev/null +++ b/mmdetection/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/objects365v1_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(roi_head=dict(bbox_head=dict(num_classes=365))) + +train_dataloader = dict( + batch_size=4, # using 16 GPUS while training. 
total batch size is 16 x 4) +) + +# Using 32 GPUS while training +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py b/mmdetection/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py new file mode 100644 index 0000000..1090678 --- /dev/null +++ b/mmdetection/configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/objects365v2_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(roi_head=dict(bbox_head=dict(num_classes=365))) + +train_dataloader = dict( + batch_size=4, # using 16 GPUS while training. total batch size is 16 x 4) +) + +# Using 32 GPUS while training +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/objects365/metafile.yml b/mmdetection/configs/objects365/metafile.yml new file mode 100644 index 0000000..d43e8bd --- /dev/null +++ b/mmdetection/configs/objects365/metafile.yml @@ -0,0 +1,101 @@ +- Name: retinanet_r50_fpn_1x_objects365v1 + In Collection: RetinaNet + Config: configs/objects365/retinanet_r50_fpn_1x_objects365v1.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 12 + Training Data: Objects365 v1 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Objects365 v1 + Metrics: + box AP: 14.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v1/retinanet_r50_fpn_1x_obj365v1_20221219_181859-ba3e3dd5.pth + +- Name: retinanet_r50-syncbn_fpn_1350k_objects365v1 + In Collection: RetinaNet + Config: configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py + Metadata: + Training Memory (GB): 7.6 + Iterations: 1350000 + Training Data: Objects365 v1 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Objects365 v1 + Metrics: + box AP: 18.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_syncbn_1350k_obj365v1/retinanet_r50_fpn_syncbn_1350k_obj365v1_20220513_111237-7517c576.pth + +- Name: retinanet_r50_fpn_1x_objects365v2 + In Collection: RetinaNet + Config: configs/objects365/retinanet_r50_fpn_1x_objects365v2.py + Metadata: + Training Memory (GB): 7.2 + Epochs: 12 + Training Data: Objects365 v2 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Objects365 v2 + Metrics: + box AP: 16.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/retinanet_r50_fpn_1x_obj365v2/retinanet_r50_fpn_1x_obj365v2_20221223_122105-d9b191f1.pth + +- Name: faster-rcnn_r50_fpn_16xb4-1x_objects365v1 + In Collection: Faster R-CNN + Config: configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v1.py + Metadata: + Training Memory (GB): 11.4 + Epochs: 12 + Training Data: Objects365 v1 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Objects365 v1 + Metrics: + box AP: 19.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v1/faster_rcnn_r50_fpn_16x4_1x_obj365v1_20221219_181226-9ff10f95.pth + +- Name: faster-rcnn_r50-syncbn_fpn_1350k_objects365v1 + In Collection: Faster R-CNN + Config: configs/objects365/faster-rcnn_r50-syncbn_fpn_1350k_objects365v1.py + Metadata: + Training Memory (GB): 8.6 + Iterations: 1350000 + Training Data: Objects365 v1 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Objects365 v1 + Metrics: + box AP: 22.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1/faster_rcnn_r50_fpn_syncbn_1350k_obj365v1_20220510_142457-337d8965.pth + +- Name: faster-rcnn_r50_fpn_16xb4-1x_objects365v2 + In Collection: Faster R-CNN + Config: configs/objects365/faster-rcnn_r50_fpn_16xb4-1x_objects365v2.py + Metadata: + Training Memory (GB): 10.8 + Epochs: 12 + Training Data: Objects365 v1 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Objects365 v2 + Metrics: + box AP: 19.8 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/objects365/faster_rcnn_r50_fpn_16x4_1x_obj365v2/faster_rcnn_r50_fpn_16x4_1x_obj365v2_20221220_175040-5910b015.pth diff --git a/mmdetection/configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py b/mmdetection/configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py new file mode 100644 index 0000000..c41dfce --- /dev/null +++ b/mmdetection/configs/objects365/retinanet_r50-syncbn_fpn_1350k_objects365v1.py @@ -0,0 +1,49 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/objects365v2_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict(norm_cfg=dict(type='SyncBN', requires_grad=True)), + bbox_head=dict(num_classes=365)) + +# training schedule for 1350K +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=1350000, # 36 epochs + val_interval=150000) + +# Using 8 GPUS while training +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=10000), + dict( + type='MultiStepLR', + begin=0, + end=1350000, + by_epoch=False, + milestones=[900000, 1200000], + gamma=0.1) +] + +train_dataloader = dict(sampler=dict(type='InfiniteSampler')) +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=150000)) + +log_processor = dict(by_epoch=False) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/objects365/retinanet_r50_fpn_1x_objects365v1.py b/mmdetection/configs/objects365/retinanet_r50_fpn_1x_objects365v1.py new file mode 100644 index 0000000..7214419 --- /dev/null +++ b/mmdetection/configs/objects365/retinanet_r50_fpn_1x_objects365v1.py @@ -0,0 +1,35 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/objects365v1_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(bbox_head=dict(num_classes=365)) + +# Using 8 GPUS while training +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=10000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/objects365/retinanet_r50_fpn_1x_objects365v2.py b/mmdetection/configs/objects365/retinanet_r50_fpn_1x_objects365v2.py new file mode 100644 index 0000000..2195441 --- /dev/null +++ b/mmdetection/configs/objects365/retinanet_r50_fpn_1x_objects365v2.py @@ -0,0 +1,35 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/objects365v2_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(bbox_head=dict(num_classes=365)) + +# Using 8 GPUS while training +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=10000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/ocsort/README.md b/mmdetection/configs/ocsort/README.md new file mode 100644 index 0000000..e9b86c6 --- /dev/null +++ b/mmdetection/configs/ocsort/README.md @@ -0,0 +1,56 @@ +# Observation-Centric SORT: Rethinking SORT for Robust Multi-Object Tracking + +## Abstract + + + +Multi-Object Tracking (MOT) has rapidly progressed with the development of object detection and re-identification. However, motion modeling, which facilitates object association by forecasting short-term trajec- tories with past observations, has been relatively under-explored in recent years. Current motion models in MOT typically assume that the object motion is linear in a small time window and needs continuous observations, so these methods are sensitive to occlusions and non-linear motion and require high frame-rate videos. In this work, we show that a simple motion model can obtain state-of-the-art tracking performance without other cues like appearance. We emphasize the role of “observation” when recovering tracks from being lost and reducing the error accumulated by linear motion models during the lost period. We thus name the proposed method as Observation-Centric SORT, OC-SORT for short. It remains simple, online, and real-time but improves robustness over occlusion and non-linear motion. It achieves 63.2 and 62.1 HOTA on MOT17 and MOT20, respectively, surpassing all published methods. It also sets new states of the art on KITTI Pedestrian Tracking and DanceTrack where the object motion is highly non-linear + + + +
+
+## Citation
+
+```latex
+@article{cao2022observation,
+  title={Observation-Centric SORT: Rethinking SORT for Robust Multi-Object Tracking},
+  author={Cao, Jinkun and Weng, Xinshuo and Khirodkar, Rawal and Pang, Jiangmiao and Kitani, Kris},
+  journal={arXiv preprint arXiv:2203.14360},
+  year={2022}
+}
+```
+
+## Results and models on MOT17
+
+The performance on `MOT17-half-val` is comparable with the performance of [the OC-SORT official implementation](https://github.com/noahcao/OC_SORT). We use the same YOLOX-X detector weights as in [ByteTrack](https://github.com/open-mmlab/mmtracking/tree/master/configs/mot/bytetrack).
+
+| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download |
+| :-----: | :------: | :---------------------: | :------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :-------: | :-------: |
+| OC-SORT | YOLOX-X | CrowdHuman + half-train | half-val | N | - | 67.5 | 77.5 | 78.2 | 15987 | 19590 | 855 | [config](ocsort_yolox_x_crowdhuman_mot17-private-half.py) | [model](https://download.openmmlab.com/mmtracking/mot/ocsort/mot_dataset/ocsort_yolox_x_crowdhuman_mot17-private-half_20220813_101618-fe150582.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/ocsort/mot_dataset/ocsort_yolox_x_crowdhuman_mot17-private-half_20220813_101618.log.json) |
+
+## Get started
+
+### 1. Development Environment Setup
+
+For setting up the tracking development environment, please refer to this [document](../../docs/en/get_started.md).
+
+### 2. Dataset Preparation
+
+For preparing the tracking datasets, please refer to this [document](../../docs/en/user_guides/tracking_dataset_prepare.md).
+
+### 3. Training
+
+OC-SORT training is the same as for ByteTrack; please refer to this [document](../../configs/bytetrack/README.md).
+
+### 4. Testing and evaluation
+
+OC-SORT evaluation and testing are the same as for ByteTrack; please refer to this [document](../../configs/bytetrack/README.md).
+
+### 5. Inference
+
+OC-SORT inference is the same as for ByteTrack; please refer to this [document](../../configs/bytetrack/README.md).
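As a small, hedged illustration of how this config is meant to be extended (the file below is hypothetical and not part of this patch), the `OCSORTTracker` thresholds can be overridden in a child config that inherits the MOT17 half-val config added further down; the keys mirror the ones defined there:

```python
# Hypothetical tuning config (illustrative only; not part of this patch).
_base_ = [
    './ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py'  # noqa: E501
]

model = dict(
    tracker=dict(
        # Keep only higher-confidence detections for association.
        obj_score_thr=0.4,
        # Require a higher score before initializing a new track.
        init_track_thr=0.8))
```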
diff --git a/mmdetection/configs/ocsort/metafile.yml b/mmdetection/configs/ocsort/metafile.yml new file mode 100644 index 0000000..0a31ef1 --- /dev/null +++ b/mmdetection/configs/ocsort/metafile.yml @@ -0,0 +1,27 @@ +Collections: + - Name: OCSORT + Metadata: + Training Techniques: + - SGD with Momentum + Training Resources: 8x V100 GPUs + Architecture: + - YOLOX + Paper: + URL: https://arxiv.org/abs/2203.14360 + Title: Observation-Centric SORT Rethinking SORT for Robust Multi-Object Tracking + README: configs/ocsort/README.md + +Models: + - Name: ocsort_yolox_x_crowdhuman_mot17-private-half + In Collection: OCSORT + Config: configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py + Metadata: + Training Data: CrowdHuman + MOT17-half-train + Results: + - Task: Multiple Object Tracking + Dataset: MOT17-half-val + Metrics: + HOTA: 67.5 + MOTA: 77.5 + IDF1: 78.2 + Weights: https://download.openmmlab.com/mmtracking/mot/ocsort/mot_dataset/ocsort_yolox_x_crowdhuman_mot17-private-half_20220813_101618-fe150582.pth diff --git a/mmdetection/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmdetection/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..ea04923 --- /dev/null +++ b/mmdetection/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,18 @@ +_base_ = [ + '../bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py', # noqa: E501 +] + +model = dict( + type='OCSORT', + tracker=dict( + _delete_=True, + type='OCSORTTracker', + motion=dict(type='KalmanFilter'), + obj_score_thr=0.3, + init_track_thr=0.7, + weight_iou_with_det_scores=True, + match_iou_thr=0.3, + num_tentatives=3, + vel_consist_weight=0.2, + vel_delta_t=3, + num_frames_retain=30)) diff --git a/mmdetection/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py b/mmdetection/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 0000000..ea04923 --- /dev/null +++ b/mmdetection/configs/ocsort/ocsort_yolox_x_8xb4-amp-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,18 @@ +_base_ = [ + '../bytetrack/bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py', # noqa: E501 +] + +model = dict( + type='OCSORT', + tracker=dict( + _delete_=True, + type='OCSORTTracker', + motion=dict(type='KalmanFilter'), + obj_score_thr=0.3, + init_track_thr=0.7, + weight_iou_with_det_scores=True, + match_iou_thr=0.3, + num_tentatives=3, + vel_consist_weight=0.2, + vel_delta_t=3, + num_frames_retain=30)) diff --git a/mmdetection/configs/openimages/README.md b/mmdetection/configs/openimages/README.md new file mode 100644 index 0000000..ccfc721 --- /dev/null +++ b/mmdetection/configs/openimages/README.md @@ -0,0 +1,149 @@ +# Open Images Dataset + +> [Open Images Dataset](https://arxiv.org/abs/1811.00982) + + + +## Abstract + + + +#### Open Images v6 + +[Open Images](https://storage.googleapis.com/openimages/web/index.html) is a dataset of ~9M images annotated with image-level labels, +object bounding boxes, object segmentation masks, visual relationships, +and localized narratives: + +- It contains a total of 16M bounding boxes for 600 object classes on + 1.9M images, making it the largest existing dataset with object location + annotations. 
The boxes have been largely manually drawn by professional + annotators to ensure accuracy and consistency. The images are very diverse + and often contain complex scenes with several objects (8.3 per image on + average). + +- Open Images also offers visual relationship annotations, indicating pairs + of objects in particular relations (e.g. "woman playing guitar", "beer on + table"), object properties (e.g. "table is wooden"), and human actions (e.g. + "woman is jumping"). In total it has 3.3M annotations from 1,466 distinct + relationship triplets. + +- In V5 we added segmentation masks for 2.8M object instances in 350 classes. + Segmentation masks mark the outline of objects, which characterizes their + spatial extent to a much higher level of detail. + +- In V6 we added 675k localized narratives: multimodal descriptions of images + consisting of synchronized voice, text, and mouse traces over the objects being + described. (Note we originally launched localized narratives only on train in V6, + but since July 2020 we also have validation and test covered.) + +- Finally, the dataset is annotated with 59.9M image-level labels spanning 19,957 + classes. + +We believe that having a single dataset with unified annotations for image +classification, object detection, visual relationship detection, instance +segmentation, and multimodal image descriptions will enable to study these +tasks jointly and stimulate progress towards genuine scene understanding. + + + +
    + +#### Open Images Challenge 2019 + +[Open Images Challenges 2019](https://storage.googleapis.com/openimages/web/challenge2019.html) is based on the V5 release of the Open +Images dataset. The images of the dataset are very varied and +often contain complex scenes with several objects (explore the dataset). + +## Citation + +``` +@article{OpenImages, + author = {Alina Kuznetsova and Hassan Rom and Neil Alldrin and Jasper Uijlings and Ivan Krasin and Jordi Pont-Tuset and Shahab Kamali and Stefan Popov and Matteo Malloci and Alexander Kolesnikov and Tom Duerig and Vittorio Ferrari}, + title = {The Open Images Dataset V4: Unified image classification, object detection, and visual relationship detection at scale}, + year = {2020}, + journal = {IJCV} +} +``` + +## Prepare Dataset + +1. You need to download and extract Open Images dataset. + +2. The Open Images dataset does not have image metas (width and height of the image), + which will be used during training and testing (evaluation). We suggest to get test image metas before + training/testing by using `tools/misc/get_image_metas.py`. + + **Usage** + + ```shell + python tools/misc/get_image_metas.py ${CONFIG} \ + --dataset ${DATASET TYPE} \ # train or val or test + --out ${OUTPUT FILE NAME} + ``` + +3. The directory should be like this: + + ```none + mmdetection + ├── mmdet + ├── tools + ├── configs + ├── data + │ ├── OpenImages + │ │ ├── annotations + │ │ │ ├── bbox_labels_600_hierarchy.json + │ │ │ ├── class-descriptions-boxable.csv + │ │ │ ├── oidv6-train-annotations-bbox.scv + │ │ │ ├── validation-annotations-bbox.csv + │ │ │ ├── validation-annotations-human-imagelabels-boxable.csv + │ │ │ ├── validation-image-metas.pkl # get from script + │ │ ├── challenge2019 + │ │ │ ├── challenge-2019-train-detection-bbox.txt + │ │ │ ├── challenge-2019-validation-detection-bbox.txt + │ │ │ ├── class_label_tree.np + │ │ │ ├── class_sample_train.pkl + │ │ │ ├── challenge-2019-validation-detection-human-imagelabels.csv # download from official website + │ │ │ ├── challenge-2019-validation-metas.pkl # get from script + │ │ ├── OpenImages + │ │ │ ├── train # training images + │ │ │ ├── test # testing images + │ │ │ ├── validation # validation images + ``` + +**Note**: + +1. The training and validation images of Open Images Challenge dataset are based on + Open Images v6, but the test images are different. +2. The Open Images Challenges annotations are obtained from [TSD](https://github.com/Sense-X/TSD). + You can also download the annotations from [official website](https://storage.googleapis.com/openimages/web/challenge2019_downloads.html), + and set data.train.type=OpenImagesDataset, data.val.type=OpenImagesDataset, and data.test.type=OpenImagesDataset in the config +3. If users do not want to use `validation-annotations-human-imagelabels-boxable.csv` and `challenge-2019-validation-detection-human-imagelabels.csv` + users can set `test_dataloader.dataset.image_level_ann_file=None` and `test_dataloader.dataset.image_level_ann_file=None` in the config. + Please note that loading image-levels label is the default of Open Images evaluation metric. 
+ More details please refer to the [official website](https://storage.googleapis.com/openimages/web/evaluation.html) + +## Results and Models + +| Architecture | Backbone | Style | Lr schd | Sampler | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------------------------: | :------: | :-----: | :-----: | :-----------------: | :------: | :------------: | :----: | :------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN | R-50 | pytorch | 1x | Group Sampler | 7.7 | - | 51.6 | [config](./faster-rcnn_r50_fpn_32xb2-1x_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159.log.json) | +| Faster R-CNN | R-50 | pytorch | 1x | Class Aware Sampler | 7.7 | - | 60.0 | [config](./faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424-98c630e5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424.log.json) | +| Faster R-CNN (Challenge 2019) | R-50 | pytorch | 1x | Group Sampler | 7.7 | - | 54.9 | [config](./faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100-0e79e5df.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100.log.json) | +| Faster R-CNN (Challenge 2019) | R-50 | pytorch | 1x | Class Aware Sampler | 7.1 | - | 65.0 | [config](./faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021-34c402d9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021.log.json) | +| Retinanet | R-50 | pytorch | 1x | Group Sampler | 6.6 | - | 61.5 | [config](./retinanet_r50_fpn_32xb2-1x_openimages.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954.log.json) | +| SSD | VGG16 | pytorch | 36e | Group Sampler | 10.8 | - | 35.4 | [config](./ssd300_32xb8-36e_openimages.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232.log.json) |
+
+**Notes:**
+
+- 'cas' is short for 'Class Aware Sampler'.
+
+### Results of considering image-level labels
+
+| Architecture | Sampler | Consider Image Level Labels | box AP |
+| :-------------------------------: | :-----------------: | :-------------------------: | :----: |
+| Faster R-CNN r50 (Challenge 2019) | Group Sampler | w/o | 62.19 |
+| Faster R-CNN r50 (Challenge 2019) | Group Sampler | w/ | 54.87 |
+| Faster R-CNN r50 (Challenge 2019) | Class Aware Sampler | w/o | 71.77 |
+| Faster R-CNN r50 (Challenge 2019) | Class Aware Sampler | w/ | 64.98 |
diff --git a/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py b/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py
new file mode 100644
index 0000000..e79a92c
--- /dev/null
+++ b/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py
@@ -0,0 +1,39 @@
+_base_ = ['faster-rcnn_r50_fpn_32xb2-1x_openimages.py']
+
+model = dict(
+    roi_head=dict(bbox_head=dict(num_classes=500)),
+    test_cfg=dict(rcnn=dict(score_thr=0.01)))
+
+# dataset settings
+dataset_type = 'OpenImagesChallengeDataset'
+train_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='challenge2019/challenge-2019-train-detection-bbox.txt',
+        label_file='challenge2019/cls-label-description.csv',
+        hierarchy_file='challenge2019/class_label_tree.np',
+        meta_file='challenge2019/challenge-2019-train-metas.pkl'))
+val_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='challenge2019/challenge-2019-validation-detection-bbox.txt',
+        data_prefix=dict(img='OpenImages/'),
+        label_file='challenge2019/cls-label-description.csv',
+        hierarchy_file='challenge2019/class_label_tree.np',
+        meta_file='challenge2019/challenge-2019-validation-metas.pkl',
+        image_level_ann_file='challenge2019/challenge-2019-validation-'
+        'detection-human-imagelabels.csv'))
+test_dataloader = dict(
+    dataset=dict(
+        type=dataset_type,
+        ann_file='challenge2019/challenge-2019-validation-detection-bbox.txt',
+        label_file='challenge2019/cls-label-description.csv',
+        hierarchy_file='challenge2019/class_label_tree.np',
+        meta_file='challenge2019/challenge-2019-validation-metas.pkl',
+        image_level_ann_file='challenge2019/challenge-2019-validation-'
+        'detection-human-imagelabels.csv'))
+
+# NOTE: `auto_scale_lr` is for automatically scaling LR,
+# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py b/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py new file mode 100644 index 0000000..f3f0aa0 --- /dev/null +++ b/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py @@ -0,0 +1,35 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/openimages_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(roi_head=dict(bbox_head=dict(num_classes=601))) + +# Using 32 GPUS while training +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 64, + by_epoch=False, + begin=0, + end=26000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py b/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py new file mode 100644 index 0000000..9e42872 --- /dev/null +++ b/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py @@ -0,0 +1,5 @@ +_base_ = ['faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py'] + +# Use ClassAwareSampler +train_dataloader = dict( + sampler=dict(_delete_=True, type='ClassAwareSampler', num_sample_class=1)) diff --git a/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py b/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py new file mode 100644 index 0000000..803190a --- /dev/null +++ b/mmdetection/configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py @@ -0,0 +1,5 @@ +_base_ = ['faster-rcnn_r50_fpn_32xb2-1x_openimages.py'] + +# Use ClassAwareSampler +train_dataloader = dict( + sampler=dict(_delete_=True, type='ClassAwareSampler', num_sample_class=1)) diff --git a/mmdetection/configs/openimages/metafile.yml b/mmdetection/configs/openimages/metafile.yml new file mode 100644 index 0000000..76c1209 --- /dev/null +++ b/mmdetection/configs/openimages/metafile.yml @@ -0,0 +1,102 @@ +Models: + - Name: faster-rcnn_r50_fpn_32x2_1x_openimages + In Collection: Faster R-CNN + Config: configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 12 + Training Data: Open Images v6 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 51.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_20211130_231159-e87ab7ce.pth + + - Name: retinanet_r50_fpn_32xb2-1x_openimages + In Collection: RetinaNet + Config: configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py + Metadata: + Training Memory (GB): 6.6 + Epochs: 12 + Training Data: Open Images v6 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 61.5 + 
Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/retinanet_r50_fpn_32x2_1x_openimages/retinanet_r50_fpn_32x2_1x_openimages_20211223_071954-d2ae5462.pth + + - Name: ssd300_32xb8-36e_openimages + In Collection: SSD + Config: configs/openimages/ssd300_32xb8-36e_openimages.py + Metadata: + Training Memory (GB): 10.8 + Epochs: 36 + Training Data: Open Images v6 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images v6 + Metrics: + box AP: 35.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/ssd300_32x8_36e_openimages/ssd300_32x8_36e_openimages_20211224_000232-dce93846.pth + + - Name: faster-rcnn_r50_fpn_32x2_1x_openimages_challenge + In Collection: Faster R-CNN + Config: configs/openimages/faster-rcnn_r50_fpn_32xb2-1x_openimages-challenge.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 12 + Training Data: Open Images Challenge 2019 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images Challenge 2019 + Metrics: + box AP: 54.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_1x_openimages_challenge_20220114_045100-0e79e5df.pth + + - Name: faster-rcnn_r50_fpn_32x2_cas_1x_openimages + In Collection: Faster R-CNN + Config: configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages.py + Metadata: + Training Memory (GB): 7.7 + Epochs: 12 + Training Data: Open Images Challenge 2019 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images Challenge 2019 + Metrics: + box AP: 60.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_20220306_202424-98c630e5.pth + + - Name: faster-rcnn_r50_fpn_32x2_cas_1x_openimages_challenge + In Collection: Faster R-CNN + Config: configs/openimages/faster-rcnn_r50_fpn_32xb2-cas-1x_openimages-challenge.py + Metadata: + Training Memory (GB): 7.1 + Epochs: 12 + Training Data: Open Images Challenge 2019 + Training Techniques: + - SGD with Momentum + - Weight Decay + Results: + - Task: Object Detection + Dataset: Open Images Challenge 2019 + Metrics: + box AP: 65.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge/faster_rcnn_r50_fpn_32x2_cas_1x_openimages_challenge_20220221_192021-34c402d9.pth diff --git a/mmdetection/configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py b/mmdetection/configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py new file mode 100644 index 0000000..97a0eb0 --- /dev/null +++ b/mmdetection/configs/openimages/retinanet_r50_fpn_32xb2-1x_openimages.py @@ -0,0 +1,35 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/openimages_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict(bbox_head=dict(num_classes=601)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 64, + by_epoch=False, + begin=0, + end=26000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.08, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) + +# NOTE: `auto_scale_lr` 
is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/openimages/ssd300_32xb8-36e_openimages.py b/mmdetection/configs/openimages/ssd300_32xb8-36e_openimages.py new file mode 100644 index 0000000..9cb51ca --- /dev/null +++ b/mmdetection/configs/openimages/ssd300_32xb8-36e_openimages.py @@ -0,0 +1,88 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/openimages_detection.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py' +] +model = dict( + bbox_head=dict( + num_classes=601, + anchor_generator=dict(basesize_ratio_range=(0.2, 0.9)))) +# dataset settings +dataset_type = 'OpenImagesDataset' +data_root = 'data/OpenImages/' +input_size = 300 +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Expand', + mean={{_base_.model.data_preprocessor.mean}}, + to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}}, + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'instances')) +] + +train_dataloader = dict( + batch_size=8, # using 32 GPUS while training. total batch size is 32 x 8 + batch_sampler=None, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=3, # repeat 3 times, total epochs are 12 x 3 + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/oidv6-train-annotations-bbox.csv', + data_prefix=dict(img='OpenImages/train/'), + label_file='annotations/class-descriptions-boxable.csv', + hierarchy_file='annotations/bbox_labels_600_hierarchy.json', + meta_file='annotations/train-image-metas.pkl', + pipeline=train_pipeline))) +val_dataloader = dict(batch_size=8, dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(batch_size=8, dataset=dict(pipeline=test_pipeline)) + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.04, momentum=0.9, weight_decay=5e-4)) +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.001, + by_epoch=False, + begin=0, + end=20000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (32 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/mmdetection/configs/paa/README.md b/mmdetection/configs/paa/README.md new file mode 100644 index 0000000..625aacf --- /dev/null +++ b/mmdetection/configs/paa/README.md @@ -0,0 +1,47 @@ +# PAA + +> [Probabilistic Anchor Assignment with IoU Prediction for Object Detection](https://arxiv.org/abs/2007.08103) + + + +## Abstract + +In object detection, determining which anchors to assign as positive or negative samples, known as anchor assignment, has been revealed as a core procedure that can significantly affect a model's performance. In this paper we propose a novel anchor assignment strategy that adaptively separates anchors into positive and negative samples for a ground truth bounding box according to the model's learning status such that it is able to reason about the separation in a probabilistic manner. To do so we first calculate the scores of anchors conditioned on the model and fit a probability distribution to these scores. The model is then trained with anchors separated into positive and negative samples according to their probabilities. Moreover, we investigate the gap between the training and testing objectives and propose to predict the Intersection-over-Unions of detected boxes as a measure of localization quality to reduce the discrepancy. The combined score of classification and localization qualities serving as a box selection metric in non-maximum suppression well aligns with the proposed anchor assignment strategy and leads significant performance improvements. The proposed methods only add a single convolutional layer to RetinaNet baseline and does not require multiple anchors per location, so are efficient. Experimental results verify the effectiveness of the proposed methods. Especially, our models set new records for single-stage detectors on MS COCO test-dev dataset with various backbones. + +
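+The assignment step described in the abstract can be sketched in a few lines. The snippet below is a
+toy, per-ground-truth illustration of the idea (fit a two-component GMM to the anchor scores and keep
+the high-score component as positives); it is not the `PAAHead` implementation used by these configs,
+and the function name and synthetic scores are made up:
+
+```python
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+def paa_like_split(anchor_scores):
+    """anchor_scores: (N,) combined cls + loc scores of anchors matched to one GT box."""
+    scores = np.sort(anchor_scores)[:, None]  # (N, 1)
+    gmm = GaussianMixture(
+        n_components=2,
+        weights_init=[0.5, 0.5],
+        means_init=[[scores.min()], [scores.max()]],
+        precisions_init=[[[1.0]], [[1.0]]])
+    gmm.fit(scores)
+    labels = gmm.predict(anchor_scores[:, None])   # component id per anchor
+    pos_component = np.argmax(gmm.means_.ravel())  # component with the higher mean
+    return labels == pos_component                 # boolean mask: positive anchors
+
+# Synthetic scores: roughly the top third should end up as positives.
+rng = np.random.default_rng(0)
+scores = np.concatenate([rng.normal(0.2, 0.05, 20), rng.normal(0.7, 0.05, 10)])
+print(paa_like_split(scores).sum(), 'anchors kept as positives')
+```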
    + +
    + +## Results and Models + +We provide config files to reproduce the object detection results in the +ECCV 2020 paper for Probabilistic Anchor Assignment with IoU +Prediction for Object Detection. + +| Backbone | Lr schd | Mem (GB) | Score voting | box AP | Config | Download | +| :-------: | :-----: | :------: | :----------: | :----: | :------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | 12e | 3.7 | True | 40.4 | [config](./paa_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.log.json) | +| R-50-FPN | 12e | 3.7 | False | 40.2 | - | | +| R-50-FPN | 18e | 3.7 | True | 41.4 | [config](./paa_r50_fpn_1.5x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.log.json) | +| R-50-FPN | 18e | 3.7 | False | 41.2 | - | | +| R-50-FPN | 24e | 3.7 | True | 41.6 | [config](./paa_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.log.json) | +| R-50-FPN | 36e | 3.7 | True | 43.3 | [config](./paa_r50_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722-06a6880b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722.log.json) | +| R-101-FPN | 12e | 6.2 | True | 42.6 | [config](./paa_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.log.json) | +| R-101-FPN | 12e | 6.2 | False | 42.4 | - | | +| R-101-FPN | 24e | 6.2 | True | 43.5 | [config](./paa_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.log.json) | +| R-101-FPN | 36e | 6.2 | True | 45.1 | [config](./paa_r101_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202-83250d22.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202.log.json) | + +**Note**: + +1. We find that the performance is unstable with 1x setting and may fluctuate by about 0.2 mAP. We report the best results. 
+ +## Citation + +```latex +@inproceedings{paa-eccv2020, + title={Probabilistic Anchor Assignment with IoU Prediction for Object Detection}, + author={Kim, Kang and Lee, Hee Seok}, + booktitle = {ECCV}, + year={2020} +} +``` diff --git a/mmdetection/configs/paa/metafile.yml b/mmdetection/configs/paa/metafile.yml new file mode 100644 index 0000000..078b974 --- /dev/null +++ b/mmdetection/configs/paa/metafile.yml @@ -0,0 +1,111 @@ +Collections: + - Name: PAA + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - Probabilistic Anchor Assignment + - ResNet + Paper: + URL: https://arxiv.org/abs/2007.08103 + Title: 'Probabilistic Anchor Assignment with IoU Prediction for Object Detection' + README: configs/paa/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/detectors/paa.py#L6 + Version: v2.4.0 + +Models: + - Name: paa_r50_fpn_1x_coco + In Collection: PAA + Config: configs/paa/paa_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.7 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1x_coco/paa_r50_fpn_1x_coco_20200821-936edec3.pth + + - Name: paa_r50_fpn_1.5x_coco + In Collection: PAA + Config: configs/paa/paa_r50_fpn_1.5x_coco.py + Metadata: + Training Memory (GB): 3.7 + Epochs: 18 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_1.5x_coco/paa_r50_fpn_1.5x_coco_20200823-805d6078.pth + + - Name: paa_r50_fpn_2x_coco + In Collection: PAA + Config: configs/paa/paa_r50_fpn_2x_coco.py + Metadata: + Training Memory (GB): 3.7 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_2x_coco/paa_r50_fpn_2x_coco_20200821-c98bfc4e.pth + + - Name: paa_r50_fpn_mstrain_3x_coco + In Collection: PAA + Config: configs/paa/paa_r50_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 3.7 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r50_fpn_mstrain_3x_coco/paa_r50_fpn_mstrain_3x_coco_20210121_145722-06a6880b.pth + + - Name: paa_r101_fpn_1x_coco + In Collection: PAA + Config: configs/paa/paa_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_1x_coco/paa_r101_fpn_1x_coco_20200821-0a1825a4.pth + + - Name: paa_r101_fpn_2x_coco + In Collection: PAA + Config: configs/paa/paa_r101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_2x_coco/paa_r101_fpn_2x_coco_20200821-6829f96b.pth + + - Name: paa_r101_fpn_mstrain_3x_coco + In Collection: PAA + Config: configs/paa/paa_r101_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/paa/paa_r101_fpn_mstrain_3x_coco/paa_r101_fpn_mstrain_3x_coco_20210122_084202-83250d22.pth diff --git 
a/mmdetection/configs/paa/paa_r101_fpn_1x_coco.py b/mmdetection/configs/paa/paa_r101_fpn_1x_coco.py new file mode 100644 index 0000000..94f1c27 --- /dev/null +++ b/mmdetection/configs/paa/paa_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './paa_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/paa/paa_r101_fpn_2x_coco.py b/mmdetection/configs/paa/paa_r101_fpn_2x_coco.py new file mode 100644 index 0000000..c6136f3 --- /dev/null +++ b/mmdetection/configs/paa/paa_r101_fpn_2x_coco.py @@ -0,0 +1,18 @@ +_base_ = './paa_r101_fpn_1x_coco.py' +max_epochs = 24 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +# training schedule for 2x +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/paa/paa_r101_fpn_ms-3x_coco.py b/mmdetection/configs/paa/paa_r101_fpn_ms-3x_coco.py new file mode 100644 index 0000000..8529dcd --- /dev/null +++ b/mmdetection/configs/paa/paa_r101_fpn_ms-3x_coco.py @@ -0,0 +1,6 @@ +_base_ = './paa_r50_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/paa/paa_r50_fpn_1.5x_coco.py b/mmdetection/configs/paa/paa_r50_fpn_1.5x_coco.py new file mode 100644 index 0000000..ae993b5 --- /dev/null +++ b/mmdetection/configs/paa/paa_r50_fpn_1.5x_coco.py @@ -0,0 +1,18 @@ +_base_ = './paa_r50_fpn_1x_coco.py' +max_epochs = 18 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[12, 16], + gamma=0.1) +] + +# training schedule for 1.5x +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/paa/paa_r50_fpn_1x_coco.py b/mmdetection/configs/paa/paa_r50_fpn_1x_coco.py new file mode 100644 index 0000000..f806a3e --- /dev/null +++ b/mmdetection/configs/paa/paa_r50_fpn_1x_coco.py @@ -0,0 +1,80 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='PAA', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='PAAHead', + reg_decoded_bbox=True, + score_voting=True, + topk=9, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + 
loss_bbox=dict(type='GIoULoss', loss_weight=1.3), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.1, + neg_iou_thr=0.1, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/paa/paa_r50_fpn_2x_coco.py b/mmdetection/configs/paa/paa_r50_fpn_2x_coco.py new file mode 100644 index 0000000..6908e4e --- /dev/null +++ b/mmdetection/configs/paa/paa_r50_fpn_2x_coco.py @@ -0,0 +1,18 @@ +_base_ = './paa_r50_fpn_1x_coco.py' +max_epochs = 24 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +# training schedule for 2x +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/paa/paa_r50_fpn_ms-3x_coco.py b/mmdetection/configs/paa/paa_r50_fpn_ms-3x_coco.py new file mode 100644 index 0000000..fed8b90 --- /dev/null +++ b/mmdetection/configs/paa/paa_r50_fpn_ms-3x_coco.py @@ -0,0 +1,29 @@ +_base_ = './paa_r50_fpn_1x_coco.py' +max_epochs = 36 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] + +# training schedule for 3x +train_cfg = dict(max_epochs=max_epochs) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/pafpn/README.md b/mmdetection/configs/pafpn/README.md new file mode 100644 index 0000000..36cd6e9 --- /dev/null +++ b/mmdetection/configs/pafpn/README.md @@ -0,0 +1,34 @@ +# PAFPN + +> [Path Aggregation Network for Instance Segmentation](https://arxiv.org/abs/1803.01534) + + + +## Abstract + +The way that information propagates in neural networks is of great importance. In this paper, we propose Path Aggregation Network (PANet) aiming at boosting information flow in proposal-based instance segmentation framework. Specifically, we enhance the entire feature hierarchy with accurate localization signals in lower layers by bottom-up path augmentation, which shortens the information path between lower layers and topmost feature. We present adaptive feature pooling, which links feature grid and all feature levels to make useful information in each feature level propagate directly to following proposal subnetworks. A complementary branch capturing different views for each proposal is created to further improve mask prediction. These improvements are simple to implement, with subtle extra computational overhead. Our PANet reaches the 1st place in the COCO 2017 Challenge Instance Segmentation task and the 2nd place in Object Detection task without large-batch training. 
It is also state-of-the-art on MVD and Cityscapes. + +
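+As a rough illustration of the bottom-up path augmentation described above, the sketch below adds the
+extra downsampling path on top of FPN outputs. It is a simplified stand-in (made-up class name, no
+extra smoothing convs or norms), not the `PAFPN` neck used by the config in this folder:
+
+```python
+import torch
+import torch.nn as nn
+
+class BottomUpAugmentation(nn.Module):
+    """Extra bottom-up path: N_{k+1} = downsample(N_k) + P_{k+1}."""
+
+    def __init__(self, channels=256, num_levels=5):
+        super().__init__()
+        self.downsample_convs = nn.ModuleList(
+            nn.Conv2d(channels, channels, 3, stride=2, padding=1)
+            for _ in range(num_levels - 1))
+
+    def forward(self, fpn_outs):
+        # fpn_outs: FPN features [P2, P3, P4, ...], highest resolution first.
+        outs = [fpn_outs[0]]  # N2 = P2
+        for i, conv in enumerate(self.downsample_convs):
+            outs.append(conv(outs[-1]) + fpn_outs[i + 1])
+        return outs
+
+# Shape check with dummy FPN outputs whose sizes halve level by level.
+feats = [torch.randn(1, 256, 256 // 2**i, 256 // 2**i) for i in range(5)]
+print([tuple(o.shape[-2:]) for o in BottomUpAugmentation()(feats)])
+```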
    + +
    + +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 4.0 | 17.2 | 37.5 | | [config](./faster-rcnn_r50_pafpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_20200503_105836.log.json) | + +## Citation + +```latex +@inproceedings{liu2018path, + author = {Shu Liu and + Lu Qi and + Haifang Qin and + Jianping Shi and + Jiaya Jia}, + title = {Path Aggregation Network for Instance Segmentation}, + booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2018} +} +``` diff --git a/mmdetection/configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py b/mmdetection/configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py new file mode 100644 index 0000000..1452bae --- /dev/null +++ b/mmdetection/configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' + +model = dict( + neck=dict( + type='PAFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/pafpn/metafile.yml b/mmdetection/configs/pafpn/metafile.yml new file mode 100644 index 0000000..7772d27 --- /dev/null +++ b/mmdetection/configs/pafpn/metafile.yml @@ -0,0 +1,38 @@ +Collections: + - Name: PAFPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - PAFPN + Paper: + URL: https://arxiv.org/abs/1803.01534 + Title: 'Path Aggregation Network for Instance Segmentation' + README: configs/pafpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/necks/pafpn.py#L11 + Version: v2.0.0 + +Models: + - Name: faster-rcnn_r50_pafpn_1x_coco + In Collection: PAFPN + Config: configs/pafpn/faster-rcnn_r50_pafpn_1x_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 58.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pafpn/faster_rcnn_r50_pafpn_1x_coco/faster_rcnn_r50_pafpn_1x_coco_bbox_mAP-0.375_20200503_105836-b7b4b9bd.pth diff --git a/mmdetection/configs/panoptic_fpn/README.md b/mmdetection/configs/panoptic_fpn/README.md new file mode 100644 index 0000000..0321fb7 --- /dev/null +++ b/mmdetection/configs/panoptic_fpn/README.md @@ -0,0 +1,62 @@ +# Panoptic FPN + +> [Panoptic feature pyramid networks](https://arxiv.org/abs/1901.02446) + + + +## Abstract + +The recently introduced panoptic segmentation task has renewed our community's interest in unifying the tasks of instance segmentation (for thing classes) and semantic segmentation (for stuff classes). 
However, current state-of-the-art methods for this joint task use separate and dissimilar networks for instance and semantic segmentation, without performing any shared computation. In this work, we aim to unify these methods at the architectural level, designing a single network for both tasks. Our approach is to endow Mask R-CNN, a popular instance segmentation method, with a semantic segmentation branch using a shared Feature Pyramid Network (FPN) backbone. Surprisingly, this simple baseline not only remains effective for instance segmentation, but also yields a lightweight, top-performing method for semantic segmentation. In this work, we perform a detailed study of this minimally extended version of Mask R-CNN with FPN, which we refer to as Panoptic FPN, and show it is a robust and accurate baseline for both tasks. Given its effectiveness and conceptual simplicity, we hope our method can serve as a strong baseline and aid future research in panoptic segmentation. + +
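+The semantic branch described above can be pictured with a small sketch: each FPN level is reduced,
+upsampled to the finest resolution, summed, and classified per pixel. This is an illustration only
+(made-up class name and a simplified merge step), not the `PanopticFPNHead` used by these configs:
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class TinySemanticFPNBranch(nn.Module):
+    def __init__(self, in_channels=256, inner_channels=128, num_classes=54):
+        super().__init__()
+        # one 3x3 conv per FPN level (P2-P5); num_classes ~ stuff classes + a catch-all
+        self.reduce = nn.ModuleList(
+            nn.Conv2d(in_channels, inner_channels, 3, padding=1) for _ in range(4))
+        self.classifier = nn.Conv2d(inner_channels, num_classes, 1)
+
+    def forward(self, fpn_feats):
+        target_size = fpn_feats[0].shape[-2:]  # P2 (1/4 input) resolution
+        merged = 0
+        for feat, conv in zip(fpn_feats, self.reduce):
+            x = F.relu(conv(feat))
+            merged = merged + F.interpolate(
+                x, size=target_size, mode='bilinear', align_corners=False)
+        return self.classifier(merged)  # per-pixel logits at 1/4 input scale
+
+feats = [torch.randn(1, 256, 64 // 2**i, 64 // 2**i) for i in range(4)]
+print(TinySemanticFPNBranch()(feats).shape)  # torch.Size([1, 54, 64, 64])
+```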
    + +
    + +## Dataset + +PanopticFPN requires COCO and [COCO-panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) dataset for training and evaluation. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +## Results and Models + +| Backbone | style | Lr schd | Mem (GB) | Inf time (fps) | PQ | SQ | RQ | PQ_th | SQ_th | RQ_th | PQ_st | SQ_st | RQ_st | Config | Download | +| :-------: | :-----: | :-----: | :------: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :---: | :---: | :---: | :---------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 4.7 | | 40.2 | 77.8 | 49.3 | 47.8 | 80.9 | 57.5 | 28.9 | 73.1 | 37.0 | [config](./panoptic-fpn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153.log.json) | +| R-50-FPN | pytorch | 3x | - | - | 42.5 | 78.1 | 51.7 | 50.3 | 81.5 | 60.3 | 30.7 | 73.0 | 38.8 | [config](./panoptic-fpn_r50_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155-5650f98b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155.log.json) | +| R-101-FPN | pytorch | 1x | 6.7 | | 42.2 | 78.3 | 51.4 | 50.1 | 81.4 | 59.9 | 30.3 | 73.6 | 38.5 | [config](./panoptic-fpn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950.log.json) | +| R-101-FPN | pytorch | 3x | - | - | 44.1 | 78.9 | 53.6 | 52.1 | 81.7 | 62.3 | 32.0 | 74.6 | 40.3 | [config](./panoptic-fpn_r101_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712-9c99acc4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712.log.json) | + +## Citation + +The base method for panoptic segmentation task. 
+ +```latex +@inproceedings{kirillov2018panopticfpn, + author = { + Alexander Kirillov, + Ross Girshick, + Kaiming He, + Piotr Dollar, + }, + title = {Panoptic Feature Pyramid Networks}, + booktitle = {Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year = {2019} +} +``` diff --git a/mmdetection/configs/panoptic_fpn/metafile.yml b/mmdetection/configs/panoptic_fpn/metafile.yml new file mode 100644 index 0000000..c99275e --- /dev/null +++ b/mmdetection/configs/panoptic_fpn/metafile.yml @@ -0,0 +1,70 @@ +Collections: + - Name: PanopticFPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - PanopticFPN + Paper: + URL: https://arxiv.org/pdf/1901.02446 + Title: 'Panoptic feature pyramid networks' + README: configs/panoptic_fpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/detectors/panoptic_fpn.py#L7 + Version: v2.16.0 + +Models: + - Name: panoptic_fpn_r50_fpn_1x_coco + In Collection: PanopticFPN + Config: configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.6 + Epochs: 12 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth + + - Name: panoptic_fpn_r50_fpn_mstrain_3x_coco + In Collection: PanopticFPN + Config: configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 4.6 + Epochs: 36 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 42.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r50_fpn_mstrain_3x_coco/panoptic_fpn_r50_fpn_mstrain_3x_coco_20210824_171155-5650f98b.pth + + - Name: panoptic_fpn_r101_fpn_1x_coco + In Collection: PanopticFPN + Config: configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.5 + Epochs: 12 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 42.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth + + - Name: panoptic_fpn_r101_fpn_mstrain_3x_coco + In Collection: PanopticFPN + Config: configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 6.5 + Epochs: 36 + Results: + - Task: Panoptic Segmentation + Dataset: COCO + Metrics: + PQ: 44.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/panoptic_fpn/panoptic_fpn_r101_fpn_mstrain_3x_coco/panoptic_fpn_r101_fpn_mstrain_3x_coco_20210823_114712-9c99acc4.pth diff --git a/mmdetection/configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py b/mmdetection/configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..b960254 --- /dev/null +++ b/mmdetection/configs/panoptic_fpn/panoptic-fpn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './panoptic-fpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py b/mmdetection/configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py new file mode 100644 index 0000000..268782e --- /dev/null +++ b/mmdetection/configs/panoptic_fpn/panoptic-fpn_r101_fpn_ms-3x_coco.py @@ -0,0 +1,6 @@ +_base_ 
= './panoptic-fpn_r50_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py b/mmdetection/configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..c2c89ef --- /dev/null +++ b/mmdetection/configs/panoptic_fpn/panoptic-fpn_r50_fpn_1x_coco.py @@ -0,0 +1,45 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_panoptic.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='PanopticFPN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255), + semantic_head=dict( + type='PanopticFPNHead', + num_things_classes=80, + num_stuff_classes=53, + in_channels=256, + inner_channels=128, + start_level=0, + end_level=4, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + conv_cfg=None, + loss_seg=dict( + type='CrossEntropyLoss', ignore_index=255, loss_weight=0.5)), + panoptic_fusion_head=dict( + type='HeuristicFusionHead', + num_things_classes=80, + num_stuff_classes=53), + test_cfg=dict( + rcnn=dict( + score_thr=0.6, + nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True), + max_per_img=100, + mask_thr_binary=0.5), + # used in HeuristicFusionHead + panoptic=dict(mask_overlap=0.5, stuff_area_limit=4096))) + +# Forced to remove NumClassCheckHook +custom_hooks = [] diff --git a/mmdetection/configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py b/mmdetection/configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py new file mode 100644 index 0000000..b18a8f8 --- /dev/null +++ b/mmdetection/configs/panoptic_fpn/panoptic-fpn_r50_fpn_ms-3x_coco.py @@ -0,0 +1,35 @@ +_base_ = './panoptic-fpn_r50_fpn_1x_coco.py' + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadPanopticAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# TODO: Use RepeatDataset to speed up training +# training schedule for 3x +train_cfg = dict(max_epochs=36, val_interval=3) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=36, + by_epoch=True, + milestones=[24, 33], + gamma=0.1) +] diff --git a/mmdetection/configs/pascal_voc/README.md b/mmdetection/configs/pascal_voc/README.md new file mode 100644 index 0000000..2ead3ad --- /dev/null +++ b/mmdetection/configs/pascal_voc/README.md @@ -0,0 +1,40 @@ +# Pascal VOC + +> [The Pascal Visual Object Classes (VOC) Challenge](https://link.springer.com/article/10.1007/s11263-009-0275-4) + + + +## Abstract + +The Pascal Visual Object Classes (VOC) challenge is a benchmark in visual object category recognition and detection, providing the vision and machine learning communities with a standard dataset of images and annotation, and standard evaluation procedures. 
Organised annually from 2005 to present, the challenge and its associated dataset has become accepted as the benchmark for object detection. + +This paper describes the dataset and evaluation procedure. We review the state-of-the-art in evaluated methods for both classification and detection, analyse whether the methods are statistically different, what they are learning from the images (e.g. the object or its context), and what the methods find easy or confuse. The paper concludes with lessons learnt in the three year history of the challenge, and proposes directions for future improvement and extension. + +
    + +
    + +## Results and Models + +| Architecture | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :------: | :-----: | :-----: | :------: | :------------: | :----: | :----------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN C4 | R-50 | caffe | 18k | | - | 80.9 | [config](./faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712//home/dong/code_sensetime/2022Q1/mmdetection/work_dirs/prepare_voc/gather/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712_20220314_234327-847a14d2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712/faster_rcnn_r50_caffe_c4_mstrain_18k_voc0712_20220314_234327.log.json) | +| Faster R-CNN | R-50 | pytorch | 1x | 2.6 | - | 80.4 | [config](./faster-rcnn_r50_fpn_1x_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/faster_rcnn_r50_fpn_1x_voc0712_20220320_192712-54bef0f3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712/faster_rcnn_r50_fpn_1x_voc0712_20220320_192712.log.json) | +| Retinanet | R-50 | pytorch | 1x | 2.1 | - | 77.3 | [config](./retinanet_r50_fpn_1x_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20200617-47cbdd0e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/retinanet_r50_fpn_1x_voc0712/retinanet_r50_fpn_1x_voc0712_20200616_014642.log.json) | +| SSD300 | VGG16 | - | 120e | - | - | 76.5 | [config](./ssd300_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd300_voc0712/ssd300_voc0712_20220320_194658-17edda1b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd300_voc0712/ssd300_voc0712_20220320_194658.log.json) | +| SSD512 | VGG16 | - | 120e | - | - | 79.5 | [config](./ssd512_voc0712.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd512_voc0712/ssd512_voc0712_20220320_194717-03cefefe.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pascal_voc/ssd512_voc0712/ssd512_voc0712_20220320_194717.log.json) | + +## Citation + +```latex +@Article{Everingham10, + author = "Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. 
and Zisserman, A.", + title = "The Pascal Visual Object Classes (VOC) Challenge", + journal = "International Journal of Computer Vision", + volume = "88", + year = "2010", + number = "2", + month = jun, + pages = "303--338", +} +``` diff --git a/mmdetection/configs/pascal_voc/faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py b/mmdetection/configs/pascal_voc/faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py new file mode 100644 index 0000000..dddc0bb --- /dev/null +++ b/mmdetection/configs/pascal_voc/faster-rcnn_r50-caffe-c4_ms-18k_voc0712.py @@ -0,0 +1,86 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50-caffe-c4.py', + '../_base_/schedules/schedule_1x.py', '../_base_/datasets/voc0712.py', + '../_base_/default_runtime.py' +] +model = dict(roi_head=dict(bbox_head=dict(num_classes=20))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 480), (1333, 512), (1333, 544), (1333, 576), + (1333, 608), (1333, 640), (1333, 672), (1333, 704), + (1333, 736), (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + _delete_=True, + type='ConcatDataset', + datasets=[ + dict( + type='VOCDataset', + data_root={{_base_.data_root}}, + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args={{_base_.backend_args}}), + dict( + type='VOCDataset', + data_root={{_base_.data_root}}, + ann_file='VOC2012/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2012/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args={{_base_.backend_args}}) + ])) + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# training schedule for 18k +max_iter = 18000 +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=3000) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=100), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[12000, 16000], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) + +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=3000)) +log_processor = dict(by_epoch=False) diff --git a/mmdetection/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712-cocofmt.py b/mmdetection/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712-cocofmt.py new file mode 100644 index 0000000..0b0aa41 --- /dev/null +++ b/mmdetection/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712-cocofmt.py @@ -0,0 +1,100 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/datasets/voc0712.py', + '../_base_/default_runtime.py' +] +model = dict(roi_head=dict(bbox_head=dict(num_classes=20))) + +METAINFO = { + 
'classes': + ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'), + # palette is a list of color tuples, which is used for visualization. + 'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192), + (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255), + (153, 69, 1), (120, 166, 157), (0, 182, 199), (0, 226, 252), + (182, 182, 255), (0, 0, 230), (220, 20, 60), (163, 255, 0), + (0, 82, 0), (3, 95, 161), (0, 80, 100), (183, 130, 88)] +} + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/VOCdevkit/' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1000, 600), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1000, 600), keep_ratio=True), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + dataset=dict( + type='RepeatDataset', + times=3, + dataset=dict( + _delete_=True, + type=dataset_type, + data_root=data_root, + ann_file='annotations/voc0712_trainval.json', + data_prefix=dict(img=''), + metainfo=METAINFO, + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args={{_base_.backend_args}}))) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + ann_file='annotations/voc07_test.json', + data_prefix=dict(img=''), + metainfo=METAINFO, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/voc07_test.json', + metric='bbox', + format_only=False, + backend_args={{_base_.backend_args}}) +test_evaluator = val_evaluator + +# training schedule, the dataset is repeated 3 times, so the +# actual epoch = 4 * 3 = 12 +max_epochs = 4 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[3], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py b/mmdetection/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py new file mode 100644 index 0000000..0739166 --- /dev/null +++ b/mmdetection/configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py @@ -0,0 +1,35 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/datasets/voc0712.py', + '../_base_/default_runtime.py' +] +model = dict(roi_head=dict(bbox_head=dict(num_classes=20))) + +# training schedule, voc dataset is repeated 3 times, in +# `_base_/datasets/voc0712.py`, so the actual epoch = 4 * 3 = 12 +max_epochs = 4 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[3], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py b/mmdetection/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py new file mode 100644 index 0000000..c86a6f1 --- /dev/null +++ b/mmdetection/configs/pascal_voc/retinanet_r50_fpn_1x_voc0712.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', '../_base_/datasets/voc0712.py', + '../_base_/default_runtime.py' +] +model = dict(bbox_head=dict(num_classes=20)) + +# training schedule, voc dataset is repeated 3 times, in +# `_base_/datasets/voc0712.py`, so the actual epoch = 4 * 3 = 12 +max_epochs = 4 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[3], + gamma=0.1) +] +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/configs/pascal_voc/ssd300_voc0712.py b/mmdetection/configs/pascal_voc/ssd300_voc0712.py new file mode 100644 index 0000000..ff7a136 --- /dev/null +++ b/mmdetection/configs/pascal_voc/ssd300_voc0712.py @@ -0,0 +1,102 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/voc0712.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +model = dict( + bbox_head=dict( + num_classes=20, anchor_generator=dict(basesize_ratio_range=(0.2, + 0.9)))) +# dataset settings +dataset_type = 'VOCDataset' +data_root = 'data/VOCdevkit/' +input_size = 300 +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean={{_base_.model.data_preprocessor.mean}}, + to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}}, + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=8, + num_workers=3, + dataset=dict( # RepeatDataset + # the dataset is repeated 10 times, and the training schedule is 2x, + # so the actual epoch = 12 * 10 = 120. + times=10, + dataset=dict( # ConcatDataset + # VOCDataset will add different `dataset_type` in dataset.metainfo, + # which will get error if using ConcatDataset. Adding + # `ignore_keys` can avoid this error. + ignore_keys=['dataset_type'], + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline), + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2012/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2012/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + ]))) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +custom_hooks = [ + dict(type='NumClassCheckHook'), + dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW') +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 20], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/pascal_voc/ssd512_voc0712.py b/mmdetection/configs/pascal_voc/ssd512_voc0712.py new file mode 100644 index 0000000..6c4dc8a --- /dev/null +++ b/mmdetection/configs/pascal_voc/ssd512_voc0712.py @@ -0,0 +1,82 @@ +_base_ = 'ssd300_voc0712.py' + +input_size = 512 +model = dict( + neck=dict( + out_channels=(512, 1024, 512, 256, 256, 256, 256), + level_strides=(2, 2, 2, 2, 1), + level_paddings=(1, 1, 1, 1, 1), + last_kernel_size=4), + bbox_head=dict( + in_channels=(512, 1024, 512, 256, 256, 256, 256), + anchor_generator=dict( + input_size=input_size, + strides=[8, 16, 32, 64, 128, 256, 512], + basesize_ratio_range=(0.15, 0.9), + ratios=([2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2])))) + +# dataset settings +dataset_type = 'VOCDataset' +data_root = 'data/VOCdevkit/' +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean={{_base_.model.data_preprocessor.mean}}, + to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}}, + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + # avoid bboxes being resized + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=8, + num_workers=3, + dataset=dict( # RepeatDataset + # the dataset is repeated 10 times, and the training schedule is 2x, + # so the actual epoch = 12 * 10 = 120. + times=10, + dataset=dict( # ConcatDataset + # VOCDataset will add different `dataset_type` in dataset.metainfo, + # which will get error if using ConcatDataset. Adding + # `ignore_keys` can avoid this error. + ignore_keys=['dataset_type'], + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline), + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2012/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2012/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + ]))) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/pisa/README.md b/mmdetection/configs/pisa/README.md new file mode 100644 index 0000000..39f79ec --- /dev/null +++ b/mmdetection/configs/pisa/README.md @@ -0,0 +1,50 @@ +# PISA + +> [Prime Sample Attention in Object Detection](https://arxiv.org/abs/1904.04821) + + + +## Abstract + +It is a common paradigm in object detection frameworks to treat all samples equally and target at maximizing the performance on average. In this work, we revisit this paradigm through a careful study on how different samples contribute to the overall performance measured in terms of mAP. 
Our study suggests that the samples in each mini-batch are neither independent nor equally important, and therefore a better classifier on average does not necessarily mean higher mAP. Motivated by this study, we propose the notion of Prime Samples, those that play a key role in driving the detection performance. We further develop a simple yet effective sampling and learning strategy called PrIme Sample Attention (PISA) that directs the focus of the training process towards such samples. Our experiments demonstrate that it is often more effective to focus on prime samples than hard samples when training a detector. In particular, on the MSCOCO dataset, PISA outperforms the random sampling baseline and hard mining schemes, e.g., OHEM and Focal Loss, consistently by around 2% on both single-stage and two-stage detectors, even with a strong ResNeXt-101 backbone. +
    + +## Results and Models + +| PISA | Network | Backbone | Lr schd | box AP | mask AP | Config | Download | +| :--: | :----------: | :------------: | :-----: | :----: | :-----: | :----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| × | Faster R-CNN | R-50-FPN | 1x | 36.4 | | - | | +| √ | Faster R-CNN | R-50-FPN | 1x | 38.4 | | [config](./faster-rcnn_r50_fpn_pisa_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco_20200506_185619.log.json) | +| × | Faster R-CNN | X101-32x4d-FPN | 1x | 40.1 | | - | | +| √ | Faster R-CNN | X101-32x4d-FPN | 1x | 41.9 | | [config](./faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco-e4accec4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco_20200505_181503.log.json) | +| × | Mask R-CNN | R-50-FPN | 1x | 37.3 | 34.2 | - | | +| √ | Mask R-CNN | R-50-FPN | 1x | 39.1 | 35.2 | [config](./mask-rcnn_r50_fpn_pisa_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco-dfcedba6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco_20200508_150500.log.json) | +| × | Mask R-CNN | X101-32x4d-FPN | 1x | 41.1 | 37.1 | - | | +| √ | Mask R-CNN | X101-32x4d-FPN | 1x | | | | | +| × | RetinaNet | R-50-FPN | 1x | 35.6 | | - | | +| √ | RetinaNet | R-50-FPN | 1x | 36.9 | | [config](./retinanet-r50_fpn_pisa_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco-76409952.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco_20200504_014311.log.json) | +| × | RetinaNet | X101-32x4d-FPN | 1x | 39.0 | | - | | +| √ | RetinaNet | X101-32x4d-FPN | 1x | 40.7 | | [config](./retinanet_x101-32x4d_fpn_pisa_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco-a0c13c73.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco_20200505_001404.log.json) | +| × | SSD300 | VGG16 | 1x | 25.6 | | - | | +| √ | SSD300 | VGG16 | 1x | 27.6 | | [config](./ssd300_pisa_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco-710e3ac9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco_20200504_144325.log.json) | +| × | SSD512 | VGG16 | 1x | 29.3 | | - | | +| √ | SSD512 | VGG16 | 1x | 31.8 | | [config](./ssd512_pisa_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco-247addee.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco_20200508_131030.log.json) | + +**Notes:** + +- In the original paper, all models are trained and tested on mmdet v1.x, thus results may not be exactly the same with this release on v2.0. +- It is noted PISA only modifies the training pipeline so the inference time remains the same with the baseline. + +## Citation + +```latex +@inproceedings{cao2019prime, + title={Prime sample attention in object detection}, + author={Cao, Yuhang and Chen, Kai and Loy, Chen Change and Lin, Dahua}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2020} +} +``` diff --git a/mmdetection/configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py b/mmdetection/configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py new file mode 100644 index 0000000..237a3b1 --- /dev/null +++ b/mmdetection/configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' + +model = dict( + roi_head=dict( + type='PISARoIHead', + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + sampler=dict( + type='ScoreHLRSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0.), + isr=dict(k=2, bias=0), + carl=dict(k=1, bias=0.2))), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/mmdetection/configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py b/mmdetection/configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py new file mode 100644 index 0000000..4b2c8d9 --- /dev/null +++ b/mmdetection/configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = '../faster_rcnn/faster-rcnn_x101-32x4d_fpn_1x_coco.py' + +model = dict( + roi_head=dict( + type='PISARoIHead', + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + sampler=dict( + type='ScoreHLRSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0.), + isr=dict(k=2, bias=0), + carl=dict(k=1, bias=0.2))), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/mmdetection/configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py b/mmdetection/configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py new file mode 100644 index 0000000..d6a6823 --- /dev/null +++ b/mmdetection/configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' + +model = dict( + roi_head=dict( + type='PISARoIHead', + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + sampler=dict( + type='ScoreHLRSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0.), + isr=dict(k=2, bias=0), + carl=dict(k=1, bias=0.2))), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) 
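The PISA configs above all follow the same recipe on top of a standard R-CNN base config: swap the RoI head for `PISARoIHead`, replace the RCNN sampler with `ScoreHLRSampler`, and add the ISR (`isr`) and CARL (`carl`) terms to `train_cfg`. As a rough sketch of that recipe applied programmatically (assuming mmengine's `Config` API; `my_faster_rcnn.py` and the output path are hypothetical placeholders, not files from this patch):

```python
# Sketch only: apply the PISA training recipe shown in the configs above
# to another Faster R-CNN style config. Paths are placeholders.
from mmengine.config import Config

cfg = Config.fromfile('my_faster_rcnn.py')
cfg.model.roi_head.type = 'PISARoIHead'
cfg.model.train_cfg.rcnn.sampler = dict(
    type='ScoreHLRSampler', num=512, pos_fraction=0.25,
    neg_pos_ub=-1, add_gt_as_proposals=True, k=0.5, bias=0.)
cfg.model.train_cfg.rcnn.isr = dict(k=2, bias=0)     # importance-based sample reweighting
cfg.model.train_cfg.rcnn.carl = dict(k=1, bias=0.2)  # classification-aware regression loss
cfg.dump('my_faster_rcnn_pisa.py')
```

Note that PISA only changes the training-time sampling and loss weighting, so the resulting config keeps the baseline's test-time behaviour.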
diff --git a/mmdetection/configs/pisa/mask-rcnn_x101-32x4d_fpn_pisa_1x_coco.py b/mmdetection/configs/pisa/mask-rcnn_x101-32x4d_fpn_pisa_1x_coco.py new file mode 100644 index 0000000..f2ac19f --- /dev/null +++ b/mmdetection/configs/pisa/mask-rcnn_x101-32x4d_fpn_pisa_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = '../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py' + +model = dict( + roi_head=dict( + type='PISARoIHead', + bbox_head=dict( + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))), + train_cfg=dict( + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + sampler=dict( + type='ScoreHLRSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True, + k=0.5, + bias=0.), + isr=dict(k=2, bias=0), + carl=dict(k=1, bias=0.2))), + test_cfg=dict( + rpn=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0))) diff --git a/mmdetection/configs/pisa/metafile.yml b/mmdetection/configs/pisa/metafile.yml new file mode 100644 index 0000000..3be5c3b --- /dev/null +++ b/mmdetection/configs/pisa/metafile.yml @@ -0,0 +1,110 @@ +Collections: + - Name: PISA + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - PISA + - RPN + - ResNet + - RoIPool + Paper: + URL: https://arxiv.org/abs/1904.04821 + Title: 'Prime Sample Attention in Object Detection' + README: configs/pisa/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/roi_heads/pisa_roi_head.py#L8 + Version: v2.1.0 + +Models: + - Name: pisa_faster_rcnn_r50_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/faster-rcnn_r50_fpn_pisa_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_r50_fpn_1x_coco/pisa_faster_rcnn_r50_fpn_1x_coco-dea93523.pth + + - Name: pisa_faster_rcnn_x101_32x4d_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/faster-rcnn_x101-32x4d_fpn_pisa_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco/pisa_faster_rcnn_x101_32x4d_fpn_1x_coco-e4accec4.pth + + - Name: pisa_mask_rcnn_r50_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/mask-rcnn_r50_fpn_pisa_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_mask_rcnn_r50_fpn_1x_coco/pisa_mask_rcnn_r50_fpn_1x_coco-dfcedba6.pth + + - Name: pisa_retinanet_r50_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_r50_fpn_1x_coco/pisa_retinanet_r50_fpn_1x_coco-76409952.pth + + - Name: pisa_retinanet_x101_32x4d_fpn_1x_coco + In Collection: PISA + Config: configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_retinanet_x101_32x4d_fpn_1x_coco/pisa_retinanet_x101_32x4d_fpn_1x_coco-a0c13c73.pth + + - Name: pisa_ssd300_coco + In Collection: PISA + Config: configs/pisa/ssd300_pisa_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 27.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd300_coco/pisa_ssd300_coco-710e3ac9.pth + + - Name: pisa_ssd512_coco + In Collection: PISA + Config: configs/pisa/ssd512_pisa_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 31.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pisa/pisa_ssd512_coco/pisa_ssd512_coco-247addee.pth diff --git a/mmdetection/configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py b/mmdetection/configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py new file mode 100644 index 0000000..70f89e2 --- /dev/null +++ b/mmdetection/configs/pisa/retinanet-r50_fpn_pisa_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = '../retinanet/retinanet_r50_fpn_1x_coco.py' + +model = dict( + bbox_head=dict( + type='PISARetinaHead', + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)), + train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2))) diff --git a/mmdetection/configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py b/mmdetection/configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py new file mode 100644 index 0000000..9caad45 --- /dev/null +++ b/mmdetection/configs/pisa/retinanet_x101-32x4d_fpn_pisa_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = '../retinanet/retinanet_x101-32x4d_fpn_1x_coco.py' + +model = dict( + bbox_head=dict( + type='PISARetinaHead', + loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)), + train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2))) diff --git a/mmdetection/configs/pisa/ssd300_pisa_coco.py b/mmdetection/configs/pisa/ssd300_pisa_coco.py new file mode 100644 index 0000000..b10236b --- /dev/null +++ b/mmdetection/configs/pisa/ssd300_pisa_coco.py @@ -0,0 +1,7 @@ +_base_ = '../ssd/ssd300_coco.py' + +model = dict( + bbox_head=dict(type='PISASSDHead'), + train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2))) + +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/pisa/ssd512_pisa_coco.py b/mmdetection/configs/pisa/ssd512_pisa_coco.py new file mode 100644 index 0000000..939c7f4 --- /dev/null +++ b/mmdetection/configs/pisa/ssd512_pisa_coco.py @@ -0,0 +1,7 @@ +_base_ = '../ssd/ssd512_coco.py' + +model = dict( + bbox_head=dict(type='PISASSDHead'), + train_cfg=dict(isr=dict(k=2., bias=0.), carl=dict(k=1., bias=0.2))) + +optim_wrapper = dict(clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/point_rend/README.md b/mmdetection/configs/point_rend/README.md new file mode 100644 index 0000000..efa1dca --- /dev/null +++ b/mmdetection/configs/point_rend/README.md @@ -0,0 +1,33 @@ +# PointRend + +> [PointRend: Image Segmentation as Rendering](https://arxiv.org/abs/1912.08193) + + + +## Abstract + +We present a new method for efficient high-quality image segmentation of objects and scenes. By analogizing classical computer graphics methods for efficient rendering with over- and undersampling challenges faced in pixel labeling tasks, we develop a unique perspective of image segmentation as a rendering problem. 
From this vantage, we present the PointRend (Point-based Rendering) neural network module: a module that performs point-based segmentation predictions at adaptively selected locations based on an iterative subdivision algorithm. PointRend can be flexibly applied to both instance and semantic segmentation tasks by building on top of existing state-of-the-art models. While many concrete implementations of the general idea are possible, we show that a simple design already achieves excellent results. Qualitatively, PointRend outputs crisp object boundaries in regions that are over-smoothed by previous methods. Quantitatively, PointRend yields significant gains on COCO and Cityscapes, for both instance and semantic segmentation. PointRend's efficiency enables output resolutions that are otherwise impractical in terms of memory or computation compared to existing approaches. + +
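A quick sanity check of the inference settings used in the PointRend configs below (`subdivision_steps=5`, `scale_factor=2`, `subdivision_num_points=28*28`): starting from the 7x7 coarse mask, each subdivision step doubles the resolution and re-predicts only the most uncertain points, so five steps reach 224x224 while evaluating far fewer points than a dense head would. A small sketch of that arithmetic (an illustration based on one reading of those settings, not code from this patch):

```python
# Point budget implied by the PointRend test_cfg further below (illustrative only).
coarse = 7
steps, points_per_step, scale = 5, 28 * 28, 2
resolution = coarse * scale ** steps      # 7 -> 224 after five doublings
refined_points = steps * points_per_step  # 5 * 784 = 3920 re-predicted points
dense_points = resolution ** 2            # 50176 points for a dense 224x224 mask
print(resolution, refined_points, dense_points)
```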
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :------: | :---: | :-----: | :------: | :------------: | :----: | :-----: | :------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 1x | 4.6 | | 38.4 | 36.3 | [config](./point-rend_r50-caffe_fpn_ms-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco_20200612_161407.log.json) | +| R-50-FPN | caffe | 3x | 4.6 | | 41.0 | 38.0 | [config](./point-rend_r50-caffe_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco-e0ebb6b7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco_20200614_002632.log.json) | + +Note: All models are trained with multi-scale, the input image shorter side is randomly scaled to one of (640, 672, 704, 736, 768, 800). + +## Citation + +```latex +@InProceedings{kirillov2019pointrend, + title={{PointRend}: Image Segmentation as Rendering}, + author={Alexander Kirillov and Yuxin Wu and Kaiming He and Ross Girshick}, + journal={ArXiv:1912.08193}, + year={2019} +} +``` diff --git a/mmdetection/configs/point_rend/metafile.yml b/mmdetection/configs/point_rend/metafile.yml new file mode 100644 index 0000000..f54f8a8 --- /dev/null +++ b/mmdetection/configs/point_rend/metafile.yml @@ -0,0 +1,54 @@ +Collections: + - Name: PointRend + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - PointRend + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1912.08193 + Title: 'PointRend: Image Segmentation as Rendering' + README: configs/point_rend/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.2.0/mmdet/models/detectors/point_rend.py#L6 + Version: v2.2.0 + +Models: + - Name: point_rend_r50_caffe_fpn_mstrain_1x_coco + In Collection: PointRend + Config: configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py + Metadata: + Training Memory (GB): 4.6 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco/point_rend_r50_caffe_fpn_mstrain_1x_coco-1bcb5fb4.pth + + - Name: point_rend_r50_caffe_fpn_mstrain_3x_coco + In Collection: PointRend + Config: configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 4.6 + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/point_rend/point_rend_r50_caffe_fpn_mstrain_3x_coco/point_rend_r50_caffe_fpn_mstrain_3x_coco-e0ebb6b7.pth diff --git a/mmdetection/configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py b/mmdetection/configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py new file mode 100644 index 0000000..8b17f5a --- /dev/null +++ b/mmdetection/configs/point_rend/point-rend_r50-caffe_fpn_ms-1x_coco.py @@ -0,0 +1,44 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-1x_coco.py' +# model settings +model = dict( + type='PointRend', + roi_head=dict( + type='PointRendRoIHead', + mask_roi_extractor=dict( + type='GenericRoIExtractor', + aggregation='concat', + roi_layer=dict( + _delete_=True, type='SimpleRoIAlign', output_size=14), + out_channels=256, + featmap_strides=[4]), + mask_head=dict( + _delete_=True, + type='CoarseMaskHead', + num_fcs=2, + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + point_head=dict( + type='MaskPointHead', + num_fcs=3, + in_channels=256, + fc_channels=256, + num_classes=80, + coarse_pred_each_layer=True, + loss_point=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rcnn=dict( + mask_size=7, + num_points=14 * 14, + oversample_ratio=3, + importance_sample_ratio=0.75)), + test_cfg=dict( + rcnn=dict( + subdivision_steps=5, + subdivision_num_points=28 * 28, + scale_factor=2))) diff --git a/mmdetection/configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py b/mmdetection/configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py new file mode 100644 index 0000000..b11faaa --- /dev/null +++ b/mmdetection/configs/point_rend/point-rend_r50-caffe_fpn_ms-3x_coco.py @@ -0,0 +1,18 @@ +_base_ = './point-rend_r50-caffe_fpn_ms-1x_coco.py' + +max_epochs = 36 + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] + +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/pvt/README.md b/mmdetection/configs/pvt/README.md new file mode 100644 index 0000000..fccad4f --- /dev/null +++ b/mmdetection/configs/pvt/README.md @@ -0,0 +1,57 @@ +# PVT + +> [Pyramid vision transformer: A versatile backbone for dense prediction without convolutions](https://arxiv.org/abs/2102.12122) + + + +## Abstract + +Although using convolutional neural networks (CNNs) as backbones achieves great successes in computer vision, this work investigates a simple backbone network useful for many dense prediction tasks without convolutions. Unlike the recently-proposed Transformer model (e.g., ViT) that is specially designed for image classification, we propose Pyramid Vision Transformer~(PVT), which overcomes the difficulties of porting Transformer to various dense prediction tasks. PVT has several merits compared to prior arts. (1) Different from ViT that typically has low-resolution outputs and high computational and memory cost, PVT can be not only trained on dense partitions of the image to achieve high output resolution, which is important for dense predictions but also using a progressive shrinking pyramid to reduce computations of large feature maps. 
(2) PVT inherits the advantages of both CNN and Transformer, making it a unified backbone for various vision tasks without convolutions, since it can simply replace CNN backbones. (3) We validate PVT through extensive experiments, showing that it boosts the performance of many downstream tasks, e.g., object detection, semantic segmentation, and instance segmentation. For example, with a comparable number of parameters, RetinaNet+PVT achieves 40.4 AP on the COCO dataset, surpassing RetinaNet+ResNet50 (36.3 AP) by 4.1 absolute AP. We hope PVT could serve as an alternative and useful backbone for pixel-level predictions and facilitate future research. + +Transformers have recently shown encouraging progress in computer vision. In this work, we present new baselines by improving the original Pyramid Vision Transformer (abbreviated as PVTv1) with three designs: (1) overlapping patch embedding, (2) convolutional feed-forward networks, and (3) linear-complexity attention layers. +With these modifications, our PVTv2 significantly improves on PVTv1 across three tasks, e.g., classification, detection, and segmentation. Moreover, PVTv2 achieves comparable or better performance than recent works such as Swin Transformer. We hope this work will facilitate state-of-the-art Transformer research in computer vision. +
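One practical detail when reading the RetinaNet-PVT configs later in this patch: the FPN `in_channels` must match the backbone's per-stage output widths, which for PVT/PVTv2 scale as (1, 2, 5, 8) times the first-stage embedding width. A tiny sketch of that relationship (an illustration inferred from the configs below, not code from this patch):

```python
# Per-stage channel widths assumed by the RetinaNet-PVT configs below:
# PVTv1 variants use a first-stage width of 64, PVTv2-B0 uses embed_dims=32,
# and the remaining PVTv2 variants use embed_dims=64.
def pvt_stage_channels(embed_dims: int) -> list[int]:
    return [embed_dims * m for m in (1, 2, 5, 8)]

print(pvt_stage_channels(32))  # [32, 64, 160, 256]  -> neck in_channels for PVTv2-B0
print(pvt_stage_channels(64))  # [64, 128, 320, 512] -> neck in_channels for the others
```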
    + +## Results and Models + +### RetinaNet (PVTv1) + +| Backbone | Lr schd | Mem (GB) | box AP | Config | Download | +| :--------: | :-----: | :------: | :----: | :----------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| PVT-Tiny | 12e | 8.5 | 36.6 | [config](./retinanet_pvt-t_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110-17b566bd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110.log.json) | +| PVT-Small | 12e | 14.5 | 40.4 | [config](./retinanet_pvt-s_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921.log.json) | +| PVT-Medium | 12e | 20.9 | 41.7 | [config](./retinanet_pvt-m_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243-55effa1b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243.log.json) | + +### RetinaNet (PVTv2) + +| Backbone | Lr schd | Mem (GB) | box AP | Config | Download | +| :------: | :-----: | :------: | :----: | :-------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| PVTv2-B0 | 12e | 7.4 | 37.1 | [config](./retinanet_pvtv2-b0_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157-13e9aabe.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157.log.json) | +| PVTv2-B1 | 12e | 9.5 | 41.2 | [config](./retinanet_pvtv2-b1_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318-7e169a7d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318.log.json) | +| PVTv2-B2 | 12e | 16.2 | 44.6 | [config](./retinanet_pvtv2-b2_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843-529f0b9a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843.log.json) | +| PVTv2-B3 | 12e | 23.0 | 46.0 | [config](./retinanet_pvtv2-b3_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512-8357deff.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512.log.json) | +| PVTv2-B4 | 12e | 17.0 | 46.3 | [config](./retinanet_pvtv2-b4_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151-83795c86.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151.log.json) | +| PVTv2-B5 | 12e | 18.7 | 46.1 | [config](./retinanet_pvtv2-b5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800-3420eb57.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800.log.json) | + +## Citation + +```latex +@article{wang2021pyramid, + title={Pyramid vision transformer: A versatile backbone for dense prediction without convolutions}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling}, + journal={arXiv preprint arXiv:2102.12122}, + year={2021} +} +``` + +```latex +@article{wang2021pvtv2, + title={PVTv2: Improved Baselines with Pyramid Vision Transformer}, + author={Wang, Wenhai and Xie, Enze and Li, Xiang and Fan, Deng-Ping and Song, Kaitao and Liang, Ding and Lu, Tong and Luo, Ping and Shao, Ling}, + journal={arXiv preprint arXiv:2106.13797}, + year={2021} +} +``` diff --git a/mmdetection/configs/pvt/metafile.yml b/mmdetection/configs/pvt/metafile.yml new file mode 100644 index 0000000..5884378 --- /dev/null +++ b/mmdetection/configs/pvt/metafile.yml @@ -0,0 +1,243 @@ +Models: + - Name: retinanet_pvt-t_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvt-t_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-t_fpn_1x_coco/retinanet_pvt-t_fpn_1x_coco_20210831_103110-17b566bd.pth + Paper: + URL: https://arxiv.org/abs/2102.12122 + Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315 + Version: 2.17.0 + + - Name: retinanet_pvt-s_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvt-s_fpn_1x_coco.py + Metadata: + Training Memory (GB): 14.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-s_fpn_1x_coco/retinanet_pvt-s_fpn_1x_coco_20210906_142921-b6c94a5b.pth + Paper: + URL: https://arxiv.org/abs/2102.12122 + Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions" + README: configs/pvt/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315 + Version: 2.17.0 + + - Name: retinanet_pvt-m_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvt-m_fpn_1x_coco.py + Metadata: + Training Memory (GB): 20.9 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvt-m_fpn_1x_coco/retinanet_pvt-m_fpn_1x_coco_20210831_103243-55effa1b.pth + Paper: + URL: https://arxiv.org/abs/2102.12122 + Title: "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L315 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b0_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b0_fpn_1x_coco/retinanet_pvtv2-b0_fpn_1x_coco_20210831_103157-13e9aabe.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b1_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py + Metadata: + Training Memory (GB): 9.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b1_fpn_1x_coco/retinanet_pvtv2-b1_fpn_1x_coco_20210831_103318-7e169a7d.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b2_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py + Metadata: + Training Memory (GB): 16.2 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b2_fpn_1x_coco/retinanet_pvtv2-b2_fpn_1x_coco_20210901_174843-529f0b9a.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + 
Version: 2.17.0 + + - Name: retinanet_pvtv2-b3_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py + Metadata: + Training Memory (GB): 23.0 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b3_fpn_1x_coco/retinanet_pvtv2-b3_fpn_1x_coco_20210903_151512-8357deff.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b4_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py + Metadata: + Training Memory (GB): 17.0 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b4_fpn_1x_coco/retinanet_pvtv2-b4_fpn_1x_coco_20210901_170151-83795c86.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 + + - Name: retinanet_pvtv2-b5_fpn_1x_coco + In Collection: RetinaNet + Config: configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 18.7 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x NVIDIA V100 GPUs + Architecture: + - PyramidVisionTransformerV2 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/pvt/retinanet_pvtv2-b5_fpn_1x_coco/retinanet_pvtv2-b5_fpn_1x_coco_20210902_201800-3420eb57.pth + Paper: + URL: https://arxiv.org/abs/2106.13797 + Title: "PVTv2: Improved Baselines with Pyramid Vision Transformer" + README: configs/pvt/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.17.0/mmdet/models/backbones/pvt.py#L543 + Version: 2.17.0 diff --git a/mmdetection/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py new file mode 100644 index 0000000..1a6f604 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvt-l_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = 'retinanet_pvt-t_fpn_1x_coco.py' +model = dict( + backbone=dict( + num_layers=[3, 8, 27, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_large.pth'))) +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
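+# Illustrative note (assumed mmengine behaviour, not from the upstream config):
+# AmpOptimWrapper is expected to run forward/backward under torch.cuda.amp autocast
+# with gradient scaling, which reduces activation memory for the large PVT backbone.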
+optim_wrapper = dict(type='AmpOptimWrapper') diff --git a/mmdetection/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py new file mode 100644 index 0000000..b888f78 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvt-m_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = 'retinanet_pvt-t_fpn_1x_coco.py' +model = dict( + backbone=dict( + num_layers=[3, 4, 18, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_medium.pth'))) diff --git a/mmdetection/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py new file mode 100644 index 0000000..4660348 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvt-s_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = 'retinanet_pvt-t_fpn_1x_coco.py' +model = dict( + backbone=dict( + num_layers=[3, 4, 6, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_small.pth'))) diff --git a/mmdetection/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py new file mode 100644 index 0000000..5f67c44 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvt-t_fpn_1x_coco.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='RetinaNet', + backbone=dict( + _delete_=True, + type='PyramidVisionTransformer', + num_layers=[2, 2, 2, 2], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_tiny.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) +# optimizer +optim_wrapper = dict( + optimizer=dict( + _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001)) diff --git a/mmdetection/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py new file mode 100644 index 0000000..cbebf90 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvtv2-b0_fpn_1x_coco.py @@ -0,0 +1,19 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='RetinaNet', + backbone=dict( + _delete_=True, + type='PyramidVisionTransformerV2', + embed_dims=32, + num_layers=[2, 2, 2, 2], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b0.pth')), + neck=dict(in_channels=[32, 64, 160, 256])) +# optimizer +optim_wrapper = dict( + optimizer=dict( + _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001)) diff --git a/mmdetection/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py new file mode 100644 index 0000000..5374c50 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvtv2-b1_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b1.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) diff --git a/mmdetection/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py new file mode 100644 index 0000000..cf9a18d --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvtv2-b2_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + 
num_layers=[3, 4, 6, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b2.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) diff --git a/mmdetection/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py new file mode 100644 index 0000000..7a47f82 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvtv2-b3_fpn_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + num_layers=[3, 4, 18, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b3.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) diff --git a/mmdetection/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py new file mode 100644 index 0000000..5faf4c5 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvtv2-b4_fpn_1x_coco.py @@ -0,0 +1,20 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + num_layers=[3, 8, 27, 3], + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b4.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) +# optimizer +optim_wrapper = dict( + optimizer=dict( + _delete_=True, type='AdamW', lr=0.0001 / 1.4, weight_decay=0.0001)) + +# dataset settings +train_dataloader = dict(batch_size=1, num_workers=1) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/mmdetection/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py b/mmdetection/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py new file mode 100644 index 0000000..afff871 --- /dev/null +++ b/mmdetection/configs/pvt/retinanet_pvtv2-b5_fpn_1x_coco.py @@ -0,0 +1,21 @@ +_base_ = 'retinanet_pvtv2-b0_fpn_1x_coco.py' +model = dict( + backbone=dict( + embed_dims=64, + num_layers=[3, 6, 40, 3], + mlp_ratios=(4, 4, 4, 4), + init_cfg=dict(checkpoint='https://github.com/whai362/PVT/' + 'releases/download/v2/pvt_v2_b5.pth')), + neck=dict(in_channels=[64, 128, 320, 512])) +# optimizer +optim_wrapper = dict( + optimizer=dict( + _delete_=True, type='AdamW', lr=0.0001 / 1.4, weight_decay=0.0001)) + +# dataset settings +train_dataloader = dict(batch_size=1, num_workers=1) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/mmdetection/configs/qdtrack/README.md b/mmdetection/configs/qdtrack/README.md new file mode 100644 index 0000000..5a6efe7 --- /dev/null +++ b/mmdetection/configs/qdtrack/README.md @@ -0,0 +1,89 @@ +# Quasi-Dense Similarity Learning for Multiple Object Tracking + +## Abstract + + + +Similarity learning has been recognized as a crucial step for object tracking. However, existing multiple object tracking methods only use sparse ground truth matching as the training objective, while ignoring the majority of the informative regions on the images. In this paper, we present Quasi-Dense Similarity Learning, which densely samples hundreds of region proposals on a pair of images for contrastive learning. We can directly combine this similarity learning with existing detection methods to build Quasi-Dense Tracking (QDTrack) without turning to displacementregression or motion priors. 
We also find that the resulting distinctive feature space admits a simple nearest-neighbor search at inference time. Despite its simplicity, QDTrack outperforms all existing methods on the MOT, BDD100K, Waymo, and TAO tracking benchmarks. It achieves 68.7 MOTA at 20.3 FPS on MOT17 without using external training data. Compared to methods with similar detectors, it improves MOTA by almost 10 points and significantly decreases the number of ID switches on the BDD100K and Waymo datasets. +
+
+## Results and models on MOT17
+
+| Method | Detector | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download |
+| :-----: | :----------: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :--: | :---: | :---: | :---: | :---: |
+| QDTrack | Faster R-CNN | half-train | half-val | N | - | 57.1 | 68.1 | 68.6 | 7707 | 42732 | 1083 | [config](qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py) | [model](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth) \| [log](https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635.log.json) |
+
+## Get started
+
+### 1. Development Environment Setup
+
+Refer to this [document](../../docs/en/get_started.md) to set up the tracking development environment.
+
+### 2. Dataset Preparation
+
+Refer to this [document](../../docs/en/user_guides/tracking_dataset_prepare.md) to prepare the tracking datasets.
+
+### 3. Training
+
+Because parameters such as the learning rate in the default configuration file assume an 8-GPU setup, we recommend training with 8 GPUs in order to reproduce the reported accuracy. You can start training with the following command.
+
+```shell
+# Train QDTrack on the mot17-half-train dataset.
+# The number after the config file is the number of GPUs; here we use 8.
+bash tools/dist_train.sh configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8
+```
+
+For more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, please refer to this [document](../../docs/en/user_guides/tracking_train_test.md).
+
+### 4. Testing and evaluation
+
+**4.1 Example on MOTxx-halfval dataset**
+
+```shell
+# Example 1: test on the motXX-half-val set.
+# The number after the config file is the number of GPUs; here we use 8.
+bash tools/dist_test_tracking.sh configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 --checkpoint ${CHECKPOINT_PATH}
+```
+
+**4.2 Video-based evaluation and testing**
+
+We provide two modes for evaluation and testing: image-based and video-based. To use the video-based mode, modify the config as follows:
+
+```python
+val_dataloader = dict(
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False))
+```
+
+For more detailed usage of `test_tracking.py/dist_test_tracking.sh/slurm_test_tracking.sh`, please refer to this [document](../../docs/en/user_guides/tracking_train_test.md).
+
+### 5. Inference
+
+Use a single GPU to run inference on a video and save the result as a video.
+
+```shell
+python demo/mot_demo.py demo/demo_mot.mp4 configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py --checkpoint ${CHECKPOINT_PATH} --out mot.mp4
+```
+
+For more detailed usage of `mot_demo.py`, please refer to this [document](../../docs/en/user_guides/tracking_inference.md).
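As a complement to the shell commands above, a minimal single-process sketch using mmengine's `Runner` is shown below; it assumes the command is run from the mmdetection root, and the work directory is a placeholder, not a path from this patch. It is intended for debugging only, since the section above recommends 8 GPUs to reproduce the reported accuracy.

```python
# Minimal single-GPU training sketch via mmengine's Runner (debugging only).
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py')
cfg.work_dir = './work_dirs/qdtrack_debug'  # placeholder output directory
runner = Runner.from_cfg(cfg)
runner.train()
```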
+ +## Citation + + + +```latex +@inproceedings{pang2021quasi, + title={Quasi-dense similarity learning for multiple object tracking}, + author={Pang, Jiangmiao and Qiu, Linlu and Li, Xia and Chen, Haofeng and Li, Qi and Darrell, Trevor and Yu, Fisher}, + booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, + pages={164--173}, + year={2021} +} +``` diff --git a/mmdetection/configs/qdtrack/metafile.yml b/mmdetection/configs/qdtrack/metafile.yml new file mode 100644 index 0000000..e5c5504 --- /dev/null +++ b/mmdetection/configs/qdtrack/metafile.yml @@ -0,0 +1,30 @@ +Collections: + - Name: QDTrack + Metadata: + Training Data: MOT17, crowdhuman + Training Techniques: + - SGD + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Paper: + URL: https://arxiv.org/pdf/2006.06664.pdf + Title: Quasi-Dense Similarity Learning for Multiple Object Tracking + README: configs/qdtrack/README.md + +Models: + - Name: qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval + In Collection: QDTrack + Config: configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py + Metadata: + Training Data: MOT17 + Training Memory (GB): 5.83 + Epochs: 4 + Results: + - Task: Multi-object Tracking + Dataset: MOT17 + Metrics: + HOTA: 57.1 + MOTA: 68.1 + IDF1: 68.6 + Weights: https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth diff --git a/mmdetection/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py b/mmdetection/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py new file mode 100644 index 0000000..e3c17c3 --- /dev/null +++ b/mmdetection/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_4e_base.py @@ -0,0 +1,118 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/default_runtime.py' +] + +detector = _base_.model +detector.pop('data_preprocessor') + +detector['backbone'].update( + dict( + norm_cfg=dict(type='BN', requires_grad=False), + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +detector.rpn_head.loss_bbox.update( + dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)) +detector.rpn_head.bbox_coder.update(dict(clip_border=False)) +detector.roi_head.bbox_head.update(dict(num_classes=1)) +detector.roi_head.bbox_head.bbox_coder.update(dict(clip_border=False)) +detector['init_cfg'] = dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_1x_coco-person/' + 'faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' + # noqa: E501 +) +del _base_.model + +model = dict( + type='QDTrack', + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + detector=detector, + track_head=dict( + type='QuasiDenseTrackHead', + roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + embed_head=dict( + type='QuasiDenseEmbedHead', + num_convs=4, + num_fcs=1, + embed_channels=256, + norm_cfg=dict(type='GN', num_groups=32), + loss_track=dict(type='MultiPosCrossEntropyLoss', loss_weight=0.25), + loss_track_aux=dict( + type='MarginL2Loss', + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0)), + loss_bbox=dict(type='L1Loss', loss_weight=1.0), + 
train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='CombinedSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=3, + add_gt_as_proposals=True, + pos_sampler=dict(type='InstanceBalancedPosSampler'), + neg_sampler=dict(type='RandomSampler')))), + tracker=dict( + type='QuasiDenseTracker', + init_score_thr=0.9, + obj_score_thr=0.5, + match_score_thr=0.5, + memo_tracklet_frames=30, + memo_backdrop_frames=1, + memo_momentum=0.8, + nms_conf_thr=0.5, + nms_backdrop_iou_thr=0.3, + nms_class_iou_thr=0.7, + with_cats=True, + match_metric='bisoftmax')) +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) +# learning policy +param_scheduler = [ + dict(type='MultiStepLR', begin=0, end=4, by_epoch=True, milestones=[3]) +] + +# runtime settings +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=4, val_interval=4) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=50), + visualization=dict(type='TrackVisualizationHook', draw=False)) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type='SyncBuffersHook') +] diff --git a/mmdetection/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/mmdetection/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..d87604d --- /dev/null +++ b/mmdetection/configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,14 @@ +_base_ = [ + './qdtrack_faster-rcnn_r50_fpn_4e_base.py', + '../_base_/datasets/mot_challenge.py', +] + +# evaluator +val_evaluator = [ + dict(type='CocoVideoMetric', metric=['bbox'], classwise=True), + dict(type='MOTChallengeMetric', metric=['HOTA', 'CLEAR', 'Identity']) +] + +test_evaluator = val_evaluator +# The fluctuation of HOTA is about +-1. +randomness = dict(seed=6) diff --git a/mmdetection/configs/queryinst/README.md b/mmdetection/configs/queryinst/README.md new file mode 100644 index 0000000..ee62ccb --- /dev/null +++ b/mmdetection/configs/queryinst/README.md @@ -0,0 +1,36 @@ +# QueryInst + +> [Instances as Queries](https://openaccess.thecvf.com/content/ICCV2021/html/Fang_Instances_As_Queries_ICCV_2021_paper.html) + + + +## Abstract + +We present QueryInst, a new perspective for instance segmentation. QueryInst is a multi-stage end-to-end system that treats instances of interest as learnable queries, enabling query based object detectors, e.g., Sparse R-CNN, to have strong instance segmentation performance. The attributes of instances such as categories, bounding boxes, instance masks, and instance association embeddings are represented by queries in a unified manner. In QueryInst, a query is shared by both detection and segmentation via dynamic convolutions and driven by parallelly-supervised multi-stage learning. 
We conduct extensive experiments on three challenging benchmarks, i.e., COCO, Cityscapes, and YouTube-VIS, to evaluate the effectiveness of QueryInst in object detection, instance segmentation, and video instance segmentation tasks. For the first time, we demonstrate that a simple end-to-end query-based framework can achieve state-of-the-art performance in various instance-level recognition tasks.
+
    + +
    + +## Results and Models + +| Model | Backbone | Style | Lr schd | Number of Proposals | Multi-Scale | RandomCrop | box AP | mask AP | Config | Download | +| :-------: | :-------: | :-----: | :-----: | :-----------------: | :---------: | :--------: | :----: | :-----: | :---------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| QueryInst | R-50-FPN | pytorch | 1x | 100 | False | False | 42.0 | 37.5 | [config](./queryinst_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916.log.json) | +| QueryInst | R-50-FPN | pytorch | 3x | 100 | True | False | 44.8 | 39.8 | [config](./queryinst_r50_fpn_ms-480-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643-7837af86.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643.log.json) | +| QueryInst | R-50-FPN | pytorch | 3x | 300 | True | True | 47.5 | 41.7 | [config](./queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802-85cffbd8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802.log.json) | +| QueryInst | R-101-FPN | pytorch | 3x | 100 | True | False | 46.4 | 41.0 | [config](./queryinst_r101_fpn_ms-480-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048-91f9995b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048.log.json) | +| QueryInst | R-101-FPN | pytorch | 3x | 300 | True | True | 49.0 | 42.9 | [config](./queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621-76cce59f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621.log.json) | + +## Citation + +```latex +@InProceedings{Fang_2021_ICCV, + author = {Fang, Yuxin and Yang, Shusheng and Wang, Xinggang and Li, Yu and Fang, Chen and Shan, Ying and Feng, Bin and Liu, Wenyu}, + 
title = {Instances As Queries}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2021}, + pages = {6910-6919} +} +``` diff --git a/mmdetection/configs/queryinst/metafile.yml b/mmdetection/configs/queryinst/metafile.yml new file mode 100644 index 0000000..3ea3b00 --- /dev/null +++ b/mmdetection/configs/queryinst/metafile.yml @@ -0,0 +1,100 @@ +Collections: + - Name: QueryInst + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - QueryInst + Paper: + URL: https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Instances_As_Queries_ICCV_2021_paper.pdf + Title: 'Instances as Queries' + README: configs/queryinst/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/detectors/queryinst.py + Version: v2.18.0 + +Models: + - Name: queryinst_r50_fpn_1x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_1x_coco/queryinst_r50_fpn_1x_coco_20210907_084916-5a8f1998.pth + + - Name: queryinst_r50_fpn_ms-480-800-3x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco/queryinst_r50_fpn_mstrain_480-800_3x_coco_20210901_103643-7837af86.pth + + - Name: queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_101802-85cffbd8.pth + + - Name: queryinst_r101_fpn_ms-480-800-3x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_mstrain_480-800_3x_coco/queryinst_r101_fpn_mstrain_480-800_3x_coco_20210904_104048-91f9995b.pth + + - Name: queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco + In Collection: QueryInst + Config: configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.9 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/queryinst/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/queryinst_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20210904_153621-76cce59f.pth diff --git a/mmdetection/configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py b/mmdetection/configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py new file mode 100644 index 0000000..1692c13 --- /dev/null +++ b/mmdetection/configs/queryinst/queryinst_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py b/mmdetection/configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py new file mode 100644 index 0000000..dd5b7f4 --- /dev/null +++ b/mmdetection/configs/queryinst/queryinst_r101_fpn_ms-480-800-3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './queryinst_r50_fpn_ms-480-800-3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/queryinst/queryinst_r50_fpn_1x_coco.py b/mmdetection/configs/queryinst/queryinst_r50_fpn_1x_coco.py new file mode 100644 index 0000000..63d61d7 --- /dev/null +++ b/mmdetection/configs/queryinst/queryinst_r50_fpn_1x_coco.py @@ -0,0 +1,155 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +num_stages = 6 +num_proposals = 100 +model = dict( + type='QueryInst', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=0, + add_extra_convs='on_input', + num_outs=4), + rpn_head=dict( + type='EmbeddingRPNHead', + num_proposals=num_proposals, + proposal_feature_channel=256), + roi_head=dict( + type='SparseRoIHead', + num_stages=num_stages, + stage_loss_weights=[1] * num_stages, + proposal_feature_channel=256, + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='DIIHead', + num_classes=80, + num_ffn_fcs=2, + num_heads=8, + num_cls_fcs=1, + num_reg_fcs=3, + feedforward_channels=2048, + in_channels=256, + dropout=0.0, + ffn_act_cfg=dict(type='ReLU', inplace=True), + dynamic_conv_cfg=dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=7, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + 
loss_weight=2.0), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + clip_border=False, + target_means=[0., 0., 0., 0.], + target_stds=[0.5, 0.5, 1., 1.])) for _ in range(num_stages) + ], + mask_head=[ + dict( + type='DynamicMaskHead', + dynamic_conv_cfg=dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=14, + with_proj=False, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + num_convs=4, + num_classes=80, + roi_feat_size=14, + in_channels=256, + conv_kernel_size=3, + conv_out_channels=256, + class_agnostic=False, + norm_cfg=dict(type='BN'), + upsample_cfg=dict(type='deconv', scale_factor=2), + loss_mask=dict( + type='DiceLoss', + loss_weight=8.0, + use_sigmoid=True, + activate=False, + eps=1e-5)) for _ in range(num_stages) + ]), + # training and testing settings + train_cfg=dict( + rpn=None, + rcnn=[ + dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xyxy'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ]), + sampler=dict(type='PseudoSampler'), + pos_weight=1, + mask_size=28, + ) for _ in range(num_stages) + ]), + test_cfg=dict( + rpn=None, rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5))) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}), + clip_grad=dict(max_norm=0.1, norm_type=2)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] diff --git a/mmdetection/configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py b/mmdetection/configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py new file mode 100644 index 0000000..33ab061 --- /dev/null +++ b/mmdetection/configs/queryinst/queryinst_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py @@ -0,0 +1,45 @@ +_base_ = './queryinst_r50_fpn_ms-480-800-3x_coco.py' +num_proposals = 300 +model = dict( + rpn_head=dict(num_proposals=num_proposals), + test_cfg=dict( + _delete_=True, + rpn=None, + rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5))) + +# augmentation strategy originates from DETR. 
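+# The RandomChoice transform below picks, per image, either (a) a plain
+# multi-scale resize or (b) a resize -> absolute-range RandomCrop -> resize
+# chain, reproducing the DETR-style crop augmentation.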
+train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py b/mmdetection/configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py new file mode 100644 index 0000000..6b99374 --- /dev/null +++ b/mmdetection/configs/queryinst/queryinst_r50_fpn_ms-480-800-3x_coco.py @@ -0,0 +1,32 @@ +_base_ = './queryinst_r50_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# learning policy +max_epochs = 36 +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=max_epochs) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] diff --git a/mmdetection/configs/recycle/detr_r50_8xb2-150e_recycle.py b/mmdetection/configs/recycle/detr_r50_8xb2-150e_recycle.py new file mode 100644 index 0000000..2af2fa7 --- /dev/null +++ b/mmdetection/configs/recycle/detr_r50_8xb2-150e_recycle.py @@ -0,0 +1,61 @@ +# The new config inherits a base config to highlight the necessary modification +_base_ = '../detr/detr_r50_8xb2-150e_coco.py' + +# We also need to change the num_classes in head to match the dataset's annotation +model = dict( + bbox_head=dict( + type='DETRHead', + num_classes=15, + embed_dims=256, + loss_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + ) + +# default_hooks = dict( +# dict(type='TextLoggerHook'), +# logger=dict(type='MMDetWandbHook', init_kwars={'project' : 'TA_test'}, +# interval=10, +# log_checkpoint=True, +# log_checkpoint_metadata=True, +# num_eval_images=100 +# )) +vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')] +visualizer = dict(vis_backends=vis_backends) + +# Modify dataset related settings +data_root = 'data/recycle/' +metainfo = { + 'classes': ('General trash', 'Paper', 'Paper pack', 'Metal', 'Glass', + 'Plastic', 'Styrofoam', 'Plastic bag', 'Battery', 
'Clothing',), + 'palette': [ + (220, 20, 60), (119, 11, 32), (0, 0, 230), (106, 0, 228), (60, 20, 220), + (0, 80, 100), (0, 0, 70), (50, 0, 192), (250, 170, 30), (255, 0, 0) + ] +} +train_dataloader = dict( + batch_size=8, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='train.json', + data_prefix=dict(img=''))) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='train.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +# Modify metric related settings +val_evaluator = dict(ann_file=data_root + 'train.json') +test_evaluator = val_evaluator + +# We can use the pre-trained Mask RCNN model to obtain higher performance +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/detr/detr_r50_8xb2-150e_coco/detr_r50_8xb2-150e_coco_20221023_153551-436d03e8.pth' \ No newline at end of file diff --git a/mmdetection/configs/recycle/faster-rcnn_r50_fpn_1x_recycle.py b/mmdetection/configs/recycle/faster-rcnn_r50_fpn_1x_recycle.py new file mode 100644 index 0000000..caf5cea --- /dev/null +++ b/mmdetection/configs/recycle/faster-rcnn_r50_fpn_1x_recycle.py @@ -0,0 +1,48 @@ +# The new config inherits a base config to highlight the necessary modification +_base_ = '../faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py' + +# We also need to change the num_classes in head to match the dataset's annotation +# model = dict( +# roi_head=dict( +# bbox_head=dict(num_classes=1), mask_head=dict(num_classes=1))) + +custom_hooks = [ + dict(type='SubmissionHook', test_out_dir='submit') +] + +# Modify dataset related settings +data_root = 'data/recycle/' +metainfo = { + 'classes': ('General trash', 'Paper', 'Paper pack', 'Metal', 'Glass', + 'Plastic', 'Styrofoam', 'Plastic bag', 'Battery', 'Clothing',), + 'palette': [ + (220, 20, 60), (119, 11, 32), (0, 0, 230), (106, 0, 228), (60, 20, 220), + (0, 80, 100), (0, 0, 70), (50, 0, 192), (250, 170, 30), (255, 0, 0) + ] +} +train_dataloader = dict( + batch_size=8, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='train.json', + data_prefix=dict(img=''))) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='train.json', + data_prefix=dict(img=''))) +test_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='test.json', + data_prefix=dict(img=''))) + +# Modify metric related settings +val_evaluator = dict(ann_file=data_root + 'train.json') +test_evaluator = dict(ann_file=data_root + 'test.json') + +# We can use the pre-trained Mask RCNN model to obtain higher performance +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' diff --git a/mmdetection/configs/regnet/README.md b/mmdetection/configs/regnet/README.md new file mode 100644 index 0000000..0bfcec1 --- /dev/null +++ b/mmdetection/configs/regnet/README.md @@ -0,0 +1,121 @@ +# RegNet + +> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) + + + +## Abstract + +In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. 
The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs. + +
    + +
+## Introduction
+
+We implement RegNetX and RegNetY models as backbones in detection systems and provide their first results on Mask R-CNN, Faster R-CNN and RetinaNet.
+
+The pre-trained models are converted from the [model zoo of pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md).
+
+## Usage
+
+Using a RegNet model requires two steps:
+
+1. Convert the model to the ResNet-style format supported by MMDetection
+2. Modify the backbone and neck in the config accordingly
+
+### Convert model
+
+We already provide models ranging from 400MF to 12GF in our model zoo.
+
+For more general usage, we also provide the script `regnet2mmdet.py` in the tools directory to convert the keys of models pretrained by [pycls](https://github.com/facebookresearch/pycls/) to ResNet-style checkpoints used in MMDetection.
+
+```bash
+python -u tools/model_converters/regnet2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH}
+```
+
+This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
+
+### Modify config
+
+Users can modify the backbone's `depth` and the corresponding keys in `arch` according to the configs in the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md).
+The FPN parameter `in_channels` can be found in Figures 15 & 16 of the paper (`wi` in the legend).
+This directory already provides several configs with their performance, covering RegNetX models from the 800MF to the 12GF level; a minimal override example is given after the note below.
+For other pre-trained models or self-implemented RegNet models, users are responsible for checking these parameters themselves.
+
+**Note**: Although Figures 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, they are quantized and thus inaccurate; using them sometimes produces a backbone whose keys do not match those in the pre-trained model.
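+
+For example, switching the 3.2GF base config to RegNetX-400MF only requires overriding the backbone `arch` (and its pretrained checkpoint) together with the FPN `in_channels`. The trimmed sketch below mirrors `faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py` further down in this patch; the remaining backbone fields are inherited from the base config.
+
+```python
+_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py'
+model = dict(
+    backbone=dict(
+        type='RegNet',
+        # arch name must match the key of the converted pycls checkpoint
+        arch='regnetx_400mf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')),
+    neck=dict(
+        type='FPN',
+        # stage widths (wi) of RegNetX-400MF, read from Fig. 15 & 16
+        in_channels=[32, 64, 160, 384],
+        out_channels=256,
+        num_outs=5))
+```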
+ +## Results and Models + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------------------------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [R-50-FPN](../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py) | pytorch | 1x | 4.4 | 12.0 | 38.2 | 34.7 | [config](../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205_050542.log.json) | +| [RegNetX-3.2GF-FPN](./mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py) | pytorch | 1x | 5.0 | | 40.3 | 36.6 | [config](./mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141.log.json) | +| [RegNetX-4.0GF-FPN](./mask-rcnn_regnetx-4GF_fpn_1x_coco.py) | pytorch | 1x | 5.5 | | 41.5 | 37.4 | [config](./mask-rcnn_regnetx-4GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217-32e9c92d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217.log.json) | +| [R-101-FPN](../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py) | pytorch | 1x | 6.4 | 10.3 | 40.0 | 36.1 | [config](../mask_rcnn/mask-rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204-1efe0ed5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r101_fpn_1x_coco/mask_rcnn_r101_fpn_1x_coco_20200204_144809.log.json) | +| [RegNetX-6.4GF-FPN](./mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py) | pytorch | 1x | 6.1 | | 41.0 | 37.1 | [config](./mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439-3a7aae83.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439.log.json) | +| [X-101-32x4d-FPN](../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py) | pytorch | 1x | 7.6 | 9.4 | 41.9 | 37.5 | [config](../mask_rcnn/mask-rcnn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205-478d0b67.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco/mask_rcnn_x101_32x4d_fpn_1x_coco_20200205_034906.log.json) | +| [RegNetX-8.0GF-FPN](./mask-rcnn_regnetx-8GF_fpn_1x_coco.py) | pytorch | 1x | 6.4 | | 41.7 | 37.5 | [config](./mask-rcnn_regnetx-8GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515-09daa87e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515.log.json) | +| [RegNetX-12GF-FPN](./mask-rcnn_regnetx-12GF_fpn_1x_coco.py) | pytorch | 1x | 7.4 | | 42.2 | 38 | [config](./mask-rcnn_regnetx-12GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552-b538bd8b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552.log.json) | +| [RegNetX-3.2GF-FPN-DCN-C3-C5](./mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py) | pytorch | 1x | 5.0 | | 40.3 | 36.6 | [config](./mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726-75f40794.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726.log.json) | + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [R-50-FPN](../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py) | pytorch | 1x | 4.0 | 18.2 | 37.4 | [config](../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130_204655.log.json) | +| [RegNetX-3.2GF-FPN](./faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py) | pytorch | 1x | 4.5 | | 39.9 | [config](./faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927-126fd9bf.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927.log.json) | +| [RegNetX-3.2GF-FPN](./faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py) | pytorch | 2x | 4.5 | | 41.1 | [config](./faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955-e2081918.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955.log.json) | + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-----------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [R-50-FPN](../retinanet/retinanet_r50_fpn_1x_coco.py) | pytorch | 1x | 3.8 | 16.6 | 36.5 | [config](../retinanet/retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130_002941.log.json) | +| [RegNetX-800MF-FPN](./retinanet_regnetx-800MF_fpn_1x_coco.py) | pytorch | 1x | 2.5 | | 35.6 | [config](./retinanet_regnetx-800MF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403-f6f91d10.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403.log.json) | +| [RegNetX-1.6GF-FPN](./retinanet_regnetx-1.6GF_fpn_1x_coco.py) | pytorch | 1x | 3.3 | | 37.3 | [config](./retinanet_regnetx-1.6GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403-37009a9d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403.log.json) | +| [RegNetX-3.2GF-FPN](./retinanet_regnetx-3.2GF_fpn_1x_coco.py) | pytorch | 1x | 4.2 | | 39.1 | [config](./retinanet_regnetx-3.2GF_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141-cb1509e8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141.log.json) | + +### Pre-trained models + +We also train some models with longer schedules and multi-scale training. The users could finetune them for downstream tasks. 
+ +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :---------------: | :----------------------------------------------------------------------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster RCNN | [RegNetX-400MF-FPN](./faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py) | pytorch | 3x | 2.3 | | 37.1 | - | [config](./faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112-e1967c37.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112.log.json) | +| Faster RCNN | [RegNetX-800MF-FPN](./faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py) | pytorch | 3x | 2.8 | | 38.8 | - | [config](./faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118-a2c70b20.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118.log.json) | +| Faster RCNN | [RegNetX-1.6GF-FPN](./faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py) | pytorch | 3x | 3.4 | | 40.5 | - | [config](./faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325-94aa46cc.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325.log.json) | +| Faster RCNN | [RegNetX-3.2GF-FPN](./faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py) | pytorch | 3x | 4.4 | | 42.3 | - | [config](./faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152-e16a5227.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152.log.json) | +| Faster RCNN | [RegNetX-4GF-FPN](./faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py) | pytorch | 3x | 4.9 | | 42.8 | - | [config](./faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201-65eaf841.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201.log.json) | +| Mask RCNN | [RegNetX-400MF-FPN](./mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py) | pytorch | 3x | 2.5 | | 37.6 | 34.4 | [config](./mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443-8aac57a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443.log.json) | +| Mask RCNN | [RegNetX-800MF-FPN](./mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py) | pytorch | 3x | 2.9 | | 39.5 | 36.1 | [config](./mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641-715d51f5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641.log.json) | +| Mask RCNN | [RegNetX-1.6GF-FPN](./mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py) | pytorch | 3x | 3.6 | | 40.9 | 37.5 | [config](./mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641-6764cff5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641.log.json) | +| Mask RCNN | [RegNetX-3.2GF-FPN](./mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py) | pytorch | 3x | 5.0 | | 43.1 | 38.7 | [config](./mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221.log.json) | +| Mask RCNN | [RegNetX-4GF-FPN](./mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py) | pytorch | 3x | 5.1 | | 43.4 | 39.2 | [config](./mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621-00f0331c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621.log.json) | +| Cascade Mask RCNN | [RegNetX-400MF-FPN](./cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py) | pytorch | 3x | 4.3 | | 41.6 | 36.4 | [config](./cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619-5142f449.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619.log.json) | +| Cascade Mask RCNN | [RegNetX-800MF-FPN](./cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py) | pytorch | 3x | 4.8 | | 42.8 | 37.6 | [config](./cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616-dcbd13f4.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616.log.json) | +| Cascade Mask RCNN | [RegNetX-1.6GF-FPN](./cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py) | pytorch | 3x | 5.4 | | 44.5 | 39.0 | [config](./cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616-75f29a61.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616.log.json) | +| Cascade Mask RCNN | [RegNetX-3.2GF-FPN](./cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py) | pytorch | 3x | 6.4 | | 45.8 | 40.0 | [config](./cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616-b9c2c58b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616.log.json) | +| Cascade Mask RCNN | [RegNetX-4GF-FPN](./cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py) | pytorch | 3x | 6.9 | | 45.8 | 40.0 | [config](./cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034-cbb1be4c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034.log.json) | + +### Notice + +1. The models are trained using a different weight decay, i.e., `weight_decay=5e-5` according to the setting in ImageNet training. This brings improvement of at least 0.7 AP absolute but does not improve the model using ResNet-50. +2. RetinaNets using RegNets are trained with learning rate 0.02 with gradient clip. We find that using learning rate 0.02 could improve the results by at least 0.7 AP absolute and gradient clip is necessary to stabilize the training. However, this does not improve the performance of ResNet-50-FPN RetinaNet. 
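+
+For reference, the two points above correspond to the optimizer override used by the 3x RegNet configs later in this patch (shown here as a standalone snippet):
+
+```python
+optim_wrapper = dict(
+    # RegNets keep the weight decay used for their ImageNet pre-training
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005),
+    # gradient clipping is needed to stabilize training, notably for RetinaNet
+    clip_grad=dict(max_norm=35, norm_type=2))
+```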
+ +## Citation + +```latex +@article{radosavovic2020designing, + title={Designing Network Design Spaces}, + author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár}, + year={2020}, + eprint={2003.13678}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..74e6ada --- /dev/null +++ b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_1.6gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')), + neck=dict( + type='FPN', + in_channels=[72, 168, 408, 912], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..ea21902 --- /dev/null +++ b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py @@ -0,0 +1,28 @@ +_base_ = [ + '../common/ms_3x_coco-instance.py', + '../_base_/models/cascade-mask-rcnn_r50_fpn.py' +] +model = dict( + data_preprocessor=dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict(optimizer=dict(weight_decay=0.00005)) diff --git a/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..3fe47f8 --- /dev/null +++ b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_400mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')), + neck=dict( + type='FPN', + in_channels=[32, 64, 160, 384], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..e22886a --- /dev/null +++ b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_4.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', 
checkpoint='open-mmlab://regnetx_4.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 560, 1360], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..655bdc6 --- /dev/null +++ b/mmdetection/configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_800mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')), + neck=dict( + type='FPN', + in_channels=[64, 128, 288, 672], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..e9e8302 --- /dev/null +++ b/mmdetection/configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_1.6gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')), + neck=dict( + type='FPN', + in_channels=[72, 168, 408, 912], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py b/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py new file mode 100644 index 0000000..db49092 --- /dev/null +++ b/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + data_preprocessor=dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)) diff --git a/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py b/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py new file mode 100644 index 0000000..be53360 --- /dev/null +++ b/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py 
b/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..d3d5d5d --- /dev/null +++ b/mmdetection/configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py @@ -0,0 +1,25 @@ +_base_ = ['../common/ms_3x_coco.py', '../_base_/models/faster-rcnn_r50_fpn.py'] +model = dict( + data_preprocessor=dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict(optimizer=dict(weight_decay=0.00005)) diff --git a/mmdetection/configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..2edeff9 --- /dev/null +++ b/mmdetection/configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_400mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')), + neck=dict( + type='FPN', + in_channels=[32, 64, 160, 384], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..afcbb5d --- /dev/null +++ b/mmdetection/configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_4.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 560, 1360], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..f659ec9 --- /dev/null +++ b/mmdetection/configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = 'faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_800mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')), + neck=dict( + type='FPN', + in_channels=[64, 128, 288, 672], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..60874c6 --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py @@ -0,0 +1,26 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + 
'../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_1.6gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')), + neck=dict( + type='FPN', + in_channels=[72, 168, 408, 912], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005), + clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py new file mode 100644 index 0000000..e82cece --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_12gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_12gf')), + neck=dict( + type='FPN', + in_channels=[224, 448, 896, 2240], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py new file mode 100644 index 0000000..c7c1d1a --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = 'mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf'))) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py new file mode 100644 index 0000000..c52bf13 --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py @@ -0,0 +1,30 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + data_preprocessor=dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py new file mode 100644 index 0000000..36482c9 --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py @@ -0,0 +1,60 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + 
data_preprocessor=dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning policy +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..b96e192 --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py @@ -0,0 +1,26 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_400mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_400mf')), + neck=dict( + type='FPN', + in_channels=[32, 64, 160, 384], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005), + clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py new file mode 100644 index 0000000..ce9f8ef --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_4.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 560, 1360], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..f160ccf --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py @@ -0,0 +1,26 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = 
dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_4.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_4.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 560, 1360], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005), + clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py new file mode 100644 index 0000000..e17a3d7 --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_6.4gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_6.4gf')), + neck=dict( + type='FPN', + in_channels=[168, 392, 784, 1624], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py new file mode 100644 index 0000000..93851fd --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py @@ -0,0 +1,26 @@ +_base_ = [ + '../common/ms-poly_3x_coco-instance.py', + '../_base_/models/mask-rcnn_r50_fpn.py' +] + +model = dict( + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_800mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')), + neck=dict( + type='FPN', + in_channels=[64, 128, 288, 672], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005), + clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py b/mmdetection/configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py new file mode 100644 index 0000000..62a4c93 --- /dev/null +++ b/mmdetection/configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_8.0gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_8.0gf')), + neck=dict( + type='FPN', + in_channels=[80, 240, 720, 1920], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/regnet/metafile.yml b/mmdetection/configs/regnet/metafile.yml new file mode 100644 index 0000000..19fbba8 --- /dev/null +++ b/mmdetection/configs/regnet/metafile.yml @@ -0,0 +1,797 @@ +Models: + - Name: mask-rcnn_regnetx-3.2GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object 
Detection + Dataset: COCO + Metrics: + box AP: 40.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_1x_coco_20200520_163141-2a9d1814.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-4GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-4GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_1x_coco/mask_rcnn_regnetx-4GF_fpn_1x_coco_20200517_180217-32e9c92d.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-6.4GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-6.4GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.1 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-6.4GF_fpn_1x_coco/mask_rcnn_regnetx-6.4GF_fpn_1x_coco_20200517_180439-3a7aae83.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-8GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-8GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.4 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-8GF_fpn_1x_coco/mask_rcnn_regnetx-8GF_fpn_1x_coco_20200517_180515-09daa87e.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-12GF_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-12GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training 
Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-12GF_fpn_1x_coco/mask_rcnn_regnetx-12GF_fpn_1x_coco_20200517_180552-b538bd8b.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-3.2GF-mdconv-c3-c5_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco_20200520_172726-75f40794.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster-rcnn_regnetx-3.2GF_fpn_1x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_1x_coco/faster_rcnn_regnetx-3.2GF_fpn_1x_coco_20200517_175927-126fd9bf.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster-rcnn_regnetx-3.2GF_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_2x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_2x_coco/faster_rcnn_regnetx-3.2GF_fpn_2x_coco_20200520_223955-e2081918.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: retinanet_regnetx-800MF_fpn_1x_coco + In Collection: RetinaNet + Config: configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 2.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay 
+ Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-800MF_fpn_1x_coco/retinanet_regnetx-800MF_fpn_1x_coco_20200517_191403-f6f91d10.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: retinanet_regnetx-1.6GF_fpn_1x_coco + In Collection: RetinaNet + Config: configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.3 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco/retinanet_regnetx-1.6GF_fpn_1x_coco_20200517_191403-37009a9d.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: retinanet_regnetx-3.2GF_fpn_1x_coco + In Collection: RetinaNet + Config: configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.2 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco/retinanet_regnetx-3.2GF_fpn_1x_coco_20200520_163141-cb1509e8.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster-rcnn_regnetx-400MF_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster-rcnn_regnetx-400MF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 2.3 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210526_095112-e1967c37.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster-rcnn_regnetx-800MF_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster-rcnn_regnetx-800MF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 2.8 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.8 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210526_095118-a2c70b20.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 3.4 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-1_20210526_095325-94aa46cc.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 4.4 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-3_20210526_095152-e16a5227.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: faster-rcnn_regnetx-4GF_fpn_ms-3x_coco + In Collection: Faster R-CNN + Config: configs/regnet/faster-rcnn_regnetx-4GF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 4.9 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/faster_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210526_095201-65eaf841.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-400MF_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 2.5 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-400MF_fpn_mstrain-poly_3x_coco_20210601_235443-8aac57a4.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-800MF_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 2.9 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-800MF_fpn_mstrain-poly_3x_coco_20210602_210641-715d51f5.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-1.6GF_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 3.6 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-1.6GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-1_20210602_210641-6764cff5.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training 
Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco_20200521_202221-99879813.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco + In Collection: Mask R-CNN + Config: configs/regnet/mask-rcnn_regnetx-4GF_fpn_ms-poly-3x_coco.py + Metadata: + Training Memory (GB): 5.1 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco/mask_rcnn_regnetx-4GF_fpn_mstrain-poly_3x_coco_20210602_032621-00f0331c.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/regnet/cascade-mask-rcnn_regnetx-400MF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 4.3 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-400MF_fpn_mstrain_3x_coco_20210715_211619-5142f449.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/regnet/cascade-mask-rcnn_regnetx-800MF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-800MF_fpn_mstrain_3x_coco_20210715_211616-dcbd13f4.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: 
cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/regnet/cascade-mask-rcnn_regnetx-1.6GF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 5.4 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-1.6GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-1_20210715_211616-75f29a61.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/regnet/cascade-mask-rcnn_regnetx-3.2GF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 6.4 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-3.2GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-3_20210715_211616-b9c2c58b.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 + + - Name: cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco + In Collection: Cascade R-CNN + Config: configs/regnet/cascade-mask-rcnn_regnetx-4GF_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 6.9 + Epochs: 36 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - RegNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/regnet/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco/cascade_mask_rcnn_regnetx-4GF_fpn_mstrain_3x_coco_20210715_212034-cbb1be4c.pth + Paper: + URL: https://arxiv.org/abs/2003.13678 + Title: 'Designing Network Design Spaces' + README: configs/regnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/regnet.py#L11 + Version: v2.1.0 diff --git a/mmdetection/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py b/mmdetection/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py new file mode 100644 index 0000000..7395c1b --- /dev/null +++ b/mmdetection/configs/regnet/retinanet_regnetx-1.6GF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './retinanet_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_1.6gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf')), + neck=dict( + type='FPN', + in_channels=[72, 168, 408, 912], + out_channels=256, 
+ num_outs=5)) diff --git a/mmdetection/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py b/mmdetection/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py new file mode 100644 index 0000000..8b8a32c --- /dev/null +++ b/mmdetection/configs/regnet/retinanet_regnetx-3.2GF_fpn_1x_coco.py @@ -0,0 +1,31 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + data_preprocessor=dict( + # The mean and std are used in PyCls when training RegNets + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + _delete_=True, + type='RegNet', + arch='regnetx_3.2gf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf')), + neck=dict( + type='FPN', + in_channels=[96, 192, 432, 1008], + out_channels=256, + num_outs=5)) + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.00005), + clip_grad=dict(max_norm=35, norm_type=2)) diff --git a/mmdetection/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py b/mmdetection/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py new file mode 100644 index 0000000..f6f8989 --- /dev/null +++ b/mmdetection/configs/regnet/retinanet_regnetx-800MF_fpn_1x_coco.py @@ -0,0 +1,17 @@ +_base_ = './retinanet_regnetx-3.2GF_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='RegNet', + arch='regnetx_800mf', + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://regnetx_800mf')), + neck=dict( + type='FPN', + in_channels=[64, 128, 288, 672], + out_channels=256, + num_outs=5)) diff --git a/mmdetection/configs/reid/README.md b/mmdetection/configs/reid/README.md new file mode 100644 index 0000000..a5bfe5e --- /dev/null +++ b/mmdetection/configs/reid/README.md @@ -0,0 +1,135 @@ +# Training a ReID Model + +You may want to train a ReID model for multiple object tracking or other applications. We support ReID model training in MMDetection, which is built upon [MMPretrain](https://github.com/open-mmlab/mmpretrain). + +### 1. Development Environment Setup + +Tracking Development Environment Setup can refer to this [document](../../docs/en/get_started.md). + +### 2. Dataset Preparation + +This section will show how to train a ReID model on standard datasets i.e. MOT17. + +We need to download datasets following docs. We use [ReIDDataset](mmdet/datasets/reid_dataset.py) to maintain standard datasets. In this case, you need to convert the official dataset to this style. We provide scripts and the usages as follow: + +```python +python tools/dataset_converters/mot2reid.py -i ./data/MOT17/ -o ./data/MOT17/reid --val-split 0.2 --vis-threshold 0.3 +``` + +Arguments: + +- `--val-split`: Proportion of the validation dataset to the whole ReID dataset. +- `--vis-threshold`: Threshold of visibility for each person. + +The directory of the converted datasets is as follows: + +``` +MOT17 +├── train +├── test +├── reid +│ ├── imgs +│ │ ├── MOT17-02-FRCNN_000002 +│ │ │ ├── 000000.jpg +│ │ │ ├── 000001.jpg +│ │ │ ├── ... +│ │ ├── MOT17-02-FRCNN_000003 +│ │ │ ├── 000000.jpg +│ │ │ ├── 000001.jpg +│ │ │ ├── ... 
+│   ├── meta
+│   │   ├── train_80.txt
+│   │   ├── val_20.txt
+```
+
+Note: `80` in `train_80.txt` means that the proportion of the training dataset to the whole ReID dataset is eighty percent, while the proportion of the validation dataset is twenty percent.
+
+For training, we provide an annotation list `train_80.txt`. Each line of the list contains a filename and its corresponding ground-truth label. The format is as follows:
+
+```
+MOT17-05-FRCNN_000110/000018.jpg 0
+MOT17-13-FRCNN_000146/000014.jpg 1
+MOT17-05-FRCNN_000088/000004.jpg 2
+MOT17-02-FRCNN_000009/000081.jpg 3
+```
+
+For validation, the annotation list `val_20.txt` follows the same format as above.
+
+Note: Images in `MOT17/reid/imgs` are cropped from the raw images in `MOT17/train` according to the corresponding `gt.txt`. The values of the ground-truth labels should fall in the range `[0, num_classes - 1]`.
+
+### 3. Training
+
+#### Training on a single GPU
+
+```shell
+python tools/train.py configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py
+```
+
+#### Training on multiple GPUs
+
+We provide `tools/dist_train.sh` to launch training on multiple GPUs.
+The basic usage is as follows.
+
+```shell
+bash tools/dist_train.sh configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py 8
+```
+
+### 4. Customize Dataset
+
+This section will show how to train a ReID model on customized datasets.
+
+### 4.1 Dataset Preparation
+
+You need to convert your customized dataset to an existing dataset format.
+
+#### An example of customized dataset
+
+Assume we are going to implement a `Filelist` dataset, which takes filelists for both training and testing. The directory of the dataset is as follows:
+
+```
+Filelist
+├── imgs
+│   ├── person1
+│   │   ├── 000000.jpg
+│   │   ├── 000001.jpg
+│   │   ├── ...
+│   ├── person2
+│   │   ├── 000000.jpg
+│   │   ├── 000001.jpg
+│   │   ├── ...
+├── meta
+│   ├── train.txt
+│   ├── val.txt
+```
+
+The format of the annotation list is as follows (a small script for generating such lists is sketched at the end of this README):
+
+```
+person1/000000.jpg 0
+person1/000001.jpg 0
+person2/000000.jpg 1
+person2/000001.jpg 1
+```
+
+You can directly use [ReIDDataset](mmdet/datasets/reid_dataset.py). In this case, you only need to modify the config as follows:
+
+```python
+# modify the path of annotation files and the image path prefix
+data = dict(
+    train=dict(
+        data_prefix='data/Filelist/imgs',
+        ann_file='data/Filelist/meta/train.txt'),
+    val=dict(
+        data_prefix='data/Filelist/imgs',
+        ann_file='data/Filelist/meta/val.txt'),
+    test=dict(
+        data_prefix='data/Filelist/imgs',
+        ann_file='data/Filelist/meta/val.txt'),
+)
+# modify the number of classes, assuming your training set has 100 classes
+model = dict(reid=dict(head=dict(num_classes=100)))
+```
+
+### 4.2 Training
+
+The training stage is the same as for the standard datasets above.
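+As mentioned in section 4.1, the annotation lists above can be generated with a few lines of Python. The snippet below is only a rough sketch under the assumed `Filelist` layout; the per-identity 80/20 split, the paths and the helper name are illustrative choices, not part of MMDetection:
+
+```python
+import os
+
+
+def collect(img_dir):
+    """Map each identity folder to a sorted list of its image names."""
+    return {p: sorted(os.listdir(os.path.join(img_dir, p)))
+            for p in sorted(os.listdir(img_dir))}
+
+
+data_root = 'data/Filelist'  # assumed layout from the example above
+identities = collect(os.path.join(data_root, 'imgs'))
+os.makedirs(os.path.join(data_root, 'meta'), exist_ok=True)
+
+train_lines, val_lines = [], []
+for label, (person, images) in enumerate(identities.items()):
+    # the identity index doubles as the class label, so labels stay in
+    # [0, num_classes - 1]; the 80/20 per-identity split is a naive default
+    split = max(1, int(len(images) * 0.8))
+    train_lines += [f'{person}/{name} {label}' for name in images[:split]]
+    val_lines += [f'{person}/{name} {label}' for name in images[split:]]
+
+with open(os.path.join(data_root, 'meta', 'train.txt'), 'w') as f:
+    f.write('\n'.join(train_lines) + '\n')
+with open(os.path.join(data_root, 'meta', 'val.txt'), 'w') as f:
+    f.write('\n'.join(val_lines) + '\n')
+```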
diff --git a/mmdetection/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py b/mmdetection/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py new file mode 100644 index 0000000..4e30b22 --- /dev/null +++ b/mmdetection/configs/reid/reid_r50_8xb32-6e_mot15train80_test-mot15val20.py @@ -0,0 +1,7 @@ +_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py'] +model = dict(head=dict(num_classes=368)) +# data +data_root = 'data/MOT15/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py b/mmdetection/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py new file mode 100644 index 0000000..468b9bf --- /dev/null +++ b/mmdetection/configs/reid/reid_r50_8xb32-6e_mot16train80_test-mot16val20.py @@ -0,0 +1,7 @@ +_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py'] +model = dict(head=dict(num_classes=371)) +# data +data_root = 'data/MOT16/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py b/mmdetection/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py new file mode 100644 index 0000000..83669de --- /dev/null +++ b/mmdetection/configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py @@ -0,0 +1,61 @@ +_base_ = [ + '../_base_/datasets/mot_challenge_reid.py', '../_base_/default_runtime.py' +] +model = dict( + type='BaseReID', + data_preprocessor=dict( + type='ReIDDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + backbone=dict( + type='mmpretrain.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmpretrain.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU')), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth' # noqa: E501 + )) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + clip_grad=None, + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 1000, + by_epoch=False, + begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=6, + by_epoch=True, + milestones=[5], + gamma=0.1) +] + +# train, val, test setting +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/mmdetection/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py b/mmdetection/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py new file mode 100644 index 0000000..8a80799 --- /dev/null +++ b/mmdetection/configs/reid/reid_r50_8xb32-6e_mot20train80_test-mot20val20.py @@ -0,0 +1,10 @@ +_base_ = ['./reid_r50_8xb32-6e_mot17train80_test-mot17val20.py'] +model = dict(head=dict(num_classes=1701)) +# data +data_root = 'data/MOT20/' 
+train_dataloader = dict(dataset=dict(data_root=data_root))
+val_dataloader = dict(dataset=dict(data_root=data_root))
+test_dataloader = val_dataloader
+
+# train, val, test setting
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=6, val_interval=7)
diff --git a/mmdetection/configs/reppoints/README.md b/mmdetection/configs/reppoints/README.md
new file mode 100644
index 0000000..03cb86b
--- /dev/null
+++ b/mmdetection/configs/reppoints/README.md
@@ -0,0 +1,59 @@
+# RepPoints
+
+> [RepPoints: Point Set Representation for Object Detection](https://arxiv.org/abs/1904.11490)
+
+
+
+## Abstract
+
+Modern object detectors rely heavily on rectangular bounding boxes, such as anchors, proposals and the final predictions, to represent objects at various recognition stages. The bounding box is convenient to use but provides only a coarse localization of objects and leads to a correspondingly coarse extraction of object features. In this paper, we present RepPoints (representative points), a new finer representation of objects as a set of sample points useful for both localization and recognition. Given ground-truth localization and recognition targets for training, RepPoints learn to automatically arrange themselves in a manner that bounds the spatial extent of an object and indicates semantically significant local areas. They furthermore do not require the use of anchors to sample a space of bounding boxes. We show that an anchor-free object detector based on RepPoints can be as effective as the state-of-the-art anchor-based detection methods, with 46.5 AP and 67.4 AP50 on the COCO test-dev detection benchmark, using a ResNet-101 model.
+
    + +
+
+## Introduction
+
+By [Ze Yang](https://yangze.tech/), [Shaohui Liu](http://b1ueber2y.me/), and [Han Hu](https://ancientmooner.github.io/).
+
+We provide code support and configuration files to reproduce the results in the paper for
+["RepPoints: Point Set Representation for Object Detection"](https://arxiv.org/abs/1904.11490) on COCO object detection.
+
+**RepPoints**, initially described in [arXiv](https://arxiv.org/abs/1904.11490), is a new representation method for visual objects, on which visual understanding tasks are typically centered. Visual object representation, aiming at both geometric description and appearance feature extraction, is conventionally achieved by `bounding box + RoIPool (RoIAlign)`. The bounding box representation is convenient to use; however, it provides only a rectangular localization of objects that lacks geometric precision and may consequently degrade feature quality. Our new representation, RepPoints, models objects by a `point set` instead of a `bounding box`, which learns to adaptively position itself over an object in a manner that circumscribes the object's `spatial extent` and enables `semantically aligned feature extraction`. This richer and more flexible representation maintains the convenience of bounding boxes while facilitating various visual understanding applications. This repo demonstrates the effectiveness of RepPoints for COCO object detection.
+
+Another feature of this repo is the demonstration of an `anchor-free detector`, which can be as effective as state-of-the-art anchor-based detection methods. The anchor-free detector can utilize either `bounding box` or `RepPoints` as the basic object representation.
+
+## Results and Models
+
+The results on COCO 2017val are shown in the table below.
+ +| Method | Backbone | GN | Anchor | convert func | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------: | :-----------: | :-: | :----: | :----------: | :-----: | :------: | :------------: | :----: | :---------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| BBox | R-50-FPN | Y | single | - | 1x | 3.9 | 15.9 | 36.4 | [config](./reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916.log.json) | +| BBox | R-50-FPN | Y | none | - | 1x | 3.9 | 15.4 | 37.4 | [config](./reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916.log.json) | +| RepPoints | R-50-FPN | N | none | moment | 1x | 3.3 | 18.5 | 37.0 | [config](./reppoints-moment_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330_233609.log.json) | +| RepPoints | R-50-FPN | Y | none | moment | 1x | 3.9 | 17.5 | 38.1 | [config](./reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329_145952-3e51b550.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329_145952.log.json) | +| RepPoints | R-50-FPN | Y | none | moment | 2x | 3.9 | - | 38.6 | [config](./reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329-91babaa2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329_150020.log.json) | +| RepPoints | R-101-FPN | Y | none | moment | 2x | 5.8 | 13.7 | 40.5 | [config](./reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329-4fbc7310.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329_132205.log.json) | +| RepPoints | R-101-FPN-DCN | Y | none | moment | 2x | 5.9 | 12.1 | 42.9 | [config](./reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-3309fbf2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329_132134.log.json) | +| RepPoints | X-101-FPN-DCN | Y | none | moment | 2x | 7.1 | 9.3 | 44.2 | [config](./reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-f87da1ea.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329_132201.log.json) | + +**Notes:** + +- `R-xx`, `X-xx` denote the ResNet and ResNeXt architectures, respectively. +- `DCN` denotes replacing 3x3 conv with the 3x3 deformable convolution in `c3-c5` stages of backbone. +- `none` in the `anchor` column means 2-d `center point` (x,y) is used to represent the initial object hypothesis. `single` denotes one 4-d anchor box (x,y,w,h) with IoU based label assign criterion is adopted. +- `moment`, `partial MinMax`, `MinMax` in the `convert func` column are three functions to convert a point set to a pseudo box. +- Note the results here are slightly different from those reported in the paper, due to framework change. While the original paper uses an [MXNet](https://mxnet.apache.org/) implementation, we re-implement the method in [PyTorch](https://pytorch.org/) based on mmdetection. 
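+For the `convert func` column above, the sketch below illustrates how a point set becomes a pseudo box. It is a simplified NumPy illustration only; the actual implementation lives in `RepPointsHead.points2bbox`, and the `moment` method there uses a learnable moment-transfer parameter rather than the fixed values assumed here.
+
+```python
+import numpy as np
+
+
+def points2bbox_minmax(points):
+    """points: (N, 2) array of (x, y); the pseudo box is the tight min/max rectangle."""
+    x1, y1 = points.min(axis=0)
+    x2, y2 = points.max(axis=0)
+    return np.array([x1, y1, x2, y2])
+
+
+def points2bbox_moment(points, moment_transfer=(0.0, 0.0), moment_mul=0.01):
+    """Simplified moment conversion: centre at the point mean, half size equal to
+    the point standard deviation scaled by an (ordinarily learnable) transfer factor."""
+    center = points.mean(axis=0)
+    half_wh = points.std(axis=0) * np.exp(np.asarray(moment_transfer) * moment_mul)
+    return np.concatenate([center - half_wh, center + half_wh])
+
+
+pts = np.array([[10., 12.], [14., 30.], [22., 18.], [30., 25.]])
+print(points2bbox_minmax(pts))  # tight box around the point set
+print(points2bbox_moment(pts))  # moment-based pseudo box
+```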
+ +## Citation + +```latex +@inproceedings{yang2019reppoints, + title={RepPoints: Point Set Representation for Object Detection}, + author={Yang, Ze and Liu, Shaohui and Hu, Han and Wang, Liwei and Lin, Stephen}, + booktitle={The IEEE International Conference on Computer Vision (ICCV)}, + month={Oct}, + year={2019} +} +``` diff --git a/mmdetection/configs/reppoints/metafile.yml b/mmdetection/configs/reppoints/metafile.yml new file mode 100644 index 0000000..732d541 --- /dev/null +++ b/mmdetection/configs/reppoints/metafile.yml @@ -0,0 +1,181 @@ +Collections: + - Name: RepPoints + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Group Normalization + - FPN + - RepPoints + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.11490 + Title: 'RepPoints: Point Set Representation for Object Detection' + README: configs/reppoints/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/reppoints_detector.py#L9 + Version: v2.0.0 + +Models: + - Name: reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py + Metadata: + Training Memory (GB): 3.9 + inference time (ms/im): + - value: 62.89 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth + + - Name: reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py + Metadata: + Training Memory (GB): 3.9 + inference time (ms/im): + - value: 64.94 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco/bbox_r50_grid_fpn_gn-neck%2Bhead_1x_coco_20200329_145916-0eedf8d1.pth + + - Name: reppoints-moment_r50_fpn_1x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.3 + inference time (ms/im): + - value: 54.05 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_1x_coco/reppoints_moment_r50_fpn_1x_coco_20200330-b73db8d1.pth + + - Name: reppoints-moment_r50_fpn-gn_head-gn_1x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py + Metadata: + Training Memory (GB): 3.9 + inference time (ms/im): + - value: 57.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_1x_coco_20200329_145952-3e51b550.pth + + - Name: 
reppoints-moment_r50_fpn-gn_head-gn_2x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py + Metadata: + Training Memory (GB): 3.9 + inference time (ms/im): + - value: 57.14 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r50_fpn_gn-neck%2Bhead_2x_coco_20200329-91babaa2.pth + + - Name: reppoints-moment_r101_fpn-gn_head-gn_2x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py + Metadata: + Training Memory (GB): 5.8 + inference time (ms/im): + - value: 72.99 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_gn-neck%2Bhead_2x_coco_20200329-4fbc7310.pth + + - Name: reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py + Metadata: + Training Memory (GB): 5.9 + inference time (ms/im): + - value: 82.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-3309fbf2.pth + + - Name: reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco + In Collection: RepPoints + Config: configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py + Metadata: + Training Memory (GB): 7.1 + inference time (ms/im): + - value: 107.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck%2Bhead_2x_coco_20200329-f87da1ea.pth diff --git a/mmdetection/configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py b/mmdetection/configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py new file mode 100644 index 0000000..f116e53 --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-bbox_r50-center_fpn-gn_head-gn-grid_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py' +model = dict(bbox_head=dict(transform_method='minmax', use_grid_points=True)) diff --git a/mmdetection/configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py b/mmdetection/configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py new file mode 100644 index 0000000..76be39b --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-bbox_r50_fpn-gn_head-gn-grid_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py' +model = dict( + bbox_head=dict(transform_method='minmax', use_grid_points=True), + # training and testing settings 
+ train_cfg=dict( + init=dict( + assigner=dict( + _delete_=True, + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1)))) diff --git a/mmdetection/configs/reppoints/reppoints-minmax_r50_fpn-gn_head-gn_1x_coco.py b/mmdetection/configs/reppoints/reppoints-minmax_r50_fpn-gn_head-gn_1x_coco.py new file mode 100644 index 0000000..0e7dffe --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-minmax_r50_fpn-gn_head-gn_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py' +model = dict(bbox_head=dict(transform_method='minmax')) diff --git a/mmdetection/configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py b/mmdetection/configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py new file mode 100644 index 0000000..5c2bfab --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-moment_r101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py @@ -0,0 +1,8 @@ +_base_ = './reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py b/mmdetection/configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py new file mode 100644 index 0000000..02c447a --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-moment_r101_fpn-gn_head-gn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py b/mmdetection/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py new file mode 100644 index 0000000..cedf222 --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py @@ -0,0 +1,3 @@ +_base_ = './reppoints-moment_r50_fpn_1x_coco.py' +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict(neck=dict(norm_cfg=norm_cfg), bbox_head=dict(norm_cfg=norm_cfg)) diff --git a/mmdetection/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py b/mmdetection/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py new file mode 100644 index 0000000..4490d44 --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py' + +max_epochs = 24 + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py b/mmdetection/configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py new file mode 100644 index 0000000..df7e72a --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-moment_r50_fpn_1x_coco.py @@ -0,0 +1,74 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='RepPointsDetector', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 
116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + bbox_head=dict( + type='RepPointsHead', + num_classes=80, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment'), + # training and testing settings + train_cfg=dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) + +optim_wrapper = dict(optimizer=dict(lr=0.01)) diff --git a/mmdetection/configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py b/mmdetection/configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py new file mode 100644 index 0000000..a9909ef --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-moment_x101-dconv-c3-c5_fpn-gn_head-gn_2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './reppoints-moment_r50_fpn-gn_head-gn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/reppoints/reppoints-partial-minmax_r50_fpn-gn_head-gn_1x_coco.py b/mmdetection/configs/reppoints/reppoints-partial-minmax_r50_fpn-gn_head-gn_1x_coco.py new file mode 100644 index 0000000..30f7844 --- /dev/null +++ b/mmdetection/configs/reppoints/reppoints-partial-minmax_r50_fpn-gn_head-gn_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './reppoints-moment_r50_fpn-gn_head-gn_1x_coco.py' +model = dict(bbox_head=dict(transform_method='partial_minmax')) diff --git a/mmdetection/configs/res2net/README.md b/mmdetection/configs/res2net/README.md new file mode 100644 index 0000000..cd6732b --- /dev/null +++ b/mmdetection/configs/res2net/README.md @@ -0,0 +1,77 @@ +# Res2Net + +> [Res2Net: A New Multi-scale Backbone Architecture](https://arxiv.org/abs/1904.01169) + + + +## Abstract + +Representing features at multiple scales is of great importance for numerous vision tasks. Recent advances in backbone convolutional neural networks (CNNs) continually demonstrate stronger multi-scale representation ability, leading to consistent performance gains on a wide range of applications. 
However, most existing methods represent the multi-scale features in a layer-wise manner. In this paper, we propose a novel building block for CNNs, namely Res2Net, by constructing hierarchical residual-like connections within one single residual block. The Res2Net represents multi-scale features at a granular level and increases the range of receptive fields for each network layer. The proposed Res2Net block can be plugged into the state-of-the-art backbone CNN models, e.g., ResNet, ResNeXt, and DLA. We evaluate the Res2Net block on all these models and demonstrate consistent performance gains over baseline models on widely-used datasets, e.g., CIFAR-100 and ImageNet. Further ablation studies and experimental results on representative computer vision tasks, i.e., object detection, class activation mapping, and salient object detection, further verify the superiority of the Res2Net over the state-of-the-art baseline methods. + +
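To make the idea above concrete, here is a minimal, illustrative PyTorch sketch of the hierarchical residual-like connections inside a single block (using `scales=4`, matching the `scales=4, base_width=26` setting in the configs added below). It is not mmdetection's `Res2Net` backbone, only the core splitting pattern, and the class name is made up for this sketch.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class Res2NetUnit(nn.Module):
    """Hierarchical residual-like connections within one block: the input is
    split into `scales` groups; every group after the first goes through a
    3x3 conv and also receives the previous group's output, so later groups
    see increasingly large receptive fields."""

    def __init__(self, channels: int, scales: int = 4):
        super().__init__()
        assert channels % scales == 0
        self.scales = scales
        width = channels // scales
        self.convs = nn.ModuleList(
            nn.Conv2d(width, width, 3, padding=1, bias=False)
            for _ in range(scales - 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        splits = torch.chunk(x, self.scales, dim=1)
        outs = [splits[0]]              # the first split passes through as-is
        prev = None
        for conv, split in zip(self.convs, splits[1:]):
            prev = F.relu(conv(split if prev is None else split + prev))
            outs.append(prev)
        return torch.cat(outs, dim=1)   # same shape as the input


# e.g. Res2NetUnit(channels=256, scales=4)(torch.randn(1, 256, 32, 32))
```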
    + +## Introduction + +We propose a novel building block for CNNs, namely Res2Net, by constructing hierarchical residual-like connections within one single residual block. The Res2Net represents multi-scale features at a granular level and increases the range of receptive fields for each network layer. + +| Backbone | Params. | GFLOPs | top-1 err. | top-5 err. | +| :---------------: | :-----: | :----: | :--------: | :--------: | +| ResNet-101 | 44.6 M | 7.8 | 22.63 | 6.44 | +| ResNeXt-101-64x4d | 83.5M | 15.5 | 20.40 | - | +| HRNetV2p-W48 | 77.5M | 16.1 | 20.70 | 5.50 | +| Res2Net-101 | 45.2M | 8.3 | 18.77 | 4.64 | + +Compared with other backbone networks, Res2Net requires fewer parameters and FLOPs. + +**Note:** + +- GFLOPs for classification are calculated with image size (224x224). + +## Results and Models + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R2-101-FPN | pytorch | 2x | 7.4 | - | 43.0 | [config](./faster-rcnn_res2net-101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco_20200514_231734.log.json) | + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :----------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R2-101-FPN | pytorch | 2x | 7.9 | - | 43.6 | 38.7 | [config](./mask-rcnn_res2net-101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco-17f061e8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco_20200515_002413.log.json) | + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R2-101-FPN | pytorch | 20e | 7.8 | - | 45.7 | [config](./cascade-rcnn_res2net-101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco-f4b7b7db.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco_20200515_091644.log.json) | + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R2-101-FPN | pytorch | 20e | 9.5 | - | 46.4 | 40.0 | [config](./cascade-mask-rcnn_res2net-101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco-8a7b41e1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco_20200515_091645.log.json) | + +### Hybrid Task Cascade (HTC) + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :--------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R2-101-FPN | pytorch | 20e | - | - | 47.5 | 41.6 | [config](./htc_res2net-101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco-3a8d2112.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco_20200515_150029.log.json) | + +- Res2Net ImageNet pretrained models are in [Res2Net-PretrainedModels](https://github.com/Res2Net/Res2Net-PretrainedModels). +- More applications of Res2Net are in [Res2Net-Github](https://github.com/Res2Net/). 
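For reference, one way to try the checkpoints listed in the tables above is mmdetection's high-level inference API. This is a hedged sketch: it assumes an mmdetection installation with its `configs/` directory available, that the listed v2.0 weights load into the current config, and the demo image path is only illustrative.

```python
from mmdet.apis import init_detector, inference_detector

config_file = 'configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py'
checkpoint_url = ('https://download.openmmlab.com/mmdetection/v2.0/res2net/'
                  'faster_rcnn_r2_101_fpn_2x_coco/'
                  'faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth')

# Build the detector from the config and checkpoint, then run one image.
model = init_detector(config_file, checkpoint_url, device='cuda:0')
result = inference_detector(model, 'demo/demo.jpg')  # illustrative image path
print(result)
```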
+ +## Citation + +```latex +@article{gao2019res2net, + title={Res2Net: A New Multi-scale Backbone Architecture}, + author={Gao, Shang-Hua and Cheng, Ming-Ming and Zhao, Kai and Zhang, Xin-Yu and Yang, Ming-Hsuan and Torr, Philip}, + journal={IEEE TPAMI}, + year={2020}, + doi={10.1109/TPAMI.2019.2938758}, +} +``` diff --git a/mmdetection/configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py b/mmdetection/configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py new file mode 100644 index 0000000..21b6d2e --- /dev/null +++ b/mmdetection/configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py @@ -0,0 +1,10 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/mmdetection/configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py b/mmdetection/configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py new file mode 100644 index 0000000..670a774 --- /dev/null +++ b/mmdetection/configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py @@ -0,0 +1,10 @@ +_base_ = '../cascade_rcnn/cascade-rcnn_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/mmdetection/configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py b/mmdetection/configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py new file mode 100644 index 0000000..033cf57 --- /dev/null +++ b/mmdetection/configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py @@ -0,0 +1,10 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/mmdetection/configs/res2net/htc_res2net-101_fpn_20e_coco.py b/mmdetection/configs/res2net/htc_res2net-101_fpn_20e_coco.py new file mode 100644 index 0000000..d5542fd --- /dev/null +++ b/mmdetection/configs/res2net/htc_res2net-101_fpn_20e_coco.py @@ -0,0 +1,10 @@ +_base_ = '../htc/htc_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/mmdetection/configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py b/mmdetection/configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py new file mode 100644 index 0000000..3a2d573 --- /dev/null +++ b/mmdetection/configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py @@ -0,0 +1,10 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/mmdetection/configs/res2net/metafile.yml b/mmdetection/configs/res2net/metafile.yml new file mode 100644 index 0000000..1d9f9ea --- /dev/null +++ b/mmdetection/configs/res2net/metafile.yml @@ -0,0 +1,146 @@ +Models: + - Name: faster-rcnn_res2net-101_fpn_2x_coco + In Collection: Faster R-CNN + Config: configs/res2net/faster-rcnn_res2net-101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: 
+ - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/faster_rcnn_r2_101_fpn_2x_coco/faster_rcnn_r2_101_fpn_2x_coco-175f1da6.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 + + - Name: mask-rcnn_res2net-101_fpn_2x_coco + In Collection: Mask R-CNN + Config: configs/res2net/mask-rcnn_res2net-101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.9 + Epochs: 24 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/mask_rcnn_r2_101_fpn_2x_coco/mask_rcnn_r2_101_fpn_2x_coco-17f061e8.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 + + - Name: cascade-rcnn_res2net-101_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/res2net/cascade-rcnn_res2net-101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_rcnn_r2_101_fpn_20e_coco/cascade_rcnn_r2_101_fpn_20e_coco-f4b7b7db.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 + + - Name: cascade-mask-rcnn_res2net-101_fpn_20e_coco + In Collection: Cascade R-CNN + Config: configs/res2net/cascade-mask-rcnn_res2net-101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 9.5 + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/cascade_mask_rcnn_r2_101_fpn_20e_coco/cascade_mask_rcnn_r2_101_fpn_20e_coco-8a7b41e1.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 + + - Name: htc_res2net-101_fpn_20e_coco + In Collection: HTC + Config: configs/res2net/htc_res2net-101_fpn_20e_coco.py + Metadata: + Epochs: 20 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Res2Net + Results: + - Task: Object Detection + 
Dataset: COCO + Metrics: + box AP: 47.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/res2net/htc_r2_101_fpn_20e_coco/htc_r2_101_fpn_20e_coco-3a8d2112.pth + Paper: + URL: https://arxiv.org/abs/1904.01169 + Title: 'Res2Net for object detection and instance segmentation' + README: configs/res2net/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.1.0/mmdet/models/backbones/res2net.py#L239 + Version: v2.1.0 diff --git a/mmdetection/configs/resnest/README.md b/mmdetection/configs/resnest/README.md new file mode 100644 index 0000000..a72f842 --- /dev/null +++ b/mmdetection/configs/resnest/README.md @@ -0,0 +1,54 @@ +# ResNeSt + +> [ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955) + + + +## Abstract + +It is well known that featuremap attention and multi-path representation are important for visual recognition. In this paper, we present a modularized architecture, which applies the channel-wise attention on different network branches to leverage their success in capturing cross-feature interactions and learning diverse representations. Our design results in a simple and unified computation block, which can be parameterized using only a few variables. Our model, named ResNeSt, outperforms EfficientNet in accuracy and latency trade-off on image classification. In addition, ResNeSt has achieved superior transfer learning results on several public benchmarks serving as the backbone, and has been adopted by the winning entries of COCO-LVIS challenge. + +
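As a rough illustration of the split-attention idea described above, the sketch below applies channel-wise attention across `radix` branches of a feature map in plain PyTorch. It deliberately omits cardinality groups and other details, so it is not mmdetection's `ResNeSt` backbone; names and the reduction factor are illustrative.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SplitAttention(nn.Module):
    """Produce `radix` branches from the input, pool their sum into a global
    descriptor, and predict per-branch channel weights that are softmax-
    normalised over the radix dimension before recombining the branches."""

    def __init__(self, channels: int, radix: int = 2, reduction: int = 4):
        super().__init__()
        self.radix = radix
        inter_channels = max(channels * radix // reduction, 32)
        self.conv = nn.Conv2d(channels, channels * radix, 3, padding=1,
                              groups=radix, bias=False)
        self.bn = nn.BatchNorm2d(channels * radix)
        self.fc1 = nn.Conv2d(channels, inter_channels, 1)
        self.fc2 = nn.Conv2d(inter_channels, channels * radix, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch, channels = x.shape[:2]
        branches = torch.chunk(
            F.relu(self.bn(self.conv(x))), self.radix, dim=1)
        gap = sum(branches).mean(dim=(2, 3), keepdim=True)   # (B, C, 1, 1)
        attn = self.fc2(F.relu(self.fc1(gap)))               # (B, C*radix, 1, 1)
        attn = attn.view(batch, self.radix, channels).softmax(dim=1)
        attn = attn.reshape(batch, self.radix * channels, 1, 1)
        weighted = torch.chunk(attn, self.radix, dim=1)
        return sum(w * b for w, b in zip(weighted, branches))


# e.g. SplitAttention(channels=64, radix=2)(torch.randn(2, 64, 56, 56))
```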
    + +## Results and Models + +### Faster R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------: | :-----: | :-----: | :------: | :------------: | :----: | :-----------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| S-50-FPN | pytorch | 1x | 4.8 | - | 42.0 | [config](./faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20200926_125502-20289c16.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20200926_125502.log.json) | +| S-101-FPN | pytorch | 1x | 7.1 | - | 44.5 | [config](./faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201006_021058-421517f1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201006_021058.log.json) | + +### Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :---------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| S-50-FPN | pytorch | 1x | 5.5 | - | 42.6 | 38.1 | [config](./mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20200926_125503-8a2c3d47.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20200926_125503.log.json) | +| S-101-FPN | pytorch | 1x | 7.8 | - | 45.2 | 40.2 | [config](./mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_215831-af60cdf9.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201005_215831.log.json) | + +### Cascade R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------: | :-----: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| S-50-FPN | pytorch | 1x | - | - | 44.5 | [config](./cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201122_213640-763cc7b5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201005_113242.log.json) | +| S-101-FPN | pytorch | 1x | 8.4 | - | 46.8 | [config](./cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201005_113242-b9459f8f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco-20201122_213640.log.json) | + +### Cascade Mask R-CNN + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| S-50-FPN | pytorch | 1x | - | - | 45.4 | 39.5 | [config](./cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201122_104428-99eca4c7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201122_104428.log.json) | +| S-101-FPN | pytorch | 1x | 10.5 | - | 47.7 | 41.4 | [config](./cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_113243-42607475.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco-20201005_113243.log.json) | + +## Citation + +```latex +@article{zhang2020resnest, +title={ResNeSt: Split-Attention Networks}, +author={Zhang, Hang and Wu, Chongruo and Zhang, Zhongyue and Zhu, Yi and Zhang, Zhi and Lin, Haibin and Sun, Yue and He, Tong and Muller, Jonas and Manmatha, R. and Li, Mu and Smola, Alexander}, +journal={arXiv preprint arXiv:2004.08955}, +year={2020} +} +``` diff --git a/mmdetection/configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py b/mmdetection/configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py new file mode 100644 index 0000000..f4f1992 --- /dev/null +++ b/mmdetection/configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py' +model = dict( + backbone=dict( + stem_channels=128, + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='open-mmlab://resnest101'))) diff --git a/mmdetection/configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py b/mmdetection/configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py new file mode 100644 index 0000000..c6ef41c --- /dev/null +++ b/mmdetection/configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py @@ -0,0 +1,101 @@ +_base_ = '../cascade_rcnn/cascade-mask-rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) + +model = dict( + # use ResNeSt img_norm + data_preprocessor=dict( + mean=[123.68, 116.779, 103.939], + std=[58.393, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + stem_channels=64, + depth=50, + radix=2, + reduction_factor=4, + avg_down_stride=True, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')), + roi_head=dict( + bbox_head=[ + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 
0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_head=dict(norm_cfg=norm_cfg))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py b/mmdetection/configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py new file mode 100644 index 0000000..9dbf3fa --- /dev/null +++ b/mmdetection/configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py' +model = dict( + backbone=dict( + stem_channels=128, + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='open-mmlab://resnest101'))) diff --git a/mmdetection/configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py b/mmdetection/configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py new file mode 100644 index 0000000..7ce7b56 --- /dev/null +++ b/mmdetection/configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py @@ -0,0 +1,93 @@ +_base_ = '../cascade_rcnn/cascade-rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + # use ResNeSt img_norm + data_preprocessor=dict( + mean=[123.68, 116.779, 103.939], + std=[58.393, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + stem_channels=64, + depth=50, + radix=2, + reduction_factor=4, + avg_down_stride=True, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')), + roi_head=dict( + bbox_head=[ + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + conv_out_channels=256, + fc_out_channels=1024, + norm_cfg=norm_cfg, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + 
reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], )) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py b/mmdetection/configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py new file mode 100644 index 0000000..f1e1632 --- /dev/null +++ b/mmdetection/configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py' +model = dict( + backbone=dict( + stem_channels=128, + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='open-mmlab://resnest101'))) diff --git a/mmdetection/configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py b/mmdetection/configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py new file mode 100644 index 0000000..8f0ec6e --- /dev/null +++ b/mmdetection/configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py @@ -0,0 +1,39 @@ +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + # use ResNeSt img_norm + data_preprocessor=dict( + mean=[123.68, 116.779, 103.939], + std=[58.393, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + stem_channels=64, + depth=50, + radix=2, + reduction_factor=4, + avg_down_stride=True, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py b/mmdetection/configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py new file mode 100644 index 0000000..3edf49f --- /dev/null +++ b/mmdetection/configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py' +model = dict( + backbone=dict( + stem_channels=128, + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='open-mmlab://resnest101'))) diff --git a/mmdetection/configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py b/mmdetection/configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py new file mode 100644 index 0000000..c6f2700 --- /dev/null +++ b/mmdetection/configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py @@ -0,0 +1,46 @@ +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model 
= dict( + # use ResNeSt img_norm + data_preprocessor=dict( + mean=[123.68, 116.779, 103.939], + std=[58.393, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='ResNeSt', + stem_channels=64, + depth=50, + radix=2, + reduction_factor=4, + avg_down_stride=True, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://resnest50')), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg), + mask_head=dict(norm_cfg=norm_cfg))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/resnest/metafile.yml b/mmdetection/configs/resnest/metafile.yml new file mode 100644 index 0000000..265c940 --- /dev/null +++ b/mmdetection/configs/resnest/metafile.yml @@ -0,0 +1,230 @@ +Models: + - Name: faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco + In Collection: Faster R-CNN + Config: configs/resnest/faster-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20200926_125502-20289c16.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco + In Collection: Faster R-CNN + Config: configs/resnest/faster-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py + Metadata: + Training Memory (GB): 7.1 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/faster_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201006_021058-421517f1.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco + In Collection: Mask R-CNN + Config: configs/resnest/mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py + Metadata: + Training Memory (GB): 5.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs 
+ Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20200926_125503-8a2c3d47.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco + In Collection: Mask R-CNN + Config: configs/resnest/mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_215831-af60cdf9.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco + In Collection: Cascade R-CNN + Config: configs/resnest/cascade-rcnn_s50_fpn_syncbn-backbone+head_ms-range-1x_coco.py + Metadata: + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201122_213640-763cc7b5.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco + In Collection: Cascade R-CNN + Config: configs/resnest/cascade-rcnn_s101_fpn_syncbn-backbone+head_ms-range-1x_coco.py + Metadata: + Training Memory (GB): 8.4 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco/cascade_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain-range_1x_coco_20201005_113242-b9459f8f.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: 
cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco + In Collection: Cascade R-CNN + Config: configs/resnest/cascade-mask-rcnn_s50_fpn_syncbn-backbone+head_ms-1x_coco.py + Metadata: + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s50_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201122_104428-99eca4c7.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 + + - Name: cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco + In Collection: Cascade R-CNN + Config: configs/resnest/cascade-mask-rcnn_s101_fpn_syncbn-backbone+head_ms-1x_coco.py + Metadata: + Training Memory (GB): 10.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNeSt + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnest/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco/cascade_mask_rcnn_s101_fpn_syncbn-backbone%2Bhead_mstrain_1x_coco_20201005_113243-42607475.pth + Paper: + URL: https://arxiv.org/abs/2004.08955 + Title: 'ResNeSt: Split-Attention Networks' + README: configs/resnest/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.7.0/mmdet/models/backbones/resnest.py#L273 + Version: v2.7.0 diff --git a/mmdetection/configs/resnet_strikes_back/README.md b/mmdetection/configs/resnet_strikes_back/README.md new file mode 100644 index 0000000..f015729 --- /dev/null +++ b/mmdetection/configs/resnet_strikes_back/README.md @@ -0,0 +1,40 @@ +# ResNet strikes back + +> [ResNet strikes back: An improved training procedure in timm](https://arxiv.org/abs/2110.00476) + + + +## Abstract + +The influential Residual Networks designed by He et al. remain the gold-standard architecture in numerous scientific publications. They typically serve as the default architecture in studies, or as baselines when new architectures are proposed. Yet there has been significant progress on best practices for training neural networks since the inception of the ResNet architecture in 2015. Novel optimization & dataaugmentation have increased the effectiveness of the training recipes. + +In this paper, we re-evaluate the performance of the vanilla ResNet-50 when trained with a procedure that integrates such advances. We share competitive training settings and pre-trained models in the timm open-source library, with the hope that they will serve as better baselines for future work. For instance, with our more demanding training setting, a vanilla ResNet-50 reaches 80.4% top-1 accuracy at resolution 224×224 on ImageNet-val without extra data or distillation. We also report the performance achieved with popular models with our training procedure. + +
    + +## Results and Models + +| Method | Backbone | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :----------------: | :------: | :-----: | :------: | :------------: | :---------: | :---------: | :------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN | R-50 rsb | 1x | 3.9 | - | 40.8 (+3.4) | - | [Config](./faster-rcnn_r50-rsb-pre_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229.log.json) | +| Mask R-CNN | R-50 rsb | 1x | 4.5 | - | 41.2 (+3.0) | 38.2 (+3.0) | [Config](./mask-rcnn_r50-rsb-pre_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054.log.json) | +| Cascade Mask R-CNN | R-50 rsb | 1x | 6.2 | - | 44.8 (+3.6) | 39.9 (+3.6) | [Config](./cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636.log.json) | +| RetinaNet | R-50 rsb | 1x | 3.8 | - | 39.0 (+2.5) | - | [Config](./retinanet_r50-rsb-pre_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432.log.json) | + +**Notes:** + +- 'rsb' is short for 'resnet strikes back' +- We have done some grid searches on learning rate and weight decay and get these optimal hyper-parameters. 
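The configs added below all follow the same pattern: swap the backbone's ImageNet weights for the rsb-pretrained checkpoint (loaded with `prefix='backbone.'`) and replace the inherited SGD optimizer with AdamW via `_delete_=True`. A quick, hedged way to inspect what that resolves to, assuming an mmdetection checkout with mmengine installed:

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py')

# `_delete_=True` discards the SGD settings inherited from schedule_1x.py,
# so the resolved optimizer should come out as AdamW with lr=0.0002.
print(cfg.optim_wrapper.optimizer)

# The backbone starts from the rsb-A1 ImageNet checkpoint; prefix='backbone.'
# keeps only the backbone weights from that classification checkpoint.
print(cfg.model.backbone.init_cfg)
```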
+ +## Citation + +```latex +@article{wightman2021resnet, +title={Resnet strikes back: An improved training procedure in timm}, +author={Ross Wightman, Hugo Touvron, Hervé Jégou}, +journal={arXiv preprint arXiv:2110.00476}, +year={2021} +} +``` diff --git a/mmdetection/configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py b/mmdetection/configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py new file mode 100644 index 0000000..de7b95b --- /dev/null +++ b/mmdetection/configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optim_wrapper = dict( + optimizer=dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/mmdetection/configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py b/mmdetection/configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py new file mode 100644 index 0000000..8c60f66 --- /dev/null +++ b/mmdetection/configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optim_wrapper = dict( + optimizer=dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/mmdetection/configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py b/mmdetection/configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py new file mode 100644 index 0000000..85e25d3 --- /dev/null +++ b/mmdetection/configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optim_wrapper = dict( + optimizer=dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/mmdetection/configs/resnet_strikes_back/metafile.yml b/mmdetection/configs/resnet_strikes_back/metafile.yml new file mode 100644 index 0000000..74b1521 --- /dev/null +++ b/mmdetection/configs/resnet_strikes_back/metafile.yml @@ -0,0 +1,116 @@ +Models: + - Name: faster-rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Faster R-CNN + Config: configs/resnet_strikes_back/faster-rcnn_r50-rsb-pre_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.9 + Epochs: 12 + Training Data: COCO + Training 
Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco/faster_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_162229-32ae82a9.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: cascade-mask-rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Cascade R-CNN + Config: configs/resnet_strikes_back/cascade-mask-rcnn_r50-rsb-pre_fpn_1x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/cascade_mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_193636-8b9ad50f.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: retinanet_r50-rsb-pre_fpn_1x_coco + In Collection: RetinaNet + Config: configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/retinanet_r50_fpn_rsb-pretrain_1x_coco/retinanet_r50_fpn_rsb-pretrain_1x_coco_20220113_175432-bd24aae9.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 + + - Name: mask-rcnn_r50_fpn_rsb-pretrain_1x_coco + In Collection: Mask R-CNN + Config: configs/resnet_strikes_back/mask-rcnn_r50-rsb-pre_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.5 + Epochs: 12 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/resnet_strikes_back/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco/mask_rcnn_r50_fpn_rsb-pretrain_1x_coco_20220113_174054-06ce8ba0.pth + Paper: + URL: https://arxiv.org/abs/2110.00476 + Title: 'ResNet strikes back: An improved training procedure in timm' + README: configs/resnet_strikes_back/README.md + Code: + URL: 
https://github.com/open-mmlab/mmdetection/blob/v2.22.0/configs/resnet_strikes_back/README.md + Version: v2.22.0 diff --git a/mmdetection/configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py b/mmdetection/configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py new file mode 100644 index 0000000..7ce7bfd --- /dev/null +++ b/mmdetection/configs/resnet_strikes_back/retinanet_r50-rsb-pre_fpn_1x_coco.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a1-600e_in1k_20211228-20e21305.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint))) + +optim_wrapper = dict( + optimizer=dict(_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0., bypass_duplicate=True)) diff --git a/mmdetection/configs/retinanet/README.md b/mmdetection/configs/retinanet/README.md new file mode 100644 index 0000000..b38335a --- /dev/null +++ b/mmdetection/configs/retinanet/README.md @@ -0,0 +1,53 @@ +# RetinaNet + +> [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) + + + +## Abstract + +The highest accuracy object detectors to date are based on a two-stage approach popularized by R-CNN, where a classifier is applied to a sparse set of candidate object locations. In contrast, one-stage detectors that are applied over a regular, dense sampling of possible object locations have the potential to be faster and simpler, but have trailed the accuracy of two-stage detectors thus far. In this paper, we investigate why this is the case. We discover that the extreme foreground-background class imbalance encountered during training of dense detectors is the central cause. We propose to address this class imbalance by reshaping the standard cross entropy loss such that it down-weights the loss assigned to well-classified examples. Our novel Focal Loss focuses training on a sparse set of hard examples and prevents the vast number of easy negatives from overwhelming the detector during training. To evaluate the effectiveness of our loss, we design and train a simple dense detector we call RetinaNet. Our results show that when trained with the focal loss, RetinaNet is able to match the speed of previous one-stage detectors while surpassing the accuracy of all existing state-of-the-art two-stage detectors. + +
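The abstract above centres on reshaping the cross-entropy loss so that well-classified examples are down-weighted. As a rough illustration only (the configs in this directory rely on the focal loss implementation registered inside mmdet, not on this snippet), a minimal sigmoid focal loss in PyTorch might look like:

```python
import torch
import torch.nn.functional as F


def sigmoid_focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    """Minimal sketch of FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t).

    logits:  (N, num_classes) raw classification scores
    targets: (N, num_classes) binary labels as floats in {0, 1}
    """
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p_t = p * targets + (1 - p) * (1 - targets)               # prob. of the true class
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    return (alpha_t * (1 - p_t) ** gamma * ce).sum()          # easy examples vanish as p_t -> 1
```

Setting `gamma=0` recovers an alpha-weighted cross-entropy, which is the baseline the paper compares against; the defaults `gamma=2.0, alpha=0.25` are the values reported as best for RetinaNet in the paper.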
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :----------: | :------: | :------------: | :----: | :---------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-18-FPN | pytorch | 1x | 1.7 | | 31.7 | [config](./retinanet_r18_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055.log.json) | +| R-18-FPN | pytorch | 1x(1 x 8 BS) | 5.0 | | 31.7 | [config](./retinanet_r18_fpn_1xb8-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255-4ea310d7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255.log.json) | +| R-50-FPN | caffe | 1x | 3.5 | 18.6 | 36.3 | [config](./retinanet_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531_012518.log.json) | +| R-50-FPN | pytorch | 1x | 3.8 | 19.0 | 36.5 | [config](./retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130_002941.log.json) | +| R-50-FPN (FP16) | pytorch | 1x | 2.8 | 31.6 | 36.4 | [config](./retinanet_r50_fpn_amp-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702-0dbfb212.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702_020127.log.json) | +| R-50-FPN | pytorch | 2x | - | - | 37.4 | [config](./retinanet_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131-fdb43119.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131_114738.log.json) | +| R-101-FPN | caffe | 1x | 5.5 | 14.7 | 38.5 | [config](./retinanet_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531-b428fa0f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531_012536.log.json) | +| R-101-FPN | pytorch | 1x | 5.7 | 15.0 | 38.5 | [config](./retinanet_r101_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130-7a93545f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130_003055.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 38.9 | [config](./retinanet_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131-5560aee8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131_114859.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.0 | 12.1 | 39.9 | [config](./retinanet_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130-5c8b7ec4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130_003004.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 40.1 | [config](./retinanet_x101-32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131-237fc5e1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131_114812.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.0 | 8.7 | 41.0 | [config](./retinanet_x101-64x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130-366f5af1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130_003008.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 40.8 | [config](./retinanet_x101-64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131-bca068ab.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131_114833.log.json) | + +## Pre-trained Models + +We also train some models with longer schedules and multi-scale training. The users could finetune them for downstream tasks. 
+ +| Backbone | Style | Lr schd | Mem (GB) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :----: | :--------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 3x | 3.5 | 39.5 | [config](./retinanet_r50_fpn_ms-640-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.log.json) | +| R-101-FPN | caffe | 3x | 5.4 | 40.7 | [config](./retinanet_r101-caffe_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.log.json) | +| R-101-FPN | pytorch | 3x | 5.4 | 41 | [config](./retinanet_r101_fpn_ms-640-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.log.json) | +| X-101-64x4d-FPN | pytorch | 3x | 9.8 | 41.6 | [config](./retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.log.json) | + +## Citation + +```latex +@inproceedings{lin2017focal, + title={Focal loss for dense object detection}, + author={Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr}, + booktitle={Proceedings of the IEEE international conference on computer vision}, + year={2017} +} +``` diff --git a/mmdetection/configs/retinanet/metafile.yml b/mmdetection/configs/retinanet/metafile.yml new file mode 100644 index 0000000..0551541 --- /dev/null +++ b/mmdetection/configs/retinanet/metafile.yml @@ -0,0 +1,312 @@ +Collections: + - Name: RetinaNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Focal Loss + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1708.02002 + Title: "Focal Loss for Dense Object Detection" + README: configs/retinanet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/retinanet.py#L6 + Version: v2.0.0 + +Models: + - Name: retinanet_r18_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r18_fpn_1x_coco.py + Metadata: + Training Memory (GB): 1.7 + 
Training Resources: 8x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 31.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth + + - Name: retinanet_r18_fpn_1xb8-1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py + Metadata: + Training Memory (GB): 5.0 + Training Resources: 1x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 31.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x8_1x_coco/retinanet_r18_fpn_1x8_1x_coco_20220407_171255-4ea310d7.pth + + - Name: retinanet_r50-caffe_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.5 + inference time (ms/im): + - value: 53.76 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth + + - Name: retinanet_r50_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + inference time (ms/im): + - value: 52.63 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth + + - Name: retinanet_r50_fpn_amp-1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py + Metadata: + Training Memory (GB): 2.8 + Training Techniques: + - SGD with Momentum + - Weight Decay + - Mixed Precision Training + inference time (ms/im): + - value: 31.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP16 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/fp16/retinanet_r50_fpn_fp16_1x_coco/retinanet_r50_fpn_fp16_1x_coco_20200702-0dbfb212.pth + + - Name: retinanet_r50_fpn_2x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50_fpn_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_2x_coco/retinanet_r50_fpn_2x_coco_20200131-fdb43119.pth + + - Name: retinanet_r50_fpn_ms-640-800-3x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_mstrain_3x_coco/retinanet_r50_fpn_mstrain_3x_coco_20210718_220633-88476508.pth + + - Name: retinanet_r101-caffe_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.5 + inference time (ms/im): + - value: 68.03 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + 
Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_1x_coco/retinanet_r101_caffe_fpn_1x_coco_20200531-b428fa0f.pth + + - Name: retinanet_r101-caffe_fpn_ms-3x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_caffe_fpn_mstrain_3x_coco/retinanet_r101_caffe_fpn_mstrain_3x_coco_20210721_063439-88a8a944.pth + + - Name: retinanet_r101_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.7 + inference time (ms/im): + - value: 66.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_1x_coco/retinanet_r101_fpn_1x_coco_20200130-7a93545f.pth + + - Name: retinanet_r101_fpn_2x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101_fpn_2x_coco.py + Metadata: + Training Memory (GB): 5.7 + inference time (ms/im): + - value: 66.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_2x_coco/retinanet_r101_fpn_2x_coco_20200131-5560aee8.pth + + - Name: retinanet_r101_fpn_ms-640-800-3x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r101_fpn_mstrain_3x_coco/retinanet_r101_fpn_mstrain_3x_coco_20210720_214650-7ee888e0.pth + + - Name: retinanet_x101-32x4d_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 82.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_1x_coco/retinanet_x101_32x4d_fpn_1x_coco_20200130-5c8b7ec4.pth + + - Name: retinanet_x101-32x4d_fpn_2x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 82.64 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_32x4d_fpn_2x_coco/retinanet_x101_32x4d_fpn_2x_coco_20200131-237fc5e1.pth + + - Name: retinanet_x101-64x4d_fpn_1x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.0 + inference time (ms/im): + - value: 114.94 + hardware: V100 + backend: PyTorch + batch size: 
1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_1x_coco/retinanet_x101_64x4d_fpn_1x_coco_20200130-366f5af1.pth + + - Name: retinanet_x101-64x4d_fpn_2x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py + Metadata: + Training Memory (GB): 10.0 + inference time (ms/im): + - value: 114.94 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_2x_coco/retinanet_x101_64x4d_fpn_2x_coco_20200131-bca068ab.pth + + - Name: retinanet_x101-64x4d_fpn_ms-640-800-3x_coco + In Collection: RetinaNet + Config: configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_x101_64x4d_fpn_mstrain_3x_coco/retinanet_x101_64x4d_fpn_mstrain_3x_coco_20210719_051838-022c2187.pth diff --git a/mmdetection/configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..1f3a448 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './retinanet_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py b/mmdetection/configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py new file mode 100644 index 0000000..cfe7734 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r101-caffe_fpn_ms-3x_coco.py @@ -0,0 +1,8 @@ +_base_ = './retinanet_r50-caffe_fpn_ms-3x_coco.py' +# learning policy +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/retinanet/retinanet_r101_fpn_1x_coco.py b/mmdetection/configs/retinanet/retinanet_r101_fpn_1x_coco.py new file mode 100644 index 0000000..a7f0600 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/retinanet/retinanet_r101_fpn_2x_coco.py b/mmdetection/configs/retinanet/retinanet_r101_fpn_2x_coco.py new file mode 100644 index 0000000..721112a --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './retinanet_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/retinanet/retinanet_r101_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/retinanet/retinanet_r101_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..be018ea --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r101_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py' + 
+model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py b/mmdetection/configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py new file mode 100644 index 0000000..5663972 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r101_fpn_ms-640-800-3x_coco.py @@ -0,0 +1,9 @@ +_base_ = ['../_base_/models/retinanet_r50_fpn.py', '../common/ms_3x_coco.py'] +# optimizer +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/retinanet/retinanet_r18_fpn_1x_coco.py b/mmdetection/configs/retinanet/retinanet_r18_fpn_1x_coco.py new file mode 100644 index 0000000..9602118 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r18_fpn_1x_coco.py @@ -0,0 +1,20 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +# TODO: support auto scaling lr +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +# auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py b/mmdetection/configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py new file mode 100644 index 0000000..d2e88d6 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r18_fpn_1xb8-1x_coco.py @@ -0,0 +1,24 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# data +train_dataloader = dict(batch_size=8) + +# model +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) + +# Note: If the learning rate is set to 0.0025, the mAP will be 32.4. +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001)) +# TODO: support auto scaling lr +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (1 GPUs) x (8 samples per GPU) +# auto_scale_lr = dict(base_batch_size=8) diff --git a/mmdetection/configs/retinanet/retinanet_r18_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/retinanet/retinanet_r18_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..d6833f3 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r18_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..6ba1cdd --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + # use caffe img_norm + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py b/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py new file mode 100644 index 0000000..93687d8 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py @@ -0,0 +1,15 @@ +_base_ = './retinanet_r50-caffe_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-2x_coco.py b/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-2x_coco.py new file mode 100644 index 0000000..6d1604f --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './retinanet_r50-caffe_fpn_ms-1x_coco.py' +# training schedule for 2x +train_cfg = dict(max_epochs=24) + +# learning rate policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-3x_coco.py b/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-3x_coco.py new file mode 100644 index 0000000..5a6d42a --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50-caffe_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = './retinanet_r50-caffe_fpn_ms-1x_coco.py' + +# training schedule for 2x +train_cfg = dict(max_epochs=36) + +# learning rate policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=36, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git a/mmdetection/configs/retinanet/retinanet_r50_fpn_1x_coco.py b/mmdetection/configs/retinanet/retinanet_r50_fpn_1x_coco.py new file mode 100644 index 
0000000..00d2567 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,10 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', + './retinanet_tta.py' +] + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/retinanet/retinanet_r50_fpn_2x_coco.py b/mmdetection/configs/retinanet/retinanet_r50_fpn_2x_coco.py new file mode 100644 index 0000000..47511b7 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50_fpn_2x_coco.py @@ -0,0 +1,25 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# training schedule for 2x +train_cfg = dict(max_epochs=24) + +# learning rate policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/retinanet/retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py b/mmdetection/configs/retinanet/retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py new file mode 100644 index 0000000..2f10db2 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50_fpn_8xb8-amp-lsj-200e_coco.py @@ -0,0 +1,21 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../common/lsj-200e_coco-detection.py' +] + +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +model = dict(data_preprocessor=dict(batch_augments=batch_augments)) + +train_dataloader = dict(batch_size=8, num_workers=4) +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/retinanet/retinanet_r50_fpn_90k_coco.py b/mmdetection/configs/retinanet/retinanet_r50_fpn_90k_coco.py new file mode 100644 index 0000000..1e1b2fd --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50_fpn_90k_coco.py @@ -0,0 +1,24 @@ +_base_ = 'retinanet_r50_fpn_1x_coco.py' + +# training schedule for 90k +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=90000, + val_interval=10000) +# learning rate policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=90000, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] +train_dataloader = dict(sampler=dict(type='InfiniteSampler')) +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000)) + +log_processor = dict(by_epoch=False) diff --git a/mmdetection/configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py b/mmdetection/configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py new file mode 100644 index 0000000..acf5266 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50_fpn_amp-1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' + +# MMEngine support the following two ways, users can choose +# according to convenience +# optim_wrapper = dict(type='AmpOptimWrapper') +_base_.optim_wrapper.type = 'AmpOptimWrapper' diff --git a/mmdetection/configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py b/mmdetection/configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py new file mode 100644 index 0000000..d91cf8c --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_r50_fpn_ms-640-800-3x_coco.py @@ -0,0 +1,4 @@ +_base_ = ['../_base_/models/retinanet_r50_fpn.py', '../common/ms_3x_coco.py'] +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/retinanet/retinanet_tta.py b/mmdetection/configs/retinanet/retinanet_tta.py new file mode 100644 index 0000000..d0f37e0 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_tta.py @@ -0,0 +1,23 @@ +tta_model = dict( + type='DetTTAModel', + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.5), max_per_img=100)) + +img_scales = [(1333, 800), (666, 400), (2000, 1200)] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[[ + dict(type='Resize', scale=s, keep_ratio=True) for s in img_scales + ], [ + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) 
+ ], [dict(type='LoadAnnotations', with_bbox=True)], + [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', + 'img_shape', 'scale_factor', 'flip', + 'flip_direction')) + ]]) +] diff --git a/mmdetection/configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..765a4c2 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py b/mmdetection/configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py new file mode 100644 index 0000000..14de96f --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_x101-32x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..948cd18 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py b/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py new file mode 100644 index 0000000..ad04b6e --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './retinanet_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py b/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py new file mode 100644 index 0000000..8531341 --- /dev/null +++ b/mmdetection/configs/retinanet/retinanet_x101-64x4d_fpn_ms-640-800-3x_coco.py @@ -0,0 +1,11 @@ +_base_ = ['../_base_/models/retinanet_r50_fpn.py', '../common/ms_3x_coco.py'] +# optimizer +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) +optim_wrapper = dict(optimizer=dict(type='SGD', lr=0.01)) diff --git a/mmdetection/configs/rpn/README.md b/mmdetection/configs/rpn/README.md new file mode 100644 index 
0000000..bd328b4 --- /dev/null +++ b/mmdetection/configs/rpn/README.md @@ -0,0 +1,39 @@ +# RPN + +> [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497) + + + +## Abstract + +State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. + +
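The abstract describes the RPN as a small fully convolutional head that shares backbone features and predicts, for every anchor at every spatial position, an objectness score plus four box-regression deltas. The sketch below is illustrative only (the configs here use the RPN head shipped with mmdet, and the channel and anchor counts are assumptions), but it shows the basic shape of that computation:

```python
import torch
import torch.nn as nn


class TinyRPNHead(nn.Module):
    """Illustrative RPN head: a shared 3x3 conv, then 1x1 convs that emit
    per-anchor objectness scores and 4 bounding-box deltas."""

    def __init__(self, in_channels=256, num_anchors=3):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, 3, padding=1)
        self.objectness = nn.Conv2d(in_channels, num_anchors, 1)
        self.bbox_deltas = nn.Conv2d(in_channels, num_anchors * 4, 1)

    def forward(self, feat):                      # feat: (B, C, H, W), one FPN level
        x = torch.relu(self.conv(feat))
        return self.objectness(x), self.bbox_deltas(x)


# Run once per FPN level; proposals are decoded from the deltas and ranked
# by objectness, which is what the AR1000 column below measures.
scores, deltas = TinyRPNHead()(torch.randn(1, 256, 50, 50))
```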
    + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | AR1000 | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :---------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | caffe | 1x | 3.5 | 22.6 | 58.7 | [config](./rpn_r50-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_caffe_fpn_1x_coco/rpn_r50_caffe_fpn_1x_coco_20200531-5b903a37.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_caffe_fpn_1x_coco/rpn_r50_caffe_fpn_1x_coco_20200531_012334.log.json) | +| R-50-FPN | pytorch | 1x | 3.8 | 22.3 | 58.2 | [config](./rpn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218_151240.log.json) | +| R-50-FPN | pytorch | 2x | - | - | 58.6 | [config](./rpn_r50_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_2x_coco/rpn_r50_fpn_2x_coco_20200131-0728c9b3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_2x_coco/rpn_r50_fpn_2x_coco_20200131_190631.log.json) | +| R-101-FPN | caffe | 1x | 5.4 | 17.3 | 60.0 | [config](./rpn_r101-caffe_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_caffe_fpn_1x_coco/rpn_r101_caffe_fpn_1x_coco_20200531-0629a2e2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_caffe_fpn_1x_coco/rpn_r101_caffe_fpn_1x_coco_20200531_012345.log.json) | +| R-101-FPN | pytorch | 1x | 5.8 | 16.5 | 59.7 | [config](./rpn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_1x_coco/rpn_r101_fpn_1x_coco_20200131-2ace2249.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_1x_coco/rpn_r101_fpn_1x_coco_20200131_191000.log.json) | +| R-101-FPN | pytorch | 2x | - | - | 60.2 | [config](./rpn_r101_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_2x_coco/rpn_r101_fpn_2x_coco_20200131-24e3db1a.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_fpn_2x_coco/rpn_r101_fpn_2x_coco_20200131_191106.log.json) | +| X-101-32x4d-FPN | pytorch | 1x | 7.0 | 13.0 | 60.6 | [config](./rpn_x101-32x4d_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_1x_coco/rpn_x101_32x4d_fpn_1x_coco_20200219-b02646c6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_1x_coco/rpn_x101_32x4d_fpn_1x_coco_20200219_012037.log.json) | +| X-101-32x4d-FPN | pytorch | 2x | - | - | 61.1 | [config](./rpn_x101-32x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_2x_coco/rpn_x101_32x4d_fpn_2x_coco_20200208-d22bd0bb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_2x_coco/rpn_x101_32x4d_fpn_2x_coco_20200208_200752.log.json) | +| X-101-64x4d-FPN | pytorch | 1x | 10.1 | 9.1 | 61.0 | [config](./rpn_x101-64x4d_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_1x_coco/rpn_x101_64x4d_fpn_1x_coco_20200208-cde6f7dd.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_1x_coco/rpn_x101_64x4d_fpn_1x_coco_20200208_200752.log.json) | +| X-101-64x4d-FPN | pytorch | 2x | - | - | 61.5 | [config](./rpn_x101-64x4d_fpn_2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_2x_coco/rpn_x101_64x4d_fpn_2x_coco_20200208-c65f524f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_2x_coco/rpn_x101_64x4d_fpn_2x_coco_20200208_200752.log.json) | + +## Citation + +```latex +@inproceedings{ren2015faster, + title={Faster r-cnn: Towards real-time object detection with region proposal networks}, + author={Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian}, + booktitle={Advances in neural information processing systems}, + year={2015} +} +``` diff --git a/mmdetection/configs/rpn/metafile.yml b/mmdetection/configs/rpn/metafile.yml new file mode 100644 index 0000000..9796ead --- /dev/null +++ b/mmdetection/configs/rpn/metafile.yml @@ -0,0 +1,127 @@ +Collections: + - Name: RPN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1506.01497 + Title: "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks" + README: configs/rpn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/mmdet/models/detectors/rpn.py#L6 + Version: v2.0.0 + +Models: + - Name: rpn_r50-caffe_fpn_1x_coco + In Collection: RPN + Config: configs/rpn/rpn_r50-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.5 + Training Resources: 8x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + AR@1000: 58.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_caffe_fpn_1x_coco/rpn_r50_caffe_fpn_1x_coco_20200531-5b903a37.pth + + - Name: rpn_r50_fpn_1x_coco + In Collection: RPN + Config: configs/rpn/rpn_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 3.8 + Training Resources: 8x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + AR@1000: 58.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_1x_coco/rpn_r50_fpn_1x_coco_20200218-5525fa2e.pth + + - Name: rpn_r50_fpn_2x_coco + In Collection: RPN + Config: rpn_r50_fpn_2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + AR@1000: 58.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r50_fpn_2x_coco/rpn_r50_fpn_2x_coco_20200131-0728c9b3.pth + + - Name: rpn_r101-caffe_fpn_1x_coco + In Collection: RPN + Config: configs/rpn/rpn_r101-caffe_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.4 + Training Resources: 8x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + AR@1000: 60.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_r101_caffe_fpn_1x_coco/rpn_r101_caffe_fpn_1x_coco_20200531-0629a2e2.pth + + - Name: rpn_x101-32x4d_fpn_1x_coco + In Collection: RPN + Config: configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.0 + Training Resources: 8x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + AR@1000: 60.6 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_1x_coco/rpn_x101_32x4d_fpn_1x_coco_20200219-b02646c6.pth + + - Name: rpn_x101-32x4d_fpn_2x_coco + In Collection: RPN + Config: configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py + Metadata: + Training Resources: 8x V100 GPUs + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + AR@1000: 61.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_32x4d_fpn_2x_coco/rpn_x101_32x4d_fpn_2x_coco_20200208-d22bd0bb.pth + + - Name: rpn_x101-64x4d_fpn_1x_coco + In Collection: RPN + Config: configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py + Metadata: + Training Memory (GB): 10.1 + Training Resources: 8x V100 GPUs + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + AR@1000: 61.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_1x_coco/rpn_x101_64x4d_fpn_1x_coco_20200208-cde6f7dd.pth + + - Name: rpn_x101-64x4d_fpn_2x_coco + In Collection: RPN + Config: configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py + Metadata: + Training Resources: 8x V100 GPUs + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + AR@1000: 61.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/rpn/rpn_x101_64x4d_fpn_2x_coco/rpn_x101_64x4d_fpn_2x_coco_20200208-c65f524f.pth diff --git a/mmdetection/configs/rpn/rpn_r101-caffe_fpn_1x_coco.py b/mmdetection/configs/rpn/rpn_r101-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..22977af --- /dev/null +++ b/mmdetection/configs/rpn/rpn_r101-caffe_fpn_1x_coco.py @@ -0,0 +1,7 @@ +_base_ = './rpn_r50-caffe_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/configs/rpn/rpn_r101_fpn_1x_coco.py b/mmdetection/configs/rpn/rpn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..962728f --- /dev/null +++ b/mmdetection/configs/rpn/rpn_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/rpn/rpn_r101_fpn_2x_coco.py b/mmdetection/configs/rpn/rpn_r101_fpn_2x_coco.py new file mode 100644 index 0000000..ac7671c --- /dev/null +++ b/mmdetection/configs/rpn/rpn_r101_fpn_2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './rpn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/rpn/rpn_r50-caffe-c4_1x_coco.py b/mmdetection/configs/rpn/rpn_r50-caffe-c4_1x_coco.py new file mode 100644 index 0000000..76b878c --- /dev/null +++ b/mmdetection/configs/rpn/rpn_r50-caffe-c4_1x_coco.py @@ -0,0 +1,8 @@ +_base_ = [ + '../_base_/models/rpn_r50-caffe-c4.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +val_evaluator = dict(metric='proposal_fast') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/rpn/rpn_r50-caffe_fpn_1x_coco.py b/mmdetection/configs/rpn/rpn_r50-caffe_fpn_1x_coco.py new file mode 100644 index 0000000..530f365 --- /dev/null +++ b/mmdetection/configs/rpn/rpn_r50-caffe_fpn_1x_coco.py @@ -0,0 +1,16 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' +# use caffe img_norm +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + 
pad_size_divisor=32), + backbone=dict( + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/configs/rpn/rpn_r50_fpn_1x_coco.py b/mmdetection/configs/rpn/rpn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..7fe88d3 --- /dev/null +++ b/mmdetection/configs/rpn/rpn_r50_fpn_1x_coco.py @@ -0,0 +1,36 @@ +_base_ = [ + '../_base_/models/rpn_r50_fpn.py', '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +val_evaluator = dict(metric='proposal_fast') +test_evaluator = val_evaluator + +# inference on val dataset and dump the proposals with evaluate metric +# data_root = 'data/coco/' +# test_evaluator = [ +# dict( +# type='DumpProposals', +# output_dir=data_root + 'proposals/', +# proposals_file='rpn_r50_fpn_1x_val2017.pkl'), +# dict( +# type='CocoMetric', +# ann_file=data_root + 'annotations/instances_val2017.json', +# metric='proposal_fast', +# backend_args={{_base_.backend_args}}, +# format_only=False) +# ] + +# inference on training dataset and dump the proposals without evaluate metric +# data_root = 'data/coco/' +# test_dataloader = dict( +# dataset=dict( +# ann_file='annotations/instances_train2017.json', +# data_prefix=dict(img='train2017/'))) +# +# test_evaluator = [ +# dict( +# type='DumpProposals', +# output_dir=data_root + 'proposals/', +# proposals_file='rpn_r50_fpn_1x_train2017.pkl'), +# ] diff --git a/mmdetection/configs/rpn/rpn_r50_fpn_2x_coco.py b/mmdetection/configs/rpn/rpn_r50_fpn_2x_coco.py new file mode 100644 index 0000000..0ebccbc --- /dev/null +++ b/mmdetection/configs/rpn/rpn_r50_fpn_2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' + +# learning policy +max_epochs = 24 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py b/mmdetection/configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..d0c7394 --- /dev/null +++ b/mmdetection/configs/rpn/rpn_x101-32x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py b/mmdetection/configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py new file mode 100644 index 0000000..c6880b7 --- /dev/null +++ b/mmdetection/configs/rpn/rpn_x101-32x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './rpn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py b/mmdetection/configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py new file mode 100644 index 0000000..96e691a --- /dev/null +++ 
b/mmdetection/configs/rpn/rpn_x101-64x4d_fpn_1x_coco.py @@ -0,0 +1,14 @@ +_base_ = './rpn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py b/mmdetection/configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py new file mode 100644 index 0000000..4182a39 --- /dev/null +++ b/mmdetection/configs/rpn/rpn_x101-64x4d_fpn_2x_coco.py @@ -0,0 +1,14 @@ +_base_ = './rpn_r50_fpn_2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/rtmdet/README.md b/mmdetection/configs/rtmdet/README.md new file mode 100644 index 0000000..4574dd6 --- /dev/null +++ b/mmdetection/configs/rtmdet/README.md @@ -0,0 +1,454 @@ +# RTMDet: An Empirical Study of Designing Real-Time Object Detectors + +> [RTMDet: An Empirical Study of Designing Real-Time Object Detectors](https://arxiv.org/abs/2212.07784) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + + + +## Abstract + +In this paper, we aim to design an efficient real-time object detector that exceeds the YOLO series and is easily extensible for many object recognition tasks such as instance segmentation and rotated object detection. To obtain a more efficient model architecture, we explore an architecture that has compatible capacities in the backbone and neck, constructed by a basic building block that consists of large-kernel depth-wise convolutions. We further introduce soft labels when calculating matching costs in the dynamic label assignment to improve accuracy. Together with better training techniques, the resulting object detector, named RTMDet, achieves 52.8% AP on COCO with 300+ FPS on an NVIDIA 3090 GPU, outperforming the current mainstream industrial detectors. RTMDet achieves the best parameter-accuracy trade-off with tiny/small/medium/large/extra-large model sizes for various application scenarios, and obtains new state-of-the-art performance on real-time instance segmentation and rotated object detection. We hope the experimental results can provide new insights into designing versatile real-time object detectors for many object recognition tasks. + +
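Two ideas from the abstract are worth making concrete: the backbone and neck are assembled from a block built around large-kernel depth-wise convolutions, and label assignment uses soft targets in its matching cost. The snippet below is only a hedged sketch of the first idea (the real block lives in mmdet's CSPNeXt backbone; the 5x5 kernel and channel width here are illustrative assumptions):

```python
import torch
import torch.nn as nn


class LargeKernelDWBlock(nn.Module):
    """Sketch of a large-kernel depth-wise building block: a 5x5 depth-wise
    conv (groups == channels) followed by a 1x1 point-wise conv."""

    def __init__(self, channels=128, kernel_size=5):
        super().__init__()
        self.dw = nn.Conv2d(channels, channels, kernel_size,
                            padding=kernel_size // 2, groups=channels)
        self.pw = nn.Conv2d(channels, channels, 1)
        self.norm = nn.BatchNorm2d(channels)
        self.act = nn.SiLU()

    def forward(self, x):
        # The depth-wise conv enlarges the receptive field cheaply, the
        # point-wise conv mixes channels, and the residual keeps training stable.
        return x + self.act(self.norm(self.pw(self.dw(x))))


out = LargeKernelDWBlock()(torch.randn(1, 128, 40, 40))
```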
    + +## Results and Models + +### Object Detection + +| Model | size | box AP | Params(M) | FLOPS(G) | TRT-FP16-Latency(ms)
    RTX3090 | TRT-FP16-Latency(ms)
    T4 | Config | Download | +| :---------: | :--: | :----: | :-------: | :------: | :-----------------------------: | :------------------------: | :----------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RTMDet-tiny | 640 | 41.1 | 4.8 | 8.1 | 0.98 | 2.34 | [config](./rtmdet_tiny_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414.log.json) | +| RTMDet-s | 640 | 44.6 | 8.89 | 14.8 | 1.22 | 2.96 | [config](./rtmdet_s_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_s_8xb32-300e_coco/rtmdet_s_8xb32-300e_coco_20220905_161602-387a891e.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_s_8xb32-300e_coco/rtmdet_s_8xb32-300e_coco_20220905_161602.log.json) | +| RTMDet-m | 640 | 49.4 | 24.71 | 39.27 | 1.62 | 6.41 | [config](./rtmdet_m_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_m_8xb32-300e_coco/rtmdet_m_8xb32-300e_coco_20220719_112220-229f527c.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_m_8xb32-300e_coco/rtmdet_m_8xb32-300e_coco_20220719_112220.log.json) | +| RTMDet-l | 640 | 51.5 | 52.3 | 80.23 | 2.44 | 10.32 | [config](./rtmdet_l_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_l_8xb32-300e_coco/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_l_8xb32-300e_coco/rtmdet_l_8xb32-300e_coco_20220719_112030.log.json) | +| RTMDet-x | 640 | 52.8 | 94.86 | 141.67 | 3.10 | 18.80 | [config](./rtmdet_x_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_x_8xb32-300e_coco/rtmdet_x_8xb32-300e_coco_20220715_230555-cc79b9ae.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_x_8xb32-300e_coco/rtmdet_x_8xb32-300e_coco_20220715_230555.log.json) | +| RTMDet-x-P6 | 1280 | 54.9 | | | | | [config](./rtmdet_x_p6_4xb8-300e_coco.py) | [model](https://github.com/orange0-jp/orange-weights/releases/download/v0.1.0rtmdet-p6/rtmdet_x_p6_4xb8-300e_coco-bf32be58.pth) | + +**Note**: + +1. We implement a fast training version of RTMDet in [MMYOLO](https://github.com/open-mmlab/mmyolo). Its training speed is **2.6 times faster** and memory requirement is lower! Try it [here](https://github.com/open-mmlab/mmyolo/tree/main/configs/rtmdet)! +2. The inference speed of RTMDet is measured with TensorRT 8.4.3, cuDNN 8.2.0, FP16, batch size=1, and without NMS. +3. For a fair comparison, the config of bbox postprocessing is changed to be consistent with YOLOv5/6/7 after [PR#9494](https://github.com/open-mmlab/mmdetection/pull/9494), bringing about 0.1~0.3% AP improvement. 
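Each row in the table above pairs a config file with a released checkpoint. As a usage sketch only (assuming mmdet's high-level inference API and that the listed config and downloaded checkpoint are available locally), running one of these detectors on a single image could look like:

```python
from mmdet.apis import init_detector, inference_detector

# Illustrative paths: substitute any config/checkpoint pair from the table,
# e.g. the RTMDet-tiny entry shown above.
config_file = 'configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py'
checkpoint_file = 'rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth'

model = init_detector(config_file, checkpoint_file, device='cuda:0')
result = inference_detector(model, 'demo.jpg')  # image path or ndarray
print(result)  # predicted instances: boxes, labels, scores
```

Note that this says nothing about latency; the TRT-FP16 numbers in the table come from the TensorRT deployment described in the notes above, not from this Python API.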
+ +### Instance Segmentation + +RTMDet-Ins is the state-of-the-art real-time instance segmentation on coco dataset: + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) + +| Model | size | box AP | mask AP | Params(M) | FLOPS(G) | TRT-FP16-Latency(ms) | Config | Download | +| :-------------: | :--: | :----: | :-----: | :-------: | :------: | :------------------: | :--------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RTMDet-Ins-tiny | 640 | 40.5 | 35.4 | 5.6 | 11.8 | 1.70 | [config](./rtmdet-ins_tiny_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco/rtmdet-ins_tiny_8xb32-300e_coco_20221130_151727-ec670f7e.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco/rtmdet-ins_tiny_8xb32-300e_coco_20221130_151727.log.json) | +| RTMDet-Ins-s | 640 | 44.0 | 38.7 | 10.18 | 21.5 | 1.93 | [config](./rtmdet-ins_s_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_s_8xb32-300e_coco/rtmdet-ins_s_8xb32-300e_coco_20221121_212604-fdc5d7ec.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_s_8xb32-300e_coco/rtmdet-ins_s_8xb32-300e_coco_20221121_212604.log.json) | +| RTMDet-Ins-m | 640 | 48.8 | 42.1 | 27.58 | 54.13 | 2.69 | [config](./rtmdet-ins_m_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_m_8xb32-300e_coco/rtmdet-ins_m_8xb32-300e_coco_20221123_001039-6eba602e.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_m_8xb32-300e_coco/rtmdet-ins_m_8xb32-300e_coco_20221123_001039.log.json) | +| RTMDet-Ins-l | 640 | 51.2 | 43.7 | 57.37 | 106.56 | 3.68 | [config](./rtmdet-ins_l_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_l_8xb32-300e_coco/rtmdet-ins_l_8xb32-300e_coco_20221124_103237-78d1d652.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_l_8xb32-300e_coco/rtmdet-ins_l_8xb32-300e_coco_20221124_103237.log.json) | +| RTMDet-Ins-x | 640 | 52.4 | 44.6 | 102.7 | 182.7 | 5.31 | [config](./rtmdet-ins_x_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_x_8xb16-300e_coco/rtmdet-ins_x_8xb16-300e_coco_20221124_111313-33d4595b.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_x_8xb16-300e_coco/rtmdet-ins_x_8xb16-300e_coco_20221124_111313.log.json) | + +**Note**: + +1. The inference speed of RTMDet-Ins is measured on an NVIDIA 3090 GPU with TensorRT 8.4.3, cuDNN 8.2.0, FP16, batch size=1. Top 100 masks are kept and the post process latency is included. + +### Rotated Object Detection + +RTMDet-R achieves state-of-the-art on various remote sensing datasets. 
+ +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/one-stage-anchor-free-oriented-object-1)](https://paperswithcode.com/sota/one-stage-anchor-free-oriented-object-1?p=rtmdet-an-empirical-study-of-designing-real) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/one-stage-anchor-free-oriented-object-3)](https://paperswithcode.com/sota/one-stage-anchor-free-oriented-object-3?p=rtmdet-an-empirical-study-of-designing-real) + +Models and configs of RTMDet-R are available in [MMRotate](https://github.com/open-mmlab/mmrotate/tree/1.x/configs/rotated_rtmdet). + +| Backbone | pretrain | Aug | mmAP | mAP50 | mAP75 | Params(M) | FLOPS(G) | TRT-FP16-Latency(ms) | Config | Download | +| :---------: | :------: | :---: | :---: | :---: | :---: | :-------: | :------: | :------------------: | :---------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RTMDet-tiny | IN | RR | 47.37 | 75.36 | 50.64 | 4.88 | 20.45 | 4.40 | [config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_tiny-3x-dota.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_tiny-3x-dota/rotated_rtmdet_tiny-3x-dota-9d821076.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_tiny-3x-dota/rotated_rtmdet_tiny-3x-dota_20221201_120814.json) | +| RTMDet-tiny | IN | MS+RR | 53.59 | 79.82 | 58.87 | 4.88 | 20.45 | 4.40 | [config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_tiny-3x-dota_ms.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_tiny-3x-dota_ms/rotated_rtmdet_tiny-3x-dota_ms-f12286ff.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_tiny-3x-dota_ms/rotated_rtmdet_tiny-3x-dota_ms_20221113_201235.log) | +| RTMDet-s | IN | RR | 48.16 | 76.93 | 50.59 | 8.86 | 37.62 | 4.86 | [config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_s-3x-dota.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_s-3x-dota/rotated_rtmdet_s-3x-dota-11f6ccf5.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_s-3x-dota/rotated_rtmdet_s-3x-dota_20221124_081442.json) | +| RTMDet-s | IN | MS+RR | 54.43 | 79.98 | 60.07 | 8.86 | 37.62 | 4.86 | 
[config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_s-3x-dota_ms.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_s-3x-dota_ms/rotated_rtmdet_s-3x-dota_ms-20ead048.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_s-3x-dota_ms/rotated_rtmdet_s-3x-dota_ms_20221113_201055.json) | +| RTMDet-m | IN | RR | 50.56 | 78.24 | 54.47 | 24.67 | 99.76 | 7.82 | [config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_m-3x-dota.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_m-3x-dota/rotated_rtmdet_m-3x-dota-beeadda6.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_m-3x-dota/rotated_rtmdet_m-3x-dota_20221122_011234.json) | +| RTMDet-m | IN | MS+RR | 55.00 | 80.26 | 61.26 | 24.67 | 99.76 | 7.82 | [config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_m-3x-dota_ms.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_m-3x-dota_ms/rotated_rtmdet_m-3x-dota_ms-c71eb375.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_m-3x-dota_ms/rotated_rtmdet_m-3x-dota_ms_20221122_011234.json) | +| RTMDet-l | IN | RR | 51.01 | 78.85 | 55.21 | 52.27 | 204.21 | 10.82 | [config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_l-3x-dota.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_l-3x-dota/rotated_rtmdet_l-3x-dota-23992372.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_l-3x-dota/rotated_rtmdet_l-3x-dota_20221122_011241.json) | +| RTMDet-l | IN | MS+RR | 55.52 | 80.54 | 61.47 | 52.27 | 204.21 | 10.82 | [config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_l-3x-dota_ms.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_l-3x-dota_ms/rotated_rtmdet_l-3x-dota_ms-2738da34.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_l-3x-dota_ms/rotated_rtmdet_l-3x-dota_ms_20221122_011241.json) | +| RTMDet-l | COCO | MS+RR | 56.74 | 81.33 | 63.45 | 52.27 | 204.21 | 10.82 | [config](https://github.com/open-mmlab/mmrotate/edit/1.x/configs/rotated_rtmdet/rotated_rtmdet_l-coco_pretrain-3x-dota_ms.py) | [model](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_l-coco_pretrain-3x-dota_ms/rotated_rtmdet_l-coco_pretrain-3x-dota_ms-06d248a2.pth) \| [log](https://download.openmmlab.com/mmrotate/v1.0/rotated_rtmdet/rotated_rtmdet_l-coco_pretrain-3x-dota_ms/rotated_rtmdet_l-coco_pretrain-3x-dota_ms_20221113_202010.json) | + +### Classification + +We also provide the imagenet classification configs of the RTMDet backbone. Find more details in the [classification folder](./classification). 
+ +| Model | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | +| :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------------------------------------------------------------------------: | +| CSPNeXt-tiny | 224x224 | 2.73 | 0.34 | 69.44 | 89.45 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e-3a2dd350.pth) | +| CSPNeXt-s | 224x224 | 4.89 | 0.66 | 74.41 | 92.23 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e-ea671761.pth) | +| CSPNeXt-m | 224x224 | 13.05 | 1.93 | 79.27 | 94.79 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth) | +| CSPNeXt-l | 224x224 | 27.16 | 4.19 | 81.30 | 95.62 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth) | +| CSPNeXt-x | 224x224 | 48.85 | 7.76 | 82.10 | 95.69 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-x_8xb256-rsb-a1-600e_in1k-b3f78edd.pth) | + +## Citation + +```latex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## Visualization + +
    + +## Deployment Tutorial + +Here is a basic example of deploy RTMDet with [MMDeploy-1.x](https://github.com/open-mmlab/mmdeploy/tree/1.x). + +### Step1. Install MMDeploy + +Before starting the deployment, please make sure you install MMDetection and MMDeploy-1.x correctly. + +- Install MMDetection, please refer to the [MMDetection installation guide](https://mmdetection.readthedocs.io/en/latest/get_started.html). +- Install MMDeploy-1.x, please refer to the [MMDeploy-1.x installation guide](https://mmdeploy.readthedocs.io/en/1.x/get_started.html#installation). + +If you want to deploy RTMDet with ONNXRuntime, TensorRT, or other inference engine, +please make sure you have installed the corresponding dependencies and MMDeploy precompiled packages. + +### Step2. Convert Model + +After the installation, you can enjoy the model deployment journey starting from converting PyTorch model to backend model by running MMDeploy's `tools/deploy.py`. + +The detailed model conversion tutorial please refer to the [MMDeploy document](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/convert_model.html). +Here we only give the example of converting RTMDet. + +MMDeploy supports converting dynamic and static models. Dynamic models support different input shape, but the inference speed is slower than static models. +To achieve the best performance, we suggest converting RTMDet with static setting. + +- If you only want to use ONNX, please use [`configs/mmdet/detection/detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmdet/detection/detection_onnxruntime_static.py) as the deployment config. +- If you want to use TensorRT, please use [`configs/mmdet/detection/detection_tensorrt_static-640x640.py`](https://github.com/open-mmlab/mmdeploy/blob/1.x/configs/mmdet/detection/detection_tensorrt_static-640x640.py). + +If you want to customize the settings in the deployment config for your requirements, please refer to [MMDeploy config tutorial](https://mmdeploy.readthedocs.io/en/1.x/02-how-to-run/write_config.html). + +After preparing the deployment config, you can run the `tools/deploy.py` script to convert your model. 
+Here we take converting RTMDet-s to TensorRT as an example: + +```shell +# go to the mmdeploy folder +cd ${PATH_TO_MMDEPLOY} + +# download RTMDet-s checkpoint +wget -P checkpoint https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_s_8xb32-300e_coco/rtmdet_s_8xb32-300e_coco_20220905_161602-387a891e.pth + +# run the command to start model conversion +python tools/deploy.py \ + configs/mmdet/detection/detection_tensorrt_static-640x640.py \ + ${PATH_TO_MMDET}/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py \ + checkpoint/rtmdet_s_8xb32-300e_coco_20220905_161602-387a891e.pth \ + demo/resources/det.jpg \ + --work-dir ./work_dirs/rtmdet \ + --device cuda:0 \ + --show +``` + +If the script runs successfully, you will see the following files: + +``` +|----work_dirs + |----rtmdet + |----end2end.onnx # ONNX model + |----end2end.engine # TensorRT engine file +``` + +After this, you can check the inference results with MMDeploy Model Converter API: + +```python +from mmdeploy.apis import inference_model + +result = inference_model( + model_cfg='${PATH_TO_MMDET}/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py', + deploy_cfg='${PATH_TO_MMDEPLOY}/configs/mmdet/detection/detection_tensorrt_static-640x640.py', + backend_files=['work_dirs/rtmdet/end2end.engine'], + img='demo/resources/det.jpg', + device='cuda:0') +``` + +#### Advanced Setting + +To convert the model with TRT-FP16, you can enable the fp16 mode in your deploy config: + +```python +# in MMDeploy config +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True # enable fp16 + )) +``` + +To reduce the end to end inference speed with the inference engine, we suggest you to adjust the post-processing setting of the model. +We set a very low score threshold during training and testing to achieve better COCO mAP. +However, in actual usage scenarios, a relatively high score threshold (e.g. 0.3) is usually used. + +You can adjust the score threshold and the number of detection boxes in your model config according to the actual usage to reduce the time-consuming of post-processing. + +```python +# in MMDetection config +model = dict( + test_cfg=dict( + nms_pre=1000, # keep top-k score bboxes before nms + min_bbox_size=0, + score_thr=0.3, # score threshold to filter bboxes + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=100) # only keep top-100 as the final results. +) +``` + +### Step3. Inference with SDK + +We provide both Python and C++ inference API with MMDeploy SDK. + +To use SDK, you need to dump the required info during converting the model. 
Just add `--dump-info` to the model conversion command: + +```shell +python tools/deploy.py \ + configs/mmdet/detection/detection_tensorrt_static-640x640.py \ + ${PATH_TO_MMDET}/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py \ + checkpoint/rtmdet_s_8xb32-300e_coco_20220905_161602-387a891e.pth \ + demo/resources/det.jpg \ + --work-dir ./work_dirs/rtmdet-sdk \ + --device cuda:0 \ + --show \ + --dump-info # dump sdk info +``` + +After running the command, it will dump 3 json files additionally for the SDK: + +``` +|----work_dirs + |----rtmdet-sdk + |----end2end.onnx # ONNX model + |----end2end.engine # TensorRT engine file + # json files for the SDK + |----pipeline.json + |----deploy.json + |----detail.json +``` + +#### Python API + +Here is a basic example of SDK Python API: + +```python +from mmdeploy_python import Detector +import cv2 + +img = cv2.imread('demo/resources/det.jpg') + +# create a detector +detector = Detector(model_path='work_dirs/rtmdet-sdk', device_name='cuda', device_id=0) +# run the inference +bboxes, labels, _ = detector(img) +# Filter the result according to threshold +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + # draw bbox + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('output_detection.png', img) +``` + +#### C++ API + +Here is a basic example of SDK C++ API: + +```C++ +#include +#include +#include "mmdeploy/detector.hpp" + +int main() { + const char* device_name = "cuda"; + int device_id = 0; + std::string model_path = "work_dirs/rtmdet-sdk"; + std::string image_path = "demo/resources/det.jpg"; + + // 1. load model + mmdeploy::Model model(model_path); + // 2. create predictor + mmdeploy::Detector detector(model, mmdeploy::Device{device_name, device_id}); + // 3. read image + cv::Mat img = cv::imread(image_path); + // 4. inference + auto dets = detector.Apply(img); + // 5. deal with the result. Here we choose to visualize it + for (int i = 0; i < dets.size(); ++i) { + const auto& box = dets[i].bbox; + fprintf(stdout, "box %d, left=%.2f, top=%.2f, right=%.2f, bottom=%.2f, label=%d, score=%.4f\n", + i, box.left, box.top, box.right, box.bottom, dets[i].label_id, dets[i].score); + if (bboxes[i].score < 0.3) { + continue; + } + cv::rectangle(img, cv::Point{(int)box.left, (int)box.top}, + cv::Point{(int)box.right, (int)box.bottom}, cv::Scalar{0, 255, 0}); + } + cv::imwrite("output_detection.png", img); + return 0; +} +``` + +To build C++ example, please add MMDeploy package in your CMake project as following: + +```cmake +find_package(MMDeploy REQUIRED) +target_link_libraries(${name} PRIVATE mmdeploy ${OpenCV_LIBS}) +``` + +#### Other languages + +- [C# API Examples](https://github.com/open-mmlab/mmdeploy/tree/1.x/demo/csharp) +- [JAVA API Examples](https://github.com/open-mmlab/mmdeploy/tree/1.x/demo/java) + +### Deploy RTMDet Instance Segmentation Model + +We support RTMDet-Ins ONNXRuntime and TensorRT deployment after [MMDeploy v1.0.0rc2](https://github.com/open-mmlab/mmdeploy/tree/v1.0.0rc2). And its deployment process is almost consistent with the detection model. + +#### Step1. Install MMDeploy >= v1.0.0rc2 + +Please refer to the [MMDeploy-1.x installation guide](https://mmdeploy.readthedocs.io/en/1.x/get_started.html#installation) to install the latest version. +Please remember to replace the pre-built package with the latest version. 
+The v1.0.0rc2 package can be downloaded from [v1.0.0rc2 release page](https://github.com/open-mmlab/mmdeploy/releases/tag/v1.0.0rc2). + +Step2. Convert Model + +This step has no difference with the previous tutorial. The only thing you need to change is switching to the RTMDet-Ins deploy config: + +- If you want to use ONNXRuntime, please use [`configs/mmdet/instance-seg/instance-seg_rtmdet-ins_onnxruntime_static-640x640.py`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/configs/mmdet/instance-seg/instance-seg_rtmdet-ins_onnxruntime_static-640x640.py) as the deployment config. +- If you want to use TensorRT, please use [`configs/mmdet/instance-seg/instance-seg_rtmdet-ins_tensorrt_static-640x640.py`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/configs/mmdet/instance-seg/instance-seg_rtmdet-ins_tensorrt_static-640x640.py). + +Here we take converting RTMDet-Ins-s to TensorRT as an example: + +```shell +# go to the mmdeploy folder +cd ${PATH_TO_MMDEPLOY} + +# download RTMDet-s checkpoint +wget -P checkpoint https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_s_8xb32-300e_coco/rtmdet-ins_s_8xb32-300e_coco_20221121_212604-fdc5d7ec.pth + +# run the command to start model conversion +python tools/deploy.py \ + configs/mmdet/instance-seg/instance-seg_rtmdet-ins_tensorrt_static-640x640.py \ + ${PATH_TO_MMDET}/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py \ + checkpoint/rtmdet-ins_s_8xb32-300e_coco_20221121_212604-fdc5d7ec.pth \ + demo/resources/det.jpg \ + --work-dir ./work_dirs/rtmdet-ins \ + --device cuda:0 \ + --show +``` + +If the script runs successfully, you will see the following files: + +``` +|----work_dirs + |----rtmdet-ins + |----end2end.onnx # ONNX model + |----end2end.engine # TensorRT engine file +``` + +After this, you can check the inference results with MMDeploy Model Converter API: + +```python +from mmdeploy.apis import inference_model + +result = inference_model( + model_cfg='${PATH_TO_MMDET}/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py', + deploy_cfg='${PATH_TO_MMDEPLOY}/configs/mmdet/instance-seg/instance-seg_rtmdet-ins_tensorrt_static-640x640.py', + backend_files=['work_dirs/rtmdet-ins/end2end.engine'], + img='demo/resources/det.jpg', + device='cuda:0') +``` + +### Model Config + +In MMDetection's config, we use `model` to set up detection algorithm components. In addition to neural network components such as `backbone`, `neck`, etc, it also requires `data_preprocessor`, `train_cfg`, and `test_cfg`. `data_preprocessor` is responsible for processing a batch of data output by dataloader. `train_cfg`, and `test_cfg` in the model config are for training and testing hyperparameters of the components.Taking RTMDet as an example, we will introduce each field in the config according to different function modules: + +```python +model = dict( + type='RTMDet', # The name of detector + data_preprocessor=dict( # The config of data preprocessor, usually includes image normalization and padding + type='DetDataPreprocessor', # The type of the data preprocessor. 
Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.data_preprocessors.DetDataPreprocessor + mean=[103.53, 116.28, 123.675], # Mean values used to pre-training the pre-trained backbone models, ordered in R, G, B + std=[57.375, 57.12, 58.395], # Standard variance used to pre-training the pre-trained backbone models, ordered in R, G, B + bgr_to_rgb=False, # whether to convert image from BGR to RGB + batch_augments=None), # Batch-level augmentations + backbone=dict( # The config of backbone + type='CSPNeXt', # The type of backbone network. Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.backbones.CSPNeXt + arch='P5', # Architecture of CSPNeXt, from {P5, P6}. Defaults to P5 + expand_ratio=0.5, # Ratio to adjust the number of channels of the hidden layer. Defaults to 0.5 + deepen_factor=1, # Depth multiplier, multiply number of blocks in CSP layer by this amount. Defaults to 1.0 + widen_factor=1, # Width multiplier, multiply number of channels in each layer by this amount. Defaults to 1.0 + channel_attention=True, # Whether to add channel attention in each stage. Defaults to True + norm_cfg=dict(type='SyncBN'), # Dictionary to construct and config norm layer. Defaults to dict(type=’BN’, requires_grad=True) + act_cfg=dict(type='SiLU', inplace=True)), # Config dict for activation layer. Defaults to dict(type=’SiLU’) + neck=dict( + type='CSPNeXtPAFPN', # The type of neck is CSPNeXtPAFPN. Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.necks.CSPNeXtPAFPN + in_channels=[256, 512, 1024], # Number of input channels per scale + out_channels=256, # Number of output channels (used at each scale) + num_csp_blocks=3, # Number of bottlenecks in CSPLayer. Defaults to 3 + expand_ratio=0.5, # Ratio to adjust the number of channels of the hidden layer. Default: 0.5 + norm_cfg=dict(type='SyncBN'), # Config dict for normalization layer. Default: dict(type=’BN’) + act_cfg=dict(type='SiLU', inplace=True)), # Config dict for activation layer. Default: dict(type=’Swish’) + bbox_head=dict( + type='RTMDetSepBNHead', # The type of bbox_head is RTMDetSepBNHead. RTMDetHead with separated BN layers and shared conv layers. Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.dense_heads.RTMDetSepBNHead + num_classes=80, # Number of categories excluding the background category + in_channels=256, # Number of channels in the input feature map + stacked_convs=2, # Whether to share conv layers between stages. Defaults to True + feat_channels=256, # Feature channels of convolutional layers in the head + anchor_generator=dict( # The config of anchor generator + type='MlvlPointGenerator', # The methods use MlvlPointGenerator. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/prior_generators/point_generator.py#L92 + offset=0, # The offset of points, the value is normalized with corresponding stride. Defaults to 0.5 + strides=[8, 16, 32]), # Strides of anchors in multiple feature levels in order (w, h) + bbox_coder=dict(type='DistancePointBBoxCoder'), # Distance Point BBox coder.This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left,right) and decode it back to the original. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/coders/distance_point_bbox_coder.py#L9 + loss_cls=dict( # Config of loss function for the classification branch + type='QualityFocalLoss', # Type of loss for classification branch. 
Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.losses.QualityFocalLoss + use_sigmoid=True, # Whether sigmoid operation is conducted in QFL. Defaults to True + beta=2.0, # The beta parameter for calculating the modulating factor. Defaults to 2.0 + loss_weight=1.0), # Loss weight of current loss + loss_bbox=dict( # Config of loss function for the regression branch + type='GIoULoss', # Type of loss. Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.losses.GIoULoss + loss_weight=2.0), # Loss weight of the regression branch + with_objectness=False, # Whether to add an objectness branch. Defaults to True + exp_on_reg=True, # Whether to use .exp() in regression + share_conv=True, # Whether to share conv layers between stages. Defaults to True + pred_kernel_size=1, # Kernel size of prediction layer. Defaults to 1 + norm_cfg=dict(type='SyncBN'), # Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001) + act_cfg=dict(type='SiLU', inplace=True)), # Config dict for activation layer. Defaults to dict(type='SiLU') + train_cfg=dict( # Config of training hyperparameters for ATSS + assigner=dict( # Config of assigner + type='DynamicSoftLabelAssigner', # Type of assigner. DynamicSoftLabelAssigner computes matching between predictions and ground truth with dynamic soft label assignment. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py#L40 + topk=13), # Select top-k predictions to calculate dynamic k best matches for each gt. Defaults to 13 + allowed_border=-1, # The border allowed after padding for valid anchors + pos_weight=-1, # The weight of positive samples during training + debug=False), # Whether to set the debug mode + test_cfg=dict( # Config for testing hyperparameters for ATSS + nms_pre=30000, # The number of boxes before NMS + min_bbox_size=0, # The allowed minimal box size + score_thr=0.001, # Threshold to filter out boxes + nms=dict( # Config of NMS in the second stage + type='nms', # Type of NMS + iou_threshold=0.65), # NMS threshold + max_per_img=300), # Max number of detections of each image +) +``` diff --git a/mmdetection/configs/rtmdet/classification/README.md b/mmdetection/configs/rtmdet/classification/README.md new file mode 100644 index 0000000..acc127d --- /dev/null +++ b/mmdetection/configs/rtmdet/classification/README.md @@ -0,0 +1,56 @@ +# CSPNeXt ImageNet Pre-training + +In this folder, we provide the imagenet pre-training config of RTMDet's backbone CSPNeXt. + +## Requirements + +To train with these configs, please install [MMPreTrain](https://github.com/open-mmlab/mmpretrain) first. + +Install by MIM: + +```shell +mim install mmpretrain +``` + +or install by pip: + +```shell +pip install mmpretrain +``` + +## Prepare Dataset + +To pre-train on ImageNet, you need to prepare the dataset first. Please refer to the [guide](https://mmpretrain.readthedocs.io/en/latest/user_guides/dataset_prepare.html#imagenet). + +## How to Train + +You can use the classification config in the same way as the detection config. + +For single-GPU training, run: + +```shell +python tools/train.py \ + ${CONFIG_FILE} \ + [optional arguments] +``` + +For multi-GPU training, run: + +```shell +bash ./tools/dist_train.sh \ + ${CONFIG_FILE} \ + ${GPU_NUM} \ + [optional arguments] +``` + +More details can be found in [user guides](https://mmdetection.readthedocs.io/en/latest/user_guides/train.html). 
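Once pre-training finishes, the resulting classification checkpoint can be wired into the detection configs the same way the released CSPNeXt weights are (see `rtmdet_s_8xb32-300e_coco.py` later in this patch): point the backbone's `init_cfg` at the checkpoint and keep `prefix='backbone.'` so only the backbone weights are loaded. A minimal sketch, where the checkpoint path is a placeholder for whatever `work_dirs` location your training run produced:

```python
# Sketch of a detection config that reuses a locally pre-trained CSPNeXt-s
# backbone; the checkpoint path below is a placeholder.
_base_ = './rtmdet_s_8xb32-300e_coco.py'

# Checkpoint produced by cspnext-s_8xb256-rsb-a1-600e_in1k.py (placeholder path)
checkpoint = 'work_dirs/cspnext-s_8xb256-rsb-a1-600e_in1k/epoch_600.pth'

model = dict(
    backbone=dict(
        init_cfg=dict(
            type='Pretrained',
            prefix='backbone.',  # load only the classifier's backbone.* weights
            checkpoint=checkpoint)))
```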
+ +## Results and Models + +| Model | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | +| :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :---------------------------------------------------------------------------------------------------------------------------------: | +| CSPNeXt-tiny | 224x224 | 2.73 | 0.34 | 69.44 | 89.45 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e-3a2dd350.pth) | +| CSPNeXt-s | 224x224 | 4.89 | 0.66 | 74.41 | 92.23 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e-ea671761.pth) | +| CSPNeXt-m | 224x224 | 13.05 | 1.93 | 79.27 | 94.79 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth) | +| CSPNeXt-l | 224x224 | 27.16 | 4.19 | 81.30 | 95.62 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth) | +| CSPNeXt-x | 224x224 | 48.85 | 7.76 | 82.10 | 95.69 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-x_8xb256-rsb-a1-600e_in1k-b3f78edd.pth) | diff --git a/mmdetection/configs/rtmdet/classification/cspnext-l_8xb256-rsb-a1-600e_in1k.py b/mmdetection/configs/rtmdet/classification/cspnext-l_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000..d2e7053 --- /dev/null +++ b/mmdetection/configs/rtmdet/classification/cspnext-l_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,5 @@ +_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py' + +model = dict( + backbone=dict(deepen_factor=1, widen_factor=1), + head=dict(in_channels=1024)) diff --git a/mmdetection/configs/rtmdet/classification/cspnext-m_8xb256-rsb-a1-600e_in1k.py b/mmdetection/configs/rtmdet/classification/cspnext-m_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000..e1b1352 --- /dev/null +++ b/mmdetection/configs/rtmdet/classification/cspnext-m_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,5 @@ +_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py' + +model = dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + head=dict(in_channels=768)) diff --git a/mmdetection/configs/rtmdet/classification/cspnext-s_8xb256-rsb-a1-600e_in1k.py b/mmdetection/configs/rtmdet/classification/cspnext-s_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000..dcfd2ea --- /dev/null +++ b/mmdetection/configs/rtmdet/classification/cspnext-s_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,64 @@ +_base_ = [ + 'mmpretrain::_base_/datasets/imagenet_bs256_rsb_a12.py', + 'mmpretrain::_base_/schedules/imagenet_bs2048_rsb.py', + 'mmpretrain::_base_/default_runtime.py' +] + +model = dict( + type='ImageClassifier', + backbone=dict( + type='mmdet.CSPNeXt', + arch='P5', + out_indices=(4, ), + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='mmdet.SiLU')), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=512, + loss=dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + loss_weight=1.0), + topk=(1, 5)), + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.2), + dict(type='CutMix', alpha=1.0) + ])) + +# dataset settings +train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True)) + +# schedule settings +optim_wrapper = dict( + optimizer=dict(weight_decay=0.01), + 
paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.), +) + +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=True, + begin=0, + end=5, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict( + type='CosineAnnealingLR', + T_max=595, + eta_min=1.0e-6, + by_epoch=True, + begin=5, + end=600) +] + +train_cfg = dict(by_epoch=True, max_epochs=600) diff --git a/mmdetection/configs/rtmdet/classification/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py b/mmdetection/configs/rtmdet/classification/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000..af3170b --- /dev/null +++ b/mmdetection/configs/rtmdet/classification/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,5 @@ +_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py' + +model = dict( + backbone=dict(deepen_factor=0.167, widen_factor=0.375), + head=dict(in_channels=384)) diff --git a/mmdetection/configs/rtmdet/classification/cspnext-x_8xb256-rsb-a1-600e_in1k.py b/mmdetection/configs/rtmdet/classification/cspnext-x_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000..edec48d --- /dev/null +++ b/mmdetection/configs/rtmdet/classification/cspnext-x_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,5 @@ +_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py' + +model = dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + head=dict(in_channels=1280)) diff --git a/mmdetection/configs/rtmdet/metafile.yml b/mmdetection/configs/rtmdet/metafile.yml new file mode 100644 index 0000000..7dc72e1 --- /dev/null +++ b/mmdetection/configs/rtmdet/metafile.yml @@ -0,0 +1,200 @@ +Collections: + - Name: RTMDet + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Flat Cosine Annealing + Training Resources: 8x A100 GPUs + Architecture: + - CSPNeXt + - CSPNeXtPAFPN + README: configs/rtmdet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v3.0.0rc1/mmdet/models/detectors/rtmdet.py#L6 + Version: v3.0.0rc1 + +Models: + - Name: rtmdet_tiny_8xb32-300e_coco + Alias: + - rtmdet-t + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 11.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth + + - Name: rtmdet_s_8xb32-300e_coco + Alias: + - rtmdet-s + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_s_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 15.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_s_8xb32-300e_coco/rtmdet_s_8xb32-300e_coco_20220905_161602-387a891e.pth + + - Name: rtmdet_m_8xb32-300e_coco + Alias: + - rtmdet-m + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_m_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 27.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.1 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_m_8xb32-300e_coco/rtmdet_m_8xb32-300e_coco_20220719_112220-229f527c.pth + + - Name: rtmdet_l_8xb32-300e_coco + Alias: + - rtmdet-l + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_l_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 43.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box 
AP: 51.3 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_l_8xb32-300e_coco/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth + + - Name: rtmdet_x_8xb32-300e_coco + Alias: + - rtmdet-x + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_x_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 61.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.6 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_x_8xb32-300e_coco/rtmdet_x_8xb32-300e_coco_20220715_230555-cc79b9ae.pth + + - Name: rtmdet_x_p6_4xb8-300e_coco + Alias: + - rtmdet-x_p6 + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py + Metadata: + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 54.9 + Weights: https://github.com/orange0-jp/orange-weights/releases/download/v0.1.0rtmdet-p6/rtmdet_x_p6_4xb8-300e_coco-bf32be58.pth + + - Name: rtmdet-ins_tiny_8xb32-300e_coco + Alias: + - rtmdet-ins-t + In Collection: RTMDet + Config: configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 18.4 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.4 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco/rtmdet-ins_tiny_8xb32-300e_coco_20221130_151727-ec670f7e.pth + + - Name: rtmdet-ins_s_8xb32-300e_coco + Alias: + - rtmdet-ins-s + In Collection: RTMDet + Config: configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 27.6 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 38.7 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_s_8xb32-300e_coco/rtmdet-ins_s_8xb32-300e_coco_20221121_212604-fdc5d7ec.pth + + - Name: rtmdet-ins_m_8xb32-300e_coco + Alias: + - rtmdet-ins-m + In Collection: RTMDet + Config: configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 42.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.1 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_m_8xb32-300e_coco/rtmdet-ins_m_8xb32-300e_coco_20221123_001039-6eba602e.pth + + - Name: rtmdet-ins_l_8xb32-300e_coco + Alias: + - rtmdet-ins-l + In Collection: RTMDet + Config: configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 59.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.7 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_l_8xb32-300e_coco/rtmdet-ins_l_8xb32-300e_coco_20221124_103237-78d1d652.pth + + - Name: rtmdet-ins_x_8xb16-300e_coco + Alias: + - rtmdet-ins-x + In Collection: RTMDet + Config: configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 33.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.4 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 44.6 + Weights: https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet-ins_x_8xb16-300e_coco/rtmdet-ins_x_8xb16-300e_coco_20221124_111313-33d4595b.pth diff --git 
a/mmdetection/configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py new file mode 100644 index 0000000..6b4b924 --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet-ins_l_8xb32-300e_coco.py @@ -0,0 +1,104 @@ +_base_ = './rtmdet_l_8xb32-300e_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='RTMDetInsSepBNHead', + num_classes=80, + in_channels=256, + stacked_convs=2, + share_conv=True, + pred_kernel_size=1, + feat_channels=256, + act_cfg=dict(type='SiLU', inplace=True), + norm_cfg=dict(type='SyncBN', requires_grad=True), + anchor_generator=dict( + type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]), + bbox_coder=dict(type='DistancePointBBoxCoder'), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + loss_mask=dict( + type='DiceLoss', loss_weight=2.0, eps=5e-6, reduction='mean')), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100, + mask_thr_binary=0.5), +) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomResize', + scale=(1280, 1280), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type='CachedMixUp', + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)), + dict(type='PackDetInputs') +] + +train_dataloader = dict(pin_memory=True, dataset=dict(pipeline=train_pipeline)) + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='RandomResize', + scale=(640, 640), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict(metric=['bbox', 'segm']) +test_evaluator = val_evaluator diff --git a/mmdetection/configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py new file mode 100644 index 0000000..66da914 --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet-ins_m_8xb32-300e_coco.py @@ -0,0 +1,6 @@ +_base_ = './rtmdet-ins_l_8xb32-300e_coco.py' + +model = dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + bbox_head=dict(in_channels=192, feat_channels=192)) diff --git 
a/mmdetection/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py new file mode 100644 index 0000000..28bc21c --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet-ins_s_8xb32-300e_coco.py @@ -0,0 +1,80 @@ +_base_ = './rtmdet-ins_l_8xb32-300e_coco.py' +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa +model = dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.5, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict(in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), + bbox_head=dict(in_channels=128, feat_channels=128)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomResize', + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type='CachedMixUp', + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='RandomResize', + scale=(640, 640), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/mmdetection/configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py new file mode 100644 index 0000000..954f911 --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet-ins_tiny_8xb32-300e_coco.py @@ -0,0 +1,48 @@ +_base_ = './rtmdet-ins_s_8xb32-300e_coco.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +model = dict( + backbone=dict( + deepen_factor=0.167, + widen_factor=0.375, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict(in_channels=[96, 192, 384], out_channels=96, num_csp_blocks=1), + bbox_head=dict(in_channels=96, feat_channels=96)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='CachedMosaic', + img_scale=(640, 640), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type='RandomResize', + 
scale=(1280, 1280), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type='CachedMixUp', + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=10, + random_pop=False, + pad_val=(114, 114, 114), + prob=0.5), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1)), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py new file mode 100644 index 0000000..daaa640 --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet-ins_x_8xb16-300e_coco.py @@ -0,0 +1,31 @@ +_base_ = './rtmdet-ins_l_8xb32-300e_coco.py' + +model = dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + neck=dict( + in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4), + bbox_head=dict(in_channels=320, feat_channels=320)) + +base_lr = 0.002 + +# optimizer +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=_base_.max_epochs // 2, + end=_base_.max_epochs, + T_max=_base_.max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] diff --git a/mmdetection/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py new file mode 100644 index 0000000..1cce4d8 --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py @@ -0,0 +1,179 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/schedules/schedule_1x.py', + '../_base_/datasets/coco_detection.py', './rtmdet_tta.py' +] +model = dict( + type='RTMDet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False, + batch_augments=None), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=1, + widen_factor=1, + channel_attention=True, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='CSPNeXtPAFPN', + in_channels=[256, 512, 1024], + out_channels=256, + num_csp_blocks=3, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='RTMDetSepBNHead', + num_classes=80, + in_channels=256, + stacked_convs=2, + feat_channels=256, + anchor_generator=dict( + type='MlvlPointGenerator', offset=0, strides=[8, 16, 32]), + bbox_coder=dict(type='DistancePointBBoxCoder'), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + with_objectness=False, + exp_on_reg=True, + share_conv=True, + pred_kernel_size=1, + norm_cfg=dict(type='SyncBN'), + act_cfg=dict(type='SiLU', inplace=True)), + train_cfg=dict( + assigner=dict(type='DynamicSoftLabelAssigner', topk=13), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300), +) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', 
with_bbox=True), + dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomResize', + scale=(1280, 1280), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type='CachedMixUp', + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(640, 640), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(640, 640), keep_ratio=True), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=32, + num_workers=10, + batch_sampler=None, + pin_memory=True, + dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=5, num_workers=10, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +max_epochs = 300 +stage2_num_epochs = 20 +base_lr = 0.004 +interval = 10 + +train_cfg = dict( + max_epochs=max_epochs, + val_interval=interval, + dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)]) + +val_evaluator = dict(proposal_nums=(100, 1, 10)) +test_evaluator = val_evaluator + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks = dict( + checkpoint=dict( + interval=interval, + max_keep_ckpts=3 # only keep latest 3 checkpoints + )) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] diff --git a/mmdetection/configs/rtmdet/rtmdet_m_8xb32-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet_m_8xb32-300e_coco.py new file mode 100644 index 0000000..c83f5a6 --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet_m_8xb32-300e_coco.py @@ -0,0 +1,6 @@ +_base_ = './rtmdet_l_8xb32-300e_coco.py' + +model = dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + bbox_head=dict(in_channels=192, feat_channels=192)) diff --git a/mmdetection/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 
b/mmdetection/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py new file mode 100644 index 0000000..cbf7624 --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet_s_8xb32-300e_coco.py @@ -0,0 +1,62 @@ +_base_ = './rtmdet_l_8xb32-300e_coco.py' +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa +model = dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.5, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict(in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), + bbox_head=dict(in_channels=128, feat_channels=128, exp_on_reg=False)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='CachedMosaic', img_scale=(640, 640), pad_val=114.0), + dict( + type='RandomResize', + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type='CachedMixUp', + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(640, 640), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/mmdetection/configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py new file mode 100644 index 0000000..a686f4a --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py @@ -0,0 +1,43 @@ +_base_ = './rtmdet_s_8xb32-300e_coco.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +model = dict( + backbone=dict( + deepen_factor=0.167, + widen_factor=0.375, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict(in_channels=[96, 192, 384], out_channels=96, num_csp_blocks=1), + bbox_head=dict(in_channels=96, feat_channels=96, exp_on_reg=False)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='CachedMosaic', + img_scale=(640, 640), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type='RandomResize', + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(640, 640)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type='CachedMixUp', + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=10, + random_pop=False, + pad_val=(114, 114, 114), 
+ prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/rtmdet/rtmdet_tta.py b/mmdetection/configs/rtmdet/rtmdet_tta.py new file mode 100644 index 0000000..6dde36d --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet_tta.py @@ -0,0 +1,36 @@ +tta_model = dict( + type='DetTTAModel', + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.6), max_per_img=100)) + +img_scales = [(640, 640), (320, 320), (960, 960)] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale=s, keep_ratio=True) + for s in img_scales + ], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) + ], + [ + dict( + type='Pad', + size=(960, 960), + pad_val=dict(img=(114, 114, 114))), + ], + [dict(type='LoadAnnotations', with_bbox=True)], + [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/mmdetection/configs/rtmdet/rtmdet_x_8xb32-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet_x_8xb32-300e_coco.py new file mode 100644 index 0000000..16a3363 --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet_x_8xb32-300e_coco.py @@ -0,0 +1,7 @@ +_base_ = './rtmdet_l_8xb32-300e_coco.py' + +model = dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + neck=dict( + in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4), + bbox_head=dict(in_channels=320, feat_channels=320)) diff --git a/mmdetection/configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py b/mmdetection/configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py new file mode 100644 index 0000000..d1bb7fa --- /dev/null +++ b/mmdetection/configs/rtmdet/rtmdet_x_p6_4xb8-300e_coco.py @@ -0,0 +1,132 @@ +_base_ = './rtmdet_x_8xb32-300e_coco.py' + +model = dict( + backbone=dict(arch='P6', out_indices=(2, 3, 4, 5)), + neck=dict(in_channels=[320, 640, 960, 1280]), + bbox_head=dict( + anchor_generator=dict( + type='MlvlPointGenerator', offset=0, strides=[8, 16, 32, 64]))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='CachedMosaic', img_scale=(1280, 1280), pad_val=114.0), + dict( + type='RandomResize', + scale=(2560, 2560), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(1280, 1280)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))), + dict( + type='CachedMixUp', + img_scale=(1280, 1280), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type='PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(1280, 1280), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(1280, 1280)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1280, 1280), keep_ratio=True), + 
dict(type='Pad', size=(1280, 1280), pad_val=dict(img=(114, 114, 114))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=8, num_workers=20, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=5, num_workers=20, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +max_epochs = 300 +stage2_num_epochs = 20 + +base_lr = 0.004 * 32 / 256 +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0e-5, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type='PipelineSwitchHook', + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] + +img_scales = [(1280, 1280), (640, 640), (1920, 1920)] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale=s, keep_ratio=True) + for s in img_scales + ], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) + ], + [ + dict( + type='Pad', + size=(1920, 1920), + pad_val=dict(img=(114, 114, 114))), + ], + [dict(type='LoadAnnotations', with_bbox=True)], + [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/mmdetection/configs/sabl/README.md b/mmdetection/configs/sabl/README.md new file mode 100644 index 0000000..c730729 --- /dev/null +++ b/mmdetection/configs/sabl/README.md @@ -0,0 +1,47 @@ +# SABL + +> [Side-Aware Boundary Localization for More Precise Object Detection](https://arxiv.org/abs/1912.04260) + + + +## Abstract + +Current object detection frameworks mainly rely on bounding box regression to localize objects. Despite the remarkable progress in recent years, the precision of bounding box regression remains unsatisfactory, hence limiting performance in object detection. We observe that precise localization requires careful placement of each side of the bounding box. However, the mainstream approach, which focuses on predicting centers and sizes, is not the most effective way to accomplish this task, especially when there exists displacements with large variance between the anchors and the targets. In this paper, we propose an alternative approach, named as Side-Aware Boundary Localization (SABL), where each side of the bounding box is respectively localized with a dedicated network branch. To tackle the difficulty of precise localization in the presence of displacements with large variance, we further propose a two-step localization scheme, which first predicts a range of movement through bucket prediction and then pinpoints the precise position within the predicted bucket. We test the proposed method on both two-stage and single-stage detection frameworks. 
Replacing the standard bounding box regression branch with the proposed design leads to significant improvements on Faster R-CNN, RetinaNet, and Cascade R-CNN, by 3.0%, 1.7%, and 0.9%, respectively. + +
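+The two-step scheme above is easy to picture for a single box side. The sketch below is a simplified illustration, not the `SABLHead`/`BucketingBBoxCoder` implementation: the function name and decoding details are assumptions, while `num_buckets=14` and the `scale_factor` values echo the configs in this folder.
+
+```python
+import numpy as np
+
+def localize_side(lo, hi, bucket_scores, bucket_offsets,
+                  num_buckets=14, scale_factor=1.7):
+    """Two-step localization of one box side (illustrative only).
+
+    lo, hi:          proposal extent along this axis.
+    bucket_scores:   (num_buckets,) classification scores, one per bucket.
+    bucket_offsets:  (num_buckets,) fine offsets in units of the bucket width.
+    """
+    # Step 1 (coarse): enlarge the proposal range by `scale_factor`, split it
+    # into equal-width buckets, and pick the most confident bucket.
+    center = 0.5 * (lo + hi)
+    half_range = 0.5 * scale_factor * (hi - lo)
+    bucket_width = 2.0 * half_range / num_buckets
+    centers = center - half_range + (np.arange(num_buckets) + 0.5) * bucket_width
+    best = int(np.argmax(bucket_scores))
+
+    # Step 2 (fine): nudge the chosen bucket center by its regressed offset.
+    return centers[best] + bucket_offsets[best] * bucket_width
+
+# Example: refine the right edge of a proposal spanning [100, 200].
+scores = np.zeros(14)
+scores[9] = 1.0                 # bucket 9 is predicted to contain the edge
+offsets = np.full(14, 0.25)     # each offset shifts by a quarter bucket
+print(localize_side(100.0, 200.0, scores, offsets))
+```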
    + +## Results and Models + +The results on COCO 2017 val is shown in the below table. (results on test-dev are usually slightly higher than val). +Single-scale testing (1333x800) is adopted in all results. + +| Method | Backbone | Lr schd | ms-train | box AP | Config | Download | +| :----------------: | :-------: | :-----: | :------: | :----: | :-----------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SABL Faster R-CNN | R-50-FPN | 1x | N | 39.9 | [config](./sabl-faster-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/20200830_130324.log.json) | +| SABL Faster R-CNN | R-101-FPN | 1x | N | 41.7 | [config](./sabl-faster-rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/sabl_faster_rcnn_r101_fpn_1x_coco-f804c6c1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/20200830_183949.log.json) | +| SABL Cascade R-CNN | R-50-FPN | 1x | N | 41.6 | [config](./sabl-cascade-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/sabl_cascade_rcnn_r50_fpn_1x_coco-e1748e5e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/20200831_033726.log.json) | +| SABL Cascade R-CNN | R-101-FPN | 1x | N | 43.0 | [config](./sabl-cascade-rcnn_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/sabl_cascade_rcnn_r101_fpn_1x_coco-2b83e87c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/20200831_141745.log.json) | + +| Method | Backbone | GN | Lr schd | ms-train | box AP | Config | Download | +| :------------: | :-------: | :-: | :-----: | :---------: | :----: | :----------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SABL RetinaNet | R-50-FPN | N | 1x | N | 37.7 | [config](./sabl-retinanet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/20200830_053451.log.json) | +| SABL RetinaNet | R-50-FPN | Y | 1x | N | 38.8 | [config](./sabl-retinanet_r50-gn_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/sabl_retinanet_r50_fpn_gn_1x_coco-e16dfcf1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/20200831_141955.log.json) | +| SABL RetinaNet | R-101-FPN | N | 1x | N | 39.7 | [config](./sabl-retinanet_r101_fpn_1x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/sabl_retinanet_r101_fpn_1x_coco-42026904.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/20200831_034256.log.json) | +| SABL RetinaNet | R-101-FPN | Y | 1x | N | 40.5 | [config](./sabl-retinanet_r101-gn_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/sabl_retinanet_r101_fpn_gn_1x_coco-40a893e8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/20200830_201422.log.json) | +| SABL RetinaNet | R-101-FPN | Y | 2x | Y (640~800) | 42.9 | [config](./sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco-1e63382c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/20200830_144807.log.json) | +| SABL RetinaNet | R-101-FPN | Y | 2x | Y (480~960) | 43.6 | [config](./sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco-5342f857.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/20200830_164537.log.json) | + +## Citation + +We provide config files to reproduce the object detection results in the ECCV 2020 Spotlight paper for [Side-Aware Boundary Localization for More Precise Object Detection](https://arxiv.org/abs/1912.04260). + +```latex +@inproceedings{Wang_2020_ECCV, + title = {Side-Aware Boundary Localization for More Precise Object Detection}, + author = {Jiaqi Wang and Wenwei Zhang and Yuhang Cao and Kai Chen and Jiangmiao Pang and Tao Gong and Jianping Shi and Chen Change Loy and Dahua Lin}, + booktitle = {ECCV}, + year = {2020} +} +``` diff --git a/mmdetection/configs/sabl/metafile.yml b/mmdetection/configs/sabl/metafile.yml new file mode 100644 index 0000000..632b869 --- /dev/null +++ b/mmdetection/configs/sabl/metafile.yml @@ -0,0 +1,140 @@ +Collections: + - Name: SABL + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - SABL + Paper: + URL: https://arxiv.org/abs/1912.04260 + Title: 'Side-Aware Boundary Localization for More Precise Object Detection' + README: configs/sabl/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/roi_heads/bbox_heads/sabl_head.py#L14 + Version: v2.4.0 + +Models: + - Name: sabl-faster-rcnn_r50_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r50_fpn_1x_coco/sabl_faster_rcnn_r50_fpn_1x_coco-e867595b.pth + + - Name: sabl-faster-rcnn_r101_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_faster_rcnn_r101_fpn_1x_coco/sabl_faster_rcnn_r101_fpn_1x_coco-f804c6c1.pth + + - Name: 
sabl-cascade-rcnn_r50_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r50_fpn_1x_coco/sabl_cascade_rcnn_r50_fpn_1x_coco-e1748e5e.pth + + - Name: sabl-cascade-rcnn_r101_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_cascade_rcnn_r101_fpn_1x_coco/sabl_cascade_rcnn_r101_fpn_1x_coco-2b83e87c.pth + + - Name: sabl-retinanet_r50_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_1x_coco/sabl_retinanet_r50_fpn_1x_coco-6c54fd4f.pth + + - Name: sabl-retinanet_r50-gn_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r50_fpn_gn_1x_coco/sabl_retinanet_r50_fpn_gn_1x_coco-e16dfcf1.pth + + - Name: sabl-retinanet_r101_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 39.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_1x_coco/sabl_retinanet_r101_fpn_1x_coco-42026904.pth + + - Name: sabl-retinanet_r101-gn_fpn_1x_coco + In Collection: SABL + Config: configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_1x_coco/sabl_retinanet_r101_fpn_gn_1x_coco-40a893e8.pth + + - Name: sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco + In Collection: SABL + Config: configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco/sabl_retinanet_r101_fpn_gn_2x_ms_640_800_coco-1e63382c.pth + + - Name: sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco + In Collection: SABL + Config: configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sabl/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco/sabl_retinanet_r101_fpn_gn_2x_ms_480_960_coco-5342f857.pth diff --git a/mmdetection/configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py b/mmdetection/configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..404e7fc --- /dev/null +++ b/mmdetection/configs/sabl/sabl-cascade-rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,90 @@ +_base_ = [ + '../_base_/models/cascade-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] 
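+# Note on the SABL settings below: every cascade stage uses a SABLHead whose
+# BucketingBBoxCoder splits each side of the search region into num_buckets=14
+# bins (14 = roi_feat_size 7 x reg_feat_up_ratio 2 in this config), and the
+# coder's scale_factor shrinks over the three stages (1.7 -> 1.5 -> 1.3) so
+# later stages search a tighter range around the already-refined proposals.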
+# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + roi_head=dict(bbox_head=[ + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.5), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.3), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0)) + ])) diff --git a/mmdetection/configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..69c59ca --- /dev/null +++ b/mmdetection/configs/sabl/sabl-cascade-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,86 @@ +_base_ = [ + '../_base_/models/cascade-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + roi_head=dict(bbox_head=[ + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + 
roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.5), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.3), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0)) + ])) diff --git a/mmdetection/configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py b/mmdetection/configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..d1bf8b9 --- /dev/null +++ b/mmdetection/configs/sabl/sabl-faster-rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + roi_head=dict( + bbox_head=dict( + _delete_=True, + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)))) diff --git a/mmdetection/configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..a727bd6 --- /dev/null +++ b/mmdetection/configs/sabl/sabl-faster-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,34 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict( + _delete_=True, + type='SABLHead', + num_classes=80, + cls_in_channels=256, + reg_in_channels=256, + roi_feat_size=7, + reg_feat_up_ratio=2, + reg_pre_kernel=3, + reg_post_kernel=3, + reg_pre_num=2, + reg_post_num=1, + cls_out_channels=1024, + reg_offset_out_channels=256, + reg_cls_out_channels=256, + num_cls_fcs=1, + num_reg_fcs=0, + reg_class_agnostic=True, + norm_cfg=None, + bbox_coder=dict( + type='BucketingBBoxCoder', 
num_buckets=14, scale_factor=1.7), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox_reg=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)))) diff --git a/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py b/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py new file mode 100644 index 0000000..f181ad6 --- /dev/null +++ b/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_1x_coco.py @@ -0,0 +1,57 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + norm_cfg=norm_cfg, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py b/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py new file mode 100644 index 0000000..dc7209a --- /dev/null +++ b/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-480-960-2x_coco.py @@ -0,0 +1,68 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + norm_cfg=norm_cfg, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, 
loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 480), (1333, 960)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py b/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py new file mode 100644 index 0000000..ac5f6d9 --- /dev/null +++ b/mmdetection/configs/sabl/sabl-retinanet_r101-gn_fpn_ms-640-800-2x_coco.py @@ -0,0 +1,68 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + norm_cfg=norm_cfg, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 480), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py b/mmdetection/configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py new file mode 100644 index 0000000..409695b --- /dev/null +++ b/mmdetection/configs/sabl/sabl-retinanet_r101_fpn_1x_coco.py @@ -0,0 +1,55 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict( + 
_delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py b/mmdetection/configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py new file mode 100644 index 0000000..4facdb6 --- /dev/null +++ b/mmdetection/configs/sabl/sabl-retinanet_r50-gn_fpn_1x_coco.py @@ -0,0 +1,53 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + norm_cfg=norm_cfg, + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py b/mmdetection/configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py new file mode 100644 index 0000000..9073d6f --- /dev/null +++ b/mmdetection/configs/sabl/sabl-retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,51 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + bbox_head=dict( + _delete_=True, + type='SABLRetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 
128]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='ApproxMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0.0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/scnet/README.md b/mmdetection/configs/scnet/README.md new file mode 100644 index 0000000..08dbfa8 --- /dev/null +++ b/mmdetection/configs/scnet/README.md @@ -0,0 +1,63 @@ +# SCNet + +> [SCNet: Training Inference Sample Consistency for Instance Segmentation](https://arxiv.org/abs/2012.10150) + + + +## Abstract + + + +Cascaded architectures have brought significant performance improvement in object detection and instance segmentation. However, there are lingering issues regarding the disparity in the Intersection-over-Union (IoU) distribution of the samples between training and inference. This disparity can potentially exacerbate detection accuracy. This paper proposes an architecture referred to as Sample Consistency Network (SCNet) to ensure that the IoU distribution of the samples at training time is close to that at inference time. Furthermore, SCNet incorporates feature relay and utilizes global contextual information to further reinforce the reciprocal relationships among classifying, detecting, and segmenting sub-tasks. Extensive experiments on the standard COCO dataset reveal the effectiveness of the proposed method over multiple evaluation metrics, including box AP, mask AP, and inference speed. In particular, while running 38% faster, the proposed SCNet improves the AP of the box and mask predictions by respectively 1.3 and 2.3 points compared to the strong Cascade Mask R-CNN baseline. + +
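+One way to read the sample-consistency idea above is sketched below: train the mask branch on the boxes produced by the last box stage, i.e. the boxes it will actually receive at inference, rather than on per-stage sampled proposals. The function is a schematic stand-in, not the `SCNetRoIHead` API; every name in it is illustrative.
+
+```python
+def cascade_roi_forward(feats, proposals, box_stages, mask_head,
+                        training=False, targets=None):
+    """Schematic cascade RoI head with train/inference sample consistency."""
+    boxes, losses = proposals, {}
+    for i, stage in enumerate(box_stages):
+        if training:
+            losses[f'bbox_stage{i}'] = stage.loss(feats, boxes, targets)
+        boxes = stage.refine(feats, boxes)   # progressively refined boxes
+
+    # The mask head always consumes the *final* boxes, so the IoU distribution
+    # it sees during training matches the one it sees at inference.
+    if training:
+        losses['mask'] = mask_head.loss(feats, boxes, targets)
+        return losses
+    return mask_head.predict(feats, boxes)
+```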
    + +## Dataset + +SCNet requires COCO and [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) dataset for training. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +| | ├── stuffthingmaps +``` + +## Results and Models + +The results on COCO 2017val are shown in the below table. (results on test-dev are usually slightly higher than val) + +| Backbone | Style | Lr schd | Mem (GB) | Inf speed (fps) | box AP | mask AP | TTA box AP | TTA mask AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :-------------: | :----: | :-----: | :--------: | :---------: | :------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-FPN | pytorch | 1x | 7.0 | 6.2 | 43.5 | 39.2 | 44.8 | 40.9 | [config](./scnet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco-c3f09857.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco_20210117_192725.log.json) | +| R-50-FPN | pytorch | 20e | 7.0 | 6.2 | 44.5 | 40.0 | 45.8 | 41.5 | [config](./scnet_r50_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco-a569f645.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco_20210116_060148.log.json) | +| R-101-FPN | pytorch | 20e | 8.9 | 5.8 | 45.8 | 40.9 | 47.3 | 42.7 | [config](./scnet_r101_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco-294e312c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco_20210118_175824.log.json) | +| X-101-64x4d-FPN | pytorch | 20e | 13.2 | 4.9 | 47.5 | 42.3 | 48.9 | 44.0 | [config](./scnet_x101-64x4d_fpn_20e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco-fb09dec9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco_20210120_045959.log.json) | + +### Notes + +- Training hyper-parameters are identical to those of [HTC](https://github.com/open-mmlab/mmdetection/tree/main/configs/htc). +- TTA means Test Time Augmentation, which applies horizontal flip and multi-scale testing. Refer to [config](./scnet_r50_fpn_1x_coco.py). + +## Citation + +We provide the code for reproducing experiment results of [SCNet](https://arxiv.org/abs/2012.10150). 
+ +```latex +@inproceedings{vu2019cascade, + title={SCNet: Training Inference Sample Consistency for Instance Segmentation}, + author={Vu, Thang and Haeyong, Kang and Yoo, Chang D}, + booktitle={AAAI}, + year={2021} +} +``` diff --git a/mmdetection/configs/scnet/metafile.yml b/mmdetection/configs/scnet/metafile.yml new file mode 100644 index 0000000..936d389 --- /dev/null +++ b/mmdetection/configs/scnet/metafile.yml @@ -0,0 +1,116 @@ +Collections: + - Name: SCNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - SCNet + Paper: + URL: https://arxiv.org/abs/2012.10150 + Title: 'SCNet: Training Inference Sample Consistency for Instance Segmentation' + README: configs/scnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.9.0/mmdet/models/detectors/scnet.py#L6 + Version: v2.9.0 + +Models: + - Name: scnet_r50_fpn_1x_coco + In Collection: SCNet + Config: configs/scnet/scnet_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 161.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_1x_coco/scnet_r50_fpn_1x_coco-c3f09857.pth + + - Name: scnet_r50_fpn_20e_coco + In Collection: SCNet + Config: configs/scnet/scnet_r50_fpn_20e_coco.py + Metadata: + Training Memory (GB): 7.0 + inference time (ms/im): + - value: 161.29 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r50_fpn_20e_coco/scnet_r50_fpn_20e_coco-a569f645.pth + + - Name: scnet_r101_fpn_20e_coco + In Collection: SCNet + Config: configs/scnet/scnet_r101_fpn_20e_coco.py + Metadata: + Training Memory (GB): 8.9 + inference time (ms/im): + - value: 172.41 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_r101_fpn_20e_coco/scnet_r101_fpn_20e_coco-294e312c.pth + + - Name: scnet_x101-64x4d_fpn_20e_coco + In Collection: SCNet + Config: configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py + Metadata: + Training Memory (GB): 13.2 + inference time (ms/im): + - value: 204.08 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (800, 1333) + Epochs: 20 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scnet/scnet_x101_64x4d_fpn_20e_coco/scnet_x101_64x4d_fpn_20e_coco-fb09dec9.pth diff --git a/mmdetection/configs/scnet/scnet_r101_fpn_20e_coco.py b/mmdetection/configs/scnet/scnet_r101_fpn_20e_coco.py new file mode 100644 index 0000000..ebba529 --- /dev/null +++ b/mmdetection/configs/scnet/scnet_r101_fpn_20e_coco.py @@ -0,0 +1,6 @@ +_base_ = 
'./scnet_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/scnet/scnet_r50_fpn_1x_coco.py b/mmdetection/configs/scnet/scnet_r50_fpn_1x_coco.py new file mode 100644 index 0000000..a0210fd --- /dev/null +++ b/mmdetection/configs/scnet/scnet_r50_fpn_1x_coco.py @@ -0,0 +1,138 @@ +_base_ = '../htc/htc_r50_fpn_1x_coco.py' +# model settings +model = dict( + type='SCNet', + roi_head=dict( + _delete_=True, + type='SCNetRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SCNetBBoxHead', + num_shared_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='SCNetBBoxHead', + num_shared_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='SCNetBBoxHead', + num_shared_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='SCNetMaskHead', + num_convs=12, + in_channels=256, + conv_out_channels=256, + num_classes=80, + conv_to_res=True, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='SCNetSemanticHead', + num_ins=5, + fusion_level=1, + seg_scale_factor=1 / 8, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + loss_seg=dict( + type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2), + conv_to_res=True), + glbctx_head=dict( + type='GlobalContextHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_weight=3.0, + conv_to_res=True), + feat_relay_head=dict( + type='FeatureRelayHead', + in_channels=1024, + out_conv_channels=256, + roi_feat_size=7, + scale_factor=2))) + +# TODO +# uncomment below code to enable test time augmentations +# img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +# test_pipeline = [ +# dict(type='LoadImageFromFile'), +# dict( +# type='MultiScaleFlipAug', +# img_scale=[(600, 900), (800, 
1200), (1000, 1500), (1200, 1800), +# (1400, 2100)], +# flip=True, +# transforms=[ +# dict(type='Resize', keep_ratio=True), +# dict(type='RandomFlip', flip_ratio=0.5), +# dict(type='Normalize', **img_norm_cfg), +# dict(type='Pad', size_divisor=32), +# dict(type='ImageToTensor', keys=['img']), +# dict(type='Collect', keys=['img']), +# ]) +# ] +# data = dict( +# val=dict(pipeline=test_pipeline), +# test=dict(pipeline=test_pipeline)) diff --git a/mmdetection/configs/scnet/scnet_r50_fpn_20e_coco.py b/mmdetection/configs/scnet/scnet_r50_fpn_20e_coco.py new file mode 100644 index 0000000..533e1b5 --- /dev/null +++ b/mmdetection/configs/scnet/scnet_r50_fpn_20e_coco.py @@ -0,0 +1,15 @@ +_base_ = './scnet_r50_fpn_1x_coco.py' +# learning policy +max_epochs = 20 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 19], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py b/mmdetection/configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py new file mode 100644 index 0000000..1e54b03 --- /dev/null +++ b/mmdetection/configs/scnet/scnet_x101-64x4d_fpn_20e_coco.py @@ -0,0 +1,15 @@ +_base_ = './scnet_r50_fpn_20e_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/scnet/scnet_x101-64x4d_fpn_8xb1-20e_coco.py b/mmdetection/configs/scnet/scnet_x101-64x4d_fpn_8xb1-20e_coco.py new file mode 100644 index 0000000..3cdce7d --- /dev/null +++ b/mmdetection/configs/scnet/scnet_x101-64x4d_fpn_8xb1-20e_coco.py @@ -0,0 +1,8 @@ +_base_ = './scnet_x101-64x4d_fpn_20e_coco.py' +train_dataloader = dict(batch_size=1, num_workers=1) + +optim_wrapper = dict(optimizer=dict(lr=0.01)) +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (1 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/mmdetection/configs/scratch/README.md b/mmdetection/configs/scratch/README.md new file mode 100644 index 0000000..7bdd8ff --- /dev/null +++ b/mmdetection/configs/scratch/README.md @@ -0,0 +1,35 @@ +# Scratch + +> [Rethinking ImageNet Pre-training](https://arxiv.org/abs/1811.08883) + + + +## Abstract + +We report competitive results on object detection and instance segmentation on the COCO dataset using standard models trained from random initialization. The results are no worse than their ImageNet pre-training counterparts even when using the hyper-parameters of the baseline system (Mask R-CNN) that were optimized for fine-tuning pre-trained models, with the sole exception of increasing the number of training iterations so the randomly initialized models may converge. Training from random initialization is surprisingly robust; our results hold even when: (i) using only 10% of the training data, (ii) for deeper and wider models, and (iii) for multiple tasks and metrics. Experiments show that ImageNet pre-training speeds up convergence early in training, but does not necessarily provide regularization or improve final target task accuracy. 
To push the envelope we demonstrate 50.9 AP on COCO object detection without using any external data, a result on par with the top COCO 2017 competition results that used ImageNet pre-training. These observations challenge the conventional wisdom of ImageNet pre-training for dependent tasks and we expect these discoveries will encourage people to rethink the current de facto paradigm of 'pre-training and fine-tuning' in computer vision. + +
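+For quick reference, the fragment below isolates the overrides that implement "training from scratch" in the two configs in this folder; it mirrors `faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py` rather than introducing anything new, with comments added for orientation.
+
+```python
+norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
+model = dict(
+    backbone=dict(
+        frozen_stages=-1,          # no backbone stages are frozen
+        zero_init_residual=False,  # keep the default init of residual norms
+        norm_cfg=norm_cfg,         # GroupNorm instead of (frozen) BatchNorm
+        init_cfg=None),            # no ImageNet checkpoint is loaded
+    neck=dict(norm_cfg=norm_cfg),
+    roi_head=dict(
+        bbox_head=dict(
+            type='Shared4Conv1FCBBoxHead',
+            conv_out_channels=256,
+            norm_cfg=norm_cfg)))
+
+# A "6x"-style schedule (73 epochs vs. 12 for "1x") gives the randomly
+# initialized model enough iterations to converge.
+max_epochs = 73
+train_cfg = dict(max_epochs=max_epochs)
+```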
    + +## Results and Models + +| Model | Backbone | Style | Lr schd | box AP | mask AP | Config | Download | +| :----------: | :------: | :-----: | :-----: | :----: | :-----: | :-------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Faster R-CNN | R-50-FPN | pytorch | 6x | 40.7 | | [config](./faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_bbox_mAP-0.407_20200201_193013-90813d01.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_20200201_193013.log.json) | +| Mask R-CNN | R-50-FPN | pytorch | 6x | 41.2 | 37.4 | [config](./mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_20200201_193051.log.json) | + +Note: + +- The above models are trained with 16 GPUs. + +## Citation + +```latex +@article{he2018rethinking, + title={Rethinking imagenet pre-training}, + author={He, Kaiming and Girshick, Ross and Doll{\'a}r, Piotr}, + journal={arXiv preprint arXiv:1811.08883}, + year={2018} +} +``` diff --git a/mmdetection/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py b/mmdetection/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py new file mode 100644 index 0000000..6e632b9 --- /dev/null +++ b/mmdetection/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + frozen_stages=-1, + zero_init_residual=False, + norm_cfg=norm_cfg, + init_cfg=None), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg))) + +optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.)) + +max_epochs = 73 + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[65, 71], + gamma=0.1) +] + +train_cfg = dict(max_epochs=max_epochs) + +# only keep latest 3 checkpoints +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) diff --git a/mmdetection/configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py b/mmdetection/configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py new file mode 100644 index 0000000..9796f50 --- /dev/null +++ b/mmdetection/configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py @@ -0,0 +1,40 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', 
'../_base_/default_runtime.py' +] +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + backbone=dict( + frozen_stages=-1, + zero_init_residual=False, + norm_cfg=norm_cfg, + init_cfg=None), + neck=dict(norm_cfg=norm_cfg), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=norm_cfg), + mask_head=dict(norm_cfg=norm_cfg))) + +optim_wrapper = dict(paramwise_cfg=dict(norm_decay_mult=0.)) + +max_epochs = 73 + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[65, 71], + gamma=0.1) +] + +train_cfg = dict(max_epochs=max_epochs) + +# only keep latest 3 checkpoints +default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) diff --git a/mmdetection/configs/scratch/metafile.yml b/mmdetection/configs/scratch/metafile.yml new file mode 100644 index 0000000..977b8e5 --- /dev/null +++ b/mmdetection/configs/scratch/metafile.yml @@ -0,0 +1,48 @@ +Collections: + - Name: Rethinking ImageNet Pre-training + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - RPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1811.08883 + Title: 'Rethinking ImageNet Pre-training' + README: configs/scratch/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.0.0/configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py + Version: v2.0.0 + +Models: + - Name: faster-rcnn_r50_fpn_gn-all_scratch_6x_coco + In Collection: Rethinking ImageNet Pre-training + Config: configs/scratch/faster-rcnn_r50-scratch_fpn_gn-all_6x_coco.py + Metadata: + Epochs: 72 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scratch/faster_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_faster_rcnn_r50_fpn_gn_6x_bbox_mAP-0.407_20200201_193013-90813d01.pth + + - Name: mask-rcnn_r50_fpn_gn-all_scratch_6x_coco + In Collection: Rethinking ImageNet Pre-training + Config: configs/scratch/mask-rcnn_r50-scratch_fpn_gn-all_6x_coco.py + Metadata: + Epochs: 72 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/scratch/mask_rcnn_r50_fpn_gn-all_scratch_6x_coco/scratch_mask_rcnn_r50_fpn_gn_6x_bbox_mAP-0.412__segm_mAP-0.374_20200201_193051-1e190a40.pth diff --git a/mmdetection/configs/seesaw_loss/README.md b/mmdetection/configs/seesaw_loss/README.md new file mode 100644 index 0000000..7077d75 --- /dev/null +++ b/mmdetection/configs/seesaw_loss/README.md @@ -0,0 +1,47 @@ +# Seesaw Loss + +> [Seesaw Loss for Long-Tailed Instance Segmentation](https://arxiv.org/abs/2008.10032) + + + +## Abstract + +Instance segmentation has witnessed a remarkable progress on class-balanced benchmarks. However, they fail to perform as accurately in real-world scenarios, where the category distribution of objects naturally comes with a long tail. Instances of head classes dominate a long-tailed dataset and they serve as negative samples of tail categories. The overwhelming gradients of negative samples on tail classes lead to a biased learning process for classifiers. Consequently, objects of tail categories are more likely to be misclassified as backgrounds or head categories. 
To tackle this problem, we propose Seesaw Loss to dynamically re-balance gradients of positive and negative samples for each category, with two complementary factors, i.e., mitigation factor and compensation factor. The mitigation factor reduces punishments to tail categories w.r.t. the ratio of cumulative training instances between different categories. Meanwhile, the compensation factor increases the penalty of misclassified instances to avoid false positives of tail categories. We conduct extensive experiments on Seesaw Loss with mainstream frameworks and different data sampling strategies. With a simple end-to-end training pipeline, Seesaw Loss obtains significant gains over Cross-Entropy Loss, and achieves state-of-the-art performance on LVIS dataset without bells and whistles. + +
+
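The re-balancing described above reduces to two per-class factors applied inside the softmax. The snippet below is a minimal sketch of that weighting with the `p=0.8`, `q=2.0` defaults used by the configs in this directory; it is only an illustration of the idea, not the `SeesawLoss` module shipped with MMDetection, and the `class_counts` tensor (cumulative number of training instances per category) is an assumed input.

```python
# Minimal sketch of Seesaw re-weighting inside a softmax cross-entropy.
# Illustration only -- not the SeesawLoss implementation in MMDetection.
import torch
import torch.nn.functional as F


def seesaw_cross_entropy(logits, labels, class_counts, p=0.8, q=2.0):
    """logits: (B, C); labels: (B,); class_counts: (C,) positive floats
    holding the cumulative number of training instances per category."""
    num_classes = logits.size(1)
    onehot = F.one_hot(labels, num_classes).float()

    # Mitigation factor M_ij: reduce punishment on classes rarer than the
    # ground-truth class i, scaled by (N_j / N_i) ** p.
    ratio = class_counts[None, :] / class_counts[labels][:, None]
    mitigation = torch.clamp(ratio, max=1.0) ** p

    # Compensation factor C_ij: re-amplify the penalty on negatives whose
    # predicted probability exceeds that of the ground-truth class.
    probs = logits.softmax(dim=1)
    comp = torch.clamp(probs / probs.gather(1, labels[:, None]), min=1.0) ** q

    # S_ij = M_ij * C_ij for j != i, and 1 for the ground-truth class itself.
    seesaw = mitigation * comp * (1.0 - onehot) + onehot

    # exp(z_j) * S_ij == exp(z_j + log S_ij), so fold the factor into logits.
    return F.cross_entropy(logits + seesaw.log(), labels)
```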
    + +- Please setup [LVIS dataset](../lvis/README.md) for MMDetection. + +- RFS indicates to use oversample strategy [here](../../docs/tutorials/customipredataset.md#class-balanced-dataset) with oversample threshold `1e-3`. + +## Results and models of Seasaw Loss on LVIS v1 dataset + +| Method | Backbone | Style | Lr schd | Data Sampler | Norm Mask | box AP | mask AP | Config | Download | +| :----------------: | :-------: | :-----: | :-----: | :----------: | :-------: | :----: | :-----: | :----------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Mask R-CNN | R-50-FPN | pytorch | 2x | random | N | 25.6 | 25.0 | [config](./mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-a698dd3d.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-50-FPN | pytorch | 2x | random | Y | 25.6 | 25.4 | [config](./mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a1c11314.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-101-FPN | pytorch | 2x | random | N | 27.4 | 26.7 | [config](./mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-8e6e6dd5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-101-FPN | pytorch | 2x | random | Y | 27.2 | 27.3 | [config](./mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a0b59c42.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-50-FPN | pytorch | 2x | RFS | N | 27.6 | 26.4 | [config](./mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-392a804b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-50-FPN | pytorch | 2x | RFS | Y | 27.6 | 26.8 | [config](./mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-cd0f6a12.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-101-FPN | 
pytorch | 2x | RFS | N | 28.9 | 27.6 | [config](./mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-e68eb464.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Mask R-CNN | R-101-FPN | pytorch | 2x | RFS | Y | 28.9 | 28.2 | [config](./mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-1d817139.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Cascade Mask R-CNN | R-101-FPN | pytorch | 2x | random | N | 33.1 | 29.2 | [config](./cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-71e2215e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Cascade Mask R-CNN | R-101-FPN | pytorch | 2x | random | Y | 33.0 | 30.0 | [config](./cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-8b5a6745.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | +| Cascade Mask R-CNN | R-101-FPN | pytorch | 2x | RFS | N | 30.0 | 29.3 | [config](./cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-5d8ca2a4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1.log.json) | +| Cascade Mask R-CNN | R-101-FPN | pytorch | 2x | RFS | Y | 32.8 | 30.1 | [config](./cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-c8551505.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1.log.json) | + +## Citation + +We provide config files to reproduce the instance segmentation performance in the CVPR 2021 paper for [Seesaw Loss for Long-Tailed Instance Segmentation](https://arxiv.org/abs/2008.10032). 
+ +```latex +@inproceedings{wang2021seesaw, + title={Seesaw Loss for Long-Tailed Instance Segmentation}, + author={Jiaqi Wang and Wenwei Zhang and Yuhang Zang and Yuhang Cao and Jiangmiao Pang and Tao Gong and Kai Chen and Ziwei Liu and Chen Change Loy and Dahua Lin}, + booktitle={Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition}, + year={2021} +} +``` diff --git a/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py new file mode 100644 index 0000000..2de87dc --- /dev/null +++ b/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py @@ -0,0 +1,5 @@ +_base_ = './cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py' # noqa: E501 +model = dict( + roi_head=dict( + mask_head=dict( + predictor_cfg=dict(type='NormedConv2d', tempearture=20)))) diff --git a/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py new file mode 100644 index 0000000..4d67ad7 --- /dev/null +++ b/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py @@ -0,0 +1,5 @@ +_base_ = './cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py' # noqa: E501 +model = dict( + roi_head=dict( + mask_head=dict( + predictor_cfg=dict(type='NormedConv2d', tempearture=20)))) diff --git a/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py new file mode 100644 index 0000000..2a1a87d --- /dev/null +++ b/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py @@ -0,0 +1,116 @@ +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + roi_head=dict( + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + 
target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +dataset_type = 'LVISV1Dataset' +data_root = 'data/lvis_v1/' +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/lvis_v1_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline)) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/lvis_v1_val.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='LVISMetric', + ann_file=data_root + 'annotations/lvis_v1_val.json', + metric=['bbox', 'segm']) +test_evaluator = val_evaluator + +train_cfg = dict(val_interval=24) diff --git a/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py new file mode 100644 index 0000000..0e7b4df --- /dev/null +++ b/mmdetection/configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py @@ -0,0 +1,95 @@ +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/lvis_v1_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + roi_head=dict( + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1203, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + 
p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) + +train_cfg = dict(val_interval=24) diff --git a/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py new file mode 100644 index 0000000..b518c21 --- /dev/null +++ b/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py' # noqa: E501 +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py new file mode 100644 index 0000000..008bbca --- /dev/null +++ b/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py' # noqa: E501 +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py new file mode 100644 index 0000000..8a0b675 --- /dev/null +++ b/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py new file mode 100644 index 0000000..6143231 --- /dev/null +++ b/mmdetection/configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py new file mode 100644 index 0000000..06d2438 --- /dev/null +++ b/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py @@ -0,0 +1,5 @@ +_base_ = 
'./mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py' +model = dict( + roi_head=dict( + mask_head=dict( + predictor_cfg=dict(type='NormedConv2d', tempearture=20)))) diff --git a/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py new file mode 100644 index 0000000..5fc68d3 --- /dev/null +++ b/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py @@ -0,0 +1,5 @@ +_base_ = './mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py' +model = dict( + roi_head=dict( + mask_head=dict( + predictor_cfg=dict(type='NormedConv2d', tempearture=20)))) diff --git a/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py new file mode 100644 index 0000000..25c646c --- /dev/null +++ b/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py @@ -0,0 +1,59 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict( + num_classes=1203, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0)), + mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + max_per_img=300))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +dataset_type = 'LVISV1Dataset' +data_root = 'data/lvis_v1/' +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/lvis_v1_train.json', + data_prefix=dict(img=''), + pipeline=train_pipeline)) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/lvis_v1_val.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='LVISMetric', + ann_file=data_root + 'annotations/lvis_v1_val.json', + metric=['bbox', 'segm']) +test_evaluator = val_evaluator + +train_cfg = dict(val_interval=24) diff --git a/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py b/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py new file mode 100644 index 0000000..d60320e --- /dev/null +++ b/mmdetection/configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/lvis_v1_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +model = dict( + roi_head=dict( + bbox_head=dict( + num_classes=1203, + cls_predictor_cfg=dict(type='NormedLinear', tempearture=20), + loss_cls=dict( + type='SeesawLoss', + p=0.8, + q=2.0, + num_classes=1203, + loss_weight=1.0)), + mask_head=dict(num_classes=1203)), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + # LVIS allows up to 300 + 
max_per_img=300))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) + +train_cfg = dict(val_interval=24) diff --git a/mmdetection/configs/seesaw_loss/metafile.yml b/mmdetection/configs/seesaw_loss/metafile.yml new file mode 100644 index 0000000..374b9cd --- /dev/null +++ b/mmdetection/configs/seesaw_loss/metafile.yml @@ -0,0 +1,203 @@ +Collections: + - Name: Seesaw Loss + Metadata: + Training Data: LVIS + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Softmax + - RPN + - Convolution + - Dense Connections + - FPN + - ResNet + - RoIAlign + - Seesaw Loss + Paper: + URL: https://arxiv.org/abs/2008.10032 + Title: 'Seesaw Loss for Long-Tailed Instance Segmentation' + README: configs/seesaw_loss/README.md + +Models: + - Name: mask-rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_random-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 25.6 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 25.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-a698dd3d.pth + - Name: mask-rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 25.6 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 25.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a1c11314.pth + - Name: mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 27.4 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 26.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-8e6e6dd5.pth + - Name: mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 27.2 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 27.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-a0b59c42.pth + - Name: mask-rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + 
Dataset: LVIS v1 + Metrics: + box AP: 27.6 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 26.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-392a804b.pth + - Name: mask-rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask-rcnn_r50_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 27.6 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 26.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r50_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-cd0f6a12.pth + - Name: mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 28.9 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 27.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-e68eb464.pth + - Name: mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 28.9 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 28.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-1d817139.pth + - Name: cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_random-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 33.1 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 29.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_mstrain_2x_lvis_v1-71e2215e.pth + - Name: cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_random-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 33.0 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 30.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_random_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-8b5a6745.pth + - Name: cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss_sample1e-3-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 30.0 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 29.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_mstrain_2x_lvis_v1-5d8ca2a4.pth + - Name: 
cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1 + In Collection: Seesaw Loss + Config: configs/seesaw_loss/cascade-mask-rcnn_r101_fpn_seesaw-loss-normed-mask_sample1e-3-ms-2x_lvis-v1.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: LVIS v1 + Metrics: + box AP: 32.8 + - Task: Instance Segmentation + Dataset: LVIS v1 + Metrics: + mask AP: 30.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/seesaw_loss/cascade_mask_rcnn_r101_fpn_sample1e-3_seesaw_loss_normed_mask_mstrain_2x_lvis_v1-c8551505.pth diff --git a/mmdetection/configs/selfsup_pretrain/README.md b/mmdetection/configs/selfsup_pretrain/README.md new file mode 100644 index 0000000..57537dd --- /dev/null +++ b/mmdetection/configs/selfsup_pretrain/README.md @@ -0,0 +1,109 @@ +# Backbones Trained by Self-Supervise Algorithms + + + +## Abstract + +Unsupervised image representations have significantly reduced the gap with supervised pretraining, notably with the recent achievements of contrastive learning methods. These contrastive methods typically work online and rely on a large number of explicit pairwise feature comparisons, which is computationally challenging. In this paper, we propose an online algorithm, SwAV, that takes advantage of contrastive methods without requiring to compute pairwise comparisons. Specifically, our method simultaneously clusters the data while enforcing consistency between cluster assignments produced for different augmentations (or views) of the same image, instead of comparing features directly as in contrastive learning. Simply put, we use a swapped prediction mechanism where we predict the cluster assignment of a view from the representation of another view. Our method can be trained with large and small batches and can scale to unlimited amounts of data. Compared to previous contrastive methods, our method is more memory efficient since it does not require a large memory bank or a special momentum network. In addition, we also propose a new data augmentation strategy, multi-crop, that uses a mix of views with different resolutions in place of two full-resolution views, without increasing the memory or compute requirements much. We validate our findings by achieving 75.3% top-1 accuracy on ImageNet with ResNet-50, as well as surpassing supervised pretraining on all the considered transfer tasks. + +
+
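As a quick illustration of the swapped-prediction idea summarized above, the sketch below writes out the SwAV loss in a few lines. It is schematic only: the soft cluster assignments `q1` and `q2` are assumed to come from the (omitted) Sinkhorn-Knopp equilibration step, and none of this is the code used to produce the released backbone.

```python
# Schematic SwAV swapped-prediction loss: each view predicts the cluster
# assignment (code) of the other view. Illustration only.
import torch
import torch.nn.functional as F


def swav_swapped_loss(z1, z2, q1, q2, prototypes, t=0.1):
    """z1, z2: (N, D) normalized features of two views; q1, q2: (N, K) soft
    cluster assignments (codes) for each view; prototypes: (K, D)."""
    p1 = F.log_softmax(z1 @ prototypes.t() / t, dim=1)  # predictions from view 1
    p2 = F.log_softmax(z2 @ prototypes.t() / t, dim=1)  # predictions from view 2
    # swapped: view 1 predicts the code of view 2 and vice versa
    return -0.5 * ((q2 * p1).sum(dim=1) + (q1 * p2).sum(dim=1)).mean()
```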
    + +We present Momentum Contrast (MoCo) for unsupervised visual representation learning. From a perspective on contrastive learning as dictionary look-up, we build a dynamic dictionary with a queue and a moving-averaged encoder. This enables building a large and consistent dictionary on-the-fly that facilitates contrastive unsupervised learning. MoCo provides competitive results under the common linear protocol on ImageNet classification. More importantly, the representations learned by MoCo transfer well to downstream tasks. MoCo can outperform its supervised pre-training counterpart in 7 detection/segmentation tasks on PASCAL VOC, COCO, and other datasets, sometimes surpassing it by large margins. This suggests that the gap between unsupervised and supervised representation learning has been largely closed in many vision tasks. + +
+
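In the same spirit, the following sketch summarizes the MoCo training step described above: a momentum-updated key encoder plus a queue of negative keys, optimized with an InfoNCE loss. The momentum `m`, temperature `t` and queue handling are assumed defaults for illustration, not the recipe behind the released checkpoint.

```python
# Simplified MoCo training step: momentum key encoder + queue of negatives.
# Illustration only; m, t and the queue handling are assumed defaults.
import torch
import torch.nn.functional as F


def moco_step(x_q, x_k, encoder_q, encoder_k, queue, m=0.999, t=0.07):
    """x_q, x_k: two augmentations of the same batch; queue: (K, C) keys."""
    q = F.normalize(encoder_q(x_q), dim=1)               # queries (N, C)

    with torch.no_grad():
        # momentum update of the key encoder
        for p_k, p_q in zip(encoder_k.parameters(), encoder_q.parameters()):
            p_k.data.mul_(m).add_(p_q.data, alpha=1.0 - m)
        k = F.normalize(encoder_k(x_k), dim=1)           # keys (N, C)

    l_pos = (q * k).sum(dim=1, keepdim=True)             # (N, 1) positive logits
    l_neg = q @ queue.t()                                # (N, K) negative logits
    logits = torch.cat([l_pos, l_neg], dim=1) / t
    labels = torch.zeros(q.size(0), dtype=torch.long, device=q.device)
    loss = F.cross_entropy(logits, labels)               # InfoNCE

    # enqueue the new keys, dequeue the oldest ones (FIFO, fixed size K)
    queue = torch.cat([k, queue], dim=0)[:queue.size(0)]
    return loss, queue
```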
    + +## Usage + +To use a self-supervisely pretrained backbone, there are two steps to do: + +1. Download and convert the model to PyTorch-style supported by MMDetection +2. Modify the config and change the training setting accordingly + +### Convert model + +For more general usage, we also provide script `selfsup2mmdet.py` in the tools directory to convert the key of models pretrained by different self-supervised methods to PyTorch-style checkpoints used in MMDetection. + +```bash +python -u tools/model_converters/selfsup2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH} --selfsup ${method} +``` + +This script convert model from `PRETRAIN_PATH` and store the converted model in `STORE_PATH`. + +For example, to use a ResNet-50 backbone released by MoCo, you can download it from [here](https://dl.fbaipublicfiles.com/moco/moco_checkpoints/moco_v2_800ep/moco_v2_800ep_pretrain.pth.tar) and use the following command + +```bash +python -u tools/model_converters/selfsup2mmdet.py ./moco_v2_800ep_pretrain.pth.tar mocov2_r50_800ep_pretrain.pth --selfsup moco +``` + +To use the ResNet-50 backbone released by SwAV, you can download it from [here](https://dl.fbaipublicfiles.com/deepcluster/swav_800ep_pretrain.pth.tar) + +### Modify config + +The backbone requires SyncBN and the `frozen_stages` need to be changed. A config that use the moco backbone is as below + +```python +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + pretrained='./mocov2_r50_800ep_pretrain.pth', + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False)) + +``` + +## Results and Models + +| Method | Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :-------: | :------------------------------------------------------------: | :-----: | :------------: | :------: | :------------: | :----: | :-----: | :----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Mask RCNN | [R50 by MoCo v2](./mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py) | pytorch | 1x | | | 38.0 | 34.3 | [config](./mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco_20210604_114614-a8b63483.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_1x_coco_20210604_114614.log.json) | +| Mask RCNN | [R50 by MoCo v2](./mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py) | pytorch | multi-scale 2x | | | 40.8 | 36.8 | [config](./mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco_20210605_163717-d95df20a.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_mocov2-pretrain_ms-2x_coco_20210605_163717.log.json) | +| Mask RCNN | [R50 by SwAV](./mask-rcnn_r50-swav-pre_fpn_1x_coco.py) | pytorch | 1x | | | 39.1 | 35.7 | [config](./mask-rcnn_r50-swav-pre_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco/mask_rcnn_r50_fpn_swav-pretrain_1x_coco_20210604_114640-7b9baf28.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_1x_coco/mask_rcnn_r50_fpn_swav-pretrain_1x_coco_20210604_114640.log.json) | +| Mask RCNN | [R50 by SwAV](./mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py) | pytorch | multi-scale 2x | | | 41.3 | 37.3 | [config](./mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco_20210605_163717-08e26fca.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/selfsup_pretrain/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco/mask_rcnn_r50_fpn_swav-pretrain_ms-2x_coco_20210605_163717.log.json) | + +### Notice + +1. We only provide single-scale 1x and multi-scale 2x configs as examples to show how to use backbones trained by self-supervised algorithms. We will try to reproduce the results in their corresponding paper using the released backbone in the future. Please stay tuned. + +## Citation + +We support to apply the backbone models pre-trained by different self-supervised methods in detection systems and provide their results on Mask R-CNN. + +The pre-trained models are converted from [MoCo](https://github.com/facebookresearch/moco) and downloaded from [SwAV](https://github.com/facebookresearch/swav). 
+ +For SwAV, please cite + +```latex +@article{caron2020unsupervised, + title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments}, + author={Caron, Mathilde and Misra, Ishan and Mairal, Julien and Goyal, Priya and Bojanowski, Piotr and Joulin, Armand}, + booktitle={Proceedings of Advances in Neural Information Processing Systems (NeurIPS)}, + year={2020} +} +``` + +For MoCo, please cite + +```latex +@Article{he2019moco, + author = {Kaiming He and Haoqi Fan and Yuxin Wu and Saining Xie and Ross Girshick}, + title = {Momentum Contrast for Unsupervised Visual Representation Learning}, + journal = {arXiv preprint arXiv:1911.05722}, + year = {2019}, +} +@Article{chen2020mocov2, + author = {Xinlei Chen and Haoqi Fan and Ross Girshick and Kaiming He}, + title = {Improved Baselines with Momentum Contrastive Learning}, + journal = {arXiv preprint arXiv:2003.04297}, + year = {2020}, +} +``` diff --git a/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py b/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py new file mode 100644 index 0000000..91d45ad --- /dev/null +++ b/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + init_cfg=dict( + type='Pretrained', checkpoint='./mocov2_r50_800ep_pretrain.pth'))) diff --git a/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py b/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py new file mode 100644 index 0000000..ddaebf5 --- /dev/null +++ b/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-mocov2-pre_fpn_ms-2x_coco.py @@ -0,0 +1,25 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + init_cfg=dict( + type='Pretrained', checkpoint='./mocov2_r50_800ep_pretrain.pth'))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_1x_coco.py b/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_1x_coco.py new file mode 100644 index 0000000..785c80e --- /dev/null +++ b/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_1x_coco.py @@ -0,0 +1,13 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + init_cfg=dict( + type='Pretrained', checkpoint='./swav_800ep_pretrain.pth.tar'))) diff --git a/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py 
b/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py new file mode 100644 index 0000000..c393e0b --- /dev/null +++ b/mmdetection/configs/selfsup_pretrain/mask-rcnn_r50-swav-pre_fpn_ms-2x_coco.py @@ -0,0 +1,25 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +model = dict( + backbone=dict( + frozen_stages=0, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + init_cfg=dict( + type='Pretrained', checkpoint='./swav_800ep_pretrain.pth.tar'))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/simple_copy_paste/README.md b/mmdetection/configs/simple_copy_paste/README.md new file mode 100644 index 0000000..23b09ce --- /dev/null +++ b/mmdetection/configs/simple_copy_paste/README.md @@ -0,0 +1,38 @@ +# SimpleCopyPaste + +> [Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation](https://arxiv.org/abs/2012.07177) + + + +## Abstract + +Building instance segmentation models that are data-efficient and can handle rare object categories is an important challenge in computer vision. Leveraging data augmentations is a promising direction towards addressing this challenge. Here, we perform a systematic study of the Copy-Paste augmentation (\[13, 12\]) for instance segmentation where we randomly paste objects onto an image. Prior studies on Copy-Paste relied on modeling the surrounding visual context for pasting the objects. However, we find that the simple mechanism of pasting objects randomly is good enough and can provide solid gains on top of strong baselines. Furthermore, we show Copy-Paste is additive with semi-supervised methods that leverage extra data through pseudo labeling (e.g. self-training). On COCO instance segmentation, we achieve 49.1 mask AP and 57.3 box AP, an improvement of +0.6 mask AP and +1.5 box AP over the previous state-of-the-art. We further demonstrate that Copy-Paste can lead to significant improvements on the LVIS benchmark. Our baseline model outperforms the LVIS 2020 Challenge winning entry by +3.6 mask AP on rare categories. + +
+
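For orientation, the snippet below sketches how such an SSJ + Copy-Paste pipeline is typically expressed in an MMDetection config: an inner loading pipeline produces the scale-jittered, cropped image, and a `MultiImageMixDataset` wrapper lets the `CopyPaste` transform pull objects from a second image. The concrete values (crop size, `max_num_pasted`, dataset paths) are illustrative assumptions; the authoritative pipeline lives in the `../common/ssj_scp_270k_coco-instance.py` base used by the configs below.

```python
# Illustrative SSJ + Copy-Paste pipeline; values are assumptions for this
# sketch -- see ../common/ssj_scp_270k_coco-instance.py for the real setup.
image_size = (1024, 1024)

load_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    # Standard Scale Jittering: random resize in [0.8, 1.25] of image_size
    dict(
        type='RandomResize',
        scale=image_size,
        ratio_range=(0.8, 1.25),
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_size=image_size,
        crop_type='absolute',
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='RandomFlip', prob=0.5),
    dict(type='Pad', size=image_size),
]
train_pipeline = [
    # paste a random subset of objects from another image onto this one
    dict(type='CopyPaste', max_num_pasted=100),
    dict(type='PackDetInputs'),
]
train_dataloader = dict(
    dataset=dict(
        type='MultiImageMixDataset',
        dataset=dict(
            type='CocoDataset',
            data_root='data/coco/',
            ann_file='annotations/instances_train2017.json',
            data_prefix=dict(img='train2017/'),
            pipeline=load_pipeline),
        pipeline=train_pipeline))
```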
    + +## Results and Models + +### Mask R-CNN with Standard Scale Jittering (SSJ) and Simple Copy-Paste(SCP) + +Standard Scale Jittering(SSJ) resizes and crops an image with a resize range of 0.8 to 1.25 of the original image size, and Simple Copy-Paste(SCP) selects a random subset of objects from one of the images and pastes them onto the other image. + +| Backbone | Training schedule | Augmentation | batch size | box AP | mask AP | Config | Download | +| :------: | :---------------: | :----------: | :--------: | :----: | :-----: | :------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | 90k | SSJ | 64 | 43.3 | 39.0 | [config](./mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco_20220316_181409-f79c84c5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco_20220316_181409.log.json) | +| R-50 | 90k | SSJ+SCP | 64 | 43.8 | 39.2 | [config](./mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco_20220316_181307-6bc5726f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco_20220316_181307.log.json) | +| R-50 | 270k | SSJ | 64 | 43.5 | 39.1 | [config](./mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco_20220324_182940-33a100c5.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco_20220324_182940.log.json) | +| R-50 | 270k | SSJ+SCP | 64 | 45.1 | 40.3 | [config](./mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco_20220324_201229-80ee90b7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco_20220324_201229.log.json) | + +## Citation + +```latex +@inproceedings{ghiasi2021simple, + title={Simple copy-paste is a strong data augmentation method for instance segmentation}, + author={Ghiasi, 
Golnaz and Cui, Yin and Srinivas, Aravind and Qian, Rui and Lin, Tsung-Yi and Cubuk, Ekin D and Le, Quoc V and Zoph, Barret}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2918--2928}, + year={2021} +} +``` diff --git a/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py b/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py new file mode 100644 index 0000000..0c6e081 --- /dev/null +++ b/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py @@ -0,0 +1,31 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + # 270k iterations with batch_size 64 is roughly equivalent to 144 epochs + '../common/ssj_270k_coco-instance.py', +] + +image_size = (1024, 1024) +batch_augments = [ + dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) +] +norm_cfg = dict(type='SyncBN', requires_grad=True) +# Use MMSyncBN that handles empty tensor in head. It can be changed to +# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed +head_norm_cfg = dict(type='MMSyncBN', requires_grad=True) +model = dict( + # the model is trained from scratch, so init_cfg is None + data_preprocessor=dict( + # pad_size_divisor=32 is unnecessary in training but necessary + # in testing. + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg, init_cfg=None), + neck=dict(norm_cfg=norm_cfg), + rpn_head=dict(num_convs=2), # leads to 0.1+ mAP + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=head_norm_cfg), + mask_head=dict(norm_cfg=head_norm_cfg))) diff --git a/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py b/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py new file mode 100644 index 0000000..abe8962 --- /dev/null +++ b/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py @@ -0,0 +1,18 @@ +_base_ = 'mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py' # noqa + +# training schedule for 90k +max_iters = 90000 + +# learning rate policy +# lr steps at [0.9, 0.95, 0.975] of the maximum iterations +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=90000, + by_epoch=False, + milestones=[81000, 85500, 87750], + gamma=0.1) +] diff --git a/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py b/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py new file mode 100644 index 0000000..f0ea57d --- /dev/null +++ b/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py @@ -0,0 +1,31 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + # 270k iterations with batch_size 64 is roughly equivalent to 144 epochs + '../common/ssj_scp_270k_coco-instance.py' +] + +image_size = (1024, 1024) +batch_augments = [ + dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) +] +norm_cfg = dict(type='SyncBN', requires_grad=True) +# Use MMSyncBN that handles empty tensor in head. 
It can be changed to +# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed +head_norm_cfg = dict(type='MMSyncBN', requires_grad=True) +model = dict( + # the model is trained from scratch, so init_cfg is None + data_preprocessor=dict( + # pad_size_divisor=32 is unnecessary in training but necessary + # in testing. + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg, init_cfg=None), + neck=dict(norm_cfg=norm_cfg), + rpn_head=dict(num_convs=2), # leads to 0.1+ mAP + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=head_norm_cfg), + mask_head=dict(norm_cfg=head_norm_cfg))) diff --git a/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py b/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py new file mode 100644 index 0000000..e158b5c --- /dev/null +++ b/mmdetection/configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py @@ -0,0 +1,18 @@ +_base_ = 'mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py' # noqa + +# training schedule for 90k +max_iters = 90000 + +# learning rate policy +# lr steps at [0.9, 0.95, 0.975] of the maximum iterations +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.067, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=90000, + by_epoch=False, + milestones=[81000, 85500, 87750], + gamma=0.1) +] diff --git a/mmdetection/configs/simple_copy_paste/metafile.yml b/mmdetection/configs/simple_copy_paste/metafile.yml new file mode 100644 index 0000000..8a40b65 --- /dev/null +++ b/mmdetection/configs/simple_copy_paste/metafile.yml @@ -0,0 +1,92 @@ +Collections: + - Name: SimpleCopyPaste + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 32x A100 GPUs + Architecture: + - Softmax + - RPN + - Convolution + - Dense Connections + - FPN + - ResNet + - RoIAlign + Paper: + URL: https://arxiv.org/abs/2012.07177 + Title: "Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" + README: configs/simple_copy_paste/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.25.0/mmdet/datasets/pipelines/transforms.py#L2762 + Version: v2.25.0 + +Models: + - Name: mask-rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco + In Collection: SimpleCopyPaste + Config: configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-270k_coco.py + Metadata: + Training Memory (GB): 7.2 + Iterations: 270000 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_270k_coco_20220324_182940-33a100c5.pth + + - Name: mask-rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco + In Collection: SimpleCopyPaste + Config: configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-90k_coco.py + Metadata: + Training Memory (GB): 7.2 + Iterations: 90000 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.3 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.0 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_32x2_90k_coco_20220316_181409-f79c84c5.pth + + - Name: mask-rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco + In Collection: SimpleCopyPaste + Config: configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-270k_coco.py + Metadata: + Training Memory (GB): 7.2 + Iterations: 270000 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco_20220324_201229-80ee90b7.pth + + - Name: mask-rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco + In Collection: SimpleCopyPaste + Config: configs/simple_copy_paste/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_32xb2-ssj-scp-90k_coco.py + Metadata: + Training Memory (GB): 7.2 + Iterations: 90000 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_90k_coco_20220316_181307-6bc5726f.pth diff --git a/mmdetection/configs/soft_teacher/README.md b/mmdetection/configs/soft_teacher/README.md new file mode 100644 index 0000000..1fd3d84 --- /dev/null +++ b/mmdetection/configs/soft_teacher/README.md @@ -0,0 +1,33 @@ +# SoftTeacher + +> [End-to-End Semi-Supervised Object Detection with Soft Teacher](https://arxiv.org/abs/2106.09018) + + + +## Abstract + +This paper presents an end-to-end semi-supervised object detection approach, in contrast to previous more complex multi-stage methods. The end-to-end training gradually improves pseudo label qualities during the curriculum, and the more and more accurate pseudo labels in turn benefit object detection training. We also propose two simple yet effective techniques within this framework: a soft teacher mechanism where the classification loss of each unlabeled bounding box is weighed by the classification score produced by the teacher network; a box jittering approach to select reliable pseudo boxes for the learning of box regression. On the COCO benchmark, the proposed approach outperforms previous methods by a large margin under various labeling ratios, i.e. 1%, 5% and 10%. Moreover, our approach proves to perform also well when the amount of labeled data is relatively large. For example, it can improve a 40.9 mAP baseline detector trained using the full COCO training set by +3.6 mAP, reaching 44.5 mAP, by leveraging the 123K unlabeled images of COCO. On the state-of-the-art Swin Transformer based object detector (58.9 mAP on test-dev), it can still significantly improve the detection accuracy by +1.5 mAP, reaching 60.4 mAP, and improve the instance segmentation accuracy by +1.2 mAP, reaching 52.4 mAP. Further incorporating with the Object365 pre-trained model, the detection accuracy reaches 61.3 mAP and the instance segmentation accuracy reaches 53.0 mAP, pushing the new state-of-the-art. + +
    + +
    + +## Results and Models + +| Model | Detector | Labeled Dataset | Iteration | box AP | Config | Download | +| :---------: | :----------: | :-------------: | :-------: | :----: | :-----------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| SoftTeacher | Faster R-CNN | COCO-1% | 180k | 19.9 | [config](./soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230330_233412-3c8f6d4a.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230330_233412.log.json) | +| SoftTeacher | Faster R-CNN | COCO-2% | 180k | 24.9 | [config](./soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230331_020244-c0d2c3aa.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230331_020244.log.json) | +| SoftTeacher | Faster R-CNN | COCO-5% | 180k | 30.4 | [config](./soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230331_070656-308798ad.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230331_070656.log.json) | +| SoftTeacher | Faster R-CNN | COCO-10% | 180k | 33.8 | [config](./soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230330_232113-b46f78d0.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230330_232113.log.json) | + +## Citation + +```latex +@article{xu2021end, + title={End-to-End Semi-Supervised Object Detection with Soft Teacher}, + author={Xu, Mengde and Zhang, Zheng and Hu, Han and Wang, Jianfeng and Wang, Lijuan and Wei, Fangyun and Bai, Xiang and Liu, Zicheng}, + journal={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + year={2021} +} +``` diff --git a/mmdetection/configs/soft_teacher/metafile.yml b/mmdetection/configs/soft_teacher/metafile.yml new file mode 100644 index 0000000..9622ace --- /dev/null +++ b/mmdetection/configs/soft_teacher/metafile.yml @@ -0,0 +1,67 @@ +Collections: + - Name: SoftTeacher + Metadata: + Training Data: COCO + 
Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/2106.09018 + Title: "End-to-End Semi-Supervised Object Detection with Soft Teacher" + README: configs/soft_teacher/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v3.0.0rc1/mmdet/models/detectors/soft_teacher.py#L20 + Version: v3.0.0rc1 + +Models: + - Name: soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py + In Collection: SoftTeacher + Config: configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py + Metadata: + Iterations: 180000 + Results: + - Task: Semi-Supervised Object Detection + Dataset: COCO + Metrics: + box AP: 19.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230330_233412-3c8f6d4a.pth + + - Name: soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py + In Collection: SoftTeacher + Config: configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py + Metadata: + Iterations: 180000 + Results: + - Task: Semi-Supervised Object Detection + Dataset: COCO + Metrics: + box AP: 24.9 + Weights: https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230331_020244-c0d2c3aa.pth + + - Name: soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py + In Collection: SoftTeacher + Config: configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py + Metadata: + Iterations: 180000 + Results: + - Task: Semi-Supervised Object Detection + Dataset: COCO + Metrics: + box AP: 30.4 + Weights: https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230331_070656-308798ad.pth + + - Name: soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py + In Collection: SoftTeacher + Config: configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py + Metadata: + Iterations: 180000 + Results: + - Task: Semi-Supervised Object Detection + Dataset: COCO + Metrics: + box AP: 33.8 + Weights: https://download.openmmlab.com/mmdetection/v3.0/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0_20230330_232113-b46f78d0.pth diff --git a/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py b/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py new file mode 100644 index 0000000..2bd0964 --- /dev/null +++ b/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.01-coco.py @@ -0,0 +1,9 @@ +_base_ = ['soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py'] + +# 1% coco train2017 is set as labeled dataset +labeled_dataset = _base_.labeled_dataset +unlabeled_dataset = _base_.unlabeled_dataset +labeled_dataset.ann_file = 'semi_anns/instances_train2017.1@1.json' +unlabeled_dataset.ann_file = 'semi_anns/instances_train2017.1@1-unlabeled.json' +train_dataloader = dict( + dataset=dict(datasets=[labeled_dataset, unlabeled_dataset])) diff --git a/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py 
b/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py new file mode 100644 index 0000000..8ca38c9 --- /dev/null +++ b/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.02-coco.py @@ -0,0 +1,9 @@ +_base_ = ['soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py'] + +# 2% coco train2017 is set as labeled dataset +labeled_dataset = _base_.labeled_dataset +unlabeled_dataset = _base_.unlabeled_dataset +labeled_dataset.ann_file = 'semi_anns/instances_train2017.1@2.json' +unlabeled_dataset.ann_file = 'semi_anns/instances_train2017.1@2-unlabeled.json' +train_dataloader = dict( + dataset=dict(datasets=[labeled_dataset, unlabeled_dataset])) diff --git a/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py b/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py new file mode 100644 index 0000000..750b7ed --- /dev/null +++ b/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.05-coco.py @@ -0,0 +1,9 @@ +_base_ = ['soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py'] + +# 5% coco train2017 is set as labeled dataset +labeled_dataset = _base_.labeled_dataset +unlabeled_dataset = _base_.unlabeled_dataset +labeled_dataset.ann_file = 'semi_anns/instances_train2017.1@5.json' +unlabeled_dataset.ann_file = 'semi_anns/instances_train2017.1@5-unlabeled.json' +train_dataloader = dict( + dataset=dict(datasets=[labeled_dataset, unlabeled_dataset])) diff --git a/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py b/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py new file mode 100644 index 0000000..3713aef --- /dev/null +++ b/mmdetection/configs/soft_teacher/soft-teacher_faster-rcnn_r50-caffe_fpn_180k_semi-0.1-coco.py @@ -0,0 +1,84 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/default_runtime.py', + '../_base_/datasets/semi_coco_detection.py' +] + +detector = _base_.model +detector.data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32) +detector.backbone = dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')) + +model = dict( + _delete_=True, + type='SoftTeacher', + detector=detector, + data_preprocessor=dict( + type='MultiBranchDataPreprocessor', + data_preprocessor=detector.data_preprocessor), + semi_train_cfg=dict( + freeze_teacher=True, + sup_weight=1.0, + unsup_weight=4.0, + pseudo_label_initial_score_thr=0.5, + rpn_pseudo_thr=0.9, + cls_pseudo_thr=0.9, + reg_pseudo_thr=0.02, + jitter_times=10, + jitter_scale=0.06, + min_pseudo_bbox_wh=(1e-2, 1e-2)), + semi_test_cfg=dict(predict_on='teacher')) + +# 10% coco train2017 is set as labeled dataset +labeled_dataset = _base_.labeled_dataset +unlabeled_dataset = _base_.unlabeled_dataset +labeled_dataset.ann_file = 'semi_anns/instances_train2017.1@10.json' +unlabeled_dataset.ann_file = 'semi_anns/' \ + 'instances_train2017.1@10-unlabeled.json' +unlabeled_dataset.data_prefix = dict(img='train2017/') +train_dataloader = dict( + dataset=dict(datasets=[labeled_dataset, unlabeled_dataset])) + +# training schedule for 180k +train_cfg = dict( 
+    type='IterBasedTrainLoop', max_iters=180000, val_interval=5000)
+val_cfg = dict(type='TeacherStudentValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning rate policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=180000,
+        by_epoch=False,
+        milestones=[120000, 160000],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+
+default_hooks = dict(
+    checkpoint=dict(by_epoch=False, interval=10000, max_keep_ckpts=2))
+log_processor = dict(by_epoch=False)
+
+custom_hooks = [dict(type='MeanTeacherHook')]
diff --git a/mmdetection/configs/solo/README.md b/mmdetection/configs/solo/README.md
new file mode 100644
index 0000000..4a36676
--- /dev/null
+++ b/mmdetection/configs/solo/README.md
@@ -0,0 +1,54 @@
+# SOLO
+
+> [SOLO: Segmenting Objects by Locations](https://arxiv.org/abs/1912.04488)
+
+## Abstract
+
+We present a new, embarrassingly simple approach to instance segmentation in images. Compared to many other dense prediction tasks, e.g., semantic segmentation, it is the arbitrary number of instances that has made instance segmentation much more challenging. In order to predict a mask for each instance, mainstream approaches either follow the 'detect-then-segment' strategy as used by Mask R-CNN, or predict category masks first and then use clustering techniques to group pixels into individual instances. We view the task of instance segmentation from a completely new perspective by introducing the notion of "instance categories", which assigns categories to each pixel within an instance according to the instance's location and size, thus nicely converting instance mask segmentation into a classification-solvable problem. Now instance segmentation is decomposed into two classification tasks. We demonstrate a much simpler and more flexible instance segmentation framework with strong performance, achieving on-par accuracy with Mask R-CNN and outperforming recent single-shot instance segmenters in accuracy. We hope that this very simple and strong framework can serve as a baseline for many instance-level recognition tasks besides instance segmentation.
+
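To make the "instance category" idea concrete, here is a toy sketch of routing each ground-truth instance to a cell of an S×S grid by its centre. It is a simplification, not the `SOLOHead` target code: the real head assigns a centre region controlled by `pos_scale` and distributes instances over FPN levels via `scale_ranges`, and the function name and shapes here are assumptions.

```python
# Toy sketch of location-based category assignment; not the SOLOHead target code.
import torch


def assign_instance_categories(gt_boxes, gt_labels, num_grid=40, img_size=(800, 800)):
    """gt_boxes: (N, 4) xyxy, gt_labels: (N,) integer class ids.

    Returns an (S, S) map of class ids, with -1 marking cells that received
    no instance (background).
    """
    target = torch.full((num_grid, num_grid), -1, dtype=torch.long)
    img_h, img_w = img_size
    cx = (gt_boxes[:, 0] + gt_boxes[:, 2]) / 2
    cy = (gt_boxes[:, 1] + gt_boxes[:, 3]) / 2
    gx = (cx / img_w * num_grid).long().clamp(0, num_grid - 1)
    gy = (cy / img_h * num_grid).long().clamp(0, num_grid - 1)
    # Each occupied cell becomes a classification target; a parallel mask branch
    # predicts one mask per cell, so instance segmentation reduces to the two
    # classification-style tasks described in the abstract.
    target[gy, gx] = gt_labels
    return target
```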
    + +
+
+## Results and Models
+
+### SOLO
+
+| Backbone | Style | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP | Download |
+| :------: | :-----: | :------: | :-----: | :------: | :------------: | :-----: | :------: |
+| R-50 | pytorch | N | 1x | 8.0 | 14.0 | 33.1 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055-2290a6b8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055.log.json) |
+| R-50 | pytorch | Y | 3x | 7.4 | 14.0 | 35.9 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353.log.json) |
+
+### Decoupled SOLO
+
+| Backbone | Style | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP | Download |
+| :------: | :-----: | :------: | :-----: | :------: | :------------: | :-----: | :------: |
+| R-50 | pytorch | N | 1x | 7.8 | 12.5 | 33.9 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348.log.json) |
+| R-50 | pytorch | Y | 3x | 7.9 | 12.5 | 36.7 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504-7b3301ec.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504.log.json) |
+
+- Decoupled SOLO has a decoupled head, which differs from the SOLO head.
+  It is an efficient variant of SOLO that matches its accuracy.
+  Please refer to the corresponding config files for details.
+ +### Decoupled Light SOLO + +| Backbone | Style | MS train | Lr schd | Mem (GB) | Inf time (fps) | mask AP | Download | +| :------: | :-----: | :------: | :-----: | :------: | :------------: | :-----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | Y | 3x | 2.2 | 31.2 | 32.9 | [model](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703-e70e226f.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703.log.json) | + +- Decoupled Light SOLO using decoupled structure similar to Decoupled + SOLO head, with light-weight head and smaller input size, Please refer + to the corresponding config files for details. + +## Citation + +```latex +@inproceedings{wang2020solo, + title = {{SOLO}: Segmenting Objects by Locations}, + author = {Wang, Xinlong and Kong, Tao and Shen, Chunhua and Jiang, Yuning and Li, Lei}, + booktitle = {Proc. Eur. Conf. Computer Vision (ECCV)}, + year = {2020} +} +``` diff --git a/mmdetection/configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py b/mmdetection/configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py new file mode 100644 index 0000000..fc35df3 --- /dev/null +++ b/mmdetection/configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py @@ -0,0 +1,50 @@ +_base_ = './decoupled-solo_r50_fpn_3x_coco.py' + +# model settings +model = dict( + mask_head=dict( + type='DecoupledSOLOLightHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict( + type='DiceLoss', use_sigmoid=True, activate=False, + loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(852, 512), (852, 480), (852, 448), (852, 416), (852, 384), + (852, 352)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(852, 512), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/solo/decoupled-solo_r50_fpn_1x_coco.py b/mmdetection/configs/solo/decoupled-solo_r50_fpn_1x_coco.py new file mode 100644 index 0000000..6d7f4b9 --- /dev/null +++ b/mmdetection/configs/solo/decoupled-solo_r50_fpn_1x_coco.py @@ -0,0 +1,24 @@ +_base_ = './solo_r50_fpn_1x_coco.py' +# model settings +model = dict( + mask_head=dict( + 
type='DecoupledSOLOHead', + num_classes=80, + in_channels=256, + stacked_convs=7, + feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict( + type='DiceLoss', use_sigmoid=True, activate=False, + loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) diff --git a/mmdetection/configs/solo/decoupled-solo_r50_fpn_3x_coco.py b/mmdetection/configs/solo/decoupled-solo_r50_fpn_3x_coco.py new file mode 100644 index 0000000..4a8c19d --- /dev/null +++ b/mmdetection/configs/solo/decoupled-solo_r50_fpn_3x_coco.py @@ -0,0 +1,25 @@ +_base_ = './solo_r50_fpn_3x_coco.py' + +# model settings +model = dict( + mask_head=dict( + type='DecoupledSOLOHead', + num_classes=80, + in_channels=256, + stacked_convs=7, + feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict( + type='DiceLoss', use_sigmoid=True, activate=False, + loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))) diff --git a/mmdetection/configs/solo/metafile.yml b/mmdetection/configs/solo/metafile.yml new file mode 100644 index 0000000..aa38b8c --- /dev/null +++ b/mmdetection/configs/solo/metafile.yml @@ -0,0 +1,115 @@ +Collections: + - Name: SOLO + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - Convolution + - ResNet + Paper: https://arxiv.org/abs/1912.04488 + README: configs/solo/README.md + +Models: + - Name: decoupled-solo_r50_fpn_1x_coco + In Collection: SOLO + Config: configs/solo/decoupled-solo_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 12 + inference time (ms/im): + - value: 116.4 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1333, 800) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 33.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_1x_coco/decoupled_solo_r50_fpn_1x_coco_20210820_233348-6337c589.pth + + - Name: decoupled-solo_r50_fpn_3x_coco + In Collection: SOLO + Config: configs/solo/decoupled-solo_r50_fpn_3x_coco.py + Metadata: + Training Memory (GB): 7.9 + Epochs: 36 + inference time (ms/im): + - value: 117.2 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1333, 800) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 36.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_r50_fpn_3x_coco/decoupled_solo_r50_fpn_3x_coco_20210821_042504-7b3301ec.pth + + - Name: decoupled-solo-light_r50_fpn_3x_coco + In Collection: SOLO + Config: configs/solo/decoupled-solo-light_r50_fpn_3x_coco.py + Metadata: + Training Memory (GB): 2.2 + Epochs: 36 + inference time (ms/im): + - value: 35.0 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (852, 512) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.9 + Weights: 
https://download.openmmlab.com/mmdetection/v2.0/solo/decoupled_solo_light_r50_fpn_3x_coco/decoupled_solo_light_r50_fpn_3x_coco_20210906_142703-e70e226f.pth + + - Name: solo_r50_fpn_3x_coco + In Collection: SOLO + Config: configs/solo/solo_r50_fpn_3x_coco.py + Metadata: + Training Memory (GB): 7.4 + Epochs: 36 + inference time (ms/im): + - value: 94.2 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1333, 800) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 35.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_3x_coco/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth + + - Name: solo_r50_fpn_1x_coco + In Collection: SOLO + Config: configs/solo/solo_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 8.0 + Epochs: 12 + inference time (ms/im): + - value: 95.1 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1333, 800) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 33.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solo/solo_r50_fpn_1x_coco/solo_r50_fpn_1x_coco_20210821_035055-2290a6b8.pth diff --git a/mmdetection/configs/solo/solo_r101_fpn_8xb8-lsj-200e_coco.py b/mmdetection/configs/solo/solo_r101_fpn_8xb8-lsj-200e_coco.py new file mode 100644 index 0000000..0f49c5c --- /dev/null +++ b/mmdetection/configs/solo/solo_r101_fpn_8xb8-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './solo_r50_fpn_8xb8-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/solo/solo_r18_fpn_8xb8-lsj-200e_coco.py b/mmdetection/configs/solo/solo_r18_fpn_8xb8-lsj-200e_coco.py new file mode 100644 index 0000000..977ae54 --- /dev/null +++ b/mmdetection/configs/solo/solo_r18_fpn_8xb8-lsj-200e_coco.py @@ -0,0 +1,7 @@ +_base_ = './solo_r50_fpn_8xb8-lsj-200e_coco.py' + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/solo/solo_r50_fpn_1x_coco.py b/mmdetection/configs/solo/solo_r50_fpn_1x_coco.py new file mode 100644 index 0000000..595e9ff --- /dev/null +++ b/mmdetection/configs/solo/solo_r50_fpn_1x_coco.py @@ -0,0 +1,62 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='SOLO', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=0, + num_outs=5), + mask_head=dict( + type='SOLOHead', + num_classes=80, + in_channels=256, + stacked_convs=7, + feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, 
requires_grad=True)), + # model training and testing settings + test_cfg=dict( + nms_pre=500, + score_thr=0.1, + mask_thr=0.5, + filter_thr=0.05, + kernel='gaussian', # gaussian/linear + sigma=2.0, + max_per_img=100)) + +# optimizer +optim_wrapper = dict(optimizer=dict(lr=0.01)) + +val_evaluator = dict(metric='segm') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/solo/solo_r50_fpn_3x_coco.py b/mmdetection/configs/solo/solo_r50_fpn_3x_coco.py new file mode 100644 index 0000000..98a9505 --- /dev/null +++ b/mmdetection/configs/solo/solo_r50_fpn_3x_coco.py @@ -0,0 +1,35 @@ +_base_ = './solo_r50_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 800), (1333, 768), (1333, 736), (1333, 704), + (1333, 672), (1333, 640)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# training schedule for 3x +max_epochs = 36 +train_cfg = dict(by_epoch=True, max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=36, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] diff --git a/mmdetection/configs/solo/solo_r50_fpn_8xb8-lsj-200e_coco.py b/mmdetection/configs/solo/solo_r50_fpn_8xb8-lsj-200e_coco.py new file mode 100644 index 0000000..d46bf39 --- /dev/null +++ b/mmdetection/configs/solo/solo_r50_fpn_8xb8-lsj-200e_coco.py @@ -0,0 +1,71 @@ +_base_ = '../common/lsj-200e_coco-instance.py' + +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +# model settings +model = dict( + type='SOLO', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=0, + num_outs=5), + mask_head=dict( + type='SOLOHead', + num_classes=80, + in_channels=256, + stacked_convs=7, + feat_channels=256, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)), + # model training and testing settings + test_cfg=dict( + nms_pre=500, + score_thr=0.1, + mask_thr=0.5, + filter_thr=0.05, + kernel='gaussian', # gaussian/linear + sigma=2.0, + max_per_img=100)) + +train_dataloader = dict(batch_size=8, num_workers=4) + +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004), + clip_grad=dict(max_norm=35, norm_type=2)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/solov2/README.md b/mmdetection/configs/solov2/README.md new file mode 100644 index 0000000..b216913 --- /dev/null +++ b/mmdetection/configs/solov2/README.md @@ -0,0 +1,59 @@ +# SOLOv2 + +> [SOLOv2: Dynamic and Fast Instance Segmentation](https://arxiv.org/abs/2003.10152) + + + +## Abstract + +In this work, we aim at building a simple, direct, and fast instance segmentation +framework with strong performance. We follow the principle of the SOLO method of +Wang et al. "SOLO: segmenting objects by locations". Importantly, we take one +step further by dynamically learning the mask head of the object segmenter such +that the mask head is conditioned on the location. Specifically, the mask branch +is decoupled into a mask kernel branch and mask feature branch, which are +responsible for learning the convolution kernel and the convolved features +respectively. Moreover, we propose Matrix NMS (non maximum suppression) to +significantly reduce the inference time overhead due to NMS of masks. Our +Matrix NMS performs NMS with parallel matrix operations in one shot, and +yields better results. We demonstrate a simple direct instance segmentation +system, outperforming a few state-of-the-art methods in both speed and accuracy. +A light-weight version of SOLOv2 executes at 31.3 FPS and yields 37.1% AP. +Moreover, our state-of-the-art results in object detection (from our mask byproduct) +and panoptic segmentation show the potential to serve as a new strong baseline +for many instance-level recognition tasks besides instance segmentation. + +
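The Matrix NMS mentioned in the abstract corresponds to the `kernel='gaussian'` / `sigma=2.0` entries in the `test_cfg` of the configs below. A compact sketch of the idea is shown here; it follows the published pseudo-code rather than mmdetection's own implementation, so the exact decay formula and the names are assumptions.

```python
# Sketch of Matrix NMS as described in the SOLOv2 paper; not the mmdet code.
import torch


def matrix_nms(seg_masks, cate_scores, kernel='gaussian', sigma=2.0):
    """seg_masks: (N, H, W) binary masks sorted by score (descending),
    cate_scores: (N,). Returns decayed scores instead of hard suppression."""
    n = seg_masks.size(0)
    masks = seg_masks.float().reshape(n, -1)
    inter = masks @ masks.t()                                # pairwise intersections
    areas = masks.sum(dim=1)
    union = areas[:, None] + areas[None, :] - inter
    iou = (inter / union.clamp(min=1e-6)).triu(diagonal=1)   # IoU(i, j) with score_i > score_j
    # For each potential suppressor i, the largest IoU it has with an even
    # higher-scored mask (its own "how suppressed am I" compensation term).
    comp = iou.max(dim=0).values[:, None].expand(n, n)
    if kernel == 'gaussian':
        # some write-ups divide by sigma instead of multiplying; treat as an assumption
        decay = torch.exp(-sigma * (iou ** 2 - comp ** 2))
    else:                                                    # 'linear'
        decay = (1 - iou) / (1 - comp).clamp(min=1e-6)
    # Each mask keeps the strongest decay any higher-scored mask imposes on it;
    # everything is a couple of matrix ops, hence "parallel ... in one shot".
    return cate_scores * decay.min(dim=0).values
```

Because suppression becomes a per-mask score decay rather than hard removal, a final score threshold on the decayed scores (cf. `filter_thr` in the `test_cfg` of the configs below) is enough to drop duplicates.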
    + +
    + +## Results and Models + +### SOLOv2 + +| Backbone | Style | MS train | Lr schd | Mem (GB) | mask AP | Config | Download | +| :--------: | :-----: | :------: | :-----: | :------: | :-----: | :-------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | N | 1x | 5.1 | 34.8 | [config](./solov2_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_1x_coco/solov2_r50_fpn_1x_coco_20220512_125858-a357fa23.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_1x_coco/solov2_r50_fpn_1x_coco_20220512_125858.log.json) | +| R-50 | pytorch | Y | 3x | 5.1 | 37.5 | [config](./solov2_r50_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_3x_coco/solov2_r50_fpn_3x_coco_20220512_125856-fed092d4.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_3x_coco/solov2_r50_fpn_3x_coco_20220512_125856.log.json) | +| R-101 | pytorch | Y | 3x | 6.9 | 39.1 | [config](./solov2_r101_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_fpn_3x_coco/solov2_r101_fpn_3x_coco_20220511_095119-c559a076.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_fpn_3x_coco/solov2_r101_fpn_3x_coco_20220511_095119.log.json) | +| R-101(DCN) | pytorch | Y | 3x | 7.1 | 41.2 | [config](./solov2_r101-dcn_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_dcn_fpn_3x_coco/solov2_r101_dcn_fpn_3x_coco_20220513_214734-16c966cb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_dcn_fpn_3x_coco/solov2_r101_dcn_fpn_3x_coco_20220513_214734.log.json) | +| X-101(DCN) | pytorch | Y | 3x | 11.3 | 42.4 | [config](./solov2_x101-dcn_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_x101_dcn_fpn_3x_coco/solov2_x101_dcn_fpn_3x_coco_20220513_214337-aef41095.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_x101_dcn_fpn_3x_coco/solov2_x101_dcn_fpn_3x_coco_20220513_214337.log.json) | + +### Light SOLOv2 + +| Backbone | Style | MS train | Lr schd | Mem (GB) | mask AP | Config | Download | +| :------: | :-----: | :------: | :-----: | :------: | :-----: | :--------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-18 | pytorch | Y | 3x | 9.1 | 29.7 | [config](./solov2-light_r18_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r18_fpn_3x_coco/solov2_light_r18_fpn_3x_coco_20220511_083717-75fa355b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r18_fpn_3x_coco/solov2_light_r18_fpn_3x_coco_20220511_083717.log.json) | +| R-34 | pytorch | Y | 3x | 9.3 | 31.9 | [config](./solov2-light_r34_fpn_ms-3x_coco.py) | 
[model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r34_fpn_3x_coco/solov2_light_r34_fpn_3x_coco_20220511_091839-e51659d3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r34_fpn_3x_coco/solov2_light_r34_fpn_3x_coco_20220511_091839.log.json) | +| R-50 | pytorch | Y | 3x | 9.9 | 33.7 | [config](./solov2-light_r50_fpn_ms-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r50_fpn_3x_coco/solov2_light_r50_fpn_3x_coco_20220512_165256-c93a6074.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r50_fpn_3x_coco/solov2_light_r50_fpn_3x_coco_20220512_165256.log.json) | + +## Citation + +```latex +@article{wang2020solov2, + title={SOLOv2: Dynamic and Fast Instance Segmentation}, + author={Wang, Xinlong and Zhang, Rufeng and Kong, Tao and Li, Lei and Shen, Chunhua}, + journal={Proc. Advances in Neural Information Processing Systems (NeurIPS)}, + year={2020} +} +``` diff --git a/mmdetection/configs/solov2/metafile.yml b/mmdetection/configs/solov2/metafile.yml new file mode 100644 index 0000000..d0156b2 --- /dev/null +++ b/mmdetection/configs/solov2/metafile.yml @@ -0,0 +1,93 @@ +Collections: + - Name: SOLOv2 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x A100 GPUs + Architecture: + - FPN + - Convolution + - ResNet + Paper: https://arxiv.org/abs/2003.10152 + README: configs/solov2/README.md + +Models: + - Name: solov2_r50_fpn_1x_coco + In Collection: SOLOv2 + Config: configs/solov2/solov2_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 5.1 + Epochs: 12 + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 34.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_1x_coco/solov2_r50_fpn_1x_coco_20220512_125858-a357fa23.pth + + - Name: solov2_r50_fpn_ms-3x_coco + In Collection: SOLOv2 + Config: configs/solov2/solov2_r50_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 5.1 + Epochs: 36 + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r50_fpn_3x_coco/solov2_r50_fpn_3x_coco_20220512_125856-fed092d4.pth + + - Name: solov2_r101-dcn_fpn_ms-3x_coco + In Collection: SOLOv2 + Config: configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 7.1 + Epochs: 36 + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_r101_dcn_fpn_3x_coco/solov2_r101_dcn_fpn_3x_coco_20220513_214734-16c966cb.pth + + - Name: solov2_x101-dcn_fpn_ms-3x_coco + In Collection: SOLOv2 + Config: configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 11.3 + Epochs: 36 + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 42.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_x101_dcn_fpn_3x_coco/solov2_x101_dcn_fpn_3x_coco_20220513_214337-aef41095.pth + + - Name: solov2-light_r18_fpn_ms-3x_coco + In Collection: SOLOv2 + Config: configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 9.1 + Epochs: 36 + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 29.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r18_fpn_3x_coco/solov2_light_r18_fpn_3x_coco_20220511_083717-75fa355b.pth + + - 
Name: solov2-light_r50_fpn_ms-3x_coco + In Collection: SOLOv2 + Config: configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py + Metadata: + Training Memory (GB): 9.9 + Epochs: 36 + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 33.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/solov2/solov2_light_r50_fpn_3x_coco/solov2_light_r50_fpn_3x_coco_20220512_165256-c93a6074.pth diff --git a/mmdetection/configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py b/mmdetection/configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py new file mode 100644 index 0000000..f8fc53e --- /dev/null +++ b/mmdetection/configs/solov2/solov2-light_r18_fpn_ms-3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './solov2-light_r50_fpn_ms-3x_coco.py' + +# model settings +model = dict( + backbone=dict( + depth=18, init_cfg=dict(checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/solov2/solov2-light_r34_fpn_ms-3x_coco.py b/mmdetection/configs/solov2/solov2-light_r34_fpn_ms-3x_coco.py new file mode 100644 index 0000000..149b336 --- /dev/null +++ b/mmdetection/configs/solov2/solov2-light_r34_fpn_ms-3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './solov2-light_r50_fpn_ms-3x_coco.py' + +# model settings +model = dict( + backbone=dict( + depth=34, init_cfg=dict(checkpoint='torchvision://resnet34')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/configs/solov2/solov2-light_r50-dcn_fpn_ms-3x_coco.py b/mmdetection/configs/solov2/solov2-light_r50-dcn_fpn_ms-3x_coco.py new file mode 100644 index 0000000..0539194 --- /dev/null +++ b/mmdetection/configs/solov2/solov2-light_r50-dcn_fpn_ms-3x_coco.py @@ -0,0 +1,14 @@ +_base_ = './solov2-light_r50_fpn_ms-3x_coco.py' + +# model settings +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + mask_head=dict( + feat_channels=256, + stacked_convs=3, + scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)), + mask_feature_head=dict(out_channels=128), + dcn_cfg=dict(type='DCNv2'), + dcn_apply_to_all_conv=False)) # light solov2 head diff --git a/mmdetection/configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py b/mmdetection/configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py new file mode 100644 index 0000000..cf0a7f7 --- /dev/null +++ b/mmdetection/configs/solov2/solov2-light_r50_fpn_ms-3x_coco.py @@ -0,0 +1,56 @@ +_base_ = './solov2_r50_fpn_1x_coco.py' + +# model settings +model = dict( + mask_head=dict( + stacked_convs=2, + feat_channels=256, + scale_ranges=((1, 56), (28, 112), (56, 224), (112, 448), (224, 896)), + mask_feature_head=dict(out_channels=128))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(768, 512), (768, 480), (768, 448), (768, 416), (768, 384), + (768, 352)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(448, 768), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) 
+test_dataloader = val_dataloader + +# training schedule for 3x +max_epochs = 36 +train_cfg = dict(by_epoch=True, max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=36, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] diff --git a/mmdetection/configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py b/mmdetection/configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py new file mode 100644 index 0000000..370a4eb --- /dev/null +++ b/mmdetection/configs/solov2/solov2_r101-dcn_fpn_ms-3x_coco.py @@ -0,0 +1,13 @@ +_base_ = './solov2_r50_fpn_ms-3x_coco.py' + +# model settings +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(checkpoint='torchvision://resnet101'), + dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + mask_head=dict( + mask_feature_head=dict(conv_cfg=dict(type='DCNv2')), + dcn_cfg=dict(type='DCNv2'), + dcn_apply_to_all_conv=True)) diff --git a/mmdetection/configs/solov2/solov2_r101_fpn_ms-3x_coco.py b/mmdetection/configs/solov2/solov2_r101_fpn_ms-3x_coco.py new file mode 100644 index 0000000..96aaac0 --- /dev/null +++ b/mmdetection/configs/solov2/solov2_r101_fpn_ms-3x_coco.py @@ -0,0 +1,6 @@ +_base_ = './solov2_r50_fpn_ms-3x_coco.py' + +# model settings +model = dict( + backbone=dict( + depth=101, init_cfg=dict(checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/solov2/solov2_r50_fpn_1x_coco.py b/mmdetection/configs/solov2/solov2_r50_fpn_1x_coco.py new file mode 100644 index 0000000..138ca01 --- /dev/null +++ b/mmdetection/configs/solov2/solov2_r50_fpn_1x_coco.py @@ -0,0 +1,70 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='SOLOv2', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'), + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=0, + num_outs=5), + mask_head=dict( + type='SOLOV2Head', + num_classes=80, + in_channels=256, + feat_channels=512, + stacked_convs=4, + strides=[8, 8, 16, 32, 32], + scale_ranges=((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)), + pos_scale=0.2, + num_grids=[40, 36, 24, 16, 12], + cls_down_index=0, + mask_feature_head=dict( + feat_channels=128, + start_level=0, + end_level=3, + out_channels=256, + mask_stride=4, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)), + loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0)), + # model training and testing settings + test_cfg=dict( + nms_pre=500, + score_thr=0.1, + mask_thr=0.5, + filter_thr=0.05, + kernel='gaussian', # gaussian/linear + sigma=2.0, + max_per_img=100)) + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.01), clip_grad=dict(max_norm=35, norm_type=2)) + +val_evaluator = dict(metric='segm') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/solov2/solov2_r50_fpn_ms-3x_coco.py 
b/mmdetection/configs/solov2/solov2_r50_fpn_ms-3x_coco.py new file mode 100644 index 0000000..d6f0982 --- /dev/null +++ b/mmdetection/configs/solov2/solov2_r50_fpn_ms-3x_coco.py @@ -0,0 +1,35 @@ +_base_ = './solov2_r50_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 800), (1333, 768), (1333, 736), (1333, 704), + (1333, 672), (1333, 640)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# training schedule for 3x +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 3, + by_epoch=False, + begin=0, + end=500), + dict( + type='MultiStepLR', + begin=0, + end=36, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] diff --git a/mmdetection/configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py b/mmdetection/configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py new file mode 100644 index 0000000..612c45e --- /dev/null +++ b/mmdetection/configs/solov2/solov2_x101-dcn_fpn_ms-3x_coco.py @@ -0,0 +1,17 @@ +_base_ = './solov2_r50_fpn_ms-3x_coco.py' + +# model settings +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d')), + mask_head=dict( + mask_feature_head=dict(conv_cfg=dict(type='DCNv2')), + dcn_cfg=dict(type='DCNv2'), + dcn_apply_to_all_conv=True)) diff --git a/mmdetection/configs/sort/README.md b/mmdetection/configs/sort/README.md new file mode 100644 index 0000000..8f035fd --- /dev/null +++ b/mmdetection/configs/sort/README.md @@ -0,0 +1,108 @@ +# Simple online and realtime tracking + +## Abstract + + + +This paper explores a pragmatic approach to multiple object tracking where the main focus is to associate objects efficiently for online and realtime applications. To this end, detection quality is identified as a key factor influencing tracking performance, where changing the detector can improve tracking by up to 18.9%. Despite only using a rudimentary combination of familiar techniques such as the Kalman Filter and Hungarian algorithm for the tracking components, this approach achieves an accuracy comparable to state-of-the-art online trackers. Furthermore, due to the simplicity of our tracking method, the tracker updates at a rate of 260 Hz which is over 20x faster than other state-of-the-art trackers. + + + +
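As a rough illustration of the association step described above (Kalman-filter-predicted track boxes matched to detections with the Hungarian algorithm on an IoU cost), here is a stripped-down sketch. It is not the `SORTTracker` code and the helper names are made up; only the `match_iou_thr=0.5` gate mirrors the tracker config further down.

```python
# Stripped-down sketch of SORT-style association; not the SORTTracker code.
import numpy as np
from scipy.optimize import linear_sum_assignment


def iou_matrix(tracks, dets):
    """tracks: (T, 4), dets: (D, 4), boxes in xyxy format."""
    tl = np.maximum(tracks[:, None, :2], dets[None, :, :2])
    br = np.minimum(tracks[:, None, 2:], dets[None, :, 2:])
    inter = np.prod(np.clip(br - tl, 0, None), axis=2)
    area_t = np.prod(tracks[:, 2:] - tracks[:, :2], axis=1)
    area_d = np.prod(dets[:, 2:] - dets[:, :2], axis=1)
    return inter / (area_t[:, None] + area_d[None, :] - inter + 1e-6)


def associate(track_boxes, det_boxes, match_iou_thr=0.5):
    """Hungarian matching on IoU; pairs below the IoU gate stay unmatched."""
    iou = iou_matrix(track_boxes, det_boxes)
    rows, cols = linear_sum_assignment(-iou)   # maximise total IoU
    matches = [(r, c) for r, c in zip(rows, cols) if iou[r, c] >= match_iou_thr]
    unmatched_tracks = set(range(len(track_boxes))) - {r for r, _ in matches}
    unmatched_dets = set(range(len(det_boxes))) - {c for _, c in matches}
    return matches, unmatched_tracks, unmatched_dets
```

In the configs below this step is handled by `tracker=dict(type='SORTTracker', motion=dict(type='KalmanFilter'), match_iou_thr=0.5, reid=None)`.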
    + +
+
+## Citation
+
+```latex
+@inproceedings{bewley2016simple,
+  title={Simple online and realtime tracking},
+  author={Bewley, Alex and Ge, Zongyuan and Ott, Lionel and Ramos, Fabio and Upcroft, Ben},
+  booktitle={2016 IEEE International Conference on Image Processing (ICIP)},
+  pages={3464--3468},
+  year={2016},
+  organization={IEEE}
+}
+```
+
+## Results and models on MOT17
+
+| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download |
+| :----: | :----------------: | :--: | :--------: | :------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :----: | :------: |
+| SORT | R50-FasterRCNN-FPN | - | half-train | half-val | N | 18.6 | 52.0 | 62.0 | 57.8 | 15150 | 40410 | 5847 | [config](sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth) |
+
+## Get started
+
+### 1. Development Environment Setup
+
+For the tracking development environment setup, please refer to this [document](../../docs/en/get_started.md).
+
+### 2. Dataset Prepare
+
+For tracking dataset preparation, please refer to this [document](../../docs/en/user_guides/tracking_dataset_prepare.md).
+
+### 3. Training
+
+We implement SORT with independent detector models.
+Note that, because of the influence of parameters such as the learning rate in the default configuration file,
+we recommend using 8 GPUs for training in order to reproduce the reported accuracy.
+
+You can train the detector as follows.
+
+```shell script
+# Train Faster R-CNN on the mot17-half-train dataset with the following command.
+# The number after the config file represents the number of GPUs used. Here we use 8 GPUs.
+bash tools/dist_train.sh configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8
+```
+
+If you want to know more about the detailed usage of `train.py/dist_train.sh/slurm_train.sh`,
+please refer to this [document](../../docs/en/user_guides/tracking_train_test.md).
+
+### 4. Testing and evaluation
+
+### 4.1 Example on MOTxx-halfval dataset
+
+**4.1.1 Use a separately trained detector model for evaluation and testing**
+
+```shell script
+# Example 1: Test on the motXX-half-val set.
+# The number after the config file represents the number of GPUs used. Here we use 8 GPUs.
+bash tools/dist_test_tracking.sh configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 --detector ${DETECTOR_CHECKPOINT_PATH}
+```
+
+**4.1.2 Use video-based evaluation and testing**
+
+We provide two ways (image-based or video-based) of evaluating and testing.
+If you want to use video-based evaluation and testing, you can modify the config as follows:
+
+```
+val_dataloader = dict(
+    sampler=dict(type='DefaultSampler', shuffle=False, round_up=False))
+```
+
+### 4.2 Example on MOTxx-test dataset
+
+If you want to get the results of the [MOT Challenge](https://motchallenge.net/) test set,
+please use the following command to generate result files that can be used for submission.
+The results will be stored in `./mot_17_test_res`; you can modify the save path in `test_evaluator` of the config.
+ +```shell script +# Example 2: Test on motxx-test set +# The number after config file represents the number of GPUs used +bash tools/dist_test_tracking.sh configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py 8 --detector ${DETECTOR_CHECKPOINT_PATH} +``` + +If you want to know about more detailed usage of `test_tracking.py/dist_test_tracking.sh/slurm_test_tracking.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 5.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/mot_demo.py demo/demo_mot.mp4 configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py --detector ${DETECTOR_CHECKPOINT_PATH} --out mot.mp4 +``` + +If you want to know about more detailed usage of `mot_demo.py`, please refer to this [document](../../docs/en/user_guides/tracking_inference.md). diff --git a/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..f1d5b72 --- /dev/null +++ b/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,41 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/mot_challenge_det.py', '../_base_/default_runtime.py' +] + +model = dict( + rpn_head=dict( + bbox_coder=dict(clip_border=False), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + bbox_head=dict( + num_classes=1, + bbox_coder=dict(clip_border=False), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=1.0))), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth' # noqa: E501 + )) + +# training schedule for 4e +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=4, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100), + dict( + type='MultiStepLR', + begin=0, + end=4, + by_epoch=True, + milestones=[3], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py b/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py new file mode 100644 index 0000000..8364706 --- /dev/null +++ b/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17train.py @@ -0,0 +1,11 @@ +_base_ = ['./faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval'] +# data +data_root = 'data/MOT17/' +train_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json')) +val_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json')) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/train_cocoformat.json') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py b/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py new file mode 100644 index 0000000..a6d14ad --- /dev/null +++ 
b/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20halftrain_test-mot20halfval.py @@ -0,0 +1,29 @@ +_base_ = ['./faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval'] +model = dict( + rpn_head=dict(bbox_coder=dict(clip_border=True)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1))) +# data +data_root = 'data/MOT20/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + + 'annotations/half-val_cocoformat.json') +test_evaluator = val_evaluator + +# training schedule for 8e +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1) + +# learning rate +param_scheduler = [ + dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + milestones=[6], + gamma=0.1) +] diff --git a/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py b/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py new file mode 100644 index 0000000..85c8597 --- /dev/null +++ b/mmdetection/configs/sort/faster-rcnn_r50_fpn_8xb2-8e_mot20train_test-mot20train.py @@ -0,0 +1,32 @@ +_base_ = ['./faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval'] +model = dict( + rpn_head=dict(bbox_coder=dict(clip_border=True)), + roi_head=dict( + bbox_head=dict(bbox_coder=dict(clip_border=True), num_classes=1))) +# data +data_root = 'data/MOT20/' +train_dataloader = dict( + dataset=dict( + data_root=data_root, ann_file='annotations/train_cocoformat.json')) +val_dataloader = dict( + dataset=dict( + data_root=data_root, ann_file='annotations/train_cocoformat.json')) +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/train_cocoformat.json') +test_evaluator = val_evaluator + +# training schedule for 8e +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1) + +# learning rate +param_scheduler = [ + dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=100), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + milestones=[6], + gamma=0.1) +] diff --git a/mmdetection/configs/sort/metafile.yml b/mmdetection/configs/sort/metafile.yml new file mode 100644 index 0000000..c582ce3 --- /dev/null +++ b/mmdetection/configs/sort/metafile.yml @@ -0,0 +1,35 @@ +Collections: + - Name: SORT + Metadata: + Training Techniques: + - SGD with Momentum + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - FPN + Paper: + URL: https://arxiv.org/abs/1602.00763 + Title: Simple Online and Realtime Tracking + README: configs/sort/README.md + +Models: + - Name: sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval + In Collection: SORT + Config: configs/mot/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py + Metadata: + Training Data: MOT17-half-train + inference time (ms/im): + - value: 53.8 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (640, 1088) + Results: + - Task: Multiple Object Tracking + Dataset: MOT17-half-val + Metrics: + MOTA: 62.0 + IDF1: 57.8 + HOTA: 52.0 + Weights: https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth diff --git a/mmdetection/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 
b/mmdetection/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..78acb77 --- /dev/null +++ b/mmdetection/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,54 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/mot_challenge.py', '../_base_/default_runtime.py' +] + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=1), + visualization=dict(type='TrackVisualizationHook', draw=False)) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='TrackLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type='SyncBuffersHook') +] + +detector = _base_.model +detector.pop('data_preprocessor') +detector.rpn_head.bbox_coder.update(dict(clip_border=False)) +detector.roi_head.bbox_head.update(dict(num_classes=1)) +detector.roi_head.bbox_head.bbox_coder.update(dict(clip_border=False)) +detector['init_cfg'] = dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/' + 'faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth') # noqa: E501 +del _base_.model + +model = dict( + type='DeepSORT', + data_preprocessor=dict( + type='TrackDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + rgb_to_bgr=False, + pad_size_divisor=32), + detector=detector, + tracker=dict( + type='SORTTracker', + motion=dict(type='KalmanFilter', center_only=False), + obj_score_thr=0.5, + match_iou_thr=0.5, + reid=None)) + +train_dataloader = None + +train_cfg = None +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/mmdetection/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py b/mmdetection/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py new file mode 100644 index 0000000..921652c --- /dev/null +++ b/mmdetection/configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17train_test-mot17test.py @@ -0,0 +1,15 @@ +_base_ = [ + './sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain' + '_test-mot17halfval.py' +] + +# dataloader +val_dataloader = dict( + dataset=dict(ann_file='annotations/train_cocoformat.json')) +test_dataloader = dict( + dataset=dict( + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'))) + +# evaluator +test_evaluator = dict(format_only=True, outfile_prefix='./mot_17_test_res') diff --git a/mmdetection/configs/sparse_rcnn/README.md b/mmdetection/configs/sparse_rcnn/README.md new file mode 100644 index 0000000..2e8e365 --- /dev/null +++ b/mmdetection/configs/sparse_rcnn/README.md @@ -0,0 +1,38 @@ +# Sparse R-CNN + +> [Sparse R-CNN: End-to-End Object Detection with Learnable Proposals](https://arxiv.org/abs/2011.12450) + + + +## Abstract + +We present Sparse R-CNN, a purely sparse method for object detection in images. Existing works on object detection heavily rely on dense object candidates, such as k anchor boxes pre-defined on all grids of image feature map of size H×W. In our method, however, a fixed sparse set of learned object proposals, total length of N, are provided to object recognition head to perform classification and location. By eliminating HWk (up to hundreds of thousands) hand-designed object candidates to N (e.g. 
100) learnable proposals, Sparse R-CNN completely avoids all efforts related to object candidates design and many-to-one label assignment. More importantly, final predictions are directly output without non-maximum suppression post-procedure. Sparse R-CNN demonstrates accuracy, run-time and training convergence performance on par with the well-established detector baselines on the challenging COCO dataset, e.g., achieving 45.0 AP in standard 3× training schedule and running at 22 fps using ResNet-50 FPN model. We hope our work could inspire re-thinking the convention of dense prior in object detectors. + +
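
The core idea above, replacing hundreds of thousands of dense candidates with a small learned set of proposals, can be made concrete with a minimal PyTorch sketch. This is not MMDetection's `EmbeddingRPNHead`, just a toy module (all names are illustrative) showing that the N proposal boxes and their features are ordinary learnable parameters shared across images.

```python
import torch
import torch.nn as nn


class LearnableProposals(nn.Module):
    """Toy stand-in for a sparse proposal generator (illustrative only).

    Holds N learnable proposal boxes (normalized cx, cy, w, h) and N proposal
    feature vectors; both are trained jointly with the detection head instead
    of being enumerated densely at every feature-map location.
    """

    def __init__(self, num_proposals: int = 100, feat_dim: int = 256):
        super().__init__()
        # N boxes initialized to cover the whole image, refined by training.
        self.proposal_boxes = nn.Parameter(
            torch.tensor([[0.5, 0.5, 1.0, 1.0]]).repeat(num_proposals, 1))
        self.proposal_feats = nn.Parameter(torch.randn(num_proposals, feat_dim))

    def forward(self, batch_size: int):
        # The same learned proposals are broadcast to every image in the batch.
        boxes = self.proposal_boxes.unsqueeze(0).expand(batch_size, -1, -1)
        feats = self.proposal_feats.unsqueeze(0).expand(batch_size, -1, -1)
        return boxes, feats


if __name__ == '__main__':
    rpn = LearnableProposals(num_proposals=100)
    boxes, feats = rpn(batch_size=2)
    print(boxes.shape, feats.shape)  # [2, 100, 4] and [2, 100, 256]
```

In the actual configs below, this role is played by `EmbeddingRPNHead` with `num_proposals=100` (or 300), whose outputs are refined over six `DIIHead` stages.
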
    + +## Results and Models + +| Model | Backbone | Style | Lr schd | Number of Proposals | Multi-Scale | RandomCrop | box AP | Config | Download | +| :----------: | :-------: | :-----: | :-----: | :-----------------: | :---------: | :--------: | :----: | :-----------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Sparse R-CNN | R-50-FPN | pytorch | 1x | 100 | False | False | 37.9 | [config](./sparse-rcnn_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.log.json) | +| Sparse R-CNN | R-50-FPN | pytorch | 3x | 100 | True | False | 42.8 | [config](./sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.log.json) | +| Sparse R-CNN | R-50-FPN | pytorch | 3x | 300 | True | True | 45.0 | [config](./sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.log.json) | +| Sparse R-CNN | R-101-FPN | pytorch | 3x | 100 | True | False | 44.2 | [config](./sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.log.json) | +| Sparse R-CNN | R-101-FPN | pytorch | 3x | 300 | True | True | 46.2 | [config](./sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.log.json) | + +### Notes + +We observe about 0.3 AP noise especially when using ResNet-101 as the backbone. 
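
To try one of the released checkpoints from the table above, a minimal inference sketch could look like the following. It assumes a working MMDetection 3.x environment; `init_detector`/`inference_detector` come from `mmdet.apis`, the config and checkpoint are the ones listed in the table, and the image path is just an example.

```python
from mmdet.apis import init_detector, inference_detector

# Config from this directory and the matching checkpoint from the table above.
config_file = 'configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py'
checkpoint = ('https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/'
              'sparse_rcnn_r50_fpn_1x_coco/'
              'sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth')

# Build the model and run it on a single image; use device='cpu' if no GPU.
model = init_detector(config_file, checkpoint, device='cuda:0')
result = inference_detector(model, 'demo/demo.jpg')
print(result.pred_instances.bboxes.shape)  # (num_kept_boxes, 4)
```
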
+ +## Citation + +```latex +@article{peize2020sparse, + title = {{SparseR-CNN}: End-to-End Object Detection with Learnable Proposals}, + author = {Peize Sun and Rufeng Zhang and Yi Jiang and Tao Kong and Chenfeng Xu and Wei Zhan and Masayoshi Tomizuka and Lei Li and Zehuan Yuan and Changhu Wang and Ping Luo}, + journal = {arXiv preprint arXiv:2011.12450}, + year = {2020} +} +``` diff --git a/mmdetection/configs/sparse_rcnn/metafile.yml b/mmdetection/configs/sparse_rcnn/metafile.yml new file mode 100644 index 0000000..8fe2531 --- /dev/null +++ b/mmdetection/configs/sparse_rcnn/metafile.yml @@ -0,0 +1,80 @@ +Collections: + - Name: Sparse R-CNN + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - Sparse R-CNN + Paper: + URL: https://arxiv.org/abs/2011.12450 + Title: 'Sparse R-CNN: End-to-End Object Detection with Learnable Proposals' + README: configs/sparse_rcnn/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.9.0/mmdet/models/detectors/sparse_rcnn.py#L6 + Version: v2.9.0 + +Models: + - Name: sparse-rcnn_r50_fpn_1x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco/sparse_rcnn_r50_fpn_1x_coco_20201222_214453-dc79b137.pth + + - Name: sparse-rcnn_r50_fpn_ms-480-800-3x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco_20201218_154234-7bc5c054.pth + + - Name: sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_024605-9fe92701.pth + + - Name: sparse-rcnn_r101_fpn_ms-480-800-3x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_mstrain_480-800_3x_coco_20201223_121552-6c46c9d6.pth + + - Name: sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco + In Collection: Sparse R-CNN + Config: configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/sparse_rcnn/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco/sparse_rcnn_r101_fpn_300_proposals_crop_mstrain_480-800_3x_coco_20201223_023452-c23c3564.pth diff --git 
a/mmdetection/configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py new file mode 100644 index 0000000..09c11c6 --- /dev/null +++ b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r101_fpn_300-proposals_crop-ms-480-800-3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py new file mode 100644 index 0000000..a51f11c --- /dev/null +++ b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r101_fpn_ms-480-800-3x_coco.py @@ -0,0 +1,7 @@ +_base_ = './sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..8835442 --- /dev/null +++ b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,101 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +num_stages = 6 +num_proposals = 100 +model = dict( + type='SparseRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=0, + add_extra_convs='on_input', + num_outs=4), + rpn_head=dict( + type='EmbeddingRPNHead', + num_proposals=num_proposals, + proposal_feature_channel=256), + roi_head=dict( + type='SparseRoIHead', + num_stages=num_stages, + stage_loss_weights=[1] * num_stages, + proposal_feature_channel=256, + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='DIIHead', + num_classes=80, + num_ffn_fcs=2, + num_heads=8, + num_cls_fcs=1, + num_reg_fcs=3, + feedforward_channels=2048, + in_channels=256, + dropout=0.0, + ffn_act_cfg=dict(type='ReLU', inplace=True), + dynamic_conv_cfg=dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=7, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + clip_border=False, + target_means=[0., 0., 0., 0.], + target_stds=[0.5, 0.5, 1., 1.])) for _ in range(num_stages) + ]), + # training and testing settings + train_cfg=dict( + rpn=None, + rcnn=[ + dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + 
dict(type='BBoxL1Cost', weight=5.0, box_format='xyxy'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ]), + sampler=dict(type='PseudoSampler'), + pos_weight=1) for _ in range(num_stages) + ]), + test_cfg=dict(rpn=None, rcnn=dict(max_per_img=num_proposals))) + +# optimizer +optim_wrapper = dict( + optimizer=dict( + _delete_=True, type='AdamW', lr=0.000025, weight_decay=0.0001), + clip_grad=dict(max_norm=1, norm_type=2)) diff --git a/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py new file mode 100644 index 0000000..93edc03 --- /dev/null +++ b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_300-proposals_crop-ms-480-800-3x_coco.py @@ -0,0 +1,43 @@ +_base_ = './sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py' +num_proposals = 300 +model = dict( + rpn_head=dict(num_proposals=num_proposals), + test_cfg=dict( + _delete_=True, rpn=None, rcnn=dict(max_per_img=num_proposals))) + +# augmentation strategy originates from DETR. +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py new file mode 100644 index 0000000..156028d --- /dev/null +++ b/mmdetection/configs/sparse_rcnn/sparse-rcnn_r50_fpn_ms-480-800-3x_coco.py @@ -0,0 +1,32 @@ +_base_ = './sparse-rcnn_r50_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# learning policy +max_epochs = 36 +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=max_epochs) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] diff --git a/mmdetection/configs/ssd/README.md b/mmdetection/configs/ssd/README.md new file mode 100644 index 0000000..8b3ca91 --- /dev/null +++ b/mmdetection/configs/ssd/README.md @@ -0,0 +1,62 @@ +# SSD + +> [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325) + + + +## Abstract + +We 
present a method for detecting objects in images using a single deep neural network. Our approach, named SSD, discretizes the output space of bounding boxes into a set of default boxes over different aspect ratios and scales per feature map location. At prediction time, the network generates scores for the presence of each object category in each default box and produces adjustments to the box to better match the object shape. Additionally, the network combines predictions from multiple feature maps with different resolutions to naturally handle objects of various sizes. Our SSD model is simple relative to methods that require object proposals because it completely eliminates proposal generation and subsequent pixel or feature resampling stage and encapsulates all computation in a single network. This makes SSD easy to train and straightforward to integrate into systems that require a detection component. Experimental results on the PASCAL VOC, MS COCO, and ILSVRC datasets confirm that SSD has comparable accuracy to methods that utilize an additional object proposal step and is much faster, while providing a unified framework for both training and inference. Compared to other single stage methods, SSD has much better accuracy, even with a smaller input image size. For 300×300 input, SSD achieves 72.1% mAP on VOC2007 test at 58 FPS on a Nvidia Titan X and for 500×500 input, SSD achieves 75.1% mAP, outperforming a comparable state of the art Faster R-CNN model. + +
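
The "default boxes over different aspect ratios and scales per feature map location" described above can be made concrete with a small sketch. This is not MMDetection's `SSDAnchorGenerator`, only an illustrative function (the names and the simplified one-scale-per-level formula are assumptions) that enumerates the candidate boxes for a single feature map.

```python
from itertools import product


def default_boxes(fmap_size, image_size, scale, aspect_ratios=(1.0, 2.0, 0.5)):
    """Enumerate SSD-style default boxes (cx, cy, w, h) in pixels for one
    feature-map level. Simplified: one scale per level, no extra sqrt-scale box."""
    stride = image_size / fmap_size
    boxes = []
    for i, j in product(range(fmap_size), repeat=2):
        cx, cy = (j + 0.5) * stride, (i + 0.5) * stride
        for ar in aspect_ratios:
            w = scale * image_size * (ar ** 0.5)
            h = scale * image_size / (ar ** 0.5)
            boxes.append((cx, cy, w, h))
    return boxes


# A 10x10 feature map of a 300x300 input with 3 aspect ratios already yields
# 300 default boxes; summing over all levels gives the thousands of candidates
# that the detection head scores per class and regresses offsets for.
print(len(default_boxes(fmap_size=10, image_size=300, scale=0.2)))  # 300
```
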
    + +## Results and models of SSD + +| Backbone | Size | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :------: | :--: | :---: | :-----: | :------: | :------------: | :----: | :------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| VGG16 | 300 | caffe | 120e | 9.9 | 43.7 | 25.5 | [config](./ssd300_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428-d231a06e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428.log.json) | +| VGG16 | 512 | caffe | 120e | 19.4 | 30.7 | 29.5 | [config](./ssd512_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849-0a47a1ca.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849.log.json) | + +## Results and models of SSD-Lite + +| Backbone | Size | Training from scratch | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------: | :--: | :-------------------: | :-----: | :------: | :------------: | :----: | :--------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| MobileNetV2 | 320 | yes | 600e | 4.0 | 69.9 | 21.3 | [config](./ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627.log.json) | + +## Notice + +### Compatibility + +In v2.14.0, [PR5291](https://github.com/open-mmlab/mmdetection/pull/5291) refactored SSD neck and head for more +flexible usage. If users want to use the SSD checkpoint trained in the older versions, we provide a scripts +`tools/model_converters/upgrade_ssd_version.py` to convert the model weights. + +```bash +python tools/model_converters/upgrade_ssd_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH} + +``` + +- OLD_MODEL_PATH: the path to load the old version SSD model. +- NEW_MODEL_PATH: the path to save the converted model weights. + +### SSD-Lite training settings + +There are some differences between our implementation of MobileNetV2 SSD-Lite and the one in [TensorFlow 1.x detection model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md) . + +1. Use 320x320 as input size instead of 300x300. +2. The anchor sizes are different. +3. The C4 feature map is taken from the last layer of stage 4 instead of the middle of the block. +4. The model in TensorFlow1.x is trained on coco 2014 and validated on coco minival2014, but we trained and validated the model on coco 2017. The mAP on val2017 is usually a little lower than minival2014 (refer to the results in TensorFlow Object Detection API, e.g., MobileNetV2 SSD gets 22 mAP on minival2014 but 20.2 mAP on val2017). 
+ +## Citation + +```latex +@article{Liu_2016, + title={SSD: Single Shot MultiBox Detector}, + journal={ECCV}, + author={Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C.}, + year={2016}, +} +``` diff --git a/mmdetection/configs/ssd/metafile.yml b/mmdetection/configs/ssd/metafile.yml new file mode 100644 index 0000000..190a207 --- /dev/null +++ b/mmdetection/configs/ssd/metafile.yml @@ -0,0 +1,78 @@ +Collections: + - Name: SSD + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - VGG + Paper: + URL: https://arxiv.org/abs/1512.02325 + Title: 'SSD: Single Shot MultiBox Detector' + README: configs/ssd/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.14.0/mmdet/models/dense_heads/ssd_head.py#L16 + Version: v2.14.0 + +Models: + - Name: ssd300_coco + In Collection: SSD + Config: configs/ssd/ssd300_coco.py + Metadata: + Training Memory (GB): 9.9 + inference time (ms/im): + - value: 22.88 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (300, 300) + Epochs: 120 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 25.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20210803_015428-d231a06e.pth + + - Name: ssd512_coco + In Collection: SSD + Config: configs/ssd/ssd512_coco.py + Metadata: + Training Memory (GB): 19.4 + inference time (ms/im): + - value: 32.57 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512, 512) + Epochs: 120 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 29.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssd512_coco/ssd512_coco_20210803_022849-0a47a1ca.pth + + - Name: ssdlite_mobilenetv2-scratch_8xb24-600e_coco + In Collection: SSD + Config: configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py + Metadata: + Training Memory (GB): 4.0 + inference time (ms/im): + - value: 14.3 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (320, 320) + Epochs: 600 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 21.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/ssd/ssdlite_mobilenetv2_scratch_600e_coco/ssdlite_mobilenetv2_scratch_600e_coco_20210629_110627-974d9307.pth diff --git a/mmdetection/configs/ssd/ssd300_coco.py b/mmdetection/configs/ssd/ssd300_coco.py new file mode 100644 index 0000000..796d25c --- /dev/null +++ b/mmdetection/configs/ssd/ssd300_coco.py @@ -0,0 +1,71 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] + +# dataset settings +input_size = 300 +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean={{_base_.model.data_preprocessor.mean}}, + to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}}, + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='PackDetInputs') +] +test_pipeline = [ + 
dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=8, + num_workers=2, + batch_sampler=None, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type={{_base_.dataset_type}}, + data_root={{_base_.data_root}}, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args={{_base_.backend_args}}))) +val_dataloader = dict(batch_size=8, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=2e-3, momentum=0.9, weight_decay=5e-4)) + +custom_hooks = [ + dict(type='NumClassCheckHook'), + dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW') +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/ssd/ssd512_coco.py b/mmdetection/configs/ssd/ssd512_coco.py new file mode 100644 index 0000000..7acd614 --- /dev/null +++ b/mmdetection/configs/ssd/ssd512_coco.py @@ -0,0 +1,60 @@ +_base_ = 'ssd300_coco.py' + +# model settings +input_size = 512 +model = dict( + neck=dict( + out_channels=(512, 1024, 512, 256, 256, 256, 256), + level_strides=(2, 2, 2, 2, 1), + level_paddings=(1, 1, 1, 1, 1), + last_kernel_size=4), + bbox_head=dict( + in_channels=(512, 1024, 512, 256, 256, 256, 256), + anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + input_size=input_size, + basesize_ratio_range=(0.1, 0.9), + strides=[8, 16, 32, 64, 128, 256, 512], + ratios=[[2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]]))) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean={{_base_.model.data_preprocessor.mean}}, + to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}}, + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py b/mmdetection/configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py new file mode 100644 index 0000000..4e508f2 --- /dev/null +++ b/mmdetection/configs/ssd/ssdlite_mobilenetv2-scratch_8xb24-600e_coco.py @@ -0,0 +1,158 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1) +model = dict( + type='SingleStageDetector', + data_preprocessor=data_preprocessor, + backbone=dict( + type='MobileNetV2', + out_indices=(4, 7), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)), + neck=dict( + type='SSDNeck', + in_channels=(96, 1280), + out_channels=(96, 1280, 512, 256, 256, 128), + level_strides=(2, 2, 2, 2), + level_paddings=(1, 1, 1, 1), + l2_norm_scale=None, + use_depthwise=True, + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + act_cfg=dict(type='ReLU6'), + init_cfg=dict(type='TruncNormal', layer='Conv2d', std=0.03)), + bbox_head=dict( + type='SSDHead', + in_channels=(96, 1280, 512, 256, 256, 128), + num_classes=80, + use_depthwise=True, + norm_cfg=dict(type='BN', eps=0.001, momentum=0.03), + act_cfg=dict(type='ReLU6'), + init_cfg=dict(type='Normal', layer='Conv2d', std=0.001), + + # set anchor size manually instead of using the predefined + # SSD300 setting. + anchor_generator=dict( + type='SSDAnchorGenerator', + scale_major=False, + strides=[16, 32, 64, 107, 160, 320], + ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]], + min_sizes=[48, 100, 150, 202, 253, 304], + max_sizes=[100, 150, 202, 253, 304, 320]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2])), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + sampler=dict(type='PseudoSampler'), + smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False), + test_cfg=dict( + nms_pre=1000, + nms=dict(type='nms', iou_threshold=0.45), + min_bbox_size=0, + score_thr=0.02, + max_per_img=200)) +env_cfg = dict(cudnn_benchmark=True) + +# dataset settings +input_size = 320 +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=data_preprocessor['mean'], + to_rgb=data_preprocessor['bgr_to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=24, + 
num_workers=4, + batch_sampler=None, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type={{_base_.dataset_type}}, + data_root={{_base_.data_root}}, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline))) +val_dataloader = dict(batch_size=8, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# training schedule +max_epochs = 120 +train_cfg = dict(max_epochs=max_epochs, val_interval=5) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='CosineAnnealingLR', + begin=0, + T_max=max_epochs, + end=max_epochs, + by_epoch=True, + eta_min=0) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.015, momentum=0.9, weight_decay=4.0e-5)) + +custom_hooks = [ + dict(type='NumClassCheckHook'), + dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW') +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (24 samples per GPU) +auto_scale_lr = dict(base_batch_size=192) diff --git a/mmdetection/configs/strong_baselines/README.md b/mmdetection/configs/strong_baselines/README.md new file mode 100644 index 0000000..e5db3e0 --- /dev/null +++ b/mmdetection/configs/strong_baselines/README.md @@ -0,0 +1,20 @@ +# Strong Baselines + + + +We train Mask R-CNN with large-scale jitter and longer schedule as strong baselines. +The modifications follow those in [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/configs/new_baselines). + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :------: | :-----: | :-----: | :------: | :------------: | :----: | :-----: | :--------------------------------------------------------------------------------: | :----------------------: | +| R-50-FPN | pytorch | 50e | | | | | [config](./mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py) | [model](<>) \| [log](<>) | +| R-50-FPN | pytorch | 100e | | | | | [config](./mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py) | [model](<>) \| [log](<>) | +| R-50-FPN | caffe | 100e | | | 44.7 | 40.4 | [config](./mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py) | [model](<>) \| [log](<>) | +| R-50-FPN | caffe | 400e | | | | | [config](./mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-400e_coco.py) | [model](<>) \| [log](<>) | + +## Notice + +When using large-scale jittering, there are sometimes empty proposals in the box and mask heads during training. +This requires MMSyncBN that allows empty tensors. Therefore, please use mmcv-full>=1.3.14 to train models supported in this directory. diff --git a/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py b/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py new file mode 100644 index 0000000..b004d74 --- /dev/null +++ b/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py @@ -0,0 +1,4 @@ +_base_ = 'mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py' # noqa + +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
+optim_wrapper = dict(type='AmpOptimWrapper') diff --git a/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py b/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py new file mode 100644 index 0000000..70e92a8 --- /dev/null +++ b/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py @@ -0,0 +1,68 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../common/lsj-100e_coco-instance.py' +] +image_size = (1024, 1024) +batch_augments = [ + dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) +] +norm_cfg = dict(type='SyncBN', requires_grad=True) +# Use MMSyncBN that handles empty tensor in head. It can be changed to +# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed +head_norm_cfg = dict(type='MMSyncBN', requires_grad=True) +model = dict( + # use caffe norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + + # pad_size_divisor=32 is unnecessary in training but necessary + # in testing. + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + frozen_stages=-1, + norm_eval=False, + norm_cfg=norm_cfg, + init_cfg=None, + style='caffe'), + neck=dict(norm_cfg=norm_cfg), + rpn_head=dict(num_convs=2), + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=head_norm_cfg), + mask_head=dict(norm_cfg=head_norm_cfg))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +# Use RepeatDataset to speed up training +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-400e_coco.py b/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-400e_coco.py new file mode 100644 index 0000000..cb64c9b --- /dev/null +++ b/mmdetection/configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-400e_coco.py @@ -0,0 +1,20 @@ +_base_ = './mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py' # noqa + +# Use RepeatDataset to speed up training +# change repeat time from 4 (for 100 epochs) to 16 (for 400 epochs) +train_dataloader = dict(dataset=dict(times=4 * 4)) +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.067, + by_epoch=False, + begin=0, + end=500 * 4), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] diff --git a/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py 
b/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py new file mode 100644 index 0000000..7fab2c7 --- /dev/null +++ b/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_amp-lsj-100e_coco.py @@ -0,0 +1,4 @@ +_base_ = 'mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py' + +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict(type='AmpOptimWrapper') diff --git a/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py b/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py new file mode 100644 index 0000000..8e06587 --- /dev/null +++ b/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py @@ -0,0 +1,30 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../common/lsj-100e_coco-instance.py' +] + +image_size = (1024, 1024) +batch_augments = [ + dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) +] +norm_cfg = dict(type='SyncBN', requires_grad=True) +# Use MMSyncBN that handles empty tensor in head. It can be changed to +# SyncBN after https://github.com/pytorch/pytorch/issues/36530 is fixed +head_norm_cfg = dict(type='MMSyncBN', requires_grad=True) +model = dict( + # the model is trained from scratch, so init_cfg is None + data_preprocessor=dict( + # pad_size_divisor=32 is unnecessary in training but necessary + # in testing. + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + frozen_stages=-1, norm_eval=False, norm_cfg=norm_cfg, init_cfg=None), + neck=dict(norm_cfg=norm_cfg), + rpn_head=dict(num_convs=2), # leads to 0.1+ mAP + roi_head=dict( + bbox_head=dict( + type='Shared4Conv1FCBBoxHead', + conv_out_channels=256, + norm_cfg=head_norm_cfg), + mask_head=dict(norm_cfg=head_norm_cfg))) diff --git a/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py b/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py new file mode 100644 index 0000000..6621d28 --- /dev/null +++ b/mmdetection/configs/strong_baselines/mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-50e_coco.py @@ -0,0 +1,5 @@ +_base_ = 'mask-rcnn_r50_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py' + +# Use RepeatDataset to speed up training +# change repeat time from 4 (for 100 epochs) to 2 (for 50 epochs) +train_dataloader = dict(dataset=dict(times=2)) diff --git a/mmdetection/configs/strong_baselines/metafile.yml b/mmdetection/configs/strong_baselines/metafile.yml new file mode 100644 index 0000000..f72c07e --- /dev/null +++ b/mmdetection/configs/strong_baselines/metafile.yml @@ -0,0 +1,24 @@ +Models: + - Name: mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco + In Collection: Mask R-CNN + Config: configs/strong_baselines/mask-rcnn_r50-caffe_fpn_rpn-2conv_4conv1fc_syncbn-all_lsj-100e_coco.py + Metadata: + Epochs: 100 + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + - LSJ + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - FPN + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + box AP: 40.4 diff --git a/mmdetection/configs/strongsort/README.md b/mmdetection/configs/strongsort/README.md new file mode 100644 index 0000000..8e08413 --- /dev/null +++ 
b/mmdetection/configs/strongsort/README.md @@ -0,0 +1,108 @@ +# StrongSORT: Make DeepSORT Great Again + +## Abstract + + + +Existing Multi-Object Tracking (MOT) methods can be roughly classified as tracking-by-detection and joint-detection-association paradigms. Although the latter has elicited more attention and demonstrates comparable performance relative to the former, we claim that the tracking-by-detection paradigm is still the optimal solution in terms of tracking accuracy. In this paper, we revisit the classic tracker DeepSORT and upgrade it from various aspects, i.e., detection, embedding and association. The resulting tracker, called StrongSORT, sets new HOTA and IDF1 records on MOT17 and MOT20. We also present two lightweight and plug-and-play algorithms to further refine the tracking results. Firstly, an appearance-free link model (AFLink) is proposed to associate short tracklets into complete trajectories. To the best of our knowledge, this is the first global link model without appearance information. Secondly, we propose Gaussian-smoothed interpolation (GSI) to compensate for missing detections. Instead of ignoring motion information like linear interpolation, GSI is based on the Gaussian process regression algorithm and can achieve more accurate localizations. Moreover, AFLink and GSI can be plugged into various trackers with a negligible extra computational cost (591.9 and 140.9 Hz, respectively, on MOT17). By integrating StrongSORT with the two algorithms, the final tracker StrongSORT++ ranks first on MOT17 and MOT20 in terms of HOTA and IDF1 metrics and surpasses the second-place one by 1.3 - 2.2. Code will be released soon. + + + +
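
The Gaussian-smoothed interpolation (GSI) idea above can be sketched in a few lines: instead of filling gaps in a tracklet linearly, each box coordinate is regressed against the frame index with a Gaussian process. The snippet below is only an illustration of that idea and assumes scikit-learn is available; the actual post-processing in this PR is configured through `InterpolateTracklets(..., use_gsi=True, smooth_tau=10)` in the evaluator of the StrongSORT config.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF


def gsi_1d(frames, values, smooth_tau=10.0):
    """Gaussian-process smoothing of one box coordinate over time (toy GSI).

    frames: observed frame indices (possibly with gaps); values: the coordinate
    (e.g. box center x) at those frames. Returns the full frame range and the
    smoothed/interpolated values on it.
    """
    gpr = GaussianProcessRegressor(kernel=RBF(length_scale=smooth_tau),
                                   normalize_y=True)
    gpr.fit(np.asarray(frames, dtype=float).reshape(-1, 1),
            np.asarray(values, dtype=float))
    full_range = np.arange(min(frames), max(frames) + 1, dtype=float).reshape(-1, 1)
    return full_range.ravel(), gpr.predict(full_range)


# A tracklet with missing detections at frames 3-5 and slightly noisy centers:
frames = [0, 1, 2, 6, 7, 8, 9]
cx = [100, 104, 108, 131, 135, 140, 143]
t, cx_smooth = gsi_1d(frames, cx, smooth_tau=10.0)
print(np.round(cx_smooth, 1))  # smoothed centers for frames 0..9, gaps filled
```
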
    + +## Citation + + + +```latex +@article{du2022strongsort, + title={Strongsort: Make deepsort great again}, + author={Du, Yunhao and Song, Yang and Yang, Bo and Zhao, Yanyun}, + journal={arXiv preprint arXiv:2202.13514}, + year={2022} +} +``` + +## Results and models on MOT17 + +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :----------: | :------: | :--: | :---------------------------: | :------------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :----------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| StrongSORT++ | YOLOX-X | R50 | CrowdHuman + MOT17-half-train | MOT17-half-val | N | - | 70.9 | 78.4 | 83.3 | 15237 | 19035 | 582 | [config](strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py) | [detector](https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot17-private-half_20220812_192036-b6c9ce9a.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth) [AFLink](https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth) | + +## Results and models on MOT20 + +| Method | Detector | ReID | Train Set | Test Set | Public | Inf time (fps) | HOTA | MOTA | IDF1 | FP | FN | IDSw. | Config | Download | +| :----------: | :------: | :--: | :----------------------: | :--------: | :----: | :------------: | :--: | :--: | :--: | :---: | :---: | :---: | :---------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| StrongSORT++ | YOLOX-X | R50 | CrowdHuman + MOT20-train | MOT20-test | N | - | 62.9 | 75.5 | 77.3 | 29043 | 96155 | 1640 | [config](strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py) | [detector](https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot20-private_20220812_192123-77c014de.pth) [reid](https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth) [AFLink](https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth) | + +## Get started + +### 1. Development Environment Setup + +Tracking Development Environment Setup can refer to this [document](../../docs/en/get_started.md). + +### 2. Dataset Prepare + +Tracking Dataset Prepare can refer to this [document](../../docs/en/user_guides/tracking_dataset_prepare.md). + +### 3. Training + +We implement StrongSORT with independent detector and ReID models. +Note that, due to the influence of parameters such as learning rate in default configuration file, +we recommend using 8 GPUs for training in order to reproduce accuracy. 
+ +You can train the detector as follows. + +```shell script +# Training YOLOX-X on crowdhuman and mot17-half-train dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +bash tools/dist_train.sh configs/det/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 +``` + +And you can train the ReID model as follows. + +```shell script +# Training ReID model on mot17-train80 dataset with following command. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +bash tools/dist_train.sh configs/reid/reid_r50_8xb32-6e_mot17train80_test-mot17val20.py 8 +``` + +If you want to know about more detailed usage of `train.py/dist_train.sh/slurm_train.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 4. Testing and evaluation + +**2.1 Example on MOTxx-halfval dataset** + +```shell script +# Example 1: Test on motXX-half-val set. +# The number after config file represents the number of GPUs used. Here we use 8 GPUs. +bash tools/dist_test_tracking.sh configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py 8 --detector ${CHECKPOINT_PATH} --reid ${CHECKPOINT_PATH} +``` + +**2.2 Example on MOTxx-test dataset** + +If you want to get the results of the [MOT Challenge](https://motchallenge.net/) test set, +please use the following command to generate result files that can be used for submission. +It will be stored in `./mot_20_test_res`, you can modify the saved path in `test_evaluator` of the config. + +```shell script +# Example 2: Test on motxx-test set +# The number after config file represents the number of GPUs used +bash tools/dist_test_tracking.sh configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py 8 --detector ${CHECKPOINT_PATH} --reid ${CHECKPOINT_PATH} +``` + +If you want to know about more detailed usage of `test_tracking.py/dist_test_tracking.sh/slurm_test_tracking.sh`, +please refer to this [document](../../docs/en/user_guides/tracking_train_test.md). + +### 3.Inference + +Use a single GPU to predict a video and save it as a video. + +```shell +python demo/mot_demo.py demo/demo_mot.mp4 configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py --detector ${CHECKPOINT_FILE} --reid ${CHECKPOINT_PATH} --out mot.mp4 +``` + +If you want to know about more detailed usage of `mot_demo.py`, please refer to this [document](../../docs/en/user_guides/tracking_inference.md). 
diff --git a/mmdetection/configs/strongsort/metafile.yml b/mmdetection/configs/strongsort/metafile.yml new file mode 100644 index 0000000..08a564b --- /dev/null +++ b/mmdetection/configs/strongsort/metafile.yml @@ -0,0 +1,48 @@ +Collections: + - Name: StrongSORT++ + Metadata: + Training Techniques: + - SGD with Momentum + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - YOLOX + Paper: + URL: https://arxiv.org/abs/2202.13514 + Title: "StrongSORT: Make DeepSORT Great Again" + README: configs/strongsort/README.md + +Models: + - Name: strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval + In Collection: StrongSORT++ + Config: configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py + Metadata: + Training Data: CrowdHuman + MOT17-half-train + Results: + - Task: Multiple Object Tracking + Dataset: MOT17-half-val + Metrics: + MOTA: 78.3 + IDF1: 83.2 + HOTA: 70.9 + Weights: + - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot17-private-half_20220812_192036-b6c9ce9a.pth + - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot17-4bf6b63d.pth + - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth + + - Name: strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test + In Collection: StrongSORT++ + Config: configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py + Metadata: + Training Data: CrowdHuman + MOT20-train + Results: + - Task: Multiple Object Tracking + Dataset: MOT20-test + Metrics: + MOTA: 75.5 + IDF1: 77.3 + HOTA: 62.9 + Weights: + - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/yolox_x_crowdhuman_mot20-private_20220812_192123-77c014de.pth + - https://download.openmmlab.com/mmtracking/mot/reid/reid_r50_6e_mot20_20210803_212426-c83b1c01.pth + - https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth diff --git a/mmdetection/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmdetection/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..532e2ae --- /dev/null +++ b/mmdetection/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,130 @@ +_base_ = [ + './yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py', # noqa: E501 +] + +dataset_type = 'MOTChallengeDataset' +detector = _base_.model +detector.pop('data_preprocessor') +del _base_.model + +model = dict( + type='StrongSORT', + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(576, 1024), + size_divisor=32, + interval=10) + ]), + detector=detector, + reid=dict( + type='BaseReID', + data_preprocessor=dict(type='mmpretrain.ClsDataPreprocessor'), + backbone=dict( + type='mmpretrain.ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + style='pytorch'), + neck=dict(type='GlobalAveragePooling', kernel_size=(8, 4), stride=1), + head=dict( + type='LinearReIDHead', + num_fcs=1, + in_channels=2048, + fc_channels=1024, + out_channels=128, + num_classes=380, + loss_cls=dict(type='mmpretrain.CrossEntropyLoss', loss_weight=1.0), + loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0), + norm_cfg=dict(type='BN1d'), + 
act_cfg=dict(type='ReLU'))), + cmc=dict( + type='CameraMotionCompensation', + warp_mode='cv2.MOTION_EUCLIDEAN', + num_iters=100, + stop_eps=0.00001), + tracker=dict( + type='StrongSORTTracker', + motion=dict(type='KalmanFilter', center_only=False, use_nsa=True), + obj_score_thr=0.6, + reid=dict( + num_samples=None, + img_scale=(256, 128), + img_norm_cfg=dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + match_score_thr=0.3, + motion_weight=0.02, + ), + match_iou_thr=0.7, + momentums=dict(embeds=0.1, ), + num_tentatives=2, + num_frames_retain=100), + postprocess_model=dict( + type='AppearanceFreeLink', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmtracking/mot/strongsort/mot_dataset/aflink_motchallenge_20220812_190310-a7578ad3.pth', # noqa: E501 + temporal_threshold=(0, 30), + spatial_threshold=50, + confidence_threshold=0.95, + )) + +train_pipeline = None +test_pipeline = [ + dict( + type='TransformBroadcaster', + transforms=[ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='Resize', scale=_base_.img_scale, keep_ratio=True), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadTrackAnnotations'), + ]), + dict(type='PackTrackInputs') +] + +train_dataloader = None +val_dataloader = dict( + # Now StrongSORT only support video_based sampling + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + _delete_=True, + type=dataset_type, + data_root=_base_.data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), + # when you evaluate track performance, you need to remove metainfo + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +train_cfg = None +optim_wrapper = None + +# evaluator +val_evaluator = dict( + _delete_=True, + type='MOTChallengeMetric', + metric=['HOTA', 'CLEAR', 'Identity'], + # use_postprocess to support AppearanceFreeLink in val_evaluator + use_postprocess=True, + postprocess_tracklet_cfg=[ + dict( + type='InterpolateTracklets', + min_num_frames=5, + max_num_frames=20, + use_gsi=True, + smooth_tau=10) + ]) +test_evaluator = val_evaluator + +default_hooks = dict(logger=dict(type='LoggerHook', interval=1)) + +del _base_.param_scheduler +del _base_.custom_hooks diff --git a/mmdetection/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/mmdetection/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 0000000..eab9706 --- /dev/null +++ b/mmdetection/configs/strongsort/strongsort_yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,44 @@ +_base_ = [ + './strongsort_yolox_x_8xb4-80e_crowdhuman-mot17halftrain' + '_test-mot17halfval.py' +] + +img_scale = (1600, 896) # width, height + +model = dict( + data_preprocessor=dict( + type='TrackDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict(type='BatchSyncRandomResize', random_size_range=(640, 1152)) + ])) + +test_pipeline = [ + dict( + type='TransformBroadcaster', + transforms=[ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='Pad', + size_divisor=32, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadTrackAnnotations'), + ]), + dict(type='PackTrackInputs') +] + +val_dataloader = dict( + dataset=dict( + data_root='data/MOT17', + ann_file='annotations/train_cocoformat.json', + 
data_prefix=dict(img_path='train'), + pipeline=test_pipeline)) +test_dataloader = dict( + dataset=dict( + data_root='data/MOT20', + ann_file='annotations/test_cocoformat.json', + data_prefix=dict(img_path='test'), + pipeline=test_pipeline)) + +test_evaluator = dict(format_only=True, outfile_prefix='./mot_20_test_res') diff --git a/mmdetection/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py b/mmdetection/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..59a52e4 --- /dev/null +++ b/mmdetection/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py @@ -0,0 +1,188 @@ +_base_ = ['../yolox/yolox_x_8xb8-300e_coco.py'] + +data_root = 'data/MOT17/' + +img_scale = (1440, 800) # width, height +batch_size = 4 + +# model settings +model = dict( + bbox_head=dict(num_classes=1), + test_cfg=dict(nms=dict(iou_threshold=0.7)), + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth' # noqa: E501 + )) + +train_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=False), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=False), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=False), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=False), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + _delete_=True, + batch_size=batch_size, + num_workers=4, + persistent_workers=True, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='MultiImageMixDataset', + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type='CocoDataset', + data_root=data_root, + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + 
pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + ]), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img='train'), + metainfo=dict(classes=('pedestrian', )), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# training settings +max_epochs = 80 +num_last_epochs = 10 +interval = 5 + +train_cfg = dict(max_epochs=max_epochs, val_begin=75, val_interval=1) + +# optimizer +# default 8 gpu +base_lr = 0.001 / 8 * batch_size +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +# learning rate +param_scheduler = [ + dict( + type='QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=1, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='ConstantLR', + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ) +] + +default_hooks = dict( + checkpoint=dict( + interval=1, + max_keep_ckpts=5 # only keep latest 5 checkpoints + )) + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + priority=49) +] + +# evaluator +val_evaluator = dict( + ann_file=data_root + 'annotations/half-val_cocoformat.json', + format_only=False) +test_evaluator = val_evaluator + +del _base_.tta_model +del _base_.tta_pipeline +del _base_.train_dataset diff --git a/mmdetection/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py b/mmdetection/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py new file mode 100644 index 0000000..d4eb3cb --- /dev/null +++ b/mmdetection/configs/strongsort/yolox_x_8xb4-80e_crowdhuman-mot20train_test-mot20test.py @@ -0,0 +1,108 @@ +_base_ = ['./yolox_x_8xb4-80e_crowdhuman-mot17halftrain_test-mot17halfval.py'] + +data_root = 'data/MOT20/' + +img_scale = (1600, 896) # width, height + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict(type='BatchSyncRandomResize', random_size_range=(640, 1152)) + ])) + +train_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + bbox_clip_border=True), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + bbox_clip_border=True), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + bbox_clip_border=True), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='Resize', + scale=img_scale, + keep_ratio=True, + clip_object_border=True), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 
'scale_factor')) +] + +train_dataloader = dict( + dataset=dict( + type='MultiImageMixDataset', + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type='CocoDataset', + data_root=data_root, + ann_file='annotations/train_cocoformat.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_train.json', + data_prefix=dict(img='train'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + dict( + type='CocoDataset', + data_root='data/crowdhuman', + ann_file='annotations/crowdhuman_val.json', + data_prefix=dict(img='val'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('pedestrian', )), + pipeline=[ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + ]), + ]), + pipeline=train_pipeline)) + +val_dataloader = dict( + dataset=dict( + data_root='data/MOT17', ann_file='annotations/train_cocoformat.json')) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict(ann_file='data/MOT17/annotations/train_cocoformat.json') +test_evaluator = val_evaluator diff --git a/mmdetection/configs/swin/README.md b/mmdetection/configs/swin/README.md new file mode 100644 index 0000000..99bcf6e --- /dev/null +++ b/mmdetection/configs/swin/README.md @@ -0,0 +1,41 @@ +# Swin + +> [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) + + + +## Abstract + +This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences, we propose a hierarchical Transformer whose representation is computed with Shifted windows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures. + +
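The linear complexity claimed above comes from computing self-attention only inside fixed-size, non-overlapping local windows. Below is a minimal standalone PyTorch sketch of that partition step (not part of this patch; the `(B, H, W, C)` layout, the helper name `window_partition`, and `window_size=7` are illustrative assumptions matching the window size used in the configs that follow):

```python
import torch


def window_partition(x: torch.Tensor, window_size: int = 7) -> torch.Tensor:
    """Split a (B, H, W, C) feature map into non-overlapping windows.

    Self-attention is then computed independently inside each
    window_size x window_size window, so the total attention cost grows
    linearly with H * W instead of quadratically.
    """
    B, H, W, C = x.shape
    assert H % window_size == 0 and W % window_size == 0
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    # -> (num_windows * B, window_size * window_size, C)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size * window_size, C)


# Example: a 56x56 stage-1 feature map with 96 channels -> 64 windows of 49 tokens.
print(window_partition(torch.randn(1, 56, 56, 96)).shape)  # torch.Size([64, 49, 96])
```

Alternate blocks shift the window grid by half a window so that neighbouring windows can still exchange information.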
    + +## Results and Models + +### Mask R-CNN + +| Backbone | Pretrain | Lr schd | Multi-scale crop | FP16 | Mem (GB) | Inf time (fps) | box AP | mask AP | Config | Download | +| :------: | :---------: | :-----: | :--------------: | :--: | :------: | :------------: | :----: | :-----: | :-----------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Swin-T | ImageNet-1K | 1x | no | no | 7.6 | | 42.7 | 39.3 | [config](./mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937.log.json) | +| Swin-T | ImageNet-1K | 3x | yes | no | 10.2 | | 46.0 | 41.6 | [config](./mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725.log.json) | +| Swin-T | ImageNet-1K | 3x | yes | yes | 7.8 | | 46.0 | 41.7 | [config](./mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006.log.json) | +| Swin-S | ImageNet-1K | 3x | yes | yes | 11.9 | | 48.2 | 43.2 | [config](./mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808-b92c91f1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808.log.json) | + +### Notice + +Please follow the example +of `retinanet_swin-t-p4-w7_fpn_1x_coco.py` when you want to combine Swin Transformer with +the one-stage detector. Because there is a layer norm at the outs of Swin Transformer, you must set `start_level` as 0 in FPN, so we have to set the `out_indices` of backbone as `[1,2,3]`. 
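For reference, a trimmed sketch of those overrides (the values mirror `retinanet_swin-t-p4-w7_fpn_1x_coco.py` included later in this patch; fields not shown fall back to the base config and backbone defaults):

```python
_base_ = [
    '../_base_/models/retinanet_r50_fpn.py',
    '../_base_/datasets/coco_detection.py',
    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
]

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        embed_dims=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        # Only the stages consumed by the FPN; stage 0 is skipped because of
        # the extra layer norm on Swin outputs mentioned above.
        out_indices=(1, 2, 3),
        convert_weights=True),
    # start_level=0 so the FPN uses the first returned Swin stage directly.
    neck=dict(in_channels=[192, 384, 768], start_level=0, num_outs=5))
```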
+ +## Citation + +```latex +@article{liu2021Swin, + title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, + author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + journal={arXiv preprint arXiv:2103.14030}, + year={2021} +} +``` diff --git a/mmdetection/configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py b/mmdetection/configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py new file mode 100644 index 0000000..4a3e8ad --- /dev/null +++ b/mmdetection/configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py @@ -0,0 +1,6 @@ +_base_ = './mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py' +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth' # noqa +model = dict( + backbone=dict( + depths=[2, 2, 18, 2], + init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py b/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py new file mode 100644 index 0000000..5471caa --- /dev/null +++ b/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py @@ -0,0 +1,60 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa +model = dict( + type='MaskRCNN', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[96, 192, 384, 768])) + +max_epochs = 12 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + }), + optimizer=dict( + _delete_=True, + type='AdamW', + lr=0.0001, + betas=(0.9, 0.999), + weight_decay=0.05)) diff --git a/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py b/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py new file mode 100644 index 0000000..622087b --- /dev/null +++ b/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py @@ -0,0 +1,3 @@ +_base_ = './mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py' +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
+optim_wrapper = dict(type='AmpOptimWrapper') diff --git a/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py b/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py new file mode 100644 index 0000000..7024b73 --- /dev/null +++ b/mmdetection/configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py @@ -0,0 +1,99 @@ +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +model = dict( + type='MaskRCNN', + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[96, 192, 384, 768])) + +# augmentation strategy originates from DETR / Sparse RCNN +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ }), + optimizer=dict( + _delete_=True, + type='AdamW', + lr=0.0001, + betas=(0.9, 0.999), + weight_decay=0.05)) diff --git a/mmdetection/configs/swin/metafile.yml b/mmdetection/configs/swin/metafile.yml new file mode 100644 index 0000000..763f930 --- /dev/null +++ b/mmdetection/configs/swin/metafile.yml @@ -0,0 +1,120 @@ +Models: + - Name: mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco + In Collection: Mask R-CNN + Config: configs/swin/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 11.9 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + Training Resources: 8x V100 GPUs + Architecture: + - Swin Transformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.2 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 43.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-s-p4-w7_fpn_fp16_ms-crop-3x_coco_20210903_104808-b92c91f1.pth + Paper: + URL: https://arxiv.org/abs/2107.08430 + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + README: configs/swin/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 + + - Name: mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco + In Collection: Mask R-CNN + Config: configs/swin/mask-rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 10.2 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + Training Resources: 8x V100 GPUs + Architecture: + - Swin Transformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth + Paper: + URL: https://arxiv.org/abs/2107.08430 + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + README: configs/swin/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 + + - Name: mask-rcnn_swin-t-p4-w7_fpn_1x_coco + In Collection: Mask R-CNN + Config: configs/swin/mask-rcnn_swin-t-p4-w7_fpn_1x_coco.py + Metadata: + Training Memory (GB): 7.6 + Epochs: 12 + Training Data: COCO + Training Techniques: + - AdamW + Training Resources: 8x V100 GPUs + Architecture: + - Swin Transformer + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.7 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_1x_coco/mask_rcnn_swin-t-p4-w7_fpn_1x_coco_20210902_120937-9d6b7cfa.pth + Paper: + URL: https://arxiv.org/abs/2107.08430 + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + README: configs/swin/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 + + - Name: mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco + In Collection: Mask R-CNN + Config: configs/swin/mask-rcnn_swin-t-p4-w7_fpn_amp-ms-crop-3x_coco.py + Metadata: + Training Memory (GB): 7.8 + Epochs: 36 + Training Data: COCO + Training Techniques: + - AdamW + Training Resources: 8x V100 GPUs + Architecture: + - Swin Transformer + Results: + - Task: 
Object Detection + Dataset: COCO + Metrics: + box AP: 46.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth + Paper: + URL: https://arxiv.org/abs/2107.08430 + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + README: configs/swin/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.16.0/mmdet/models/backbones/swin.py#L465 + Version: v2.16.0 diff --git a/mmdetection/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py b/mmdetection/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py new file mode 100644 index 0000000..2f40a87 --- /dev/null +++ b/mmdetection/configs/swin/retinanet_swin-t-p4-w7_fpn_1x_coco.py @@ -0,0 +1,31 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768], start_level=0, num_outs=5)) + +# optimizer +optim_wrapper = dict(optimizer=dict(lr=0.01)) diff --git a/mmdetection/configs/timm_example/README.md b/mmdetection/configs/timm_example/README.md new file mode 100644 index 0000000..848f8d3 --- /dev/null +++ b/mmdetection/configs/timm_example/README.md @@ -0,0 +1,62 @@ +# Timm Example + +> [PyTorch Image Models](https://github.com/rwightman/pytorch-image-models) + + + +## Abstract + +Py**T**orch **Im**age **M**odels (`timm`) is a collection of image models, layers, utilities, optimizers, schedulers, data-loaders / augmentations, and reference training / validation scripts that aim to pull together a wide variety of SOTA models with ability to reproduce ImageNet training results. + + + +## Results and Models + +### RetinaNet + +| Backbone | Style | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :-------------: | :-----: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------------: | :------: | +| R-50 | pytorch | 1x | | | | [config](./retinanet_timm-tv-resnet50_fpn_1x_coco.py) | | +| EfficientNet-B1 | - | 1x | | | | [config](./retinanet_timm-efficientnet-b1_fpn_1x_coco.py) | | + +## Usage + +### Install additional requirements + +MMDetection supports timm backbones via `TIMMBackbone`, a wrapper class in MMPretrain. +Thus, you need to install `mmpretrain` in addition to timm. +If you have already installed requirements for mmdet, run + +```shell +pip install 'dataclasses; python_version<"3.7"' +pip install timm +pip install mmpretrain +``` + +See [this document](https://mmpretrain.readthedocs.io/en/latest/get_started.html#installation) for the details of MMPretrain installation. + +### Edit config + +- See example configs for basic usage. 
+- See the documents of [timm feature extraction](https://rwightman.github.io/pytorch-image-models/feature_extraction/#multi-scale-feature-maps-feature-pyramid) and [TIMMBackbone](https://mmpretrain.readthedocs.io/en/latest/api/generated/mmpretrain.models.backbones.TIMMBackbone.html#mmpretrain.models.backbones.TIMMBackbone) for details. +- Which feature map is output depends on the backbone. + Please check `backbone out_channels` and `backbone out_strides` in your log, and modify `model.neck.in_channels` and `model.backbone.out_indices` if necessary. +- If you use Vision Transformer models that do not support `features_only=True`, add `custom_hooks = []` to your config to disable `NumClassCheckHook`. + +## Citation + +```latex +@misc{rw2019timm, + author = {Ross Wightman}, + title = {PyTorch Image Models}, + year = {2019}, + publisher = {GitHub}, + journal = {GitHub repository}, + doi = {10.5281/zenodo.4414861}, + howpublished = {\url{https://github.com/rwightman/pytorch-image-models}} +} +``` diff --git a/mmdetection/configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py b/mmdetection/configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py new file mode 100644 index 0000000..b87dddf --- /dev/null +++ b/mmdetection/configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py @@ -0,0 +1,23 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict( + imports=['mmpretrain.models'], allow_failed_imports=False) + +model = dict( + backbone=dict( + _delete_=True, + type='mmpretrain.TIMMBackbone', + model_name='efficientnet_b1', + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4)), + neck=dict(in_channels=[24, 40, 112, 320])) + +# optimizer +optim_wrapper = dict(optimizer=dict(lr=0.01)) diff --git a/mmdetection/configs/timm_example/retinanet_timm-tv-resnet50_fpn_1x_coco.py b/mmdetection/configs/timm_example/retinanet_timm-tv-resnet50_fpn_1x_coco.py new file mode 100644 index 0000000..74e4350 --- /dev/null +++ b/mmdetection/configs/timm_example/retinanet_timm-tv-resnet50_fpn_1x_coco.py @@ -0,0 +1,22 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict( + imports=['mmpretrain.models'], allow_failed_imports=False) + +model = dict( + backbone=dict( + _delete_=True, + type='mmpretrain.TIMMBackbone', + model_name='tv_resnet50', # ResNet-50 with torchvision weights + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4))) + +# optimizer +optim_wrapper = dict(optimizer=dict(lr=0.01)) diff --git a/mmdetection/configs/tood/README.md b/mmdetection/configs/tood/README.md new file mode 100644 index 0000000..9371d9d --- /dev/null +++ b/mmdetection/configs/tood/README.md @@ -0,0 +1,40 @@ +# TOOD + +> [TOOD: Task-aligned One-stage Object Detection](https://arxiv.org/abs/2108.07755) + + + +## Abstract + +One-stage object detection is commonly implemented by optimizing two sub-tasks: object classification and localization, using heads with two parallel branches, which might lead to a certain level of spatial misalignment in predictions between the two tasks. 
In this work, we propose a Task-aligned One-stage Object Detection (TOOD) that explicitly aligns the two tasks in a learning-based manner. First, we design a novel Task-aligned Head (T-Head) which offers a better balance between learning task-interactive and task-specific features, as well as a greater flexibility to learn the alignment via a task-aligned predictor. Second, we propose Task Alignment Learning (TAL) to explicitly pull closer (or even unify) the optimal anchors for the two tasks during training via a designed sample assignment scheme and a task-aligned loss. Extensive experiments are conducted on MS-COCO, where TOOD achieves a 51.1 AP at single-model single-scale testing. This surpasses the recent one-stage detectors by a large margin, such as ATSS (47.7 AP), GFL (48.2 AP), and PAA (49.0 AP), with fewer parameters and FLOPs. Qualitative results also demonstrate the effectiveness of TOOD for better aligning the tasks of object classification and localization. + +
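The core quantity in TAL is a task-alignment metric that scores each candidate anchor by combining its classification score for the ground-truth class with its IoU against the ground-truth box. A toy sketch follows (not the `TaskAlignedAssigner` implementation; the formula t = s**alpha * u**beta is from the paper, with alpha=1 and beta=6 as in the `train_cfg` of the configs below):

```python
import torch


def task_alignment_metric(cls_scores: torch.Tensor, ious: torch.Tensor,
                          alpha: float = 1.0, beta: float = 6.0) -> torch.Tensor:
    """t = s**alpha * u**beta for every (anchor, gt) pair.

    cls_scores: predicted score of the ground-truth class, shape (num_anchors, num_gts)
    ious:       IoU between the anchor's predicted box and each gt, same shape
    """
    return cls_scores.pow(alpha) * ious.pow(beta)


# Hypothetical 8 anchors and 2 ground-truth boxes.
t = task_alignment_metric(torch.rand(8, 2), torch.rand(8, 2))
# The assigner keeps the top-k best-aligned anchors per gt (topk=13 in the configs),
# and the normalized metric also serves as a soft classification target.
topk_idx = t.topk(4, dim=0).indices
```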
    + +## Results and Models + +| Backbone | Style | Anchor Type | Lr schd | Multi-scale Training | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------------: | :-----: | :----------: | :-----: | :------------------: | :------: | :------------: | :----: | :-------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | Anchor-free | 1x | N | 4.1 | | 42.4 | [config](./tood_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425.log) | +| R-50 | pytorch | Anchor-based | 1x | N | 4.1 | | 42.4 | [config](./tood_r50_fpn_anchor-based_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105-b776c134.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105.log) | +| R-50 | pytorch | Anchor-free | 2x | Y | 4.1 | | 44.5 | [config](./tood_r50_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231-3b23174c.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231.log) | +| R-101 | pytorch | Anchor-free | 2x | Y | 6.0 | | 46.1 | [config](./tood_r101_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232-a18f53c8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232.log) | +| R-101-dcnv2 | pytorch | Anchor-free | 2x | Y | 6.2 | | 49.3 | [config](./tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728-4a824142.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728.log) | +| X-101-64x4d | pytorch | Anchor-free | 2x | Y | 10.2 | | 47.6 | [config](./tood_x101-64x4d_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519-a4f36113.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519.log) | +| X-101-64x4d-dcnv2 | pytorch | Anchor-free | 2x | Y | | | | [config](./tood_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py) | [model](<>) \| [log](<>) | + +\[1\] *1x and 2x mean the model is trained for 90K and 180K iterations, respectively.* \ +\[2\] *All results are obtained with a single model and without any test time data augmentation such as multi-scale, flipping and etc..* \ +\[3\] *`dcnv2` denotes 
deformable convolutional networks v2.* \\ + +## Citation + +```latex +@inproceedings{feng2021tood, + title={TOOD: Task-aligned One-stage Object Detection}, + author={Feng, Chengjian and Zhong, Yujie and Gao, Yu and Scott, Matthew R and Huang, Weilin}, + booktitle={ICCV}, + year={2021} +} +``` diff --git a/mmdetection/configs/tood/metafile.yml b/mmdetection/configs/tood/metafile.yml new file mode 100644 index 0000000..d2bc080 --- /dev/null +++ b/mmdetection/configs/tood/metafile.yml @@ -0,0 +1,95 @@ +Collections: + - Name: TOOD + Metadata: + Training Data: COCO + Training Techniques: + - SGD + Training Resources: 8x V100 GPUs + Architecture: + - TOOD + Paper: + URL: https://arxiv.org/abs/2108.07755 + Title: 'TOOD: Task-aligned One-stage Object Detection' + README: configs/tood/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.20.0/mmdet/models/detectors/tood.py#L7 + Version: v2.20.0 + +Models: + - Name: tood_r101_fpn_ms-2x_coco + In Collection: TOOD + Config: configs/tood/tood_r101_fpn_ms-2x_coco.py + Metadata: + Training Memory (GB): 6.0 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.1 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_mstrain_2x_coco/tood_r101_fpn_mstrain_2x_coco_20211210_144232-a18f53c8.pth + + - Name: tood_x101-64x4d_fpn_ms-2x_coco + In Collection: TOOD + Config: configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py + Metadata: + Training Memory (GB): 10.2 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_x101_64x4d_fpn_mstrain_2x_coco/tood_x101_64x4d_fpn_mstrain_2x_coco_20211211_003519-a4f36113.pth + + - Name: tood_r101-dconv-c3-c5_fpn_ms-2x_coco + In Collection: TOOD + Config: configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py + Metadata: + Training Memory (GB): 6.2 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco/tood_r101_fpn_dconv_c3-c5_mstrain_2x_coco_20211210_213728-4a824142.pth + + - Name: tood_r50_fpn_anchor-based_1x_coco + In Collection: TOOD + Config: configs/tood/tood_r50_fpn_anchor-based_1x_coco.py + Metadata: + Training Memory (GB): 4.1 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_anchor_based_1x_coco/tood_r50_fpn_anchor_based_1x_coco_20211214_100105-b776c134.pth + + - Name: tood_r50_fpn_1x_coco + In Collection: TOOD + Config: configs/tood/tood_r50_fpn_1x_coco.py + Metadata: + Training Memory (GB): 4.1 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 42.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_1x_coco/tood_r50_fpn_1x_coco_20211210_103425-20e20746.pth + + - Name: tood_r50_fpn_ms-2x_coco + In Collection: TOOD + Config: configs/tood/tood_r50_fpn_ms-2x_coco.py + Metadata: + Training Memory (GB): 4.1 + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tood/tood_r50_fpn_mstrain_2x_coco/tood_r50_fpn_mstrain_2x_coco_20211210_144231-3b23174c.pth diff --git a/mmdetection/configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py b/mmdetection/configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py new file mode 100644 index 
0000000..45030a6 --- /dev/null +++ b/mmdetection/configs/tood/tood_r101-dconv-c3-c5_fpn_ms-2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './tood_r101_fpn_ms-2x_coco.py' + +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + bbox_head=dict(num_dcn=2)) diff --git a/mmdetection/configs/tood/tood_r101_fpn_ms-2x_coco.py b/mmdetection/configs/tood/tood_r101_fpn_ms-2x_coco.py new file mode 100644 index 0000000..fc6ae5d --- /dev/null +++ b/mmdetection/configs/tood/tood_r101_fpn_ms-2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './tood_r50_fpn_ms-2x_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/tood/tood_r50_fpn_1x_coco.py b/mmdetection/configs/tood/tood_r50_fpn_1x_coco.py new file mode 100644 index 0000000..e4839d9 --- /dev/null +++ b/mmdetection/configs/tood/tood_r50_fpn_1x_coco.py @@ -0,0 +1,80 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# model settings +model = dict( + type='TOOD', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5), + bbox_head=dict( + type='TOODHead', + num_classes=80, + in_channels=256, + stacked_convs=6, + feat_channels=256, + anchor_type='anchor_free', + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + initial_loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + activated=True, # use probability instead of logit as input + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + activated=True, # use probability instead of logit as input + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0)), + train_cfg=dict( + initial_epoch=4, + initial_assigner=dict(type='ATSSAssigner', topk=9), + assigner=dict(type='TaskAlignedAssigner', topk=13), + alpha=1, + beta=6, + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/tood/tood_r50_fpn_anchor-based_1x_coco.py b/mmdetection/configs/tood/tood_r50_fpn_anchor-based_1x_coco.py new file mode 100644 index 0000000..c7fbf6a --- /dev/null +++ b/mmdetection/configs/tood/tood_r50_fpn_anchor-based_1x_coco.py @@ -0,0 +1,2 @@ +_base_ = './tood_r50_fpn_1x_coco.py' +model = dict(bbox_head=dict(anchor_type='anchor_based')) diff --git a/mmdetection/configs/tood/tood_r50_fpn_ms-2x_coco.py b/mmdetection/configs/tood/tood_r50_fpn_ms-2x_coco.py new file mode 100644 index 
0000000..ffb296d --- /dev/null +++ b/mmdetection/configs/tood/tood_r50_fpn_ms-2x_coco.py @@ -0,0 +1,30 @@ +_base_ = './tood_r50_fpn_1x_coco.py' +max_epochs = 24 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +# training schedule for 2x +train_cfg = dict(max_epochs=max_epochs) + +# multi-scale training +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 480), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/tood/tood_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py b/mmdetection/configs/tood/tood_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py new file mode 100644 index 0000000..4340519 --- /dev/null +++ b/mmdetection/configs/tood/tood_x101-64x4d-dconv-c4-c5_fpn_ms-2x_coco.py @@ -0,0 +1,7 @@ +_base_ = './tood_x101-64x4d_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True), + ), + bbox_head=dict(num_dcn=2)) diff --git a/mmdetection/configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py b/mmdetection/configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py new file mode 100644 index 0000000..1651542 --- /dev/null +++ b/mmdetection/configs/tood/tood_x101-64x4d_fpn_ms-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './tood_r50_fpn_ms-2x_coco.py' + +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/tridentnet/README.md b/mmdetection/configs/tridentnet/README.md new file mode 100644 index 0000000..b972b3a --- /dev/null +++ b/mmdetection/configs/tridentnet/README.md @@ -0,0 +1,38 @@ +# TridentNet + +> [Scale-Aware Trident Networks for Object Detection](https://arxiv.org/abs/1901.01892) + + + +## Abstract + +Scale variation is one of the key challenges in object detection. In this work, we first present a controlled experiment to investigate the effect of receptive fields for scale variation in object detection. Based on the findings from the exploration experiments, we propose a novel Trident Network (TridentNet) aiming to generate scale-specific feature maps with a uniform representational power. We construct a parallel multi-branch architecture in which each branch shares the same transformation parameters but with different receptive fields. Then, we adopt a scale-aware training scheme to specialize each branch by sampling object instances of proper scales for training. As a bonus, a fast approximation version of TridentNet could achieve significant improvements without any additional parameters and computational cost compared with the vanilla detector. On the COCO dataset, our TridentNet with ResNet-101 backbone achieves state-of-the-art single-model results of 48.4 mAP. + +
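As a rough illustration of that weight sharing (a standalone sketch, not the `TridentResNet` block in mmdet), the snippet below applies one shared 3x3 convolution with the three dilations used by these configs, giving each branch a different receptive field at zero extra parameter cost:

```python
import torch
import torch.nn.functional as F


def trident_branches(x, weight, bias=None, dilations=(1, 2, 3)):
    """Run one shared 3x3 conv weight with several dilations.

    Every branch reuses exactly the same parameters; only the dilation (and the
    matching padding) changes, which enlarges the receptive field per branch.
    """
    # padding=d keeps the spatial size unchanged for a 3x3 kernel.
    return [F.conv2d(x, weight, bias, padding=d, dilation=d) for d in dilations]


x = torch.randn(1, 64, 32, 32)
w = torch.randn(128, 64, 3, 3)
print([o.shape for o in trident_branches(x, w)])  # three (1, 128, 32, 32) maps
```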
+ +## Results and Models + +We report the test results using only one branch for inference. + +| Backbone | Style | mstrain | Lr schd | Mem (GB) | Inf time (fps) | box AP | Download | +| :------: | :---: | :-----: | :-----: | :------: | :------------: | :----: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | caffe | N | 1x | | | 37.7 | [model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838.log.json) | +| R-50 | caffe | Y | 1x | | | 37.6 | [model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839-6ce55ccb.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839.log.json) | +| R-50 | caffe | Y | 3x | | | 40.3 | [model](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539-46d227ba.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539.log.json) | + +**Note** + +Similar to [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/projects/TridentNet), we haven't implemented the Scale-aware Training Scheme in section 4.2 of the paper.
+ +## Citation + +```latex +@InProceedings{li2019scale, + title={Scale-Aware Trident Networks for Object Detection}, + author={Li, Yanghao and Chen, Yuntao and Wang, Naiyan and Zhang, Zhaoxiang}, + journal={The International Conference on Computer Vision (ICCV)}, + year={2019} +} +``` diff --git a/mmdetection/configs/tridentnet/metafile.yml b/mmdetection/configs/tridentnet/metafile.yml new file mode 100644 index 0000000..c0081c5 --- /dev/null +++ b/mmdetection/configs/tridentnet/metafile.yml @@ -0,0 +1,55 @@ +Collections: + - Name: TridentNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - ResNet + - TridentNet Block + Paper: + URL: https://arxiv.org/abs/1901.01892 + Title: 'Scale-Aware Trident Networks for Object Detection' + README: configs/tridentnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.8.0/mmdet/models/detectors/trident_faster_rcnn.py#L6 + Version: v2.8.0 + +Models: + - Name: tridentnet_r50-caffe_1x_coco + In Collection: TridentNet + Config: configs/tridentnet/tridentnet_r50-caffe_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_1x_coco/tridentnet_r50_caffe_1x_coco_20201230_141838-2ec0b530.pth + + - Name: tridentnet_r50-caffe_ms-1x_coco + In Collection: TridentNet + Config: configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_1x_coco/tridentnet_r50_caffe_mstrain_1x_coco_20201230_141839-6ce55ccb.pth + + - Name: tridentnet_r50-caffe_ms-3x_coco + In Collection: TridentNet + Config: configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py + Metadata: + Epochs: 36 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.3 + Weights: https://download.openmmlab.com/mmdetection/v2.0/tridentnet/tridentnet_r50_caffe_mstrain_3x_coco/tridentnet_r50_caffe_mstrain_3x_coco_20201130_100539-46d227ba.pth diff --git a/mmdetection/configs/tridentnet/tridentnet_r50-caffe_1x_coco.py b/mmdetection/configs/tridentnet/tridentnet_r50-caffe_1x_coco.py new file mode 100644 index 0000000..26a4c12 --- /dev/null +++ b/mmdetection/configs/tridentnet/tridentnet_r50-caffe_1x_coco.py @@ -0,0 +1,22 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50-caffe-c4.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + type='TridentFasterRCNN', + backbone=dict( + type='TridentResNet', + trident_dilations=(1, 2, 3), + num_branch=3, + test_branch_idx=1, + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + roi_head=dict(type='TridentRoIHead', num_branch=3, test_branch_idx=1), + train_cfg=dict( + rpn_proposal=dict(max_per_img=500), + rcnn=dict( + sampler=dict(num=128, pos_fraction=0.5, + add_gt_as_proposals=False)))) diff --git a/mmdetection/configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py b/mmdetection/configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py new file mode 100644 index 0000000..806d20b --- /dev/null +++ b/mmdetection/configs/tridentnet/tridentnet_r50-caffe_ms-1x_coco.py @@ -0,0 +1,15 @@ +_base_ = 'tridentnet_r50-caffe_1x_coco.py' + +train_pipeline = [ + 
dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomChoiceResize', + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py b/mmdetection/configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py new file mode 100644 index 0000000..4de249c --- /dev/null +++ b/mmdetection/configs/tridentnet/tridentnet_r50-caffe_ms-3x_coco.py @@ -0,0 +1,18 @@ +_base_ = 'tridentnet_r50-caffe_ms-1x_coco.py' + +# learning rate +max_epochs = 36 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git a/mmdetection/configs/v3det/README.md b/mmdetection/configs/v3det/README.md new file mode 100644 index 0000000..3687931 --- /dev/null +++ b/mmdetection/configs/v3det/README.md @@ -0,0 +1,86 @@ +

+# V3Det: Vast Vocabulary Visual Detection Dataset
+
+Jiaqi Wang*, Pan Zhang*, Tao Chu*, Yuhang Cao*, Yujie Zhou, Tong Wu, Bin Wang, Conghui He, Dahua Lin (* equal contribution)
+
+Accepted to ICCV 2023 (Oral)
+
+Paper | [Dataset](https://v3det.openxlab.org.cn/)
    + + + +## Abstract + +Recent advances in detecting arbitrary objects in the real world are trained and evaluated on object detection datasets with a relatively restricted vocabulary. To facilitate the development of more general visual object detection, we propose V3Det, a vast vocabulary visual detection dataset with precisely annotated bounding boxes on massive images. V3Det has several appealing properties: 1) Vast Vocabulary: It contains bounding boxes of objects from 13,204 categories on real-world images, which is 10 times larger than the existing large vocabulary object detection dataset, e.g., LVIS. 2) Hierarchical Category Organization: The vast vocabulary of V3Det is organized by a hierarchical category tree which annotates the inclusion relationship among categories, encouraging the exploration of category relationships in vast and open vocabulary object detection. 3) Rich Annotations: V3Det comprises precisely annotated objects in 243k images and professional descriptions of each category written by human experts and a powerful chatbot. By offering a vast exploration space, V3Det enables extensive benchmarks on both vast and open vocabulary object detection, leading to new observations, practices, and insights for future research. It has the potential to serve as a cornerstone dataset for developing more general visual perception systems. V3Det is available at https://v3det.openxlab.org.cn/. + +## Prepare Dataset + +Please download and prepare V3Det Dataset at [V3Det Homepage](https://v3det.openxlab.org.cn/) and [V3Det Github](https://github.com/V3Det/V3Det). + +The data includes a training set, a validation set, comprising 13,204 categories. The training set consists of 183,354 images, while the validation set has 29,821 images. The data organization is: + +``` +data/ + V3Det/ + images/ + / + |────.png + ... + ... 
+ annotations/ + |────v3det_2023_v1_category_tree.json # Category tree + |────category_name_13204_v3det_2023_v1.txt # Category name + |────v3det_2023_v1_train.json # Train set + |────v3det_2023_v1_val.json # Validation set +``` + +## Results and Models + +| Backbone | Model | Lr schd | box AP | Config | Download | +| :------: | :-------------: | :-----: | :----: | :----------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------: | +| R-50 | Faster R-CNN | 2x | 25.4 | [config](./faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight//faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x) | +| R-50 | Cascade R-CNN | 2x | 31.6 | [config](./cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight//cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x) | +| R-50 | FCOS | 2x | 9.4 | [config](./fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight//fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x) | +| R-50 | Deformable-DETR | 50e | 34.4 | [config](./deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight/Deformable_DETR_V3Det_R50) | +| R-50 | DINO | 36e | 33.5 | [config](./dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight/DINO_V3Det_R50) | +| Swin-B | Faster R-CNN | 2x | 37.6 | [config](./faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight//faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x) | +| Swin-B | Cascade R-CNN | 2x | 42.5 | [config](./cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight//cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x) | +| Swin-B | FCOS | 2x | 21.0 | [config](./fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight//fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x) | +| Swin-B | Deformable-DETR | 50e | 42.5 | [config](./deformable-detr-refine-twostage_swin_16xb2_sample1e-3_v3det_50e.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight/Deformable_DETR_V3Det_SwinB) | +| Swin-B | DINO | 36e | 42.0 | [config](./dino-4scale_swin_16xb1_sample1e-3_v3det_36e.py) | [model](https://download.openxlab.org.cn/models/V3Det/V3Det/weight/DINO_V3Det_SwinB) | + +## Citation + +```latex +@inproceedings{wang2023v3det, + title = {V3Det: Vast Vocabulary Visual Detection Dataset}, + author = {Wang, Jiaqi and Zhang, Pan and Chu, Tao and Cao, Yuhang and Zhou, Yujie and Wu, Tong and Wang, Bin and He, Conghui and Lin, Dahua}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2023} +} +``` diff --git a/mmdetection/configs/v3det/cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmdetection/configs/v3det/cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py new file mode 100644 index 0000000..567c31b --- /dev/null +++ b/mmdetection/configs/v3det/cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py @@ -0,0 +1,171 @@ +_base_ = [ + '../_base_/models/cascade-rcnn_r50_fpn.py', '../_base_/datasets/v3det.py', + 
'../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + rpn_head=dict( + loss_bbox=dict(_delete_=True, type='L1Loss', loss_weight=1.0)), + roi_head=dict(bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=13204, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + cls_predictor_cfg=dict( + type='NormedLinear', tempearture=50, bias=True), + loss_cls=dict( + type='CrossEntropyCustomLoss', + num_classes=13204, + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=13204, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + cls_predictor_cfg=dict( + type='NormedLinear', tempearture=50, bias=True), + loss_cls=dict( + type='CrossEntropyCustomLoss', + num_classes=13204, + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=13204, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + cls_predictor_cfg=dict( + type='NormedLinear', tempearture=50, bias=True), + loss_cls=dict( + type='CrossEntropyCustomLoss', + num_classes=13204, + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn_proposal=dict(nms_pre=4000, max_per_img=2000), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1, + perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1, + perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1, + perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300))) +# dataset settings +train_dataloader = dict(batch_size=4, num_workers=8) + +# training schedule for 1x +max_iter = 68760 * 2 +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=max_iter) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 2048, + by_epoch=False, + begin=0, + end=5000), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[45840 * 2, 63030 * 
2], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(_delete_=True, type='AdamW', lr=1e-4 * 1, weight_decay=0.1), + clip_grad=dict(max_norm=35, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=32) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=5730 * 2)) +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) diff --git a/mmdetection/configs/v3det/cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmdetection/configs/v3det/cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py new file mode 100644 index 0000000..f649332 --- /dev/null +++ b/mmdetection/configs/v3det/cascade_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py @@ -0,0 +1,27 @@ +_base_ = [ + './cascade_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py', +] + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth' # noqa + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[128, 256, 512, 1024])) diff --git a/mmdetection/configs/v3det/deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py b/mmdetection/configs/v3det/deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py new file mode 100644 index 0000000..97544a2 --- /dev/null +++ b/mmdetection/configs/v3det/deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py @@ -0,0 +1,108 @@ +_base_ = '../deformable_detr/deformable-detr-refine-twostage_r50_16xb2-50e_coco.py' # noqa + +model = dict( + bbox_head=dict(num_classes=13204), + test_cfg=dict(max_per_img=300), +) + +data_root = 'data/V3Det/' +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + _delete_=True, + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + 
type='V3DetDataset', + data_root=data_root, + ann_file='annotations/v3det_2023_v1_train.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=False), + pipeline=train_pipeline, + backend_args=None))) +val_dataloader = dict( + dataset=dict( + type='V3DetDataset', + data_root=data_root, + ann_file='annotations/v3det_2023_v1_val.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +val_evaluator = dict( + ann_file=data_root + 'annotations/v3det_2023_v1_val.json', + use_mp_eval=True, + proposal_nums=[300]) +test_evaluator = val_evaluator + +# training schedule for 50e +# when using RFS, bs32, each epoch ~ 5730 iter +max_iter = 286500 +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=max_iter / 5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[229200], # 40e + gamma=0.1) +] + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=5730, + max_keep_ckpts=3)) + +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) diff --git a/mmdetection/configs/v3det/deformable-detr-refine-twostage_swin_16xb2_sample1e-3_v3det_50e.py b/mmdetection/configs/v3det/deformable-detr-refine-twostage_swin_16xb2_sample1e-3_v3det_50e.py new file mode 100644 index 0000000..e640cd6 --- /dev/null +++ b/mmdetection/configs/v3det/deformable-detr-refine-twostage_swin_16xb2_sample1e-3_v3det_50e.py @@ -0,0 +1,27 @@ +_base_ = 'deformable-detr-refine-twostage_r50_8xb4_sample1e-3_v3det_50e.py' + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth' # noqa + +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[256, 512, 1024]), +) + +train_dataloader = dict(batch_size=2, num_workers=2) diff --git a/mmdetection/configs/v3det/dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py b/mmdetection/configs/v3det/dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py new file mode 100644 index 0000000..d9e6e6b --- /dev/null +++ b/mmdetection/configs/v3det/dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py @@ -0,0 +1,109 @@ +_base_ = '../dino/dino-4scale_r50_8xb2-36e_coco.py' + +model = dict( + bbox_head=dict(num_classes=13204), + test_cfg=dict(max_per_img=300), +) + +data_root = 'data/V3Det/' +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + 
crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] +train_dataloader = dict( + _delete_=True, + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type='V3DetDataset', + data_root=data_root, + ann_file='annotations/v3det_2023_v1_train.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=False), + pipeline=train_pipeline, + backend_args=None))) +val_dataloader = dict( + dataset=dict( + type='V3DetDataset', + data_root=data_root, + ann_file='annotations/v3det_2023_v1_val.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +val_evaluator = dict( + ann_file=data_root + 'annotations/v3det_2023_v1_val.json', + use_mp_eval=True, + proposal_nums=[300]) +test_evaluator = val_evaluator + +# training schedule for 36e +# when using RFS, bs16, each epoch ~ 11460 iter +max_iter = 412560 +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=max_iter / 5) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[343800], # 30e + gamma=0.1) +] + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', + by_epoch=False, + interval=11460, + max_keep_ckpts=3)) + +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) diff --git a/mmdetection/configs/v3det/dino-4scale_swin_16xb1_sample1e-3_v3det_36e.py b/mmdetection/configs/v3det/dino-4scale_swin_16xb1_sample1e-3_v3det_36e.py new file mode 100644 index 0000000..100c4ba --- /dev/null +++ b/mmdetection/configs/v3det/dino-4scale_swin_16xb1_sample1e-3_v3det_36e.py @@ -0,0 +1,27 @@ +_base_ = 'dino-4scale_r50_8xb2_sample1e-3_v3det_36e.py' + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth' # noqa + +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[256, 512, 1024]), +) + +train_dataloader = dict(batch_size=1) diff --git a/mmdetection/configs/v3det/faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmdetection/configs/v3det/faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py new file mode 100644 index 0000000..3d306fb --- /dev/null +++ b/mmdetection/configs/v3det/faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py @@ -0,0 +1,72 @@ +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/datasets/v3det.py', + '../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + roi_head=dict( + bbox_head=dict( + num_classes=13204, + 
reg_class_agnostic=True, + cls_predictor_cfg=dict( + type='NormedLinear', tempearture=50, bias=True), + loss_cls=dict( + type='CrossEntropyCustomLoss', + num_classes=13204, + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn_proposal=dict(nms_pre=4000, max_per_img=2000), + rcnn=dict( + assigner=dict( + perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)))), + test_cfg=dict( + rcnn=dict( + score_thr=0.0001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300))) +# dataset settings +train_dataloader = dict(batch_size=4, num_workers=8) + +# training schedule for 2x +max_iter = 68760 * 2 +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=max_iter) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 2048, + by_epoch=False, + begin=0, + end=5000), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[45840 * 2, 63030 * 2], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(_delete_=True, type='AdamW', lr=1e-4 * 1, weight_decay=0.1), + clip_grad=dict(max_norm=35, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=32) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=5730 * 2)) +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) diff --git a/mmdetection/configs/v3det/faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmdetection/configs/v3det/faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py new file mode 100644 index 0000000..b0b1110 --- /dev/null +++ b/mmdetection/configs/v3det/faster_rcnn_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py @@ -0,0 +1,27 @@ +_base_ = [ + './faster_rcnn_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py', +] + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth' # noqa + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[128, 256, 512, 1024])) diff --git a/mmdetection/configs/v3det/fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmdetection/configs/v3det/fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py new file mode 100644 index 0000000..b78e38c --- /dev/null +++ b/mmdetection/configs/v3det/fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py @@ -0,0 +1,116 @@ +_base_ = [ + '../_base_/datasets/v3det.py', '../_base_/schedules/schedule_2x.py', + '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='FCOS', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + 
style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSHead', + num_classes=13204, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + cls_predictor_cfg=dict(type='NormedLinear', tempearture=50, bias=True), + loss_cls=dict( + type='FocalCustomLoss', + use_sigmoid=True, + num_classes=13204, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1, + perm_repeat_gt_cfg=dict(iou_thr=0.7, perm_range=0.01)), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.0001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300)) +# dataset settings + +backend_args = None + +train_dataloader = dict(batch_size=2, num_workers=8) + +# training schedule for 2x +max_iter = 68760 * 2 * 2 +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=max_iter) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1.0 / 2048, + by_epoch=False, + begin=0, + end=5000 * 2), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[45840 * 2 * 2, 63030 * 2 * 2], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + _delete_=True, type='AdamW', lr=1e-4 * 0.25, weight_decay=0.1), + clip_grad=dict(max_norm=35, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=32) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=5730 * 2)) +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False) + +find_unused_parameters = True diff --git a/mmdetection/configs/v3det/fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py b/mmdetection/configs/v3det/fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py new file mode 100644 index 0000000..6ca952a --- /dev/null +++ b/mmdetection/configs/v3det/fcos_swinb_fpn_8x4_sample1e-3_mstrain_v3det_2x.py @@ -0,0 +1,27 @@ +_base_ = [ + './fcos_r50_fpn_8x4_sample1e-3_mstrain_v3det_2x.py', +] + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth' # noqa + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[128, 256, 512, 1024], force_grad_on_level=True)) diff --git a/mmdetection/configs/vfnet/README.md b/mmdetection/configs/vfnet/README.md new file mode 100644 index 0000000..73b5c07 --- /dev/null +++ b/mmdetection/configs/vfnet/README.md @@ -0,0 +1,48 @@ +# VarifocalNet + +> [VarifocalNet: An IoU-aware Dense Object Detector](https://arxiv.org/abs/2008.13367) + + + +## Abstract + +Accurately ranking the vast number of candidate detections is crucial for dense object detectors to achieve high performance. Prior work uses the classification score or a combination of classification and predicted localization scores to rank candidates. However, neither option results in a reliable ranking, thus degrading detection performance. In this paper, we propose to learn an Iou-aware Classification Score (IACS) as a joint representation of object presence confidence and localization accuracy. We show that dense object detectors can achieve a more accurate ranking of candidate detections based on the IACS. We design a new loss function, named Varifocal Loss, to train a dense object detector to predict the IACS, and propose a new star-shaped bounding box feature representation for IACS prediction and bounding box refinement. Combining these two new components and a bounding box refinement branch, we build an IoU-aware dense object detector based on the FCOS+ATSS architecture, that we call VarifocalNet or VFNet for short. Extensive experiments on MS COCO show that our VFNet consistently surpasses the strong baseline by ∼2.0 AP with different backbones. Our best model VFNet-X-1200 with Res2Net-101-DCN achieves a single-model single-scale AP of 55.1 on COCO test-dev, which is state-of-the-art among various object detectors. + +
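+For intuition, the Varifocal Loss described above can be sketched in a few lines. This is a minimal illustration rather than the exact `mmdet` implementation; it assumes sigmoid scores, the `alpha=0.75` / `gamma=2.0` defaults used by the configs below, and a target `q` equal to the IoU with the matched ground-truth box for positives and 0 for negatives.
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def varifocal_loss(logits, q, alpha=0.75, gamma=2.0):
+    """Sketch of Varifocal Loss: asymmetrically re-weighted BCE.
+
+    Positives are weighted by their target IoU q, so high-quality
+    candidates dominate training; negatives are down-weighted by the
+    focal factor alpha * p**gamma.
+    """
+    p = logits.sigmoid()
+    pos = (q > 0).float()
+    weight = q * pos + alpha * p.pow(gamma) * (1 - pos)
+    bce = F.binary_cross_entropy_with_logits(logits, q, reduction='none')
+    return (weight * bce).sum()
+```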
    + +## Introduction + +**VarifocalNet (VFNet)** learns to predict the IoU-aware classification score which mixes the object presence confidence and localization accuracy together as the detection score for a bounding box. The learning is supervised by the proposed Varifocal Loss (VFL), based on a new star-shaped bounding box feature representation (the features at nine yellow sampling points). Given the new representation, the object localization accuracy is further improved by refining the initially regressed bounding box. The full paper is available at: [https://arxiv.org/abs/2008.13367](https://arxiv.org/abs/2008.13367). + +## Results and Models + +| Backbone | Style | DCN | MS train | Lr schd | Inf time (fps) | box AP (val) | box AP (test-dev) | Config | Download | +| :---------: | :-----: | :-: | :------: | :-----: | :------------: | :----------: | :---------------: | :---------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | pytorch | N | N | 1x | - | 41.6 | 41.6 | [config](./vfnet_r50_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco.json) | +| R-50 | pytorch | N | Y | 2x | - | 44.5 | 44.8 | [config](./vfnet_r50_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco_20201027-7cc75bd2.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco.json) | +| R-50 | pytorch | Y | Y | 2x | - | 47.8 | 48.0 | [config](./vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-6879c318.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.json) | +| R-101 | pytorch | N | N | 1x | - | 43.0 | 43.6 | [config](./vfnet_r101_fpn_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco_20201027pth-c831ece7.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco.json) | +| R-101 | pytorch | N | Y | 2x | - | 46.2 | 46.7 | [config](./vfnet_r101_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco_20201027pth-4a5d53f1.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco.json) | +| R-101 | pytorch | Y | Y | 2x | - | 49.0 | 49.2 | [config](./vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-7729adb5.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco.json) | +| X-101-32x4d | pytorch | Y | Y | 2x | - | 49.7 | 50.0 | [config](./vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-d300a6fc.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.json) | +| X-101-64x4d | pytorch | Y | Y | 2x | - | 50.4 | 50.8 | [config](./vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-b5f6da5e.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco.json) | + +**Notes:** + +- The MS-train scale range is 1333x\[480:960\] (`range` mode) and the inference scale keeps 1333x800. +- DCN means using `DCNv2` in both backbone and head. +- Inference time will be updated soon. +- More results and pre-trained models can be found in [VarifocalNet-Github](https://github.com/hyz-xmaster/VarifocalNet) + +## Citation + +```latex +@article{zhang2020varifocalnet, + title={VarifocalNet: An IoU-aware Dense Object Detector}, + author={Zhang, Haoyang and Wang, Ying and Dayoub, Feras and S{\"u}nderhauf, Niko}, + journal={arXiv preprint arXiv:2008.13367}, + year={2020} +} +``` diff --git a/mmdetection/configs/vfnet/metafile.yml b/mmdetection/configs/vfnet/metafile.yml new file mode 100644 index 0000000..1b791d0 --- /dev/null +++ b/mmdetection/configs/vfnet/metafile.yml @@ -0,0 +1,116 @@ +Collections: + - Name: VFNet + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + - Varifocal Loss + Paper: + URL: https://arxiv.org/abs/2008.13367 + Title: 'VarifocalNet: An IoU-aware Dense Object Detector' + README: configs/vfnet/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.6.0/mmdet/models/detectors/vfnet.py#L6 + Version: v2.6.0 + +Models: + - Name: vfnet_r50_fpn_1x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r50_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_1x_coco/vfnet_r50_fpn_1x_coco_20201027-38db6f58.pth + + - Name: vfnet_r50_fpn_ms-2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mstrain_2x_coco/vfnet_r50_fpn_mstrain_2x_coco_20201027-7cc75bd2.pth + + - Name: vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-6879c318.pth + + - 
Name: vfnet_r101_fpn_1x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r101_fpn_1x_coco.py + Metadata: + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.6 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_1x_coco/vfnet_r101_fpn_1x_coco_20201027pth-c831ece7.pth + + - Name: vfnet_r101_fpn_ms-2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mstrain_2x_coco/vfnet_r101_fpn_mstrain_2x_coco_20201027pth-4a5d53f1.pth + + - Name: vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-7729adb5.pth + + - Name: vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_32x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-d300a6fc.pth + + - Name: vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco + In Collection: VFNet + Config: configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py + Metadata: + Epochs: 24 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_x101_64x4d_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-b5f6da5e.pth diff --git a/mmdetection/configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py new file mode 100644 index 0000000..2dd67a3 --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_r101-mdconv-c3-c5_fpn_ms-2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/vfnet/vfnet_r101_fpn_1x_coco.py b/mmdetection/configs/vfnet/vfnet_r101_fpn_1x_coco.py new file mode 100644 index 0000000..b296a07 --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_r101_fpn_1x_coco.py @@ -0,0 +1,6 @@ +_base_ = './vfnet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/vfnet/vfnet_r101_fpn_2x_coco.py b/mmdetection/configs/vfnet/vfnet_r101_fpn_2x_coco.py new file mode 100644 index 0000000..37a7bac --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_r101_fpn_2x_coco.py @@ -0,0 +1,20 @@ +_base_ = './vfnet_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + depth=101, + 
init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) +# learning policy +max_epochs = 24 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py new file mode 100644 index 0000000..62f064b --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_r101_fpn_ms-2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './vfnet_r50_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py new file mode 100644 index 0000000..08adf92 --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py @@ -0,0 +1,6 @@ +_base_ = './vfnet_r50_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + bbox_head=dict(dcn_on_last_conv=True)) diff --git a/mmdetection/configs/vfnet/vfnet_r50_fpn_1x_coco.py b/mmdetection/configs/vfnet/vfnet_r50_fpn_1x_coco.py new file mode 100644 index 0000000..99bc3b5 --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_r50_fpn_1x_coco.py @@ -0,0 +1,104 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + type='VFNet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='VFNetHead', + num_classes=80, + in_channels=256, + stacked_convs=3, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + center_sampling=False, + dcn_on_last_conv=False, + use_atss=True, + use_vfl=True, + loss_cls=dict( + type='VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.5), + loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) + +# data setting +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1333, 800), 
keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.01), + paramwise_cfg=dict(bias_lr_mult=2., bias_decay_mult=0.), + clip_grad=None) +# learning rate +max_epochs = 12 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py new file mode 100644 index 0000000..0f8eed2 --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_r50_fpn_ms-2x_coco.py @@ -0,0 +1,36 @@ +_base_ = './vfnet_r50_fpn_1x_coco.py' +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', scale=[(1333, 480), (1333, 960)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader +# learning policy +max_epochs = 24 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +train_cfg = dict(max_epochs=max_epochs) diff --git a/mmdetection/configs/vfnet/vfnet_res2net-101_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_res2net-101_fpn_ms-2x_coco.py new file mode 100644 index 0000000..94288e8 --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_res2net-101_fpn_ms-2x_coco.py @@ -0,0 +1,16 @@ +_base_ = './vfnet_r50_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/mmdetection/configs/vfnet/vfnet_res2net101-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_res2net101-mdconv-c3-c5_fpn_ms-2x_coco.py new file mode 100644 index 0000000..269330d --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_res2net101-mdconv-c3-c5_fpn_ms-2x_coco.py @@ -0,0 +1,18 @@ +_base_ = './vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='Res2Net', + depth=101, + scales=4, + base_width=26, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', + 
checkpoint='open-mmlab://res2net101_v1d_26w_4s'))) diff --git a/mmdetection/configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py new file mode 100644 index 0000000..465da0c --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_x101-32x4d-mdconv-c3-c5_fpn_ms-2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/vfnet/vfnet_x101-32x4d_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_x101-32x4d_fpn_ms-2x_coco.py new file mode 100644 index 0000000..486bcfe --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_x101-32x4d_fpn_ms-2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './vfnet_r50_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py new file mode 100644 index 0000000..14a070e --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_x101-64x4d-mdconv-c3-c5_fpn_ms-2x_coco.py @@ -0,0 +1,17 @@ +_base_ = './vfnet_r50-mdconv-c3-c5_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/vfnet/vfnet_x101-64x4d_fpn_ms-2x_coco.py b/mmdetection/configs/vfnet/vfnet_x101-64x4d_fpn_ms-2x_coco.py new file mode 100644 index 0000000..92e3f71 --- /dev/null +++ b/mmdetection/configs/vfnet/vfnet_x101-64x4d_fpn_ms-2x_coco.py @@ -0,0 +1,15 @@ +_base_ = './vfnet_r50_fpn_ms-2x_coco.py' +model = dict( + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/configs/wider_face/README.md b/mmdetection/configs/wider_face/README.md new file mode 100644 index 0000000..1904506 --- /dev/null +++ b/mmdetection/configs/wider_face/README.md @@ -0,0 +1,57 @@ +# WIDER FACE + +> [WIDER FACE: A Face Detection Benchmark](https://arxiv.org/abs/1511.06523) + + + +## Abstract + +Face detection is one of the most studied topics in the computer vision community. Much of the progresses have been made by the availability of face detection benchmark datasets. 
We show that there is a gap between current face detection performance and real-world requirements. To facilitate future face detection research, we introduce the WIDER FACE dataset, which is 10 times larger than existing datasets. The dataset contains rich annotations, including occlusions, poses, event categories, and face bounding boxes. Faces in the proposed dataset are extremely challenging due to large variations in scale, pose and occlusion, as shown in Fig. 1. Furthermore, we show that the WIDER FACE dataset is an effective training source for face detection. We benchmark several representative detection systems, providing an overview of state-of-the-art performance, and propose a solution to deal with large scale variation. Finally, we discuss common failure cases that are worth further investigation. + +
    + +## Introduction + +To use the WIDER Face dataset you need to download it +and extract to the `data/WIDERFace` folder. Annotation in the VOC format +can be found in this [repo](https://github.com/sovrasov/wider-face-pascal-voc-annotations.git). +You should move the annotation files from `WIDER_train_annotations` and `WIDER_val_annotations` folders +to the `Annotation` folders inside the corresponding directories `WIDER_train` and `WIDER_val`. +Also annotation lists `val.txt` and `train.txt` should be copied to `data/WIDERFace` from `WIDER_train_annotations` and `WIDER_val_annotations`. +The directory should be like this: + +``` +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── WIDERFace +│ │ ├── WIDER_train +│ | │ ├──0--Parade +│ | │ ├── ... +│ | │ ├── Annotations +│ │ ├── WIDER_val +│ | │ ├──0--Parade +│ | │ ├── ... +│ | │ ├── Annotations +│ │ ├── val.txt +│ │ ├── train.txt + +``` + +After that you can train the SSD300 on WIDER by launching training with the `ssd300_wider_face.py` config or +create your own config based on the presented one. + +## Citation + +```latex +@inproceedings{yang2016wider, + Author = {Yang, Shuo and Luo, Ping and Loy, Chen Change and Tang, Xiaoou}, + Booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + Title = {WIDER FACE: A Face Detection Benchmark}, + Year = {2016} +} +``` diff --git a/mmdetection/configs/wider_face/retinanet_r50_fpn_1x_widerface.py b/mmdetection/configs/wider_face/retinanet_r50_fpn_1x_widerface.py new file mode 100644 index 0000000..7806725 --- /dev/null +++ b/mmdetection/configs/wider_face/retinanet_r50_fpn_1x_widerface.py @@ -0,0 +1,10 @@ +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/wider_face.py', '../_base_/schedules/schedule_1x.py', + '../_base_/default_runtime.py' +] +# model settings +model = dict(bbox_head=dict(num_classes=1)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) diff --git a/mmdetection/configs/wider_face/ssd300_8xb32-24e_widerface.py b/mmdetection/configs/wider_face/ssd300_8xb32-24e_widerface.py new file mode 100644 index 0000000..02c3c92 --- /dev/null +++ b/mmdetection/configs/wider_face/ssd300_8xb32-24e_widerface.py @@ -0,0 +1,64 @@ +_base_ = [ + '../_base_/models/ssd300.py', '../_base_/datasets/wider_face.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_2x.py' +] +model = dict(bbox_head=dict(num_classes=1)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict( + type='Expand', + mean={{_base_.model.data_preprocessor.mean}}, + to_rgb={{_base_.model.data_preprocessor.bgr_to_rgb}}, + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(300, 300), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='Resize', scale=(300, 300), keep_ratio=False), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +dataset_type = 'WIDERFaceDataset' +data_root = 'data/WIDERFace/' +train_dataloader = dict( + 
batch_size=32, num_workers=8, dataset=dict(pipeline=train_pipeline)) + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict(type='MultiStepLR', by_epoch=True, milestones=[16, 20], gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + optimizer=dict(lr=0.012, momentum=0.9, weight_decay=5e-4), + clip_grad=dict(max_norm=35, norm_type=2)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (32 samples per GPU) +auto_scale_lr = dict(base_batch_size=256) diff --git a/mmdetection/configs/yolact/README.md b/mmdetection/configs/yolact/README.md new file mode 100644 index 0000000..e884ad6 --- /dev/null +++ b/mmdetection/configs/yolact/README.md @@ -0,0 +1,75 @@ +# YOLACT + +> [YOLACT: Real-time Instance Segmentation](https://arxiv.org/abs/1904.02689) + + + +## Abstract + +We present a simple, fully-convolutional model for real-time instance segmentation that achieves 29.8 mAP on MS COCO at 33.5 fps evaluated on a single Titan Xp, which is significantly faster than any previous competitive approach. Moreover, we obtain this result after training on only one GPU. We accomplish this by breaking instance segmentation into two parallel subtasks: (1) generating a set of prototype masks and (2) predicting per-instance mask coefficients. Then we produce instance masks by linearly combining the prototypes with the mask coefficients. We find that because this process doesn't depend on repooling, this approach produces very high-quality masks and exhibits temporal stability for free. Furthermore, we analyze the emergent behavior of our prototypes and show they learn to localize instances on their own in a translation variant manner, despite being fully-convolutional. Finally, we also propose Fast NMS, a drop-in 12 ms faster replacement for standard NMS that only has a marginal performance penalty. + +
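+The assembly step described above (a linear combination of shared prototype masks with per-instance coefficients) is small enough to sketch directly. The shapes below are illustrative; `k=32` matches the `num_protos` setting in the configs that follow, and this is not the actual `YOLACTProtonet` code.
+
+```python
+import torch
+
+# k prototype masks shared across the image, one coefficient vector per detection
+prototypes = torch.randn(32, 138, 138)   # (k, H, W); e.g. 138x138 protos for a 550x550 input
+coefficients = torch.randn(100, 32)      # (num_detections, k)
+
+# linear combination of the prototypes followed by a sigmoid gives one soft mask per box
+masks = torch.sigmoid(
+    torch.einsum('nk,khw->nhw', coefficients, prototypes))  # (num_detections, H, W)
+```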
    + +## Introduction + +A simple, fully convolutional model for real-time instance segmentation. This is the code for our paper: + +- [YOLACT: Real-time Instance Segmentation](https://arxiv.org/abs/1904.02689) + + + +For a real-time demo, check out our ICCV video: +[![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/0pMfmo8qfpQ/0.jpg)](https://www.youtube.com/watch?v=0pMfmo8qfpQ) + +## Evaluation + +Here are our YOLACT models along with their FPS on a Titan Xp and mAP on COCO's `val`: + +| Image Size | GPU x BS | Backbone | \*FPS | mAP | Weights | Configs | Download | +| :--------: | :------: | :-----------: | :---: | :--: | :-----: | :--------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: | +| 550 | 1x8 | Resnet50-FPN | 42.5 | 29.0 | | [config](./yolact_r50_1xb8-55e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_1x8_coco/yolact_r50_1x8_coco_20200908-f38d58df.pth) | +| 550 | 8x8 | Resnet50-FPN | 42.5 | 28.4 | | [config](./yolact_r50_8xb8-55e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_8x8_coco/yolact_r50_8x8_coco_20200908-ca34f5db.pth) | +| 550 | 1x8 | Resnet101-FPN | 33.5 | 30.4 | | [config](./yolact_r101_1xb8-55e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r101_1x8_coco/yolact_r101_1x8_coco_20200908-4cbe9101.pth) | + +\*Note: The FPS is evaluated by the [original implementation](https://github.com/dbolya/yolact). When calculating FPS, only the model inference time is taken into account. Data loading and post-processing operations such as converting masks to RLE code, generating COCO JSON results, image rendering are not included. + +## Training + +All the aforementioned models are trained with a single GPU. It typically takes ~12GB VRAM when using resnet-101 as the backbone. If you want to try multiple GPUs training, you may have to modify the configuration files accordingly, such as adjusting the training schedule and freezing batch norm. + +```Shell +# Trains using the resnet-101 backbone with a batch size of 8 on a single GPU. +./tools/dist_train.sh configs/yolact/yolact_r101.py 1 +``` + +## Testing + +Please refer to [mmdetection/docs/getting_started.md](https://mmdetection.readthedocs.io/en/latest/1_exist_data_model.html#test-existing-models). 
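+For a quick sanity check outside the test scripts, single-image inference can also be run from Python. The paths and checkpoint filename below are placeholders for a downloaded model:
+
+```python
+from mmdet.apis import inference_detector, init_detector
+
+# build the model from its config plus a downloaded checkpoint, then run one image
+model = init_detector('configs/yolact/yolact_r50_1xb8-55e_coco.py',
+                      'yolact_r50_1x8_coco_20200908-f38d58df.pth',
+                      device='cuda:0')
+result = inference_detector(model, 'demo/demo.jpg')
+```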
+ +## Citation + +If you use YOLACT or this code base in your work, please cite + +```latex +@inproceedings{yolact-iccv2019, + author = {Daniel Bolya and Chong Zhou and Fanyi Xiao and Yong Jae Lee}, + title = {YOLACT: {Real-time} Instance Segmentation}, + booktitle = {ICCV}, + year = {2019}, +} +``` + + diff --git a/mmdetection/configs/yolact/metafile.yml b/mmdetection/configs/yolact/metafile.yml new file mode 100644 index 0000000..9ca76b3 --- /dev/null +++ b/mmdetection/configs/yolact/metafile.yml @@ -0,0 +1,81 @@ +Collections: + - Name: YOLACT + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - FPN + - ResNet + Paper: + URL: https://arxiv.org/abs/1904.02689 + Title: 'YOLACT: Real-time Instance Segmentation' + README: configs/yolact/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.5.0/mmdet/models/detectors/yolact.py#L9 + Version: v2.5.0 + +Models: + - Name: yolact_r50_1x8_coco + In Collection: YOLACT + Config: configs/yolact/yolact_r50_1xb8-55e_coco.py + Metadata: + Training Resources: 1x V100 GPU + Batch Size: 8 + Epochs: 55 + inference time (ms/im): + - value: 23.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (550, 550) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 29.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_1x8_coco/yolact_r50_1x8_coco_20200908-f38d58df.pth + + - Name: yolact_r50_8x8_coco + In Collection: YOLACT + Config: configs/yolact/yolact_r50_8xb8-55e_coco.py + Metadata: + Batch Size: 64 + Epochs: 55 + inference time (ms/im): + - value: 23.53 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (550, 550) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 28.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r50_8x8_coco/yolact_r50_8x8_coco_20200908-ca34f5db.pth + + - Name: yolact_r101_1x8_coco + In Collection: YOLACT + Config: configs/yolact/yolact_r101_1xb8-55e_coco.py + Metadata: + Training Resources: 1x V100 GPU + Batch Size: 8 + Epochs: 55 + inference time (ms/im): + - value: 29.85 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (550, 550) + Results: + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 30.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolact/yolact_r101_1x8_coco/yolact_r101_1x8_coco_20200908-4cbe9101.pth diff --git a/mmdetection/configs/yolact/yolact_r101_1xb8-55e_coco.py b/mmdetection/configs/yolact/yolact_r101_1xb8-55e_coco.py new file mode 100644 index 0000000..e6ffe29 --- /dev/null +++ b/mmdetection/configs/yolact/yolact_r101_1xb8-55e_coco.py @@ -0,0 +1,7 @@ +_base_ = './yolact_r50_1xb8-55e_coco.py' + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/configs/yolact/yolact_r50_1xb8-55e_coco.py b/mmdetection/configs/yolact/yolact_r50_1xb8-55e_coco.py new file mode 100644 index 0000000..b7dabf1 --- /dev/null +++ b/mmdetection/configs/yolact/yolact_r50_1xb8-55e_coco.py @@ -0,0 +1,170 @@ +_base_ = [ + '../_base_/datasets/coco_instance.py', '../_base_/default_runtime.py' +] +img_norm_cfg = dict( + mean=[123.68, 116.78, 103.94], std=[58.40, 57.12, 57.38], to_rgb=True) +# model settings +input_size = 550 +model = dict( + type='YOLACT', + data_preprocessor=dict( + type='DetDataPreprocessor', + 
mean=img_norm_cfg['mean'], + std=img_norm_cfg['std'], + bgr_to_rgb=img_norm_cfg['to_rgb'], + pad_mask=True), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, # do not freeze stem + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, # update the statistics of bn + zero_init_residual=False, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5, + upsample_cfg=dict(mode='bilinear')), + bbox_head=dict( + type='YOLACTHead', + num_classes=80, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=3, + scales_per_octave=1, + base_sizes=[8, 16, 32, 64, 128], + ratios=[0.5, 1.0, 2.0], + strides=[550.0 / x for x in [69, 35, 18, 9, 5]], + centers=[(550 * 0.5 / x, 550 * 0.5 / x) + for x in [69, 35, 18, 9, 5]]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + reduction='none', + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.5), + num_head_convs=1, + num_protos=32, + use_ohem=True), + mask_head=dict( + type='YOLACTProtonet', + in_channels=256, + num_protos=32, + num_classes=80, + max_masks_to_train=100, + loss_mask_weight=6.125, + with_seg_branch=True, + loss_segm=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0., + ignore_iof_thr=-1, + gt_max_assign_all=False), + sampler=dict(type='PseudoSampler'), # YOLACT should use PseudoSampler + # smoothl1_beta=1., + allowed_border=-1, + pos_weight=-1, + neg_pos_ratio=3, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + mask_thr=0.5, + iou_thr=0.5, + top_k=200, + max_per_img=100, + mask_thr_binary=0.5)) +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(4.0, 4.0)), + dict( + type='Expand', + mean=img_norm_cfg['mean'], + to_rgb=img_norm_cfg['to_rgb'], + ratio_range=(1, 4)), + dict( + type='MinIoURandomCrop', + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict( + type='PhotoMetricDistortion', + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(input_size, input_size), keep_ratio=False), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=8, + num_workers=4, + batch_sampler=None, + dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +max_epochs = 55 +# training schedule for 55e +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, 
val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# learning rate +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20, 42, 49, 52], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=1e-3, momentum=0.9, weight_decay=5e-4)) + +custom_hooks = [ + dict(type='CheckInvalidLossHook', interval=50, priority='VERY_LOW') +] + +env_cfg = dict(cudnn_benchmark=True) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (1 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=8) diff --git a/mmdetection/configs/yolact/yolact_r50_8xb8-55e_coco.py b/mmdetection/configs/yolact/yolact_r50_8xb8-55e_coco.py new file mode 100644 index 0000000..e39c285 --- /dev/null +++ b/mmdetection/configs/yolact/yolact_r50_8xb8-55e_coco.py @@ -0,0 +1,23 @@ +_base_ = 'yolact_r50_1xb8-55e_coco.py' + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(lr=8e-3), + clip_grad=dict(max_norm=35, norm_type=2)) +# learning rate +max_epochs = 55 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20, 42, 49, 52], + gamma=0.1) +] +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/yolo/README.md b/mmdetection/configs/yolo/README.md new file mode 100644 index 0000000..9cb47bc --- /dev/null +++ b/mmdetection/configs/yolo/README.md @@ -0,0 +1,55 @@ +# YOLOv3 + +> [YOLOv3: An Incremental Improvement](https://arxiv.org/abs/1804.02767) + + + +## Abstract + +We present some updates to YOLO! We made a bunch of little design changes to make it better. We also trained this new network that's pretty swell. It's a little bigger than last time but more accurate. It's still fast though, don't worry. At 320x320 YOLOv3 runs in 22 ms at 28.2 mAP, as accurate as SSD but three times faster. When we look at the old .5 IOU mAP detection metric YOLOv3 is quite good. It achieves 57.9 mAP@50 in 51 ms on a Titan X, compared to 57.5 mAP@50 in 198 ms by RetinaNet, similar performance but 3.8x faster. + +
+
    + +## Results and Models + +| Backbone | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :--------: | :---: | :-----: | :------: | :------------: | :----: | :---------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| DarkNet-53 | 320 | 273e | 2.7 | 63.9 | 27.9 | [config](./yolov3_d53_8xb8-320-273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-421362b6.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-20200819_172101.log.json) | +| DarkNet-53 | 416 | 273e | 3.8 | 61.2 | 30.9 | [config](./yolov3_d53_8xb8-ms-416-273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-2b60fcd9.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-20200819_173424.log.json) | +| DarkNet-53 | 608 | 273e | 7.4 | 48.1 | 33.7 | [config](./yolov3_d53_8xb8-ms-608-273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020-a2c3acb8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020.log.json) | + +## Mixed Precision Training + +We also train YOLOv3 with mixed precision training. 
+ +| Backbone | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :--------: | :---: | :-----: | :------: | :------------: | :----: | :-------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| DarkNet-53 | 608 | 273e | 4.7 | 48.1 | 33.8 | [config](./yolov3_d53_8xb8-amp-ms-608-273e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542-4bc34944.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542.log.json) | + +## Lightweight models + +| Backbone | Scale | Lr schd | Mem (GB) | Inf time (fps) | box AP | Config | Download | +| :---------: | :---: | :-----: | :------: | :------------: | :----: | :------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| MobileNetV2 | 416 | 300e | 5.3 | | 23.9 | [config](./yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823-f68a07b3.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823.log.json) | +| MobileNetV2 | 320 | 300e | 3.2 | | 22.2 | [config](./yolov3_mobilenetv2_8xb24-320-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349.log.json) | + +Notice: We reduce the number of channels to 96 in both head and neck. It can reduce the flops and parameters, which makes these models more suitable for edge devices. + +## Credit + +This implementation originates from the project of Haoyu Wu(@wuhy08) at Western Digital. 
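+
+As a pointer for readers of this patch: the model listed under "Mixed Precision Training" above is trained with the `yolov3_d53_8xb8-amp-ms-608-273e_coco.py` config added later in this patch, which only swaps the optimizer wrapper of the 608 base config. A minimal sketch of that override (no settings beyond what the patch itself contains):
+
+```python
+# yolov3_d53_8xb8-amp-ms-608-273e_coco.py
+_base_ = './yolov3_d53_8xb8-ms-608-273e_coco.py'
+
+# AmpOptimWrapper runs training with torch.cuda.amp mixed precision;
+# loss_scale='dynamic' selects dynamic loss scaling.
+optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic')
+```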
+ +## Citation + +```latex +@misc{redmon2018yolov3, + title={YOLOv3: An Incremental Improvement}, + author={Joseph Redmon and Ali Farhadi}, + year={2018}, + eprint={1804.02767}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/mmdetection/configs/yolo/metafile.yml b/mmdetection/configs/yolo/metafile.yml new file mode 100644 index 0000000..627e70c --- /dev/null +++ b/mmdetection/configs/yolo/metafile.yml @@ -0,0 +1,124 @@ +Collections: + - Name: YOLOv3 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - DarkNet + Paper: + URL: https://arxiv.org/abs/1804.02767 + Title: 'YOLOv3: An Incremental Improvement' + README: configs/yolo/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.4.0/mmdet/models/detectors/yolo.py#L8 + Version: v2.4.0 + +Models: + - Name: yolov3_d53_320_273e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_d53_8xb8-320-273e_coco.py + Metadata: + Training Memory (GB): 2.7 + inference time (ms/im): + - value: 15.65 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (320, 320) + Epochs: 273 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 27.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_320_273e_coco/yolov3_d53_320_273e_coco-421362b6.pth + + - Name: yolov3_d53_mstrain-416_273e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py + Metadata: + Training Memory (GB): 3.8 + inference time (ms/im): + - value: 16.34 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (416, 416) + Epochs: 273 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 30.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-416_273e_coco/yolov3_d53_mstrain-416_273e_coco-2b60fcd9.pth + + - Name: yolov3_d53_mstrain-608_273e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py + Metadata: + Training Memory (GB): 7.4 + inference time (ms/im): + - value: 20.79 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (608, 608) + Epochs: 273 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 33.7 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_mstrain-608_273e_coco/yolov3_d53_mstrain-608_273e_coco_20210518_115020-a2c3acb8.pth + + - Name: yolov3_d53_fp16_mstrain-608_273e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py + Metadata: + Training Memory (GB): 4.7 + inference time (ms/im): + - value: 20.79 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP16 + resolution: (608, 608) + Epochs: 273 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 33.8 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_d53_fp16_mstrain-608_273e_coco/yolov3_d53_fp16_mstrain-608_273e_coco_20210517_213542-4bc34944.pth + + - Name: yolov3_mobilenetv2_8xb24-320-300e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py + Metadata: + Training Memory (GB): 3.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 22.2 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_320_300e_coco/yolov3_mobilenetv2_320_300e_coco_20210719_215349-d18dff72.pth + + - Name: 
yolov3_mobilenetv2_8xb24-ms-416-300e_coco + In Collection: YOLOv3 + Config: configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py + Metadata: + Training Memory (GB): 5.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 23.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolo/yolov3_mobilenetv2_mstrain-416_300e_coco/yolov3_mobilenetv2_mstrain-416_300e_coco_20210718_010823-f68a07b3.pth diff --git a/mmdetection/configs/yolo/yolov3_d53_8xb8-320-273e_coco.py b/mmdetection/configs/yolo/yolov3_d53_8xb8-320-273e_coco.py new file mode 100644 index 0000000..a3d08dd --- /dev/null +++ b/mmdetection/configs/yolo/yolov3_d53_8xb8-320-273e_coco.py @@ -0,0 +1,29 @@ +_base_ = './yolov3_d53_8xb8-ms-608-273e_coco.py' + +input_size = (320, 320) +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + # `mean` and `to_rgb` should be the same with the `preprocess_cfg` + dict(type='Expand', mean=[0, 0, 0], to_rgb=True, ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=input_size, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=input_size, keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py b/mmdetection/configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py new file mode 100644 index 0000000..173d8ee --- /dev/null +++ b/mmdetection/configs/yolo/yolov3_d53_8xb8-amp-ms-608-273e_coco.py @@ -0,0 +1,3 @@ +_base_ = './yolov3_d53_8xb8-ms-608-273e_coco.py' +# fp16 settings +optim_wrapper = dict(type='AmpOptimWrapper', loss_scale='dynamic') diff --git a/mmdetection/configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py b/mmdetection/configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py new file mode 100644 index 0000000..ca0127e --- /dev/null +++ b/mmdetection/configs/yolo/yolov3_d53_8xb8-ms-416-273e_coco.py @@ -0,0 +1,28 @@ +_base_ = './yolov3_d53_8xb8-ms-608-273e_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + # `mean` and `to_rgb` should be the same with the `preprocess_cfg` + dict(type='Expand', mean=[0, 0, 0], to_rgb=True, ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='RandomResize', scale=[(320, 320), (416, 416)], keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(416, 416), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = 
dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py b/mmdetection/configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py new file mode 100644 index 0000000..d4a36df --- /dev/null +++ b/mmdetection/configs/yolo/yolov3_d53_8xb8-ms-608-273e_coco.py @@ -0,0 +1,167 @@ +_base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'] +# model settings +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[0, 0, 0], + std=[255., 255., 255.], + bgr_to_rgb=True, + pad_size_divisor=32) +model = dict( + type='YOLOV3', + data_preprocessor=data_preprocessor, + backbone=dict( + type='Darknet', + depth=53, + out_indices=(3, 4, 5), + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://darknet53')), + neck=dict( + type='YOLOV3Neck', + num_scales=3, + in_channels=[1024, 512, 256], + out_channels=[512, 256, 128]), + bbox_head=dict( + type='YOLOV3Head', + num_classes=80, + in_channels=[512, 256, 128], + out_channels=[1024, 512, 256], + anchor_generator=dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_conf=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_xy=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=2.0, + reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='GridAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=data_preprocessor['mean'], + to_rgb=data_preprocessor['bgr_to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='RandomResize', scale=[(320, 320), (608, 608)], keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(608, 608), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + 
sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +train_cfg = dict(max_epochs=273, val_interval=7) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning policy +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=2000), + dict(type='MultiStepLR', by_epoch=True, milestones=[218, 246], gamma=0.1) +] + +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=7)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py b/mmdetection/configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py new file mode 100644 index 0000000..07b3937 --- /dev/null +++ b/mmdetection/configs/yolo/yolov3_mobilenetv2_8xb24-320-300e_coco.py @@ -0,0 +1,42 @@ +_base_ = ['./yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py'] + +# yapf:disable +model = dict( + bbox_head=dict( + anchor_generator=dict( + base_sizes=[[(220, 125), (128, 222), (264, 266)], + [(35, 87), (102, 96), (60, 170)], + [(10, 15), (24, 36), (72, 42)]]))) +# yapf:enable + +input_size = (320, 320) +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + # `mean` and `to_rgb` should be the same with the `preprocess_cfg` + dict( + type='Expand', + mean=[123.675, 116.28, 103.53], + to_rgb=True, + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='Resize', scale=input_size, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=input_size, keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py b/mmdetection/configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py new file mode 100644 index 0000000..9a161b6 --- /dev/null +++ 
b/mmdetection/configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py @@ -0,0 +1,176 @@ +_base_ = ['../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'] +# model settings +data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32) +model = dict( + type='YOLOV3', + data_preprocessor=data_preprocessor, + backbone=dict( + type='MobileNetV2', + out_indices=(2, 4, 6), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://mmdet/mobilenet_v2')), + neck=dict( + type='YOLOV3Neck', + num_scales=3, + in_channels=[320, 96, 32], + out_channels=[96, 96, 96]), + bbox_head=dict( + type='YOLOV3Head', + num_classes=80, + in_channels=[96, 96, 96], + out_channels=[96, 96, 96], + anchor_generator=dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder=dict(type='YOLOBBoxCoder'), + featmap_strides=[32, 16, 8], + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_conf=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0, + reduction='sum'), + loss_xy=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=2.0, + reduction='sum'), + loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='GridAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0)), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + conf_thr=0.005, + nms=dict(type='nms', iou_threshold=0.45), + max_per_img=100)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Expand', + mean=data_preprocessor['mean'], + to_rgb=data_preprocessor['bgr_to_rgb'], + ratio_range=(1, 2)), + dict( + type='MinIoURandomCrop', + min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9), + min_crop_size=0.3), + dict(type='RandomResize', scale=[(320, 320), (416, 416)], keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=(416, 416), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=24, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='RepeatDataset', # use RepeatDataset to speed up training + times=10, + dataset=dict( + 
type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=24, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +train_cfg = dict(max_epochs=30) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.003, momentum=0.9, weight_decay=0.0005), + clip_grad=dict(max_norm=35, norm_type=2)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=False, + begin=0, + end=4000), + dict(type='MultiStepLR', by_epoch=True, milestones=[24, 28], gamma=0.1) +] + +find_unused_parameters = True + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (24 samples per GPU) +auto_scale_lr = dict(base_batch_size=192) diff --git a/mmdetection/configs/yolof/README.md b/mmdetection/configs/yolof/README.md new file mode 100644 index 0000000..b9167f6 --- /dev/null +++ b/mmdetection/configs/yolof/README.md @@ -0,0 +1,35 @@ +# YOLOF + +> [You Only Look One-level Feature](https://arxiv.org/abs/2103.09460) + + + +## Abstract + +This paper revisits feature pyramids networks (FPN) for one-stage detectors and points out that the success of FPN is due to its divide-and-conquer solution to the optimization problem in object detection rather than multi-scale feature fusion. From the perspective of optimization, we introduce an alternative way to address the problem instead of adopting the complex feature pyramids - {\\em utilizing only one-level feature for detection}. Based on the simple and efficient solution, we present You Only Look One-level Feature (YOLOF). In our method, two key components, Dilated Encoder and Uniform Matching, are proposed and bring considerable improvements. Extensive experiments on the COCO benchmark prove the effectiveness of the proposed model. Our YOLOF achieves comparable results with its feature pyramids counterpart RetinaNet while being 2.5× faster. Without transformer layers, YOLOF can match the performance of DETR in a single-level feature manner with 7× less training epochs. With an image size of 608×608, YOLOF achieves 44.3 mAP running at 60 fps on 2080Ti, which is 13% faster than YOLOv4. + +
+
    + +## Results and Models + +| Backbone | Style | Epoch | Lr schd | Mem (GB) | box AP | Config | Download | +| :------: | :---: | :---: | :-----: | :------: | :----: | :--------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50-C5 | caffe | Y | 1x | 8.3 | 37.5 | [config](./yolof_r50-c5_8xb8-1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427.log.json) | + +**Note**: + +1. We find that the performance is unstable and may fluctuate by about 0.3 mAP. mAP 37.4 ~ 37.7 is acceptable in YOLOF_R_50_C5_1x. Such fluctuation can also be found in the [original implementation](https://github.com/chensnathan/YOLOF). +2. In addition to instability issues, sometimes there are large loss fluctuations and NAN, so there may still be problems with this project, which will be improved subsequently. + +## Citation + +```latex +@inproceedings{chen2021you, + title={You Only Look One-level Feature}, + author={Chen, Qiang and Wang, Yingming and Yang, Tong and Zhang, Xiangyu and Cheng, Jian and Sun, Jian}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2021} +} +``` diff --git a/mmdetection/configs/yolof/metafile.yml b/mmdetection/configs/yolof/metafile.yml new file mode 100644 index 0000000..b3b7b7f --- /dev/null +++ b/mmdetection/configs/yolof/metafile.yml @@ -0,0 +1,32 @@ +Collections: + - Name: YOLOF + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Momentum + - Weight Decay + Training Resources: 8x V100 GPUs + Architecture: + - Dilated Encoder + - ResNet + Paper: + URL: https://arxiv.org/abs/2103.09460 + Title: 'You Only Look One-level Feature' + README: configs/yolof/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.12.0/mmdet/models/detectors/yolof.py#L6 + Version: v2.12.0 + +Models: + - Name: yolof_r50_c5_8x8_1x_coco + In Collection: YOLOF + Config: configs/yolof/yolof_r50-c5_8xb8-1x_coco.py + Metadata: + Training Memory (GB): 8.3 + Epochs: 12 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolof/yolof_r50_c5_8x8_1x_coco/yolof_r50_c5_8x8_1x_coco_20210425_024427-8e864411.pth diff --git a/mmdetection/configs/yolof/yolof_r50-c5_8xb8-1x_coco.py b/mmdetection/configs/yolof/yolof_r50-c5_8xb8-1x_coco.py new file mode 100644 index 0000000..5ea228e --- /dev/null +++ b/mmdetection/configs/yolof/yolof_r50-c5_8xb8-1x_coco.py @@ -0,0 +1,116 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + type='YOLOF', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron/resnet50_caffe')), + neck=dict( + 
type='DilatedEncoder', + in_channels=2048, + out_channels=512, + block_mid_channels=128, + num_residual_blocks=4, + block_dilations=[2, 4, 6, 8]), + bbox_head=dict( + type='YOLOFHead', + num_classes=80, + in_channels=512, + reg_decoded_bbox=True, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[1, 2, 4, 8, 16], + strides=[32]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1., 1., 1., 1.], + add_ctr_clamp=True, + ctr_clamp=32), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=1.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='UniformAssigner', pos_ignore_thr=0.15, neg_ignore_thr=0.7), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100)) +# optimizer +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.12, momentum=0.9, weight_decay=0.0001), + paramwise_cfg=dict( + norm_decay_mult=0., custom_keys={'backbone': dict(lr_mult=1. / 3)})) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.00066667, + by_epoch=False, + begin=0, + end=1500), + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='RandomShift', prob=0.5, max_shift_px=32), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=8, num_workers=8, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/yolof/yolof_r50-c5_8xb8-iter-1x_coco.py b/mmdetection/configs/yolof/yolof_r50-c5_8xb8-iter-1x_coco.py new file mode 100644 index 0000000..466a820 --- /dev/null +++ b/mmdetection/configs/yolof/yolof_r50-c5_8xb8-iter-1x_coco.py @@ -0,0 +1,32 @@ +_base_ = './yolof_r50-c5_8xb8-1x_coco.py' + +# We implemented the iter-based config according to the source code. +# COCO dataset has 117266 images after filtering. We use 8 gpu and +# 8 batch size training, so 22500 is equivalent to +# 22500/(117266/(8x8))=12.3 epoch, 15000 is equivalent to 8.2 epoch, +# 20000 is equivalent to 10.9 epoch. Due to lr(0.12) is large, +# the iter-based and epoch-based setting have about 0.2 difference on +# the mAP evaluation value. 
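+#
+# Worked check of the figures above, assuming the 117266 filtered images and
+# the 8 GPU x 8 batch size already stated in this comment:
+#   iterations per epoch = 117266 / (8 x 8) ≈ 1832.3
+#   22500 / 1832.3 ≈ 12.3 epochs; 15000 / 1832.3 ≈ 8.2; 20000 / 1832.3 ≈ 10.9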
+ +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=22500, + val_interval=4500) + +# learning rate policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=22500, + by_epoch=False, + milestones=[15000, 20000], + gamma=0.1) +] +train_dataloader = dict(sampler=dict(type='InfiniteSampler')) +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=2500)) + +log_processor = dict(by_epoch=False) diff --git a/mmdetection/configs/yolox/README.md b/mmdetection/configs/yolox/README.md new file mode 100644 index 0000000..0cde192 --- /dev/null +++ b/mmdetection/configs/yolox/README.md @@ -0,0 +1,39 @@ +# YOLOX + +> [YOLOX: Exceeding YOLO Series in 2021](https://arxiv.org/abs/2107.08430) + + + +## Abstract + +In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3% AP on COCO, surpassing NanoDet by 1.8% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3% AP on COCO, outperforming the current best practice by 3.0% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported. + +
+
    + +## Results and Models + +| Backbone | size | Mem (GB) | box AP | Config | Download | +| :--------: | :--: | :------: | :----: | :--------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOX-tiny | 416 | 3.5 | 32.0 | [config](./yolox_tiny_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234.log.json) | +| YOLOX-s | 640 | 7.6 | 40.5 | [config](./yolox_s_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711.log.json) | +| YOLOX-l | 640 | 19.9 | 49.4 | [config](./yolox_l_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236.log.json) | +| YOLOX-x | 640 | 28.1 | 50.9 | [config](./yolox_x_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254.log.json) | + +**Note**: + +1. The test score threshold is 0.001, and the box AP indicates the best AP. +2. Due to the need for pre-training weights, we cannot reproduce the performance of the `yolox-nano` model. Please refer to https://github.com/Megvii-BaseDetection/YOLOX/issues/674 for more information. +3. We also trained the model by the official release of YOLOX based on [Megvii-BaseDetection/YOLOX#735](https://github.com/Megvii-BaseDetection/YOLOX/issues/735) with commit ID [38c633](https://github.com/Megvii-BaseDetection/YOLOX/tree/38c633bf176462ee42b110c70e4ffe17b5753208). We found that the best AP of `YOLOX-tiny`, `YOLOX-s`, `YOLOX-l`, and `YOLOX-x` is 31.8, 40.3, 49.2, and 50.9, respectively. The performance is consistent with that of our re-implementation (see Table above) but still has a gap (0.3~0.8 AP) in comparison with the reported performance in their [README](https://github.com/Megvii-BaseDetection/YOLOX/blob/38c633bf176462ee42b110c70e4ffe17b5753208/README.md#benchmark). 
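+
+Regarding note 1: the `yolox_s_8xb8-300e_coco.py` config added later in this patch keeps `score_thr=0.01` inside `model.test_cfg` to align with the validation phase of the source code, so evaluating at the reported 0.001 threshold presumably means overriding that value at test time. A minimal, hypothetical override sketch (the filename is illustrative; only the 0.01/0.001 thresholds come from this patch):
+
+```python
+# hypothetical: yolox_s_8xb8-300e_coco_test-thr.py
+_base_ = './yolox_s_8xb8-300e_coco.py'
+
+# MMEngine configs merge nested dicts, so only score_thr changes here;
+# the NMS settings of the base test_cfg are inherited unchanged.
+model = dict(test_cfg=dict(score_thr=0.001))
+```
+
+This merge-only-what-changes pattern is the same one the other YOLOX variants in this patch (`yolox_l`, `yolox_x`, `yolox_tiny`) use to override the s-model settings.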
+ +## Citation + +```latex +@article{yolox2021, + title={{YOLOX}: Exceeding YOLO Series in 2021}, + author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2107.08430}, + year={2021} +} +``` diff --git a/mmdetection/configs/yolox/metafile.yml b/mmdetection/configs/yolox/metafile.yml new file mode 100644 index 0000000..2f64450 --- /dev/null +++ b/mmdetection/configs/yolox/metafile.yml @@ -0,0 +1,70 @@ +Collections: + - Name: YOLOX + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - Cosine Annealing Lr Updater + Training Resources: 8x TITANXp GPUs + Architecture: + - CSPDarkNet + - PAFPN + Paper: + URL: https://arxiv.org/abs/2107.08430 + Title: 'YOLOX: Exceeding YOLO Series in 2021' + README: configs/yolox/README.md + Code: + URL: https://github.com/open-mmlab/mmdetection/blob/v2.15.1/mmdet/models/detectors/yolox.py#L6 + Version: v2.15.1 + + +Models: + - Name: yolox_s_8x8_300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_s_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 7.6 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.5 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth + - Name: yolox_l_8x8_300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_l_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 19.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.4 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth + - Name: yolox_x_8x8_300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_x_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 28.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.9 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth + - Name: yolox_tiny_8x8_300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_tiny_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 3.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 32.0 + Weights: https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_tiny_8x8_300e_coco/yolox_tiny_8x8_300e_coco_20211124_171234-b4047906.pth diff --git a/mmdetection/configs/yolox/yolox_l_8xb8-300e_coco.py b/mmdetection/configs/yolox/yolox_l_8xb8-300e_coco.py new file mode 100644 index 0000000..2a4b287 --- /dev/null +++ b/mmdetection/configs/yolox/yolox_l_8xb8-300e_coco.py @@ -0,0 +1,8 @@ +_base_ = './yolox_s_8xb8-300e_coco.py' + +# model settings +model = dict( + backbone=dict(deepen_factor=1.0, widen_factor=1.0), + neck=dict( + in_channels=[256, 512, 1024], out_channels=256, num_csp_blocks=3), + bbox_head=dict(in_channels=256, feat_channels=256)) diff --git a/mmdetection/configs/yolox/yolox_m_8xb8-300e_coco.py b/mmdetection/configs/yolox/yolox_m_8xb8-300e_coco.py new file mode 100644 index 0000000..d82f9e9 --- /dev/null +++ b/mmdetection/configs/yolox/yolox_m_8xb8-300e_coco.py @@ -0,0 +1,8 @@ +_base_ = './yolox_s_8xb8-300e_coco.py' + +# model settings +model = dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + neck=dict(in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + bbox_head=dict(in_channels=192, feat_channels=192), +) diff --git 
a/mmdetection/configs/yolox/yolox_nano_8xb8-300e_coco.py b/mmdetection/configs/yolox/yolox_nano_8xb8-300e_coco.py new file mode 100644 index 0000000..3f7a1c5 --- /dev/null +++ b/mmdetection/configs/yolox/yolox_nano_8xb8-300e_coco.py @@ -0,0 +1,11 @@ +_base_ = './yolox_tiny_8xb8-300e_coco.py' + +# model settings +model = dict( + backbone=dict(deepen_factor=0.33, widen_factor=0.25, use_depthwise=True), + neck=dict( + in_channels=[64, 128, 256], + out_channels=64, + num_csp_blocks=1, + use_depthwise=True), + bbox_head=dict(in_channels=64, feat_channels=64, use_depthwise=True)) diff --git a/mmdetection/configs/yolox/yolox_s_8xb8-300e_coco.py b/mmdetection/configs/yolox/yolox_s_8xb8-300e_coco.py new file mode 100644 index 0000000..3e324eb --- /dev/null +++ b/mmdetection/configs/yolox/yolox_s_8xb8-300e_coco.py @@ -0,0 +1,250 @@ +_base_ = [ + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py', + './yolox_tta.py' +] + +img_scale = (640, 640) # width, height + +# model settings +model = dict( + type='YOLOX', + data_preprocessor=dict( + type='DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=10) + ]), + backbone=dict( + type='CSPDarknet', + deepen_factor=0.33, + widen_factor=0.5, + out_indices=(2, 3, 4), + use_depthwise=False, + spp_kernal_sizes=(5, 9, 13), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + ), + neck=dict( + type='YOLOXPAFPN', + in_channels=[128, 256, 512], + out_channels=128, + num_csp_blocks=1, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')), + bbox_head=dict( + type='YOLOXHead', + num_classes=80, + in_channels=128, + feat_channels=128, + stacked_convs=2, + strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)), + train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), + # In order to align the source code, the threshold of the val phase is + # 0.01, and the threshold of the test phase is 0.001. 
+ test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65))) + +# dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + # According to the official implementation, multi-scale + # training is not considered here but in the + # 'mmdet/models/detectors/yolox.py'. + # Resize and Pad are for the last 15 epochs when Mosaic, + # RandomAffine, and MixUp are closed by YOLOXModeSwitchHook. + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + # If the image is three-channel, the pad value needs + # to be set separately for each channel. + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] + +train_dataset = dict( + # use MultiImageMixDataset wrapper to support mosaic and mixup + type='MultiImageMixDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=[ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True) + ], + filter_cfg=dict(filter_empty_gt=False, min_size=32), + backend_args=backend_args), + pipeline=train_pipeline) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset) +val_dataloader = dict( + batch_size=8, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +# training settings +max_epochs = 300 +num_last_epochs = 15 +interval = 10 + +train_cfg = dict(max_epochs=max_epochs, val_interval=interval) + +# optimizer +# default 8 gpu +base_lr = 0.01 +optim_wrapper = dict( + 
type='OptimWrapper', + optimizer=dict( + type='SGD', lr=base_lr, momentum=0.9, weight_decay=5e-4, + nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.)) + +# learning rate +param_scheduler = [ + dict( + # use quadratic formula to warm up 5 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + # use cosine lr from 5 to 285 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last 15 epochs + type='ConstantLR', + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ) +] + +default_hooks = dict( + checkpoint=dict( + interval=interval, + max_keep_ckpts=3 # only keep latest 3 checkpoints + )) + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + priority=48), + dict(type='SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + priority=49) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/configs/yolox/yolox_tiny_8xb8-300e_coco.py b/mmdetection/configs/yolox/yolox_tiny_8xb8-300e_coco.py new file mode 100644 index 0000000..86f7e9a --- /dev/null +++ b/mmdetection/configs/yolox/yolox_tiny_8xb8-300e_coco.py @@ -0,0 +1,54 @@ +_base_ = './yolox_s_8xb8-300e_coco.py' + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='BatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=10) + ]), + backbone=dict(deepen_factor=0.33, widen_factor=0.375), + neck=dict(in_channels=[96, 192, 384], out_channels=96), + bbox_head=dict(in_channels=96, feat_channels=96)) + +img_scale = (640, 640) # width, height + +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.5, 1.5), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + # Resize and Pad are for the last 15 epochs when Mosaic and + # RandomAffine are closed by YOLOXModeSwitchHook. 
+ dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(416, 416), keep_ratio=True), + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/configs/yolox/yolox_tta.py b/mmdetection/configs/yolox/yolox_tta.py new file mode 100644 index 0000000..e65244b --- /dev/null +++ b/mmdetection/configs/yolox/yolox_tta.py @@ -0,0 +1,36 @@ +tta_model = dict( + type='DetTTAModel', + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=100)) + +img_scales = [(640, 640), (320, 320), (960, 960)] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale=s, keep_ratio=True) + for s in img_scales + ], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) + ], + [ + dict( + type='Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + ], + [dict(type='LoadAnnotations', with_bbox=True)], + [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/mmdetection/configs/yolox/yolox_x_8xb8-300e_coco.py b/mmdetection/configs/yolox/yolox_x_8xb8-300e_coco.py new file mode 100644 index 0000000..34828e0 --- /dev/null +++ b/mmdetection/configs/yolox/yolox_x_8xb8-300e_coco.py @@ -0,0 +1,8 @@ +_base_ = './yolox_s_8xb8-300e_coco.py' + +# model settings +model = dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + neck=dict( + in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4), + bbox_head=dict(in_channels=320, feat_channels=320)) diff --git a/mmdetection/dataset-index.yml b/mmdetection/dataset-index.yml new file mode 100644 index 0000000..116412e --- /dev/null +++ b/mmdetection/dataset-index.yml @@ -0,0 +1,18 @@ +openxlab: true +voc2007: + dataset: OpenDataLab/PASCAL_VOC2007 + download_root: data + data_root: data + script: tools/dataset_converters/scripts/preprocess_voc2007.sh + +voc2012: + dataset: OpenDataLab/PASCAL_VOC2012 + download_root: data + data_root: data + script: tools/dataset_converters/scripts/preprocess_voc2012.sh + +coco2017: + dataset: OpenDataLab/COCO_2017 + download_root: data + data_root: data/coco + script: tools/dataset_converters/scripts/preprocess_coco2017.sh diff --git a/mmdetection/demo/create_result_gif.py b/mmdetection/demo/create_result_gif.py new file mode 100644 index 0000000..8e56a33 --- /dev/null +++ b/mmdetection/demo/create_result_gif.py @@ -0,0 +1,165 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import os +import os.path as osp + +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +import mmcv +import numpy as np +from mmengine.utils import scandir + +try: + import imageio +except ImportError: + imageio = None + + +# TODO verify after refactoring analyze_results.py +def parse_args(): + parser = argparse.ArgumentParser(description='Create GIF for demo') + parser.add_argument( + 'image_dir', + help='directory where result ' + 'images save path generated by ‘analyze_results.py’') + parser.add_argument( + '--out', + type=str, + default='result.gif', + help='gif path where will be saved') + args = parser.parse_args() + return args + + +def _generate_batch_data(sampler, batch_size): + batch = [] + for idx in sampler: + batch.append(idx) + if len(batch) == batch_size: + yield batch + batch = [] + if len(batch) > 0: + yield batch + + +def create_gif(frames, gif_name, duration=2): + """Create gif through imageio. + + Args: + frames (list[ndarray]): Image frames + gif_name (str): Saved gif name + duration (int): Display interval (s), + Default: 2 + """ + if imageio is None: + raise RuntimeError('imageio is not installed,' + 'Please use “pip install imageio” to install') + imageio.mimsave(gif_name, frames, 'GIF', duration=duration) + + +def create_frame_by_matplotlib(image_dir, + nrows=1, + fig_size=(300, 300), + font_size=15): + """Create gif frame image through matplotlib. + + Args: + image_dir (str): Root directory of result images + nrows (int): Number of rows displayed, Default: 1 + fig_size (tuple): Figure size of the pyplot figure. + Default: (300, 300) + font_size (int): Font size of texts. Default: 15 + + Returns: + list[ndarray]: image frames + """ + + result_dir_names = os.listdir(image_dir) + assert len(result_dir_names) == 2 + # Longer length has higher priority + result_dir_names.reverse() + + images_list = [] + for dir_names in result_dir_names: + images_list.append(scandir(osp.join(image_dir, dir_names))) + + frames = [] + for paths in _generate_batch_data(zip(*images_list), nrows): + + fig, axes = plt.subplots(nrows=nrows, ncols=2) + fig.suptitle('Good/bad case selected according ' + 'to the COCO mAP of the single image') + + det_patch = mpatches.Patch(color='salmon', label='prediction') + gt_patch = mpatches.Patch(color='royalblue', label='ground truth') + # bbox_to_anchor may need to be finetuned + plt.legend( + handles=[det_patch, gt_patch], + bbox_to_anchor=(1, -0.18), + loc='lower right', + borderaxespad=0.) 
+ + if nrows == 1: + axes = [axes] + + dpi = fig.get_dpi() + # set fig size and margin + fig.set_size_inches( + (fig_size[0] * 2 + fig_size[0] // 20) / dpi, + (fig_size[1] * nrows + fig_size[1] // 3) / dpi, + ) + + fig.tight_layout() + # set subplot margin + plt.subplots_adjust( + hspace=.05, + wspace=0.05, + left=0.02, + right=0.98, + bottom=0.02, + top=0.98) + + for i, (path_tuple, ax_tuple) in enumerate(zip(paths, axes)): + image_path_left = osp.join( + osp.join(image_dir, result_dir_names[0], path_tuple[0])) + image_path_right = osp.join( + osp.join(image_dir, result_dir_names[1], path_tuple[1])) + image_left = mmcv.imread(image_path_left) + image_left = mmcv.rgb2bgr(image_left) + image_right = mmcv.imread(image_path_right) + image_right = mmcv.rgb2bgr(image_right) + + if i == 0: + ax_tuple[0].set_title( + result_dir_names[0], fontdict={'size': font_size}) + ax_tuple[1].set_title( + result_dir_names[1], fontdict={'size': font_size}) + ax_tuple[0].imshow( + image_left, extent=(0, *fig_size, 0), interpolation='bilinear') + ax_tuple[0].axis('off') + ax_tuple[1].imshow( + image_right, + extent=(0, *fig_size, 0), + interpolation='bilinear') + ax_tuple[1].axis('off') + + canvas = fig.canvas + s, (width, height) = canvas.print_to_buffer() + buffer = np.frombuffer(s, dtype='uint8') + img_rgba = buffer.reshape(height, width, 4) + rgb, alpha = np.split(img_rgba, [3], axis=2) + img = rgb.astype('uint8') + + frames.append(img) + + return frames + + +def main(): + args = parse_args() + frames = create_frame_by_matplotlib(args.image_dir) + create_gif(frames, args.out) + + +if __name__ == '__main__': + main() diff --git a/mmdetection/demo/demo_multi_model.py b/mmdetection/demo/demo_multi_model.py new file mode 100644 index 0000000..f7935de --- /dev/null +++ b/mmdetection/demo/demo_multi_model.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Support for multi-model fusion, and currently only the Weighted Box Fusion +(WBF) fusion method is supported. 
+ +References: https://github.com/ZFTurbo/Weighted-Boxes-Fusion + +Example: + + python demo/demo_multi_model.py demo/demo.jpg \ + ./configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_1x_coco.py \ + ./configs/retinanet/retinanet_r50-caffe_fpn_1x_coco.py \ + --checkpoints \ + https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_caffe_fpn_1x_coco/faster_rcnn_r50_caffe_fpn_1x_coco_bbox_mAP-0.378_20200504_180032-c5925ee5.pth \ # noqa + https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_caffe_fpn_1x_coco/retinanet_r50_caffe_fpn_1x_coco_20200531-f11027c5.pth \ + --weights 1 2 +""" + +import argparse +import os.path as osp + +import mmcv +import mmengine +from mmengine.fileio import isdir, join_path, list_dir_or_file +from mmengine.logging import print_log +from mmengine.structures import InstanceData + +from mmdet.apis import DetInferencer +from mmdet.models.utils import weighted_boxes_fusion +from mmdet.registry import VISUALIZERS +from mmdet.structures import DetDataSample + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDetection multi-model inference demo') + parser.add_argument( + 'inputs', type=str, help='Input image file or folder path.') + parser.add_argument( + 'config', + type=str, + nargs='*', + help='Config file(s), support receive multiple files') + parser.add_argument( + '--checkpoints', + type=str, + nargs='*', + help='Checkpoint file(s), support receive multiple files, ' + 'remember to correspond to the above config', + ) + parser.add_argument( + '--weights', + type=float, + nargs='*', + default=None, + help='weights for each model, remember to ' + 'correspond to the above config') + parser.add_argument( + '--fusion-iou-thr', + type=float, + default=0.55, + help='IoU value for boxes to be a match in wbf') + parser.add_argument( + '--skip-box-thr', + type=float, + default=0.0, + help='exclude boxes with score lower than this variable in wbf') + parser.add_argument( + '--conf-type', + type=str, + default='avg', # avg, max, box_and_model_avg, absent_model_aware_avg + help='how to calculate confidence in weighted boxes in wbf') + parser.add_argument( + '--out-dir', + type=str, + default='outputs', + help='Output directory of images or prediction results.') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--pred-score-thr', + type=float, + default=0.3, + help='bbox score threshold') + parser.add_argument( + '--batch-size', type=int, default=1, help='Inference batch size.') + parser.add_argument( + '--show', + action='store_true', + help='Display the image in a popup window.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') + parser.add_argument( + '--no-save-pred', + action='store_true', + help='Do not save detection json results') + parser.add_argument( + '--palette', + default='none', + choices=['coco', 'voc', 'citys', 'random', 'none'], + help='Color palette used for visualization') + + args = parser.parse_args() + + if args.no_save_vis and args.no_save_pred: + args.out_dir = '' + + return args + + +def main(): + args = parse_args() + + results = [] + cfg_visualizer = None + dataset_meta = None + + inputs = [] + filename_list = [] + if isdir(args.inputs): + dir = list_dir_or_file( + args.inputs, list_dir=False, suffix=IMG_EXTENSIONS) + for filename in dir: + img = 
mmcv.imread(join_path(args.inputs, filename)) + inputs.append(img) + filename_list.append(filename) + else: + img = mmcv.imread(args.inputs) + inputs.append(img) + img_name = osp.basename(args.inputs) + filename_list.append(img_name) + + for i, (config, + checkpoint) in enumerate(zip(args.config, args.checkpoints)): + inferencer = DetInferencer( + config, checkpoint, device=args.device, palette=args.palette) + + result_raw = inferencer( + inputs=inputs, + batch_size=args.batch_size, + no_save_vis=True, + pred_score_thr=args.pred_score_thr) + + if i == 0: + cfg_visualizer = inferencer.cfg.visualizer + dataset_meta = inferencer.model.dataset_meta + results = [{ + 'bboxes_list': [], + 'scores_list': [], + 'labels_list': [] + } for _ in range(len(result_raw['predictions']))] + + for res, raw in zip(results, result_raw['predictions']): + res['bboxes_list'].append(raw['bboxes']) + res['scores_list'].append(raw['scores']) + res['labels_list'].append(raw['labels']) + + visualizer = VISUALIZERS.build(cfg_visualizer) + visualizer.dataset_meta = dataset_meta + + for i in range(len(results)): + bboxes, scores, labels = weighted_boxes_fusion( + results[i]['bboxes_list'], + results[i]['scores_list'], + results[i]['labels_list'], + weights=args.weights, + iou_thr=args.fusion_iou_thr, + skip_box_thr=args.skip_box_thr, + conf_type=args.conf_type) + + pred_instances = InstanceData() + pred_instances.bboxes = bboxes + pred_instances.scores = scores + pred_instances.labels = labels + + fusion_result = DetDataSample(pred_instances=pred_instances) + + img_name = filename_list[i] + + if not args.no_save_pred: + out_json_path = ( + args.out_dir + '/preds/' + img_name.split('.')[0] + '.json') + mmengine.dump( + { + 'labels': labels.tolist(), + 'scores': scores.tolist(), + 'bboxes': bboxes.tolist() + }, out_json_path) + + out_file = osp.join(args.out_dir, 'vis', + img_name) if not args.no_save_vis else None + + visualizer.add_datasample( + img_name, + inputs[i][..., ::-1], + data_sample=fusion_result, + show=args.show, + draw_gt=False, + wait_time=0, + pred_score_thr=args.pred_score_thr, + out_file=out_file) + + if not args.no_save_vis: + print_log(f'results have been saved at {args.out_dir}') + + +if __name__ == '__main__': + main() diff --git a/mmdetection/demo/image_demo.py b/mmdetection/demo/image_demo.py new file mode 100644 index 0000000..2e2c27a --- /dev/null +++ b/mmdetection/demo/image_demo.py @@ -0,0 +1,136 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Image Demo. + +This script adopts a new infenence class, currently supports image path, +np.array and folder input formats, and will support video and webcam +in the future. + +Example: + Save visualizations and predictions results:: + + python demo/image_demo.py demo/demo.jpg rtmdet-s + + python demo/image_demo.py demo/demo.jpg \ + configs/rtmdet/rtmdet_s_8xb32-300e_coco.py \ + --weights rtmdet_s_8xb32-300e_coco_20220905_161602-387a891e.pth + + python demo/image_demo.py demo/demo.jpg \ + glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 --texts bench + + python demo/image_demo.py demo/demo.jpg \ + glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 --texts 'bench . car .' + + python demo/image_demo.py demo/demo.jpg \ + glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 + --texts 'bench . car .' -c + + python demo/image_demo.py demo/demo.jpg \ + glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365 \ + --texts 'There are a lot of cars here.' 
+ + Visualize prediction results:: + + python demo/image_demo.py demo/demo.jpg rtmdet-ins-s --show + + python demo/image_demo.py demo/demo.jpg rtmdet-ins_s_8xb32-300e_coco \ + --show +""" + +from argparse import ArgumentParser + +from mmengine.logging import print_log + +from mmdet.apis import DetInferencer + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'inputs', type=str, help='Input image file or folder path.') + parser.add_argument( + 'model', + type=str, + help='Config or checkpoint .pth file or the model name ' + 'and alias defined in metafile. The model configuration ' + 'file will try to read from .pth if the parameter is ' + 'a .pth weights file.') + parser.add_argument('--weights', default=None, help='Checkpoint file') + parser.add_argument( + '--out-dir', + type=str, + default='outputs', + help='Output directory of images or prediction results.') + parser.add_argument('--texts', help='text prompt') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--pred-score-thr', + type=float, + default=0.3, + help='bbox score threshold') + parser.add_argument( + '--batch-size', type=int, default=1, help='Inference batch size.') + parser.add_argument( + '--show', + action='store_true', + help='Display the image in a popup window.') + parser.add_argument( + '--no-save-vis', + action='store_true', + help='Do not save detection vis results') + parser.add_argument( + '--no-save-pred', + action='store_true', + help='Do not save detection json results') + parser.add_argument( + '--print-result', + action='store_true', + help='Whether to print the results.') + parser.add_argument( + '--palette', + default='none', + choices=['coco', 'voc', 'citys', 'random', 'none'], + help='Color palette used for visualization') + # only for GLIP + parser.add_argument( + '--custom-entities', + '-c', + action='store_true', + help='Whether to customize entity names? ' + 'If so, the input text should be ' + '"cls_name1 . cls_name2 . cls_name3 ." format') + + call_args = vars(parser.parse_args()) + + if call_args['no_save_vis'] and call_args['no_save_pred']: + call_args['out_dir'] = '' + + if call_args['model'].endswith('.pth'): + print_log('The model is a weight file, automatically ' + 'assign the model to --weights') + call_args['weights'] = call_args['model'] + call_args['model'] = None + + init_kws = ['model', 'weights', 'device', 'palette'] + init_args = {} + for init_kw in init_kws: + init_args[init_kw] = call_args.pop(init_kw) + + return init_args, call_args + + +def main(): + init_args, call_args = parse_args() + # TODO: Video and Webcam are currently not supported and + # may consume too much memory if your input folder has a lot of images. + # We will be optimized later. + inferencer = DetInferencer(**init_args) + inferencer(**call_args) + + if call_args['out_dir'] != '' and not (call_args['no_save_vis'] + and call_args['no_save_pred']): + print_log(f'results have been saved at {call_args["out_dir"]}') + + +if __name__ == '__main__': + main() diff --git a/mmdetection/demo/large_image_demo.py b/mmdetection/demo/large_image_demo.py new file mode 100644 index 0000000..f3d8d22 --- /dev/null +++ b/mmdetection/demo/large_image_demo.py @@ -0,0 +1,282 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+"""Perform MMDET inference on large images (as satellite imagery) as: + +```shell +wget -P checkpoint https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth # noqa: E501, E261. + +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py \ + checkpoint/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth +``` +""" + +import os +import random +from argparse import ArgumentParser +from pathlib import Path + +import mmcv +import numpy as np +from mmengine.config import Config, ConfigDict +from mmengine.logging import print_log +from mmengine.utils import ProgressBar + +from mmdet.apis import inference_detector, init_detector + +try: + from sahi.slicing import slice_image +except ImportError: + raise ImportError('Please run "pip install -U sahi" ' + 'to install sahi first for large image inference.') + +from mmdet.registry import VISUALIZERS +from mmdet.utils.large_image import merge_results_by_nms, shift_predictions +from mmdet.utils.misc import get_file_list + + +def parse_args(): + parser = ArgumentParser( + description='Perform MMDET inference on large images.') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--patch-size', type=int, default=640, help='The size of patches') + parser.add_argument( + '--patch-overlap-ratio', + type=float, + default=0.25, + help='Ratio of overlap between two patches') + parser.add_argument( + '--merge-iou-thr', + type=float, + default=0.25, + help='IoU threshould for merging results') + parser.add_argument( + '--merge-nms-type', + type=str, + default='nms', + help='NMS type for merging results') + parser.add_argument( + '--batch-size', + type=int, + default=1, + help='Batch size, must greater than or equal to 1') + parser.add_argument( + '--debug', + action='store_true', + help='Export debug results before merging') + parser.add_argument( + '--save-patch', + action='store_true', + help='Save the results of each patch. ' + 'The `--debug` must be enabled.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + config = args.config + + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + if args.tta: + assert 'tta_model' in config, 'Cannot find ``tta_model`` in config.' \ + " Can't use tta !" + assert 'tta_pipeline' in config, 'Cannot find ``tta_pipeline`` ' \ + "in config. Can't use tta !" 
+ config.model = ConfigDict(**config.tta_model, module=config.model) + test_data_cfg = config.test_dataloader.dataset + while 'dataset' in test_data_cfg: + test_data_cfg = test_data_cfg['dataset'] + + test_data_cfg.pipeline = config.tta_pipeline + + # TODO: TTA mode will error if cfg_options is not set. + # This is an mmdet issue and needs to be fixed later. + # build the model from a config file and a checkpoint file + model = init_detector( + config, args.checkpoint, device=args.device, cfg_options={}) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + print(f'Performing inference on {len(files)} images.... ' + 'This may take a while.') + progress_bar = ProgressBar(len(files)) + for file in files: + # read image + img = mmcv.imread(file) + + # arrange slices + height, width = img.shape[:2] + sliced_image_object = slice_image( + img, + slice_height=args.patch_size, + slice_width=args.patch_size, + auto_slice_resolution=False, + overlap_height_ratio=args.patch_overlap_ratio, + overlap_width_ratio=args.patch_overlap_ratio, + ) + # perform sliced inference + slice_results = [] + start = 0 + while True: + # prepare batch slices + end = min(start + args.batch_size, len(sliced_image_object)) + images = [] + for sliced_image in sliced_image_object.images[start:end]: + images.append(sliced_image) + + # forward the model + slice_results.extend(inference_detector(model, images)) + + if end >= len(sliced_image_object): + break + start += args.batch_size + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + + img = mmcv.imconvert(img, 'bgr', 'rgb') + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # export debug images + if args.debug: + # export sliced image results + name, suffix = os.path.splitext(filename) + + shifted_instances = shift_predictions( + slice_results, + sliced_image_object.starting_pixels, + src_image_shape=(height, width)) + merged_result = slice_results[0].clone() + merged_result.pred_instances = shifted_instances + + debug_file_name = name + '_debug' + suffix + debug_out_file = None if args.show else os.path.join( + args.out_dir, debug_file_name) + visualizer.set_image(img.copy()) + + debug_grids = [] + for starting_point in sliced_image_object.starting_pixels: + start_point_x = starting_point[0] + start_point_y = starting_point[1] + end_point_x = start_point_x + args.patch_size + end_point_y = start_point_y + args.patch_size + debug_grids.append( + [start_point_x, start_point_y, end_point_x, end_point_y]) + debug_grids = np.array(debug_grids) + debug_grids[:, 0::2] = np.clip(debug_grids[:, 0::2], 1, + img.shape[1] - 1) + debug_grids[:, 1::2] = np.clip(debug_grids[:, 1::2], 1, + img.shape[0] - 1) + + palette = np.random.randint(0, 256, size=(len(debug_grids), 3)) + palette = [tuple(c) for c in palette] + line_styles = random.choices(['-', '-.', ':'], k=len(debug_grids)) + visualizer.draw_bboxes( + debug_grids, + edge_colors=palette, + alpha=1, + line_styles=line_styles) + visualizer.draw_bboxes( + debug_grids, face_colors=palette, alpha=0.15) + + visualizer.draw_texts( + list(range(len(debug_grids))), + debug_grids[:, :2] + 5, + colors='w') + + visualizer.add_datasample( + debug_file_name, + visualizer.get_image(), + 
data_sample=merged_result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=debug_out_file, + pred_score_thr=args.score_thr, + ) + + if args.save_patch: + debug_patch_out_dir = os.path.join(args.out_dir, + f'{name}_patch') + for i, slice_result in enumerate(slice_results): + patch_out_file = os.path.join( + debug_patch_out_dir, + f'{filename}_slice_{i}_result.jpg') + image = mmcv.imconvert(sliced_image_object.images[i], + 'bgr', 'rgb') + + visualizer.add_datasample( + 'patch_result', + image, + data_sample=slice_result, + draw_gt=False, + show=False, + wait_time=0, + out_file=patch_out_file, + pred_score_thr=args.score_thr, + ) + + image_result = merge_results_by_nms( + slice_results, + sliced_image_object.starting_pixels, + src_image_shape=(height, width), + nms_cfg={ + 'type': args.merge_nms_type, + 'iou_threshold': args.merge_iou_thr + }) + + visualizer.add_datasample( + filename, + img, + data_sample=image_result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=out_file, + pred_score_thr=args.score_thr, + ) + progress_bar.update() + + if not args.show or (args.debug and args.save_patch): + print_log( + f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +if __name__ == '__main__': + main() diff --git a/mmdetection/demo/mot_demo.py b/mmdetection/demo/mot_demo.py new file mode 100644 index 0000000..4595cdc --- /dev/null +++ b/mmdetection/demo/mot_demo.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import tempfile +from argparse import ArgumentParser + +import mmcv +import mmengine +from mmengine.registry import init_default_scope + +from mmdet.apis import inference_mot, init_track_model +from mmdet.registry import VISUALIZERS + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png') + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'inputs', type=str, help='Input image file or folder path.') + parser.add_argument('config', help='config file') + parser.add_argument('--checkpoint', help='checkpoint file') + parser.add_argument('--detector', help='det checkpoint file') + parser.add_argument('--reid', help='reid checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='device used for inference') + parser.add_argument( + '--score-thr', + type=float, + default=0.0, + help='The threshold of score to filter bboxes.') + parser.add_argument( + '--out', help='output video file (mp4 format) or folder') + parser.add_argument( + '--show', + action='store_true', + help='whether show the results on the fly') + parser.add_argument('--fps', help='FPS of the output video') + args = parser.parse_args() + return args + + +def main(args): + assert args.out or args.show + # load images + if osp.isdir(args.inputs): + imgs = sorted( + filter(lambda x: x.endswith(IMG_EXTENSIONS), + os.listdir(args.inputs)), + key=lambda x: int(x.split('.')[0])) + in_video = False + else: + imgs = mmcv.VideoReader(args.inputs) + in_video = True + + # define output + out_video = False + if args.out is not None: + if args.out.endswith('.mp4'): + out_video = True + out_dir = tempfile.TemporaryDirectory() + out_path = out_dir.name + _out = args.out.rsplit(os.sep, 1) + if len(_out) > 1: + os.makedirs(_out[0], exist_ok=True) + else: + out_path = args.out + os.makedirs(out_path, exist_ok=True) + + fps = args.fps + if args.show or out_video: + if fps is None and in_video: + fps = imgs.fps + if not fps: + raise ValueError('Please set the FPS for the output video.') + fps = int(fps) + + 
init_default_scope('mmdet') + + # build the model from a config file and a checkpoint file + model = init_track_model( + args.config, + args.checkpoint, + args.detector, + args.reid, + device=args.device) + + # build the visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + prog_bar = mmengine.ProgressBar(len(imgs)) + # test and show/save the images + for i, img in enumerate(imgs): + if isinstance(img, str): + img_path = osp.join(args.inputs, img) + img = mmcv.imread(img_path) + # result [TrackDataSample] + result = inference_mot(model, img, frame_id=i, video_len=len(imgs)) + if args.out is not None: + if in_video or out_video: + out_file = osp.join(out_path, f'{i:06d}.jpg') + else: + out_file = osp.join(out_path, img.rsplit(os.sep, 1)[-1]) + else: + out_file = None + + # show the results + visualizer.add_datasample( + 'mot', + img[..., ::-1], + data_sample=result[0], + show=args.show, + draw_gt=False, + out_file=out_file, + wait_time=float(1 / int(fps)) if fps else 0, + pred_score_thr=args.score_thr, + step=i) + + prog_bar.update() + + if args.out and out_video: + print(f'making the output video at {args.out} with a FPS of {fps}') + mmcv.frames2video(out_path, args.out, fps=fps, fourcc='mp4v') + out_dir.cleanup() + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/mmdetection/demo/video_demo.py b/mmdetection/demo/video_demo.py new file mode 100644 index 0000000..6fc3631 --- /dev/null +++ b/mmdetection/demo/video_demo.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +import cv2 +import mmcv +from mmcv.transforms import Compose +from mmengine.utils import track_iter_progress + +from mmdet.apis import inference_detector, init_detector +from mmdet.registry import VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMDetection video demo') + parser.add_argument('video', help='Video file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument('--out', type=str, help='Output video file') + parser.add_argument('--show', action='store_true', help='Show video') + parser.add_argument( + '--wait-time', + type=float, + default=1, + help='The interval of show (s), 0 is block') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + assert args.out or args.show, \ + ('Please specify at least one operation (save/show the ' + 'video) with the argument "--out" or "--show"') + + # build the model from a config file and a checkpoint file + model = init_detector(args.config, args.checkpoint, device=args.device) + + # build test pipeline + model.cfg.test_dataloader.dataset.pipeline[ + 0].type = 'mmdet.LoadImageFromNDArray' + test_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + # the dataset_meta is loaded from the checkpoint and + # then pass to the model in init_detector + visualizer.dataset_meta = model.dataset_meta + + video_reader = mmcv.VideoReader(args.video) + video_writer = None + if args.out: + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter( + args.out, fourcc, video_reader.fps, + (video_reader.width, video_reader.height)) + + for frame in 
track_iter_progress(video_reader): + result = inference_detector(model, frame, test_pipeline=test_pipeline) + visualizer.add_datasample( + name='video', + image=frame, + data_sample=result, + draw_gt=False, + show=False, + pred_score_thr=args.score_thr) + frame = visualizer.get_image() + + if args.show: + cv2.namedWindow('video', 0) + mmcv.imshow(frame, 'video', args.wait_time) + if args.out: + video_writer.write(frame) + + if video_writer: + video_writer.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/mmdetection/demo/video_gpuaccel_demo.py b/mmdetection/demo/video_gpuaccel_demo.py new file mode 100644 index 0000000..3b09164 --- /dev/null +++ b/mmdetection/demo/video_gpuaccel_demo.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from typing import Tuple + +import cv2 +import mmcv +import numpy as np +import torch +import torch.nn as nn +from mmcv.transforms import Compose +from mmengine.utils import track_iter_progress + +from mmdet.apis import init_detector +from mmdet.registry import VISUALIZERS +from mmdet.structures import DetDataSample + +try: + import ffmpegcv +except ImportError: + raise ImportError( + 'Please install ffmpegcv with:\n\n pip install ffmpegcv') + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDetection video demo with GPU acceleration') + parser.add_argument('video', help='Video file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument('--out', type=str, help='Output video file') + parser.add_argument('--show', action='store_true', help='Show video') + parser.add_argument( + '--nvdecode', action='store_true', help='Use NVIDIA decoder') + parser.add_argument( + '--wait-time', + type=float, + default=1, + help='The interval of show (s), 0 is block') + args = parser.parse_args() + return args + + +def prefetch_batch_input_shape(model: nn.Module, ori_wh: Tuple[int, + int]) -> dict: + cfg = model.cfg + w, h = ori_wh + cfg.test_dataloader.dataset.pipeline[0].type = 'LoadImageFromNDArray' + test_pipeline = Compose(cfg.test_dataloader.dataset.pipeline) + data = {'img': np.zeros((h, w, 3), dtype=np.uint8), 'img_id': 0} + data = test_pipeline(data) + data['inputs'] = [data['inputs']] + data['data_samples'] = [data['data_samples']] + data_sample = model.data_preprocessor(data, False)['data_samples'] + batch_input_shape = data_sample[0].batch_input_shape + return batch_input_shape + + +def pack_data(frame_resize: np.ndarray, batch_input_shape: Tuple[int, int], + ori_shape: Tuple[int, int]) -> dict: + assert frame_resize.shape[:2] == batch_input_shape + data_sample = DetDataSample() + data_sample.set_metainfo({ + 'img_shape': + batch_input_shape, + 'ori_shape': + ori_shape, + 'scale_factor': (batch_input_shape[0] / ori_shape[0], + batch_input_shape[1] / ori_shape[1]) + }) + frame_resize = torch.from_numpy(frame_resize).permute((2, 0, 1)).cuda() + data = {'inputs': [frame_resize], 'data_samples': [data_sample]} + return data + + +def main(): + args = parse_args() + assert args.out or args.show, \ + ('Please specify at least one operation (save/show the ' + 'video) with the argument "--out" or "--show"') + + model = init_detector(args.config, args.checkpoint, device=args.device) + + # init visualizer + visualizer = 
VISUALIZERS.build(model.cfg.visualizer) + # the dataset_meta is loaded from the checkpoint and + # then pass to the model in init_detector + visualizer.dataset_meta = model.dataset_meta + + if args.nvdecode: + VideoCapture = ffmpegcv.VideoCaptureNV + else: + VideoCapture = ffmpegcv.VideoCapture + video_origin = VideoCapture(args.video) + + batch_input_shape = prefetch_batch_input_shape( + model, (video_origin.width, video_origin.height)) + ori_shape = (video_origin.height, video_origin.width) + resize_wh = batch_input_shape[::-1] + video_resize = VideoCapture( + args.video, + resize=resize_wh, + resize_keepratio=True, + resize_keepratioalign='topleft') + + video_writer = None + if args.out: + video_writer = ffmpegcv.VideoWriter(args.out, fps=video_origin.fps) + + with torch.no_grad(): + for i, (frame_resize, frame_origin) in enumerate( + zip(track_iter_progress(video_resize), video_origin)): + data = pack_data(frame_resize, batch_input_shape, ori_shape) + result = model.test_step(data)[0] + + visualizer.add_datasample( + name='video', + image=frame_origin, + data_sample=result, + draw_gt=False, + show=False, + pred_score_thr=args.score_thr) + + frame_mask = visualizer.get_image() + + if args.show: + cv2.namedWindow('video', 0) + mmcv.imshow(frame_mask, 'video', args.wait_time) + if args.out: + video_writer.write(frame_mask) + + if video_writer: + video_writer.release() + video_origin.release() + video_resize.release() + + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/mmdetection/demo/webcam_demo.py b/mmdetection/demo/webcam_demo.py new file mode 100644 index 0000000..d090030 --- /dev/null +++ b/mmdetection/demo/webcam_demo.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +import cv2 +import mmcv +import torch + +from mmdet.apis import inference_detector, init_detector +from mmdet.registry import VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMDetection webcam demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--camera-id', type=int, default=0, help='camera device id') + parser.add_argument( + '--score-thr', type=float, default=0.5, help='bbox score threshold') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + # build the model from a config file and a checkpoint file + device = torch.device(args.device) + model = init_detector(args.config, args.checkpoint, device=device) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + # the dataset_meta is loaded from the checkpoint and + # then pass to the model in init_detector + visualizer.dataset_meta = model.dataset_meta + + camera = cv2.VideoCapture(args.camera_id) + + print('Press "Esc", "q" or "Q" to exit.') + while True: + ret_val, img = camera.read() + result = inference_detector(model, img) + + img = mmcv.imconvert(img, 'bgr', 'rgb') + visualizer.add_datasample( + name='result', + image=img, + data_sample=result, + draw_gt=False, + pred_score_thr=args.score_thr, + show=False) + + img = visualizer.get_image() + img = mmcv.imconvert(img, 'bgr', 'rgb') + cv2.imshow('result', img) + + ch = cv2.waitKey(1) + if ch == 27 or ch == ord('q') or ch == ord('Q'): + break + + +if __name__ == '__main__': + main() diff --git a/mmdetection/docker/Dockerfile b/mmdetection/docker/Dockerfile new 
file mode 100644 index 0000000..2737ec0 --- /dev/null +++ b/mmdetection/docker/Dockerfile @@ -0,0 +1,40 @@ +ARG PYTORCH="1.9.0" +ARG CUDA="11.1" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \ + TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ + FORCE_CUDA="1" + +# Avoid Public GPG key error +# https://github.com/NVIDIA/nvidia-docker/issues/1631 +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key del 7fa2af80 \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# (Optional, use Mirror to speed up downloads) +# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \ +# pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + +# Install the required packages +RUN apt-get update \ + && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install MMEngine and MMCV +RUN pip install openmim && \ + mim install "mmengine>=0.7.1" "mmcv>=2.0.0rc4" + +# Install MMDetection +RUN conda clean --all \ + && git clone https://github.com/open-mmlab/mmdetection.git /mmdetection \ + && cd /mmdetection \ + && pip install --no-cache-dir -e . + +WORKDIR /mmdetection diff --git a/mmdetection/docker/serve/Dockerfile b/mmdetection/docker/serve/Dockerfile new file mode 100644 index 0000000..8729189 --- /dev/null +++ b/mmdetection/docker/serve/Dockerfile @@ -0,0 +1,62 @@ +ARG PYTORCH="1.9.0" +ARG CUDA="11.1" +ARG CUDNN="8" +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ARG MMCV="2.0.0rc4" +ARG MMDET="3.2.0" + +ENV PYTHONUNBUFFERED TRUE + +# Avoid Public GPG key error +# https://github.com/NVIDIA/nvidia-docker/issues/1631 +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key del 7fa2af80 \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# (Optional, use Mirror to speed up downloads) +# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list + +# Install the required packages +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + g++ \ + openjdk-11-jre-headless \ + # MMDet Requirements + ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \ + && rm -rf /var/lib/apt/lists/* + +ENV PATH="/opt/conda/bin:$PATH" \ + FORCE_CUDA="1" + +# TORCHSEVER +RUN pip install torchserve torch-model-archiver + +# MMLAB +ARG PYTORCH +ARG CUDA +RUN pip install mmengine +RUN ["/bin/bash", "-c", "pip install mmcv==${MMCV} -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"] +RUN pip install mmdet==${MMDET} + +RUN useradd -m model-server \ + && mkdir -p /home/model-server/tmp + +COPY entrypoint.sh /usr/local/bin/entrypoint.sh + +RUN chmod +x /usr/local/bin/entrypoint.sh \ + && chown -R model-server 
/home/model-server + +COPY config.properties /home/model-server/config.properties +RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store + +EXPOSE 8080 8081 8082 + +USER model-server +WORKDIR /home/model-server +ENV TEMP=/home/model-server/tmp +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["serve"] diff --git a/mmdetection/docker/serve/config.properties b/mmdetection/docker/serve/config.properties new file mode 100644 index 0000000..efb9c47 --- /dev/null +++ b/mmdetection/docker/serve/config.properties @@ -0,0 +1,5 @@ +inference_address=http://0.0.0.0:8080 +management_address=http://0.0.0.0:8081 +metrics_address=http://0.0.0.0:8082 +model_store=/home/model-server/model-store +load_models=all diff --git a/mmdetection/docker/serve_cn/Dockerfile b/mmdetection/docker/serve_cn/Dockerfile new file mode 100644 index 0000000..5109064 --- /dev/null +++ b/mmdetection/docker/serve_cn/Dockerfile @@ -0,0 +1,65 @@ +ARG PYTORCH="1.9.0" +ARG CUDA="11.1" +ARG CUDNN="8" +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ARG MMCV="2.0.0rc4" +ARG MMDET="3.2.0" + +ENV PYTHONUNBUFFERED TRUE + +# Avoid Public GPG key error +# - https://github.com/NVIDIA/nvidia-docker/issues/1631 +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-get update \ + && apt-get install -y wget \ + && rm -rf /var/lib/apt/lists/* \ + && apt-key del 7fa2af80 \ + && apt-get update && apt-get install -y --no-install-recommends wget \ + && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb \ + && dpkg -i cuda-keyring_1.0-1_all.deb +# (Optional, use Mirror to speed up downloads) +# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list + +# Install the required packages +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + g++ \ + openjdk-11-jre-headless \ + # MMDet Requirements + ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \ + && rm -rf /var/lib/apt/lists/* + +ENV PATH="/opt/conda/bin:$PATH" \ + FORCE_CUDA="1" + +# TORCHSEVER +RUN pip install torchserve torch-model-archiver nvgpu -i https://pypi.mirrors.ustc.edu.cn/simple/ + +# MMLAB +ARG PYTORCH +ARG CUDA +RUN pip install mmengine -i https://pypi.mirrors.ustc.edu.cn/simple/ +RUN ["/bin/bash", "-c", "pip install mmcv==${MMCV} -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"] +RUN pip install mmdet==${MMDET} -i https://pypi.mirrors.ustc.edu.cn/simple/ + +RUN useradd -m model-server \ + && mkdir -p /home/model-server/tmp + +COPY entrypoint.sh /usr/local/bin/entrypoint.sh + +RUN chmod +x /usr/local/bin/entrypoint.sh \ + && chown -R model-server /home/model-server + +COPY config.properties /home/model-server/config.properties +RUN mkdir /home/model-server/model-store && chown -R model-server /home/model-server/model-store + +EXPOSE 8080 8081 8082 + +USER model-server +WORKDIR /home/model-server +ENV TEMP=/home/model-server/tmp +ENTRYPOINT ["/usr/local/bin/entrypoint.sh"] +CMD ["serve"] diff --git a/mmdetection/docs/en/Makefile b/mmdetection/docs/en/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/mmdetection/docs/en/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. 
+SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/mmdetection/docs/en/_static/css/readthedocs.css b/mmdetection/docs/en/_static/css/readthedocs.css new file mode 100644 index 0000000..57ed0ad --- /dev/null +++ b/mmdetection/docs/en/_static/css/readthedocs.css @@ -0,0 +1,6 @@ +.header-logo { + background-image: url("../image/mmdet-logo.png"); + background-size: 156px 40px; + height: 40px; + width: 156px; +} diff --git a/mmdetection/docs/en/advanced_guides/conventions.md b/mmdetection/docs/en/advanced_guides/conventions.md new file mode 100644 index 0000000..da159ac --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/conventions.md @@ -0,0 +1,111 @@ +# Conventions + +Please check the following conventions if you would like to modify MMDetection as your own project. + +## About the order of image shape + +In OpenMMLab 2.0, to be consistent with the input argument of OpenCV, the argument about image shape in the data transformation pipeline is always in the `(width, height)` order. On the contrary, for computation convenience, the order of the field going through the data pipeline and the model is `(height, width)`. Specifically, in the results processed by each data transform pipeline, the fields and their value meaning is as below: + +- img_shape: (height, width) +- ori_shape: (height, width) +- pad_shape: (height, width) +- batch_input_shape: (height, width) + +As an example, the initialization arguments of `Mosaic` are as below: + +```python +@TRANSFORMS.register_module() +class Mosaic(BaseTransform): + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + prob: float = 1.0) -> None: + ... + + # img_scale order should be (width, height) + self.img_scale = img_scale + + def transform(self, results: dict) -> dict: + ... + + results['img'] = mosaic_img + # (height, width) + results['img_shape'] = mosaic_img.shape[:2] +``` + +## Loss + +In MMDetection, a `dict` containing losses and metrics will be returned by `model(**data)`. + +For example, in bbox head, + +```python +class BBoxHead(nn.Module): + ... + def loss(self, ...): + losses = dict() + # classification loss + losses['loss_cls'] = self.loss_cls(...) + # classification accuracy + losses['acc'] = accuracy(...) + # bbox regression loss + losses['loss_bbox'] = self.loss_bbox(...) + return losses +``` + +`bbox_head.loss()` will be called during model forward. +The returned dict contains `'loss_bbox'`, `'loss_cls'`, `'acc'` . +Only `'loss_bbox'`, `'loss_cls'` will be used during back propagation, +`'acc'` will only be used as a metric to monitor training process. + +By default, only values whose keys contain `'loss'` will be back propagated. +This behavior could be changed by modifying `BaseDetector.train_step()`. + +## Empty Proposals + +In MMDetection, We have added special handling and unit test for empty proposals of two-stage. We need to deal with the empty proposals of the entire batch and single image at the same time. 
For example, in CascadeRoIHead, + +```python +# simple_test method +... +# There is no proposal in the whole batch +if rois.shape[0] == 0: + bbox_results = [[ + np.zeros((0, 5), dtype=np.float32) + for _ in range(self.bbox_head[-1].num_classes) + ]] * num_imgs + if self.with_mask: + mask_classes = self.mask_head[-1].num_classes + segm_results = [[[] for _ in range(mask_classes)] + for _ in range(num_imgs)] + results = list(zip(bbox_results, segm_results)) + else: + results = bbox_results + return results +... + +# There is no proposal in the single image +for i in range(self.num_stages): + ... + if i < self.num_stages - 1: + for j in range(num_imgs): + # Handle empty proposal + if rois[j].shape[0] > 0: + bbox_label = cls_score[j][:, :-1].argmax(dim=1) + refine_roi = self.bbox_head[i].regress_by_class( + rois[j], bbox_label, bbox_pred[j], img_metas[j]) + refine_roi_list.append(refine_roi) +``` + +If you have customized `RoIHead`, you can refer to the above method to deal with empty proposals. + +## Coco Panoptic Dataset + +In MMDetection, we have supported COCO Panoptic dataset. We clarify a few conventions about the implementation of `CocoPanopticDataset` here. + +1. For mmdet\<=2.16.0, the range of foreground and background labels in semantic segmentation are different from the default setting of MMDetection. The label `0` stands for `VOID` label and the category labels start from `1`. + Since mmdet=2.17.0, the category labels of semantic segmentation start from `0` and label `255` stands for `VOID` for consistency with labels of bounding boxes. + To achieve that, the `Pad` pipeline supports setting the padding value for `seg`. +2. In the evaluation, the panoptic result is a map with the same shape as the original image. Each value in the result map has the format of `instance_id * INSTANCE_OFFSET + category_id`. diff --git a/mmdetection/docs/en/advanced_guides/customize_dataset.md b/mmdetection/docs/en/advanced_guides/customize_dataset.md new file mode 100644 index 0000000..3d63d12 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/customize_dataset.md @@ -0,0 +1,433 @@ +# Customize Datasets + +## Support new data format + +To support a new data format, you can either convert them to existing formats (COCO format or PASCAL format) or directly convert them to the middle format. You could also choose to convert them offline (before training by a script) or online (implement a new dataset and do the conversion at training). In MMDetection, we recommend to convert the data into COCO formats and do the conversion offline, thus you only need to modify the config's data annotation paths and classes after the conversion of your data. + +### Reorganize new data formats to existing format + +The simplest way is to convert your dataset to existing dataset formats (COCO or PASCAL VOC). + +The annotation JSON files in COCO format has the following necessary keys: + +```python +'images': [ + { + 'file_name': 'COCO_val2014_000000001268.jpg', + 'height': 427, + 'width': 640, + 'id': 1268 + }, + ... +], + +'annotations': [ + { + 'segmentation': [[192.81, + 247.09, + ... + 219.03, + 249.06]], # If you have mask labels, and it is in polygon XY point coordinate format, you need to ensure that at least 3 point coordinates are included. Otherwise, it is an invalid polygon. + 'area': 1035.749, + 'iscrowd': 0, + 'image_id': 1268, + 'bbox': [192.81, 224.8, 74.73, 33.43], + 'category_id': 16, + 'id': 42986 + }, + ... 
+], + +'categories': [ + {'id': 0, 'name': 'car'}, + ] +``` + +There are three necessary keys in the JSON file: + +- `images`: contains a list of images with their information like `file_name`, `height`, `width`, and `id`. +- `annotations`: contains the list of instance annotations. +- `categories`: contains the list of categories names and their ID. + +After the data pre-processing, there are two steps for users to train the customized new dataset with existing format (e.g. COCO format): + +1. Modify the config file for using the customized dataset. +2. Check the annotations of the customized dataset. + +Here we give an example to show the above two steps, which uses a customized dataset of 5 classes with COCO format to train an existing Cascade Mask R-CNN R50-FPN detector. + +#### 1. Modify the config file for using the customized dataset + +There are two aspects involved in the modification of config file: + +1. The `data` field. Specifically, you need to explicitly add the `metainfo=dict(classes=classes)` fields in `train_dataloader.dataset`, `val_dataloader.dataset` and `test_dataloader.dataset` and `classes` must be a tuple type. +2. The `num_classes` field in the `model` part. Explicitly over-write all the `num_classes` from default value (e.g. 80 in COCO) to your classes number. + +In `configs/my_custom_config.py`: + +```python + +# the new config inherits the base configs to highlight the necessary modification +_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py' + +# 1. dataset settings +dataset_type = 'CocoDataset' +classes = ('a', 'b', 'c', 'd', 'e') +data_root='path/to/your/' + +train_dataloader = dict( + batch_size=2, + num_workers=2, + dataset=dict( + type=dataset_type, + # explicitly add your class names to the field `metainfo` + metainfo=dict(classes=classes), + data_root=data_root, + ann_file='train/annotation_data', + data_prefix=dict(img='train/image_data') + ) + ) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + test_mode=True, + # explicitly add your class names to the field `metainfo` + metainfo=dict(classes=classes), + data_root=data_root, + ann_file='val/annotation_data', + data_prefix=dict(img='val/image_data') + ) + ) + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + test_mode=True, + # explicitly add your class names to the field `metainfo` + metainfo=dict(classes=classes), + data_root=data_root, + ann_file='test/annotation_data', + data_prefix=dict(img='test/image_data') + ) + ) + +# 2. model settings + +# explicitly over-write all the `num_classes` field from default 80 to 5. +model = dict( + roi_head=dict( + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + # explicitly over-write all the `num_classes` field from default 80 to 5. + num_classes=5), + dict( + type='Shared2FCBBoxHead', + # explicitly over-write all the `num_classes` field from default 80 to 5. + num_classes=5), + dict( + type='Shared2FCBBoxHead', + # explicitly over-write all the `num_classes` field from default 80 to 5. + num_classes=5)], + # explicitly over-write all the `num_classes` field from default 80 to 5. + mask_head=dict(num_classes=5))) +``` + +#### 2. Check the annotations of the customized dataset + +Assuming your customized dataset is COCO format, make sure you have the correct annotations in the customized dataset: + +1. The length for `categories` field in annotations should exactly equal the tuple length of `classes` fields in your config, meaning the number of classes (e.g. 
5 in this example). +2. The `classes` fields in your config file should have exactly the same elements and the same order with the `name` in `categories` of annotations. MMDetection automatically maps the uncontinuous `id` in `categories` to the continuous label indices, so the string order of `name` in `categories` field affects the order of label indices. Meanwhile, the string order of `classes` in config affects the label text during visualization of predicted bounding boxes. +3. The `category_id` in `annotations` field should be valid, i.e., all values in `category_id` should belong to `id` in `categories`. + +Here is a valid example of annotations: + +```python + +'annotations': [ + { + 'segmentation': [[192.81, + 247.09, + ... + 219.03, + 249.06]], # if you have mask labels + 'area': 1035.749, + 'iscrowd': 0, + 'image_id': 1268, + 'bbox': [192.81, 224.8, 74.73, 33.43], + 'category_id': 16, + 'id': 42986 + }, + ... +], + +# MMDetection automatically maps the uncontinuous `id` to the continuous label indices. +'categories': [ + {'id': 1, 'name': 'a'}, {'id': 3, 'name': 'b'}, {'id': 4, 'name': 'c'}, {'id': 16, 'name': 'd'}, {'id': 17, 'name': 'e'}, + ] +``` + +We use this way to support CityScapes dataset. The script is in [cityscapes.py](../../../tools/dataset_converters/cityscapes.py) and we also provide the finetuning [configs](../../../configs/cityscapes). + +**Note** + +1. For instance segmentation datasets, **MMDetection only supports evaluating mask AP of dataset in COCO format for now**. +2. It is recommended to convert the data offline before training, thus you can still use `CocoDataset` and only need to modify the path of annotations and the training classes. + +### Reorganize new data format to middle format + +It is also fine if you do not want to convert the annotation format to COCO or PASCAL format. +Actually, we define a simple annotation format in MMEninge's [BaseDataset](https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/base_dataset.py#L116) and all existing datasets are +processed to be compatible with it, either online or offline. + +The annotation of the dataset must be in `json` or `yaml`, `yml` or `pickle`, `pkl` format; the dictionary stored in the annotation file must contain two fields `metainfo` and `data_list`. The `metainfo` is a dictionary, which contains the metadata of the dataset, such as class information; `data_list` is a list, each element in the list is a dictionary, the dictionary defines the raw data of one image, and each raw data contains a or several training/testing samples. + +Here is an example. + +```python +{ + 'metainfo': + { + 'classes': ('person', 'bicycle', 'car', 'motorcycle'), + ... + }, + 'data_list': + [ + { + "img_path": "xxx/xxx_1.jpg", + "height": 604, + "width": 640, + "instances": + [ + { + "bbox": [0, 0, 10, 20], + "bbox_label": 1, + "ignore_flag": 0 + }, + { + "bbox": [10, 10, 110, 120], + "bbox_label": 2, + "ignore_flag": 0 + } + ] + }, + { + "img_path": "xxx/xxx_2.jpg", + "height": 320, + "width": 460, + "instances": + [ + { + "bbox": [10, 0, 20, 20], + "bbox_label": 3, + "ignore_flag": 1, + } + ] + }, + ... + ] +} +``` + +Some datasets may provide annotations like crowd/difficult/ignored bboxes, we use `ignore_flag`to cover them. + +After obtaining the above standard data annotation format, you can directly use [BaseDetDataset](../../../mmdet/datasets/base_det_dataset.py#L13) of MMDetection in the configuration , without conversion. 
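+
+For illustration, a minimal sketch of a dataloader that points at such a middle-format annotation file might look like the snippet below. The paths, the `train_pipeline` variable, and using `BaseDetDataset` directly as the dataset `type` are placeholders/assumptions here; depending on your MMDetection version you may prefer a thin subclass of `BaseDetDataset` (as in the next section) rather than the base class itself.
+
+```python
+# Minimal sketch: train on a middle-format annotation file without conversion.
+# Paths and `train_pipeline` are placeholders, not real project files.
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    dataset=dict(
+        type='BaseDetDataset',  # assumed usable directly; a small subclass also works
+        data_root='path/to/your/',
+        ann_file='annotations/middle_format_train.json',
+        data_prefix=dict(img='train/image_data/'),
+        pipeline=train_pipeline))
+```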
+ +### An example of customized dataset + +Assume the annotation is in a new format in text files. +The bounding boxes annotations are stored in text file `annotation.txt` as the following + +``` +# +000001.jpg +1280 720 +2 +10 20 40 60 1 +20 40 50 60 2 +# +000002.jpg +1280 720 +3 +50 20 40 60 2 +20 40 30 45 2 +30 40 50 60 3 +``` + +We can create a new dataset in `mmdet/datasets/my_dataset.py` to load the data. + +```python +import mmengine + +from mmdet.base_det_dataset import BaseDetDataset +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class MyDataset(BaseDetDataset): + + METAINFO = { + 'classes': ('person', 'bicycle', 'car', 'motorcycle'), + 'palette': [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230)] + } + + def load_data_list(self, ann_file): + ann_list = mmengine.list_from_file(ann_file) + + data_infos = [] + for i, ann_line in enumerate(ann_list): + if ann_line != '#': + continue + + img_shape = ann_list[i + 2].split(' ') + width = int(img_shape[0]) + height = int(img_shape[1]) + bbox_number = int(ann_list[i + 3]) + + instances = [] + for anns in ann_list[i + 4:i + 4 + bbox_number]: + instance = {} + instance['bbox'] = [float(ann) for ann in anns.split(' ')[:4]] + instance['bbox_label']=int(anns[4]) + instances.append(instance) + + data_infos.append( + dict( + img_path=ann_list[i + 1], + img_id=i, + width=width, + height=height, + instances=instances + )) + + return data_infos +``` + +Then in the config, to use `MyDataset` you can modify the config as the following + +```python +dataset_A_train = dict( + type='MyDataset', + ann_file = 'image_list.txt', + pipeline=train_pipeline +) +``` + +## Customize datasets by dataset wrappers + +MMEngine also supports many dataset wrappers to mix the dataset or modify the dataset distribution for training. +Currently it supports to three dataset wrappers as below: + +- `RepeatDataset`: simply repeat the whole dataset. +- `ClassBalancedDataset`: repeat dataset in a class balanced manner. +- `ConcatDataset`: concat datasets. + +For detailed usage, see [MMEngine Dataset Wrapper](#TODO). + +## Modify Dataset Classes + +With existing dataset types, we can modify the metainfo of them to train subset of the annotations. +For example, if you want to train only three classes of the current dataset, +you can modify the classes of dataset. +The dataset will filter out the ground truth boxes of other classes automatically. + +```python +classes = ('person', 'bicycle', 'car') +train_dataloader = dict( + dataset=dict( + metainfo=dict(classes=classes)) + ) +val_dataloader = dict( + dataset=dict( + metainfo=dict(classes=classes)) + ) +test_dataloader = dict( + dataset=dict( + metainfo=dict(classes=classes)) + ) +``` + +**Note**: + +- Before MMDetection v2.5.0, the dataset will filter out the empty GT images automatically if the classes are set and there is no way to disable that through config. This is an undesirable behavior and introduces confusion because if the classes are not set, the dataset only filter the empty GT images when `filter_empty_gt=True` and `test_mode=False`. After MMDetection v2.5.0, we decouple the image filtering process and the classes modification, i.e., the dataset will only filter empty GT images when `filter_cfg=dict(filter_empty_gt=True)` and `test_mode=False`, no matter whether the classes are set. Thus, setting the classes only influences the annotations of classes used for training and users could decide whether to filter empty GT images by themselves. 
+- When directly using `BaseDataset` in MMEngine or `BaseDetDataset` in MMDetection, users cannot filter images without GT by modifying the configuration, but it can be solved in an offline way. +- Please remember to modify the `num_classes` in the head when specifying `classes` in dataset. We implemented [NumClassCheckHook](../../../mmdet/engine/hooks/num_class_check_hook.py) to check whether the numbers are consistent since v2.9.0(after PR#4508). + +## COCO Panoptic Dataset + +Now we support COCO Panoptic Dataset, the format of panoptic annotations is different from COCO format. +Both the foreground and the background will exist in the annotation file. +The annotation json files in COCO Panoptic format has the following necessary keys: + +```python +'images': [ + { + 'file_name': '000000001268.jpg', + 'height': 427, + 'width': 640, + 'id': 1268 + }, + ... +] + +'annotations': [ + { + 'filename': '000000001268.jpg', + 'image_id': 1268, + 'segments_info': [ + { + 'id':8345037, # One-to-one correspondence with the id in the annotation map. + 'category_id': 51, + 'iscrowd': 0, + 'bbox': (x1, y1, w, h), # The bbox of the background is the outer rectangle of its mask. + 'area': 24315 + }, + ... + ] + }, + ... +] + +'categories': [ # including both foreground categories and background categories + {'id': 0, 'name': 'person'}, + ... + ] +``` + +Moreover, the `seg` must be set to the path of the panoptic annotation images. + +```python +dataset_type = 'CocoPanopticDataset' +data_root='path/to/your/' + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='train/image_data/', seg='train/panoptic/image_annotation_data/') + ) +) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='val/image_data/', seg='val/panoptic/image_annotation_data/') + ) +) +test_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='test/image_data/', seg='test/panoptic/image_annotation_data/') + ) +) +``` diff --git a/mmdetection/docs/en/advanced_guides/customize_losses.md b/mmdetection/docs/en/advanced_guides/customize_losses.md new file mode 100644 index 0000000..3120dc0 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/customize_losses.md @@ -0,0 +1,126 @@ +# Customize Losses + +MMDetection provides users with different loss functions. But the default configuration may be not applicable for different datasets or models, so users may want to modify a specific loss to adapt the new situation. + +This tutorial first elaborate the computation pipeline of losses, then give some instructions about how to modify each step. The modification can be categorized as tweaking and weighting. + +## Computation pipeline of a loss + +Given the input prediction and target, as well as the weights, a loss function maps the input tensor to the final loss scalar. The mapping can be divided into five steps: + +1. Set the sampling method to sample positive and negative samples. + +2. Get **element-wise** or **sample-wise** loss by the loss kernel function. + +3. Weighting the loss with a weight tensor **element-wisely**. + +4. Reduce the loss tensor to a **scalar**. + +5. Weighting the loss with a **scalar**. + +## Set sampling method (step 1) + +For some loss functions, sampling strategies are needed to avoid imbalance between positive and negative samples. 
+
+For example, when using `CrossEntropyLoss` in the RPN head, we need to set `RandomSampler` in `train_cfg`:
+
+```python
+train_cfg=dict(
+    rpn=dict(
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False)))
+```
+
+For other losses that have a built-in mechanism to balance positive and negative samples, such as Focal Loss, GHMC, and QualityFocalLoss, the sampler is no longer necessary.
+
+## Tweaking loss
+
+Tweaking a loss is mostly related to steps 2, 4, and 5, and most modifications can be specified in the config.
+Here we take [Focal Loss (FL)](../../../mmdet/models/losses/focal_loss.py) as an example.
+The following code snippets are the constructor and the config of FL, respectively; they correspond to each other one to one.
+
+```python
+@LOSSES.register_module()
+class FocalLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 gamma=2.0,
+                 alpha=0.25,
+                 reduction='mean',
+                 loss_weight=1.0):
+```
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=1.0)
+```
+
+### Tweaking hyper-parameters (step 2)
+
+`gamma` and `alpha` are two hyper-parameters of the Focal Loss. If we want to change `gamma` to 1.5 and `alpha` to 0.5, we can specify them in the config as follows:
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=1.5,
+    alpha=0.5,
+    loss_weight=1.0)
+```
+
+### Tweaking the way of reduction (step 4)
+
+The default reduction for FL is `mean`. If we want to change it from `mean` to `sum`, we can specify it in the config as follows:
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=1.0,
+    reduction='sum')
+```
+
+### Tweaking loss weight (step 5)
+
+The loss weight here is a scalar that controls the relative weight of different losses in multi-task learning, e.g., the classification loss and the regression loss. If we want to change the loss weight of the classification loss to 0.5, we can specify it in the config as follows:
+
+```python
+loss_cls=dict(
+    type='FocalLoss',
+    use_sigmoid=True,
+    gamma=2.0,
+    alpha=0.25,
+    loss_weight=0.5)
+```
+
+## Weighting loss (step 3)
+
+Weighting the loss means re-weighting it element-wise: we multiply the loss tensor by a weight tensor of the same shape, so that different entries of the loss can be scaled differently.
+The weight tensor varies across models and is highly context-dependent, but overall there are two kinds of loss weights: `label_weights` for the classification loss and `bbox_weights` for the bbox regression loss. You can find them in the `get_targets` method of the corresponding head. Here we take [ATSSHead](../../../mmdet/models/dense_heads/atss_head.py#L322) as an example, which inherits [AnchorHead](../../../mmdet/models/dense_heads/anchor_head.py) but overrides its `get_targets` method to yield different `label_weights` and `bbox_weights`.
+
+```
+class ATSSHead(AnchorHead):
+
+    ...
+ + def get_targets(self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + unmap_outputs=True): +``` diff --git a/mmdetection/docs/en/advanced_guides/customize_models.md b/mmdetection/docs/en/advanced_guides/customize_models.md new file mode 100644 index 0000000..1779aeb --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/customize_models.md @@ -0,0 +1,412 @@ +# Customize Models + +We basically categorize model components into 5 types. + +- backbone: usually an FCN network to extract feature maps, e.g., ResNet, MobileNet. +- neck: the component between backbones and heads, e.g., FPN, PAFPN. +- head: the component for specific tasks, e.g., bbox prediction and mask prediction. +- roi extractor: the part for extracting RoI features from feature maps, e.g., RoI Align. +- loss: the component in head for calculating losses, e.g., FocalLoss, L1Loss, and GHMLoss. + +## Develop new components + +### Add a new backbone + +Here we show how to develop new components with an example of MobileNet. + +#### 1. Define a new backbone (e.g. MobileNet) + +Create a new file `mmdet/models/backbones/mobilenet.py`. + +```python +import torch.nn as nn + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class MobileNet(nn.Module): + + def __init__(self, arg1, arg2): + pass + + def forward(self, x): # should return a tuple + pass +``` + +#### 2. Import the module + +You can either add the following line to `mmdet/models/backbones/__init__.py` + +```python +from .mobilenet import MobileNet +``` + +or alternatively add + +```python +custom_imports = dict( + imports=['mmdet.models.backbones.mobilenet'], + allow_failed_imports=False) +``` + +to the config file to avoid modifying the original code. + +#### 3. Use the backbone in your config file + +```python +model = dict( + ... + backbone=dict( + type='MobileNet', + arg1=xxx, + arg2=xxx), + ... +``` + +### Add new necks + +#### 1. Define a neck (e.g. PAFPN) + +Create a new file `mmdet/models/necks/pafpn.py`. + +```python +import torch.nn as nn + +from mmdet.registry import MODELS + +@MODELS.register_module() +class PAFPN(nn.Module): + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False): + pass + + def forward(self, inputs): + # implementation is ignored + pass +``` + +#### 2. Import the module + +You can either add the following line to `mmdet/models/necks/__init__.py`, + +```python +from .pafpn import PAFPN +``` + +or alternatively add + +```python +custom_imports = dict( + imports=['mmdet.models.necks.pafpn'], + allow_failed_imports=False) +``` + +to the config file and avoid modifying the original code. + +#### 3. Modify the config file + +```python +neck=dict( + type='PAFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5) +``` + +### Add new heads + +Here we show how to develop a new head with the example of [Double Head R-CNN](https://arxiv.org/abs/1904.06493) as the following. + +First, add a new bbox head in `mmdet/models/roi_heads/bbox_heads/double_bbox_head.py`. +Double Head R-CNN implements a new bbox head for object detection. +To implement a bbox head, basically we need to implement three functions of the new module as the following. 
+ +```python +from typing import Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList +from torch import Tensor + +from mmdet.models.backbones.resnet import Bottleneck +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, MultiConfig, OptConfigType, OptMultiConfig +from .bbox_head import BBoxHead + +@MODELS.register_module() +class DoubleConvFCBBoxHead(BBoxHead): + r"""Bbox head used in Double-Head R-CNN + + .. code-block:: none + + /-> cls + /-> shared convs -> + \-> reg + roi features + /-> cls + \-> shared fc -> + \-> reg + """ # noqa: W605 + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 0, + conv_out_channels: int = 1024, + fc_out_channels: int = 1024, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: MultiConfig = dict( + type='Normal', + override=[ + dict(type='Normal', name='fc_cls', std=0.01), + dict(type='Normal', name='fc_reg', std=0.001), + dict( + type='Xavier', + name='fc_branch', + distribution='uniform') + ]), + **kwargs) -> None: + kwargs.setdefault('with_avg_pool', True) + super().__init__(init_cfg=init_cfg, **kwargs) + + def forward(self, x_cls: Tensor, x_reg: Tensor) -> Tuple[Tensor]: + +``` + +Second, implement a new RoI Head if it is necessary. We plan to inherit the new `DoubleHeadRoIHead` from `StandardRoIHead`. We can find that a `StandardRoIHead` already implements the following functions. + +```python +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .base_roi_head import BaseRoIHead + + +@MODELS.register_module() +class StandardRoIHead(BaseRoIHead): + """Simplest base roi head including one bbox head and one mask head.""" + + def init_assigner_sampler(self) -> None: + + def init_bbox_head(self, bbox_roi_extractor: ConfigType, + bbox_head: ConfigType) -> None: + + def init_mask_head(self, mask_roi_extractor: ConfigType, + mask_head: ConfigType) -> None: + + def forward(self, x: Tuple[Tensor], + rpn_results_list: InstanceList) -> tuple: + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + + def mask_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult], bbox_feats: Tensor, + batch_gt_instances: InstanceList) -> dict: + + def _mask_forward(self, + x: Tuple[Tensor], + rois: Tensor = None, + pos_inds: Optional[Tensor] = None, + bbox_feats: Optional[Tensor] = None) -> dict: + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + +``` + +Double Head's modification is mainly in the `bbox_forward` logic, and it inherits other logics from the `StandardRoIHead`. 
In the `mmdet/models/roi_heads/double_roi_head.py`, we implement the new RoI Head as the following: + +```python +from typing import Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class DoubleHeadRoIHead(StandardRoIHead): + """RoI head for `Double Head RCNN `_. + + Args: + reg_roi_scale_factor (float): The scale factor to extend the rois + used to extract the regression features. + """ + + def __init__(self, reg_roi_scale_factor: float, **kwargs): + super().__init__(**kwargs) + self.reg_roi_scale_factor = reg_roi_scale_factor + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_cls_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + bbox_reg_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], + rois, + roi_scale_factor=self.reg_roi_scale_factor) + if self.with_shared_head: + bbox_cls_feats = self.shared_head(bbox_cls_feats) + bbox_reg_feats = self.shared_head(bbox_reg_feats) + cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats) + + bbox_results = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + bbox_feats=bbox_cls_feats) + return bbox_results +``` + +Last, the users need to add the module in +`mmdet/models/bbox_heads/__init__.py` and `mmdet/models/roi_heads/__init__.py` thus the corresponding registry could find and load them. + +Alternatively, the users can add + +```python +custom_imports=dict( + imports=['mmdet.models.roi_heads.double_roi_head', 'mmdet.models.roi_heads.bbox_heads.double_bbox_head']) +``` + +to the config file and achieve the same goal. + +The config file of Double Head R-CNN is as the following + +```python +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + type='DoubleHeadRoIHead', + reg_roi_scale_factor=1.3, + bbox_head=dict( + _delete_=True, + type='DoubleConvFCBBoxHead', + num_convs=4, + num_fcs=2, + in_channels=256, + conv_out_channels=1024, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=2.0)))) + +``` + +Since MMDetection 2.0, the config system supports to inherit configs such that the users can focus on the modification. +The Double Head R-CNN mainly uses a new `DoubleHeadRoIHead` and a new `DoubleConvFCBBoxHead `, the arguments are set according to the `__init__` function of each module. + +### Add new loss + +Assume you want to add a new loss as `MyLoss`, for bounding box regression. +To add a new loss function, the users need implement it in `mmdet/models/losses/my_loss.py`. +The decorator `weighted_loss` enable the loss to be weighted for each element. 
+ +```python +import torch +import torch.nn as nn + +from mmdet.registry import MODELS +from .utils import weighted_loss + +@weighted_loss +def my_loss(pred, target): + assert pred.size() == target.size() and target.numel() > 0 + loss = torch.abs(pred - target) + return loss + +@MODELS.register_module() +class MyLoss(nn.Module): + + def __init__(self, reduction='mean', loss_weight=1.0): + super(MyLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * my_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox +``` + +Then the users need to add it in the `mmdet/models/losses/__init__.py`. + +```python +from .my_loss import MyLoss, my_loss + +``` + +Alternatively, you can add + +```python +custom_imports=dict( + imports=['mmdet.models.losses.my_loss']) +``` + +to the config file and achieve the same goal. + +To use it, modify the `loss_xxx` field. +Since MyLoss is for regression, you need to modify the `loss_bbox` field in the head. + +```python +loss_bbox=dict(type='MyLoss', loss_weight=1.0)) +``` diff --git a/mmdetection/docs/en/advanced_guides/customize_runtime.md b/mmdetection/docs/en/advanced_guides/customize_runtime.md new file mode 100644 index 0000000..e6ce740 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/customize_runtime.md @@ -0,0 +1,391 @@ +# Customize Runtime Settings + +## Customize optimization settings + +Optimization related configuration is now all managed by `optim_wrapper` which usually has three fields: `optimizer`, `paramwise_cfg`, `clip_grad`, refer to [OptimWrapper](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.md) for more detail. See the example below, where `Adamw` is used as an `optimizer`, the learning rate of the backbone is reduced by a factor of 10, and gradient clipping is added. + +```python +optim_wrapper = dict( + type='OptimWrapper', + # optimizer + optimizer=dict( + type='AdamW', + lr=0.0001, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999)), + + # Parameter-level learning rate and weight decay settings + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + }, + norm_decay_mult=0.0), + + # gradient clipping + clip_grad=dict(max_norm=0.01, norm_type=2)) +``` + +### Customize optimizer supported by Pytorch + +We already support to use all the optimizers implemented by PyTorch, and the only modification is to change the `optimizer` field in `optim_wrapper` field of config files. For example, if you want to use `ADAM` (note that the performance could drop a lot), the modification could be as the following. + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='Adam', lr=0.0003, weight_decay=0.0001)) +``` + +To modify the learning rate of the model, the users only need to modify the `lr` in `optimizer`. The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch. + +### Customize self-implemented optimizer + +#### 1. Define a new optimizer + +A customized optimizer could be defined as following. + +Assume you want to add a optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`. 
+You need to create a new directory named `mmdet/engine/optimizers`. And then implement the new optimizer in a file, e.g., in `mmdet/engine/optimizers/my_optimizer.py`: + +```python +from mmdet.registry import OPTIMIZERS +from torch.optim import Optimizer + + +@OPTIMIZERS.register_module() +class MyOptimizer(Optimizer): + + def __init__(self, a, b, c) + +``` + +#### 2. Add the optimizer to registry + +To find the above module defined above, this module should be imported into the main namespace at first. There are two options to achieve it. + +- Modify `mmdet/engine/optimizers/__init__.py` to import it. + + The newly defined module should be imported in `mmdet/engine/optimizers/__init__.py` so that the registry will find the new module and add it: + +```python +from .my_optimizer import MyOptimizer +``` + +- Use `custom_imports` in the config to manually import it + +```python +custom_imports = dict(imports=['mmdet.engine.optimizers.my_optimizer'], allow_failed_imports=False) +``` + +The module `mmdet.engine.optimizers.my_optimizer` will be imported at the beginning of the program and the class `MyOptimizer` is then automatically registered. +Note that only the package containing the class `MyOptimizer` should be imported. +`mmdet.engine.optimizers.my_optimizer.MyOptimizer` **cannot** be imported directly. + +Actually users can use a totally different file directory structure using this importing method, as long as the module root can be located in `PYTHONPATH`. + +#### 3. Specify the optimizer in the config file + +Then you can use `MyOptimizer` in `optimizer` field in `optim_wrapper` field of config files. In the configs, the optimizers are defined by the field `optimizer` like the following: + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)) +``` + +To use your own optimizer, the field can be changed to + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)) +``` + +### Customize optimizer wrapper constructor + +Some models may have some parameter-specific settings for optimization, e.g. weight decay for BatchNorm layers. +The users can do those fine-grained parameter tuning through customizing optimizer wrapper constructor. + +```python +from mmengine.optim import DefaultOptiWrapperConstructor + +from mmdet.registry import OPTIM_WRAPPER_CONSTRUCTORS +from .my_optimizer import MyOptimizer + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class MyOptimizerWrapperConstructor(DefaultOptimWrapperConstructor): + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None): + + def __call__(self, model: nn.Module) -> OptimWrapper: + + return optim_wrapper + +``` + +The default optimizer wrapper constructor is implemented [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L18), which could also serve as a template for the new optimizer wrapper constructor. + +### Additional settings + +Tricks not implemented by the optimizer should be implemented through optimizer wrapper constructor (e.g., set parameter-wise learning rates) or hooks. We list some common settings that could stabilize the training or accelerate the training. Feel free to create PR, issue for more settings. + +- __Use gradient clip to stabilize training__: + Some models need gradient clip to clip the gradients to stabilize the training process. 
An example is as below: + + ```python + optim_wrapper = dict( + _delete_=True, clip_grad=dict(max_norm=35, norm_type=2)) + ``` + + If your config inherits the base config which already sets the `optim_wrapper`, you might need `_delete_=True` to override the unnecessary settings. See the [config documentation](../user_guides/config.md) for more details. + +- __Use momentum schedule to accelerate model convergence__: + We support momentum scheduler to modify model's momentum according to learning rate, which could make the model converge in a faster way. + Momentum scheduler is usually used with LR scheduler, for example, the following config is used in [3D detection](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/schedules/cyclic-20e.py) to accelerate convergence. + For more details, please refer to the implementation of [CosineAnnealingLR](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L43) and [CosineAnnealingMomentum](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py#L71). + + ```python + param_scheduler = [ + # learning rate scheduler + # During the first 8 epochs, learning rate increases from 0 to lr * 10 + # during the next 12 epochs, learning rate decreases from lr * 10 to lr * 1e-4 + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=lr * 10, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=12, + eta_min=lr * 1e-4, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True), + # momentum scheduler + # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95 + # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1 + dict( + type='CosineAnnealingMomentum', + T_max=8, + eta_min=0.85 / 0.95, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=12, + eta_min=1, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True) + ] + ``` + +## Customize training schedules + +By default we use step learning rate with 1x schedule, this calls [MultiStepLR](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L139) in MMEngine. +We support many other learning rate schedule [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py), such as `CosineAnnealingLR` and `PolyLR` schedule. Here are some examples + +- Poly schedule: + + ```python + param_scheduler = [ + dict( + type='PolyLR', + power=0.9, + eta_min=1e-4, + begin=0, + end=8, + by_epoch=True)] + ``` + +- ConsineAnnealing schedule: + + ```python + param_scheduler = [ + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=lr * 1e-5, + begin=0, + end=8, + by_epoch=True)] + + ``` + +## Customize train loop + +By default, `EpochBasedTrainLoop` is used in `train_cfg` and validation is done after every train epoch, as follows. + +```python +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=1, val_interval=1) +``` + +Actually, both [`IterBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L183%5D) and [`EpochBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L18) support dynamical interval, see the following example. + +```python +# Before 365001th iteration, we do evaluation every 5000 iterations. 
+# After 365000th iteration, we do evaluation every 368750 iterations, +# which means that we do evaluation at the end of training. + +interval = 5000 +max_iters = 368750 +dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)] +train_cfg = dict( + type='IterBasedTrainLoop', + max_iters=max_iters, + val_interval=interval, + dynamic_intervals=dynamic_intervals) +``` + +## Customize hooks + +### Customize self-implemented hooks + +#### 1. Implement a new hook + +MMEngine provides many useful [hooks](https://mmengine.readthedocs.io/en/latest/tutorials/hooks.html), but there are some occasions when the users might need to implement a new hook. MMDetection supports customized hooks in training in v3.0 . Thus the users could implement a hook directly in mmdet or their mmdet-based codebases and use the hook by only modifying the config in training. +Here we give an example of creating a new hook in mmdet and using it in training. + +```python +from mmengine.hooks import Hook +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class MyHook(Hook): + + def __init__(self, a, b): + + def before_run(self, runner) -> None: + + def after_run(self, runner) -> None: + + def before_train(self, runner) -> None: + + def after_train(self, runner) -> None: + + def before_train_epoch(self, runner) -> None: + + def after_train_epoch(self, runner) -> None: + + def before_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None) -> None: + + def after_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[dict] = None) -> None: +``` + +Depending on the functionality of the hook, the users need to specify what the hook will do at each stage of the training in `before_run`, `after_run`, `before_train`, `after_train` , `before_train_epoch`, `after_train_epoch`, `before_train_iter`, and `after_train_iter`. There are more points where hooks can be inserted, refer to [base hook class](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py#L9) for more detail. + +#### 2. Register the new hook + +Then we need to make `MyHook` imported. Assuming the file is in `mmdet/engine/hooks/my_hook.py` there are two ways to do that: + +- Modify `mmdet/engine/hooks/__init__.py` to import it. + + The newly defined module should be imported in `mmdet/engine/hooks/__init__.py` so that the registry will find the new module and add it: + +```python +from .my_hook import MyHook +``` + +- Use `custom_imports` in the config to manually import it + +```python +custom_imports = dict(imports=['mmdet.engine.hooks.my_hook'], allow_failed_imports=False) +``` + +#### 3. Modify the config + +```python +custom_hooks = [ + dict(type='MyHook', a=a_value, b=b_value) +] +``` + +You can also set the priority of the hook by adding key `priority` to `'NORMAL'` or `'HIGHEST'` as below + +```python +custom_hooks = [ + dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL') +] +``` + +By default the hook's priority is set as `NORMAL` during registration. + +### Use hooks implemented in MMDetection + +If the hook is already implemented in MMDectection, you can directly modify the config to use the hook as below + +#### Example: `NumClassCheckHook` + +We implement a customized hook named [NumClassCheckHook](../../../mmdet/engine/hooks/num_class_check_hook.py) to check whether the `num_classes` in head matches the length of `classes` in the metainfo of `dataset`. + +We set it in [default_runtime.py](../../../configs/_base_/default_runtime.py). 
+ +```python +custom_hooks = [dict(type='NumClassCheckHook')] +``` + +### Modify default runtime hooks + +There are some common hooks that are registered through `default_hooks`, they are + +- `IterTimerHook`: A hook that logs 'data_time' for loading data and 'time' for a model train step. +- `LoggerHook`: A hook that Collect logs from different components of `Runner` and write them to terminal, JSON file, tensorboard and wandb .etc. +- `ParamSchedulerHook`: A hook to update some hyper-parameters in optimizer, e.g., learning rate and momentum. +- `CheckpointHook`: A hook that saves checkpoints periodically. +- `DistSamplerSeedHook`: A hook that sets the seed for sampler and batch_sampler. +- `DetVisualizationHook`: A hook used to visualize validation and testing process prediction results. + +`IterTimerHook`, `ParamSchedulerHook` and `DistSamplerSeedHook` are simple and no need to be modified usually, so here we reveals how what we can do with `LoggerHook`, `CheckpointHook` and `DetVisualizationHook`. + +#### CheckpointHook + +Except saving checkpoints periodically, [`CheckpointHook`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L19) provides other options such as `max_keep_ckpts`, `save_optimizer` and etc. The users could set `max_keep_ckpts` to only save small number of checkpoints or decide whether to store state dict of optimizer by `save_optimizer`. More details of the arguments are [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L19) + +```python +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=1, + max_keep_ckpts=3, + save_optimizer=True)) +``` + +#### LoggerHook + +The `LoggerHook` enables to set intervals. And the detail usages can be found in the [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py#L18). + +```python +default_hooks = dict(logger=dict(type='LoggerHook', interval=50)) +``` + +#### DetVisualizationHook + +`DetVisualizationHook` use `DetLocalVisualizer` to visualize prediction results, and `DetLocalVisualizer` current supports different backends, e.g., `TensorboardVisBackend` and `WandbVisBackend` (see [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py) for more detail). The users could add multi backbends to do visualization, as follows. + +```python +default_hooks = dict( + visualization=dict(type='DetVisualizationHook', draw=True)) + +vis_backends = [dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') +``` diff --git a/mmdetection/docs/en/advanced_guides/customize_transforms.md b/mmdetection/docs/en/advanced_guides/customize_transforms.md new file mode 100644 index 0000000..5fe84e9 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/customize_transforms.md @@ -0,0 +1,49 @@ +# Customize Data Pipelines + +1. Write a new transform in a file, e.g., in `my_pipeline.py`. It takes a dict as input and returns a dict. + + ```python + import random + from mmcv.transforms import BaseTransform + from mmdet.registry import TRANSFORMS + + + @TRANSFORMS.register_module() + class MyTransform(BaseTransform): + """Add your transform + + Args: + p (float): Probability of shifts. Default 0.5. + """ + + def __init__(self, prob=0.5): + self.prob = prob + + def transform(self, results): + if random.random() > self.prob: + results['dummy'] = True + return results + ``` + +2. 
Import and use the pipeline in your config file. + Make sure the import is relative to where your train script is located. + + ```python + custom_imports = dict(imports=['path.to.my_pipeline'], allow_failed_imports=False) + + train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='MyTransform', prob=0.2), + dict(type='PackDetInputs') + ] + ``` + +3. Visualize the output of your transforms pipeline + + To visualize the output of your transforms pipeline, `tools/misc/browse_dataset.py` + can help the user to browse a detection dataset (both images and bounding box annotations) + visually, or save the image to a designated directory. More details can refer to + [visualization documentation](../user_guides/visualization.md) diff --git a/mmdetection/docs/en/advanced_guides/data_flow.md b/mmdetection/docs/en/advanced_guides/data_flow.md new file mode 100644 index 0000000..59e7ca3 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/data_flow.md @@ -0,0 +1 @@ +# Data Flow diff --git a/mmdetection/docs/en/advanced_guides/datasets.md b/mmdetection/docs/en/advanced_guides/datasets.md new file mode 100644 index 0000000..157ea3a --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/datasets.md @@ -0,0 +1 @@ +# Datasets diff --git a/mmdetection/docs/en/advanced_guides/engine.md b/mmdetection/docs/en/advanced_guides/engine.md new file mode 100644 index 0000000..eaa55b0 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/engine.md @@ -0,0 +1 @@ +# Engine diff --git a/mmdetection/docs/en/advanced_guides/evaluation.md b/mmdetection/docs/en/advanced_guides/evaluation.md new file mode 100644 index 0000000..b394c76 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/evaluation.md @@ -0,0 +1 @@ +# Evaluation diff --git a/mmdetection/docs/en/advanced_guides/how_to.md b/mmdetection/docs/en/advanced_guides/how_to.md new file mode 100644 index 0000000..7eb41ce --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/how_to.md @@ -0,0 +1,222 @@ +This tutorial collects answers to any `How to xxx with MMDetection`. Feel free to update this doc if you meet new questions about `How to` and find the answers! + +# Use backbone network through MMPretrain + +The model registry in MMDet, MMPreTrain, MMSeg all inherit from the root registry in MMEngine. This allows these repositories to directly use the modules already implemented by each other. Therefore, users can use backbone networks from MMPretrain in MMDetection without implementing a network that already exists in MMPretrain. + +## Use backbone network implemented in MMPretrain + +Suppose you want to use `MobileNetV3-small` as the backbone network of `RetinaNet`, the example config is as the following. 
+ +```python +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict(imports=['mmpretrain.models'], allow_failed_imports=False) +pretrained = 'https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth' +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmpretrain.MobileNetV3', # Using MobileNetV3 from mmpretrain + arch='small', + out_indices=(3, 8, 11), # Modify out_indices + init_cfg=dict( + type='Pretrained', + checkpoint=pretrained, + prefix='backbone.')), # The pre-trained weights of backbone network in mmpretrain have prefix='backbone.'. The prefix in the keys will be removed so that these weights can be normally loaded. + # Modify in_channels + neck=dict(in_channels=[24, 48, 96], start_level=0)) +``` + +## Use backbone network in TIMM through MMPretrain + +MMPretrain also provides a wrapper for the PyTorch Image Models (timm) backbone network, users can directly use the backbone network in timm through MMPretrain. Suppose you want to use [EfficientNet-B1](../../../configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py) as the backbone network of RetinaNet, the example config is as the following. + +```python +# https://github.com/open-mmlab/mmdetection/blob/main/configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py + +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict(imports=['mmpretrain.models'], allow_failed_imports=False) +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmpretrain.TIMMBackbone', # Using timm from mmpretrain + model_name='efficientnet_b1', + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4)), # Modify out_indices + neck=dict(in_channels=[24, 40, 112, 320])) # Modify in_channels + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +``` + +`type='mmpretrain.TIMMBackbone'` means use the `TIMMBackbone` class from MMPretrain in MMDetection, and the model used is `EfficientNet-B1`, where `mmpretrain` means the MMPretrain repo and `TIMMBackbone` means the TIMMBackbone wrapper implemented in MMPretrain. + +For the principle of the Hierarchy Registry, please refer to the [MMEngine document](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/config.md). For how to use other backbones in MMPretrain, you can refer to the [MMPretrain document](https://mmpretrain.readthedocs.io/en/latest/user_guides/config.html). + +# Use Mosaic augmentation + +If you want to use `Mosaic` in training, please make sure that you use `MultiImageMixDataset` at the same time. 
Taking the 'Faster R-CNN' algorithm as an example, you should modify the values of `train_pipeline` and `train_dataset` in the config as below: + +```python +# Open configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py directly and add the following fields +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +img_scale=(1333, 800) + +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), # The image will be enlarged by 4 times after Mosaic processing,so we use affine transformation to restore the image size. + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataset = dict( + _delete_ = True, # remove unnecessary Settings + type='MultiImageMixDataset', + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ], + filter_empty_gt=False, + ), + pipeline=train_pipeline + ) + +data = dict( + train=train_dataset + ) +``` + +# Unfreeze backbone network after freezing the backbone in the config + +If you have freezed the backbone network in the config and want to unfreeze it after some epoches, you can write a hook function to do it. Taking the Faster R-CNN with the resnet backbone as an example, you can freeze one stage of the backbone network and add a `custom_hooks` in the config as below: + +```python +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + # freeze one stage of the backbone network. + backbone=dict(frozen_stages=1), +) +custom_hooks = [dict(type="UnfreezeBackboneEpochBasedHook", unfreeze_epoch=1)] +``` + +Meanwhile write the hook class `UnfreezeBackboneEpochBasedHook` in `mmdet/core/hook/unfreeze_backbone_epoch_based_hook.py` + +```python +from mmengine.model import is_model_wrapper +from mmengine.hooks import Hook +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class UnfreezeBackboneEpochBasedHook(Hook): + """Unfreeze backbone network Hook. + + Args: + unfreeze_epoch (int): The epoch unfreezing the backbone network. + """ + + def __init__(self, unfreeze_epoch=1): + self.unfreeze_epoch = unfreeze_epoch + + def before_train_epoch(self, runner): + # Unfreeze the backbone network. + # Only valid for resnet. + if runner.epoch == self.unfreeze_epoch: + model = runner.model + if is_model_wrapper(model): + model = model.module + backbone = model.backbone + if backbone.frozen_stages >= 0: + if backbone.deep_stem: + backbone.stem.train() + for param in backbone.stem.parameters(): + param.requires_grad = True + else: + backbone.norm1.train() + for m in [backbone.conv1, backbone.norm1]: + for param in m.parameters(): + param.requires_grad = True + + for i in range(1, backbone.frozen_stages + 1): + m = getattr(backbone, f'layer{i}') + m.train() + for param in m.parameters(): + param.requires_grad = True +``` + +# Get the channels of a new backbone + +If you want to get the channels of a new backbone, you can build this backbone alone and input a pseudo image to get each stage output. 
+ +Take `ResNet` as an example: + +```python +from mmdet.models import ResNet +import torch +self = ResNet(depth=18) +self.eval() +inputs = torch.rand(1, 3, 32, 32) +level_outputs = self.forward(inputs) +for level_out in level_outputs: + print(tuple(level_out.shape)) + +``` + +Output of the above script is as below: + +```python +(1, 64, 8, 8) +(1, 128, 4, 4) +(1, 256, 2, 2) +(1, 512, 1, 1) +``` + +Users can get the channels of the new backbone by Replacing the `ResNet(depth=18)` in this script with their customized backbone. + +# Use Detectron2 Model in MMDetection + +Users can use Detectron2Wrapper to run Detectron2's model in MMDetection. We provide examples of [Faster R-CNN](../../../configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py), +[Mask R-CNN](../../../configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py), and [RetinaNet](../../../configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py) in MMDetection. + +The algorithm components in config file should be the same as those of in Detectron2. During setup, we will first initialize the default settings, which can be found in [Detectron2](https://github.com/facebookresearch/detectron2/blob/main/detectron2/config/defaults.py). +Then, the settings in config file will overwrite the default settings and the model will be built with these settings. +The input data will first convert to Detectron2's type and feed into Detectron2's model. +During inference the results calculate from Detectron2's model will reconvert back to the MMDetection's type. + +## Use Detectron2's pre-trained weights + +The weight initialization in `Detectron2Wrapper` will not use the logic of MMDetection. Users can set `model.d2_detector.weights=xxx` to load pre-trained weights. +For example, we can use `model.d2_detector.weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl'` to load the pre-trained ResNet-50 or use +`model.d2_detector.weights='detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl'` to load the pre-trained Mask R-CNN weights proposed in Detectron2. + +**Note:** Detectron2's pretrained model cannot be loaded directly by using `load_from`, it should be first converted via `tools/model_converters/detectron2_to_mmdet.py` + +For inference of released detectron2 checkpoints, users should first use `tools/model_converters/detectron2_to_mmdet.py` to convert Detectron2 checkpoint to MMDetection. + +```shell +python tools/model_converters/detectron2_to_mmdet.py ${Detectron2 ckpt path} ${MMDetectron ckpt path} +``` diff --git a/mmdetection/docs/en/advanced_guides/index.rst b/mmdetection/docs/en/advanced_guides/index.rst new file mode 100644 index 0000000..20d8177 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/index.rst @@ -0,0 +1,34 @@ +Basic Concepts +*************** + +.. toctree:: + :maxdepth: 1 + + data_flow.md + structures.md + models.md + datasets.md + transforms.md + evaluation.md + engine.md + conventions.md + +Component Customization +************************ + +.. toctree:: + :maxdepth: 1 + + customize_models.md + customize_losses.md + customize_dataset.md + customize_transforms.md + customize_runtime.md + +How to +************************ + +.. 
toctree:: + :maxdepth: 1 + + how_to.md diff --git a/mmdetection/docs/en/advanced_guides/models.md b/mmdetection/docs/en/advanced_guides/models.md new file mode 100644 index 0000000..9136172 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/models.md @@ -0,0 +1 @@ +# Models diff --git a/mmdetection/docs/en/advanced_guides/structures.md b/mmdetection/docs/en/advanced_guides/structures.md new file mode 100644 index 0000000..9852861 --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/structures.md @@ -0,0 +1 @@ +# Structures diff --git a/mmdetection/docs/en/advanced_guides/transforms.md b/mmdetection/docs/en/advanced_guides/transforms.md new file mode 100644 index 0000000..4db036a --- /dev/null +++ b/mmdetection/docs/en/advanced_guides/transforms.md @@ -0,0 +1,42 @@ +# Data Transforms (Need to update) + +## Design of Data transforms pipeline + +Following typical conventions, we use `Dataset` and `DataLoader` for data loading +with multiple workers. `Dataset` returns a dict of data items corresponding +the arguments of models' forward method. + +The data transforms pipeline and the dataset is decomposed. Usually a dataset +defines how to process the annotations and a data transforms pipeline defines all the steps to prepare a data dict. +A pipeline consists of a sequence of data transforms. Each operation takes a dict as input and also output a dict for the next transform. + +We present a classical pipeline in the following figure. The blue blocks are pipeline operations. With the pipeline going on, each operator can add new keys (marked as green) to the result dict or update the existing keys (marked as orange). +![pipeline figure](../../../resources/data_pipeline.png) + +Here is a pipeline example for Faster R-CNN. + +```python +train_pipeline = [ # Training data processing pipeline + dict(type='LoadImageFromFile', backend_args=backend_args), # First pipeline to load images from file path + dict( + type='LoadAnnotations', # Second pipeline to load annotations for current image + with_bbox=True), # Whether to use bounding box, True for detection + dict( + type='Resize', # Pipeline that resize the images and their annotations + scale=(1333, 800), # The largest scale of image + keep_ratio=True # Whether to keep the ratio between height and width + ), + dict( + type='RandomFlip', # Augmentation pipeline that flip the images and their annotations + prob=0.5), # The probability to flip + dict(type='PackDetInputs') # Pipeline that formats the annotation data and decides which keys in the data should be packed into data_samples +] +test_pipeline = [ # Testing data processing pipeline + dict(type='LoadImageFromFile', backend_args=backend_args), # First pipeline to load images from file path + dict(type='Resize', scale=(1333, 800), keep_ratio=True), # Pipeline that resize the images + dict( + type='PackDetInputs', # Pipeline that formats the annotation data and decides which keys in the data should be packed into data_samples + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` diff --git a/mmdetection/docs/en/api.rst b/mmdetection/docs/en/api.rst new file mode 100644 index 0000000..1b12732 --- /dev/null +++ b/mmdetection/docs/en/api.rst @@ -0,0 +1,161 @@ +mmdet.apis +-------------- +.. automodule:: mmdet.apis + :members: + +mmdet.datasets +-------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmdet.datasets + :members: + +api_wrappers +^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.datasets.api_wrappers + :members: + +samplers +^^^^^^^^^^ +.. 
automodule:: mmdet.datasets.samplers + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmdet.datasets.transforms + :members: + +mmdet.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmdet.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmdet.engine.optimizers + :members: + +runner +^^^^^^^^^^ +.. automodule:: mmdet.engine.runner + :members: + +schedulers +^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.engine.schedulers + :members: + +mmdet.evaluation +-------------------- + +functional +^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.evaluation.functional + :members: + +metrics +^^^^^^^^^^ +.. automodule:: mmdet.evaluation.metrics + :members: + + +mmdet.models +-------------- + +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.models.backbones + :members: + +data_preprocessors +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.models.data_preprocessors + :members: + +dense_heads +^^^^^^^^^^^^^^^ +.. automodule:: mmdet.models.dense_heads + :members: + +detectors +^^^^^^^^^^ +.. automodule:: mmdet.models.detectors + :members: + +layers +^^^^^^^^^^ +.. automodule:: mmdet.models.layers + :members: + +losses +^^^^^^^^^^ +.. automodule:: mmdet.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmdet.models.necks + :members: + +roi_heads +^^^^^^^^^^^^^ +.. automodule:: mmdet.models.roi_heads + :members: + +seg_heads +^^^^^^^^^^^^^ +.. automodule:: mmdet.models.seg_heads + :members: + +task_modules +^^^^^^^^^^^^^ +.. automodule:: mmdet.models.task_modules + :members: + +test_time_augs +^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.models.test_time_augs + :members: + +utils +^^^^^^^^^^ +.. automodule:: mmdet.models.utils + :members: + + +mmdet.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmdet.structures.bbox + :members: + +mask +^^^^^^^^^^ +.. automodule:: mmdet.structures.mask + :members: + +mmdet.testing +---------------- +.. automodule:: mmdet.testing + :members: + +mmdet.visualization +-------------------- +.. automodule:: mmdet.visualization + :members: + +mmdet.utils +-------------- +.. automodule:: mmdet.utils + :members: diff --git a/mmdetection/docs/en/conf.py b/mmdetection/docs/en/conf.py new file mode 100644 index 0000000..d2beaf1 --- /dev/null +++ b/mmdetection/docs/en/conf.py @@ -0,0 +1,116 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/main/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import subprocess +import sys + +import pytorch_sphinx_theme + +sys.path.insert(0, os.path.abspath('../..')) + +# -- Project information ----------------------------------------------------- + +project = 'MMDetection' +copyright = '2018-2021, OpenMMLab' +author = 'MMDetection Authors' +version_file = '../../mmdet/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +# The full version, including alpha/beta/rc tags +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_markdown_tables', + 'sphinx_copybutton', +] + +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + +autodoc_mock_imports = [ + 'matplotlib', 'pycocotools', 'terminaltables', 'mmdet.version', 'mmcv.ops' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# The main toctree document. +master_doc = 'index' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'sphinx_rtd_theme' +html_theme = 'pytorch_sphinx_theme' +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] + +html_theme_options = { + 'menu': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmdetection' + }, + ], + # Specify the language of shared menu + 'menu_lang': + 'en' +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_css_files = ['css/readthedocs.css'] + +# -- Extension configuration ------------------------------------------------- +# Ignore >>> when copying code +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + + +def builder_inited_handler(app): + subprocess.run(['./stat.py']) + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/mmdetection/docs/en/dataset_zoo.md b/mmdetection/docs/en/dataset_zoo.md new file mode 100644 index 0000000..c35cc22 --- /dev/null +++ b/mmdetection/docs/en/dataset_zoo.md @@ -0,0 +1 @@ +# Dataset Zoo diff --git a/mmdetection/docs/en/get_started.md b/mmdetection/docs/en/get_started.md new file mode 100644 index 0000000..f65878b --- /dev/null +++ b/mmdetection/docs/en/get_started.md @@ -0,0 +1,297 @@ +# GET STARTED + +## Prerequisites + +In this section, we demonstrate how to prepare an environment with PyTorch. + +MMDetection works on Linux, Windows, and macOS. It requires Python 3.7+, CUDA 9.2+, and PyTorch 1.8+. 
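+
+If an environment already exists and you only want to confirm that it satisfies these requirements, a quick check such as the one below can help (a minimal sketch; it assumes PyTorch is already importable).
+
+```python
+import sys
+
+import torch
+
+print(sys.version_info)    # expect Python 3.7 or newer
+print(torch.__version__)   # expect PyTorch 1.8 or newer
+print(torch.version.cuda)  # CUDA version this PyTorch build targets; None for CPU-only builds
+```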
+ +```{note} +If you are experienced with PyTorch and have already installed it, just skip this part and jump to the [next section](#installation). Otherwise, you can follow these steps for the preparation. +``` + +**Step 0.** Download and install Miniconda from the [official website](https://docs.conda.io/en/latest/miniconda.html). + +**Step 1.** Create a conda environment and activate it. + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +``` + +**Step 2.** Install PyTorch following [official instructions](https://pytorch.org/get-started/locally/), e.g. + +On GPU platforms: + +```shell +conda install pytorch torchvision -c pytorch +``` + +On CPU platforms: + +```shell +conda install pytorch torchvision cpuonly -c pytorch +``` + +## Installation + +We recommend that users follow our best practices to install MMDetection. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information. + +### Best Practices + +**Step 0.** Install [MMEngine](https://github.com/open-mmlab/mmengine) and [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim). + +```shell +pip install -U openmim +mim install mmengine +mim install "mmcv>=2.0.0" +``` + +**Note:** In MMCV-v2.x, `mmcv-full` is rename to `mmcv`, if you want to install `mmcv` without CUDA ops, you can use `mim install "mmcv-lite>=2.0.0rc1"` to install the lite version. + +**Step 1.** Install MMDetection. + +Case a: If you develop and run mmdet directly, install it from source: + +```shell +git clone https://github.com/open-mmlab/mmdetection.git +cd mmdetection +pip install -v -e . +# "-v" means verbose, or more output +# "-e" means installing a project in editable mode, +# thus any local modifications made to the code will take effect without reinstallation. +``` + +Case b: If you use mmdet as a dependency or third-party package, install it with MIM: + +```shell +mim install mmdet +``` + +## Verify the installation + +To verify whether MMDetection is installed correctly, we provide some sample codes to run an inference demo. + +**Step 1.** We need to download config and checkpoint files. + +```shell +mim download mmdet --config rtmdet_tiny_8xb32-300e_coco --dest . +``` + +The downloading will take several seconds or more, depending on your network environment. When it is done, you will find two files `rtmdet_tiny_8xb32-300e_coco.py` and `rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth` in your current folder. + +**Step 2.** Verify the inference demo. + +Case a: If you install MMDetection from source, just run the following command. + +```shell +python demo/image_demo.py demo/demo.jpg rtmdet_tiny_8xb32-300e_coco.py --weights rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth --device cpu +``` + +You will see a new image `demo.jpg` on your `./outputs/vis` folder, where bounding boxes are plotted on cars, benches, etc. + +Case b: If you install MMDetection with MIM, open your python interpreter and copy&paste the following codes. 
+
+```python
+from mmdet.apis import init_detector, inference_detector
+
+config_file = 'rtmdet_tiny_8xb32-300e_coco.py'
+checkpoint_file = 'rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth'
+model = init_detector(config_file, checkpoint_file, device='cpu') # or device='cuda:0'
+inference_detector(model, 'demo/demo.jpg')
+```
+
+You will see a list of `DetDataSample` objects, and the predictions are stored in `pred_instances`, indicating the detected bounding boxes, labels, and scores.
+
+## Tracking Installation
+
+We recommend that users follow our best practices to install MMDetection for the tracking task.
+
+### Best Practices
+
+**Step 0.** Install [MMEngine](https://github.com/open-mmlab/mmengine) and [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+pip install -U openmim
+mim install mmengine
+mim install "mmcv>=2.0.0"
+```
+
+**Step 1.** Install MMDetection.
+
+Case a: If you develop and run mmdet directly, install it from source:
+
+```shell
+git clone https://github.com/open-mmlab/mmdetection.git
+cd mmdetection
+pip install -v -e . -r requirements/tracking.txt
+# "-v" means verbose, or more output
+# "-e" means installing a project in editable mode,
+# thus any local modifications made to the code will take effect without reinstallation.
+```
+
+Case b: If you use mmdet as a dependency or third-party package, install it with MIM:
+
+```shell
+mim install mmdet[tracking]
+```
+
+**Step 2.** Install TrackEval.
+
+```shell
+pip install git+https://github.com/JonathonLuiten/TrackEval.git
+```
+
+## Verify the installation
+
+To verify whether MMDetection is installed correctly, we provide some sample code to run an inference demo.
+
+**Step 1.** Download the config and checkpoint files.
+
+```shell
+mim download mmdet --config bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval --dest .
+```
+
+The download will take several seconds or more, depending on your network environment. When it is done, you will find the two files `bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py` and `bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth` in your current folder.
+
+**Step 2.** Verify the inference demo.
+
+Case a: If you install MMDetection from source, just run the following command.
+
+```shell
+python demo/mot_demo.py demo/demo_mot.mp4 bytetrack_yolox_x_8xb4-amp-80e_crowdhuman-mot17halftrain_test-mot17halfval.py --checkpoint bytetrack_yolox_x_crowdhuman_mot17-private-half_20211218_205500-1985c9f0.pth --out mot.mp4
+```
+
+You will see a new video `mot.mp4` in your current folder, where bounding boxes are plotted on people.
+
+Case b: If you install MMDetection with MIM, open your Python interpreter, paste in the contents of `demo/mot_demo.py`, and run it with the same arguments as in Case a.
+
+### Customize Installation
+
+#### CUDA versions
+
+When installing PyTorch, you need to specify the version of CUDA. If you are not sure which one to choose, follow our recommendations:
+
+- For Ampere-based NVIDIA GPUs, such as the GeForce 30 series and NVIDIA A100, CUDA 11 is a must.
+- For older NVIDIA GPUs, CUDA 11 is backward compatible, but CUDA 10.2 offers better compatibility and is more lightweight.
+
+Please make sure the GPU driver satisfies the minimum version requirements. See [this table](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions) for more information.
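+
+As a quick way to see which case applies to your machine, the following sketch (assuming PyTorch is already installed) queries the GPU from Python:
+
+```python
+# Illustrative check: Ampere GPUs report compute capability 8.x and need CUDA 11+.
+import torch
+
+if torch.cuda.is_available():
+    name = torch.cuda.get_device_name(0)
+    major, minor = torch.cuda.get_device_capability(0)
+    print(f'{name}: compute capability {major}.{minor}')
+    print('CUDA runtime bundled with PyTorch:', torch.version.cuda)
+else:
+    print('No CUDA device visible; check the driver or consider a CPU-only install.')
+```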
+
+```{note}
+Installing CUDA runtime libraries is enough if you follow our best practices, because no CUDA code will be compiled locally. However, if you hope to compile MMCV from source or develop other CUDA operators, you need to install the complete CUDA toolkit from NVIDIA's [website](https://developer.nvidia.com/cuda-downloads), and its version should match the CUDA version of PyTorch, i.e., the specified version of cudatoolkit in the `conda install` command.
+```
+
+#### Install MMEngine without MIM
+
+To install MMEngine with pip instead of MIM, please follow the [MMEngine installation guides](https://mmengine.readthedocs.io/en/latest/get_started/installation.html).
+
+For example, you can install MMEngine with the following command.
+
+```shell
+pip install mmengine
+```
+
+#### Install MMCV without MIM
+
+MMCV contains C++ and CUDA extensions, so it depends on PyTorch in a complex way. MIM resolves such dependencies automatically and makes the installation easier. However, it is not required.
+
+To install MMCV with pip instead of MIM, please follow the [MMCV installation guides](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html). This requires manually specifying a find-url based on the PyTorch version and its CUDA version.
+
+For example, the following command installs MMCV built for PyTorch 1.12.x and CUDA 11.6.
+
+```shell
+pip install "mmcv>=2.0.0" -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.12.0/index.html
+```
+
+#### Install on CPU-only platforms
+
+MMDetection can be built for CPU-only environments. In CPU mode you can train (requires MMCV version >= 2.0.0rc1), test, or infer a model.
+
+However, the following functionalities are unavailable in this mode:
+
+- Deformable Convolution
+- Modulated Deformable Convolution
+- ROI pooling
+- Deformable ROI pooling
+- CARAFE
+- SyncBatchNorm
+- CrissCrossAttention
+- MaskedConv2d
+- Temporal Interlace Shift
+- nms_cuda
+- sigmoid_focal_loss_cuda
+- bbox_overlaps
+
+If you try to train/test/infer a model containing the above ops, an error will be raised.
+The following table lists the affected algorithms.
+
+|                         Operator                         |                                           Model                                           |
+| :------------------------------------------------------: | :----------------------------------------------------------------------------------------: |
+| Deformable Convolution/Modulated Deformable Convolution | DCN, Guided Anchoring, RepPoints, CentripetalNet, VFNet, CascadeRPN, NAS-FCOS, DetectoRS |
+|                       MaskedConv2d                       |                                      Guided Anchoring                                      |
+|                          CARAFE                          |                                           CARAFE                                           |
+|                       SyncBatchNorm                      |                                          ResNeSt                                           |
+
+#### Install on Google Colab
+
+[Google Colab](https://colab.research.google.com/) usually has PyTorch installed,
+so we only need to install MMEngine, MMCV, and MMDetection with the following commands.
+
+**Step 1.** Install [MMEngine](https://github.com/open-mmlab/mmengine) and [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+!pip3 install openmim
+!mim install mmengine
+!mim install "mmcv>=2.0.0,<2.1.0"
+```
+
+**Step 2.** Install MMDetection from source.
+
+```shell
+!git clone https://github.com/open-mmlab/mmdetection.git
+%cd mmdetection
+!pip install -e .
+```
+
+**Step 3.** Verification.
+
+```python
+import mmdet
+print(mmdet.__version__)
+# Example output: 3.0.0, or another version.
+```
+
+```{note}
+Within Jupyter, the exclamation mark `!` is used to call external executables and `%cd` is a [magic command](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd) to change the current working directory of Python.
+``` + +#### Use MMDetection with Docker + +We provide a [Dockerfile](../../docker/Dockerfile) to build an image. Ensure that your [docker version](https://docs.docker.com/engine/install/) >=19.03. + +```shell +# build an image with PyTorch 1.9, CUDA 11.1 +# If you prefer other versions, just modified the Dockerfile +docker build -t mmdetection docker/ +``` + +Run it with + +```shell +docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmdetection/data mmdetection +``` + +### Troubleshooting + +If you have some issues during the installation, please first view the [FAQ](notes/faq.md) page. +You may [open an issue](https://github.com/open-mmlab/mmdetection/issues/new/choose) on GitHub if no solution is found. + +### Use Multiple Versions of MMDetection in Development + +Training and testing scripts have already been modified in `PYTHONPATH` in order to make sure the scripts are using their own versions of MMDetection. + +To install the default version of MMDetection in your environment, you can exclude the follow code in the relative scripts: + +```shell +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +``` diff --git a/mmdetection/docs/en/index.rst b/mmdetection/docs/en/index.rst new file mode 100644 index 0000000..32c5952 --- /dev/null +++ b/mmdetection/docs/en/index.rst @@ -0,0 +1,63 @@ +Welcome to MMDetection's documentation! +======================================= + +.. toctree:: + :maxdepth: 1 + :caption: Get Started + + overview.md + get_started.md + +.. toctree:: + :maxdepth: 2 + :caption: User Guides + + user_guides/index.rst + +.. toctree:: + :maxdepth: 2 + :caption: Advanced Guides + + advanced_guides/index.rst + +.. toctree:: + :maxdepth: 1 + :caption: Migration + + migration/migration.md + +.. toctree:: + :maxdepth: 1 + :caption: API Reference + + api.rst + +.. toctree:: + :maxdepth: 1 + :caption: Model Zoo + + model_zoo.md + +.. toctree:: + :maxdepth: 1 + :caption: Notes + + notes/contribution_guide.md + notes/projects.md + notes/changelog.md + notes/changelog_v2.x.md + notes/faq.md + notes/compatibility.md + +.. toctree:: + :caption: Switch Language + + switch_language.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/mmdetection/docs/en/make.bat b/mmdetection/docs/en/make.bat new file mode 100644 index 0000000..922152e --- /dev/null +++ b/mmdetection/docs/en/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/mmdetection/docs/en/migration.md b/mmdetection/docs/en/migration.md new file mode 100644 index 0000000..689e8d2 --- /dev/null +++ b/mmdetection/docs/en/migration.md @@ -0,0 +1 @@ +# Migration diff --git a/mmdetection/docs/en/migration/api_and_registry_migration.md b/mmdetection/docs/en/migration/api_and_registry_migration.md new file mode 100644 index 0000000..72bfd3a --- /dev/null +++ b/mmdetection/docs/en/migration/api_and_registry_migration.md @@ -0,0 +1 @@ +# Migrate API and Registry from MMDetection 2.x to 3.x diff --git a/mmdetection/docs/en/migration/config_migration.md b/mmdetection/docs/en/migration/config_migration.md new file mode 100644 index 0000000..1177fa9 --- /dev/null +++ b/mmdetection/docs/en/migration/config_migration.md @@ -0,0 +1,819 @@ +# Migrate Configuration File from MMDetection 2.x to 3.x + +The configuration file of MMDetection 3.x has undergone significant changes in comparison to the 2.x version. This document explains how to migrate 2.x configuration files to 3.x. + +In the previous tutorial [Learn about Configs](../user_guides/config.md), we used Mask R-CNN as an example to introduce the configuration file structure of MMDetection 3.x. Here, we will follow the same structure to demonstrate how to migrate 2.x configuration files to 3.x. + +## Model Configuration + +There have been no major changes to the model configuration in 3.x compared to 2.x. For the model's backbone, neck, head, as well as train_cfg and test_cfg, the parameters remain the same as in version 2.x. + +On the other hand, we have added the `DataPreprocessor` module in MMDetection 3.x. The configuration for the `DataPreprocessor` module is located in `model.data_preprocessor`. It is used to preprocess the input data, such as normalizing input images and padding images of different sizes into batches, and loading images from memory to VRAM. This configuration replaces the `Normalize` and `Pad` modules in `train_pipeline` and `test_pipeline` of the earlier version. + + + + + + + + + +
    2.x Config + +```python +# Image normalization parameters +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +pipeline=[ + ..., + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), # Padding the image to multiples of 32 + ... +] +``` + +
    3.x Config + +```python +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + # Image normalization parameters + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + # Image padding parameters + pad_mask=True, # In instance segmentation, the mask needs to be padded + pad_size_divisor=32) # Padding the image to multiples of 32 +) + +``` + +
    + +## Dataset and Evaluator Configuration + +The dataset and evaluator configurations have undergone major changes compared to version 2.x. We will introduce how to migrate from version 2.x to version 3.x from three aspects: Dataloader and Dataset, Data transform pipeline, and Evaluator configuration. + +### Dataloader and Dataset Configuration + +In the new version, we set the data loading settings consistent with PyTorch's official DataLoader, +making it easier for users to understand and get started with. +We put the data loading settings for training, validation, and testing separately in `train_dataloader`, `val_dataloader`, and `test_dataloader`. +Users can set different parameters for these dataloaders. +The input parameters are basically the same as those required by [PyTorch DataLoader](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader). + +This way, we put the unconfigurable parameters in version 2.x, such as `sampler`, `batch_sampler`, and `persistent_workers`, in the configuration file, so that users can set dataloader parameters more flexibly. + +Users can set the dataset configuration through `train_dataloader.dataset`, `val_dataloader.dataset`, and `test_dataloader.dataset`, which correspond to `data.train`, `data.val`, and `data.test` in version 2.x. + + + + + + + + + +
    2.x Config + +```python +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +``` + +
    3.x Config + +```python +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, # Avoid recreating subprocesses after each iteration + sampler=dict(type='DefaultSampler', shuffle=True), # Default sampler, supports both distributed and non-distributed training + batch_sampler=dict(type='AspectRatioBatchSampler'), # Default batch_sampler, used to ensure that images in the batch have similar aspect ratios, so as to better utilize graphics memory + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) +# In version 3.x, validation and test dataloaders can be configured independently +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader # The configuration of the testing dataloader is the same as that of the validation dataloader, which is omitted here + +``` + +
    + +### Data Transform Pipeline Configuration + +As mentioned earlier, we have separated the normalization and padding configurations for images from the `train_pipeline` and `test_pipeline`, and have placed them in `model.data_preprocessor` instead. Hence, in the 3.x version of the pipeline, we no longer require the `Normalize` and `Pad` transforms. + +At the same time, we have also refactored the transform responsible for packing the data format, and have merged the `Collect` and `DefaultFormatBundle` transforms into `PackDetInputs`. This transform is responsible for packing the data from the data pipeline into the input format of the model. For more details on the input format conversion, please refer to the [data flow documentation](../advanced_guides/data_flow.md). + +Below, we will use the `train_pipeline` of Mask R-CNN as an example, to demonstrate how to migrate from the 2.x configuration to the 3.x configuration: + + + + + + + + + +
    2.x Config + +```python +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +``` + +
    3.x Config + +```python +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +``` + +
    + +For the `test_pipeline`, apart from removing the `Normalize` and `Pad` transforms, we have also separated the data augmentation for testing (TTA) from the normal testing process, and have removed `MultiScaleFlipAug`. For more information on how to use the new TTA version, please refer to the [TTA documentation](../advanced_guides/tta.md). + +Below, we will again use the `test_pipeline` of Mask R-CNN as an example, to demonstrate how to migrate from the 2.x configuration to the 3.x configuration: + + + + + + + + + +
    2.x Config + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +``` + +
    3.x Config + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +
    + +In addition, we have also refactored some data augmentation transforms. The following table lists the mapping between the transforms used in the 2.x version and the 3.x version: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Name2.x Config3.x Config
    Resize + +```python +dict(type='Resize', + img_scale=(1333, 800), + keep_ratio=True) +``` + + + +```python +dict(type='Resize', + scale=(1333, 800), + keep_ratio=True) +``` + +
    RandomResize + +```python +dict( + type='Resize', + img_scale=[ + (1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True) +``` + + + +```python +dict( + type='RandomResize', + scale=[ + (1333, 640), (1333, 800)], + keep_ratio=True) +``` + +
    RandomChoiceResize + +```python +dict( + type='Resize', + img_scale=[ + (1333, 640), (1333, 672), + (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True) +``` + + + +```python +dict( + type='RandomChoiceResize', + scales=[ + (1333, 640), (1333, 672), + (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True) +``` + +
    RandomFlip + +```python +dict(type='RandomFlip', flip_ratio=0.5) +``` + + + +```python +dict(type='RandomFlip', prob=0.5) +``` + +
    + +### 评测器配置 + +In version 3.x, model accuracy evaluation is no longer tied to the dataset, but is instead accomplished through the use of an Evaluator. +The Evaluator configuration is divided into two parts: `val_evaluator` and `test_evaluator`. The `val_evaluator` is used for validation dataset evaluation, while the `test_evaluator` is used for testing dataset evaluation. +This corresponds to the `evaluation` field in version 2.x. + +The following table shows the corresponding relationship between Evaluators in version 2.x and 3.x. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Metric Name2.x Config3.x Config
    COCO + +```python +data = dict( + val=dict( + type='CocoDataset', + ann_file=data_root + 'annotations/instances_val2017.json')) +evaluation = dict(metric=['bbox', 'segm']) +``` + + + +```python +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False) +``` + +
    Pascal VOC + +```python +data = dict( + val=dict( + type=dataset_type, + ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt')) +evaluation = dict(metric='mAP') +``` + + + +```python +val_evaluator = dict( + type='VOCMetric', + metric='mAP', + eval_mode='11points') +``` + +
    OpenImages + +```python +data = dict( + val=dict( + type='OpenImagesDataset', + ann_file=data_root + 'annotations/validation-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/validation/', + label_file=data_root + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + meta_file=data_root + 'annotations/validation-image-metas.pkl', + image_level_ann_file=data_root + + 'annotations/validation-annotations-human-imagelabels-boxable.csv')) +evaluation = dict(interval=1, metric='mAP') +``` + + + +```python +val_evaluator = dict( + type='OpenImagesMetric', + iou_thrs=0.5, + ioa_thrs=0.5, + use_group_of=True, + get_supercategory=True) +``` + +
    CityScapes + +```python +data = dict( + val=dict( + type='CityScapesDataset', + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + img_prefix=data_root + 'leftImg8bit/val/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) +``` + + + +```python +val_evaluator = [ + dict( + type='CocoMetric', + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + metric=['bbox', 'segm']), + dict( + type='CityScapesMetric', + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + seg_prefix=data_root + '/gtFine/val', + outfile_prefix='./work_dirs/cityscapes_metric/instance') +] +``` + +
    + +## Configuration for Training and Testing + + + + + + + + + +
    2.x Config + +```python +runner = dict( + type='EpochBasedRunner', # Type of training loop + max_epochs=12) # Maximum number of training epochs +evaluation = dict(interval=2) # Interval for evaluation, check the performance every 2 epochs +``` + +
    3.x Config + +```python +train_cfg = dict( + type='EpochBasedTrainLoop', # Type of training loop, please refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py + max_epochs=12, # Maximum number of training epochs + val_interval=2) # Interval for validation, check the performance every 2 epochs +val_cfg = dict(type='ValLoop') # Type of validation loop +test_cfg = dict(type='TestLoop') # Type of testing loop +``` + +
    + +## Optimization Configuration + +The configuration for optimizer and gradient clipping is moved to the `optim_wrapper` field. +The following table shows the correspondences for optimizer configuration between 2.x version and 3.x version: + + + + + + + + + +
    2.x Config + +```python +optimizer = dict( + type='SGD', # Optimizer: Stochastic Gradient Descent + lr=0.02, # Base learning rate + momentum=0.9, # SGD with momentum + weight_decay=0.0001) # Weight decay +optimizer_config = dict(grad_clip=None) # Configuration for gradient clipping, set to None to disable +``` + +
    3.x Config + +```python +optim_wrapper = dict( # Configuration for the optimizer wrapper + type='OptimWrapper', # Type of optimizer wrapper, you can switch to AmpOptimWrapper to enable mixed precision training + optimizer=dict( # Optimizer configuration, supports various PyTorch optimizers, please refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # SGD + lr=0.02, # Base learning rate + momentum=0.9, # SGD with momentum + weight_decay=0.0001), # Weight decay + clip_grad=None, # Configuration for gradient clipping, set to None to disable. For usage, please see https://mmengine.readthedocs.io/en/latest/tutorials/optimizer.html + ) +``` + +
    + +The configuration for learning rate is also moved from the `lr_config` field to the `param_scheduler` field. The `param_scheduler` configuration is more similar to PyTorch's learning rate scheduler and more flexible. The following table shows the correspondences for learning rate configuration between 2.x version and 3.x version: + + + + + + + + + +
    2.x Config + +```python +lr_config = dict( + policy='step', # Use multi-step learning rate strategy during training + warmup='linear', # Use linear learning rate warmup + warmup_iters=500, # End warmup at iteration 500 + warmup_ratio=0.001, # Coefficient for learning rate warmup + step=[8, 11], # Learning rate decay at which epochs + gamma=0.1) # Learning rate decay coefficient + +``` + +
    3.x Config + +```python +param_scheduler = [ + dict( + type='LinearLR', # Use linear learning rate warmup + start_factor=0.001, # Coefficient for learning rate warmup + by_epoch=False, # Update the learning rate during warmup at each iteration + begin=0, # Starting from the first iteration + end=500), # End at the 500th iteration + dict( + type='MultiStepLR', # Use multi-step learning rate strategy during training + by_epoch=True, # Update the learning rate at each epoch + begin=0, # Starting from the first epoch + end=12, # Ending at the 12th epoch + milestones=[8, 11], # Learning rate decay at which epochs + gamma=0.1) # Learning rate decay coefficient +] + +``` + +
    + +For information on how to migrate other learning rate adjustment policies, please refer to the [learning rate migration document of MMEngine](https://mmengine.readthedocs.io/zh_CN/latest/migration/param_scheduler.html). + +## Migration of Other Configurations + +### Configuration for Saving Checkpoints + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Function2.x Config3.x Config
    Set Save Interval + +```python +checkpoint_config = dict( + interval=1) +``` + + + +```python +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=1)) +``` + +
    Save Best Model + +```python +evaluation = dict( + save_best='auto') +``` + + + +```python +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='auto')) +``` + +
    Keep Latest Model + +```python +checkpoint_config = dict( + max_keep_ckpts=3) +``` + + + +```python +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + max_keep_ckpts=3)) +``` + +
    + +### Logging Configuration + +In MMDetection 3.x, the logging and visualization of the log are carried out respectively by the logger and visualizer in MMEngine. The following table shows the comparison between the configuration of printing logs and visualizing logs in MMDetection 2.x and 3.x. + + + + + + + + + + + + + + + + + + + + + + + + +
    Function2.x Config3.x Config
    Set Log Printing Interval + +```python +log_config = dict(interval=50) +``` + + + +```python +default_hooks = dict( + logger=dict(type='LoggerHook', interval=50)) +# Optional: set moving average window size +log_processor = dict( + type='LogProcessor', window_size=50) +``` + +
    Use TensorBoard or WandB to visualize logs + +```python +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + dict(type='MMDetWandbHook', + init_kwargs={ + 'project': 'mmdetection', + 'group': 'maskrcnn-r50-fpn-1x-coco' + }, + interval=50, + log_checkpoint=True, + log_checkpoint_metadata=True, + num_eval_images=100) + ]) +``` + + + +```python +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + dict(type='WandbVisBackend', + init_kwargs={ + 'project': 'mmdetection', + 'group': 'maskrcnn-r50-fpn-1x-coco' + }) +] +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') +``` + +
    + +For visualization-related tutorials, please refer to [Visualization Tutorial](../user_guides/visualization.md) of MMDetection. + +### Runtime Configuration + +The runtime configuration fields in version 3.x have been adjusted, and the specific correspondence is as follows: + + + + + + + + + + + + + + + + +
    2.x Config3.x Config
    + +```python +cudnn_benchmark = False +opencv_num_threads = 0 +mp_start_method = 'fork' +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None + + +``` + + + +```python +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', + opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) +log_level = 'INFO' +load_from = None +resume = False +``` + +
    diff --git a/mmdetection/docs/en/migration/dataset_migration.md b/mmdetection/docs/en/migration/dataset_migration.md new file mode 100644 index 0000000..75d0932 --- /dev/null +++ b/mmdetection/docs/en/migration/dataset_migration.md @@ -0,0 +1 @@ +# Migrate dataset from MMDetection 2.x to 3.x diff --git a/mmdetection/docs/en/migration/migration.md b/mmdetection/docs/en/migration/migration.md new file mode 100644 index 0000000..ec6a2f8 --- /dev/null +++ b/mmdetection/docs/en/migration/migration.md @@ -0,0 +1,12 @@ +# Migrating from MMDetection 2.x to 3.x + +MMDetection 3.x is a significant update that includes many changes to API and configuration files. This document aims to help users migrate from MMDetection 2.x to 3.x. +We divided the migration guide into the following sections: + +- [Configuration file migration](./config_migration.md) +- [API and Registry migration](./api_and_registry_migration.md) +- [Dataset migration](./dataset_migration.md) +- [Model migration](./model_migration.md) +- [Frequently Asked Questions](./migration_faq.md) + +If you encounter any problems during the migration process, feel free to raise an issue. We also welcome contributions to this document. diff --git a/mmdetection/docs/en/migration/migration_faq.md b/mmdetection/docs/en/migration/migration_faq.md new file mode 100644 index 0000000..a6e3c35 --- /dev/null +++ b/mmdetection/docs/en/migration/migration_faq.md @@ -0,0 +1 @@ +# Migration FAQ diff --git a/mmdetection/docs/en/migration/model_migration.md b/mmdetection/docs/en/migration/model_migration.md new file mode 100644 index 0000000..04e2808 --- /dev/null +++ b/mmdetection/docs/en/migration/model_migration.md @@ -0,0 +1 @@ +# Migrate models from MMDetection 2.x to 3.x diff --git a/mmdetection/docs/en/model_zoo.md b/mmdetection/docs/en/model_zoo.md new file mode 100644 index 0000000..15dd7b2 --- /dev/null +++ b/mmdetection/docs/en/model_zoo.md @@ -0,0 +1,358 @@ +# Benchmark and Model Zoo + +## Mirror sites + +We only use aliyun to maintain the model zoo since MMDetection V2.0. The model zoo of V1.x has been deprecated. + +## Common settings + +- All models were trained on `coco_2017_train`, and tested on the `coco_2017_val`. +- We use distributed training. +- All pytorch-style pretrained backbones on ImageNet are from PyTorch model zoo, caffe-style pretrained backbones are converted from the newly released model from detectron2. +- For fair comparison with other codebases, we report the GPU memory as the maximum value of `torch.cuda.max_memory_allocated()` for all 8 GPUs. Note that this value is usually less than what `nvidia-smi` shows. +- We report the inference time as the total time of network forwarding and post-processing, excluding the data loading time. Results are obtained with the script [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/main/tools/analysis_tools/benchmark.py) which computes the average time on 2000 images. + +## ImageNet Pretrained Models + +It is common to initialize from backbone models pre-trained on ImageNet classification task. All pre-trained model links can be found at [open_mmlab](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json). According to `img_norm_cfg` and source of weight, we can divide all the ImageNet pre-trained model weights into some cases: + +- TorchVision: Corresponding to torchvision weight, including ResNet50, ResNet101. The `img_norm_cfg` is `dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)`. 
+- Pycls: Corresponding to [pycls](https://github.com/facebookresearch/pycls) weight, including RegNetX. The `img_norm_cfg` is `dict( mean=[103.530, 116.280, 123.675], std=[57.375, 57.12, 58.395], to_rgb=False)`. +- MSRA styles: Corresponding to [MSRA](https://github.com/KaimingHe/deep-residual-networks) weights, including ResNet50_Caffe and ResNet101_Caffe. The `img_norm_cfg` is `dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)`. +- Caffe2 styles: Currently only contains ResNext101_32x8d. The `img_norm_cfg` is `dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False)`. +- Other styles: E.g SSD which corresponds to `img_norm_cfg` is `dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)` and YOLOv3 which corresponds to `img_norm_cfg` is `dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)`. + +The detailed table of the commonly used backbone models in MMDetection is listed below : + +| model | source | link | description | +| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| ResNet50 | TorchVision | [torchvision's ResNet-50](https://download.pytorch.org/models/resnet50-19c8e357.pth) | From [torchvision's ResNet-50](https://download.pytorch.org/models/resnet50-19c8e357.pth). | +| ResNet101 | TorchVision | [torchvision's ResNet-101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth) | From [torchvision's ResNet-101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth). | +| RegNetX | Pycls | [RegNetX_3.2gf](https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth), [RegNetX_800mf](https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth). etc. | From [pycls](https://github.com/facebookresearch/pycls). | +| ResNet50_Caffe | MSRA | [MSRA's ResNet-50](https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth) | Converted copy of [Detectron2's R-50.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl) model. The original weight comes from [MSRA's original ResNet-50](https://github.com/KaimingHe/deep-residual-networks). | +| ResNet101_Caffe | MSRA | [MSRA's ResNet-101](https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth) | Converted copy of [Detectron2's R-101.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl) model. The original weight comes from [MSRA's original ResNet-101](https://github.com/KaimingHe/deep-residual-networks). | +| ResNext101_32x8d | Caffe2 | [Caffe2 ResNext101_32x8d](https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth) | Converted copy of [Detectron2's X-101-32x8d.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/FAIR/X-101-32x8d.pkl) model. The ResNeXt-101-32x8d model trained with Caffe2 at FB. | + +## Baselines + +### RPN + +Please refer to [RPN](https://github.com/open-mmlab/mmdetection/blob/main/configs/rpn) for details. + +### Faster R-CNN + +Please refer to [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/faster_rcnn) for details. 
+ +### Mask R-CNN + +Please refer to [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask_rcnn) for details. + +### Fast R-CNN (with pre-computed proposals) + +Please refer to [Fast R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/fast_rcnn) for details. + +### RetinaNet + +Please refer to [RetinaNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/retinanet) for details. + +### Cascade R-CNN and Cascade Mask R-CNN + +Please refer to [Cascade R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/cascade_rcnn) for details. + +### Hybrid Task Cascade (HTC) + +Please refer to [HTC](https://github.com/open-mmlab/mmdetection/blob/main/configs/htc) for details. + +### SSD + +Please refer to [SSD](https://github.com/open-mmlab/mmdetection/blob/main/configs/ssd) for details. + +### Group Normalization (GN) + +Please refer to [Group Normalization](https://github.com/open-mmlab/mmdetection/blob/main/configs/gn) for details. + +### Weight Standardization + +Please refer to [Weight Standardization](https://github.com/open-mmlab/mmdetection/blob/main/configs/gn+ws) for details. + +### Deformable Convolution v2 + +Please refer to [Deformable Convolutional Networks](https://github.com/open-mmlab/mmdetection/blob/main/configs/dcn) for details. + +### CARAFE: Content-Aware ReAssembly of FEatures + +Please refer to [CARAFE](https://github.com/open-mmlab/mmdetection/blob/main/configs/carafe) for details. + +### Instaboost + +Please refer to [Instaboost](https://github.com/open-mmlab/mmdetection/blob/main/configs/instaboost) for details. + +### Libra R-CNN + +Please refer to [Libra R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/libra_rcnn) for details. + +### Guided Anchoring + +Please refer to [Guided Anchoring](https://github.com/open-mmlab/mmdetection/blob/main/configs/guided_anchoring) for details. + +### FCOS + +Please refer to [FCOS](https://github.com/open-mmlab/mmdetection/blob/main/configs/fcos) for details. + +### FoveaBox + +Please refer to [FoveaBox](https://github.com/open-mmlab/mmdetection/blob/main/configs/foveabox) for details. + +### RepPoints + +Please refer to [RepPoints](https://github.com/open-mmlab/mmdetection/blob/main/configs/reppoints) for details. + +### FreeAnchor + +Please refer to [FreeAnchor](https://github.com/open-mmlab/mmdetection/blob/main/configs/free_anchor) for details. + +### Grid R-CNN (plus) + +Please refer to [Grid R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/grid_rcnn) for details. + +### GHM + +Please refer to [GHM](https://github.com/open-mmlab/mmdetection/blob/main/configs/ghm) for details. + +### GCNet + +Please refer to [GCNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/gcnet) for details. + +### HRNet + +Please refer to [HRNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/hrnet) for details. + +### Mask Scoring R-CNN + +Please refer to [Mask Scoring R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/ms_rcnn) for details. + +### Train from Scratch + +Please refer to [Rethinking ImageNet Pre-training](https://github.com/open-mmlab/mmdetection/blob/main/configs/scratch) for details. + +### NAS-FPN + +Please refer to [NAS-FPN](https://github.com/open-mmlab/mmdetection/blob/main/configs/nas_fpn) for details. + +### ATSS + +Please refer to [ATSS](https://github.com/open-mmlab/mmdetection/blob/main/configs/atss) for details. 
+ +### FSAF + +Please refer to [FSAF](https://github.com/open-mmlab/mmdetection/blob/main/configs/fsaf) for details. + +### RegNetX + +Please refer to [RegNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/regnet) for details. + +### Res2Net + +Please refer to [Res2Net](https://github.com/open-mmlab/mmdetection/blob/main/configs/res2net) for details. + +### GRoIE + +Please refer to [GRoIE](https://github.com/open-mmlab/mmdetection/blob/main/configs/groie) for details. + +### Dynamic R-CNN + +Please refer to [Dynamic R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/dynamic_rcnn) for details. + +### PointRend + +Please refer to [PointRend](https://github.com/open-mmlab/mmdetection/blob/main/configs/point_rend) for details. + +### DetectoRS + +Please refer to [DetectoRS](https://github.com/open-mmlab/mmdetection/blob/main/configs/detectors) for details. + +### Generalized Focal Loss + +Please refer to [Generalized Focal Loss](https://github.com/open-mmlab/mmdetection/blob/main/configs/gfl) for details. + +### CornerNet + +Please refer to [CornerNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/cornernet) for details. + +### YOLOv3 + +Please refer to [YOLOv3](https://github.com/open-mmlab/mmdetection/blob/main/configs/yolo) for details. + +### PAA + +Please refer to [PAA](https://github.com/open-mmlab/mmdetection/blob/main/configs/paa) for details. + +### SABL + +Please refer to [SABL](https://github.com/open-mmlab/mmdetection/blob/main/configs/sabl) for details. + +### CentripetalNet + +Please refer to [CentripetalNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/centripetalnet) for details. + +### ResNeSt + +Please refer to [ResNeSt](https://github.com/open-mmlab/mmdetection/blob/main/configs/resnest) for details. + +### DETR + +Please refer to [DETR](https://github.com/open-mmlab/mmdetection/blob/main/configs/detr) for details. + +### Deformable DETR + +Please refer to [Deformable DETR](https://github.com/open-mmlab/mmdetection/blob/main/configs/deformable_detr) for details. + +### AutoAssign + +Please refer to [AutoAssign](https://github.com/open-mmlab/mmdetection/blob/main/configs/autoassign) for details. + +### YOLOF + +Please refer to [YOLOF](https://github.com/open-mmlab/mmdetection/blob/main/configs/yolof) for details. + +### Seesaw Loss + +Please refer to [Seesaw Loss](https://github.com/open-mmlab/mmdetection/blob/main/configs/seesaw_loss) for details. + +### CenterNet + +Please refer to [CenterNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/centernet) for details. + +### YOLOX + +Please refer to [YOLOX](https://github.com/open-mmlab/mmdetection/blob/main/configs/yolox) for details. + +### PVT + +Please refer to [PVT](https://github.com/open-mmlab/mmdetection/blob/main/configs/pvt) for details. + +### SOLO + +Please refer to [SOLO](https://github.com/open-mmlab/mmdetection/blob/main/configs/solo) for details. + +### QueryInst + +Please refer to [QueryInst](https://github.com/open-mmlab/mmdetection/blob/main/configs/queryinst) for details. + +### PanopticFPN + +Please refer to [PanopticFPN](https://github.com/open-mmlab/mmdetection/blob/main/configs/panoptic_fpn) for details. + +### MaskFormer + +Please refer to [MaskFormer](https://github.com/open-mmlab/mmdetection/blob/main/configs/maskformer) for details. + +### DyHead + +Please refer to [DyHead](https://github.com/open-mmlab/mmdetection/blob/main/configs/dyhead) for details. 
+ +### Mask2Former + +Please refer to [Mask2Former](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask2former) for details. + +### Efficientnet + +Please refer to [Efficientnet](https://github.com/open-mmlab/mmdetection/blob/main/configs/efficientnet) for details. + +### Other datasets + +We also benchmark some methods on [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/main/configs/pascal_voc), [Cityscapes](https://github.com/open-mmlab/mmdetection/blob/main/configs/cityscapes), [OpenImages](https://github.com/open-mmlab/mmdetection/blob/main/configs/openimages) and [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/main/configs/wider_face). + +### Pre-trained Models + +We also train [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/faster_rcnn) and [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask_rcnn) using ResNet-50 and [RegNetX-3.2G](https://github.com/open-mmlab/mmdetection/blob/main/configs/regnet) with multi-scale training and longer schedules. These models serve as strong pre-trained models for downstream tasks for convenience. + +## Speed benchmark + +### Training Speed benchmark + +We provide [analyze_logs.py](https://github.com/open-mmlab/mmdetection/blob/main/tools/analysis_tools/analyze_logs.py) to get average time of iteration in training. You can find examples in [Log Analysis](https://mmdetection.readthedocs.io/en/latest/useful_tools.html#log-analysis). + +We compare the training speed of Mask R-CNN with some other popular frameworks (The data is copied from [detectron2](https://github.com/facebookresearch/detectron2/blob/main/docs/notes/benchmarks.md/)). +For mmdetection, we benchmark with [mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py), which should have the same setting with [mask_rcnn_R_50_FPN_noaug_1x.yaml](https://github.com/facebookresearch/detectron2/blob/main/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml) of detectron2. +We also provide the [checkpoint](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug_compare_20200518-10127928.pth) and [training log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug_20200518_105755.log.json) for reference. The throughput is computed as the average throughput in iterations 100-500 to skip GPU warmup time. + +| Implementation | Throughput (img/s) | +| -------------------------------------------------------------------------------------- | ------------------ | +| [Detectron2](https://github.com/facebookresearch/detectron2) | 62 | +| [MMDetection](https://github.com/open-mmlab/mmdetection) | 61 | +| [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/) | 53 | +| [tensorpack](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN) | 50 | +| [simpledet](https://github.com/TuSimple/simpledet/) | 39 | +| [Detectron](https://github.com/facebookresearch/Detectron) | 19 | +| [matterport/Mask_RCNN](https://github.com/matterport/Mask_RCNN/) | 14 | + +### Inference Speed Benchmark + +We provide [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/main/tools/analysis_tools/benchmark.py) to benchmark the inference latency. 
+The script benchmarkes the model with 2000 images and calculates the average time ignoring first 5 times. You can change the output log interval (defaults: 50) by setting `LOG-INTERVAL`. + +```shell +python tools/benchmark.py ${CONFIG} ${CHECKPOINT} [--log-interval $[LOG-INTERVAL]] [--fuse-conv-bn] +``` + +The latency of all models in our model zoo is benchmarked without setting `fuse-conv-bn`, you can get a lower latency by setting it. + +## Comparison with Detectron2 + +We compare mmdetection with [Detectron2](https://github.com/facebookresearch/detectron2.git) in terms of speed and performance. +We use the commit id [185c27e](https://github.com/facebookresearch/detectron2/tree/185c27e4b4d2d4c68b5627b3765420c6d7f5a659)(30/4/2020) of detectron. +For fair comparison, we install and run both frameworks on the same machine. + +### Hardware + +- 8 NVIDIA Tesla V100 (32G) GPUs +- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz + +### Software environment + +- Python 3.7 +- PyTorch 1.4 +- CUDA 10.1 +- CUDNN 7.6.03 +- NCCL 2.4.08 + +### Performance + +| Type | Lr schd | Detectron2 | mmdetection | Download | +| ------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py) | 1x | [37.9](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml) | 38.0 | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-5324cff8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco_20200429_234554.log.json) | +| [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py) | 1x | [38.6 & 35.2](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml) | 38.8 & 35.4 | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco-dbecf295.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco_20200430_054239.log.json) | +| [Retinanet](https://github.com/open-mmlab/mmdetection/blob/main/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py) | 1x | [36.5](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml) | 37.0 | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/retinanet_r50_caffe_fpn_mstrain_1x_coco/retinanet_r50_caffe_fpn_mstrain_1x_coco-586977a0.pth) \| 
[log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/retinanet_r50_caffe_fpn_mstrain_1x_coco/retinanet_r50_caffe_fpn_mstrain_1x_coco_20200430_014748.log.json) |
+
+### Training Speed
+
+The training speed is measured in s/iter; the lower, the better.
+
+| Type         | Detectron2 | mmdetection |
+| ------------ | ---------- | ----------- |
+| Faster R-CNN | 0.210      | 0.216       |
+| Mask R-CNN   | 0.261      | 0.265       |
+| Retinanet    | 0.200      | 0.205       |
+
+### Inference Speed
+
+The inference speed is measured in fps (img/s) on a single GPU; the higher, the better.
+To be consistent with Detectron2, we report the pure inference speed (without the time of data loading).
+For Mask R-CNN, we exclude the time of RLE encoding in post-processing.
+We also include the officially reported speed in the parentheses, which is slightly higher
+than the results tested on our server due to hardware differences.
+
+| Type         | Detectron2  | mmdetection |
+| ------------ | ----------- | ----------- |
+| Faster R-CNN | 25.6 (26.3) | 22.2        |
+| Mask R-CNN   | 22.5 (23.3) | 19.6        |
+| Retinanet    | 17.8 (18.2) | 20.6        |
+
+### Training Memory
+
+| Type         | Detectron2 | mmdetection |
+| ------------ | ---------- | ----------- |
+| Faster R-CNN | 3.0        | 3.8         |
+| Mask R-CNN   | 3.4        | 3.9         |
+| Retinanet    | 3.9        | 3.4         |
diff --git a/mmdetection/docs/en/notes/changelog.md b/mmdetection/docs/en/notes/changelog.md
new file mode 100644
index 0000000..4d48a0a
--- /dev/null
+++ b/mmdetection/docs/en/notes/changelog.md
@@ -0,0 +1,603 @@
+# Changelog of v3.x
+
+## v3.2.0 (12/10/2023)
+
+### Highlights
+
+**(1) Detection Transformer SOTA Model Collection**
+
+- Supported four updated and stronger SOTA Transformer models: DDQ, CO-DETR, AlignDETR, and H-DINO.
+- Based on CO-DETR, MMDet released a model with a COCO performance of 64.1 mAP.
+- Algorithms such as DINO support AMP/Checkpoint/FrozenBN, which can effectively reduce memory usage.
+
+**(2) Comprehensive Performance Comparison between CNN and Transformer**
+
+RF100 is a collection of 100 real-world datasets covering 7 domains. It can be used to assess the performance differences between Transformer models like DINO and CNN-based algorithms under different scenarios and data volumes. Users can utilize this benchmark to quickly evaluate the robustness of their algorithms in various scenarios.
+
+**(3) Support for GLIP and Grounding DINO fine-tuning, the only algorithm library that supports Grounding DINO fine-tuning**
+
+MMDet is the only algorithm library that supports fine-tuning Grounding DINO. Its fine-tuned performance is one point higher than the official version, and GLIP likewise outperforms the official version.
+We also provide a detailed process for training and evaluating Grounding DINO on custom datasets. Everyone is welcome to give it a try.
+ +**(4) Support for the open-vocabulary detection algorithm Detic and multi-dataset joint training.** + +**(5) Training detection models using FSDP and DeepSpeed.** + +**(6) Support for the V3Det dataset, a large-scale detection dataset with over 13,000 categories.** + +### New Features + +- Support CO-DETR/DDQ/AlignDETR/H-DINO +- Support GLIP and Grounding DINO fine-tuning +- Support Detic and Multi-Datasets training (#10926) +- Support V3Det and benchmark (#10938) +- Support Roboflow 100 Benchmark (#10915) +- Add custom dataset of grounding dino (#11012) +- Release RTMDet-X p6 (#10993) +- Support AMP of DINO (#10827) +- Support FrozenBN (#10845) +- Add new configuration files for `QDTrack/DETR/RTMDet/MaskRCNN/DINO/DeformableDETR/MaskFormer` algorithm +- Add a new script to support the WBF (#10808) +- Add `large_image_demo` (#10719) +- Support download dataset from OpenXLab (#10799) +- Update to support torch2onnx for DETR series models (#10910) +- Translation into Chinese of an English document (#10744, #10756, #10805, #10848) + +### Bug Fixes + +- Fix name error in DETR metafile.yml (#10595) +- Fix device of the tensors in `set_nms` (#10574) +- Remove some unicode chars from `en/` docs (#10648) +- Fix download dataset with mim script. (#10727) +- Fix export to torchserve (#10694) +- Fix typo in `mask-rcnn_r50_fpn_1x-wandb_coco` (#10757) +- Fix `eval_recalls` error in `voc_metric` (#10770) +- Fix torch version comparison (#10934) +- Fix incorrect behavior to access train pipeline from ConcatDataset in `analyze_results.py` (#11004) + +### Improvements + +- Update `useful_tools.md` (#10587) +- Update Instance segmentation Tutorial (#10711) +- Update `train.py` to compat with new config (#11025) +- Support `torch2onnx` for maskformer series (#10782) + +### Contributors + +A total of 36 developers contributed to this release. + +Thank @YQisme, @nskostas, @max-unfinity, @evdcush, @Xiangxu-0103, @ZhaoCake, @RangeKing, @captainIT, @ODAncona, @aaronzs, @zeyuanyin, @gotjd709, @Musiyuan, @YanxingLiu, @RunningLeon, @ytzfhqs, @zhangzhidaSunny, @yeungkong, @crazysteeaam, @timerring, @okotaku, @apatsekin, @Morty-Xu, @Markson-Young, @ZhaoQiiii, @Kuro96, @PhoenixZ810, @yhcao6, @myownskyW7, @jiongjiongli, @Johnson-Wang, @ryylcc, @guyleaf, @agpeshal, @SimonGuoNjust, @hhaAndroid + +## v3.1.0 (30/6/2023) + +### Highlights + +- Supports tracking algorithms including multi-object tracking (MOT) algorithms SORT, DeepSORT, StrongSORT, OCSORT, ByteTrack, QDTrack, and video instance segmentation (VIS) algorithm MaskTrackRCNN, Mask2Former-VIS. +- Support [ViTDet](../../../projects/ViTDet) +- Supports inference and evaluation of multimodal algorithms [GLIP](../../../configs/glip) and [XDecoder](../../../projects/XDecoder), and also supports datasets such as COCO semantic segmentation, COCO Caption, ADE20k general segmentation, and RefCOCO. GLIP fine-tuning will be supported in the future. +- Provides a [gradio demo](https://github.com/open-mmlab/mmdetection/blob/dev-3.x/projects/gradio_demo/README.md) for image type tasks of MMDetection, making it easy for users to experience. 
+ +### New Features + +- Support DSDL Dataset (#9801) +- Support iSAID dataset (#10028) +- Support VISION dataset (#10530) +- Release SoftTeacher checkpoints (#10119) +- Release `centernet-update_r50-caffe_fpn_ms-1x_coco` checkpoints (#10327) +- Support SIoULoss (#10290) +- Support Eqlv2 loss (#10120) +- Support CopyPaste when mask is not available (#10509) +- Support MIM to download ODL dataset (#10460) +- Support new config (#10566) + +### Bug Fixes + +- Fix benchmark scripts error in windows (#10128) +- Fix error of `YOLOXModeSwitchHook` does not switch the mode when resumed from the checkpoint after switched (#10116) +- Fix pred and weight dims unmatch in SmoothL1Loss (#10423) + +### Improvements + +- Update MMDet_Tutorial.ipynb (#10081) +- Support to hide inference progress (#10519) +- Replace mmcls with mmpretrain (#10545) + +### Contributors + +A total of 29 developers contributed to this release. + +Thanks @lovelykite, @minato-ellie, @freepoet, @wufan-tb, @yalibian, @keyakiluo, @gihanjayatilaka, @i-aki-y, @xin-li-67, @RangeKing, @JingweiZhang12, @MambaWong, @lucianovk, @tall-josh, @xiuqhou, @jamiechoi1995, @YQisme, @yechenzhi, @bjzhb666, @xiexinch, @jamiechoi1995, @yarkable, @Renzhihan, @nijkah, @amaizr, @Lum1104, @zwhus, @Czm369, @hhaAndroid + +## v3.0.0 (6/4/2023) + +### Highlights + +- Support Semi-automatic annotation Base [Label-Studio](../../../projects/LabelStudio) (#10039) +- Support [EfficientDet](../../../projects/EfficientDet) in projects (#9810) + +### New Features + +- File I/O migration and reconstruction (#9709) +- Release DINO Swin-L 36e model (#9927) + +### Bug Fixes + +- Fix benchmark script (#9865) +- Fix the crop method of PolygonMasks (#9858) +- Fix Albu augmentation with the mask shape (#9918) +- Fix `RTMDetIns` prior generator device error (#9964) +- Fix `img_shape` in data pipeline (#9966) +- Fix cityscapes import error (#9984) +- Fix `solov2_r50_fpn_ms-3x_coco.py` config error (#10030) +- Fix Conditional DETR AP and Log (#9889) +- Fix accepting an unexpected argument local-rank in PyTorch 2.0 (#10050) +- Fix `common/ms_3x_coco-instance.py` config error (#10056) +- Fix compute flops error (#10051) +- Delete `data_root` in `CocoOccludedSeparatedMetric` to fix bug (#9969) +- Unifying metafile.yml (#9849) + +### Improvements + +- Added BoxInst r101 config (#9967) +- Added config migration guide (#9960) +- Added more social networking links (#10021) +- Added RTMDet config introduce (#10042) +- Added visualization docs (#9938, #10058) +- Refined data_prepare docs (#9935) +- Added support for setting the cache_size_limit parameter of dynamo in PyTorch 2.0 (#10054) +- Updated coco_metric.py (#10033) +- Update type hint (#10040) + +### Contributors + +A total of 19 developers contributed to this release. 
+ +Thanks @IRONICBo, @vansin, @RangeKing, @Ghlerrix, @okotaku, @JosonChan1998, @zgzhengSE, @bobo0810, @yechenzh, @Zheng-LinXiao, @LYMDLUT, @yarkable, @xiejiajiannb, @chhluo, @BIGWangYuDong, @RangiLy, @zwhus, @hhaAndroid, @ZwwWayne + +## v3.0.0rc6 (24/2/2023) + +### Highlights + +- Support [Boxinst](../../../configs/boxinst), [Objects365 Dataset](../../../configs/objects365), and [Separated and Occluded COCO metric](../user_guides/useful_tools.md#COCO-Separated-&-Occluded-Mask-Metric) +- Support [ConvNeXt-V2](../../../projects/ConvNeXt-V2), [DiffusionDet](../../../projects/DiffusionDet), and inference of [EfficientDet](../../../projects/EfficientDet) and [Detic](../../../projects/Detic) in `Projects` +- Refactor [DETR](../../../configs/detr) series and support [Conditional-DETR](../../../configs/conditional_detr), [DAB-DETR](../../../configs/dab_detr), and [DINO](../../../configs/detr) +- Support `DetInferencer` for inference, Test Time Augmentation, and automatically importing modules from registry +- Support RTMDet-Ins ONNXRuntime and TensorRT [deployment](../../../configs/rtmdet/README.md#deployment-tutorial) +- Support [calculating FLOPs of detectors](../user_guides/useful_tools.md#Model-Complexity) + +### New Features + +- Support [Boxinst](https://arxiv.org/abs/2012.02310) (#9525) +- Support [Objects365 Dataset](https://openaccess.thecvf.com/content_ICCV_2019/papers/Shao_Objects365_A_Large-Scale_High-Quality_Dataset_for_Object_Detection_ICCV_2019_paper.pdf) (#9600) +- Support [ConvNeXt-V2](http://arxiv.org/abs/2301.00808) in `Projects` (#9619) +- Support [DiffusionDet](https://arxiv.org/abs/2211.09788) in `Projects` (#9639, #9768) +- Support [Detic](http://arxiv.org/abs/2201.02605) inference in `Projects` (#9645) +- Support [EfficientDet](https://arxiv.org/abs/1911.09070) inference in `Projects` (#9645) +- Support [Separated and Occluded COCO metric](https://arxiv.org/abs/2210.10046) (#9710) +- Support auto import modules from registry (#9143) +- Refactor DETR series and support Conditional-DETR, DAB-DETR and DINO (#9646) +- Support `DetInferencer` for inference (#9561) +- Support Test Time Augmentation (#9452) +- Support calculating FLOPs of detectors (#9777) + +### Bug Fixes + +- Fix deprecating old type alias due to new version of numpy (#9625, #9537) +- Fix VOC metrics (#9784) +- Fix the wrong link of RTMDet-x log (#9549) +- Fix RTMDet link in README (#9575) +- Fix MMDet get flops error (#9589) +- Fix `use_depthwise` in RTMDet (#9624) +- Fix `albumentations` augmentation post process with masks (#9551) +- Fix DETR series Unit Test (#9647) +- Fix `LoadPanopticAnnotations` bug (#9703) +- Fix `isort` CI (#9680) +- Fix amp pooling overflow (#9670) +- Fix docstring about noise in DINO (#9747) +- Fix potential bug in `MultiImageMixDataset` (#9764) + +### Improvements + +- Replace NumPy transpose with PyTorch permute to speed-up (#9762) +- Deprecate `sklearn` (#9725) +- Add RTMDet-Ins deployment guide (#9823) +- Update RTMDet config and README (#9603) +- Replace the models used in the tutorial document with RTMDet (#9843) +- Adjust the minimum supported python version to 3.7 (#9602) +- Support modifying palette through configuration (#9445) +- Update README document in `Project` (#9599) +- Replace `github` with `gitee` in `.pre-commit-config-zh-cn.yaml` file (#9586) +- Use official `isort` in `.pre-commit-config.yaml` file (#9701) +- Change MMCV minimum version to `2.0.0rc4` for `dev-3.x` (#9695) +- Add Chinese version of single_stage_as_rpn.md and test_results_submission.md (#9434) +- 
Add OpenDataLab download link (#9605, #9738) +- Add type hints of several layers (#9346) +- Add typehint for `DarknetBottleneck` (#9591) +- Add dockerfile (#9659) +- Add twitter, discord, medium, and youtube link (#9775) +- Prepare for merging refactor-detr (#9656) +- Add metafile to ConditionalDETR, DABDETR and DINO (#9715) +- Support to modify `non_blocking` parameters (#9723) +- Comment repeater visualizer register (#9740) +- Update user guide: `finetune.md` and `inference.md` (#9578) + +### New Contributors + +- @NoFish-528 made their first contribution in +- @137208 made their first contribution in +- @lyviva made their first contribution in +- @zwhus made their first contribution in +- @zylo117 made their first contribution in +- @chg0901 made their first contribution in +- @DanShouzhu made their first contribution in https://github.com/open-mmlab/mmdetection/pull/9578 + +### Contributors + +A total of 27 developers contributed to this release. + +Thanks @JosonChan1998, @RangeKing, @NoFish-528, @likyoo, @Xiangxu-0103, @137208, @PeterH0323, @tianleiSHI, @wufan-tb, @lyviva, @zwhus, @jshilong, @Li-Qingyun, @sanbuphy, @zylo117, @triple-Mu, @KeiChiTse, @LYMDLUT, @nijkah, @chg0901, @DanShouzhu, @zytx121, @vansin, @BIGWangYuDong, @hhaAndroid, @RangiLyu, @ZwwWayne + +## v3.0.0rc5 (26/12/2022) + +### Highlights + +- Support [RTMDet](https://arxiv.org/abs/2212.07784) instance segmentation models. The technical report of RTMDet is on [arxiv](https://arxiv.org/abs/2212.07784) +- Support SSHContextModule in paper [SSH: Single Stage Headless Face Detector](https://arxiv.org/abs/1708.03979). + +### New Features + +- Support [RTMDet](https://arxiv.org/abs/2212.07784) instance segmentation models and improve RTMDet test config (#9494) +- Support SSHContextModule in paper [SSH: Single Stage Headless Face Detector](https://arxiv.org/abs/1708.03979) (#8953) +- Release [CondInst](https://arxiv.org/abs/2003.05664) pre-trained model (#9406) + +### Bug Fixes + +- Fix CondInst predict error when `batch_size` is greater than 1 in inference (#9400) +- Fix the bug of visualization when the dtype of the pipeline output image is not uint8 in browse dataset (#9401) +- Fix `analyze_logs.py` to plot mAP and calculate train time correctly (#9409) +- Fix backward inplace error with `PAFPN` (#9450) +- Fix config import links in model converters (#9441) +- Fix `DeformableDETRHead` object has no attribute `loss_single` (#9477) +- Fix the logic of pseudo bboxes predicted by teacher model in SemiBaseDetector (#9414) +- Fix demo API in instance segmentation tutorial (#9226) +- Fix `analyze_results` (#9380) +- Fix the error that Readthedocs API cannot be displayed (#9510) +- Fix the error when there are no prediction results and support visualize the groundtruth of TTA (#9840) + +### Improvements + +- Remove legacy `builder.py` (#9479) +- Make sure the pipeline argument shape is in `(width, height)` order (#9324) +- Add `.pre-commit-config-zh-cn.yaml` file (#9388) +- Refactor dataset metainfo to lowercase (#9469) +- Add PyTorch 1.13 checking in CI (#9478) +- Adjust `FocalLoss` and `QualityFocalLoss` to allow different kinds of targets (#9481) +- Refactor `setup.cfg` (#9370) +- Clip saturation value to valid range `[0, 1]` (#9391) +- Only keep meta and state_dict when publishing model (#9356) +- Add segm evaluator in ms-poly_3x_coco_instance config (#9524) +- Update deployment guide (#9527) +- Update zh_cn `faq.md` (#9396) +- Update `get_started` (#9480) +- Update the zh_cn user_guides of `useful_tools.md` and 
`useful_hooks.md` (#9453) +- Add type hints for `bfp` and `channel_mapper` (#9410) +- Add type hints of several losses (#9397) +- Add type hints and update docstring for task modules (#9468) + +### New Contributors + +- @lihua199710 made their first contribution in +- @twmht made their first contribution in +- @tianleiSHI made their first contribution in +- @kitecats made their first contribution in +- @QJC123654 made their first contribution in + +### Contributors + +A total of 20 developers contributed to this release. + +Thanks @liuyanyi, @RangeKing, @lihua199710, @MambaWong, @sanbuphy, @Xiangxu-0103, @twmht, @JunyaoHu, @Chan-Sun, @tianleiSHI, @zytx121, @kitecats, @QJC123654, @JosonChan1998, @lvhan028, @Czm369, @BIGWangYuDong, @RangiLyu, @hhaAndroid, @ZwwWayne + +## v3.0.0rc4 (23/11/2022) + +### Highlights + +- Support [CondInst](https://arxiv.org/abs/2003.05664) +- Add `projects/` folder, which will be a place for some experimental models/features. +- Support [SparseInst](https://arxiv.org/abs/2203.12827) in [`projects`](./projects/SparseInst/README.md) + +### New Features + +- Support [CondInst](https://arxiv.org/abs/2003.05664) (#9223) +- Add `projects/` folder, which will be a place for some experimental models/features (#9341) +- Support [SparseInst](https://arxiv.org/abs/2203.12827) in [`projects`](./projects/SparseInst/README.md) (#9377) + +### Bug Fixes + +- Fix `pixel_decoder_type` discrimination in MaskFormer Head. (#9176) +- Fix wrong padding value in cached MixUp (#9259) +- Rename `utils/typing.py` to `utils/typing_utils.py` to fix `collect_env` error (#9265) +- Fix resume arg conflict (#9287) +- Fix the configs of Faster R-CNN with caffe backbone (#9319) +- Fix torchserve and update related documentation (#9343) +- Fix bbox refine bug with sigmooid activation (#9538) + +### Improvements + +- Update the docs of GIoU Loss in README (#8810) +- Handle dataset wrapper in `inference_detector` (#9144) +- Update the type of `counts` in COCO's compressed RLE (#9274) +- Support saving config file in `print_config` (#9276) +- Update docs about video inference (#9305) +- Update guide about model deployment (#9344) +- Fix doc typos of useful tools (#9177) +- Allow to resume from specific checkpoint in CLI (#9284) +- Update FAQ about windows installation issues of pycocotools (#9292) + +### New Contributors + +- @Daa98 made their first contribution in +- @lvhan028 made their first contribution in + +### Contributors + +A total of 12 developers contributed to this release. 
+ +Thanks @sanbuphy, @Czm369, @Daa98, @jbwang1997, @BIGWangYuDong, @JosonChan1998, @lvhan028, @RunningLeon, @RangiLyu, @Daa98, @ZwwWayne, @hhaAndroid + +## v3.0.0rc3 (4/11/2022) + +Upgrade the minimum version requirement of MMEngine to 0.3.0 to use `ignore_key` of `ConcatDataset` for training VOC datasets (#9058) + +### Highlights + +- Support [CrowdDet](https://arxiv.org/abs/2003.09163) and [EIoU Loss](https://ieeexplore.ieee.org/document/9429909) +- Support training detection models in Detectron2 +- Refactor Fast R-CNN + +### New Features + +- Support [CrowdDet](https://arxiv.org/abs/2003.09163) (#8744) +- Support training detection models in Detectron2 with examples of Mask R-CNN, Faster R-CNN, and RetinaNet (#8672) +- Support [EIoU Loss](https://ieeexplore.ieee.org/document/9429909) (#9086) + +### Bug Fixes + +- Fix `XMLDataset` image size error (#9216) +- Fix bugs of empty_instances when predicting without nms in roi_head (#9015) +- Fix the config file of DETR (#9158) +- Fix SOLOv2 cannot dealing with empty gt image (#9192) +- Fix inference demo (#9153) +- Add `ignore_key` in VOC `ConcatDataset` (#9058) +- Fix dumping results issue in test scripts. (#9241) +- Fix configs of training coco subsets on MMDet 3.x (#9225) +- Fix corner2hbox of HorizontalBoxes for supporting empty bboxes (#9140) + +### Improvements + +- Refactor Fast R-CNN (#9132) +- Clean requirements of mmcv-full due to SyncBN (#9207) +- Support training detection models in detectron2 (#8672) +- Add `box_type` support for `DynamicSoftLabelAssigner` (#9179) +- Make scipy as a default dependency in runtime (#9187) +- Update eval_metric (#9062) +- Add `seg_map_suffix` in `BaseDetDataset` (#9088) + +### New Contributors + +- @Wwupup made their first contribution in +- @sanbuphy made their first contribution in +- @cxiang26 made their first contribution in +- @JosonChan1998 made their first contribution in + +### Contributors + +A total of 13 developers contributed to this release. 
+ +Thanks @wanghonglie, @Wwupup, @sanbuphy, @BIGWangYuDong, @liuyanyi, @cxiang26, @jbwang1997, @ZwwWayne, @yuyoujiang, @RangiLyu, @hhaAndroid, @JosonChan1998, @Czm369 + +## v3.0.0rc2 (21/10/2022) + +### Highlights + +- Support [imagenet pre-training](configs/rtmdet/cspnext_imagenet_pretrain) for RTMDet's backbone + +### New Features + +- Support [imagenet pre-training](configs/rtmdet/cspnext_imagenet_pretrain) for RTMDet's backbone (#8887) +- Add `CrowdHumanDataset` and Metric (#8430) +- Add `FixShapeResize` to support resize of fixed shape (#8665) + +### Bug Fixes + +- Fix `ConcatDataset` Import Error (#8909) +- Fix `CircleCI` and `readthedoc` build failed (#8980, #8963) +- Fix bitmap mask translate when `out_shape` is different (#8993) +- Fix inconsistency in `Conv2d` weight channels (#8948) +- Fix bugs when plotting loss curve by analyze_logs.py (#8944) +- Fix type change of labels in `albumentations` (#9074) +- Fix some docs and types error (#8818) +- Update memory occupation of `RTMDet` in metafile (#9098) +- Fix wrong arguments of `OpenImageMetrics` in the config (#9061) + +### Improvements + +- Refactor standard roi head with `box type` (#8658) +- Support mask concatenation in `BitmapMasks` and `PolygonMasks` (#9006) +- Update PyTorch and dependencies' version in dockerfile (#8845) +- Update `robustness_eval.py` and `print_config` (#8452) +- Make compatible with `ConfigDict` and `dict` in `dense_heads` (#8942) +- Support logging coco metric copypaste (#9012) +- Remove `Normalize` transform (#8913) +- Support jittering the color of different instances of the same class (#8988) +- Add assertion for missing key in `PackDetInputs` (#8982) + +### New Contributors + +- @Chan-Sun made their first contribution in +- @MambaWong made their first contribution in +- @yuyoujiang made their first contribution in +- @sltlls made their first contribution in +- @Nioolek made their first contribution in +- @wufan-tb made their first contribution in + +### Contributors + +A total of 13 developers contributed to this release. + +Thanks @RangiLyu, @jbwang1997, @wanghonglie, @Chan-Sun, @RangeKing, @chhluo, @MambaWong, @yuyoujiang, @hhaAndroid, @sltlls, @Nioolek, @ZwwWayne, @wufan-tb + +## v3.0.0rc1 (26/9/2022) + +### Highlights + +- Release a high-precision, low-latency single-stage object detector [RTMDet](configs/rtmdet). + +### Bug Fixes + +- Fix UT to be compatible with PyTorch 1.6 (#8707) +- Fix `NumClassCheckHook` bug when model is wrapped (#8794) +- Update the right URL of R-50-FPN with BoundedIoULoss (#8805) +- Fix potential bug of indices in RandAugment (#8826) +- Fix some types and links (#8839, #8820, #8793, #8868) +- Fix incorrect background fill values in `FSAF` and `RepPoints` Head (#8813) + +### Improvements + +- Refactored anchor head and base head with `box type` (#8625) +- Refactored `SemiBaseDetector` and `SoftTeacher` (#8786) +- Add list to dict keys to avoid modify loss dict (#8828) +- Update `analyze_results.py` , `analyze_logs.py` and `loading.py` (#8430, #8402, #8784) +- Support dump results in `test.py` (#8814) +- Check empty predictions in `DetLocalVisualizer._draw_instances` (#8830) +- Fix `floordiv` warning in `SOLO` (#8738) + +### Contributors + +A total of 16 developers contributed to this release. + +Thanks @ZwwWayne, @jbwang1997, @Czm369, @ice-tong, @Zheng-LinXiao, @chhluo, @RangiLyu, @liuyanyi, @wanghonglie, @levan92, @JiayuXu0, @nye0, @hhaAndroid, @xin-li-67, @shuxp, @zytx121 + +## v3.0.0rc0 (31/8/2022) + +We are excited to announce the release of MMDetection 3.0.0rc0. 
MMDet 3.0.0rc0 is the first version of MMDetection 3.x, a part of the OpenMMLab 2.0 projects. Built upon the new [training engine](https://github.com/open-mmlab/mmengine), MMDet 3.x unifies the interfaces of the dataset, models, evaluation, and visualization with faster training and testing speed. It also provides a general semi-supervised object detection framework and strong baselines.
+
+### Highlights
+
+1. **New engine**. MMDet 3.x is based on [MMEngine](https://github.com/open-mmlab/mmengine), which provides a universal and powerful runner that allows more flexible customizations and significantly simplifies the entry points of high-level interfaces.
+
+2. **Unified interfaces**. As a part of the OpenMMLab 2.0 projects, MMDet 3.x unifies and refactors the interfaces and internal logic of training, testing, datasets, models, evaluation, and visualization. All the OpenMMLab 2.0 projects share the same design in those interfaces and logic to allow the emergence of multi-task/modality algorithms.
+
+3. **Faster speed**. We optimize the training and inference speed for common models and configurations, achieving speed faster than or comparable to [Detectron2](https://github.com/facebookresearch/detectron2/). Benchmark details will be updated in [this note](./benchmark.md#comparison-with-detectron2).
+
+4. **General semi-supervised object detection**. Benefitting from the unified interfaces, we support a general semi-supervised learning framework that works with all the object detectors supported in MMDet 3.x. Please refer to [semi-supervised object detection](../user_guides/semi_det.md) for details.
+
+5. **Strong baselines**. We release strong baselines of many popular models to enable fair comparisons among state-of-the-art models.
+
+6. **New features and algorithms**:
+
+   - Enable all the single-stage detectors to serve as region proposal networks
+   - [SoftTeacher](https://arxiv.org/abs/2106.09018)
+   - [the updated CenterNet](https://arxiv.org/abs/2103.07461)
+
+7. **More documentation and tutorials**. We add extensive documentation and tutorials to help users get started more smoothly. Read them [here](https://mmdetection.readthedocs.io/en/3.x/).
+
+### Breaking Changes
+
+MMDet 3.x has undergone significant changes for better design, higher efficiency, more flexibility, and more unified interfaces.
+Besides the changes in API, we briefly list the major breaking changes in this section.
+We will update the [migration guide](../migration.md) to provide complete details and migration instructions.
+Users can also refer to the [API doc](https://mmdetection.readthedocs.io/en/3.x/) for more details.
+
+#### Dependencies
+
+- MMDet 3.x runs on PyTorch>=1.6. We have deprecated support for PyTorch 1.5 to embrace mixed precision training and other new features since PyTorch 1.6. Some models can still run on PyTorch 1.5, but the full functionality of MMDet 3.x is not guaranteed.
+- MMDet 3.x relies on MMEngine to run. MMEngine is a new foundational library for training deep learning models of OpenMMLab and is the core dependency of OpenMMLab 2.0 projects. The dependencies of file IO and training are migrated from MMCV 1.x to MMEngine.
+- MMDet 3.x relies on MMCV>=2.0.0rc0. Although MMCV no longer maintains the training functionalities since 2.0.0rc0, MMDet 3.x relies on the data transforms, CUDA operators, and image processing interfaces in MMCV.
Note that the package `mmcv` is the version that provides pre-built CUDA operators and `mmcv-lite` does not since MMCV 2.0.0rc0, while `mmcv-full` has been deprecated since 2.0.0rc0. + +#### Training and testing + +- MMDet 3.x uses Runner in [MMEngine](https://github.com/open-mmlab/mmengine) rather than that in MMCV. The new Runner implements and unifies the building logic of the dataset, model, evaluation, and visualizer. Therefore, MMDet 3.x no longer maintains the building logic of those modules in `mmdet.train.apis` and `tools/train.py`. Those codes have been migrated into [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py). Please refer to the [migration guide of Runner in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/runner.html) for more details. +- The Runner in MMEngine also supports testing and validation. The testing scripts are also simplified, which has similar logic to that in training scripts to build the runner. +- The execution points of hooks in the new Runner have been enriched to allow more flexible customization. Please refer to the [migration guide of Hook in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/hook.html) for more details. +- Learning rate and momentum schedules have been migrated from Hook to [Parameter Scheduler in MMEngine](https://mmengine.readthedocs.io/en/latest/tutorials/param_scheduler.html). Please refer to the [migration guide of Parameter Scheduler in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/param_scheduler.html) for more details. + +#### Configs + +- The [Runner in MMEngine](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py) uses a different config structure to ease the understanding of the components in the runner. Users can read the [config example of MMDet 3.x](../user_guides/config.md) or refer to the [migration guide in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/runner.html) for migration details. +- The file names of configs and models are also refactored to follow the new rules unified across OpenMMLab 2.0 projects. The names of checkpoints are not updated for now as there is no BC-breaking of model weights between MMDet 3.x and 2.x. We will progressively replace all the model weights with those trained in MMDet 3.x. Please refer to the [user guides of config](../user_guides/config.md) for more details. + +#### Dataset + +The Dataset classes implemented in MMDet 3.x all inherit from the `BaseDetDataset`, which inherits from the [BaseDataset in MMEngine](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html). In addition to the changes in interfaces, there are several changes in Dataset in MMDet 3.x. + +- All the datasets support serializing the internal data list to reduce the memory when multiple workers are built for data loading. +- The internal data structure in the dataset is changed to be self-contained (without losing information like class names in MMDet 2.x) while keeping simplicity. +- The evaluation functionality of each dataset has been removed from the dataset so that some specific evaluation metrics like COCO AP can be used to evaluate the prediction on other datasets. + +#### Data Transforms + +The data transforms in MMDet 3.x all inherits from `BaseTransform` in MMCV>=2.0.0rc0, which defines a new convention in OpenMMLab 2.0 projects. 
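+
+As a rough illustration of this convention, a custom transform subclasses `BaseTransform`, implements a single `transform` method that operates on the results dict, and registers itself so that configs can refer to it by name. This is a hedged sketch; the class name and the `img_area` field it writes are invented for the example and are not part of MMDet:
+
+```python
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class RecordImageArea(BaseTransform):
+    """Toy transform: store the image area for later pipeline steps."""
+
+    def transform(self, results: dict) -> dict:
+        # `results` is the single dict that flows through the whole pipeline;
+        # `img` is the loaded image array in (H, W, C) layout.
+        h, w = results['img'].shape[:2]
+        results['img_area'] = h * w
+        return results
+```
+
+Such a transform can then be referenced in a pipeline config as `dict(type='RecordImageArea')`.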
+Besides the interface changes, there are several changes listed below: + +- The functionality of some data transforms (e.g., `Resize`) are decomposed into several transforms to simplify and clarify the usages. +- The format of data dict processed by each data transform is changed according to the new data structure of dataset. +- Some inefficient data transforms (e.g., normalization and padding) are moved into data preprocessor of model to improve data loading and training speed. +- The same data transforms in different OpenMMLab 2.0 libraries have the same augmentation implementation and the logic given the same arguments, i.e., `Resize` in MMDet 3.x and MMSeg 1.x will resize the image in the exact same manner given the same arguments. + +#### Model + +The models in MMDet 3.x all inherit from `BaseModel` in MMEngine, which defines a new convention of models in OpenMMLab 2.0 projects. +Users can refer to [the tutorial of the model in MMengine](https://mmengine.readthedocs.io/en/latest/tutorials/model.html) for more details. +Accordingly, there are several changes as the following: + +- The model interfaces, including the input and output formats, are significantly simplified and unified following the new convention in MMDet 3.x. + Specifically, all the input data in training and testing are packed into `inputs` and `data_samples`, where `inputs` contains model inputs like a list of image tensors, and `data_samples` contains other information of the current data sample such as ground truths, region proposals, and model predictions. In this way, different tasks in MMDet 3.x can share the same input arguments, which makes the models more general and suitable for multi-task learning and some flexible training paradigms like semi-supervised learning. +- The model has a data preprocessor module, which is used to pre-process the input data of the model. In MMDet 3.x, the data preprocessor usually does the necessary steps to form the input images into a batch, such as padding. It can also serve as a place for some special data augmentations or more efficient data transformations like normalization. +- The internal logic of the model has been changed. In MMdet 2.x, model uses `forward_train`, `forward_test`, `simple_test`, and `aug_test` to deal with different model forward logics. In MMDet 3.x and OpenMMLab 2.0, the forward function has three modes: 'loss', 'predict', and 'tensor' for training, inference, and tracing or other purposes, respectively. + The forward function calls `self.loss`, `self.predict`, and `self._forward` given the modes 'loss', 'predict', and 'tensor', respectively. + +#### Evaluation + +The evaluation in MMDet 2.x strictly binds with the dataset. In contrast, MMDet 3.x decomposes the evaluation from dataset so that all the detection datasets can evaluate with COCO AP and other metrics implemented in MMDet 3.x. +MMDet 3.x mainly implements corresponding metrics for each dataset, which are manipulated by [Evaluator](https://mmengine.readthedocs.io/en/latest/design/evaluator.html) to complete the evaluation. +Users can build an evaluator in MMDet 3.x to conduct offline evaluation, i.e., evaluate predictions that may not produce in MMDet 3.x with the dataset as long as the dataset and the prediction follow the dataset conventions. More details can be found in the [tutorial in mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html). + +#### Visualization + +The functions of visualization in MMDet 2.x are removed. 
Instead, in OpenMMLab 2.0 projects, we use [Visualizer](https://mmengine.readthedocs.io/en/latest/design/visualization.html) to visualize data. MMDet 3.x implements `DetLocalVisualizer` to allow visualization of ground truths, model predictions, feature maps, etc., at any place. It also supports sending the visualization data to any external visualization backends such as Tensorboard. + +### Improvements + +- Optimized training and testing speed of FCOS, RetinaNet, Faster R-CNN, Mask R-CNN, and Cascade R-CNN. The training speed of those models with some common training strategies is also optimized, including those with synchronized batch normalization and mixed precision training. +- Support mixed precision training of all the models. However, some models may get undesirable performance due to some numerical issues. We will update the documentation and list the results (accuracy of failure) of mixed precision training. +- Release strong baselines of some popular object detectors. Their accuracy and pre-trained checkpoints will be released. + +### Bug Fixes + +- DeepFashion dataset: the config and results have been updated. + +### New Features + +1. Support a general semi-supervised learning framework that works with all the object detectors supported in MMDet 3.x. Please refer to [semi-supervised object detection](../user_guides/semi_det.md) for details. +2. Enable all the single-stage detectors to serve as region proposal networks. We give [an example of using FCOS as RPN](../user_guides/single_stage_as_rpn.md). +3. Support a semi-supervised object detection algorithm: [SoftTeacher](https://arxiv.org/abs/2106.09018). +4. Support [the updated CenterNet](https://arxiv.org/abs/2103.07461). +5. Support data structures `HorizontalBoxes` and `BaseBoxes` to encapsulate different kinds of bounding boxes. We are migrating to use data structures of boxes to replace the use of pure tensor boxes. This will unify the usages of different kinds of bounding boxes in MMDet 3.x and MMRotate 1.x to simplify the implementation and reduce redundant codes. + +### Planned changes + +We list several planned changes of MMDet 3.0.0rc0 so that the community could more comprehensively know the progress of MMDet 3.x. Feel free to create a PR, issue, or discussion if you are interested, have any suggestions and feedback, or want to participate. + +1. Test-time augmentation: which is supported in MMDet 2.x, is not implemented in this version due to the limited time slot. We will support it in the following releases with a new and simplified design. +2. Inference interfaces: unified inference interfaces will be supported in the future to ease the use of released models. +3. Interfaces of useful tools that can be used in Jupyter Notebook or Colab: more useful tools that are implemented in the `tools` directory will have their python interfaces so that they can be used in Jupyter Notebook, Colab, and downstream libraries. +4. Documentation: we will add more design docs, tutorials, and migration guidance so that the community can deep dive into our new design, participate the future development, and smoothly migrate downstream libraries to MMDet 3.x. +5. Wandb visualization: MMDet 2.x supports data visualization since v2.25.0, which has not been migrated to MMDet 3.x for now. Since WandB provides strong visualization and experiment management capabilities, a `DetWandBVisualizer` and maybe a hook are planned to fully migrate those functionalities from MMDet 2.x. +6. 
Full support of WiderFace dataset (#8508) and Fast R-CNN: we are verifying their functionalities and will fix related issues soon. +7. Migrate DETR-series algorithms (#8655, #8533) and YOLOv3 on IPU (#8552) from MMDet 2.x. + +### Contributors + +A total of 11 developers contributed to this release. +Thanks @shuxp, @wanghonglie, @Czm369, @BIGWangYuDong, @zytx121, @jbwang1997, @chhluo, @jshilong, @RangiLyu, @hhaAndroid, @ZwwWayne diff --git a/mmdetection/docs/en/notes/changelog_v2.x.md b/mmdetection/docs/en/notes/changelog_v2.x.md new file mode 100644 index 0000000..2b3a230 --- /dev/null +++ b/mmdetection/docs/en/notes/changelog_v2.x.md @@ -0,0 +1,1681 @@ +# Changelog v2.x + +### v2.25.0 (31/5/2022) + +#### Highlights + +- Support dedicated `WandbLogger` hook +- Support [ConvNeXt](configs/convnext), [DDOD](configs/ddod), [SOLOv2](configs/solov2) +- Support [Mask2Former](configs/mask2former) for instance segmentation +- Rename [config files of Mask2Former](configs/mask2former) + +#### Backwards incompatible changes + +- Rename [config files of Mask2Former](configs/mask2former) (#7571) + + + + + + + + + + + +
+  **before v2.25.0**
+
+  - `mask2former_xxx_coco.py` represents config files for **panoptic segmentation**.
+
+  **after v2.25.0**
+
+  - `mask2former_xxx_coco.py` represents config files for **instance segmentation**.
+  - `mask2former_xxx_coco-panoptic.py` represents config files for **panoptic segmentation**.
    + +#### New Features + +- Support [ConvNeXt](https://arxiv.org/abs/2201.03545) (#7281) +- Support [DDOD](https://arxiv.org/abs/2107.02963) (#7279) +- Support [SOLOv2](https://arxiv.org/abs/2003.10152) (#7441) +- Support [Mask2Former](https://arxiv.org/abs/2112.01527) for instance segmentation (#7571, #8032) + +#### Bug Fixes + +- Enable YOLOX training on different devices (#7912) +- Fix the log plot error when evaluation with `interval != 1` (#7784) +- Fix RuntimeError of HTC (#8083) + +#### Improvements + +- Support dedicated `WandbLogger` hook (#7459) + + Users can set + + ```python + cfg.log_config.hooks = [ + dict(type='MMDetWandbHook', + init_kwargs={'project': 'MMDetection-tutorial'}, + interval=10, + log_checkpoint=True, + log_checkpoint_metadata=True, + num_eval_images=10)] + ``` + + in the config to use `MMDetWandbHook`. Example can be found in this [colab tutorial](https://colab.research.google.com/drive/1RCSXHZwDZvakFh3eo9RuNrJbCGqD0dru?usp=sharing#scrollTo=WTEdPDRaBz2C) + +- Add `AvoidOOM` to avoid OOM (#7434, #8091) + + Try to use `AvoidCUDAOOM` to avoid GPU out of memory. It will first retry after calling `torch.cuda.empty_cache()`. If it still fails, it will then retry by converting the type of inputs to FP16 format. If it still fails, it will try to copy inputs from GPUs to CPUs to continue computing. Try AvoidOOM in code to make the code continue to run when GPU memory runs out: + + ```python + from mmdet.utils import AvoidCUDAOOM + + output = AvoidCUDAOOM.retry_if_cuda_oom(some_function)(input1, input2) + ``` + + Users can also try `AvoidCUDAOOM` as a decorator to make the code continue to run when GPU memory runs out: + + ```python + from mmdet.utils import AvoidCUDAOOM + + @AvoidCUDAOOM.retry_if_cuda_oom + def function(*args, **kwargs): + ... + return xxx + ``` + +- Support reading `gpu_collect` from `cfg.evaluation.gpu_collect` (#7672) + +- Speedup the Video Inference by Accelerating data-loading Stage (#7832) + +- Support replacing the `${key}` with the value of `cfg.key` (#7492) + +- Accelerate result analysis in `analyze_result.py`. The evaluation time is speedup by 10 ~ 15 times and only tasks 10 ~ 15 minutes now. (#7891) + +- Support to set `block_dilations` in `DilatedEncoder` (#7812) + +- Support panoptic segmentation result analysis (#7922) + +- Release DyHead with Swin-Large backbone (#7733) + +- Documentations updating and adding + + - Fix wrong default type of `act_cfg` in `SwinTransformer` (#7794) + - Fix text errors in the tutorials (#7959) + - Rewrite the [installation guide](docs/en/get_started.md) (#7897) + - [Useful hooks](docs/en/tutorials/useful_hooks.md) (#7810) + - Fix heading anchor in documentation (#8006) + - Replace `markdownlint` with `mdformat` for avoiding installing ruby (#8009) + +#### Contributors + +A total of 20 developers contributed to this release. 
+ +Thanks @ZwwWayne, @DarthThomas, @solyaH, @LutingWang, @chenxinfeng4, @Czm369, @Chenastron, @chhluo, @austinmw, @Shanyaliux @hellock, @Y-M-Y, @jbwang1997, @hhaAndroid, @Irvingao, @zhanggefan, @BIGWangYuDong, @Keiku, @PeterVennerstrom, @ayulockin + +### v2.24.0 (26/4/2022) + +#### Highlights + +- Support [Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation](https://arxiv.org/abs/2012.07177) +- Support automatically scaling LR according to GPU number and samples per GPU +- Support Class Aware Sampler that improves performance on OpenImages Dataset + +#### New Features + +- Support [Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation](https://arxiv.org/abs/2012.07177), see [example configs](configs/simple_copy_paste/mask_rcnn_r50_fpn_syncbn-all_rpn-2conv_ssj_scp_32x2_270k_coco.py) (#7501) + +- Support Class Aware Sampler, users can set + + ```python + data=dict(train_dataloader=dict(class_aware_sampler=dict(num_sample_class=1)))) + ``` + + in the config to use `ClassAwareSampler`. Examples can be found in [the configs of OpenImages Dataset](https://github.com/open-mmlab/mmdetection/tree/main/configs/openimages/faster_rcnn_r50_fpn_32x2_cas_1x_openimages.py). (#7436) + +- Support automatically scaling LR according to GPU number and samples per GPU. (#7482) + In each config, there is a corresponding config of auto-scaling LR as below, + + ```python + auto_scale_lr = dict(enable=True, base_batch_size=N) + ``` + + where `N` is the batch size used for the current learning rate in the config (also equals to `samples_per_gpu` * gpu number to train this config). + By default, we set `enable=False` so that the original usages will not be affected. Users can set `enable=True` in each config or add `--auto-scale-lr` after the command line to enable this feature and should check the correctness of `base_batch_size` in customized configs. + +- Support setting dataloader arguments in config and add functions to handle config compatibility. (#7668) + The comparison between the old and new usages is as below. + + + + + + + + + + + +
+  **v2.23.0**
+
+  ```python
+  data = dict(
+      samples_per_gpu=64, workers_per_gpu=4,
+      train=dict(type='xxx', ...),
+      val=dict(type='xxx', samples_per_gpu=4, ...),
+      test=dict(type='xxx', ...),
+  )
+  ```
+
+  **v2.24.0**
+
+  ```python
+  # A recommended config that is clear
+  data = dict(
+      train=dict(type='xxx', ...),
+      val=dict(type='xxx', ...),
+      test=dict(type='xxx', ...),
+      # Use different batch size during inference.
+      train_dataloader=dict(samples_per_gpu=64, workers_per_gpu=4),
+      val_dataloader=dict(samples_per_gpu=8, workers_per_gpu=2),
+      test_dataloader=dict(samples_per_gpu=8, workers_per_gpu=2),
+  )
+
+  # Old style still works but allows to set more arguments about data loaders
+  data = dict(
+      samples_per_gpu=64,  # only works for train_dataloader
+      workers_per_gpu=4,  # only works for train_dataloader
+      train=dict(type='xxx', ...),
+      val=dict(type='xxx', ...),
+      test=dict(type='xxx', ...),
+      # Use different batch size during inference.
+      val_dataloader=dict(samples_per_gpu=8, workers_per_gpu=2),
+      test_dataloader=dict(samples_per_gpu=8, workers_per_gpu=2),
+  )
+  ```
    + +- Support memory profile hook. Users can use it to monitor the memory usages during training as below (#7560) + + ```python + custom_hooks = [ + dict(type='MemoryProfilerHook', interval=50) + ] + ``` + +- Support to run on PyTorch with MLU chip (#7578) + +- Support re-spliting data batch with tag (#7641) + +- Support the `DiceCost` used by [K-Net](https://arxiv.org/abs/2106.14855) in `MaskHungarianAssigner` (#7716) + +- Support splitting COCO data for Semi-supervised object detection (#7431) + +- Support Pathlib for Config.fromfile (#7685) + +- Support to use file client in OpenImages dataset (#7433) + +- Add a probability parameter to Mosaic transformation (#7371) + +- Support specifying interpolation mode in `Resize` pipeline (#7585) + +#### Bug Fixes + +- Avoid invalid bbox after deform_sampling (#7567) +- Fix the issue that argument color_theme does not take effect when exporting confusion matrix (#7701) +- Fix the `end_level` in Necks, which should be the index of the end input backbone level (#7502) +- Fix the bug that `mix_results` may be None in `MultiImageMixDataset` (#7530) +- Fix the bug in ResNet plugin when two plugins are used (#7797) + +#### Improvements + +- Enhance `load_json_logs` of analyze_logs.py for resumed training logs (#7732) +- Add argument `out_file` in image_demo.py (#7676) +- Allow mixed precision training with `SimOTAAssigner` (#7516) +- Updated INF to 100000.0 to be the same as that in the official YOLOX (#7778) +- Add documentations of: + - how to get channels of a new backbone (#7642) + - how to unfreeze the backbone network (#7570) + - how to train fast_rcnn model (#7549) + - proposals in Deformable DETR (#7690) + - from-scratch install script in get_started.md (#7575) +- Release pre-trained models of + - [Mask2Former](configs/mask2former) (#7595, #7709) + - RetinaNet with ResNet-18 and release models (#7387) + - RetinaNet with EfficientNet backbone (#7646) + +#### Contributors + +A total of 27 developers contributed to this release. +Thanks @jovialio, @zhangsanfeng2022, @HarryZJ, @jamiechoi1995, @nestiank, @PeterH0323, @RangeKing, @Y-M-Y, @mattcasey02, @weiji14, @Yulv-git, @xiefeifeihu, @FANG-MING, @meng976537406, @nijkah, @sudz123, @CCODING04, @SheffieldCao, @Czm369, @BIGWangYuDong, @zytx121, @jbwang1997, @chhluo, @jshilong, @RangiLyu, @hhaAndroid, @ZwwWayne + +### v2.23.0 (28/3/2022) + +#### Highlights + +- Support Mask2Former: [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) +- Support EfficientNet: [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) +- Support setting data root through environment variable `MMDET_DATASETS`, users don't have to modify the corresponding path in config files anymore. +- Find a good recipe for fine-tuning high precision ResNet backbone pre-trained by Torchvision. + +#### New Features + +- Support [Mask2Former](configs/mask2former)(#6938)(#7466)(#7471) +- Support [EfficientNet](configs/efficientnet) (#7514) +- Support setting data root through environment variable `MMDET_DATASETS`, users don't have to modify the corresponding path in config files anymore. 
(#7386) +- Support setting different seeds to different ranks (#7432) +- Update the `dist_train.sh` so that the script can be used to support launching multi-node training on machines without slurm (#7415) +- Find a good recipe for fine-tuning high precision ResNet backbone pre-trained by Torchvision (#7489) + +#### Bug Fixes + +- Fix bug in VOC unit test which removes the data directory (#7270) +- Adjust the order of `get_classes` and `FileClient` (#7276) +- Force the inputs of `get_bboxes` in yolox_head to float32 (#7324) +- Fix misplaced arguments in LoadPanopticAnnotations (#7388) +- Fix reduction=mean in CELoss. (#7449) +- Update unit test of CrossEntropyCost (#7537) +- Fix memory leaking in panpotic segmentation evaluation (#7538) +- Fix the bug of shape broadcast in YOLOv3 (#7551) + +#### Improvements + +- Add Chinese version of onnx2tensorrt.md (#7219) +- Update colab tutorials (#7310) +- Update information about Localization Distillation (#7350) +- Add Chinese version of `finetune.md` (#7178) +- Update YOLOX log for non square input (#7235) +- Add `nproc` in `coco_panoptic.py` for panoptic quality computing (#7315) +- Allow to set channel_order in LoadImageFromFile (#7258) +- Take point sample related functions out of mask_point_head (#7353) +- Add instance evaluation for coco_panoptic (#7313) +- Enhance the robustness of analyze_logs.py (#7407) +- Supplementary notes of sync_random_seed (#7440) +- Update docstring of cross entropy loss (#7472) +- Update pascal voc result (#7503) +- We create How-to documentation to record any questions about How to xxx. In this version, we added + - How to use Mosaic augmentation (#7507) + - How to use backbone in mmcls (#7438) + - How to produce and submit the prediction results of panoptic segmentation models on COCO test-dev set (#7430)) + +#### Contributors + +A total of 27 developers contributed to this release. 
+Thanks @ZwwWayne, @haofanwang, @shinya7y, @chhluo, @yangrisheng, @triple-Mu, @jbwang1997, @HikariTJU, @imflash217, @274869388, @zytx121, @matrixgame2018, @jamiechoi1995, @BIGWangYuDong, @JingweiZhang12, @Xiangxu-0103, @hhaAndroid, @jshilong, @osbm, @ceroytres, @bunge-bedstraw-herb, @Youth-Got, @daavoo, @jiangyitong, @RangiLyu, @CCODING04, @yarkable + +### v2.22.0 (24/2/2022) + +#### Highlights + +- Support MaskFormer: [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) (#7212) +- Support DyHead: [Dynamic Head: Unifying Object Detection Heads with Attentions](https://arxiv.org/abs/2106.08322) (#6823) +- Release a good recipe of using ResNet in object detectors pre-trained by [ResNet Strikes Back](https://arxiv.org/abs/2110.00476), which consistently brings about 3~4 mAP improvements over RetinaNet, Faster/Mask/Cascade Mask R-CNN (#7001) +- Support [Open Images Dataset](https://storage.googleapis.com/openimages/web/index.html) (#6331) +- Support TIMM backbone: [PyTorch Image Models](https://github.com/rwightman/pytorch-image-models) (#7020) + +#### New Features + +- Support [MaskFormer](configs/maskformer) (#7212) +- Support [DyHead](configs/dyhead) (#6823) +- Support [ResNet Strikes Back](configs/resnet_strikes_back) (#7001) +- Support [OpenImages Dataset](configs/openimages) (#6331) +- Support [TIMM backbone](configs/timm_example) (#7020) +- Support visualization for Panoptic Segmentation (#7041) + +#### Breaking Changes + +In order to support the visualization for Panoptic Segmentation, the `num_classes` can not be `None` when using the `get_palette` function to determine whether to use the panoptic palette. + +#### Bug Fixes + +- Fix bug for the best checkpoints can not be saved when the `key_score` is None (#7101) +- Fix MixUp transform filter boxes failing case (#7080) +- Add missing properties in SABLHead (#7091) +- Fix bug when NaNs exist in confusion matrix (#7147) +- Fix PALETTE AttributeError in downstream task (#7230) + +#### Improvements + +- Speed up SimOTA matching (#7098) +- Add Chinese translation of `docs_zh-CN/tutorials/init_cfg.md` (#7188) + +#### Contributors + +A total of 20 developers contributed to this release. +Thanks @ZwwWayne, @hhaAndroid, @RangiLyu, @AronLin, @BIGWangYuDong, @jbwang1997, @zytx121, @chhluo, @shinya7y, @LuooChen, @dvansa, @siatwangmin, @del-zhenwu, @vikashranjan26, @haofanwang, @jamiechoi1995, @HJoonKwon, @yarkable, @zhijian-liu, @RangeKing + +### v2.21.0 (8/2/2022) + +### Breaking Changes + +To standardize the contents in config READMEs and meta files of OpenMMLab projects, the READMEs and meta files in each config directory have been significantly changed. The template will be released in the future, for now, you can refer to the examples of README for [algorithm](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/README.md), [dataset](https://github.com/open-mmlab/mmdetection/blob/master/configs/deepfashion/README.md) and [backbone](https://github.com/open-mmlab/mmdetection/blob/master/configs/regnet/README.md). To align with the standard, the configs in dcn are put into to two directories named `dcn` and `dcnv2`. 
+ +#### New Features + +- Allow to customize colors of different classes during visualization (#6716) +- Support CPU training (#7016) +- Add download script of COCO, LVIS, and VOC dataset (#7015) + +#### Bug Fixes + +- Fix weight conversion issue of RetinaNet with Swin-S (#6973) +- Update `__repr__` of `Compose` (#6951) +- Fix BadZipFile Error when build docker (#6966) +- Fix bug in non-distributed multi-gpu training/testing (#7019) +- Fix bbox clamp in PyTorch 1.10 (#7074) +- Relax the requirement of PALETTE in dataset wrappers (#7085) +- Keep the same weights before reassign in the PAA head (#7032) +- Update code demo in doc (#7092) + +#### Improvements + +- Speed-up training by allow to set variables of multi-processing (#6974, #7036) +- Add links of Chinese tutorials in readme (#6897) +- Disable cv2 multiprocessing by default for acceleration (#6867) +- Deprecate the support for "python setup.py test" (#6998) +- Re-organize metafiles and config readmes (#7051) +- Fix None grad problem during training TOOD by adding `SigmoidGeometricMean` (#7090) + +#### Contributors + +A total of 26 developers contributed to this release. +Thanks @del-zhenwu, @zimoqingfeng, @srishilesh, @imyhxy, @jenhaoyang, @jliu-ac, @kimnamu, @ShengliLiu, @garvan2021, @ciusji, @DIYer22, @kimnamu, @q3394101, @zhouzaida, @gaotongxiao, @topsy404, @AntoAndGar, @jbwang1997, @nijkah, @ZwwWayne, @Czm369, @jshilong, @RangiLyu, @BIGWangYuDong, @hhaAndroid, @AronLin + +### v2.20.0 (27/12/2021) + +#### New Features + +- Support [TOOD](configs/tood/README.md): Task-aligned One-stage Object Detection (ICCV 2021 Oral) (#6746) +- Support resuming from the latest checkpoint automatically (#6727) + +#### Bug Fixes + +- Fix wrong bbox `loss_weight` of the PAA head (#6744) +- Fix the padding value of `gt_semantic_seg` in batch collating (#6837) +- Fix test error of lvis when using `classwise` (#6845) +- Avoid BC-breaking of `get_local_path` (#6719) +- Fix bug in `sync_norm_hook` when the BN layer does not exist (#6852) +- Use pycocotools directly no matter what platform it is (#6838) + +#### Improvements + +- Add unit test for SimOTA with no valid bbox (#6770) +- Use precommit to check readme (#6802) +- Support selecting GPU-ids in non-distributed testing time (#6781) + +#### Contributors + +A total of 16 developers contributed to this release. +Thanks @ZwwWayne, @Czm369, @jshilong, @RangiLyu, @BIGWangYuDong, @hhaAndroid, @jamiechoi1995, @AronLin, @Keiku, @gkagkos, @fcakyon, @www516717402, @vansin, @zactodd, @kimnamu, @jenhaoyang + +### v2.19.1 (14/12/2021) + +#### New Features + +- Release [YOLOX](configs/yolox/README.md) COCO pretrained models (#6698) + +#### Bug Fixes + +- Fix DCN initialization in DenseHead (#6625) +- Fix initialization of ConvFCHead (#6624) +- Fix PseudoSampler in RCNN (#6622) +- Fix weight initialization in Swin and PVT (#6663) +- Fix dtype bug in BaseDenseHead (#6767) +- Fix SimOTA with no valid bbox (#6733) + +#### Improvements + +- Add an example of combining swin and one-stage models (#6621) +- Add `get_ann_info` to dataset_wrappers (#6526) +- Support keeping image ratio in the multi-scale training of YOLOX (#6732) +- Support `bbox_clip_border` for the augmentations of YOLOX (#6730) + +#### Documents + +- Update metafile (#6717) +- Add mmhuman3d in readme (#6699) +- Update FAQ docs (#6587) +- Add doc for `detect_anomalous_params` (#6697) + +#### Contributors + +A total of 11 developers contributed to this release. 
+Thanks @ZwwWayne, @LJoson, @Czm369, @jshilong, @ZCMax, @RangiLyu, @BIGWangYuDong, @hhaAndroid, @zhaoxin111, @GT9505, @shinya7y + +### v2.19.0 (29/11/2021) + +#### Highlights + +- Support [Label Assignment Distillation](https://arxiv.org/abs/2108.10520) +- Support `persistent_workers` for Pytorch >= 1.7 +- Align accuracy to the updated official YOLOX + +#### New Features + +- Support [Label Assignment Distillation](https://arxiv.org/abs/2108.10520) (#6342) +- Support `persistent_workers` for Pytorch >= 1.7 (#6435) + +#### Bug Fixes + +- Fix repeatedly output warning message (#6584) +- Avoid infinite GPU waiting in dist training (#6501) +- Fix SSD512 config error (#6574) +- Fix MMDetection model to ONNX command (#6558) + +#### Improvements + +- Refactor configs of FP16 models (#6592) +- Align accuracy to the updated official YOLOX (#6443) +- Speed up training and reduce memory cost when using PhotoMetricDistortion. (#6442) +- Make OHEM work with seesaw loss (#6514) + +#### Documents + +- Update README.md (#6567) + +#### Contributors + +A total of 11 developers contributed to this release. +Thanks @FloydHsiu, @RangiLyu, @ZwwWayne, @AndreaPi, @st9007a, @hachreak, @BIGWangYuDong, @hhaAndroid, @AronLin, @chhluo, @vealocia, @HarborYuan, @st9007a, @jshilong + +### v2.18.1 (15/11/2021) + +#### Highlights + +- Release [QueryInst](http://arxiv.org/abs/2105.01928) pre-trained weights (#6460) +- Support plot confusion matrix (#6344) + +#### New Features + +- Release [QueryInst](http://arxiv.org/abs/2105.01928) pre-trained weights (#6460) +- Support plot confusion matrix (#6344) + +#### Bug Fixes + +- Fix aug test error when the number of prediction bboxes is 0 (#6398) +- Fix SpatialReductionAttention in PVT (#6488) +- Fix wrong use of `trunc_normal_init` in PVT and Swin-Transformer (#6432) + +#### Improvements + +- Save the printed AP information of COCO API to logger (#6505) +- Always map location to cpu when load checkpoint (#6405) +- Set a random seed when the user does not set a seed (#6457) + +#### Documents + +- Chinese version of [Corruption Benchmarking](robustness_benchmarking.md) (#6375) +- Fix config path in docs (#6396) +- Update GRoIE readme (#6401) + +#### Contributors + +A total of 11 developers contributed to this release. 
+Thanks @st9007a, @hachreak, @HarborYuan, @vealocia, @chhluo, @AndreaPi, @AronLin, @BIGWangYuDong, @hhaAndroid, @RangiLyu, @ZwwWayne + +### v2.18.0 (27/10/2021) + +#### Highlights + +- Support [QueryInst](http://arxiv.org/abs/2105.01928) (#6050) +- Refactor dense heads to decouple onnx export logics from `get_bboxes` and speed up inference (#5317, #6003, #6369, #6268, #6315) + +#### New Features + +- Support [QueryInst](http://arxiv.org/abs/2105.01928) (#6050) +- Support infinite sampler (#5996) + +#### Bug Fixes + +- Fix init_weight in fcn_mask_head (#6378) +- Fix type error in imshow_bboxes of RPN (#6386) +- Fix broken colab link in MMDetection Tutorial (#6382) +- Make sure the device and dtype of scale_factor are the same as bboxes (#6374) +- Remove sampling hardcode (#6317) +- Fix RandomAffine bbox coordinate recorrection (#6293) +- Fix init bug of final cls/reg layer in convfc head (#6279) +- Fix img_shape broken in auto_augment (#6259) +- Fix kwargs parameter missing error in two_stage (#6256) + +#### Improvements + +- Unify the interface of stuff head and panoptic head (#6308) +- Polish readme (#6243) +- Add code-spell pre-commit hook and fix a typo (#6306) +- Fix typo (#6245, #6190) +- Fix sampler unit test (#6284) +- Fix `forward_dummy` of YOLACT to enable `get_flops` (#6079) +- Fix link error in the config documentation (#6252) +- Adjust the order to beautify the document (#6195) + +#### Refactors + +- Refactor one-stage get_bboxes logic (#5317) +- Refactor ONNX export of One-Stage models (#6003, #6369) +- Refactor dense_head and speedup (#6268) +- Migrate to use prior_generator in training of dense heads (#6315) + +#### Contributors + +A total of 18 developers contributed to this release. +Thanks @Boyden, @onnkeat, @st9007a, @vealocia, @yhcao6, @DapangpangX, @yellowdolphin, @cclauss, @kennymckormick, +@pingguokiller, @collinzrj, @AndreaPi, @AronLin, @BIGWangYuDong, @hhaAndroid, @jshilong, @RangiLyu, @ZwwWayne + +### v2.17.0 (28/9/2021) + +#### Highlights + +- Support [PVT](https://arxiv.org/abs/2102.12122) and [PVTv2](https://arxiv.org/abs/2106.13797) +- Support [SOLO](https://arxiv.org/abs/1912.04488) +- Support large scale jittering and New Mask R-CNN baselines +- Speed up `YOLOv3` inference + +#### New Features + +- Support [PVT](https://arxiv.org/abs/2102.12122) and [PVTv2](https://arxiv.org/abs/2106.13797) (#5780) +- Support [SOLO](https://arxiv.org/abs/1912.04488) (#5832) +- Support large scale jittering and New Mask R-CNN baselines (#6132) +- Add a general data structure for the results of models (#5508) +- Added a base class for one-stage instance segmentation (#5904) +- Speed up `YOLOv3` inference (#5991) +- Release Swin Transformer pre-trained models (#6100) +- Support mixed precision training in `YOLOX` (#5983) +- Support `val` workflow in `YOLACT` (#5986) +- Add script to test `torchserve` (#5936) +- Support `onnxsim` with dynamic input shape (#6117) + +#### Bug Fixes + +- Fix the function naming errors in `model_wrappers` (#5975) +- Fix regression loss bug when the input is an empty tensor (#5976) +- Fix scores not contiguous error in `centernet_head` (#6016) +- Fix missing parameters bug in `imshow_bboxes` (#6034) +- Fix bug in `aug_test` of `HTC` when the length of `det_bboxes` is 0 (#6088) +- Fix empty proposal errors in the training of some two-stage models (#5941) +- Fix `dynamic_axes` parameter error in `ONNX` dynamic shape export (#6104) +- Fix `dynamic_shape` bug of `SyncRandomSizeHook` (#6144) +- Fix the Swin Transformer config link error in the 
configuration (#6172) + +#### Improvements + +- Add filter rules in `Mosaic` transform (#5897) +- Add size divisor in get flops to avoid some potential bugs (#6076) +- Add Chinese translation of `docs_zh-CN/tutorials/customize_dataset.md` (#5915) +- Add Chinese translation of `conventions.md` (#5825) +- Add description of the output of data pipeline (#5886) +- Add dataset information in the README file for `PanopticFPN` (#5996) +- Add `extra_repr` for `DropBlock` layer to get details in the model printing (#6140) +- Fix CI out of memory and add PyTorch1.9 Python3.9 unit tests (#5862) +- Fix download links error of some model (#6069) +- Improve the generalization of XML dataset (#5943) +- Polish assertion error messages (#6017) +- Remove `opencv-python-headless` dependency by `albumentations` (#5868) +- Check dtype in transform unit tests (#5969) +- Replace the default theme of documentation with PyTorch Sphinx Theme (#6146) +- Update the paper and code fields in the metafile (#6043) +- Support to customize padding value of segmentation map (#6152) +- Support to resize multiple segmentation maps (#5747) + +#### Contributors + +A total of 24 developers contributed to this release. +Thanks @morkovka1337, @HarborYuan, @guillaumefrd, @guigarfr, @www516717402, @gaotongxiao, @ypwhs, @MartaYang, @shinya7y, @justiceeem, @zhaojinjian0000, @VVsssssk, @aravind-anantha, @wangbo-zhao, @czczup, @whai362, @czczup, @marijnl, @AronLin, @BIGWangYuDong, @hhaAndroid, @jshilong, @RangiLyu, @ZwwWayne + +### v2.16.0 (30/8/2021) + +#### Highlights + +- Support [Panoptic FPN](https://arxiv.org/abs/1901.02446) and [Swin Transformer](https://arxiv.org/abs/2103.14030) + +#### New Features + +- Support [Panoptic FPN](https://arxiv.org/abs/1901.02446) and release models (#5577, #5902) +- Support Swin Transformer backbone (#5748) +- Release RetinaNet models pre-trained with multi-scale 3x schedule (#5636) +- Add script to convert unlabeled image list to coco format (#5643) +- Add hook to check whether the loss value is valid (#5674) +- Add YOLO anchor optimizing tool (#5644) +- Support export onnx models without post process. (#5851) +- Support classwise evaluation in CocoPanopticDataset (#5896) +- Adapt browse_dataset for concatenated datasets. (#5935) +- Add `PatchEmbed` and `PatchMerging` with `AdaptivePadding` (#5952) + +#### Bug Fixes + +- Fix unit tests of YOLOX (#5859) +- Fix lose randomness in `imshow_det_bboxes` (#5845) +- Make output result of `ImageToTensor` contiguous (#5756) +- Fix inference bug when calling `regress_by_class` in RoIHead in some cases (#5884) +- Fix bug in CIoU loss where alpha should not have gradient. (#5835) +- Fix the bug that `multiscale_output` is defined but not used in HRNet (#5887) +- Set the priority of EvalHook to LOW. (#5882) +- Fix a YOLOX bug when applying bbox rescaling in test mode (#5899) +- Fix mosaic coordinate error (#5947) +- Fix dtype of bbox in RandomAffine. (#5930) + +#### Improvements + +- Add Chinese version of `data_pipeline` and (#5662) +- Support to remove state dicts of EMA when publishing models. (#5858) +- Refactor the loss function in HTC and SCNet (#5881) +- Use warnings instead of logger.warning (#5540) +- Use legacy coordinate in metric of VOC (#5627) +- Add Chinese version of customize_losses (#5826) +- Add Chinese version of model_zoo (#5827) + +#### Contributors + +A total of 19 developers contributed to this release. 
+Thanks @ypwhs, @zywvvd, @collinzrj, @OceanPang, @ddonatien, @@haotian-liu, @viibridges, @Muyun99, @guigarfr, @zhaojinjian0000, @jbwang1997,@wangbo-zhao, @xvjiarui, @RangiLyu, @jshilong, @AronLin, @BIGWangYuDong, @hhaAndroid, @ZwwWayne + +### v2.15.1 (11/8/2021) + +#### Highlights + +- Support [YOLOX](https://arxiv.org/abs/2107.08430) + +#### New Features + +- Support [YOLOX](https://arxiv.org/abs/2107.08430)(#5756, #5758, #5760, #5767, #5770, #5774, #5777, #5808, #5828, #5848) + +#### Bug Fixes + +- Update correct SSD models. (#5789) +- Fix casting error in mask structure (#5820) +- Fix MMCV deployment documentation links. (#5790) + +#### Improvements + +- Use dynamic MMCV download link in TorchServe dockerfile (#5779) +- Rename the function `upsample_like` to `interpolate_as` for more general usage (#5788) + +#### Contributors + +A total of 14 developers contributed to this release. +Thanks @HAOCHENYE, @xiaohu2015, @HsLOL, @zhiqwang, @Adamdad, @shinya7y, @Johnson-Wang, @RangiLyu, @jshilong, @mmeendez8, @AronLin, @BIGWangYuDong, @hhaAndroid, @ZwwWayne + +### v2.15.0 (02/8/2021) + +#### Highlights + +- Support adding [MIM](https://github.com/open-mmlab/mim) dependencies during pip installation +- Support MobileNetV2 for SSD-Lite and YOLOv3 +- Support Chinese Documentation + +#### New Features + +- Add function `upsample_like` (#5732) +- Support to output pdf and epub format documentation (#5738) +- Support and release Cascade Mask R-CNN 3x pre-trained models (#5645) +- Add `ignore_index` to CrossEntropyLoss (#5646) +- Support adding [MIM](https://github.com/open-mmlab/mim) dependencies during pip installation (#5676) +- Add MobileNetV2 config and models for YOLOv3 (#5510) +- Support COCO Panoptic Dataset (#5231) +- Support ONNX export of cascade models (#5486) +- Support DropBlock with RetinaNet (#5544) +- Support MobileNetV2 SSD-Lite (#5526) + +#### Bug Fixes + +- Fix the device of label in multiclass_nms (#5673) +- Fix error of backbone initialization from pre-trained checkpoint in config file (#5603, #5550) +- Fix download links of RegNet pretrained weights (#5655) +- Fix two-stage runtime error given empty proposal (#5559) +- Fix flops count error in DETR (#5654) +- Fix unittest for `NumClassCheckHook` when it is not used. (#5626) +- Fix description bug of using custom dataset (#5546) +- Fix bug of `multiclass_nms` that returns the global indices (#5592) +- Fix `valid_mask` logic error in RPNHead (#5562) +- Fix unit test error of pretrained configs (#5561) +- Fix typo error in anchor_head.py (#5555) +- Fix bug when using dataset wrappers (#5552) +- Fix a typo error in demo/MMDet_Tutorial.ipynb (#5511) +- Fixing crash in `get_root_logger` when `cfg.log_level` is not None (#5521) +- Fix docker version (#5502) +- Fix optimizer parameter error when using `IterBasedRunner` (#5490) + +#### Improvements + +- Add unit tests for MMTracking (#5620) +- Add Chinese translation of documentation (#5718, #5618, #5558, #5423, #5593, #5421, #5408. #5369, #5419, #5530, #5531) +- Update resource limit (#5697) +- Update docstring for InstaBoost (#5640) +- Support key `reduction_override` in all loss functions (#5515) +- Use repeatdataset to accelerate CenterNet training (#5509) +- Remove unnecessary code in autoassign (#5519) +- Add documentation about `init_cfg` (#5273) + +#### Contributors + +A total of 18 developers contributed to this release. 
+Thanks @OceanPang, @AronLin, @hellock, @Outsider565, @RangiLyu, @ElectronicElephant, @likyoo, @BIGWangYuDong, @hhaAndroid, @noobying, @yyz561, @likyoo, +@zeakey, @ZwwWayne, @ChenyangLiu, @johnson-magic, @qingswu, @BuxianChen + +### v2.14.0 (29/6/2021) + +#### Highlights + +- Add `simple_test` to dense heads to improve the consistency of single-stage and two-stage detectors +- Revert the `test_mixins` to single image test to improve efficiency and readability +- Add Faster R-CNN and Mask R-CNN config using multi-scale training with 3x schedule + +#### New Features + +- Support pretrained models from MoCo v2 and SwAV (#5286) +- Add Faster R-CNN and Mask R-CNN config using multi-scale training with 3x schedule (#5179, #5233) +- Add `reduction_override` in MSELoss (#5437) +- Stable support of exporting DETR to ONNX with dynamic shapes and batch inference (#5168) +- Stable support of exporting PointRend to ONNX with dynamic shapes and batch inference (#5440) + +#### Bug Fixes + +- Fix size mismatch bug in `multiclass_nms` (#4980) +- Fix the import path of `MultiScaleDeformableAttention` (#5338) +- Fix errors in config of GCNet ResNext101 models (#5360) +- Fix Grid-RCNN error when there is no bbox result (#5357) +- Fix errors in `onnx_export` of bbox_head when setting reg_class_agnostic (#5468) +- Fix type error of AutoAssign in the document (#5478) +- Fix web links ending with `.md` (#5315) + +#### Improvements + +- Add `simple_test` to dense heads to improve the consistency of single-stage and two-stage detectors (#5264) +- Add support for mask diagonal flip in TTA (#5403) +- Revert the `test_mixins` to single image test to improve efficiency and readability (#5249) +- Make YOLOv3 Neck more flexible (#5218) +- Refactor SSD to make it more general (#5291) +- Refactor `anchor_generator` and `point_generator` (#5349) +- Allow to configure out the `mask_head` of the HTC algorithm (#5389) +- Delete deprecated warning in FPN (#5311) +- Move `model.pretrained` to `model.backbone.init_cfg` (#5370) +- Make deployment tools more friendly to use (#5280) +- Clarify installation documentation (#5316) +- Add ImageNet Pretrained Models docs (#5268) +- Add FAQ about training loss=nan solution and COCO AP or AR =-1 (# 5312, #5313) +- Change all weight links of http to https (#5328) + +### v2.13.0 (01/6/2021) + +#### Highlights + +- Support new methods: [CenterNet](https://arxiv.org/abs/1904.07850), [Seesaw Loss](https://arxiv.org/abs/2008.10032), [MobileNetV2](https://arxiv.org/abs/1801.04381) + +#### New Features + +- Support paper [Objects as Points](https://arxiv.org/abs/1904.07850) (#4602) +- Support paper [Seesaw Loss for Long-Tailed Instance Segmentation (CVPR 2021)](https://arxiv.org/abs/2008.10032) (#5128) +- Support [MobileNetV2](https://arxiv.org/abs/1801.04381) backbone and inverted residual block (#5122) +- Support [MIM](https://github.com/open-mmlab/mim) (#5143) +- ONNX exportation with dynamic shapes of CornerNet (#5136) +- Add `mask_soft` config option to allow non-binary masks (#4615) +- Add PWC metafile (#5135) + +#### Bug Fixes + +- Fix YOLOv3 FP16 training error (#5172) +- Fix Cacscade R-CNN TTA test error when `det_bboxes` length is 0 (#5221) +- Fix `iou_thr` variable naming errors in VOC recall calculation function (#5195) +- Fix Faster R-CNN performance dropped in ONNX Runtime (#5197) +- Fix DETR dict changed error when using python 3.8 during iteration (#5226) + +#### Improvements + +- Refactor ONNX export of two stage detector (#5205) +- Replace MMDetection's EvalHook with MMCV's 
EvalHook for consistency (#4806) +- Update RoI extractor for ONNX (#5194) +- Use better parameter initialization in YOLOv3 head for higher performance (#5181) +- Release new DCN models of Mask R-CNN by mixed-precision training (#5201) +- Update YOLOv3 model weights (#5229) +- Add DetectoRS ResNet-101 model weights (#4960) +- Discard bboxes with sizes equals to `min_bbox_size` (#5011) +- Remove duplicated code in DETR head (#5129) +- Remove unnecessary object in class definition (#5180) +- Fix doc link (#5192) + +### v2.12.0 (01/5/2021) + +#### Highlights + +- Support new methods: [AutoAssign](https://arxiv.org/abs/2007.03496), [YOLOF](https://arxiv.org/abs/2103.09460), and [Deformable DETR](https://arxiv.org/abs/2010.04159) +- Stable support of exporting models to ONNX with batched images and dynamic shape (#5039) + +#### Backwards Incompatible Changes + +MMDetection is going through big refactoring for more general and convenient usages during the releases from v2.12.0 to v2.15.0 (maybe longer). +In v2.12.0 MMDetection inevitably brings some BC-breakings, including the MMCV dependency, model initialization, model registry, and mask AP evaluation. + +- MMCV version. MMDetection v2.12.0 relies on the newest features in MMCV 1.3.3, including `BaseModule` for unified parameter initialization, model registry, and the CUDA operator `MultiScaleDeformableAttn` for [Deformable DETR](https://arxiv.org/abs/2010.04159). Note that MMCV 1.3.2 already contains all the features used by MMDet but has known issues. Therefore, we recommend users skip MMCV v1.3.2 and use v1.3.3, though v1.3.2 might work for most cases. +- Unified model initialization (#4750). To unify the parameter initialization in OpenMMLab projects, MMCV supports `BaseModule` that accepts `init_cfg` to allow the modules' parameters initialized in a flexible and unified manner. Now the users need to explicitly call `model.init_weights()` in the training script to initialize the model (as in [here](https://github.com/open-mmlab/mmdetection/blob/master/tools/train.py#L162), previously this was handled by the detector. The models in MMDetection have been re-benchmarked to ensure accuracy based on PR #4750. __The downstream projects should update their code accordingly to use MMDetection v2.12.0__. +- Unified model registry (#5059). To easily use backbones implemented in other OpenMMLab projects, MMDetection migrates to inherit the model registry created in MMCV (#760). In this way, as long as the backbone is supported in an OpenMMLab project and that project also uses the registry in MMCV, users can use that backbone in MMDetection by simply modifying the config without copying the code of that backbone into MMDetection. +- Mask AP evaluation (#4898). Previous versions calculate the areas of masks through the bounding boxes when calculating the mask AP of small, medium, and large instances. To indeed use the areas of masks, we pop the key `bbox` during mask AP calculation. This change does not affect the overall mask AP evaluation and aligns the mask AP of similar models in other projects like Detectron2. 
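+
+For downstream training scripts, the unified initialization above amounts to one extra call after the detector is built. A minimal sketch, assuming v2.12.0-style APIs (the config path is only illustrative):
+
+```python
+from mmcv import Config
+from mmdet.models import build_detector
+
+cfg = Config.fromfile('configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py')  # illustrative config
+model = build_detector(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg'))
+model.init_weights()  # since v2.12.0 this must be called explicitly; previously done inside the detector
+```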
+ +#### New Features + +- Support paper [AutoAssign: Differentiable Label Assignment for Dense Object Detection](https://arxiv.org/abs/2007.03496) (#4295) +- Support paper [You Only Look One-level Feature](https://arxiv.org/abs/2103.09460) (#4295) +- Support paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) (#4778) +- Support calculating IoU with FP16 tensor in `bbox_overlaps` to save memory and keep speed (#4889) +- Add `__repr__` in custom dataset to count the number of instances (#4756) +- Add windows support by updating requirements.txt (#5052) +- Stable support of exporting models to ONNX with batched images and dynamic shape, including SSD, FSAF,FCOS, YOLOv3, RetinaNet, Faster R-CNN, and Mask R-CNN (#5039) + +#### Improvements + +- Use MMCV `MODEL_REGISTRY` (#5059) +- Unified parameter initialization for more flexible usage (#4750) +- Rename variable names and fix docstring in anchor head (#4883) +- Support training with empty GT in Cascade RPN (#4928) +- Add more details of usage of `test_robustness` in documentation (#4917) +- Changing to use `pycocotools` instead of `mmpycocotools` to fully support Detectron2 and MMDetection in one environment (#4939) +- Update torch serve dockerfile to support dockers of more versions (#4954) +- Add check for training with single class dataset (#4973) +- Refactor transformer and DETR Head (#4763) +- Update FPG model zoo (#5079) +- More accurate mask AP of small/medium/large instances (#4898) + +#### Bug Fixes + +- Fix bug in mean_ap.py when calculating mAP by 11 points (#4875) +- Fix error when key `meta` is not in old checkpoints (#4936) +- Fix hanging bug when training with empty GT in VFNet, GFL, and FCOS by changing the place of `reduce_mean` (#4923, #4978, #5058) +- Fix asyncronized inference error and provide related demo (#4941) +- Fix IoU losses dimensionality unmatch error (#4982) +- Fix torch.randperm whtn using PyTorch 1.8 (#5014) +- Fix empty bbox error in `mask_head` when using CARAFE (#5062) +- Fix `supplement_mask` bug when there are zero-size RoIs (#5065) +- Fix testing with empty rois in RoI Heads (#5081) + +### v2.11.0 (01/4/2021) + +__Highlights__ + +- Support new method: [Localization Distillation for Object Detection](https://arxiv.org/pdf/2102.12252.pdf) +- Support Pytorch2ONNX with batch inference and dynamic shape + +__New Features__ + +- Support [Localization Distillation for Object Detection](https://arxiv.org/pdf/2102.12252.pdf) (#4758) +- Support Pytorch2ONNX with batch inference and dynamic shape for Faster-RCNN and mainstream one-stage detectors (#4796) + +__Improvements__ + +- Support batch inference in head of RetinaNet (#4699) +- Add batch dimension in second stage of Faster-RCNN (#4785) +- Support batch inference in bbox coder (#4721) +- Add check for `ann_ids` in `COCODataset` to ensure it is unique (#4789) +- support for showing the FPN results (#4716) +- support dynamic shape for grid_anchor (#4684) +- Move pycocotools version check to when it is used (#4880) + +__Bug Fixes__ + +- Fix a bug of TridentNet when doing the batch inference (#4717) +- Fix a bug of Pytorch2ONNX in FASF (#4735) +- Fix a bug when show the image with float type (#4732) + +### v2.10.0 (01/03/2021) + +#### Highlights + +- Support new methods: [FPG](https://arxiv.org/abs/2004.03580) +- Support ONNX2TensorRT for SSD, FSAF, FCOS, YOLOv3, and Faster R-CNN. 
+ +#### New Features + +- Support ONNX2TensorRT for SSD, FSAF, FCOS, YOLOv3, and Faster R-CNN (#4569) +- Support [Feature Pyramid Grids (FPG)](https://arxiv.org/abs/2004.03580) (#4645) +- Support video demo (#4420) +- Add seed option for sampler (#4665) +- Support to customize type of runner (#4570, #4669) +- Support synchronizing BN buffer in `EvalHook` (#4582) +- Add script for GIF demo (#4573) + +#### Bug Fixes + +- Fix ConfigDict AttributeError and add Colab link (#4643) +- Avoid crash in empty gt training of GFL head (#4631) +- Fix `iou_thrs` bug in RPN evaluation (#4581) +- Fix syntax error of config when upgrading model version (#4584) + +#### Improvements + +- Refactor unit test file structures (#4600) +- Refactor nms config (#4636) +- Get loading pipeline by checking the class directly rather than through config strings (#4619) +- Add doctests for mask target generation and mask structures (#4614) +- Use deep copy when copying pipeline arguments (#4621) +- Update documentations (#4642, #4650, #4620, #4630) +- Remove redundant code calling `import_modules_from_strings` (#4601) +- Clean deprecated FP16 API (#4571) +- Check whether `CLASSES` is correctly initialized in the initialization of `XMLDataset` (#4555) +- Support batch inference in the inference API (#4462, #4526) +- Clean deprecated warning and fix 'meta' error (#4695) + +### v2.9.0 (01/02/2021) + +#### Highlights + +- Support new methods: [SCNet](https://arxiv.org/abs/2012.10150), [Sparse R-CNN](https://arxiv.org/abs/2011.12450) +- Move `train_cfg` and `test_cfg` into model in configs +- Support to visualize results based on prediction quality + +#### New Features + +- Support [SCNet](https://arxiv.org/abs/2012.10150) (#4356) +- Support [Sparse R-CNN](https://arxiv.org/abs/2011.12450) (#4219) +- Support evaluate mAP by multiple IoUs (#4398) +- Support concatenate dataset for testing (#4452) +- Support to visualize results based on prediction quality (#4441) +- Add ONNX simplify option to Pytorch2ONNX script (#4468) +- Add hook for checking compatibility of class numbers in heads and datasets (#4508) + +#### Bug Fixes + +- Fix CPU inference bug of Cascade RPN (#4410) +- Fix NMS error of CornerNet when there is no prediction box (#4409) +- Fix TypeError in CornerNet inference (#4411) +- Fix bug of PAA when training with background images (#4391) +- Fix the error that the window data is not destroyed when `out_file is not None` and `show==False` (#4442) +- Fix order of NMS `score_factor` that will decrease the performance of YOLOv3 (#4473) +- Fix bug in HTC TTA when the number of detection boxes is 0 (#4516) +- Fix resize error in mask data structures (#4520) + +#### Improvements + +- Allow to customize classes in LVIS dataset (#4382) +- Add tutorials for building new models with existing datasets (#4396) +- Add CPU compatibility information in documentation (#4405) +- Add documentation of deprecated `ImageToTensor` for batch inference (#4408) +- Add more details in documentation for customizing dataset (#4430) +- Switch `imshow_det_bboxes` visualization backend from OpenCV to Matplotlib (#4389) +- Deprecate `ImageToTensor` in `image_demo.py` (#4400) +- Move train_cfg/test_cfg into model (#4347, #4489) +- Update docstring for `reg_decoded_bbox` option in bbox heads (#4467) +- Update dataset information in documentation (#4525) +- Release pre-trained R50 and R101 PAA detectors with multi-scale 3x training schedules (#4495) +- Add guidance for speed benchmark (#4537) + +### v2.8.0 (04/01/2021) + +#### Highlights + +- Support 
new methods: [Cascade RPN](https://arxiv.org/abs/1909.06720), [TridentNet](https://arxiv.org/abs/1901.01892) + +#### New Features + +- Support [Cascade RPN](https://arxiv.org/abs/1909.06720) (#1900) +- Support [TridentNet](https://arxiv.org/abs/1901.01892) (#3313) + +#### Bug Fixes + +- Fix bug of show result in async_benchmark (#4367) +- Fix scale factor in MaskTestMixin (#4366) +- Fix but when returning indices in `multiclass_nms` (#4362) +- Fix bug of empirical attention in resnext backbone error (#4300) +- Fix bug of `img_norm_cfg` in FCOS-HRNet models with updated performance and models (#4250) +- Fix invalid checkpoint and log in Mask R-CNN models on Cityscapes dataset (#4287) +- Fix bug in distributed sampler when dataset is too small (#4257) +- Fix bug of 'PAFPN has no attribute extra_convs_on_inputs' (#4235) + +#### Improvements + +- Update model url from aws to aliyun (#4349) +- Update ATSS for PyTorch 1.6+ (#4359) +- Update script to install ruby in pre-commit installation (#4360) +- Delete deprecated `mmdet.ops` (#4325) +- Refactor hungarian assigner for more general usage in Sparse R-CNN (#4259) +- Handle scipy import in DETR to reduce package dependencies (#4339) +- Update documentation of usages for config options after MMCV (1.2.3) supports overriding list in config (#4326) +- Update pre-train models of faster rcnn trained on COCO subsets (#4307) +- Avoid zero or too small value for beta in Dynamic R-CNN (#4303) +- Add doccumentation for Pytorch2ONNX (#4271) +- Add deprecated warning FPN arguments (#4264) +- Support returning indices of kept bboxes when using nms (#4251) +- Update type and device requirements when creating tensors `GFLHead` (#4210) +- Update device requirements when creating tensors in `CrossEntropyLoss` (#4224) + +### v2.7.0 (30/11/2020) + +- Support new method: [DETR](https://arxiv.org/abs/2005.12872), [ResNest](https://arxiv.org/abs/2004.08955), Faster R-CNN DC5. +- Support YOLO, Mask R-CNN, and Cascade R-CNN models exportable to ONNX. 
+ +#### New Features + +- Support [DETR](https://arxiv.org/abs/2005.12872) (#4201, #4206) +- Support to link the best checkpoint in training (#3773) +- Support to override config through options in inference.py (#4175) +- Support YOLO, Mask R-CNN, and Cascade R-CNN models exportable to ONNX (#4087, #4083) +- Support [ResNeSt](https://arxiv.org/abs/2004.08955) backbone (#2959) +- Support unclip border bbox regression (#4076) +- Add tpfp func in evaluating AP (#4069) +- Support mixed precision training of SSD detector with other backbones (#4081) +- Add Faster R-CNN DC5 models (#4043) + +#### Bug Fixes + +- Fix bug of `gpu_id` in distributed training mode (#4163) +- Support Albumentations with version higher than 0.5 (#4032) +- Fix num_classes bug in faster rcnn config (#4088) +- Update code in docs/2_new_data_model.md (#4041) + +#### Improvements + +- Ensure DCN offset to have similar type as features in VFNet (#4198) +- Add config links in README files of models (#4190) +- Add tutorials for loss conventions (#3818) +- Add solution to installation issues in 30-series GPUs (#4176) +- Update docker version in get_started.md (#4145) +- Add model statistics and polish some titles in configs README (#4140) +- Clamp neg probability in FreeAnchor (#4082) +- Speed up expanding large images (#4089) +- Fix Pytorch 1.7 incompatibility issues (#4103) +- Update trouble shooting page to resolve segmentation fault (#4055) +- Update aLRP-Loss in project page (#4078) +- Clean duplicated `reduce_mean` function (#4056) +- Refactor Q&A (#4045) + +### v2.6.0 (1/11/2020) + +- Support new method: [VarifocalNet](https://arxiv.org/abs/2008.13367). +- Refactored documentation with more tutorials. + +#### New Features + +- Support GIoU calculation in `BboxOverlaps2D`, and re-implement `giou_loss` using `bbox_overlaps` (#3936) +- Support random sampling in CPU mode (#3948) +- Support VarifocalNet (#3666, #4024) + +#### Bug Fixes + +- Fix SABL validating bug in Cascade R-CNN (#3913) +- Avoid division by zero in PAA head when num_pos=0 (#3938) +- Fix temporary directory bug of multi-node testing error (#4034, #4017) +- Fix `--show-dir` option in test script (#4025) +- Fix GA-RetinaNet r50 model url (#3983) +- Update code in docs and fix broken urls (#3947) + +#### Improvements + +- Refactor pytorch2onnx API into `mmdet.core.export` and use `generate_inputs_and_wrap_model` for pytorch2onnx (#3857, #3912) +- Update RPN upgrade scripts for v2.5.0 compatibility (#3986) +- Use mmcv `tensor2imgs` (#4010) +- Update test robustness (#4000) +- Update trouble shooting page (#3994) +- Accelerate PAA training speed (#3985) +- Support batch_size > 1 in validation (#3966) +- Use RoIAlign implemented in MMCV for inference in CPU mode (#3930) +- Documentation refactoring (#4031) + +### v2.5.0 (5/10/2020) + +#### Highlights + +- Support new methods: [YOLACT](https://arxiv.org/abs/1904.02689), [CentripetalNet](https://arxiv.org/abs/2003.09119). +- Add more documentations for easier and more clear usage. + +#### Backwards Incompatible Changes + +__FP16 related methods are imported from mmcv instead of mmdet. (#3766, #3822)__ +Mixed precision training utils in `mmdet.core.fp16` are moved to `mmcv.runner`, including `force_fp32`, `auto_fp16`, `wrap_fp16_model`, and `Fp16OptimizerHook`. A deprecation warning will be raised if users attempt to import those methods from `mmdet.core.fp16`, and will be finally removed in V2.10.0. + +__\[0, N-1\] represents foreground classes and N indicates background classes for all models. 
(#3221)__ +Before v2.5.0, the background label for RPN is 0, and N for other heads. Now the behavior is consistent for all models. Thus `self.background_labels` in `dense_heads` is removed and all heads use `self.num_classes` to indicate the class index of background labels. +This change has no effect on the pre-trained models in the v2.x model zoo, but will affect the training of all models with RPN heads. Two-stage detectors whose RPN head uses softmax will be affected because the order of categories is changed. + +**Only call `get_subset_by_classes` when `test_mode=True` and `self.filter_empty_gt=True` (#3695)** +Function `get_subset_by_classes` in dataset is refactored and only filters out images when `test_mode=True` and `self.filter_empty_gt=True`. +In the original implementation, `get_subset_by_classes` is not related to the flag `self.filter_empty_gt` and will only be called when the classes is set during initialization no matter `test_mode` is `True` or `False`. This brings ambiguous behavior and potential bugs in many cases. After v2.5.0, if `filter_empty_gt=False`, no matter whether the classes are specified in a dataset, the dataset will use all the images in the annotations. If `filter_empty_gt=True` and `test_mode=True`, no matter whether the classes are specified, the dataset will call \`\`get_subset_by_classes\` to check the images and filter out images containing no GT boxes. Therefore, the users should be responsible for the data filtering/cleaning process for the test dataset. + +#### New Features + +- Test time augmentation for single stage detectors (#3844, #3638) +- Support to show the name of experiments during training (#3764) +- Add `Shear`, `Rotate`, `Translate` Augmentation (#3656, #3619, #3687) +- Add image-only transformations including `Constrast`, `Equalize`, `Color`, and `Brightness`. 
(#3643) +- Support [YOLACT](https://arxiv.org/abs/1904.02689) (#3456) +- Support [CentripetalNet](https://arxiv.org/abs/2003.09119) (#3390) +- Support PyTorch 1.6 in docker (#3905) + +#### Bug Fixes + +- Fix the bug of training ATSS when there is no ground truth boxes (#3702) +- Fix the bug of using Focal Loss when there is `num_pos` is 0 (#3702) +- Fix the label index mapping in dataset browser (#3708) +- Fix Mask R-CNN training stuck problem when their is no positive rois (#3713) +- Fix the bug of `self.rpn_head.test_cfg` in `RPNTestMixin` by using `self.rpn_head` in rpn head (#3808) +- Fix deprecated `Conv2d` from mmcv.ops (#3791) +- Fix device bug in RepPoints (#3836) +- Fix SABL validating bug (#3849) +- Use `https://download.openmmlab.com/mmcv/dist/index.html` for installing MMCV (#3840) +- Fix nonzero in NMS for PyTorch 1.6.0 (#3867) +- Fix the API change bug of PAA (#3883) +- Fix typo in bbox_flip (#3886) +- Fix cv2 import error of ligGL.so.1 in Dockerfile (#3891) + +#### Improvements + +- Change to use `mmcv.utils.collect_env` for collecting environment information to avoid duplicate codes (#3779) +- Update checkpoint file names to v2.0 models in documentation (#3795) +- Update tutorials for changing runtime settings (#3778), modifying loss (#3777) +- Improve the function of `simple_test_bboxes` in SABL (#3853) +- Convert mask to bool before using it as img's index for robustness and speedup (#3870) +- Improve documentation of modules and dataset customization (#3821) + +### v2.4.0 (5/9/2020) + +__Highlights__ + +- Fix lots of issues/bugs and reorganize the trouble shooting page +- Support new methods [SABL](https://arxiv.org/abs/1912.04260), [YOLOv3](https://arxiv.org/abs/1804.02767), and [PAA Assign](https://arxiv.org/abs/2007.08103) +- Support Batch Inference +- Start to publish `mmdet` package to PyPI since v2.3.0 +- Switch model zoo to download.openmmlab.com + +__Backwards Incompatible Changes__ + +- Support Batch Inference (#3564, #3686, #3705): Since v2.4.0, MMDetection could inference model with multiple images in a single GPU. + This change influences all the test APIs in MMDetection and downstream codebases. To help the users migrate their code, we use `replace_ImageToTensor` (#3686) to convert legacy test data pipelines during dataset initialization. +- Support RandomFlip with horizontal/vertical/diagonal direction (#3608): Since v2.4.0, MMDetection supports horizontal/vertical/diagonal flip in the data augmentation. This influences bounding box, mask, and image transformations in data augmentation process and the process that will map those data back to the original format. +- Migrate to use `mmlvis` and `mmpycocotools` for COCO and LVIS dataset (#3727). The APIs are fully compatible with the original `lvis` and `pycocotools`. Users need to uninstall the existing pycocotools and lvis packages in their environment first and install `mmlvis` & `mmpycocotools`. 
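+
+For downstream codebases that keep legacy single-image test pipelines, the migration described above is typically a one-line change. A minimal sketch, assuming a config loaded with mmcv (the config path is only illustrative):
+
+```python
+from mmcv import Config
+from mmdet.datasets import replace_ImageToTensor
+
+cfg = Config.fromfile('configs/retinanet/retinanet_r50_fpn_1x_coco.py')  # illustrative config
+# Swap legacy `ImageToTensor` steps so the test pipeline works with batched inputs.
+cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+```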
+ +__Bug Fixes__ + +- Fix default mean/std for onnx (#3491) +- Fix coco evaluation and add metric items (#3497) +- Fix typo for install.md (#3516) +- Fix atss when sampler per gpu is 1 (#3528) +- Fix import of fuse_conv_bn (#3529) +- Fix bug of gaussian_target, update unittest of heatmap (#3543) +- Fixed VOC2012 evaluate (#3553) +- Fix scale factor bug of rescale (#3566) +- Fix with_xxx_attributes in base detector (#3567) +- Fix boxes scaling when number is 0 (#3575) +- Fix rfp check when neck config is a list (#3591) +- Fix import of fuse conv bn in benchmark.py (#3606) +- Fix webcam demo (#3634) +- Fix typo and itemize issues in tutorial (#3658) +- Fix error in distributed training when some levels of FPN are not assigned with bounding boxes (#3670) +- Fix the width and height orders of stride in valid flag generation (#3685) +- Fix weight initialization bug in Res2Net DCN (#3714) +- Fix bug in OHEMSampler (#3677) + +__New Features__ + +- Support Cutout augmentation (#3521) +- Support evaluation on multiple datasets through ConcatDataset (#3522) +- Support [PAA assign](https://arxiv.org/abs/2007.08103) #(3547) +- Support eval metric with pickle results (#3607) +- Support [YOLOv3](https://arxiv.org/abs/1804.02767) (#3083) +- Support [SABL](https://arxiv.org/abs/1912.04260) (#3603) +- Support to publish to Pypi in github-action (#3510) +- Support custom imports (#3641) + +__Improvements__ + +- Refactor common issues in documentation (#3530) +- Add pytorch 1.6 to CI config (#3532) +- Add config to runner meta (#3534) +- Add eval-option flag for testing (#3537) +- Add init_eval to evaluation hook (#3550) +- Add include_bkg in ClassBalancedDataset (#3577) +- Using config's loading in inference_detector (#3611) +- Add ATSS ResNet-101 models in model zoo (#3639) +- Update urls to download.openmmlab.com (#3665) +- Support non-mask training for CocoDataset (#3711) + +### v2.3.0 (5/8/2020) + +__Highlights__ + +- The CUDA/C++ operators have been moved to `mmcv.ops`. For backward compatibility `mmdet.ops` is kept as warppers of `mmcv.ops`. +- Support new methods [CornerNet](https://arxiv.org/abs/1808.01244), [DIOU](https://arxiv.org/abs/1911.08287)/[CIOU](https://arxiv.org/abs/2005.03572) loss, and new dataset: [LVIS V1](https://arxiv.org/abs/1908.03195) +- Provide more detailed colab training tutorials and more complete documentation. +- Support to convert RetinaNet from Pytorch to ONNX. 
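+
+As a quick illustration of the operator migration noted in the highlights above, the CUDA/C++ ops are now imported from `mmcv.ops`. A minimal sketch, assuming an `mmcv-full` installation:
+
+```python
+import torch
+from mmcv.ops import nms  # previously mmdet.ops.nms; mmdet.ops now only wraps mmcv.ops
+
+boxes = torch.tensor([[0., 0., 10., 10.],
+                      [1., 1., 11., 11.]])
+scores = torch.tensor([0.9, 0.8])
+dets, keep = nms(boxes, scores, iou_threshold=0.5)  # dets has shape (k, 5): x1, y1, x2, y2, score
+```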
+ +__Bug Fixes__ + +- Fix the model initialization bug of DetectoRS (#3187) +- Fix the bug of module names in NASFCOSHead (#3205) +- Fix the filename bug in publish_model.py (#3237) +- Fix the dimensionality bug when `inside_flags.any()` is `False` in dense heads (#3242) +- Fix the bug of forgetting to pass flip directions in `MultiScaleFlipAug` (#3262) +- Fixed the bug caused by default value of `stem_channels` (#3333) +- Fix the bug of model checkpoint loading for CPU inference (#3318, #3316) +- Fix topk bug when box number is smaller than the expected topk number in ATSSAssigner (#3361) +- Fix the gt priority bug in center_region_assigner.py (#3208) +- Fix NaN issue of iou calculation in iou_loss.py (#3394) +- Fix the bug that `iou_thrs` is not actually used during evaluation in coco.py (#3407) +- Fix test-time augmentation of RepPoints (#3435) +- Fix runtimeError caused by incontiguous tensor in Res2Net+DCN (#3412) + +__New Features__ + +- Support [CornerNet](https://arxiv.org/abs/1808.01244) (#3036) +- Support [DIOU](https://arxiv.org/abs/1911.08287)/[CIOU](https://arxiv.org/abs/2005.03572) loss (#3151) +- Support [LVIS V1](https://arxiv.org/abs/1908.03195) dataset (#) +- Support customized hooks in training (#3395) +- Support fp16 training of generalized focal loss (#3410) +- Support to convert RetinaNet from Pytorch to ONNX (#3075) + +__Improvements__ + +- Support to process ignore boxes in ATSS assigner (#3082) +- Allow to crop images without ground truth in `RandomCrop` (#3153) +- Enable the the `Accuracy` module to set threshold (#3155) +- Refactoring unit tests (#3206) +- Unify the training settings of `to_float32` and `norm_cfg` in RegNets configs (#3210) +- Add colab training tutorials for beginners (#3213, #3273) +- Move CUDA/C++ operators into `mmcv.ops` and keep `mmdet.ops` as warppers for backward compatibility (#3232)(#3457) +- Update installation scripts in documentation (#3290) and dockerfile (#3320) +- Support to set image resize backend (#3392) +- Remove git hash in version file (#3466) +- Check mmcv version to force version compatibility (#3460) + +### v2.2.0 (1/7/2020) + +__Highlights__ + +- Support new methods: [DetectoRS](https://arxiv.org/abs/2006.02334), [PointRend](https://arxiv.org/abs/1912.08193), [Generalized Focal Loss](https://arxiv.org/abs/2006.04388), [Dynamic R-CNN](https://arxiv.org/abs/2004.06002) + +__Bug Fixes__ + +- Fix FreeAnchor when no gt in image (#3176) +- Clean up deprecated usage of `register_module()` (#3092, #3161) +- Fix pretrain bug in NAS FCOS (#3145) +- Fix `num_classes` in SSD (#3142) +- Fix FCOS warmup (#3119) +- Fix `rstrip` in `tools/publish_model.py` +- Fix `flip_ratio` default value in RandomFLip pipeline (#3106) +- Fix cityscapes eval with ms_rcnn (#3112) +- Fix RPN softmax (#3056) +- Fix filename of LVIS@v0.5 (#2998) +- Fix nan loss by filtering out-of-frame gt_bboxes in COCO (#2999) +- Fix bug in FSAF (#3018) +- Add FocalLoss `num_classes` check (#2964) +- Fix PISA Loss when there are no gts (#2992) +- Avoid nan in `iou_calculator` (#2975) +- Prevent possible bugs in loading and transforms caused by shallow copy (#2967) + +__New Features__ + +- Add DetectoRS (#3064) +- Support Generalize Focal Loss (#3097) +- Support PointRend (#2752) +- Support Dynamic R-CNN (#3040) +- Add DeepFashion dataset (#2968) +- Implement FCOS training tricks (#2935) +- Use BaseDenseHead as base class for anchor-base heads (#2963) +- Add `with_cp` for BasicBlock (#2891) +- Add `stem_channels` argument for ResNet (#2954) + +__Improvements__ + +- Add 
anchor free base head (#2867) +- Migrate to github action (#3137) +- Add docstring for datasets, pipelines, core modules and methods (#3130, #3125, #3120) +- Add VOC benchmark (#3060) +- Add `concat` mode in GRoI (#3098) +- Remove cmd arg `autorescale-lr` (#3080) +- Use `len(data['img_metas'])` to indicate `num_samples` (#3073, #3053) +- Switch to EpochBasedRunner (#2976) + +### v2.1.0 (8/6/2020) + +__Highlights__ + +- Support new backbones: [RegNetX](https://arxiv.org/abs/2003.13678), [Res2Net](https://arxiv.org/abs/1904.01169) +- Support new methods: [NASFCOS](https://arxiv.org/abs/1906.04423), [PISA](https://arxiv.org/abs/1904.04821), [GRoIE](https://arxiv.org/abs/2004.13665) +- Support new dataset: [LVIS](https://arxiv.org/abs/1908.03195) + +__Bug Fixes__ + +- Change the CLI argument `--validate` to `--no-validate` to enable validation after training epochs by default. (#2651) +- Add missing cython to docker file (#2713) +- Fix bug in nms cpu implementation (#2754) +- Fix bug when showing mask results (#2763) +- Fix gcc requirement (#2806) +- Fix bug in async test (#2820) +- Fix mask encoding-decoding bugs in test API (#2824) +- Fix bug in test time augmentation (#2858, #2921, #2944) +- Fix a typo in comment of apis/train (#2877) +- Fix the bug of returning None when no gt bboxes are in the original image in `RandomCrop`. Fix the bug that misses to handle `gt_bboxes_ignore`, `gt_label_ignore`, and `gt_masks_ignore` in `RandomCrop`, `MinIoURandomCrop` and `Expand` modules. (#2810) +- Fix bug of `base_channels` of regnet (#2917) +- Fix the bug of logger when loading pre-trained weights in base detector (#2936) + +__New Features__ + +- Add IoU models (#2666) +- Add colab demo for inference +- Support class agnostic nms (#2553) +- Add benchmark gathering scripts for development only (#2676) +- Add mmdet-based project links (#2736, #2767, #2895) +- Add config dump in training (#2779) +- Add ClassBalancedDataset (#2721) +- Add res2net backbone (#2237) +- Support RegNetX models (#2710) +- Use `mmcv.FileClient` to support different storage backends (#2712) +- Add ClassBalancedDataset (#2721) +- Code Release: Prime Sample Attention in Object Detection (CVPR 2020) (#2626) +- Implement NASFCOS (#2682) +- Add class weight in CrossEntropyLoss (#2797) +- Support LVIS dataset (#2088) +- Support GRoIE (#2584) + +__Improvements__ + +- Allow different x and y strides in anchor heads. (#2629) +- Make FSAF loss more robust to no gt (#2680) +- Compute pure inference time instead (#2657) and update inference speed (#2730) +- Avoided the possibility that a patch with 0 area is cropped. (#2704) +- Add warnings when deprecated `imgs_per_gpu` is used. (#2700) +- Add a mask rcnn example for config (#2645) +- Update model zoo (#2762, #2866, #2876, #2879, #2831) +- Add `ori_filename` to img_metas and use it in test show-dir (#2612) +- Use `img_fields` to handle multiple images during image transform (#2800) +- Add upsample_cfg support in FPN (#2787) +- Add `['img']` as default `img_fields` for back compatibility (#2809) +- Rename the pretrained model from `open-mmlab://resnet50_caffe` and `open-mmlab://resnet50_caffe_bgr` to `open-mmlab://detectron/resnet50_caffe` and `open-mmlab://detectron2/resnet50_caffe`. 
(#2832) +- Added sleep(2) in test.py to reduce hanging problem (#2847) +- Support `c10::half` in CARAFE (#2890) +- Improve documentations (#2918, #2714) +- Use optimizer constructor in mmcv and clean the original implementation in `mmdet.core.optimizer` (#2947) + +### v2.0.0 (6/5/2020) + +In this release, we made lots of major refactoring and modifications. + +1. __Faster speed__. We optimize the training and inference speed for common models, achieving up to 30% speedup for training and 25% for inference. Please refer to [model zoo](model_zoo.md#comparison-with-detectron2) for details. + +2. __Higher performance__. We change some default hyperparameters with no additional cost, which leads to a gain of performance for most models. Please refer to [compatibility](compatibility.md#training-hyperparameters) for details. + +3. __More documentation and tutorials__. We add a bunch of documentation and tutorials to help users get started more smoothly. Read it [here](https://mmdetection.readthedocs.io/en/latest/). + +4. __Support PyTorch 1.5__. The support for 1.1 and 1.2 is dropped, and we switch to some new APIs. + +5. __Better configuration system__. Inheritance is supported to reduce the redundancy of configs. + +6. __Better modular design__. Towards the goal of simplicity and flexibility, we simplify some encapsulation while add more other configurable modules like BBoxCoder, IoUCalculator, OptimizerConstructor, RoIHead. Target computation is also included in heads and the call hierarchy is simpler. + +7. Support new methods: [FSAF](https://arxiv.org/abs/1903.00621) and PAFPN (part of [PAFPN](https://arxiv.org/abs/1803.01534)). + +__Breaking Changes__ +Models training with MMDetection 1.x are not fully compatible with 2.0, please refer to the [compatibility doc](compatibility.md) for the details and how to migrate to the new version. + +__Improvements__ + +- Unify cuda and cpp API for custom ops. (#2277) +- New config files with inheritance. (#2216) +- Encapsulate the second stage into RoI heads. (#1999) +- Refactor GCNet/EmpericalAttention into plugins. (#2345) +- Set low quality match as an option in IoU-based bbox assigners. (#2375) +- Change the codebase's coordinate system. (#2380) +- Refactor the category order in heads. 0 means the first positive class instead of background now. (#2374) +- Add bbox sampler and assigner registry. (#2419) +- Speed up the inference of RPN. (#2420) +- Add `train_cfg` and `test_cfg` as class members in all anchor heads. (#2422) +- Merge target computation methods into heads. (#2429) +- Add bbox coder to support different bbox encoding and losses. (#2480) +- Unify the API for regression loss. (#2156) +- Refactor Anchor Generator. (#2474) +- Make `lr` an optional argument for optimizers. (#2509) +- Migrate to modules and methods in MMCV. (#2502, #2511, #2569, #2572) +- Support PyTorch 1.5. (#2524) +- Drop the support for Python 3.5 and use F-string in the codebase. (#2531) + +__Bug Fixes__ + +- Fix the scale factors for resized images without keep the aspect ratio. (#2039) +- Check if max_num > 0 before slicing in NMS. (#2486) +- Fix Deformable RoIPool when there is no instance. (#2490) +- Fix the default value of assigned labels. (#2536) +- Fix the evaluation of Cityscapes. (#2578) + +__New Features__ + +- Add deep_stem and avg_down option to ResNet, i.e., support ResNetV1d. (#2252) +- Add L1 loss. (#2376) +- Support both polygon and bitmap for instance masks. (#2353, #2540) +- Support CPU mode for inference. 
(#2385) +- Add optimizer constructor for complicated configuration of optimizers. (#2397, #2488) +- Implement PAFPN. (#2392) +- Support empty tensor input for some modules. (#2280) +- Support for custom dataset classes without overriding it. (#2408, #2443) +- Support to train subsets of coco dataset. (#2340) +- Add iou_calculator to potentially support more IoU calculation methods. (2405) +- Support class wise mean AP (was removed in the last version). (#2459) +- Add option to save the testing result images. (#2414) +- Support MomentumUpdaterHook. (#2571) +- Add a demo to inference a single image. (#2605) + +### v1.1.0 (24/2/2020) + +__Highlights__ + +- Dataset evaluation is rewritten with a unified api, which is used by both evaluation hooks and test scripts. +- Support new methods: [CARAFE](https://arxiv.org/abs/1905.02188). + +__Breaking Changes__ + +- The new MMDDP inherits from the official DDP, thus the `__init__` api is changed to be the same as official DDP. +- The `mask_head` field in HTC config files is modified. +- The evaluation and testing script is updated. +- In all transforms, instance masks are stored as a numpy array shaped (n, h, w) instead of a list of (h, w) arrays, where n is the number of instances. + +__Bug Fixes__ + +- Fix IOU assigners when ignore_iof_thr > 0 and there is no pred boxes. (#2135) +- Fix mAP evaluation when there are no ignored boxes. (#2116) +- Fix the empty RoI input for Deformable RoI Pooling. (#2099) +- Fix the dataset settings for multiple workflows. (#2103) +- Fix the warning related to `torch.uint8` in PyTorch 1.4. (#2105) +- Fix the inference demo on devices other than gpu:0. (#2098) +- Fix Dockerfile. (#2097) +- Fix the bug that `pad_val` is unused in Pad transform. (#2093) +- Fix the albumentation transform when there is no ground truth bbox. (#2032) + +__Improvements__ + +- Use torch instead of numpy for random sampling. (#2094) +- Migrate to the new MMDDP implementation in MMCV v0.3. (#2090) +- Add meta information in logs. (#2086) +- Rewrite Soft NMS with pytorch extension and remove cython as a dependency. (#2056) +- Rewrite dataset evaluation. (#2042, #2087, #2114, #2128) +- Use numpy array for masks in transforms. (#2030) + +__New Features__ + +- Implement "CARAFE: Content-Aware ReAssembly of FEatures". (#1583) +- Add `worker_init_fn()` in data_loader when seed is set. (#2066, #2111) +- Add logging utils. (#2035) + +### v1.0.0 (30/1/2020) + +This release mainly improves the code quality and add more docstrings. + +__Highlights__ + +- Documentation is online now: . +- Support new models: [ATSS](https://arxiv.org/abs/1912.02424). +- DCN is now available with the api `build_conv_layer` and `ConvModule` like the normal conv layer. +- A tool to collect environment information is available for trouble shooting. + +__Bug Fixes__ + +- Fix the incompatibility of the latest numpy and pycocotools. (#2024) +- Fix the case when distributed package is unavailable, e.g., on Windows. (#1985) +- Fix the dimension issue for `refine_bboxes()`. (#1962) +- Fix the typo when `seg_prefix` is a list. (#1906) +- Add segmentation map cropping to RandomCrop. (#1880) +- Fix the return value of `ga_shape_target_single()`. (#1853) +- Fix the loaded shape of empty proposals. (#1819) +- Fix the mask data type when using albumentation. (#1818) + +__Improvements__ + +- Enhance AssignResult and SamplingResult. (#1995) +- Add ability to overwrite existing module in Registry. (#1982) +- Reorganize requirements and make albumentations and imagecorruptions optional. 
(#1969) +- Check NaN in `SSDHead`. (#1935) +- Encapsulate the DCN in ResNe(X)t into a ConvModule & Conv_layers. (#1894) +- Refactoring for mAP evaluation and support multiprocessing and logging. (#1889) +- Init the root logger before constructing Runner to log more information. (#1865) +- Split `SegResizeFlipPadRescale` into different existing transforms. (#1852) +- Move `init_dist()` to MMCV. (#1851) +- Documentation and docstring improvements. (#1971, #1938, #1869, #1838) +- Fix the color of the same class for mask visualization. (#1834) +- Remove the option `keep_all_stages` in HTC and Cascade R-CNN. (#1806) + +__New Features__ + +- Add two test-time options `crop_mask` and `rle_mask_encode` for mask heads. (#2013) +- Support loading grayscale images as single channel. (#1975) +- Implement "Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive Training Sample Selection". (#1872) +- Add sphinx generated docs. (#1859, #1864) +- Add GN support for flops computation. (#1850) +- Collect env info for trouble shooting. (#1812) + +### v1.0rc1 (13/12/2019) + +The RC1 release mainly focuses on improving the user experience, and fixing bugs. + +__Highlights__ + +- Support new models: [FoveaBox](https://arxiv.org/abs/1904.03797), [RepPoints](https://arxiv.org/abs/1904.11490) and [FreeAnchor](https://arxiv.org/abs/1909.02466). +- Add a Dockerfile. +- Add a jupyter notebook demo and a webcam demo. +- Setup the code style and CI. +- Add lots of docstrings and unit tests. +- Fix lots of bugs. + +__Breaking Changes__ + +- There was a bug for computing COCO-style mAP w.r.t different scales (AP_s, AP_m, AP_l), introduced by #621. (#1679) + +__Bug Fixes__ + +- Fix a sampling interval bug in Libra R-CNN. (#1800) +- Fix the learning rate in SSD300 WIDER FACE. (#1781) +- Fix the scaling issue when `keep_ratio=False`. (#1730) +- Fix typos. (#1721, #1492, #1242, #1108, #1107) +- Fix the shuffle argument in `build_dataloader`. (#1693) +- Clip the proposal when computing mask targets. (#1688) +- Fix the "index out of range" bug for samplers in some corner cases. (#1610, #1404) +- Fix the NMS issue on devices other than GPU:0. (#1603) +- Fix SSD Head and GHM Loss on CPU. (#1578) +- Fix the OOM error when there are too many gt bboxes. (#1575) +- Fix the wrong keyword argument `nms_cfg` in HTC. (#1573) +- Process masks and semantic segmentation in Expand and MinIoUCrop transforms. (#1550, #1361) +- Fix a scale bug in the Non Local op. (#1528) +- Fix a bug in transforms when `gt_bboxes_ignore` is None. (#1498) +- Fix a bug when `img_prefix` is None. (#1497) +- Pass the device argument to `grid_anchors` and `valid_flags`. (#1478) +- Fix the data pipeline for test_robustness. (#1476) +- Fix the argument type of deformable pooling. (#1390) +- Fix the coco_eval when there are only two classes. (#1376) +- Fix a bug in Modulated DeformableConv when deformable_group>1. (#1359) +- Fix the mask cropping in RandomCrop. (#1333) +- Fix zero outputs in DeformConv when not running on cuda:0. (#1326) +- Fix the type issue in Expand. (#1288) +- Fix the inference API. (#1255) +- Fix the inplace operation in Expand. (#1249) +- Fix the from-scratch training config. (#1196) +- Fix inplace add in RoIExtractor which cause an error in PyTorch 1.2. (#1160) +- Fix FCOS when input images has no positive sample. (#1136) +- Fix recursive imports. (#1099) + +__Improvements__ + +- Print the config file and mmdet version in the log. (#1721) +- Lint the code before compiling in travis CI. 
(#1715) +- Add a probability argument for the `Expand` transform. (#1651) +- Update the PyTorch and CUDA version in the docker file. (#1615) +- Raise a warning when specifying `--validate` in non-distributed training. (#1624, #1651) +- Beautify the mAP printing. (#1614) +- Add pre-commit hook. (#1536) +- Add the argument `in_channels` to backbones. (#1475) +- Add lots of docstrings and unit tests, thanks to [@Erotemic](https://github.com/Erotemic). (#1603, #1517, #1506, #1505, #1491, #1479, #1477, #1475, #1474) +- Add support for multi-node distributed test when there is no shared storage. (#1399) +- Optimize Dockerfile to reduce the image size. (#1306) +- Update new results of HRNet. (#1284, #1182) +- Add an argument `no_norm_on_lateral` in FPN. (#1240) +- Test the compiling in CI. (#1235) +- Move docs to a separate folder. (#1233) +- Add a jupyter notebook demo. (#1158) +- Support different type of dataset for training. (#1133) +- Use int64_t instead of long in cuda kernels. (#1131) +- Support unsquare RoIs for bbox and mask heads. (#1128) +- Manually add type promotion to make compatible to PyTorch 1.2. (#1114) +- Allowing validation dataset for computing validation loss. (#1093) +- Use `.scalar_type()` instead of `.type()` to suppress some warnings. (#1070) + +__New Features__ + +- Add an option `--with_ap` to compute the AP for each class. (#1549) +- Implement "FreeAnchor: Learning to Match Anchors for Visual Object Detection". (#1391) +- Support [Albumentations](https://github.com/albumentations-team/albumentations) for augmentations in the data pipeline. (#1354) +- Implement "FoveaBox: Beyond Anchor-based Object Detector". (#1339) +- Support horizontal and vertical flipping. (#1273, #1115) +- Implement "RepPoints: Point Set Representation for Object Detection". (#1265) +- Add test-time augmentation to HTC and Cascade R-CNN. (#1251) +- Add a COCO result analysis tool. (#1228) +- Add Dockerfile. (#1168) +- Add a webcam demo. (#1155, #1150) +- Add FLOPs counter. (#1127) +- Allow arbitrary layer order for ConvModule. (#1078) + +### v1.0rc0 (27/07/2019) + +- Implement lots of new methods and components (Mixed Precision Training, HTC, Libra R-CNN, Guided Anchoring, Empirical Attention, Mask Scoring R-CNN, Grid R-CNN (Plus), GHM, GCNet, FCOS, HRNet, Weight Standardization, etc.). Thank all collaborators! +- Support two additional datasets: WIDER FACE and Cityscapes. +- Refactoring for loss APIs and make it more flexible to adopt different losses and related hyper-parameters. +- Speed up multi-gpu testing. +- Integrate all compiling and installing in a single script. + +### v0.6.0 (14/04/2019) + +- Up to 30% speedup compared to the model zoo. +- Support both PyTorch stable and nightly version. +- Replace NMS and SigmoidFocalLoss with Pytorch CUDA extensions. + +### v0.6rc0(06/02/2019) + +- Migrate to PyTorch 1.0. + +### v0.5.7 (06/02/2019) + +- Add support for Deformable ConvNet v2. (Many thanks to the authors and [@chengdazhi](https://github.com/chengdazhi)) +- This is the last release based on PyTorch 0.4.1. + +### v0.5.6 (17/01/2019) + +- Add support for Group Normalization. +- Unify RPNHead and single stage heads (RetinaHead, SSDHead) with AnchorHead. + +### v0.5.5 (22/12/2018) + +- Add SSD for COCO and PASCAL VOC. +- Add ResNeXt backbones and detection models. +- Refactoring for Samplers/Assigners and add OHEM. +- Add VOC dataset and evaluation scripts. + +### v0.5.4 (27/11/2018) + +- Add SingleStageDetector and RetinaNet. 
+ +### v0.5.3 (26/11/2018) + +- Add Cascade R-CNN and Cascade Mask R-CNN. +- Add support for Soft-NMS in config files. + +### v0.5.2 (21/10/2018) + +- Add support for custom datasets. +- Add a script to convert PASCAL VOC annotations to the expected format. + +### v0.5.1 (20/10/2018) + +- Add BBoxAssigner and BBoxSampler, the `train_cfg` field in config files are restructured. +- `ConvFCRoIHead` / `SharedFCRoIHead` are renamed to `ConvFCBBoxHead` / `SharedFCBBoxHead` for consistency. diff --git a/mmdetection/docs/en/notes/compatibility.md b/mmdetection/docs/en/notes/compatibility.md new file mode 100644 index 0000000..26325e2 --- /dev/null +++ b/mmdetection/docs/en/notes/compatibility.md @@ -0,0 +1,178 @@ +# Compatibility of MMDetection 2.x + +## MMDetection 2.25.0 + +In order to support Mask2Former for instance segmentation, the original config files of Mask2Former for panpotic segmentation need to be renamed [PR #7571](https://github.com/open-mmlab/mmdetection/pull/7571). + + + + + + + + + + + +
+before v2.25.0:
+
+```
+'mask2former_xxx_coco.py' represents config files for **panoptic segmentation**.
+```
+
+after v2.25.0:
+
+```
+'mask2former_xxx_coco.py' represents config files for **instance segmentation**.
+'mask2former_xxx_coco-panoptic.py' represents config files for **panoptic segmentation**.
+```
+
+## MMDetection 2.21.0
+
+In order to support CPU training, the logic of scatter in batch collating has been changed. We recommend using MMCV v1.4.4 or higher. For more details, please refer to [MMCV PR #1621](https://github.com/open-mmlab/mmcv/pull/1621).
+
+## MMDetection 2.18.1
+
+### MMCV compatibility
+
+In order to fix the wrong weight reference bug in BaseTransformerLayer, the logic in batch first mode of MultiheadAttention has been changed.
+We recommend using MMCV v1.3.17 or higher. For more details, please refer to [MMCV PR #1418](https://github.com/open-mmlab/mmcv/pull/1418).
+
+## MMDetection 2.18.0
+
+### DIIHead compatibility
+
+In order to support QueryInst, `attn_feats` is added into the returned tuple of DIIHead.
+
+## MMDetection 2.14.0
+
+### MMCV Version
+
+In order to fix the problem that the priority of EvalHook was too low, all hook priorities were re-adjusted in MMCV 1.3.8, so MMDetection 2.14.0 needs to rely on MMCV v1.3.8 or higher. For related information, please refer to [#1120](https://github.com/open-mmlab/mmcv/pull/1120); for related issues, please refer to [#5343](https://github.com/open-mmlab/mmdetection/issues/5343).
+
+### SSD compatibility
+
+In v2.14.0, to make SSD more flexible, [PR5291](https://github.com/open-mmlab/mmdetection/pull/5291) refactored its backbone, neck, and head. Users can use the script `tools/model_converters/upgrade_ssd_version.py` to convert their models.
+
+```bash
+python tools/model_converters/upgrade_ssd_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH}
+```
+
+- OLD_MODEL_PATH: the path to load the old version SSD model.
+- NEW_MODEL_PATH: the path to save the converted model weights.
+
+## MMDetection 2.12.0
+
+MMDetection is going through a big refactoring for more general and convenient usage during the releases from v2.12.0 to v2.18.0 (maybe longer).
+In v2.12.0 MMDetection inevitably brings some BC-breakings, including the MMCV dependency, model initialization, model registry, and mask AP evaluation.
+
+### MMCV Version
+
+MMDetection v2.12.0 relies on the newest features in MMCV 1.3.3, including `BaseModule` for unified parameter initialization, model registry, and the CUDA operator `MultiScaleDeformableAttn` for [Deformable DETR](https://arxiv.org/abs/2010.04159). Note that MMCV 1.3.2 already contains all the features used by MMDet but has known issues. Therefore, we recommend users skip MMCV v1.3.2 and use v1.3.3, though v1.3.2 might work for most cases.
+
+### Unified model initialization
+
+To unify the parameter initialization in OpenMMLab projects, MMCV supports `BaseModule` that accepts `init_cfg` to allow the modules' parameters to be initialized in a flexible and unified manner. Now users need to explicitly call `model.init_weights()` in the training script to initialize the model (as in [here](https://github.com/open-mmlab/mmdetection/blob/main/tools/train.py#L162)); previously, this was handled by the detector. **The downstream projects must update their model initialization accordingly to use MMDetection v2.12.0**. Please refer to PR #4750 for details.
+
+### Unified model registry
+
+To easily use backbones implemented in other OpenMMLab projects, MMDetection v2.12.0 inherits the model registry created in MMCV (#760). In this way, as long as the backbone is supported in an OpenMMLab project and that project also uses the registry in MMCV, users can use that backbone in MMDetection by simply modifying the config without copying the code of that backbone into MMDetection.
Please refer to PR #5059 for more details. + +### Mask AP evaluation + +Before [PR 4898](https://github.com/open-mmlab/mmdetection/pull/4898) and V2.12.0, the mask AP of small, medium, and large instances is calculated based on the bounding box area rather than the real mask area. This leads to higher `APs` and `APm` but lower `APl` but will not affect the overall mask AP. [PR 4898](https://github.com/open-mmlab/mmdetection/pull/4898) change it to use mask areas by deleting `bbox` in mask AP calculation. +The new calculation does not affect the overall mask AP evaluation and is consistent with [Detectron2](https://github.com/facebookresearch/detectron2/). + +## Compatibility with MMDetection 1.x + +MMDetection 2.0 goes through a big refactoring and addresses many legacy issues. It is not compatible with the 1.x version, i.e., running inference with the same model weights in these two versions will produce different results. Thus, MMDetection 2.0 re-benchmarks all the models and provides their links and logs in the model zoo. + +The major differences are in four folds: coordinate system, codebase conventions, training hyperparameters, and modular design. + +### Coordinate System + +The new coordinate system is consistent with [Detectron2](https://github.com/facebookresearch/detectron2/) and treats the center of the most left-top pixel as (0, 0) rather than the left-top corner of that pixel. +Accordingly, the system interprets the coordinates in COCO bounding box and segmentation annotations as coordinates in range `[0, width]` or `[0, height]`. +This modification affects all the computation related to the bbox and pixel selection, +which is more natural and accurate. + +- The height and width of a box with corners (x1, y1) and (x2, y2) in the new coordinate system is computed as `width = x2 - x1` and `height = y2 - y1`. + In MMDetection 1.x and previous version, a "+ 1" was added both height and width. + This modification are in three folds: + + 1. Box transformation and encoding/decoding in regression. + 2. IoU calculation. This affects the matching process between ground truth and bounding box and the NMS process. The effect to compatibility is very negligible, though. + 3. The corners of bounding box is in float type and no longer quantized. This should provide more accurate bounding box results. This also makes the bounding box and RoIs not required to have minimum size of 1, whose effect is small, though. + +- The anchors are center-aligned to feature grid points and in float type. + In MMDetection 1.x and previous version, the anchors are in `int` type and not center-aligned. + This affects the anchor generation in RPN and all the anchor-based methods. + +- ROIAlign is better aligned with the image coordinate system. The new implementation is adopted from [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign). + The RoIs are shifted by half a pixel by default when they are used to cropping RoI features, compared to MMDetection 1.x. + The old behavior is still available by setting `aligned=False` instead of `aligned=True`. + +- Mask cropping and pasting are more accurate. + + 1. We use the new RoIAlign to crop mask targets. In MMDetection 1.x, the bounding box is quantized before it is used to crop mask target, and the crop process is implemented by numpy. In new implementation, the bounding box for crop is not quantized and sent to RoIAlign. 
This implementation accelerates the training speed by a large margin (~0.1s per iter, ~2 hour when training Mask R50 for 1x schedule) and should be more accurate. + + 2. In MMDetection 2.0, the "`paste_mask()`" function is different and should be more accurate than those in previous versions. This change follows the modification in [Detectron2](https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/masks.py) and can improve mask AP on COCO by ~0.5% absolute. + +### Codebase Conventions + +- MMDetection 2.0 changes the order of class labels to reduce unused parameters in regression and mask branch more naturally (without +1 and -1). + This effect all the classification layers of the model to have a different ordering of class labels. The final layers of regression branch and mask head no longer keep K+1 channels for K categories, and their class orders are consistent with the classification branch. + + - In MMDetection 2.0, label "K" means background, and labels \[0, K-1\] correspond to the K = num_categories object categories. + + - In MMDetection 1.x and previous version, label "0" means background, and labels \[1, K\] correspond to the K categories. + + - **Note**: The class order of softmax RPN is still the same as that in 1.x in versions\<=2.4.0 while sigmoid RPN is not affected. The class orders in all heads are unified since MMDetection v2.5.0. + +- Low quality matching in R-CNN is not used. In MMDetection 1.x and previous versions, the `max_iou_assigner` will match low quality boxes for each ground truth box in both RPN and R-CNN training. We observe this sometimes does not assign the most perfect GT box to some bounding boxes, + thus MMDetection 2.0 do not allow low quality matching by default in R-CNN training in the new system. This sometimes may slightly improve the box AP (~0.1% absolute). + +- Separate scale factors for width and height. In MMDetection 1.x and previous versions, the scale factor is a single float in mode `keep_ratio=True`. This is slightly inaccurate because the scale factors for width and height have slight difference. MMDetection 2.0 adopts separate scale factors for width and height, the improvement on AP ~0.1% absolute. + +- Configs name conventions are changed. MMDetection V2.0 adopts the new name convention to maintain the gradually growing model zoo as the following: + + ```shell + [model]_(model setting)_[backbone]_[neck]_(norm setting)_(misc)_(gpu x batch)_[schedule]_[dataset].py, + ``` + + where the (`misc`) includes DCN and GCBlock, etc. More details are illustrated in the [documentation for config](tutorials/config) + +- MMDetection V2.0 uses new ResNet Caffe backbones to reduce warnings when loading pre-trained models. Most of the new backbones' weights are the same as the former ones but do not have `conv.bias`, except that they use a different `img_norm_cfg`. Thus, the new backbone will not cause warning of unexpected keys. + +### Training Hyperparameters + +The change in training hyperparameters does not affect +model-level compatibility but slightly improves the performance. The major ones are: + +- The number of proposals after nms is changed from 2000 to 1000 by setting `nms_post=1000` and `max_num=1000`. + This slightly improves both mask AP and bbox AP by ~0.2% absolute. + +- The default box regression losses for Mask R-CNN, Faster R-CNN and RetinaNet are changed from smooth L1 Loss to L1 loss. This leads to an overall improvement in box AP (~0.6% absolute). 
However, using L1-loss for other methods such as Cascade R-CNN and HTC does not improve the performance, so we keep the original settings for these methods. + +- The sample num of RoIAlign layer is set to be 0 for simplicity. This leads to slightly improvement on mask AP (~0.2% absolute). + +- The default setting does not use gradient clipping anymore during training for faster training speed. This does not degrade performance of the most of models. For some models such as RepPoints we keep using gradient clipping to stabilize the training process and to obtain better performance. + +- The default warmup ratio is changed from 1/3 to 0.001 for a more smooth warming up process since the gradient clipping is usually not used. The effect is found negligible during our re-benchmarking, though. + +### Upgrade Models from 1.x to 2.0 + +To convert the models trained by MMDetection V1.x to MMDetection V2.0, the users can use the script `tools/model_converters/upgrade_model_version.py` to convert +their models. The converted models can be run in MMDetection V2.0 with slightly dropped performance (less than 1% AP absolute). +Details can be found in `configs/legacy`. + +## pycocotools compatibility + +`mmpycocotools` is the OpenMMlab's fork of official `pycocotools`, which works for both MMDetection and Detectron2. +Before [PR 4939](https://github.com/open-mmlab/mmdetection/pull/4939), since `pycocotools` and `mmpycocotool` have the same package name, if users already installed `pycocotools` (installed Detectron2 first under the same environment), then the setup of MMDetection will skip installing `mmpycocotool`. Thus MMDetection fails due to the missing `mmpycocotools`. +If MMDetection is installed before Detectron2, they could work under the same environment. +[PR 4939](https://github.com/open-mmlab/mmdetection/pull/4939) deprecates mmpycocotools in favor of official pycocotools. +Users may install MMDetection and Detectron2 under the same environment after [PR 4939](https://github.com/open-mmlab/mmdetection/pull/4939), no matter what the installation order is. diff --git a/mmdetection/docs/en/notes/contribution_guide.md b/mmdetection/docs/en/notes/contribution_guide.md new file mode 100644 index 0000000..d622c0a --- /dev/null +++ b/mmdetection/docs/en/notes/contribution_guide.md @@ -0,0 +1 @@ +# Contribution diff --git a/mmdetection/docs/en/notes/faq.md b/mmdetection/docs/en/notes/faq.md new file mode 100644 index 0000000..9e3c1a7 --- /dev/null +++ b/mmdetection/docs/en/notes/faq.md @@ -0,0 +1,240 @@ +# Frequently Asked Questions + +We list some common troubles faced by many users and their corresponding solutions here. Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. If the contents here do not cover your issue, please create an issue using the [provided templates](https://github.com/open-mmlab/mmdetection/blob/main/.github/ISSUE_TEMPLATE/error-report.md/) and make sure you fill in all required information in the template. + +## PyTorch 2.0 Support + +The vast majority of algorithms in MMDetection now support PyTorch 2.0 and its `torch.compile` function. Users only need to install MMDetection 3.0.0rc7 or later versions to enjoy this feature. If any unsupported algorithms are found during use, please feel free to give us feedback. We also welcome contributions from the community to benchmark the speed improvement brought by using the `torch.compile` function. 
+ +To enable the `torch.compile` function, simply add `--cfg-options compile=True` after `train.py` or `test.py`. For example, to enable `torch.compile` for RTMDet, you can use the following command: + +```shell +# Single GPU +python tools/train.py configs/rtmdet/rtmdet_s_8xb32-300e_coco.py --cfg-options compile=True + +# Single node multiple GPUs +./tools/dist_train.sh configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 8 --cfg-options compile=True + +# Single node multiple GPUs + AMP +./tools/dist_train.sh configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 8 --cfg-options compile=True --amp +``` + +It is important to note that PyTorch 2.0's support for dynamic shapes is not yet fully developed. In most object detection algorithms, not only are the input shapes dynamic, but the loss calculation and post-processing parts are also dynamic. This can lead to slower training speeds when using the `torch.compile` function. Therefore, if you wish to enable the `torch.compile` function, you should follow these principles: + +1. Input images to the network are fixed shape, not multi-scale +2. set `torch._dynamo.config.cache_size_limit` parameter. TorchDynamo will convert and cache the Python bytecode, and the compiled functions will be stored in the cache. When the next check finds that the function needs to be recompiled, the function will be recompiled and cached. However, if the number of recompilations exceeds the maximum value set (64), the function will no longer be cached or recompiled. As mentioned above, the loss calculation and post-processing parts of the object detection algorithm are also dynamically calculated, and these functions need to be recompiled every time. Therefore, setting the `torch._dynamo.config.cache_size_limit` parameter to a smaller value can effectively reduce the compilation time + +In MMDetection, you can set the `torch._dynamo.config.cache_size_limit` parameter through the environment variable `DYNAMO_CACHE_SIZE_LIMIT`. For example, the command is as follows: + +```shell +# Single GPU +export DYNAMO_CACHE_SIZE_LIMIT = 4 +python tools/train.py configs/rtmdet/rtmdet_s_8xb32-300e_coco.py --cfg-options compile=True + +# Single node multiple GPUs +export DYNAMO_CACHE_SIZE_LIMIT = 4 +./tools/dist_train.sh configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 8 --cfg-options compile=True +``` + +About the common questions about PyTorch 2.0's dynamo, you can refer to [here](https://pytorch.org/docs/stable/dynamo/faq.html) + +## Installation + +Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer"; "AssertionError: MMCV==xxx is used but incompatible. Please install mmcv>=xxx, \<=xxx." + +Compatible MMDetection, MMEngine, and MMCV versions are shown as below. Please choose the correct version of MMCV to avoid installation issues. 
+ +| MMDetection version | MMCV version | MMEngine version | +| :-----------------: | :---------------------: | :----------------------: | +| main | mmcv>=2.0.0, \<2.2.0 | mmengine>=0.7.1, \<1.0.0 | +| 3.2.0 | mmcv>=2.0.0, \<2.2.0 | mmengine>=0.7.1, \<1.0.0 | +| 3.1.0 | mmcv>=2.0.0, \<2.1.0 | mmengine>=0.7.1, \<1.0.0 | +| 3.0.0 | mmcv>=2.0.0, \<2.1.0 | mmengine>=0.7.1, \<1.0.0 | +| 3.0.0rc6 | mmcv>=2.0.0rc4, \<2.1.0 | mmengine>=0.6.0, \<1.0.0 | +| 3.0.0rc5 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.3.0, \<1.0.0 | +| 3.0.0rc4 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.3.0, \<1.0.0 | +| 3.0.0rc3 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.3.0, \<1.0.0 | +| 3.0.0rc2 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.1.0, \<1.0.0 | +| 3.0.0rc1 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.1.0, \<1.0.0 | +| 3.0.0rc0 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.1.0, \<1.0.0 | + +**Note:** + +1. If you want to install mmdet-v2.x, the compatible MMDetection and MMCV versions table can be found at [here](https://mmdetection.readthedocs.io/en/stable/faq.html#installation). Please choose the correct version of MMCV to avoid installation issues. +2. In MMCV-v2.x, `mmcv-full` is rename to `mmcv`, if you want to install `mmcv` without CUDA ops, you can install `mmcv-lite`. + +- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'". + + 1. Uninstall existing `mmcv-lite` in the environment using `pip uninstall mmcv-lite`. + 2. Install `mmcv` following the [installation instruction](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html). + +- "Microsoft Visual C++ 14.0 or graeter is required" during installation on Windows. + + This error happens when building the 'pycocotools.\_mask' extension of pycocotools and the environment lacks corresponding C++ compilation dependencies. You need to download it at Microsoft officials [visual-cpp-build-tools](https://visualstudio.microsoft.com/zh-hans/visual-cpp-build-tools/), select the "Use C ++ Desktop Development" option to install the minimum dependencies, and then reinstall pycocotools. + +- Using Albumentations + + If you would like to use `albumentations`, we suggest using `pip install -r requirements/albu.txt` or + `pip install -U albumentations --no-binary qudida,albumentations`. + If you simply use `pip install albumentations>=0.3.2`, it will install `opencv-python-headless` simultaneously (even though you have already installed `opencv-python`). + Please refer to the [official documentation](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies) for details. + +- ModuleNotFoundError is raised when using some algorithms + + Some extra dependencies are required for Instaboost, Panoptic Segmentation, LVIS dataset, etc. Please note the error message and install corresponding packages, e.g., + + ```shell + # for instaboost + pip install instaboostfast + # for panoptic segmentation + pip install git+https://github.com/cocodataset/panopticapi.git + # for LVIS dataset + pip install git+https://github.com/lvis-dataset/lvis-api.git + ``` + +## Coding + +- Do I need to reinstall mmdet after some code modifications + + If you follow the best practice and install mmdet with `pip install -e .`, any local modifications made to the code will take effect without reinstallation. + +- How to develop with multiple MMDetection versions + + You can have multiple folders like mmdet-3.0, mmdet-3.1. + When you run the train or test script, it will adopt the mmdet package in the current folder. 
+ + To use the default MMDetection installed in the environment rather than the one you are working with, you can remove the following line in those scripts: + + ```shell + PYTHONPATH="$(dirname $0)/..":$PYTHONPATH + ``` + +## PyTorch/CUDA Environment + +- "RTX 30 series card fails when building MMCV or MMDet" + + 1. Temporary work-around: do `MMCV_WITH_OPS=1 MMCV_CUDA_ARGS='-gencode=arch=compute_80,code=sm_80' pip install -e .`. + The common issue is `nvcc fatal : Unsupported gpu architecture 'compute_86'`. This means that the compiler should optimize for sm_86, i.e., nvidia 30 series card, but such optimizations have not been supported by CUDA toolkit 11.0. + This work-around modifies the compile flag by adding `MMCV_CUDA_ARGS='-gencode=arch=compute_80,code=sm_80'`, which tells `nvcc` to optimize for **sm_80**, i.e., Nvidia A100. Although A100 is different from the 30 series card, they use similar ampere architecture. This may hurt the performance but it works. + 2. PyTorch developers have updated that the default compiler flags should be fixed by [pytorch/pytorch#47585](https://github.com/pytorch/pytorch/pull/47585). So using PyTorch-nightly may also be able to solve the problem, though we have not tested it yet. + +- "invalid device function" or "no kernel image is available for execution". + + 1. Check if your cuda runtime version (under `/usr/local/`), `nvcc --version` and `conda list cudatoolkit` version match. + 2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built for the correct GPU architecture. + You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. + The GPU arch table could be found [here](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list), + i.e. run `TORCH_CUDA_ARCH_LIST=7.0 pip install mmcv` to build MMCV for Volta GPUs. + The compatibility issue could happen when using old GPUS, e.g., Tesla K80 (3.7) on colab. + 3. Check whether the running environment is the same as that when mmcv/mmdet has compiled. + For example, you may compile mmcv using CUDA 10.0 but run it on CUDA 9.0 environments. + +- "undefined symbol" or "cannot open xxx.so". + + 1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check whether the CUDA/GCC runtimes are the same as those used for compiling mmcv, + i.e. run `python mmdet/utils/collect_env.py` to see if `"MMCV Compiler"`/`"MMCV CUDA Compiler"` is the same as `"GCC"`/`"CUDA_HOME"`. + 2. If those symbols are PyTorch symbols (e.g., symbols containing caffe, aten, and TH), check whether the PyTorch version is the same as that used for compiling mmcv. + 3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built by and running on the same environment. + +- setuptools.sandbox.UnpickleableException: DistutilsSetupError("each element of 'ext_modules' option must be an Extension instance or 2-tuple") + + 1. If you are using miniconda rather than anaconda, check whether Cython is installed as indicated in [#3379](https://github.com/open-mmlab/mmdetection/issues/3379). + You need to manually install Cython first and then run command `pip install -r requirements.txt`. + 2. You may also need to check the compatibility between the `setuptools`, `Cython`, and `PyTorch` in your environment. + +- "Segmentation fault". + + 1. Check you GCC version and use GCC 5.4. This usually caused by the incompatibility between PyTorch and the environment (e.g., GCC \< 4.9 for PyTorch). 
We also recommend the users to avoid using GCC 5.5 because many feedbacks report that GCC 5.5 will cause "segmentation fault" and simply changing it to GCC 5.4 could solve the problem. + + 2. Check whether PyTorch is correctly installed and could use CUDA op, e.g. type the following command in your terminal. + + ```shell + python -c 'import torch; print(torch.cuda.is_available())' + ``` + + And see whether they could correctly output results. + + 3. If Pytorch is correctly installed, check whether MMCV is correctly installed. + + ```shell + python -c 'import mmcv; import mmcv.ops' + ``` + + If MMCV is correctly installed, then there will be no issue of the above two commands. + + 4. If MMCV and Pytorch is correctly installed, you man use `ipdb`, `pdb` to set breakpoints or directly add 'print' in mmdetection code and see which part leads the segmentation fault. + +## Training + +- "Loss goes Nan" + + 1. Check if the dataset annotations are valid: zero-size bounding boxes will cause the regression loss to be Nan due to the commonly used transformation for box regression. Some small size (width or height are smaller than 1) boxes will also cause this problem after data augmentation (e.g., instaboost). So check the data and try to filter out those zero-size boxes and skip some risky augmentations on the small-size boxes when you face the problem. + 2. Reduce the learning rate: the learning rate might be too large due to some reasons, e.g., change of batch size. You can rescale them to the value that could stably train the model. + 3. Extend the warmup iterations: some models are sensitive to the learning rate at the start of the training. You can extend the warmup iterations, e.g., change the `warmup_iters` from 500 to 1000 or 2000. + 4. Add gradient clipping: some models requires gradient clipping to stabilize the training process. The default of `grad_clip` is `None`, you can add gradient clippint to avoid gradients that are too large, i.e., set `optim_wrapper=dict(clip_grad=dict(max_norm=35, norm_type=2))` in your config file. + +- "GPU out of memory" + + 1. There are some scenarios when there are large amount of ground truth boxes, which may cause OOM during target assignment. You can set `gpu_assign_thr=N` in the config of assigner thus the assigner will calculate box overlaps through CPU when there are more than N GT boxes. + + 2. Set `with_cp=True` in the backbone. This uses the sublinear strategy in PyTorch to reduce GPU memory cost in the backbone. + + 3. Try mixed precision training using following the examples in `config/fp16`. The `loss_scale` might need further tuning for different models. + + 4. Try to use `AvoidCUDAOOM` to avoid GPU out of memory. It will first retry after calling `torch.cuda.empty_cache()`. If it still fails, it will then retry by converting the type of inputs to FP16 format. If it still fails, it will try to copy inputs from GPUs to CPUs to continue computing. Try AvoidOOM in you code to make the code continue to run when GPU memory runs out: + + ```python + from mmdet.utils import AvoidCUDAOOM + + output = AvoidCUDAOOM.retry_if_cuda_oom(some_function)(input1, input2) + ``` + + You can also try `AvoidCUDAOOM` as a decorator to make the code continue to run when GPU memory runs out: + + ```python + from mmdet.utils import AvoidCUDAOOM + + @AvoidCUDAOOM.retry_if_cuda_oom + def function(*args, **kwargs): + ... + return xxx + ``` + +- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" + + 1. 
This error indicates that your module has parameters that were not used in producing loss. This phenomenon may be caused by running different branches in your code in DDP mode. + 2. You can set `find_unused_parameters = True` in the config to solve the above problems, but this will slow down the training speed. + 3. You can set `detect_anomalous_params = True` in the config or `model_wrapper_cfg = dict(type='MMDistributedDataParallel', detect_anomalous_params=True)` (More details please refer to [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py#L91)) to get the name of those unused parameters. Note `detect_anomalous_params = True` will slow down the training speed, so it is recommended for debugging only. + +- Save the best model + + It can be turned on by configuring `default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto'),`. In the case of the `auto` parameter, the first key in the returned evaluation result will be used as the basis for selecting the best model. You can also directly set the key in the evaluation result to manually set it, for example, `save_best='coco/bbox_mAP'`. + +## Evaluation + +- COCO Dataset, AP or AR = -1 + 1. According to the definition of COCO dataset, the small and medium areas in an image are less than 1024 (32\*32), 9216 (96\*96), respectively. + 2. If the corresponding area has no object, the result of AP and AR will set to -1. + +## Model + +- `style` in ResNet + + The `style` parameter in ResNet allows either `pytorch` or `caffe` style. It indicates the difference in the Bottleneck module. Bottleneck is a stacking structure of `1x1-3x3-1x1` convolutional layers. In the case of `caffe` mode, the convolution layer with `stride=2` is the first `1x1` convolution, while in `pyorch` mode, it is the second `3x3` convolution has `stride=2`. A sample code is as below: + + ```python + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + ``` + +- ResNeXt parameter description + + ResNeXt comes from the paper [`Aggregated Residual Transformations for Deep Neural Networks`](https://arxiv.org/abs/1611.05431). It introduces group and uses “cardinality” to control the number of groups to achieve a balance between accuracy and complexity. It controls the basic width and grouping parameters of the internal Bottleneck module through two hyperparameters `baseWidth` and `cardinality`. An example configuration name in MMDetection is `mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py`, where `mask_rcnn` represents the algorithm using Mask R-CNN, `x101` represents the backbone network using ResNeXt-101, and `64x4d` represents that the bottleneck block has 64 group and each group has basic width of 4. + +- `norm_eval` in backbone + + Since the detection model is usually large and the input image resolution is high, this will result in a small batch of the detection model, which will make the variance of the statistics calculated by BatchNorm during the training process very large and not as stable as the statistics obtained during the pre-training of the backbone network . Therefore, the `norm_eval=True` mode is generally used in training, and the BatchNorm statistics in the pre-trained backbone network are directly used. The few algorithms that use large batches are the `norm_eval=False` mode, such as NASFPN. 
For the backbone network without ImageNet pre-training and the batch is relatively small, you can consider using `SyncBN`. diff --git a/mmdetection/docs/en/notes/projects.md b/mmdetection/docs/en/notes/projects.md new file mode 100644 index 0000000..3123e2b --- /dev/null +++ b/mmdetection/docs/en/notes/projects.md @@ -0,0 +1,57 @@ +# Projects based on MMDetection + +There are many projects built upon MMDetection. +We list some of them as examples of how to extend MMDetection for your own projects. +As the page might not be completed, please feel free to create a PR to update this page. + +## Projects as an extension + +Some projects extend the boundary of MMDetection for deployment or other research fields. +They reveal the potential of what MMDetection can do. We list several of them as below. + +- [OTEDetection](https://github.com/opencv/mmdetection): OpenVINO training extensions for object detection. +- [MMDetection3d](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. + +## Projects of papers + +There are also projects released with papers. +Some of the papers are published in top-tier conferences (CVPR, ICCV, and ECCV), the others are also highly influential. +To make this list also a reference for the community to develop and compare new object detection algorithms, we list them following the time order of top-tier conferences. +Methods already supported and maintained by MMDetection are not listed. + +- Involution: Inverting the Inherence of Convolution for Visual Recognition, CVPR21. [\[paper\]](https://arxiv.org/abs/2103.06255)[\[github\]](https://github.com/d-li14/involution) +- Multiple Instance Active Learning for Object Detection, CVPR 2021. [\[paper\]](https://openaccess.thecvf.com/content/CVPR2021/papers/Yuan_Multiple_Instance_Active_Learning_for_Object_Detection_CVPR_2021_paper.pdf)[\[github\]](https://github.com/yuantn/MI-AOD) +- Adaptive Class Suppression Loss for Long-Tail Object Detection, CVPR 2021. [\[paper\]](https://arxiv.org/abs/2104.00885)[\[github\]](https://github.com/CASIA-IVA-Lab/ACSL) +- Generalizable Pedestrian Detection: The Elephant In The Room, CVPR2021. [\[paper\]](https://arxiv.org/abs/2003.08799)[\[github\]](https://github.com/hasanirtiza/Pedestron) +- Group Fisher Pruning for Practical Network Compression, ICML2021. [\[paper\]](https://github.com/jshilong/FisherPruning/blob/main/resources/paper.pdf)[\[github\]](https://github.com/jshilong/FisherPruning) +- Overcoming Classifier Imbalance for Long-tail Object Detection with Balanced Group Softmax, CVPR2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/papers/Li_Overcoming_Classifier_Imbalance_for_Long-Tail_Object_Detection_With_Balanced_Group_CVPR_2020_paper.pdf)[\[github\]](https://github.com/FishYuLi/BalancedGroupSoftmax) +- Coherent Reconstruction of Multiple Humans from a Single Image, CVPR2020. [\[paper\]](https://jiangwenpl.github.io/multiperson/)[\[github\]](https://github.com/JiangWenPL/multiperson) +- Look-into-Object: Self-supervised Structure Modeling for Object Recognition, CVPR 2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/papers/Zhou_Look-Into-Object_Self-Supervised_Structure_Modeling_for_Object_Recognition_CVPR_2020_paper.pdf)[\[github\]](https://github.com/JDAI-CV/LIO) +- Video Panoptic Segmentation, CVPR2020. [\[paper\]](https://arxiv.org/abs/2006.11339)[\[github\]](https://github.com/mcahny/vps) +- D2Det: Towards High Quality Object Detection and Instance Segmentation, CVPR2020. 
[\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/html/Cao_D2Det_Towards_High_Quality_Object_Detection_and_Instance_Segmentation_CVPR_2020_paper.html)[\[github\]](https://github.com/JialeCao001/D2Det) +- CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.09119)[\[github\]](https://github.com/KiveeDong/CentripetalNet) +- Learning a Unified Sample Weighting Network for Object Detection, CVPR 2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/html/Cai_Learning_a_Unified_Sample_Weighting_Network_for_Object_Detection_CVPR_2020_paper.html)[\[github\]](https://github.com/caiqi/sample-weighting-network) +- Scale-equalizing Pyramid Convolution for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2005.03101) [\[github\]](https://github.com/jshilong/SEPC) +- Revisiting the Sibling Head in Object Detector, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.07540)[\[github\]](https://github.com/Sense-X/TSD) +- PolarMask: Single Shot Instance Segmentation with Polar Representation, CVPR2020. [\[paper\]](https://arxiv.org/abs/1909.13226)[\[github\]](https://github.com/xieenze/PolarMask) +- Hit-Detector: Hierarchical Trinity Architecture Search for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.11818)[\[github\]](https://github.com/ggjy/HitDet.pytorch) +- ZeroQ: A Novel Zero Shot Quantization Framework, CVPR2020. [\[paper\]](https://arxiv.org/abs/2001.00281)[\[github\]](https://github.com/amirgholami/ZeroQ) +- CBNet: A Novel Composite Backbone Network Architecture for Object Detection, AAAI2020. [\[paper\]](https://aaai.org/Papers/AAAI/2020GB/AAAI-LiuY.1833.pdf)[\[github\]](https://github.com/VDIGPKU/CBNet) +- RDSNet: A New Deep Architecture for Reciprocal Object Detection and Instance Segmentation, AAAI2020. [\[paper\]](https://arxiv.org/abs/1912.05070)[\[github\]](https://github.com/wangsr126/RDSNet) +- Training-Time-Friendly Network for Real-Time Object Detection, AAAI2020. [\[paper\]](https://arxiv.org/abs/1909.00700)[\[github\]](https://github.com/ZJULearning/ttfnet) +- Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution, NeurIPS 2019. [\[paper\]](https://arxiv.org/abs/1909.06720)[\[github\]](https://github.com/thangvubk/Cascade-RPN) +- Reasoning R-CNN: Unifying Adaptive Global Reasoning into Large-scale Object Detection, CVPR2019. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2019/papers/Xu_Reasoning-RCNN_Unifying_Adaptive_Global_Reasoning_Into_Large-Scale_Object_Detection_CVPR_2019_paper.pdf)[\[github\]](https://github.com/chanyn/Reasoning-RCNN) +- Learning RoI Transformer for Oriented Object Detection in Aerial Images, CVPR2019. [\[paper\]](https://arxiv.org/abs/1812.00155)[\[github\]](https://github.com/dingjiansw101/AerialDetection) +- SOLO: Segmenting Objects by Locations. [\[paper\]](https://arxiv.org/abs/1912.04488)[\[github\]](https://github.com/WXinlong/SOLO) +- SOLOv2: Dynamic, Faster and Stronger. [\[paper\]](https://arxiv.org/abs/2003.10152)[\[github\]](https://github.com/WXinlong/SOLO) +- Dense Peppoints: Representing Visual Objects with Dense Point Sets. [\[paper\]](https://arxiv.org/abs/1912.11473)[\[github\]](https://github.com/justimyhxu/Dense-RepPoints) +- IterDet: Iterative Scheme for Object Detection in Crowded Environments. [\[paper\]](https://arxiv.org/abs/2005.05708)[\[github\]](https://github.com/saic-vul/iterdet) +- Cross-Iteration Batch Normalization. 
[\[paper\]](https://arxiv.org/abs/2002.05712)[\[github\]](https://github.com/Howal/Cross-iterationBatchNorm) +- A Ranking-based, Balanced Loss Function Unifying Classification and Localisation in Object Detection, NeurIPS2020 [\[paper\]](https://arxiv.org/abs/2009.13592)[\[github\]](https://github.com/kemaloksuz/aLRPLoss) +- RelationNet++: Bridging Visual Representations for Object Detection via Transformer Decoder, NeurIPS2020 [\[paper\]](https://arxiv.org/abs/2010.15831)[\[github\]](https://github.com/microsoft/RelationNet2) +- Generalized Focal Loss V2: Learning Reliable Localization Quality Estimation for Dense Object Detection, CVPR2021[\[paper\]](https://arxiv.org/abs/2011.12885)[\[github\]](https://github.com/implus/GFocalV2) +- Swin Transformer: Hierarchical Vision Transformer using Shifted Windows, ICCV2021[\[paper\]](https://arxiv.org/abs/2103.14030)[\[github\]](https://github.com/SwinTransformer/) +- Focal Transformer: Focal Self-attention for Local-Global Interactions in Vision Transformers, NeurIPS2021[\[paper\]](https://arxiv.org/abs/2107.00641)[\[github\]](https://github.com/microsoft/Focal-Transformer) +- End-to-End Semi-Supervised Object Detection with Soft Teacher, ICCV2021[\[paper\]](https://arxiv.org/abs/2106.09018)[\[github\]](https://github.com/microsoft/SoftTeacher) +- CBNetV2: A Novel Composite Backbone Network Architecture for Object Detection [\[paper\]](http://arxiv.org/abs/2107.00420)[\[github\]](https://github.com/VDIGPKU/CBNetV2) +- Instances as Queries, ICCV2021 [\[paper\]](https://openaccess.thecvf.com/content/ICCV2021/papers/Fang_Instances_As_Queries_ICCV_2021_paper.pdf)[\[github\]](https://github.com/hustvl/QueryInst) diff --git a/mmdetection/docs/en/overview.md b/mmdetection/docs/en/overview.md new file mode 100644 index 0000000..7c7d96b --- /dev/null +++ b/mmdetection/docs/en/overview.md @@ -0,0 +1,54 @@ +# OVERVIEW + +This chapter introduces you to the framework of MMDetection, and provides links to detailed tutorials about MMDetection. + +## What is MMDetection + +![image](https://user-images.githubusercontent.com/12907710/137271636-56ba1cd2-b110-4812-8221-b4c120320aa9.png) + +MMDetection is an object detection toolbox that contains a rich set of object detection, instance segmentation, and panoptic segmentation methods as well as related components and modules, and below is its whole framework: + +MMDetection consists of 7 main parts, apis, structures, datasets, models, engine, evaluation and visualization. + +- **apis** provides high-level APIs for model inference. +- **structures** provides data structures like bbox, mask, and DetDataSample. +- **datasets** supports various dataset for object detection, instance segmentation, and panoptic segmentation. + - **transforms** contains a lot of useful data augmentation transforms. + - **samplers** defines different data loader sampling strategy. +- **models** is the most vital part for detectors and contains different components of a detector. + - **detectors** defines all of the detection model classes. + - **data_preprocessors** is for preprocessing the input data of the model. + - **backbones** contains various backbone networks. + - **necks** contains various neck components. + - **dense_heads** contains various detection heads that perform dense predictions. + - **roi_heads** contains various detection heads that predict from RoIs. + - **seg_heads** contains various segmentation heads. + - **losses** contains various loss functions. + - **task_modules** provides modules for detection tasks. E.g. 
assigners, samplers, box coders, and prior generators. + - **layers** provides some basic neural network layers. +- **engine** is a part for runtime components. + - **runner** provides extensions for [MMEngine's runner](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). + - **schedulers** provides schedulers for adjusting optimization hyperparameters. + - **optimizers** provides optimizers and optimizer wrappers. + - **hooks** provides various hooks of the runner. +- **evaluation** provides different metrics for evaluating model performance. +- **visualization** is for visualizing detection results. + +## How to Use this Guide + +Here is a detailed step-by-step guide to learn more about MMDetection: + +1. For installation instructions, please see [get_started](get_started.md). + +2. Refer to the below tutorials for the basic usage of MMDetection. + + - [Train and Test](https://mmdetection.readthedocs.io/en/latest/user_guides/index.html#train-test) + + - [Useful Tools](https://mmdetection.readthedocs.io/en/latest/user_guides/index.html#useful-tools) + +3. Refer to the below tutorials to dive deeper: + + - [Basic Concepts](https://mmdetection.readthedocs.io/en/latest/advanced_guides/index.html#basic-concepts) + - [Component Customization](https://mmdetection.readthedocs.io/en/latest/advanced_guides/index.html#component-customization) + +4. For users of MMDetection 2.x version, we provide a guide to help you adapt to the new version. You can find it in the [migration guide](./migration/migration.md). diff --git a/mmdetection/docs/en/stat.py b/mmdetection/docs/en/stat.py new file mode 100755 index 0000000..f0589e3 --- /dev/null +++ b/mmdetection/docs/en/stat.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +import functools as func +import glob +import os.path as osp +import re + +import numpy as np + +url_prefix = 'https://github.com/open-mmlab/mmdetection/blob/main/configs' + +files = sorted(glob.glob('../../configs/*/README.md')) + +stats = [] +titles = [] +num_ckpts = 0 + +for f in files: + url = osp.dirname(f.replace('../../configs', url_prefix)) + + with open(f, 'r') as content_file: + content = content_file.read() + + title = content.split('\n')[0].replace('# ', '').strip() + ckpts = set(x.lower().strip() + for x in re.findall(r'\[model\]\((https?.*)\)', content)) + + if len(ckpts) == 0: + continue + + _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)] + assert len(_papertype) > 0 + papertype = _papertype[0] + + paper = set([(papertype, title)]) + + titles.append(title) + num_ckpts += len(ckpts) + + statsmsg = f""" +\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts) +""" + stats.append((paper, ckpts, statsmsg)) + +allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _ in stats]) +msglist = '\n'.join(x for _, _, x in stats) + +papertypes, papercounts = np.unique([t for t, _ in allpapers], + return_counts=True) +countstr = '\n'.join( + [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) + +modelzoo = f""" +# Model Zoo Statistics + +* Number of papers: {len(set(titles))} +{countstr} + +* Number of checkpoints: {num_ckpts} + +{msglist} +""" + +with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) diff --git a/mmdetection/docs/en/switch_language.md b/mmdetection/docs/en/switch_language.md new file mode 100644 index 0000000..b2c4ad9 --- /dev/null +++ b/mmdetection/docs/en/switch_language.md @@ -0,0 +1,3 @@ +## English + +## 简体中文 diff --git a/mmdetection/docs/en/user_guides/config.md b/mmdetection/docs/en/user_guides/config.md new file 
mode 100644 index 0000000..69bd911 --- /dev/null +++ b/mmdetection/docs/en/user_guides/config.md @@ -0,0 +1,612 @@ +# Learn about Configs + +MMDetection and other OpenMMLab repositories use [MMEngine's config system](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html). It has a modular and inheritance design, which is convenient to conduct various experiments. + +## Config file content + +MMDetection uses a modular design, all modules with different functions can be configured through the config. Taking Mask R-CNN as an example, we will introduce each field in the config according to different function modules: + +### Model config + +In MMDetection's config, we use `model` to set up detection algorithm components. In addition to neural network components such as `backbone`, `neck`, etc, it also requires `data_preprocessor`, `train_cfg`, and `test_cfg`. `data_preprocessor` is responsible for processing a batch of data output by dataloader. `train_cfg`, and `test_cfg` in the model config are for training and testing hyperparameters of the components. + +```python +model = dict( + type='MaskRCNN', # The name of detector + data_preprocessor=dict( # The config of data preprocessor, usually includes image normalization and padding + type='DetDataPreprocessor', # The type of the data preprocessor, refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.data_preprocessors.DetDataPreprocessor + mean=[123.675, 116.28, 103.53], # Mean values used to pre-training the pre-trained backbone models, ordered in R, G, B + std=[58.395, 57.12, 57.375], # Standard variance used to pre-training the pre-trained backbone models, ordered in R, G, B + bgr_to_rgb=True, # whether to convert image from BGR to RGB + pad_mask=True, # whether to pad instance masks + pad_size_divisor=32), # The size of padded image should be divisible by ``pad_size_divisor`` + backbone=dict( # The config of backbone + type='ResNet', # The type of backbone network. Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.backbones.ResNet + depth=50, # The depth of backbone, usually it is 50 or 101 for ResNet and ResNext backbones. + num_stages=4, # Number of stages of the backbone. + out_indices=(0, 1, 2, 3), # The index of output feature maps produced in each stage + frozen_stages=1, # The weights in the first stage are frozen + norm_cfg=dict( # The config of normalization layers. + type='BN', # Type of norm layer, usually it is BN or GN + requires_grad=True), # Whether to train the gamma and beta in BN + norm_eval=True, # Whether to freeze the statistics in BN + style='pytorch', # The style of backbone, 'pytorch' means that stride 2 layers are in 3x3 Conv, 'caffe' means stride 2 layers are in 1x1 Convs. + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), # The ImageNet pretrained backbone to be loaded + neck=dict( + type='FPN', # The neck of detector is FPN. We also support 'NASFPN', 'PAFPN', etc. Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.necks.FPN for more details. + in_channels=[256, 512, 1024, 2048], # The input channels, this is consistent with the output channels of backbone + out_channels=256, # The output channels of each level of the pyramid feature map + num_outs=5), # The number of output scales + rpn_head=dict( + type='RPNHead', # The type of RPN head is 'RPNHead', we also support 'GARPNHead', etc. Refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.dense_heads.RPNHead for more details. 
+ in_channels=256, # The input channels of each input feature map, this is consistent with the output channels of neck + feat_channels=256, # Feature channels of convolutional layers in the head. + anchor_generator=dict( # The config of anchor generator + type='AnchorGenerator', # Most of methods use AnchorGenerator, SSD Detectors uses `SSDAnchorGenerator`. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/prior_generators/anchor_generator.py#L18 for more details + scales=[8], # Basic scale of the anchor, the area of the anchor in one position of a feature map will be scale * base_sizes + ratios=[0.5, 1.0, 2.0], # The ratio between height and width. + strides=[4, 8, 16, 32, 64]), # The strides of the anchor generator. This is consistent with the FPN feature strides. The strides will be taken as base_sizes if base_sizes is not set. + bbox_coder=dict( # Config of box coder to encode and decode the boxes during training and testing + type='DeltaXYWHBBoxCoder', # Type of box coder. 'DeltaXYWHBBoxCoder' is applied for most of the methods. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py#L13 for more details. + target_means=[0.0, 0.0, 0.0, 0.0], # The target means used to encode and decode boxes + target_stds=[1.0, 1.0, 1.0, 1.0]), # The standard variance used to encode and decode boxes + loss_cls=dict( # Config of loss function for the classification branch + type='CrossEntropyLoss', # Type of loss for classification branch, we also support FocalLoss etc. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/losses/cross_entropy_loss.py#L201 for more details + use_sigmoid=True, # RPN usually performs two-class classification, so it usually uses the sigmoid function. + loss_weight=1.0), # Loss weight of the classification branch. + loss_bbox=dict( # Config of loss function for the regression branch. + type='L1Loss', # Type of loss, we also support many IoU Losses and smooth L1-loss, etc. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/losses/smooth_l1_loss.py#L56 for implementation. + loss_weight=1.0)), # Loss weight of the regression branch. + roi_head=dict( # RoIHead encapsulates the second stage of two-stage/cascade detectors. + type='StandardRoIHead', + bbox_roi_extractor=dict( # RoI feature extractor for bbox regression. + type='SingleRoIExtractor', # Type of the RoI feature extractor, most of methods uses SingleRoIExtractor. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py#L13 for details. + roi_layer=dict( # Config of RoI Layer + type='RoIAlign', # Type of RoI Layer, DeformRoIPoolingPack and ModulatedDeformRoIPoolingPack are also supported. Refer to https://mmcv.readthedocs.io/en/latest/api.html#mmcv.ops.RoIAlign for details. + output_size=7, # The output size of feature maps. + sampling_ratio=0), # Sampling ratio when extracting the RoI features. 0 means adaptive ratio. + out_channels=256, # output channels of the extracted feature. + featmap_strides=[4, 8, 16, 32]), # Strides of multi-scale feature maps. It should be consistent with the architecture of the backbone. + bbox_head=dict( # Config of box head in the RoIHead. + type='Shared2FCBBoxHead', # Type of the bbox head, Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py#L220 for implementation details. 
+ in_channels=256, # Input channels for bbox head. This is consistent with the out_channels in roi_extractor + fc_out_channels=1024, # Output feature channels of FC layers. + roi_feat_size=7, # Size of RoI features + num_classes=80, # Number of classes for classification + bbox_coder=dict( # Box coder used in the second stage. + type='DeltaXYWHBBoxCoder', # Type of box coder. 'DeltaXYWHBBoxCoder' is applied for most of the methods. + target_means=[0.0, 0.0, 0.0, 0.0], # Means used to encode and decode box + target_stds=[0.1, 0.1, 0.2, 0.2]), # Standard variance for encoding and decoding. It is smaller since the boxes are more accurate. [0.1, 0.1, 0.2, 0.2] is a conventional setting. + reg_class_agnostic=False, # Whether the regression is class agnostic. + loss_cls=dict( # Config of loss function for the classification branch + type='CrossEntropyLoss', # Type of loss for classification branch, we also support FocalLoss etc. + use_sigmoid=False, # Whether to use sigmoid. + loss_weight=1.0), # Loss weight of the classification branch. + loss_bbox=dict( # Config of loss function for the regression branch. + type='L1Loss', # Type of loss, we also support many IoU Losses and smooth L1-loss, etc. + loss_weight=1.0)), # Loss weight of the regression branch. + mask_roi_extractor=dict( # RoI feature extractor for mask generation. + type='SingleRoIExtractor', # Type of the RoI feature extractor, most of methods uses SingleRoIExtractor. + roi_layer=dict( # Config of RoI Layer that extracts features for instance segmentation + type='RoIAlign', # Type of RoI Layer, DeformRoIPoolingPack and ModulatedDeformRoIPoolingPack are also supported + output_size=14, # The output size of feature maps. + sampling_ratio=0), # Sampling ratio when extracting the RoI features. + out_channels=256, # Output channels of the extracted feature. + featmap_strides=[4, 8, 16, 32]), # Strides of multi-scale feature maps. + mask_head=dict( # Mask prediction head + type='FCNMaskHead', # Type of mask head, refer to https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.roi_heads.FCNMaskHead for implementation details. + num_convs=4, # Number of convolutional layers in mask head. + in_channels=256, # Input channels, should be consistent with the output channels of mask roi extractor. + conv_out_channels=256, # Output channels of the convolutional layer. + num_classes=80, # Number of class to be segmented. + loss_mask=dict( # Config of loss function for the mask branch. + type='CrossEntropyLoss', # Type of loss used for segmentation + use_mask=True, # Whether to only train the mask in the correct class. + loss_weight=1.0))), # Loss weight of mask branch. + train_cfg = dict( # Config of training hyperparameters for rpn and rcnn + rpn=dict( # Training config of rpn + assigner=dict( # Config of assigner + type='MaxIoUAssigner', # Type of assigner, MaxIoUAssigner is used for many common detectors. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/assigners/max_iou_assigner.py#L14 for more details. + pos_iou_thr=0.7, # IoU >= threshold 0.7 will be taken as positive samples + neg_iou_thr=0.3, # IoU < threshold 0.3 will be taken as negative samples + min_pos_iou=0.3, # The minimal IoU threshold to take boxes as positive samples + match_low_quality=True, # Whether to match the boxes under low quality (see API doc for more details). 
+ ignore_iof_thr=-1), # IoF threshold for ignoring bboxes + sampler=dict( # Config of positive/negative sampler + type='RandomSampler', # Type of sampler, PseudoSampler and other samplers are also supported. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/samplers/random_sampler.py#L14 for implementation details. + num=256, # Number of samples + pos_fraction=0.5, # The ratio of positive samples in the total samples. + neg_pos_ub=-1, # The upper bound of negative samples based on the number of positive samples. + add_gt_as_proposals=False), # Whether add GT as proposals after sampling. + allowed_border=-1, # The border allowed after padding for valid anchors. + pos_weight=-1, # The weight of positive samples during training. + debug=False), # Whether to set the debug mode + rpn_proposal=dict( # The config to generate proposals during training + nms_across_levels=False, # Whether to do NMS for boxes across levels. Only work in `GARPNHead`, naive rpn does not support do nms cross levels. + nms_pre=2000, # The number of boxes before NMS + nms_post=1000, # The number of boxes to be kept by NMS. Only work in `GARPNHead`. + max_per_img=1000, # The number of boxes to be kept after NMS. + nms=dict( # Config of NMS + type='nms', # Type of NMS + iou_threshold=0.7 # NMS threshold + ), + min_bbox_size=0), # The allowed minimal box size + rcnn=dict( # The config for the roi heads. + assigner=dict( # Config of assigner for second stage, this is different for that in rpn + type='MaxIoUAssigner', # Type of assigner, MaxIoUAssigner is used for all roi_heads for now. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/assigners/max_iou_assigner.py#L14 for more details. + pos_iou_thr=0.5, # IoU >= threshold 0.5 will be taken as positive samples + neg_iou_thr=0.5, # IoU < threshold 0.5 will be taken as negative samples + min_pos_iou=0.5, # The minimal IoU threshold to take boxes as positive samples + match_low_quality=False, # Whether to match the boxes under low quality (see API doc for more details). + ignore_iof_thr=-1), # IoF threshold for ignoring bboxes + sampler=dict( + type='RandomSampler', # Type of sampler, PseudoSampler and other samplers are also supported. Refer to https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/samplers/random_sampler.py#L14 for implementation details. + num=512, # Number of samples + pos_fraction=0.25, # The ratio of positive samples in the total samples. + neg_pos_ub=-1, # The upper bound of negative samples based on the number of positive samples. + add_gt_as_proposals=True + ), # Whether add GT as proposals after sampling. + mask_size=28, # Size of mask + pos_weight=-1, # The weight of positive samples during training. + debug=False)), # Whether to set the debug mode + test_cfg = dict( # Config for testing hyperparameters for rpn and rcnn + rpn=dict( # The config to generate proposals during testing + nms_across_levels=False, # Whether to do NMS for boxes across levels. Only work in `GARPNHead`, naive rpn does not support do nms cross levels. + nms_pre=1000, # The number of boxes before NMS + nms_post=1000, # The number of boxes to be kept by NMS. Only work in `GARPNHead`. + max_per_img=1000, # The number of boxes to be kept after NMS. + nms=dict( # Config of NMS + type='nms', #Type of NMS + iou_threshold=0.7 # NMS threshold + ), + min_bbox_size=0), # The allowed minimal box size + rcnn=dict( # The config for the roi heads. 
+ score_thr=0.05, # Threshold to filter out boxes + nms=dict( # Config of NMS in the second stage + type='nms', # Type of NMS + iou_thr=0.5), # NMS threshold + max_per_img=100, # Max number of detections of each image + mask_thr_binary=0.5))) # Threshold of mask prediction +``` + +### Dataset and evaluator config + +[Dataloaders](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html) are required for the training, validation, and testing of the [runner](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). Dataset and data pipeline need to be set to build the dataloader. Due to the complexity of this part, we use intermediate variables to simplify the writing of dataloader configs. + +```python +dataset_type = 'CocoDataset' # Dataset type, this will be used to define the dataset +data_root = 'data/coco/' # Root path of data +backend_args = None # Arguments to instantiate the corresponding file backend + +train_pipeline = [ # Training data processing pipeline + dict(type='LoadImageFromFile', backend_args=backend_args), # First pipeline to load images from file path + dict( + type='LoadAnnotations', # Second pipeline to load annotations for current image + with_bbox=True, # Whether to use bounding box, True for detection + with_mask=True, # Whether to use instance mask, True for instance segmentation + poly2mask=True), # Whether to convert the polygon mask to instance mask, set False for acceleration and to save memory + dict( + type='Resize', # Pipeline that resizes the images and their annotations + scale=(1333, 800), # The largest scale of the images + keep_ratio=True # Whether to keep the ratio between height and width + ), + dict( + type='RandomFlip', # Augmentation pipeline that flips the images and their annotations + prob=0.5), # The probability to flip + dict(type='PackDetInputs') # Pipeline that formats the annotation data and decides which keys in the data should be packed into data_samples +] +test_pipeline = [ # Testing data processing pipeline + dict(type='LoadImageFromFile', backend_args=backend_args), # First pipeline to load images from file path + dict(type='Resize', scale=(1333, 800), keep_ratio=True), # Pipeline that resizes the images + dict( + type='PackDetInputs', # Pipeline that formats the annotation data and decides which keys in the data should be packed into data_samples + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( # Train dataloader config + batch_size=2, # Batch size of a single GPU + num_workers=2, # Worker to pre-fetch data for each single GPU + persistent_workers=True, # If ``True``, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed. + sampler=dict( # training data sampler + type='DefaultSampler', # DefaultSampler which supports both distributed and non-distributed training. Refer to https://mmengine.readthedocs.io/en/latest/api/generated/mmengine.dataset.DefaultSampler.html#mmengine.dataset.DefaultSampler + shuffle=True), # randomly shuffle the training data in each epoch + batch_sampler=dict(type='AspectRatioBatchSampler'), # Batch sampler for grouping images with similar aspect ratio into a same batch. It can reduce GPU memory cost. 
+ dataset=dict( # Train dataset config + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', # Path of annotation file + data_prefix=dict(img='train2017/'), # Prefix of image path + filter_cfg=dict(filter_empty_gt=True, min_size=32), # Config of filtering images and annotations + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( # Validation dataloader config + batch_size=1, # Batch size of a single GPU. If batch-size > 1, the extra padding area may influence the performance. + num_workers=2, # Worker to pre-fetch data for each single GPU + persistent_workers=True, # If ``True``, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed. + drop_last=False, # Whether to drop the last incomplete batch, if the dataset size is not divisible by the batch size + sampler=dict( + type='DefaultSampler', + shuffle=False), # not shuffle during validation and testing + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, # Turn on the test mode of the dataset to avoid filtering annotations or images + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader # Testing dataloader config +``` + +[Evaluators](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html) are used to compute the metrics of the trained model on the validation and testing datasets. The config of evaluators consists of one or a list of metric configs: + +```python +val_evaluator = dict( # Validation evaluator config + type='CocoMetric', # The coco metric used to evaluate AR, AP, and mAP for detection and instance segmentation + ann_file=data_root + 'annotations/instances_val2017.json', # Annotation file path + metric=['bbox', 'segm'], # Metrics to be evaluated, `bbox` for detection and `segm` for instance segmentation + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator # Testing evaluator config +``` + +Since the test dataset has no annotation files, the test_dataloader and test_evaluator config in MMDetection are generally equal to the val's. If you want to save the detection results on the test dataset, you can write the config like this: + +```python +# inference on test dataset and +# format the output results for submission. +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'annotations/image_info_test-dev2017.json', + data_prefix=dict(img='test2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/image_info_test-dev2017.json', + metric=['bbox', 'segm'], # Metrics to be evaluated + format_only=True, # Only format and save the results to coco json file + outfile_prefix='./work_dirs/coco_detection/test') # The prefix of output json files +``` + +### Training and testing config + +MMEngine's runner uses Loop to control the training, validation, and testing processes. +Users can set the maximum training epochs and validation intervals with these fields. + +```python +train_cfg = dict( + type='EpochBasedTrainLoop', # The training loop type. 
Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py + max_epochs=12, # Maximum training epochs + val_interval=1) # Validation intervals. Run validation every epoch. +val_cfg = dict(type='ValLoop') # The validation loop type +test_cfg = dict(type='TestLoop') # The testing loop type +``` + +### Optimization config + +`optim_wrapper` is the field to configure optimization-related settings. The optimizer wrapper not only provides the functions of the optimizer, but also supports functions such as gradient clipping, mixed precision training, etc. Find more in [optimizer wrapper tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html). + +```python +optim_wrapper = dict( # Optimizer wrapper config + type='OptimWrapper', # Optimizer wrapper type, switch to AmpOptimWrapper to enable mixed precision training. + optimizer=dict( # Optimizer config. Support all kinds of optimizers in PyTorch. Refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # Stochastic gradient descent optimizer + lr=0.02, # The base learning rate + momentum=0.9, # Stochastic gradient descent with momentum + weight_decay=0.0001), # Weight decay of SGD + clip_grad=None, # Gradient clip option. Set None to disable gradient clip. Find usage in https://mmengine.readthedocs.io/en/latest/tutorials/optimizer.html + ) +``` + +`param_scheduler` is a field that configures methods of adjusting optimization hyperparameters such as learning rate and momentum. Users can combine multiple schedulers to create a desired parameter adjustment strategy. Find more in [parameter scheduler tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/param_scheduler.html) and [parameter scheduler API documents](https://mmengine.readthedocs.io/en/latest/api/generated/mmengine.optim._ParamScheduler.html#mmengine.optim._ParamScheduler) + +```python +param_scheduler = [ + # Linear learning rate warm-up scheduler + dict( + type='LinearLR', # Use linear policy to warmup learning rate + start_factor=0.001, # The ratio of the starting learning rate used for warmup + by_epoch=False, # The warmup learning rate is updated by iteration + begin=0, # Start from the first iteration + end=500), # End the warmup at the 500th iteration + # The main LRScheduler + dict( + type='MultiStepLR', # Use multi-step learning rate policy during training + by_epoch=True, # The learning rate is updated by epoch + begin=0, # Start from the first epoch + end=12, # End at the 12th epoch + milestones=[8, 11], # Epochs to decay the learning rate + gamma=0.1) # The learning rate decay ratio +] +``` + +### Hook config + +Users can attach Hooks to training, validation, and testing loops to insert some operations during running. There are two different hook fields, one is `default_hooks` and the other is `custom_hooks`. + +`default_hooks` is a dict of hook configs, and they are the hooks must be required at the runtime. They have default priority which should not be modified. If not set, runner will use the default values. To disable a default hook, users can set its config to `None`. Find more in [HOOK](https://mmengine.readthedocs.io/en/latest/tutorials/hook.html). 
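+
+For instance, following the statement above, a derived config could switch off the detection visualization hook by setting its entry to `None` (a minimal sketch; the choice of hook to disable is arbitrary here and depends on your workflow):
+
+```python
+# Hypothetical override in a config that inherits the defaults shown below:
+# setting a default hook to None disables it, as described above.
+default_hooks = dict(visualization=None)
+```
+
+The full set of default hooks used by MMDetection is shown below.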
+
+```python
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),  # Update the time spent during iteration into message hub
+    logger=dict(type='LoggerHook', interval=50),  # Collect logs from different components of Runner and write them to terminal, JSON file, tensorboard, wandb, etc.
+    param_scheduler=dict(type='ParamSchedulerHook'),  # Update some hyper-parameters of the optimizer, such as the learning rate
+    checkpoint=dict(type='CheckpointHook', interval=1),  # Save checkpoints periodically
+    sampler_seed=dict(type='DistSamplerSeedHook'),  # Ensure distributed Sampler shuffle is active
+    visualization=dict(type='DetVisualizationHook'))  # Detection Visualization Hook. Used to visualize validation and testing process prediction results
+```
+
+`custom_hooks` is a list of all other hook configs. Users can develop their own hooks and insert them in this field.
+
+```python
+custom_hooks = []
+```
+
+### Runtime config
+
+```python
+default_scope = 'mmdet'  # The default registry scope to find modules. Refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html
+
+env_cfg = dict(
+    cudnn_benchmark=False,  # Whether to enable cudnn benchmark
+    mp_cfg=dict(  # Multi-processing config
+        mp_start_method='fork',  # Use fork to start multi-processing threads. 'fork' is usually faster than 'spawn' but may be unsafe. See discussion in https://github.com/pytorch/pytorch/issues/1355
+        opencv_num_threads=0),  # Disable opencv multi-threads to avoid the system being overloaded
+    dist_cfg=dict(backend='nccl'),  # Distribution configs
+)
+
+vis_backends = [dict(type='LocalVisBackend')]  # Visualization backends. Refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/visualization.html
+visualizer = dict(
+    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(
+    type='LogProcessor',  # Log processor to process runtime logs
+    window_size=50,  # Smooth interval of log values
+    by_epoch=True)  # Whether to format logs with epoch type. Should be consistent with the train loop's type.
+
+log_level = 'INFO'  # The level of logging.
+load_from = None  # Load model checkpoint as a pre-trained model from a given path. This will not resume training.
+resume = False  # Whether to resume from the checkpoint defined in `load_from`. If `load_from` is None, it will resume the latest checkpoint in the `work_dir`.
+```
+
+## Iter-based config
+
+In addition to the epoch-based training loop, MMEngine's Runner also provides an iter-based one.
+To use iter-based training, users should modify the `train_cfg`, `param_scheduler`, `train_dataloader`, `default_hooks`, and `log_processor`.
+Here is an example of changing an epoch-based RetinaNet config to an iter-based one: `configs/retinanet/retinanet_r50_fpn_90k_coco.py`.
+
+```python
+# Iter-based training config
+train_cfg = dict(
+    _delete_=True,  # Ignore the base config setting (optional)
+    type='IterBasedTrainLoop',  # Use iter-based training loop
+    max_iters=90000,  # Maximum iterations
+    val_interval=10000)  # Validation interval
+
+
+# Change the scheduler to iter-based
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=90000,
+        by_epoch=False,
+        milestones=[60000, 80000],
+        gamma=0.1)
+]
+
+# Switch to InfiniteSampler to avoid dataloader restart
+train_dataloader = dict(sampler=dict(type='InfiniteSampler'))
+
+# Change the checkpoint saving interval to iter-based
+default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000))
+
+# Change the log format to iter-based
+log_processor = dict(by_epoch=False)
+```
+
+## Config file inheritance
+
+There are 4 basic component types under `configs/_base_`: dataset, model, schedule, and default_runtime.
+Many methods, such as Faster R-CNN, Mask R-CNN, Cascade R-CNN, RPN, and SSD, can be easily constructed by combining one component of each type.
+The configs that are composed of components from `_base_` are called _primitive_ configs.
+
+For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should inherit from the _primitive_ config. In this way, the maximum inheritance level is 3.
+
+For ease of understanding, we recommend that contributors inherit from existing methods.
+For example, if some modification is made based on Faster R-CNN, users may first inherit the basic Faster R-CNN structure by specifying `_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'`, then modify the necessary fields in the config files.
+
+If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder `xxx_rcnn` under `configs`.
+
+Please refer to [mmengine config tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html) for detailed documentation.
+
+By setting the `_base_` field, we can specify which files the current configuration file inherits from.
+
+When `_base_` is a string of a file path, it means inheriting the contents from one config file.
+
+```python
+_base_ = './mask-rcnn_r50_fpn_1x_coco.py'
+```
+
+When `_base_` is a list of multiple file paths, it means inheriting from multiple files.
+
+```python
+_base_ = [
+    '../_base_/models/mask-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_instance.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+```
+
+If you wish to inspect the config file, you may run `python tools/misc/print_config.py /PATH/TO/CONFIG` to see the complete config.
+
+### Ignore some fields in the base configs
+
+Sometimes, you may set `_delete_=True` to ignore some of the fields in base configs.
+You may refer to [mmengine config tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html) for a simple illustration.
+
+In MMDetection, for example, suppose you want to change the backbone of Mask R-CNN, which is built with the following config.
+ +```python +model = dict( + type='MaskRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict(...), + rpn_head=dict(...), + roi_head=dict(...)) +``` + +`ResNet` and `HRNet` use different keywords to construct. + +```python +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict(...)) +``` + +The `_delete_=True` would replace all old keys in `backbone` field with new keys. + +### Use intermediate variables in configs + +Some intermediate variables are used in the configs files, like `train_pipeline`/`test_pipeline` in datasets. +It's worth noting that when modifying intermediate variables in the children configs, users need to pass the intermediate variables into corresponding fields again. +For example, we would like to use a multi-scale strategy to train a Mask R-CNN. `train_pipeline`/`test_pipeline` are intermediate variables we would like to modify. + +```python +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +``` + +We first define the new `train_pipeline`/`test_pipeline` and pass them into dataloader fields. + +Similarly, if we would like to switch from `SyncBN` to `BN` or `MMSyncBN`, we need to substitute every `norm_cfg` in the config. + +```python +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + backbone=dict(norm_cfg=norm_cfg), + neck=dict(norm_cfg=norm_cfg), + ...) +``` + +### Reuse variables in \_base\_ file + +If the users want to reuse the variables in the base file, they can get a copy of the corresponding variable by using `{{_base_.xxx}}`. E.g: + +```python +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' + +a = {{_base_.model}} # Variable `a` is equal to the `model` defined in `_base_` +``` + +## Modify config through script arguments + +When submitting jobs using `tools/train.py` or `tools/test.py`, you may specify `--cfg-options` to in-place modify the config. + +- Update config keys of dict chains. 
+
+  The config options can be specified following the order of the dict keys in the original config.
+  For example, `--cfg-options model.backbone.norm_eval=False` changes all the BN modules in the model backbone to `train` mode.
+
+- Update keys inside a list of configs.
+
+  Some config dicts are composed as a list in your config. For example, the training pipeline `train_dataloader.dataset.pipeline` is normally a list,
+  e.g. `[dict(type='LoadImageFromFile'), ...]`. If you want to change `'LoadImageFromFile'` to `'LoadImageFromNDArray'` in the pipeline,
+  you may specify `--cfg-options train_dataloader.dataset.pipeline.0.type=LoadImageFromNDArray`.
+
+- Update values of lists/tuples.
+
+  The value to be updated may be a list or a tuple. For example, the config file normally sets `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`. If you want to
+  change the mean values, you may specify `--cfg-options model.data_preprocessor.mean="[127,127,127]"`. Note that the quotation mark `"` is necessary to
+  support list/tuple data types, and **NO** white space is allowed inside the quotation marks in the specified value.
+
+## Config name style
+
+We follow the style below to name config files. Contributors are advised to follow the same style.
+
+```
+{algorithm name}_{model component names [component1]_[component2]_[...]}_{training settings}_{training dataset information}_{testing dataset information}.py
+```
+
+The file name is divided into five parts. All parts and components are connected with `_`, and words within each part or component are connected with `-`.
+
+- `{algorithm name}`: The name of the algorithm. It can be a detector name such as `faster-rcnn`, `mask-rcnn`, etc., or a semi-supervised or knowledge-distillation algorithm such as `soft-teacher`, `lad`, etc.
+- `{model component names}`: Names of the components used in the algorithm such as backbone, neck, etc. For example, `r50-caffe_fpn_gn-head` means using caffe-style ResNet50, FPN, and a detection head with Group Norm in the algorithm.
+- `{training settings}`: Information of training settings such as batch size, augmentations, loss trick, scheduler, and epochs/iterations. For example: `4xb4-mixup-giou-coslr-100e` means using 4 GPUs x 4 images per GPU, mixup augmentation, GIoU loss, cosine annealing learning rate, and training for 100 epochs.
+  Some abbreviations:
+  - `{gpu x batch_per_gpu}`: GPUs and samples per GPU. `bN` indicates N batch size per GPU. E.g. `4xb4` is the short term of 4-GPUs x 4-images-per-GPU. And `8xb2` is used by default if not mentioned.
+  - `{schedule}`: training schedule, options are `1x`, `2x`, `20e`, etc.
+    `1x` and `2x` mean 12 epochs and 24 epochs respectively.
+    `20e` is adopted in cascade models, which denotes 20 epochs.
+    For `1x`/`2x`, the initial learning rate decays by a factor of 10 at the 8th/16th and 11th/22nd epochs.
+    For `20e`, the initial learning rate decays by a factor of 10 at the 16th and 19th epochs.
+- `{training dataset information}`: Training dataset names like `coco`, `coco-panoptic`, `cityscapes`, `voc-0712`, `wider-face`.
+- `{testing dataset information}` (optional): Testing dataset name for models trained on one dataset but tested on another. If not mentioned, it means the model was trained and tested on the same dataset type.
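+
+As a closing illustration of the `--cfg-options` mechanism described earlier, the same dotted-key overrides can be applied programmatically when loading a config with `mmengine` (a minimal sketch; the config path and override keys below are only examples):
+
+```python
+from mmengine.config import Config
+
+# Load a config file (illustrative path) and apply --cfg-options-style overrides.
+cfg = Config.fromfile('configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py')
+cfg.merge_from_dict({
+    'model.backbone.norm_eval': False,  # same effect as --cfg-options model.backbone.norm_eval=False
+    'model.data_preprocessor.mean': [127, 127, 127],
+})
+
+# Inspect the merged result, similar to tools/misc/print_config.py.
+print(cfg.pretty_text)
+```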
diff --git a/mmdetection/docs/en/user_guides/dataset_prepare.md b/mmdetection/docs/en/user_guides/dataset_prepare.md new file mode 100644 index 0000000..1e0259a --- /dev/null +++ b/mmdetection/docs/en/user_guides/dataset_prepare.md @@ -0,0 +1,310 @@ +# Dataset Prepare + +### Basic Detection Dataset Preparation + +MMDetection supports multiple public datasets including COCO, Pascal VOC, CityScapes, and [more](../../../configs/_base_/datasets). + +Public datasets like [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/index.html) or mirror and [COCO](https://cocodataset.org/#download) are available from official websites or mirrors. Note: In the detection task, Pascal VOC 2012 is an extension of Pascal VOC 2007 without overlap, and we usually use them together. +It is recommended to download and extract the dataset somewhere outside the project directory and symlink the dataset root to `$MMDETECTION/data` as below. +If your folder structure is different, you may need to change the corresponding paths in config files. + +We provide a script to download datasets such as COCO, you can run `python tools/misc/download_dataset.py --dataset-name coco2017` to download COCO dataset. +For users in China, more datasets can be downloaded from the opensource dataset platform: [OpenDataLab](https://opendatalab.com/?source=OpenMMLab%20GitHub). + +For more usage please refer to [dataset-download](./useful_tools.md#dataset-download) + +```text +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ ├── cityscapes +│ │ ├── annotations +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +│ ├── VOCdevkit +│ │ ├── VOC2007 +│ │ ├── VOC2012 +``` + +Some models require additional [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) datasets, such as HTC, DetectoRS and SCNet, you can download, unzip, and then move them to the coco folder. The directory should be like this. + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── stuffthingmaps +``` + +Panoptic segmentation models like PanopticFPN require additional [COCO Panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) datasets, you can download, unzip, and then move them to the coco annotation folder. The directory should be like this. + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +The [cityscapes](https://www.cityscapes-dataset.com/) annotations need to be converted into the coco format using `tools/dataset_converters/cityscapes.py`: + +```shell +pip install cityscapesscripts + +python tools/dataset_converters/cityscapes.py \ + ./data/cityscapes \ + --nproc 8 \ + --out-dir ./data/cityscapes/annotations +``` + +### COCO Caption Dataset Preparation + +COCO Caption uses the COCO2014 dataset image and uses the annotation of karpathy. + +At first, you need to download the COCO2014 dataset. + +```shell +python tools/misc/download_dataset.py --dataset-name coco2014 --unzip +``` + +The dataset will be downloaded to `data/coco` under the current path. Then download the annotation of karpathy. 
+ +```shell +cd data/coco/annotations +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json +``` + +The final directory structure of the dataset folder that can be directly used for training and testing is as follows: + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── coco_karpathy_train.json +│ │ │ ├── coco_karpathy_test.json +│ │ │ ├── coco_karpathy_val.json +│ │ │ ├── coco_karpathy_val_gt.json +│ │ │ ├── coco_karpathy_test_gt.json +│ │ ├── train2014 +│ │ ├── val2014 +│ │ ├── test2014 +``` + +### COCO Semantic Dataset Preparation + +There are two types of annotations for COCO semantic segmentation, which differ mainly in the definition of category names, so there are two ways to handle them. The first is to directly use the stuffthingmaps dataset, and the second is to use the panoptic dataset. + +**(1) Use stuffthingmaps dataset** + +The download link for this dataset is [stuffthingmaps_trainval2017](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip). Please download and extract it to the `data/coco` folder. + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── stuffthingmaps +``` + +This dataset is different from the standard COCO category annotation in that it includes 172 classes: 80 "thing" classes, 91 "stuff" classes, and 1 "unlabeled" class. The description of each class can be found at https://github.com/nightrome/cocostuff/blob/master/labels.md. + +Although only 172 categories are annotated, the maximum label ID in `stuffthingmaps` is 182, and some categories in the middle are not annotated. In addition, the "unlabeled" category of class 0 is removed. Therefore, the relationship between the value at each position in the final `stuffthingmaps` image can be found at https://github.com/kazuto1011/deeplab-pytorch/blob/master/data/datasets/cocostuff/labels.txt. + +To train efficiently and conveniently for users, we need to remove 12 unannotated classes before starting training or evaluation. The names of these 12 classes are: `street sign, hat, shoe, eye glasses, plate, mirror, window, desk, door, blender, hair brush`. The category information that can be used for training and evaluation can be found in `mmdet/datasets/coco_semantic.py`. + +You can use `tools/dataset_converters/coco_stuff164k.py` to convert the downloaded `stuffthingmaps` to a dataset that can be directly used for training and evaluation. The directory structure of the converted dataset is as follows: + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── stuffthingmaps +│ │ ├── stuffthingmaps_semseg +``` + +`stuffthingmaps_semseg` is the newly generated COCO semantic segmentation dataset that can be directly used for training and testing. + +**(2) use panoptic dataset** + +The number of categories in the semantic segmentation dataset generated through panoptic annotation will be less than that generated using the `stuffthingmaps` dataset. 
First, you need to prepare the panoptic segmentation annotations, and then use the following script to complete the conversion. + +```shell +python tools/dataset_converters/prepare_coco_semantic_annos_from_panoptic_annos.py data/coco +``` + +The directory structure of the converted dataset is as follows: + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ │ ├── panoptic_semseg_train2017 +│ │ │ ├── panoptic_semseg_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +`panoptic_semseg_train2017` and `panoptic_semseg_val2017` are the newly generated COCO semantic segmentation datasets that can be directly used for training and testing. Note that their category information is the same as that of COCO panoptic segmentation, including both "thing" and "stuff" categories. + +### RefCOCO Dataset Preparation + +The images and annotations of [RefCOCO](https://github.com/lichengunc/refer) series datasets can be download by running `tools/misc/download_dataset.py`: + +```shell +python tools/misc/download_dataset.py --dataset-name refcoco --save-dir data/coco --unzip +``` + +Then the directory should be like this: + +```text +data +├── coco +│ ├── refcoco +│ │ ├── instances.json +│ │ ├── refs(google).p +│ │ └── refs(unc).p +│ ├── refcoco+ +│ │ ├── instances.json +│ │ └── refs(unc).p +│ ├── refcocog +│ │ ├── instances.json +│ │ ├── refs(google).p +│ │ └── refs(umd).p +│ │── train2014 +``` + +### ADE20K 2016 Dataset Preparation + +The images and annotations of [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/) dataset can be download by running `tools/misc/download_dataset.py`: + +```shell +python tools/misc/download_dataset.py --dataset-name ade20k_2016 --save-dir data --unzip +``` + +Then move the annotations to the `data/ADEChallengeData2016` directory and run the preprocess script to produce the coco format annotations: + +```shell +mv data/annotations_instance data/ADEChallengeData2016/ +mv data/categoryMapping.txt data/ADEChallengeData2016/ +mv data/imgCatIds.json data/ADEChallengeData2016/ +python tools/dataset_converters/ade20k2coco.py data/ADEChallengeData2016 --task panoptic +python tools/dataset_converters/ade20k2coco.py data/ADEChallengeData2016 --task instance +``` + +The directory should be like this. + +```text +data +├── ADEChallengeData2016 +│ ├── ade20k_instance_train.json +│ ├── ade20k_instance_val.json +│ ├── ade20k_panoptic_train +│ │ ├── ADE_train_00000001.png +│ │ ├── ADE_train_00000002.png +│ │ ├── ... +│ ├── ade20k_panoptic_train.json +│ ├── ade20k_panoptic_val +│ │ ├── ADE_val_00000001.png +│ │ ├── ADE_val_00000002.png +│ │ ├── ... +│ ├── ade20k_panoptic_val.json +│ ├── annotations +│ │ ├── training +│ │ │ ├── ADE_train_00000001.png +│ │ │ ├── ADE_train_00000002.png +│ │ │ ├── ... +│ │ ├── validation +│ │ │ ├── ADE_val_00000001.png +│ │ │ ├── ADE_val_00000002.png +│ │ │ ├── ... +│ ├── annotations_instance +│ │ ├── training +│ │ │ ├── ADE_train_00000001.png +│ │ │ ├── ADE_train_00000002.png +│ │ │ ├── ... +│ │ ├── validation +│ │ │ ├── ADE_val_00000001.png +│ │ │ ├── ADE_val_00000002.png +│ │ │ ├── ... +│ ├── categoryMapping.txt +│ ├── images +│ │ ├── training +│ │ │ ├── ADE_train_00000001.jpg +│ │ │ ├── ADE_train_00000002.jpg +│ │ │ ├── ... +│ │ ├── validation +│ │ │ ├── ADE_val_00000001.jpg +│ │ │ ├── ADE_val_00000002.jpg +│ │ │ ├── ... 
+│ ├── imgCatIds.json +│ ├── objectInfo150.txt +│ │── sceneCategories.txt +``` + +The above folders include all data of ADE20K's semantic segmentation, instance segmentation, and panoptic segmentation. + +### Download from OpenDataLab + +By using [OpenDataLab](https://opendatalab.com/), researchers can obtain free formatted datasets in various fields. Through the search function of the platform, researchers may address the dataset they look for quickly and easily. Using the formatted datasets from the platform, researchers can efficiently conduct tasks across datasets. + +Currently, MIM supports downloading VOC and COCO datasets from OpenDataLab with one command line. More datasets will be supported in the future. You can also directly download the datasets you need from the OpenDataLab platform and then convert them to the format required by MMDetection. + +If you use MIM to download, make sure that the version is greater than v0.3.8. You can use the following command to update: + +```Bash +pip install -U openmim +``` + +```Bash +# install OpenXLab CLI tools +pip install -U openxlab +# log in OpenXLab, registry +openxlab login + +# download voc2007 and preprocess by MIM +mim download mmdet --dataset voc2007 + +# download voc2012 and preprocess by MIM +mim download mmdet --dataset voc2012 + +# download coco2017 and preprocess by MIM +mim download mmdet --dataset coco2017 +``` diff --git a/mmdetection/docs/en/user_guides/deploy.md b/mmdetection/docs/en/user_guides/deploy.md new file mode 100644 index 0000000..db320d1 --- /dev/null +++ b/mmdetection/docs/en/user_guides/deploy.md @@ -0,0 +1,173 @@ +# Model Deployment + +The deployment of OpenMMLab codebases, including MMDetection, MMPretrain and so on are supported by [MMDeploy](https://github.com/open-mmlab/mmdeploy). +The latest deployment guide for MMDetection can be found from [here](https://mmdeploy.readthedocs.io/en/dev-1.x/04-supported-codebases/mmdet.html). + +This tutorial is organized as follows: + +- [Installation](#installation) +- [Convert model](#convert-model) +- [Model specification](#model-specification) +- [Model inference](#model-inference) + - [Backend model inference](#backend-model-inference) + - [SDK model inference](#sdk-model-inference) +- [Supported models](#supported-models) + +## Installation + +Please follow the [guide](https://mmdetection.readthedocs.io/en/latest/get_started.html) to install mmdet. And then install mmdeploy from source by following [this](https://mmdeploy.readthedocs.io/en/1.x/get_started.html#installation) guide. + +```{note} +If you install mmdeploy prebuilt package, please also clone its repository by 'git clone https://github.com/open-mmlab/mmdeploy.git --depth=1' to get the deployment config files. +``` + +## Convert model + +Suppose mmdetection and mmdeploy repositories are in the same directory, and the working directory is the root path of mmdetection. + +Take [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py) model as an example. 
You can download its checkpoint from [here](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth), and then convert it to onnx model as follows: + +```python +from mmdeploy.apis import torch2onnx +from mmdeploy.backend.sdk.export_info import export2SDK + +img = 'demo/demo.jpg' +work_dir = 'mmdeploy_models/mmdet/onnx' +save_file = 'end2end.onnx' +deploy_cfg = '../mmdeploy/configs/mmdet/detection/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model_checkpoint = 'faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth' +device = 'cpu' + +# 1. convert model to onnx +torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, + model_checkpoint, device) + +# 2. extract pipeline info for inference by MMDeploy SDK +export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, + device=device) +``` + +It is crucial to specify the correct deployment config during model conversion. MMDeploy has already provided builtin deployment config [files](https://github.com/open-mmlab/mmdeploy/tree/1.x/configs/mmdet) of all supported backends for mmdetection, under which the config file path follows the pattern: + +``` +{task}/{task}_{backend}-{precision}_{static | dynamic}_{shape}.py +``` + +- **{task}:** task in mmdetection. + + There are two of them. One is `detection` and the other is `instance-seg`, indicating instance segmentation. + + mmdet models like `RetinaNet`, `Faster R-CNN` and `DETR` and so on belongs to `detection` task. While `Mask R-CNN` is one of `instance-seg` models. + + **DO REMEMBER TO USE** `detection/detection_*.py` deployment config file when trying to convert detection models and use `instance-seg/instance-seg_*.py` to deploy instance segmentation models. + +- **{backend}:** inference backend, such as onnxruntime, tensorrt, pplnn, ncnn, openvino, coreml etc. + +- **{precision}:** fp16, int8. When it's empty, it means fp32 + +- **{static | dynamic}:** static shape or dynamic shape + +- **{shape}:** input shape or shape range of a model + +Therefore, in the above example, you can also convert `Faster R-CNN` to tensorrt-fp16 model by `detection_tensorrt-fp16_dynamic-320x320-1344x1344.py`. + +```{tip} +When converting mmdet models to tensorrt models, --device should be set to "cuda" +``` + +## Model specification + +Before moving on to model inference chapter, let's know more about the converted model structure which is very important for model inference. + +The converted model locates in the working directory like `mmdeploy_models/mmdet/onnx` in the previous example. It includes: + +``` +mmdeploy_models/mmdet/onnx +├── deploy.json +├── detail.json +├── end2end.onnx +└── pipeline.json +``` + +in which, + +- **end2end.onnx**: backend model which can be inferred by ONNX Runtime +- ***xxx*.json**: the necessary information for mmdeploy SDK + +The whole package **mmdeploy_models/mmdet/onnx** is defined as **mmdeploy SDK model**, i.e., **mmdeploy SDK model** includes both backend model and inference meta information. + +## Model inference + +### Backend model inference + +Take the previous converted `end2end.onnx` model as an example, you can use the following code to inference the model and visualize the results. 
+ +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = '../mmdeploy/configs/mmdet/detection/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +device = 'cpu' +backend_model = ['mmdeploy_models/mmdet/onnx/end2end.onnx'] +image = 'demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='output_detection.png') +``` + +### SDK model inference + +You can also perform SDK model inference like following, + +```python +from mmdeploy_python import Detector +import cv2 + +img = cv2.imread('demo/demo.jpg') + +# create a detector +detector = Detector(model_path='mmdeploy_models/mmdet/onnx', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('output_detection.png', img) +``` + +Besides python API, mmdeploy SDK also provides other FFI (Foreign Function Interface), such as C, C++, C#, Java and so on. You can learn their usage from [demos](https://github.com/open-mmlab/mmdeploy/tree/1.x/demo). + +## Supported models + +Please refer to [here](https://mmdeploy.readthedocs.io/en/1.x/04-supported-codebases/mmdet.html#supported-models) for the supported model list. diff --git a/mmdetection/docs/en/user_guides/finetune.md b/mmdetection/docs/en/user_guides/finetune.md new file mode 100644 index 0000000..e181eba --- /dev/null +++ b/mmdetection/docs/en/user_guides/finetune.md @@ -0,0 +1,96 @@ +# Finetuning Models + +Detectors pre-trained on the COCO dataset can serve as a good pre-trained model for other datasets, e.g., CityScapes and KITTI Dataset. +This tutorial provides instructions for users to use the models provided in the [Model Zoo](../model_zoo.md) for other datasets to obtain better performance. + +There are two steps to finetune a model on a new dataset. + +- Add support for the new dataset following [Customize Datasets](../advanced_guides/customize_dataset.md). +- Modify the configs as will be discussed in this tutorial. + +Take the finetuning process on Cityscapes Dataset as an example, the users need to modify five parts in the config. + +## Inherit base configs + +To release the burden and reduce bugs in writing the whole configs, MMDetection V3.0 support inheriting configs from multiple existing configs. To finetune a Mask RCNN model, the new config needs to inherit +`_base_/models/mask-rcnn_r50_fpn.py` to build the basic structure of the model. To use the Cityscapes Dataset, the new config can also simply inherit `_base_/datasets/cityscapes_instance.py`. 
For runtime settings such as logger settings, the new config needs to inherit `_base_/default_runtime.py`. For training schedules, the new config can to inherit `_base_/schedules/schedule_1x.py`. These configs are in the `configs` directory and the users can also choose to write the whole contents rather than use inheritance. + +```python +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_1x.py' +] +``` + +## Modify head + +Then the new config needs to modify the head according to the class numbers of the new datasets. By only changing `num_classes` in the roi_head, the weights of the pre-trained models are mostly reused except for the final prediction head. + +```python +model = dict( + roi_head=dict( + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=8, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) +``` + +## Modify dataset + +The users may also need to prepare the dataset and write the configs about dataset, refer to [Customize Datasets](../advanced_guides/customize_dataset.md) for more detail. MMDetection V3.0 already supports VOC, WIDERFACE, COCO, LIVS, OpenImages, DeepFashion, Objects365, and Cityscapes Dataset. + +## Modify training schedule + +The finetuning hyperparameters vary from the default schedule. It usually requires a smaller learning rate and fewer training epochs + +```python +# optimizer +# lr is set for a batch size of 8 +optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + milestones=[7], + gamma=0.1) +] + +# max_epochs +train_cfg = dict(max_epochs=8) + +# log config +default_hooks = dict(logger=dict(interval=100)), +``` + +## Use pre-trained model + +To use the pre-trained model, the new config adds the link of pre-trained models in the `load_from`. The users might need to download the model weights before training to avoid the download time during training. + +```python +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa +``` diff --git a/mmdetection/docs/en/user_guides/index.rst b/mmdetection/docs/en/user_guides/index.rst new file mode 100644 index 0000000..e74fc5f --- /dev/null +++ b/mmdetection/docs/en/user_guides/index.rst @@ -0,0 +1,41 @@ +Train & Test +************** + +MMDetection provides hundreds of pretrained detection models in `Model Zoo `_, +and supports multiple standard datasets, including Pascal VOC, COCO, CityScapes, LVIS, etc. This note will show how to perform common tasks on these existing models and standard datasets: + + +.. 
toctree:: + :maxdepth: 1 + + config.md + inference.md + dataset_prepare.md + test.md + train.md + new_model.md + finetune.md + test_results_submission.md + init_cfg.md + single_stage_as_rpn.md + semi_det.md + + +Useful Tools +************ + +.. toctree:: + :maxdepth: 1 + + useful_tools.md + useful_hooks.md + visualization.md + robustness_benchmarking.md + deploy.md + label_studio.md + tracking_analysis_tools.md + tracking_config.md + tracking_dataset_prepare.md + tracking_inference.md + tracking_train_test.md + tracking_visualization.md diff --git a/mmdetection/docs/en/user_guides/inference.md b/mmdetection/docs/en/user_guides/inference.md new file mode 100644 index 0000000..49186d2 --- /dev/null +++ b/mmdetection/docs/en/user_guides/inference.md @@ -0,0 +1,440 @@ +# Inference with existing models + +MMDetection provides hundreds of pre-trained detection models in [Model Zoo](https://mmdetection.readthedocs.io/en/latest/model_zoo.html). +This note will show how to inference, which means using trained models to detect objects on images. + +In MMDetection, a model is defined by a [configuration file](https://mmdetection.readthedocs.io/en/latest/user_guides/config.html) and existing model parameters are saved in a checkpoint file. + +To start with, we recommend [RTMDet](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet) with this [configuration file](https://github.com/open-mmlab/mmdetection/blob/main/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py) and this [checkpoint file](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_l_8xb32-300e_coco/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth). It is recommended to download the checkpoint file to `checkpoints` directory. + +## High-level APIs for inference - `Inferencer` + +In OpenMMLab, all the inference operations are unified into a new interface - Inferencer. Inferencer is designed to expose a neat and simple API to users, and shares very similar interface across different OpenMMLab libraries. +A notebook demo can be found in [demo/inference_demo.ipynb](https://github.com/open-mmlab/mmdetection/blob/main/demo/inference_demo.ipynb). + +### Basic Usage + +You can get inference results for an image with only 3 lines of code. + +```python +from mmdet.apis import DetInferencer + +# Initialize the DetInferencer +inferencer = DetInferencer('rtmdet_tiny_8xb32-300e_coco') + +# Perform inference +inferencer('demo/demo.jpg', show=True) +``` + +The resulting output will be displayed in a new window:. + +
    + +
    + +```{note} +If you are running MMDetection on a server without GUI or via SSH tunnel with X11 forwarding disabled, the `show` option will not work. However, you can still save visualizations to files by setting `out_dir` arguments. Read [Dumping Results](#dumping-results) for details. +``` + +### Initialization + +Each Inferencer must be initialized with a model. You can also choose the inference device during initialization. + +#### Model Initialization + +- To infer with MMDetection's pre-trained model, passing its name to the argument `model` can work. The weights will be automatically downloaded and loaded from OpenMMLab's model zoo. + + ```python + inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco') + ``` + + There is a very easy to list all model names in MMDetection. + + ```python + # models is a list of model names, and them will print automatically + models = DetInferencer.list_models('mmdet') + ``` + + You can load another weight by passing its path/url to `weights`. + + ```python + inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco', weights='path/to/rtmdet.pth') + ``` + +- To load custom config and weight, you can pass the path to the config file to `model` and the path to the weight to `weights`. + + ```python + inferencer = DetInferencer(model='path/to/rtmdet_config.py', weights='path/to/rtmdet.pth') + ``` + +- By default, [MMEngine](https://github.com/open-mmlab/mmengine/) dumps config to the weight. If you have a weight trained on MMEngine, you can also pass the path to the weight file to `weights` without specifying `model`: + + ```python + # It will raise an error if the config file cannot be found in the weight. Currently, within the MMDetection model repository, only the weights of ddq-detr-4scale_r50 can be loaded in this manner. + inferencer = DetInferencer(weights='https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq-detr-4scale_r50_8xb2-12e_coco/ddq-detr-4scale_r50_8xb2-12e_coco_20230809_170711-42528127.pth') + ``` + +- Passing config file to `model` without specifying `weight` will result in a randomly initialized model. + +### Device + +Each Inferencer instance is bound to a device. +By default, the best device is automatically decided by [MMEngine](https://github.com/open-mmlab/mmengine/). You can also alter the device by specifying the `device` argument. For example, you can use the following code to create an Inferencer on GPU 1. + +```python +inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco', device='cuda:1') +``` + +To create an Inferencer on CPU: + +```python +inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco', device='cpu') +``` + +Refer to [torch.device](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device) for all the supported forms. + +### Inference + +Once the Inferencer is initialized, you can directly pass in the raw data to be inferred and get the inference results from return values. + +#### Input + +Input can be either of these types: + +- str: Path/URL to the image. + + ```python + inferencer('demo/demo.jpg') + ``` + +- array: Image in numpy array. It should be in BGR order. + + ```python + import mmcv + array = mmcv.imread('demo/demo.jpg') + inferencer(array) + ``` + +- list: A list of basic types above. Each element in the list will be processed separately. + + ```python + inferencer(['img_1.jpg', 'img_2.jpg]) + # You can even mix the types + inferencer(['img_1.jpg', array]) + ``` + +- str: Path to the directory. All images in the directory will be processed. 
+ + ```python + inferencer('path/to/your_imgs/') + ``` + +### Output + +By default, each `Inferencer` returns the prediction results in a dictionary format. + +- `visualization` contains the visualized predictions. + +- `predictions` contains the predictions results in a json-serializable format. But it's an empty list by default unless `return_vis=True`. + +```python +{ + 'predictions' : [ + # Each instance corresponds to an input image + { + 'labels': [...], # int list of length (N, ) + 'scores': [...], # float list of length (N, ) + 'bboxes': [...], # 2d list of shape (N, 4), format: [min_x, min_y, max_x, max_y] + }, + ... + ], + 'visualization' : [ + array(..., dtype=uint8), + ] + } +``` + +If you wish to get the raw outputs from the model, you can set `return_datasamples` to `True` to get the original [DataSample](advanced_guides/structures.md), which will be stored in `predictions`. + +#### Dumping Results + +Apart from obtaining predictions from the return value, you can also export the predictions/visualizations to files by setting `out_dir` and `no_save_pred`/`no_save_vis` arguments. + +```python +inferencer('demo/demo.jpg', out_dir='outputs/', no_save_pred=False) +``` + +Results in the directory structure like: + +```text +outputs +├── preds +│ └── demo.json +└── vis + └── demo.jpg +``` + +The filename of each file is the same as the corresponding input image filename. If the input image is an array, the filename will be a number starting from 0. + +#### Batch Inference + +You can customize the batch size by setting `batch_size`. The default batch size is 1. + +### API + +Here are extensive lists of parameters that you can use. + +- **DetInferencer.\_\_init\_\_():** + +| Arguments | Type | Type | Description | +| --------------- | ------------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | str, optional | None | Path to the config file or the model name defined in metafile. For example, it could be 'rtmdet-s' or 'rtmdet_s_8xb32-300e_coco' or 'configs/rtmdet/rtmdet_s_8xb32-300e_coco.py'. If the model is not specified, the user must provide the `weights` saved by MMEngine which contains the config string. | +| `weights` | str, optional | None | Path to the checkpoint. If it is not specified and `model` is a model name of metafile, the weights will be loaded from metafile. | +| `device` | str, optional | None | Device used for inference, accepting all allowed strings by `torch.device`. E.g., 'cuda:0' or 'cpu'. If None, the available device will be automatically used. | +| `scope` | str, optional | 'mmdet' | The scope of the model. | +| `palette` | str | 'none' | Color palette used for visualization. The order of priority is palette -> config -> checkpoint. | +| `show_progress` | bool | True | Control whether to display the progress bar during the inference process. 
| + +- **DetInferencer.\_\_call\_\_()** + +| Arguments | Type | Default | Description | +| -------------------- | ------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `inputs` | str/list/tuple/np.array | **required** | It can be a path to an image/a folder, an np array or a list/tuple (with img paths or np arrays) | +| `batch_size` | int | 1 | Inference batch size. | +| `print_result` | bool | False | Whether to print the inference result to the console. | +| `show` | bool | False | Whether to display the visualization results in a popup window. | +| `wait_time` | float | 0 | The interval of show(s). | +| `no_save_vis` | bool | False | Whether to force not to save prediction vis results. | +| `draw_pred` | bool | True | Whether to draw predicted bounding boxes. | +| `pred_score_thr` | float | 0.3 | Minimum score of bboxes to draw. | +| `return_datasamples` | bool | False | Whether to return results as DataSamples. If False, the results will be packed into a dict. | +| `print_result` | bool | False | Whether to print the inference result to the console. | +| `no_save_pred` | bool | True | Whether to force not to save prediction results. | +| `out_dir` | str | '' | Output directory of results. | +| `texts` | str/list\[str\], optional | None | Text prompts. | +| `stuff_texts` | str/list\[str\], optional | None | Stuff text prompts of open panoptic task. | +| `custom_entities` | bool | False | Whether to use custom entities. Only used in GLIP. | +| \*\*kwargs | | | Other keyword arguments passed to :meth:`preprocess`, :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. Each key in kwargs should be in the corresponding set of `preprocess_kwargs`, `forward_kwargs`, `visualize_kwargs` and `postprocess_kwargs`. | + +## Demos + +We also provide four demo scripts, implemented with high-level APIs and supporting functionality codes. +Source codes are available [here](https://github.com/open-mmlab/mmdetection/blob/main/demo). + +### Image demo + +This script performs inference on a single image. + +```shell +python demo/image_demo.py \ + ${IMAGE_FILE} \ + ${CONFIG_FILE} \ + [--weights ${WEIGHTS}] \ + [--device ${GPU_ID}] \ + [--pred-score-thr ${SCORE_THR}] +``` + +Examples: + +```shell +python demo/image_demo.py demo/demo.jpg \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + --weights checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --device cpu +``` + +### Webcam demo + +This is a live demo from a webcam. + +```shell +python demo/webcam_demo.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--device ${GPU_ID}] \ + [--camera-id ${CAMERA-ID}] \ + [--score-thr ${SCORE_THR}] +``` + +Examples: + +```shell +python demo/webcam_demo.py \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth +``` + +### Video demo + +This script performs inference on a video. 
+ +```shell +python demo/video_demo.py \ + ${VIDEO_FILE} \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--device ${GPU_ID}] \ + [--score-thr ${SCORE_THR}] \ + [--out ${OUT_FILE}] \ + [--show] \ + [--wait-time ${WAIT_TIME}] +``` + +Examples: + +```shell +python demo/video_demo.py demo/demo.mp4 \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --out result.mp4 +``` + +#### Video demo with GPU acceleration + +This script performs inference on a video with GPU acceleration. + +```shell +python demo/video_gpuaccel_demo.py \ + ${VIDEO_FILE} \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--device ${GPU_ID}] \ + [--score-thr ${SCORE_THR}] \ + [--nvdecode] \ + [--out ${OUT_FILE}] \ + [--show] \ + [--wait-time ${WAIT_TIME}] +``` + +Examples: + +```shell +python demo/video_gpuaccel_demo.py demo/demo.mp4 \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --nvdecode --out result.mp4 +``` + +### Large-image inference demo + +This is a script for slicing inference on large images. + +``` +python demo/large_image_demo.py \ + ${IMG_PATH} \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + --device ${GPU_ID} \ + --show \ + --tta \ + --score-thr ${SCORE_THR} \ + --patch-size ${PATCH_SIZE} \ + --patch-overlap-ratio ${PATCH_OVERLAP_RATIO} \ + --merge-iou-thr ${MERGE_IOU_THR} \ + --merge-nms-type ${MERGE_NMS_TYPE} \ + --batch-size ${BATCH_SIZE} \ + --debug \ + --save-patch +``` + +Examples: + +```shell +# inferecnce without tta +wget -P checkpoint https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth + +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py \ + checkpoint/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth + +# inference with tta +wget -P checkpoint https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth + +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/retinanet/retinanet_r50_fpn_1x_coco.py \ + checkpoint/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth --tta + +``` + +## Multi-modal algorithm inference demo and evaluation + +As multimodal vision algorithms continue to evolve, MMDetection has also supported such algorithms. This section demonstrates how to use the demo and eval scripts corresponding to multimodal algorithms using the GLIP algorithm and model as the example. Moreover, MMDetection integrated a [gradio_demo project](../../../projects/gradio_demo/), which allows developers to quickly play with all image input tasks in MMDetection on their local devices. Check the [document](../../../projects/gradio_demo/README.md) for more details. + +### Preparation + +Please first make sure that you have the correct dependencies installed: + +```shell +# if source +pip install -r requirements/multimodal.txt + +# if wheel +mim install mmdet[multimodal] +``` + +MMDetection has already implemented GLIP algorithms and provided the weights, you can download directly from urls: + +```shell +cd mmdetection +wget https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_a_mmdet-b3654169.pth +``` + +### Inference + +Once the model is successfully downloaded, you can use the `demo/image_demo.py` script to run the inference. 
+ +```shell +python demo/image_demo.py demo/demo.jpg glip_tiny_a_mmdet-b3654169.pth --texts bench +``` + +Demo result will be similar to this: + +
    + +
    + +If users would like to detect multiple targets, please declare them in the format of `xx. xx` after the `--texts`. + +```shell +python demo/image_demo.py demo/demo.jpg glip_tiny_a_mmdet-b3654169.pth --texts 'bench. car' +``` + +And the result will be like this one: + +
    + +
    + +You can also use a sentence as the input prompt for the `--texts` field, for example: + +```shell +python demo/image_demo.py demo/demo.jpg glip_tiny_a_mmdet-b3654169.pth --texts 'There are a lot of cars here.' +``` + +The result will be similar to this: + +
    + +
    + +### Evaluation + +The GLIP implementation in MMDetection does not have any performance degradation, our benchmark is as follows: + +| Model | official mAP | mmdet mAP | +| ----------------------- | :----------: | :-------: | +| glip_A_Swin_T_O365.yaml | 42.9 | 43.0 | +| glip_Swin_T_O365.yaml | 44.9 | 44.9 | +| glip_Swin_L.yaml | 51.4 | 51.3 | + +Users can use the test script we provided to run evaluation as well. Here is a basic example: + +```shell +# 1 gpu +python tools/test.py configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365.py glip_tiny_a_mmdet-b3654169.pth + +# 8 GPU +./tools/dist_test.sh configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365.py glip_tiny_a_mmdet-b3654169.pth 8 +``` diff --git a/mmdetection/docs/en/user_guides/init_cfg.md b/mmdetection/docs/en/user_guides/init_cfg.md new file mode 100644 index 0000000..312b67a --- /dev/null +++ b/mmdetection/docs/en/user_guides/init_cfg.md @@ -0,0 +1,161 @@ +# Weight initialization + +During training, a proper initialization strategy is beneficial to speeding up the training or obtaining a higher performance. [MMCV](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/weight_init.py) provide some commonly used methods for initializing modules like `nn.Conv2d`. Model initialization in MMdetection mainly uses `init_cfg`. Users can initialize models with following two steps: + +1. Define `init_cfg` for a model or its components in `model_cfg`, but `init_cfg` of children components have higher priority and will override `init_cfg` of parents modules. +2. Build model as usual, but call `model.init_weights()` method explicitly, and model parameters will be initialized as configuration. + +The high-level workflow of initialization in MMdetection is : + +model_cfg(init_cfg) -> build_from_cfg -> model -> init_weight() -> initialize(self, self.init_cfg) -> children's init_weight() + +### Description + +It is dict or list\[dict\], and contains the following keys and values: + +- `type` (str), containing the initializer name in `INTIALIZERS`, and followed by arguments of the initializer. +- `layer` (str or list\[str\]), containing the names of basic layers in Pytorch or MMCV with learnable parameters that will be initialized, e.g. `'Conv2d'`,`'DeformConv2d'`. +- `override` (dict or list\[dict\]), containing the sub-modules that not inherit from BaseModule and whose initialization configuration is different from other layers' which are in `'layer'` key. Initializer defined in `type` will work for all layers defined in `layer`, so if sub-modules are not derived Classes of `BaseModule` but can be initialized as same ways of layers in `layer`, it does not need to use `override`. `override` contains: + - `type` followed by arguments of initializer; + - `name` to indicate sub-module which will be initialized. + +### Initialize parameters + +Inherit a new model from `mmcv.runner.BaseModule` or `mmdet.models` Here we show an example of FooModel. + +```python +import torch.nn as nn +from mmcv.runner import BaseModule + +class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=None): + super(FooModel, self).__init__(init_cfg) + ... +``` + +- Initialize model by using `init_cfg` directly in code + + ```python + import torch.nn as nn + from mmcv.runner import BaseModule + # or directly inherit mmdet models + + class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=XXX): + super(FooModel, self).__init__(init_cfg) + ... 
+ ``` + +- Initialize model by using `init_cfg` directly in `mmcv.Sequential` or `mmcv.ModuleList` code + + ```python + from mmcv.runner import BaseModule, ModuleList + + class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=None): + super(FooModel, self).__init__(init_cfg) + ... + self.conv1 = ModuleList(init_cfg=XXX) + ``` + +- Initialize model by using `init_cfg` in config file + + ```python + model = dict( + ... + model = dict( + type='FooModel', + arg1=XXX, + arg2=XXX, + init_cfg=XXX), + ... + ``` + +### Usage of init_cfg + +1. Initialize model by `layer` key + + If we only define `layer`, it just initialize the layer in `layer` key. + + NOTE: Value of `layer` key is the class name with attributes weights and bias of Pytorch, (so such as `MultiheadAttention layer` is not supported). + +- Define `layer` key for initializing module with same configuration. + + ```python + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # initialize whole module with same configuration + ``` + +- Define `layer` key for initializing layer with different configurations. + +```python +init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] +# nn.Conv1d will be initialized with dict(type='Constant', val=1) +# nn.Conv2d will be initialized with dict(type='Constant', val=2) +# nn.Linear will be initialized with dict(type='Constant', val=3) +``` + +2. Initialize model by `override` key + +- When initializing some specific part with its attribute name, we can use `override` key, and the value in `override` will ignore the value in init_cfg. + + ```python + # layers: + # self.feat = nn.Conv1d(3, 1, 3) + # self.reg = nn.Conv2d(3, 3, 3) + # self.cls = nn.Linear(1,2) + + init_cfg = dict(type='Constant', + layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # self.feat and self.cls will be initialized with dict(type='Constant', val=1, bias=2) + # The module called 'reg' will be initialized with dict(type='Constant', val=3, bias=4) + ``` + +- If `layer` is None in init_cfg, only sub-module with the name in override will be initialized, and type and other args in override can be omitted. + + ```python + # layers: + # self.feat = nn.Conv1d(3, 1, 3) + # self.reg = nn.Conv2d(3, 3, 3) + # self.cls = nn.Linear(1,2) + + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + + # self.feat and self.cls will be initialized by Pytorch + # The module called 'reg' will be initialized with dict(type='Constant', val=1, bias=2) + ``` + +- If we don't define `layer` key or `override` key, it will not initialize anything. + +- Invalid usage + + ```python + # It is invalid that override don't have name key + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # It is also invalid that override has name and other args except type + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. 
Initialize model with the pretrained model + + ```python + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + ``` + +More details can refer to the documentation in [MMEngine](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html) diff --git a/mmdetection/docs/en/user_guides/label_studio.md b/mmdetection/docs/en/user_guides/label_studio.md new file mode 100644 index 0000000..d4b3744 --- /dev/null +++ b/mmdetection/docs/en/user_guides/label_studio.md @@ -0,0 +1,256 @@ +# Semi-automatic Object Detection Annotation with MMDetection and Label-Studio + +Annotation data is a time-consuming and laborious task. This article introduces how to perform semi-automatic annotation using the RTMDet algorithm in MMDetection in conjunction with Label-Studio software. Specifically, using RTMDet to predict image annotations and then refining the annotations with Label-Studio. Community users can refer to this process and methodology and apply it to other fields. + +- RTMDet: RTMDet is a high-precision single-stage object detection algorithm developed by OpenMMLab, open-sourced in the MMDetection object detection toolbox. Its open-source license is Apache 2.0, and it can be used freely without restrictions by industrial users. + +- [Label Studio](https://github.com/heartexlabs/label-studio) is an excellent annotation software covering the functionality of dataset annotation in areas such as image classification, object detection, and segmentation. + +In this article, we will use [cat](https://download.openmmlab.com/mmyolo/data/cat_dataset.zip) images for semi-automatic annotation. + +## Environment Configuration + +To begin with, you need to create a virtual environment and then install PyTorch and MMCV. In this article, we will specify the versions of PyTorch and MMCV. Next, you can install MMDetection, Label-Studio, and label-studio-ml-backend using the following steps: + +Create a virtual environment: + +```shell +conda create -n rtmdet python=3.9 -y +conda activate rtmdet +``` + +Install PyTorch: + +```shell +# Linux and Windows CPU only +pip install torch==1.10.1+cpu torchvision==0.11.2+cpu torchaudio==0.10.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html +# Linux and Windows CUDA 11.3 +pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu113/torch_stable.html +# OSX +pip install torch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 +``` + +Install MMCV: + +```shell +pip install -U openmim +mim install "mmcv>=2.0.0" +# Installing mmcv will automatically install mmengine +``` + +Install MMDetection: + +```shell +git clone https://github.com/open-mmlab/mmdetection +cd mmdetection +pip install -v -e . 
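+
+# (Optional) a quick sanity check that the editable install succeeded;
+# mmdet exposes __version__, so this should print the installed version.
+python -c "import mmdet; print(mmdet.__version__)"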
+``` + +Install Label-Studio and label-studio-ml-backend: + +```shell +# Installing Label-Studio may take some time, if the version is not found, please use the official source +pip install label-studio==1.7.2 +pip install label-studio-ml==1.0.9 +``` + +Download the rtmdet weights: + +```shell +cd path/to/mmetection +mkdir work_dirs +cd work_dirs +wget https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_m_8xb32-300e_coco/rtmdet_m_8xb32-300e_coco_20220719_112220-229f527c.pth +``` + +## Start the Service + +Start the RTMDet backend inference service: + +```shell +cd path/to/mmetection + +label-studio-ml start projects/LabelStudio/backend_template --with \ +config_file=configs/rtmdet/rtmdet_m_8xb32-300e_coco.py \ +checkpoint_file=./work_dirs/rtmdet_m_8xb32-300e_coco_20220719_112220-229f527c.pth \ +device=cpu \ +--port 8003 +# Set device=cpu to use CPU inference, and replace cpu with cuda:0 to use GPU inference. +``` + +![](https://cdn.vansin.top/picgo20230330131601.png) + +The RTMDet backend inference service has now been started. To configure it in the Label-Studio web system, use http://localhost:8003 as the backend inference service. + +Now, start the Label-Studio web service: + +```shell +label-studio start +``` + +![](https://cdn.vansin.top/picgo20230330132913.png) + +Open your web browser and go to http://localhost:8080/ to see the Label-Studio interface. + +![](https://cdn.vansin.top/picgo20230330133118.png) + +Register a user and then create an RTMDet-Semiautomatic-Label project. + +![](https://cdn.vansin.top/picgo20230330133333.png) + +Download the example cat images by running the following command and import them using the Data Import button: + +```shell +cd path/to/mmetection +mkdir data && cd data + +wget https://download.openmmlab.com/mmyolo/data/cat_dataset.zip && unzip cat_dataset.zip +``` + +![](https://cdn.vansin.top/picgo20230330133628.png) + +![](https://cdn.vansin.top/picgo20230330133715.png) + +Then, select the Object Detection With Bounding Boxes template. + +![](https://cdn.vansin.top/picgo20230330133807.png) + +```shell +airplane +apple +backpack +banana +baseball_bat +baseball_glove +bear +bed +bench +bicycle +bird +boat +book +bottle +bowl +broccoli +bus +cake +car +carrot +cat +cell_phone +chair +clock +couch +cow +cup +dining_table +dog +donut +elephant +fire_hydrant +fork +frisbee +giraffe +hair_drier +handbag +horse +hot_dog +keyboard +kite +knife +laptop +microwave +motorcycle +mouse +orange +oven +parking_meter +person +pizza +potted_plant +refrigerator +remote +sandwich +scissors +sheep +sink +skateboard +skis +snowboard +spoon +sports_ball +stop_sign +suitcase +surfboard +teddy_bear +tennis_racket +tie +toaster +toilet +toothbrush +traffic_light +train +truck +tv +umbrella +vase +wine_glass +zebra +``` + +Then, copy and add the above categories to Label-Studio and click Save. + +![](https://cdn.vansin.top/picgo20230330134027.png) + +In the Settings, click Add Model to add the RTMDet backend inference service. + +![](https://cdn.vansin.top/picgo20230330134320.png) + +Click Validate and Save, and then click Start Labeling. + +![](https://cdn.vansin.top/picgo20230330134424.png) + +If you see Connected as shown below, the backend inference service has been successfully added. + +![](https://cdn.vansin.top/picgo20230330134554.png) + +## Start Semi-Automatic Labeling + +Click on Label to start labeling. 
+ +![](https://cdn.vansin.top/picgo20230330134804.png) + +We can see that the RTMDet backend inference service has successfully returned the predicted results and displayed them on the image. However, we noticed that the predicted bounding boxes for the cats are a bit too large and not very accurate. + +![](https://cdn.vansin.top/picgo20230403104419.png) + +We manually adjust the position of the cat bounding box, and then click Submit to complete the annotation of this image. + +![](https://cdn.vansin.top/picgo/20230403105923.png) + +After submitting all images, click export to export the labeled dataset in COCO format. + +![](https://cdn.vansin.top/picgo20230330135921.png) + +Use VS Code to open the unzipped folder to see the labeled dataset, which includes the images and the annotation files in JSON format. + +![](https://cdn.vansin.top/picgo20230330140321.png) + +At this point, the semi-automatic labeling is complete. We can use this dataset to train a more accurate model in MMDetection and then continue semi-automatic labeling on newly collected images with this model. This way, we can iteratively expand the high-quality dataset and improve the accuracy of the model. + +## Use MMYOLO as the Backend Inference Service + +If you want to use Label-Studio in MMYOLO, you can refer to replacing the config_file and checkpoint_file with the configuration file and weight file of MMYOLO when starting the backend inference service. + +```shell +cd path/to/mmetection + +label-studio-ml start projects/LabelStudio/backend_template --with \ +config_file= path/to/mmyolo_config.py \ +checkpoint_file= path/to/mmyolo_weights.pth \ +device=cpu \ +--port 8003 +# device=cpu is for using CPU inference. If using GPU inference, replace cpu with cuda:0. +``` + +Rotation object detection and instance segmentation are still under development, please stay tuned. diff --git a/mmdetection/docs/en/user_guides/new_model.md b/mmdetection/docs/en/user_guides/new_model.md new file mode 100644 index 0000000..c7af855 --- /dev/null +++ b/mmdetection/docs/en/user_guides/new_model.md @@ -0,0 +1,290 @@ +# Train with customized models and standard datasets + +In this note, you will know how to train, test and inference your own customized models under standard datasets. We use the cityscapes dataset to train a customized Cascade Mask R-CNN R50 model as an example to demonstrate the whole process, which using [`AugFPN`](https://github.com/Gus-Guo/AugFPN) to replace the default `FPN` as neck, and add `Rotate` or `TranslateX` as training-time auto augmentation. + +The basic steps are as below: + +1. Prepare the standard dataset +2. Prepare your own customized model +3. Prepare a config +4. Train, test, and inference models on the standard dataset. + +## Prepare the standard dataset + +In this note, as we use the standard cityscapes dataset as an example. + +It is recommended to symlink the dataset root to `$MMDETECTION/data`. +If your folder structure is different, you may need to change the corresponding paths in config files. 
+ +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ ├── cityscapes +│ │ ├── annotations +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +│ ├── VOCdevkit +│ │ ├── VOC2007 +│ │ ├── VOC2012 + +``` + +Or you can set your dataset root through + +```bash +export MMDET_DATASETS=$data_root +``` + +We will replace dataset root with `$MMDET_DATASETS`, so you don't have to modify the corresponding path in config files. + +The cityscapes annotations have to be converted into the coco format using `tools/dataset_converters/cityscapes.py`: + +```shell +pip install cityscapesscripts +python tools/dataset_converters/cityscapes.py ./data/cityscapes --nproc 8 --out-dir ./data/cityscapes/annotations +``` + +Currently, the config files in `cityscapes` use COCO pre-trained weights to initialize. +You could download the pre-trained models in advance if the network is unavailable or slow, otherwise, it would cause errors at the beginning of training. + +## Prepare your own customized model + +The second step is to use your own module or training setting. Assume that we want to implement a new neck called `AugFPN` to replace with the default `FPN` under the existing detector Cascade Mask R-CNN R50. The following implements `AugFPN` under MMDetection. + +### 1. Define a new neck (e.g. AugFPN) + +Firstly create a new file `mmdet/models/necks/augfpn.py`. + +```python +import torch.nn as nn +from mmdet.registry import MODELS + + +@MODELS.register_module() +class AugFPN(nn.Module): + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False): + pass + + def forward(self, inputs): + # implementation is ignored + pass +``` + +### 2. Import the module + +You can either add the following line to `mmdet/models/necks/__init__.py`, + +```python +from .augfpn import AugFPN +``` + +or alternatively add + +```python +custom_imports = dict( + imports=['mmdet.models.necks.augfpn'], + allow_failed_imports=False) +``` + +to the config file and avoid modifying the original code. + +### 3. Modify the config file + +```python +neck=dict( + type='AugFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5) +``` + +For more detailed usages about customizing your own models (e.g. implement a new backbone, head, loss, etc) and runtime training settings (e.g. define a new optimizer, use gradient clip, customize training schedules and hooks, etc), please refer to the guideline [Customize Models](../advanced_guides/customize_models.md) and [Customize Runtime Settings](../advanced_guides/customize_runtime.md) respectively. + +## Prepare a config + +The third step is to prepare a config for your own training setting. Assume that we want to add `AugFPN` and `Rotate` or `Translate` augmentation to existing Cascade Mask R-CNN R50 to train the cityscapes dataset, and assume the config is under directory `configs/cityscapes/` and named as `cascade-mask-rcnn_r50_augfpn_autoaug-10e_cityscapes.py`, the config is as below. + +```python +# The new config inherits the base configs to highlight the necessary modification +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py' +] + +model = dict( + # set None to avoid loading ImageNet pre-trained backbone, + # instead here we set `load_from` to load from COCO pre-trained detectors. 
+ backbone=dict(init_cfg=None), + # replace neck from defaultly `FPN` to our new implemented module `AugFPN` + neck=dict( + type='AugFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + # We also need to change the num_classes in head from 80 to 8, to match the + # cityscapes dataset's annotation. This modification involves `bbox_head` and `mask_head`. + roi_head=dict( + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + # change the number of classes from defaultly COCO to cityscapes + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + # change the number of classes from defaultly COCO to cityscapes + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + # change the number of classes from defaultly COCO to cityscapes + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + # change the number of classes from default COCO to cityscapes + num_classes=8, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) + +# over-write `train_pipeline` for new added `AutoAugment` training setting +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='AutoAugment', + policies=[ + [dict( + type='Rotate', + level=5, + img_border_value=(124, 116, 104), + prob=0.5) + ], + [dict(type='Rotate', level=7, img_border_value=(124, 116, 104)), + dict( + type='TranslateX', + level=5, + prob=0.5, + img_border_value=(124, 116, 104)) + ], + ]), + dict( + type='RandomResize', + scale=[(2048, 800), (2048, 1024)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs'), +] + +# set batch_size per gpu, and set new training pipeline +train_dataloader = dict( + batch_size=1, + num_workers=3, + # over-write `pipeline` with new training pipeline setting + dataset=dict(pipeline=train_pipeline)) + +# Set optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +# Set customized learning policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=10, + by_epoch=True, + milestones=[8], + gamma=0.1) +] + +# train, val, test loop config +train_cfg = dict(max_epochs=10, val_interval=1) + +# We can use the COCO pre-trained Cascade Mask R-CNN R50 model for a more stable 
performance initialization +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth' +``` + +## Train a new model + +To train a model with the new config, you can simply run + +```shell +python tools/train.py configs/cityscapes/cascade-mask-rcnn_r50_augfpn_autoaug-10e_cityscapes.py +``` + +For more detailed usages, please refer to the [training guide](train.md). + +## Test and inference + +To test the trained model, you can simply run + +```shell +python tools/test.py configs/cityscapes/cascade-mask-rcnn_r50_augfpn_autoaug-10e_cityscapes.py work_dirs/cascade-mask-rcnn_r50_augfpn_autoaug-10e_cityscapes/epoch_10.pth +``` + +For more detailed usages, please refer to the [testing guide](test.md). diff --git a/mmdetection/docs/en/user_guides/robustness_benchmarking.md b/mmdetection/docs/en/user_guides/robustness_benchmarking.md new file mode 100644 index 0000000..f657956 --- /dev/null +++ b/mmdetection/docs/en/user_guides/robustness_benchmarking.md @@ -0,0 +1,110 @@ +# Corruption Benchmarking + +## Introduction + +We provide tools to test object detection and instance segmentation models on the image corruption benchmark defined in [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484). +This page provides basic tutorials how to use the benchmark. + +```latex +@article{michaelis2019winter, + title={Benchmarking Robustness in Object Detection: + Autonomous Driving when Winter is Coming}, + author={Michaelis, Claudio and Mitzkus, Benjamin and + Geirhos, Robert and Rusak, Evgenia and + Bringmann, Oliver and Ecker, Alexander S. and + Bethge, Matthias and Brendel, Wieland}, + journal={arXiv:1907.07484}, + year={2019} +} +``` + +![image corruption example](../../../resources/corruptions_sev_3.png) + +## About the benchmark + +To submit results to the benchmark please visit the [benchmark homepage](https://github.com/bethgelab/robust-detection-benchmark) + +The benchmark is modelled after the [imagenet-c benchmark](https://github.com/hendrycks/robustness) which was originally +published in [Benchmarking Neural Network Robustness to Common Corruptions and Perturbations](https://arxiv.org/abs/1903.12261) (ICLR 2019) by Dan Hendrycks and Thomas Dietterich. + +The image corruption functions are included in this library but can be installed separately using: + +```shell +pip install imagecorruptions +``` + +Compared to imagenet-c a few changes had to be made to handle images of arbitrary size and greyscale images. +We also modified the 'motion blur' and 'snow' corruptions to remove dependency from a linux specific library, +which would have to be installed separately otherwise. For details please refer to the [imagecorruptions repository](https://github.com/bethgelab/imagecorruptions). + +## Inference with pretrained models + +We provide a testing script to evaluate a models performance on any combination of the corruptions provided in the benchmark. + +### Test a dataset + +- [x] single GPU testing +- [ ] multiple GPU testing +- [ ] visualize detection results + +You can use the following commands to test a models performance under the 15 corruptions used in the benchmark. + +```shell +# single-gpu testing +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] +``` + +Alternatively different group of corruptions can be selected. 
+
+```shell
+# noise
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] --corruptions noise
+
+# blur
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] --corruptions blur
+
+# weather
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] --corruptions weather
+
+# digital
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] --corruptions digital
+```
+
+Or a custom set of corruptions, e.g.:
+
+```shell
+# gaussian noise, zoom blur and snow
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions gaussian_noise zoom_blur snow
+```
+
+Finally, the corruption severities to evaluate can be chosen.
+Severity 0 corresponds to clean data and the effect increases from 1 to 5.
+
+```shell
+# severity 1
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 1
+
+# severities 0,2,4
+python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 0 2 4
+```
+
+## Results for modelzoo models
+
+The results on COCO 2017val are shown in the table below.
+
+| Model | Backbone | Style | Lr schd | box AP clean | box AP corr. | box % | mask AP clean | mask AP corr. | mask % |
+| :-----------------: | :-----------------: | :-----: | :-----: | :----------: | :----------: | :---: | :-----------: | :-----------: | :----: |
+| Faster R-CNN | R-50-FPN | pytorch | 1x | 36.3 | 18.2 | 50.2 | - | - | - |
+| Faster R-CNN | R-101-FPN | pytorch | 1x | 38.5 | 20.9 | 54.2 | - | - | - |
+| Faster R-CNN | X-101-32x4d-FPN | pytorch | 1x | 40.1 | 22.3 | 55.5 | - | - | - |
+| Faster R-CNN | X-101-64x4d-FPN | pytorch | 1x | 41.3 | 23.4 | 56.6 | - | - | - |
+| Faster R-CNN | R-50-FPN-DCN | pytorch | 1x | 40.0 | 22.4 | 56.1 | - | - | - |
+| Faster R-CNN | X-101-32x4d-FPN-DCN | pytorch | 1x | 43.4 | 26.7 | 61.6 | - | - | - |
+| Mask R-CNN | R-50-FPN | pytorch | 1x | 37.3 | 18.7 | 50.1 | 34.2 | 16.8 | 49.1 |
+| Mask R-CNN | R-50-FPN-DCN | pytorch | 1x | 41.1 | 23.3 | 56.7 | 37.2 | 20.7 | 55.7 |
+| Cascade R-CNN | R-50-FPN | pytorch | 1x | 40.4 | 20.1 | 49.7 | - | - | - |
+| Cascade Mask R-CNN | R-50-FPN | pytorch | 1x | 41.2 | 20.7 | 50.2 | 35.7 | 17.6 | 49.3 |
+| RetinaNet | R-50-FPN | pytorch | 1x | 35.6 | 17.8 | 50.1 | - | - | - |
+| Hybrid Task Cascade | X-101-64x4d-FPN-DCN | pytorch | 1x | 50.6 | 32.7 | 64.7 | 43.8 | 28.1 | 64.0 |
+
+Results may vary slightly due to the stochastic application of the corruptions.
diff --git a/mmdetection/docs/en/user_guides/semi_det.md b/mmdetection/docs/en/user_guides/semi_det.md
new file mode 100644
index 0000000..ee86c30
--- /dev/null
+++ b/mmdetection/docs/en/user_guides/semi_det.md
@@ -0,0 +1,325 @@
+# Semi-supervised Object Detection
+
+Semi-supervised object detection uses both labeled data and unlabeled data for training. It not only reduces the annotation burden for training high-performance object detectors but also further improves the object detector by using large amounts of unlabeled data.
+ +A typical procedure to train a semi-supervised object detector is as below: + +- [Semi-supervised Object Detection](#semi-supervised-object-detection) + - [Prepare and split dataset](#prepare-and-split-dataset) + - [Configure multi-branch pipeline](#configure-multi-branch-pipeline) + - [Configure semi-supervised dataloader](#configure-semi-supervised-dataloader) + - [Configure semi-supervised model](#configure-semi-supervised-model) + - [Configure MeanTeacherHook](#configure-meanteacherhook) + - [Configure TeacherStudentValLoop](#configure-teacherstudentvalloop) + +## Prepare and split dataset + +We provide a dataset download script, which downloads the coco2017 dataset by default and decompresses it automatically. + +```shell +python tools/misc/download_dataset.py +``` + +The decompressed dataset directory structure is as below: + +```plain +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── image_info_unlabeled2017.json +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── test2017 +│ │ ├── train2017 +│ │ ├── unlabeled2017 +│ │ ├── val2017 +``` + +There are two common experimental settings for semi-supervised object detection on the coco2017 dataset: + +(1) Split `train2017` according to a fixed percentage (1%, 2%, 5% and 10%) as a labeled dataset, and the rest of `train2017` as an unlabeled dataset. Because the different splits of `train2017` as labeled datasets will cause significant fluctuation on the accuracy of the semi-supervised detectors, five-fold cross-validation is used in practice to evaluate the algorithm. We provide the dataset split script: + +```shell +python tools/misc/split_coco.py +``` + +By default, the script will split `train2017` according to the labeled data ratio 1%, 2%, 5% and 10%, and each split will be randomly repeated 5 times for cross-validation. The generated semi-supervised annotation file name format is as below: + +- the name format of labeled dataset: `instances_train2017.{fold}@{percent}.json` +- the name format of unlabeled dataset: `instances_train2017.{fold}@{percent}-unlabeled.json` + +Here, `fold` is used for cross-validation, and `percent` represents the ratio of labeled data. The directory structure of the divided dataset is as below: + +```plain +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── image_info_unlabeled2017.json +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── semi_anns +│ │ │ ├── instances_train2017.1@1.json +│ │ │ ├── instances_train2017.1@1-unlabeled.json +│ │ │ ├── instances_train2017.1@2.json +│ │ │ ├── instances_train2017.1@2-unlabeled.json +│ │ │ ├── instances_train2017.1@5.json +│ │ │ ├── instances_train2017.1@5-unlabeled.json +│ │ │ ├── instances_train2017.1@10.json +│ │ │ ├── instances_train2017.1@10-unlabeled.json +│ │ │ ├── instances_train2017.2@1.json +│ │ │ ├── instances_train2017.2@1-unlabeled.json +│ │ ├── test2017 +│ │ ├── train2017 +│ │ ├── unlabeled2017 +│ │ ├── val2017 +``` + +(2) Use `train2017` as the labeled dataset and `unlabeled2017` as the unlabeled dataset. 
Since `image_info_unlabeled2017.json` does not contain `categories` information, the `CocoDataset` cannot be initialized, so you need to write the `categories` of `instances_train2017.json` into `image_info_unlabeled2017.json` and save it as `instances_unlabeled2017.json`, the relevant script is as below: + +```python +from mmengine.fileio import load, dump + +anns_train = load('instances_train2017.json') +anns_unlabeled = load('image_info_unlabeled2017.json') +anns_unlabeled['categories'] = anns_train['categories'] +dump(anns_unlabeled, 'instances_unlabeled2017.json') +``` + +The processed dataset directory is as below: + +```plain +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── image_info_unlabeled2017.json +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_unlabeled2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── test2017 +│ │ ├── train2017 +│ │ ├── unlabeled2017 +│ │ ├── val2017 +``` + +## Configure multi-branch pipeline + +There are two main approaches to semi-supervised learning, +[consistency regularization](https://research.nvidia.com/sites/default/files/publications/laine2017iclr_paper.pdf) +and [pseudo label](https://www.researchgate.net/profile/Dong-Hyun-Lee/publication/280581078_Pseudo-Label_The_Simple_and_Efficient_Semi-Supervised_Learning_Method_for_Deep_Neural_Networks/links/55bc4ada08ae092e9660b776/Pseudo-Label-The-Simple-and-Efficient-Semi-Supervised-Learning-Method-for-Deep-Neural-Networks.pdf). +Consistency regularization often requires some careful design, while pseudo label have a simpler form and are easier to extend to downstream tasks. +We adopt a teacher-student joint training semi-supervised object detection framework based on pseudo label, so labeled data and unlabeled data need to configure different data pipeline: + +(1) Pipeline for labeled data: + +```python +# pipeline used to augment labeled data, +# which will be sent to student model for supervised training. +sup_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='RandAugment', aug_space=color_space, aug_num=1), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='MultiBranch', sup=dict(type='PackDetInputs')) +] +``` + +(2) Pipeline for unlabeled data: + +```python +# pipeline used to augment unlabeled data weakly, +# which will be sent to teacher model for predicting pseudo instances. +weak_pipeline = [ + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', + 'homography_matrix')), +] + +# pipeline used to augment unlabeled data strongly, +# which will be sent to student model for unsupervised training. 
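+# Compared with `weak_pipeline`, it additionally applies color and geometric
+# RandAugment in random order plus RandomErasing before packing the inputs.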
+strong_pipeline = [ + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomOrder', + transforms=[ + dict(type='RandAugment', aug_space=color_space, aug_num=1), + dict(type='RandAugment', aug_space=geometric, aug_num=1), + ]), + dict(type='RandomErasing', n_patches=(1, 5), ratio=(0, 0.2)), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', + 'homography_matrix')), +] + +# pipeline used to augment unlabeled data into different views +unsup_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadEmptyAnnotations'), + dict( + type='MultiBranch', + unsup_teacher=weak_pipeline, + unsup_student=strong_pipeline, + ) +] +``` + +## Configure semi-supervised dataloader + +(1) Build a semi-supervised dataset. Use `ConcatDataset` to concatenate labeled and unlabeled datasets. + +```python +labeled_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=sup_pipeline) + +unlabeled_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_unlabeled2017.json', + data_prefix=dict(img='unlabeled2017/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=unsup_pipeline) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + sampler=dict( + type='GroupMultiSourceSampler', + batch_size=batch_size, + source_ratio=[1, 4]), + dataset=dict( + type='ConcatDataset', datasets=[labeled_dataset, unlabeled_dataset])) +``` + +(2) Use multi-source dataset sampler. Use `GroupMultiSourceSampler` to sample data form batches from `labeled_dataset` and `labeled_dataset`, `source_ratio` controls the proportion of labeled data and unlabeled data in the batch. `GroupMultiSourceSampler` also ensures that the images in the same batch have similar aspect ratios. If you don't need to guarantee the aspect ratio of the images in the batch, you can use `MultiSourceSampler`. The sampling diagram of `GroupMultiSourceSampler` is as below: + +
    + +
    + +`sup=1000` indicates that the scale of the labeled dataset is 1000, `sup_h=200` indicates that the scale of the images with an aspect ratio greater than or equal to 1 in the labeled dataset is 200, and `sup_w=800` indicates that the scale of the images with an aspect ratio less than 1 in the labeled dataset is 800, +`unsup=9000` indicates that the scale of the unlabeled dataset is 9000, `unsup_h=1800` indicates that the scale of the images with an aspect ratio greater than or equal to 1 in the unlabeled dataset is 1800, and `unsup_w=7200` indicates the scale of the images with an aspect ratio less than 1 in the unlabeled dataset is 7200. +`GroupMultiSourceSampler` randomly selects a group according to the overall aspect ratio distribution of the images in the labeled dataset and the unlabeled dataset, and then sample data to form batches from the two datasets according to `source_ratio`, so labeled datasets and unlabeled datasets have different repetitions. + +## Configure semi-supervised model + +We choose `Faster R-CNN` as `detector` for semi-supervised training. Take the semi-supervised object detection algorithm `SoftTeacher` as an example, +the model configuration can be inherited from `_base_/models/faster-rcnn_r50_fpn.py`, replacing the backbone network of the detector with `caffe` style. +Note that unlike the supervised training configs, `Faster R-CNN` as `detector` is an attribute of `model`, not `model` . +In addition, `data_preprocessor` needs to be set to `MultiBranchDataPreprocessor`, which is used to pad and normalize images from different pipelines. +Finally, parameters required for semi-supervised training and testing can be configured via `semi_train_cfg` and `semi_test_cfg`. + +```python +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/default_runtime.py', + '../_base_/datasets/semi_coco_detection.py' +] + +detector = _base_.model +detector.data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32) +detector.backbone = dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')) + +model = dict( + _delete_=True, + type='SoftTeacher', + detector=detector, + data_preprocessor=dict( + type='MultiBranchDataPreprocessor', + data_preprocessor=detector.data_preprocessor), + semi_train_cfg=dict( + freeze_teacher=True, + sup_weight=1.0, + unsup_weight=4.0, + pseudo_label_initial_score_thr=0.5, + rpn_pseudo_thr=0.9, + cls_pseudo_thr=0.9, + reg_pseudo_thr=0.02, + jitter_times=10, + jitter_scale=0.06, + min_pseudo_bbox_wh=(1e-2, 1e-2)), + semi_test_cfg=dict(predict_on='teacher')) +``` + +In addition, we also support semi-supervised training for other detection models, such as `RetinaNet` and `Cascade R-CNN`. 
Since `SoftTeacher` only supports `Faster R-CNN`, it needs to be replaced with `SemiBaseDetector`, example is as below: + +```python +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', '../_base_/default_runtime.py', + '../_base_/datasets/semi_coco_detection.py' +] + +detector = _base_.model + +model = dict( + _delete_=True, + type='SemiBaseDetector', + detector=detector, + data_preprocessor=dict( + type='MultiBranchDataPreprocessor', + data_preprocessor=detector.data_preprocessor), + semi_train_cfg=dict( + freeze_teacher=True, + sup_weight=1.0, + unsup_weight=1.0, + cls_pseudo_thr=0.9, + min_pseudo_bbox_wh=(1e-2, 1e-2)), + semi_test_cfg=dict(predict_on='teacher')) +``` + +Following the semi-supervised training configuration of `SoftTeacher`, change `batch_size` to 2 and `source_ratio` to `[1, 1]`, the experimental results of supervised and semi-supervised training of `RetinaNet`, `Faster R-CNN`, `Cascade R-CNN` and `SoftTeacher` on the 10% coco `train2017` are as below: + +| Model | Detector | BackBone | Style | sup-0.1-coco mAP | semi-0.1-coco mAP | +| :--------------: | :-----------: | :------: | :---: | :--------------: | :---------------: | +| SemiBaseDetector | RetinaNet | R-50-FPN | caffe | 23.5 | 27.7 | +| SemiBaseDetector | Faster R-CNN | R-50-FPN | caffe | 26.7 | 28.4 | +| SemiBaseDetector | Cascade R-CNN | R-50-FPN | caffe | 28.0 | 29.7 | +| SoftTeacher | Faster R-CNN | R-50-FPN | caffe | 26.7 | 31.1 | + +## Configure MeanTeacherHook + +Usually, the teacher model is updated by Exponential Moving Average (EMA) the student model, and then the teacher model is optimized with the optimization of the student model, which can be achieved by configuring `custom_hooks`: + +```python +custom_hooks = [dict(type='MeanTeacherHook')] +``` + +## Configure TeacherStudentValLoop + +Since there are two models in the teacher-student joint training framework, we can replace `ValLoop` with `TeacherStudentValLoop` to test the accuracy of both models during the training process. + +```python +val_cfg = dict(type='TeacherStudentValLoop') +``` diff --git a/mmdetection/docs/en/user_guides/single_stage_as_rpn.md b/mmdetection/docs/en/user_guides/single_stage_as_rpn.md new file mode 100644 index 0000000..93a48dd --- /dev/null +++ b/mmdetection/docs/en/user_guides/single_stage_as_rpn.md @@ -0,0 +1,176 @@ +# Use a single stage detector as RPN + +Region proposal network (RPN) is a submodule in [Faster R-CNN](https://arxiv.org/abs/1506.01497), which generates proposals for the second stage of Faster R-CNN. Most two-stage detectors in MMDetection use [`RPNHead`](../../../mmdet/models/dense_heads/rpn_head.py) to generate proposals as RPN. However, any single-stage detector can serve as an RPN since their bounding box predictions can also be regarded as region proposals and thus be refined in the R-CNN. Therefore, MMDetection v3.0 supports that. + +To illustrate the whole process, here we give an example of how to use an anchor-free single-stage model [FCOS](../../../configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py) as an RPN in [Faster R-CNN](../../../configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py). + +The outline of this tutorial is as below: + +1. Use `FCOSHead` as an `RPNHead` in Faster R-CNN +2. Evaluate proposals +3. 
Train the customized Faster R-CNN with pre-trained FCOS + +## Use `FCOSHead` as an `RPNHead` in Faster R-CNN + +To set `FCOSHead` as an `RPNHead` in Faster R-CNN, we should create a new config file named `configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py`, and replace with the setting of `rpn_head` with the setting of `bbox_head` in `configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py`. Besides, we still use the neck setting of FCOS with strides of `[8, 16, 32, 64, 128]`, and update `featmap_strides` of `bbox_roi_extractor` to `[8, 16, 32, 64, 128]`. To avoid loss goes NAN, we apply warmup during the first 1000 iterations instead of the first 500 iterations, which means that the lr increases more slowly. The config is as follows: + +```python +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + # copied from configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py + neck=dict( + start_level=1, + add_extra_convs='on_output', # use P5 + relu_before_extra_convs=True), + rpn_head=dict( + _delete_=True, # ignore the unused old settings + type='FCOSHead', + num_classes=1, # num_classes = 1 for rpn, if num_classes > 1, it will be set to 1 in TwoStageDetector automatically + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + roi_head=dict( # update featmap_strides due to the strides in neck + bbox_roi_extractor=dict(featmap_strides=[8, 16, 32, 64, 128]))) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), # Slowly increase lr, otherwise loss becomes NAN + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] +``` + +Then, we could use the following command to train our customized model. For more training commands, please refer to [here](train.md). + +```python +# training with 8 GPUS +bash tools/dist_train.sh configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py \ + 8 \ + --work-dir ./work_dirs/faster-rcnn_r50_fpn_fcos-rpn_1x_coco +``` + +## Evaluate proposals + +The quality of proposals is of great importance to the performance of detector, therefore, we also provide a way to evaluate proposals. Same as above, create a new config file named `configs/rpn/fcos-rpn_r50_fpn_1x_coco.py`, and replace with setting of `rpn_head` with the setting of `bbox_head` in `configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py`. 
+ +```python +_base_ = [ + '../_base_/models/rpn_r50_fpn.py', '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +val_evaluator = dict(metric='proposal_fast') +test_evaluator = val_evaluator + +model = dict( + # copied from configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py + neck=dict( + start_level=1, + add_extra_convs='on_output', # use P5 + relu_before_extra_convs=True), + rpn_head=dict( + _delete_=True, # ignore the unused old settings + type='FCOSHead', + num_classes=1, # num_classes = 1 for rpn, if num_classes > 1, it will be set to 1 in RPN automatically + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0))) +``` + +Suppose we have the checkpoint `./work_dirs/faster-rcnn_r50_fpn_fcos-rpn_1x_coco/epoch_12.pth` after training, then we can evaluate the quality of proposals with the following command. + +```python +# testing with 8 GPUs +bash tools/dist_test.sh \ + configs/rpn/fcos-rpn_r50_fpn_1x_coco.py \ + ./work_dirs/faster-rcnn_r50_fpn_fcos-rpn_1x_coco/epoch_12.pth \ + 8 +``` + +## Train the customized Faster R-CNN with pre-trained FCOS + +Pre-training not only speeds up convergence of training, but also improves the performance of the detector. Therefore, here we give an example to illustrate how to do use a pre-trained FCOS as an RPN to accelerate training and improve the accuracy. Suppose we want to use `FCOSHead` as an rpn head in Faster R-CNN and train with the pre-trained [`fcos_r50-caffe_fpn_gn-head_1x_coco`](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth). The content of config file named `configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_fcos-rpn_1x_coco.py` is as the following. Note that `fcos_r50-caffe_fpn_gn-head_1x_coco` uses a caffe version of ResNet50, the pixel mean and std in `data_preprocessor` thus need to be updated. 
+ +```python +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +model = dict( + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(type='BN', requires_grad=False), + style='caffe', + init_cfg=None), # the checkpoint in ``load_from`` contains the weights of backbone + neck=dict( + start_level=1, + add_extra_convs='on_output', # use P5 + relu_before_extra_convs=True), + rpn_head=dict( + _delete_=True, # ignore the unused old settings + type='FCOSHead', + num_classes=1, # num_classes = 1 for rpn, if num_classes > 1, it will be set to 1 in TwoStageDetector automatically + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + roi_head=dict( # update featmap_strides due to the strides in neck + bbox_roi_extractor=dict(featmap_strides=[8, 16, 32, 64, 128]))) + +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth' +``` + +The command for training is as below. + +```python +bash tools/dist_train.sh \ + configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_fcos-rpn_1x_coco.py \ + 8 \ + --work-dir ./work_dirs/faster-rcnn_r50-caffe_fpn_fcos-rpn_1x_coco +``` diff --git a/mmdetection/docs/en/user_guides/test.md b/mmdetection/docs/en/user_guides/test.md new file mode 100644 index 0000000..129a240 --- /dev/null +++ b/mmdetection/docs/en/user_guides/test.md @@ -0,0 +1,303 @@ +# Test existing models on standard datasets + +To evaluate a model's accuracy, one usually tests the model on some standard datasets, please refer to [dataset prepare guide](dataset_prepare.md) to prepare the dataset. + +This section will show how to test existing models on supported datasets. + +## Test existing models + +We provide testing scripts for evaluating an existing model on the whole dataset (COCO, PASCAL VOC, Cityscapes, etc.). +The following testing environments are supported: + +- single GPU +- CPU +- single node multiple GPUs +- multiple nodes + +Choose the proper script to perform testing depending on the testing environment. + +```shell +# Single-gpu testing +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--out ${RESULT_FILE}] \ + [--show] + +# CPU: disable GPUs and run single-gpu testing script +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--out ${RESULT_FILE}] \ + [--show] + +# Multi-gpu testing +bash tools/dist_test.sh \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + ${GPU_NUM} \ + [--out ${RESULT_FILE}] +``` + +`tools/dist_test.sh` also supports multi-node testing, but relies on PyTorch's [launch utility](https://pytorch.org/docs/stable/distributed.html#launch-utility). + +Optional arguments: + +- `RESULT_FILE`: Filename of the output results in pickle format. If not specified, the results will not be saved to a file. +- `--show`: If specified, detection results will be plotted on the images and shown in a new window. It is only applicable to single GPU testing and used for debugging and visualization. Please make sure that GUI is available in your environment. 
Otherwise, you may encounter an error like `cannot connect to X server`. +- `--show-dir`: If specified, detection results will be plotted on the images and saved to the specified directory. It is only applicable to single GPU testing and used for debugging and visualization. You do NOT need a GUI available in your environment for using this option. +- `--work-dir`: If specified, detection results containing evaluation metrics will be saved to the specified directory. +- `--cfg-options`: If specified, the key-value pair optional cfg will be merged into config file + +## Examples + +Assuming that you have already downloaded the checkpoints to the directory `checkpoints/`. + +1. Test RTMDet and visualize the results. Press any key for the next image. + Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet). + + ```shell + python tools/test.py \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --show + ``` + +2. Test RTMDet and save the painted images for future visualization. + Config and checkpoint files are available [here](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet). + + ```shell + python tools/test.py \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --show-dir faster_rcnn_r50_fpn_1x_results + ``` + +3. Test Faster R-CNN on PASCAL VOC (without saving the test results). + Config and checkpoint files are available [here](../../../configs/pascal_voc). + + ```shell + python tools/test.py \ + configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py \ + checkpoints/faster_rcnn_r50_fpn_1x_voc0712_20200624-c9895d40.pth + ``` + +4. Test Mask R-CNN with 8 GPUs, and evaluate. + Config and checkpoint files are available [here](../../../configs/mask_rcnn). + + ```shell + ./tools/dist_test.sh \ + configs/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ + 8 \ + --out results.pkl + ``` + +5. Test Mask R-CNN with 8 GPUs, and evaluate the metric **class-wise**. + Config and checkpoint files are available [here](../../../configs/mask_rcnn). + + ```shell + ./tools/dist_test.sh \ + configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ + 8 \ + --out results.pkl \ + --cfg-options test_evaluator.classwise=True + ``` + +6. Test Mask R-CNN on COCO test-dev with 8 GPUs, and generate JSON files for submitting to the official evaluation server. + Config and checkpoint files are available [here](../../../configs/mask_rcnn). + + Replace the original test_evaluator and test_dataloader with test_evaluator and test_dataloader in the comment in [config](../../../configs/_base_/datasets/coco_instance.py) and run: + + ```shell + ./tools/dist_test.sh \ + configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ + 8 + ``` + + This command generates two JSON files `./work_dirs/coco_instance/test.bbox.json` and `./work_dirs/coco_instance/test.segm.json`. + +7. Test Mask R-CNN on Cityscapes test with 8 GPUs, and generate txt and png files for submitting to the official evaluation server. + Config and checkpoint files are available [here](../../../configs/cityscapes). 
+ + Replace the original test_evaluator and test_dataloader with test_evaluator and test_dataloader in the comment in [config](../../../configs/_base_/datasets/cityscapes_instance.py) and run: + + ```shell + ./tools/dist_test.sh \ + configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py \ + checkpoints/mask_rcnn_r50_fpn_1x_cityscapes_20200227-afe51d5a.pth \ + 8 + ``` + + The generated png and txt would be under `./work_dirs/cityscapes_metric/` directory. + +## Test without Ground Truth Annotations + +MMDetection supports to test models without ground-truth annotations using `CocoDataset`. If your dataset format is not in COCO format, please convert them to COCO format. For example, if your dataset format is VOC, you can directly convert it to COCO format by the [script in tools.](../../../tools/dataset_converters/pascal_voc.py) If your dataset format is Cityscapes, you can directly convert it to COCO format by the [script in tools.](../../../tools/dataset_converters/cityscapes.py) The rest of the formats can be converted using [this script](../../../tools/dataset_converters/images2coco.py). + +```shell +python tools/dataset_converters/images2coco.py \ + ${IMG_PATH} \ + ${CLASSES} \ + ${OUT} \ + [--exclude-extensions] +``` + +arguments: + +- `IMG_PATH`: The root path of images. +- `CLASSES`: The text file with a list of categories. +- `OUT`: The output annotation json file name. The save dir is in the same directory as `IMG_PATH`. +- `exclude-extensions`: The suffix of images to be excluded, such as 'png' and 'bmp'. + +After the conversion is complete, you need to replace the original test_evaluator and test_dataloader with test_evaluator and test_dataloader in the comment in [config](../../../configs/_base_/datasets/coco_detection.py)(find which dataset in 'configs/_base_/datasets' the current config corresponds to) and run: + +```shell +# Single-gpu testing +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--show] + +# CPU: disable GPUs and run single-gpu testing script +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--out ${RESULT_FILE}] \ + [--show] + +# Multi-gpu testing +bash tools/dist_test.sh \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + ${GPU_NUM} \ + [--show] +``` + +Assuming that the checkpoints in the [model zoo](https://mmdetection.readthedocs.io/en/latest/modelzoo_statistics.html) have been downloaded to the directory `checkpoints/`, we can test Mask R-CNN on COCO test-dev with 8 GPUs, and generate JSON files using the following command. + +```sh +./tools/dist_test.sh \ + configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ + 8 +``` + +This command generates two JSON files `./work_dirs/coco_instance/test.bbox.json` and `./work_dirs/coco_instance/test.segm.json`. + +## Batch Inference + +MMDetection supports inference with a single image or batched images in test mode. By default, we use single-image inference and you can use batch inference by modifying `samples_per_gpu` in the config of test data. You can do that either by modifying the config as below. + +```shell +data = dict(train_dataloader=dict(...), val_dataloader=dict(...), test_dataloader=dict(batch_size=2, ...)) +``` + +Or you can set it through `--cfg-options` as `--cfg-options test_dataloader.batch_size=2` + +## Test Time Augmentation (TTA) + +Test time augmentation (TTA) is a data augmentation strategy used during the test phase. 
It applies different augmentations, such as flipping and scaling, to the same image for model inference, and then merges the predictions of each augmented image to obtain more accurate predictions. To make it easier for users to use TTA, MMEngine provides the [BaseTTAModel](https://mmengine.readthedocs.io/en/latest/api/generated/mmengine.model.BaseTTAModel.html#mmengine.model.BaseTTAModel) class, which allows users to implement different TTA strategies by simply extending the BaseTTAModel class according to their needs.
+
+In MMDetection, we provide the [DetTTAModel](../../../mmdet/models/test_time_augs/det_tta.py) class, which inherits from BaseTTAModel.
+
+### Use case
+
+Using TTA requires two steps. First, you need to add `tta_model` and `tta_pipeline` in the configuration file:
+
+```python
+tta_model = dict(
+    type='DetTTAModel',
+    tta_cfg=dict(nms=dict(
+                     type='nms',
+                     iou_threshold=0.5),
+                 max_per_img=100))
+
+tta_pipeline = [
+    dict(type='LoadImageFromFile',
+         backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[[
+            dict(type='Resize', scale=(1333, 800), keep_ratio=True)
+        ], [  # It uses 2 flipping transformations (flipping and not flipping).
+            dict(type='RandomFlip', prob=1.),
+            dict(type='RandomFlip', prob=0.)
+        ], [
+            dict(
+                type='PackDetInputs',
+                meta_keys=('img_id', 'img_path', 'ori_shape',
+                           'img_shape', 'scale_factor', 'flip',
+                           'flip_direction'))
+        ]])]
+```
+
+Second, set `--tta` when running the test scripts, as in the examples below:
+
+```shell
+# Single-gpu testing
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--tta]
+
+# CPU: disable GPUs and run single-gpu testing script
+export CUDA_VISIBLE_DEVICES=-1
+python tools/test.py \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    [--out ${RESULT_FILE}] \
+    [--tta]
+
+# Multi-gpu testing
+bash tools/dist_test.sh \
+    ${CONFIG_FILE} \
+    ${CHECKPOINT_FILE} \
+    ${GPU_NUM} \
+    [--tta]
+```
+
+You can also modify the TTA config yourself, for example by adding multi-scale resizing:
+
+```python
+tta_model = dict(
+    type='DetTTAModel',
+    tta_cfg=dict(nms=dict(
+                     type='nms',
+                     iou_threshold=0.5),
+                 max_per_img=100))
+
+img_scales = [(1333, 800), (666, 400), (2000, 1200)]
+tta_pipeline = [
+    dict(type='LoadImageFromFile',
+         backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[[
+            dict(type='Resize', scale=s, keep_ratio=True) for s in img_scales
+        ], [
+            dict(type='RandomFlip', prob=1.),
+            dict(type='RandomFlip', prob=0.)
+        ], [
+            dict(
+                type='PackDetInputs',
+                meta_keys=('img_id', 'img_path', 'ori_shape',
+                           'img_shape', 'scale_factor', 'flip',
+                           'flip_direction'))
+        ]])]
+```
+
+The above data augmentation pipeline will first perform 3 multi-scaling transformations on the image, followed by 2 flipping transformations (flipping and not flipping). Finally, the image is packaged into the final result using PackDetInputs.
+
+Here are more TTA use cases for your reference:
+
+- [RetinaNet](../../../configs/retinanet/retinanet_tta.py)
+- [CenterNet](../../../configs/centernet/centernet_tta.py)
+- [YOLOX](../../../configs/yolox/yolox_tta.py)
+- [RTMDet](../../../configs/rtmdet/rtmdet_tta.py)
+
+For more advanced usage and data flow of TTA, please refer to [MMEngine](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/test_time_augmentation.html#data-flow). We will support instance segmentation TTA later.
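+
+To build some intuition for what the merge step of `DetTTAModel` does, here is a small, self-contained sketch. It is not MMDetection's actual implementation, and the helper function below is hypothetical: predictions from the flipped view are mapped back to the original image frame and all candidates are fused with NMS, mirroring the `tta_cfg=dict(nms=...)` setting above.
+
+```python
+import torch
+from torchvision.ops import nms
+
+
+def merge_flip_tta(boxes_orig, scores_orig, boxes_flip, scores_flip,
+                   img_width, iou_threshold=0.5, max_per_img=100):
+    """Fuse predictions from the original and horizontally flipped views.
+
+    All box tensors are (N, 4) in (x1, y1, x2, y2) format.
+    """
+    # Undo the horizontal flip: x' = W - x, swapping x1/x2 to keep x1 < x2.
+    unflipped = boxes_flip.clone()
+    unflipped[:, 0] = img_width - boxes_flip[:, 2]
+    unflipped[:, 2] = img_width - boxes_flip[:, 0]
+
+    # Pool the candidates from both views and suppress duplicates with NMS.
+    boxes = torch.cat([boxes_orig, unflipped])
+    scores = torch.cat([scores_orig, scores_flip])
+    keep = nms(boxes, scores, iou_threshold)[:max_per_img]
+    return boxes[keep], scores[keep]
+```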
diff --git a/mmdetection/docs/en/user_guides/test_results_submission.md b/mmdetection/docs/en/user_guides/test_results_submission.md new file mode 100644 index 0000000..721347e --- /dev/null +++ b/mmdetection/docs/en/user_guides/test_results_submission.md @@ -0,0 +1,182 @@ +# Test Results Submission + +## Panoptic segmentation test results submission + +The following sections introduce how to produce the prediction results of panoptic segmentation models on the COCO test-dev set and submit the predictions to [COCO evaluation server](https://competitions.codalab.org/competitions/19507). + +### Prerequisites + +- Download [COCO test dataset images](http://images.cocodataset.org/zips/test2017.zip), [testing image info](http://images.cocodataset.org/annotations/image_info_test2017.zip), and [panoptic train/val annotations](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip), then unzip them, put 'test2017' to `data/coco/`, put json files and annotation files to `data/coco/annotations/`. + +```shell +# suppose data/coco/ does not exist +mkdir -pv data/coco/ + +# download test2017 +wget -P data/coco/ http://images.cocodataset.org/zips/test2017.zip +wget -P data/coco/ http://images.cocodataset.org/annotations/image_info_test2017.zip +wget -P data/coco/ http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip + +# unzip them +unzip data/coco/test2017.zip -d data/coco/ +unzip data/coco/image_info_test2017.zip -d data/coco/ +unzip data/coco/panoptic_annotations_trainval2017.zip -d data/coco/ + +# remove zip files (optional) +rm -rf data/coco/test2017.zip data/coco/image_info_test2017.zip data/coco/panoptic_annotations_trainval2017.zip +``` + +- Run the following code to update category information in testing image info. Since the attribute `isthing` is missing in category information of 'image_info_test-dev2017.json', we need to update it with the category information in 'panoptic_val2017.json'. + +```shell +python tools/misc/gen_coco_panoptic_test_info.py data/coco/annotations +``` + +After completing the above preparations, your directory structure of `data` should be like this: + +```text +data +`-- coco + |-- annotations + | |-- image_info_test-dev2017.json + | |-- image_info_test2017.json + | |-- panoptic_image_info_test-dev2017.json + | |-- panoptic_train2017.json + | |-- panoptic_train2017.zip + | |-- panoptic_val2017.json + | `-- panoptic_val2017.zip + `-- test2017 +``` + +### Inference on coco test-dev + +To do inference on coco test-dev, we should update the setting of `test_dataloder` and `test_evaluator` first. There two ways to do this: 1. update them in config file; 2. update them in command line. + +#### Update them in config file + +The relevant settings are provided at the end of `configs/_base_/datasets/coco_panoptic.py`, as below. + +```python +test_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/panoptic_image_info_test-dev2017.json', + data_prefix=dict(img='test2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_evaluator = dict( + type='CocoPanopticMetric', + format_only=True, + ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', + outfile_prefix='./work_dirs/coco_panoptic/test') +``` + +Any of the following way can be used to update the setting for inference on coco test-dev set. 
+ +Case 1: Directly uncomment the setting in `configs/_base_/datasets/coco_panoptic.py`. + +Case 2: Copy the following setting to the config file you used now. + +```python +test_dataloader = dict( + dataset=dict( + ann_file='annotations/panoptic_image_info_test-dev2017.json', + data_prefix=dict(img='test2017/', _delete_=True))) +test_evaluator = dict( + format_only=True, + ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', + outfile_prefix='./work_dirs/coco_panoptic/test') +``` + +Then infer on coco test-dev et by the following command. + +```shell +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} +``` + +#### Update them in command line + +The command for update of the related settings and inference on coco test-dev are as below. + +```shell +# test with single gpu +CUDA_VISIBLE_DEVICES=0 python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + --cfg-options \ + test_dataloader.dataset.ann_file=annotations/panoptic_image_info_test-dev2017.json \ + test_dataloader.dataset.data_prefix.img=test2017 \ + test_dataloader.dataset.data_prefix._delete_=True \ + test_evaluator.format_only=True \ + test_evaluator.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json \ + test_evaluator.outfile_prefix=${WORK_DIR}/results + +# test with four gpus +CUDA_VISIBLE_DEVICES=0,1,3,4 bash tools/dist_test.sh \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + 8 \ # eights gpus + --cfg-options \ + test_dataloader.dataset.ann_file=annotations/panoptic_image_info_test-dev2017.json \ + test_dataloader.dataset.data_prefix.img=test2017 \ + test_dataloader.dataset.data_prefix._delete_=True \ + test_evaluator.format_only=True \ + test_evaluator.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json \ + test_evaluator.outfile_prefix=${WORK_DIR}/results + +# test with slurm +GPUS=8 tools/slurm_test.sh \ + ${Partition} \ + ${JOB_NAME} \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + --cfg-options \ + test_dataloader.dataset.ann_file=annotations/panoptic_image_info_test-dev2017.json \ + test_dataloader.dataset.data_prefix.img=test2017 \ + test_dataloader.dataset.data_prefix._delete_=True \ + test_evaluator.format_only=True \ + test_evaluator.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json \ + test_evaluator.outfile_prefix=${WORK_DIR}/results +``` + +Example + +Suppose we perform inference on `test2017` using pretrained MaskFormer with ResNet-50 backbone. + +```shell +# test with single gpu +CUDA_VISIBLE_DEVICES=0 python tools/test.py \ + configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py \ + checkpoints/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth \ + --cfg-options \ + test_dataloader.dataset.ann_file=annotations/panoptic_image_info_test-dev2017.json \ + test_dataloader.dataset.data_prefix.img=test2017 \ + test_dataloader.dataset.data_prefix._delete_=True \ + test_evaluator.format_only=True \ + test_evaluator.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json \ + test_evaluator.outfile_prefix=work_dirs/maskformer/results +``` + +### Rename files and zip results + +After inference, the panoptic segmentation results (a json file and a directory where the masks are stored) will be in `WORK_DIR`. We should rename them according to the naming convention described on [COCO's Website](https://cocodataset.org/#upload). Finally, we need to compress the json and the directory where the masks are stored into a zip file, and rename the zip file according to the naming convention. 
Note that the zip file should **directly** contains the above two files. + +The commands to rename files and zip results: + +```shell +# In WORK_DIR, we have panoptic segmentation results: 'panoptic' and 'results.panoptic.json'. +cd ${WORK_DIR} + +# replace '[algorithm_name]' with the name of algorithm you used. +mv ./panoptic ./panoptic_test-dev2017_[algorithm_name]_results +mv ./results.panoptic.json ./panoptic_test-dev2017_[algorithm_name]_results.json +zip panoptic_test-dev2017_[algorithm_name]_results.zip -ur panoptic_test-dev2017_[algorithm_name]_results panoptic_test-dev2017_[algorithm_name]_results.json +``` diff --git a/mmdetection/docs/en/user_guides/tracking_analysis_tools.md b/mmdetection/docs/en/user_guides/tracking_analysis_tools.md new file mode 100644 index 0000000..acced58 --- /dev/null +++ b/mmdetection/docs/en/user_guides/tracking_analysis_tools.md @@ -0,0 +1,86 @@ +**We provide lots of useful tools under the `tools/` directory.** + +## MOT Test-time Parameter Search + +`tools/analysis_tools/mot/mot_param_search.py` can search the parameters of the `tracker` in MOT models. +It is used as the same manner with `tools/test.py` but **different** in the configs. + +Here is an example that shows how to modify the configs: + +1. Define the desirable evaluation metrics to record. + + For example, you can define the `evaluator` as + + ```python + test_evaluator=dict(type='MOTChallengeMetrics', metric=['HOTA', 'CLEAR', 'Identity']) + ``` + + Of course, you can also customize the content of `metric` in `test_evaluator`. You are free to choose one or more of `['HOTA', 'CLEAR', 'Identity']`. + +2. Define the parameters and the values to search. + + Assume you have a tracker like + + ```python + model=dict( + tracker=dict( + type='BaseTracker', + obj_score_thr=0.5, + match_iou_thr=0.5 + ) + ) + ``` + + If you want to search the parameters of the tracker, just change the value to a list as follow + + ```python + model=dict( + tracker=dict( + type='BaseTracker', + obj_score_thr=[0.4, 0.5, 0.6], + match_iou_thr=[0.4, 0.5, 0.6, 0.7] + ) + ) + ``` + + Then the script will test the totally 12 cases and log the results. + +## MOT Error Visualize + +`tools/analysis_tools/mot/mot_error_visualize.py` can visualize errors for multiple object tracking. +This script needs the result of inference. By Default, the **red** bounding box denotes false positive, the **yellow** bounding box denotes the false negative and the **blue** bounding box denotes ID switch. + +``` +python tools/analysis_tools/mot/mot_error_visualize.py \ + ${CONFIG_FILE}\ + --input ${INPUT} \ + --result-dir ${RESULT_DIR} \ + [--output-dir ${OUTPUT}] \ + [--fps ${FPS}] \ + [--show] \ + [--backend ${BACKEND}] +``` + +The `RESULT_DIR` contains the inference results of all videos and the inference result is a `txt` file. + +Optional arguments: + +- `OUTPUT`: Output of the visualized demo. If not specified, the `--show` is obligate to show the video on the fly. +- `FPS`: FPS of the output video. +- `--show`: Whether show the video on the fly. +- `BACKEND`: The backend to visualize the boxes. Options are `cv2` and `plt`. + +## Browse dataset + +`tools/analysis_tools/mot/browse_dataset.py` can visualize the training dataset to check whether the dataset configuration is correct. + +**Examples:** + +```shell +python tools/analysis_tools/browse_dataset.py ${CONFIG_FILE} [--show-interval ${SHOW_INTERVAL}] +``` + +Optional arguments: + +- `SHOW_INTERVAL`: The interval of show (s). +- `--show`: Whether show the images on the fly. 
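As a footnote to the parameter search section above: the listed values expand into a Cartesian product, which is where the 12 test cases come from. A small sketch in plain Python (independent of MMDetection) that enumerates the same grid:

```python
# Enumerate the tracker parameter grid used in the example above (3 x 4 = 12 cases).
from itertools import product

obj_score_thrs = [0.4, 0.5, 0.6]
match_iou_thrs = [0.4, 0.5, 0.6, 0.7]

search_space = [
    dict(obj_score_thr=s, match_iou_thr=m)
    for s, m in product(obj_score_thrs, match_iou_thrs)
]
print(len(search_space))  # 12
```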
diff --git a/mmdetection/docs/en/user_guides/tracking_config.md b/mmdetection/docs/en/user_guides/tracking_config.md new file mode 100644 index 0000000..fa8aeea --- /dev/null +++ b/mmdetection/docs/en/user_guides/tracking_config.md @@ -0,0 +1,112 @@ +# Learn about Configs + +We use python files as our config system. You can find all the provided configs under $MMDetection/configs. + +We incorporate modular and inheritance design into our config system, +which is convenient to conduct various experiments. +If you wish to inspect the config file, +you may run `python tools/misc/print_config.py /PATH/TO/CONFIG` to see the complete config. + +## A brief description of a complete config + +A complete config usually contains the following primary fields: + +- `model`: the basic config of model, which may contain `data_preprocessor`, modules (e.g., `detector`, `motion`),`train_cfg`, `test_cfg`, etc. +- `train_dataloader`: the config of training dataloader, which usually contains `batch_size`, `num_workers`, `sampler`, `dataset`, etc. +- `val_dataloader`: the config of validation dataloader, which is similar with `train_dataloader`. +- `test_dataloader`: the config of testing dataloader, which is similar with `train_dataloader`. +- `val_evaluator`: the config of validation evaluator. For example,`type='MOTChallengeMetrics'` for MOT task on the MOTChallenge benchmarks. +- `test_evaluator`: the config of testing evaluator, which is similar with `val_evaluator`. +- `train_cfg`: the config of training loop. For example, `type='EpochBasedTrainLoop'`. +- `val_cfg`: the config of validation loop. For example, `type='VideoValLoop'`. +- `test_cfg`: the config of testing loop. For example, `type='VideoTestLoop'`. +- `default_hooks`: the config of default hooks, which may include hooks for timer, logger, param_scheduler, checkpoint, sampler_seed, visualization, etc. +- `vis_backends`: the config of visualization backends, which uses `type='LocalVisBackend'` as default. +- `visualizer`: the config of visualizer. `type='TrackLocalVisualizer'` for MOT tasks. +- `param_scheduler`: the config of parameter scheduler, which usually sets the learning rate scheduler. +- `optim_wrapper`: the config of optimizer wrapper, which contains optimization-related information, for example optimizer, gradient clipping, etc. +- `load_from`: load models as a pre-trained model from a given path. +- `resume`: If `True`, resume checkpoints from `load_from`, and the training will be resumed from the epoch when the checkpoint is saved. + +## Modify config through script arguments + +When submitting jobs using `tools/train.py` or `tools/test_tracking.py`, +you may specify `--cfg-options` to in-place modify the config. +We present several examples as follows. +For more details, please refer to [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/config.md). + +- **Update config keys of dict chains.** + + The config options can be specified following the order of the dict keys in the original config. + For example, `--cfg-options model.detector.backbone.norm_eval=False` changes the all BN modules in model backbones to train mode. + +- **Update keys inside a list of configs.** + + Some config dicts are composed as a list in your config. + For example, the testing pipeline `test_dataloader.dataset.pipeline` is normally a list e.g. `[dict(type='LoadImageFromFile'), ...]`. 
  If you want to change `LoadImageFromFile` to `LoadImageFromWebcam` in the pipeline,
  you may specify `--cfg-options test_dataloader.dataset.pipeline.0.type=LoadImageFromWebcam`.

- **Update values of lists/tuples.**

  Sometimes the value to be updated is a list or a tuple.
  For example, you can change the key `mean` of `data_preprocessor` by specifying `--cfg-options model.data_preprocessor.mean=[0,0,0]`.
  Note that **NO** white space is allowed inside the specified value.

## Config File Structure

There are 3 basic component types under `config/_base_`, i.e., dataset, model and default_runtime.
Many methods can be easily constructed with one of each, such as SORT and DeepSORT.
The configs that are composed of components from `_base_` are called *primitive*.

For all configs under the same folder, it is recommended to have only **one** *primitive* config.
All other configs should inherit from the *primitive* config.
In this way, the maximum inheritance level is 3.

For easy understanding, we recommend contributors to inherit from existing methods.
For example, if some modification is made based on Faster R-CNN,
users may first inherit the basic Faster R-CNN structure
by specifying `_base_ = ../_base_/models/faster-rcnn_r50-dc5.py`,
then modify the necessary fields in the config files.

If you are building an entirely new method that does not share the structure with any existing method,
you may create a folder `method_name` under `configs`.

Please refer to [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/config.md) for detailed documentation.

## Config Name Style

We follow the style below to name config files. Contributors are advised to follow the same style.

```shell
{method}_{module}_{train_cfg}_{train_data}_{test_data}
```

- `{method}`: method name, like `sort`.
- `{module}`: basic modules of the method, like `faster-rcnn_r50_fpn`.
- `{train_cfg}`: training config, which usually contains the batch size, number of epochs, etc., like `8xb4-80e`.
- `{train_data}`: training data, like `mot17halftrain`.
- `{test_data}`: testing data, like `test-mot17halfval`.

## FAQ

**Ignore some fields in the base configs**

Sometimes, you may set `_delete_=True` to ignore some of the fields in base configs.
You may refer to [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/config.md) for a simple illustration.

## Tracking Data Structure Introduction

### Advantages and new features

In MMDetection tracking tasks, we employ videos to organize the dataset and use
`TrackDataSample` to describe the dataset info.

- Based on the video organization, we provide the transform `UniformRefFrameSample` to sample key frames and reference frames, and use `TransformBroadcaster` for clip training.
- `TrackDataSample` can be viewed as a wrapper of multiple `DetDataSample` to some extent. It contains a property `video_data_samples`, which is a list of `DetDataSample`, each of which corresponds to a single frame. In addition, its metainfo includes `key_frames_inds` and `ref_frames_inds` to support the clip training scheme.
- Thanks to the video-based data organization, the entire video can be tested directly. This way is more concise and intuitive. We also provide an image_based test method in case your GPU memory cannot fit the entire video.

### TODO

- Some algorithms like StrongSORT and Mask2Former cannot support video_based testing, since they pose a challenge to GPU memory. We will optimize this in the future.
+- Now we do not support joint training of video_based dataset like MOT Challenge Dataset and image_based dataset like Crowdhuman for the algorithm QDTrack. we will optimize this problem in the future. diff --git a/mmdetection/docs/en/user_guides/tracking_dataset_prepare.md b/mmdetection/docs/en/user_guides/tracking_dataset_prepare.md new file mode 100644 index 0000000..2c38569 --- /dev/null +++ b/mmdetection/docs/en/user_guides/tracking_dataset_prepare.md @@ -0,0 +1,247 @@ +## Dataset Preparation + +This page provides the instructions for dataset preparation on existing benchmarks, include + +- Multiple Object Tracking + + - [MOT Challenge](https://motchallenge.net/) + - [CrowdHuman](https://www.crowdhuman.org/) + +- Video Instance Segmentation + + - [YouTube-VIS](https://youtube-vos.org/dataset/vis/) + +### 1. Download Datasets + +Please download the datasets from the official websites. It is recommended to symlink the root of the datasets to `$MMDETECTION/data`. + +#### 1.1 Multiple Object Tracking + +- For the training and testing of multi object tracking task, one of the MOT Challenge datasets (e.g. MOT17, MOT20) are needed, CrowdHuman can be served as comlementary dataset. + +- For users in China, the following datasets can be downloaded from [OpenDataLab](https://opendatalab.com/) with high speed: + + - [MOT17](https://opendatalab.com/MOT17/download) + - [MOT20](https://opendatalab.com/MOT20/download) + - [CrowdHuman](https://opendatalab.com/CrowdHuman/download) + +#### 1.2 Video Instance Segmentation + +- For the training and testing of video instance segmetatioon task, only one of YouTube-VIS datasets (e.g. YouTube-VIS 2019, YouTube-VIS 2021) is needed. + +- YouTube-VIS 2019 dataset can be download from [YouTubeVOS](https://codalab.lisn.upsaclay.fr/competitions/6064) + +- YouTube-VIS 2021 dataset can be download from [YouTubeVOS](https://codalab.lisn.upsaclay.fr/competitions/7680) + +#### 1.3 Data Structure + +If your folder structure is different from the following, you may need to change the corresponding paths in config files. + +``` +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── annotations +│ │ +| ├── MOT15/MOT16/MOT17/MOT20 +| | ├── train +| | | ├── MOT17-02-DPM +| | | | ├── det +| │ │ │ ├── gt +| │ │ │ ├── img1 +| │ │ │ ├── seqinfo.ini +│ │ │ ├── ...... +| | ├── test +| | | ├── MOT17-01-DPM +| | | | ├── det +| │ │ │ ├── img1 +| │ │ │ ├── seqinfo.ini +│ │ │ ├── ...... +│ │ +│ ├── crowdhuman +│ │ ├── annotation_train.odgt +│ │ ├── annotation_val.odgt +│ │ ├── train +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_train01.zip +│ │ │ ├── CrowdHuman_train02.zip +│ │ │ ├── CrowdHuman_train03.zip +│ │ ├── val +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_val.zip +│ │ +``` + +### 2. Convert Annotations + +In this case, you need to convert the official annotations to coco style. 
We provide scripts and the usages are as following: + +```shell +# MOT17 +# The processing of other MOT Challenge dataset is the same as MOT17 +python ./tools/dataset_converters/mot2coco.py -i ./data/MOT17/ -o ./data/MOT17/annotations --split-train --convert-det +python ./tools/dataset_converters/mot2reid.py -i ./data/MOT17/ -o ./data/MOT17/reid --val-split 0.2 --vis-threshold 0.3 + +# CrowdHuman +python ./tools/dataset_converters/crowdhuman2coco.py -i ./data/crowdhuman -o ./data/crowdhuman/annotations + +# YouTube-VIS 2019 +python ./tools/dataset_converters/youtubevis/youtubevis2coco.py -i ./data/youtube_vis_2019 -o ./data/youtube_vis_2019/annotations --version 2019 + +# YouTube-VIS 2021 +python ./tools/dataset_converters/youtubevis/youtubevis2coco.py -i ./data/youtube_vis_2021 -o ./data/youtube_vis_2021/annotations --version 2021 + +``` + +The folder structure will be as following after your run these scripts: + +``` +mmdetection +├── mmtrack +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── annotations +│ │ +| ├── MOT15/MOT16/MOT17/MOT20 +| | ├── train +| | | ├── MOT17-02-DPM +| | | | ├── det +| │ │ │ ├── gt +| │ │ │ ├── img1 +| │ │ │ ├── seqinfo.ini +│ │ │ ├── ...... +| | ├── test +| | | ├── MOT17-01-DPM +| | | | ├── det +| │ │ │ ├── img1 +| │ │ │ ├── seqinfo.ini +│ │ │ ├── ...... +| | ├── annotations +| | ├── reid +│ │ │ ├── imgs +│ │ │ ├── meta +│ │ +│ ├── crowdhuman +│ │ ├── annotation_train.odgt +│ │ ├── annotation_val.odgt +│ │ ├── train +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_train01.zip +│ │ │ ├── CrowdHuman_train02.zip +│ │ │ ├── CrowdHuman_train03.zip +│ │ ├── val +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_val.zip +│ │ ├── annotations +│ │ │ ├── crowdhuman_train.json +│ │ │ ├── crowdhuman_val.json +│ │ +│ ├── youtube_vis_2019 +│ │ │── train +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── valid +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── test +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── train.json (the official annotation files) +│ │ │── valid.json (the official annotation files) +│ │ │── test.json (the official annotation files) +│ │ │── annotations (the converted annotation file) +│ │ +│ ├── youtube_vis_2021 +│ │ │── train +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── valid +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── test +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── annotations (the converted annotation file) +``` + +#### The folder of annotations and reid in MOT15/MOT16/MOT17/MOT20 + +We take MOT17 dataset as examples, the other datasets share similar structure. + +There are 8 JSON files in `data/MOT17/annotations`: + +`train_cocoformat.json`: JSON file containing the annotations information of the training set in MOT17 dataset. + +`train_detections.pkl`: Pickle file containing the public detections of the training set in MOT17 dataset. + +`test_cocoformat.json`: JSON file containing the annotations information of the testing set in MOT17 dataset. + +`test_detections.pkl`: Pickle file containing the public detections of the testing set in MOT17 dataset. + +`half-train_cocoformat.json`, `half-train_detections.pkl`, `half-val_cocoformat.json`and `half-val_detections.pkl` share similar meaning with `train_cocoformat.json` and `train_detections.pkl`. The `half` means we split each video in the training set into half. 
The first half videos are denoted as `half-train` set, and the second half videos are denoted as`half-val` set. + +The structure of `data/MOT17/reid` is as follows: + +``` +reid +├── imgs +│ ├── MOT17-02-FRCNN_000002 +│ │ ├── 000000.jpg +│ │ ├── 000001.jpg +│ │ ├── ... +│ ├── MOT17-02-FRCNN_000003 +│ │ ├── 000000.jpg +│ │ ├── 000001.jpg +│ │ ├── ... +├── meta +│ ├── train_80.txt +│ ├── val_20.txt +``` + +The `80` in `train_80.txt` means the proportion of the training dataset to the whole ReID dataset is 80%. While the proportion of the validation dataset is 20%. + +For training, we provide a annotation list `train_80.txt`. Each line of the list contains a filename and its corresponding ground-truth labels. The format is as follows: + +``` +MOT17-05-FRCNN_000110/000018.jpg 0 +MOT17-13-FRCNN_000146/000014.jpg 1 +MOT17-05-FRCNN_000088/000004.jpg 2 +MOT17-02-FRCNN_000009/000081.jpg 3 +``` + +`MOT17-05-FRCNN_000110` denotes the 110-th person in `MOT17-05-FRCNN` video. + +For validation, The annotation list `val_20.txt` remains the same as format above. + +Images in `reid/imgs` are cropped from raw images in `MOT17/train` by the corresponding `gt.txt`. The value of ground-truth labels should fall in range `[0, num_classes - 1]`. + +#### The folder of annotations in crowdhuman + +There are 2 JSON files in `data/crowdhuman/annotations`: + +`crowdhuman_train.json`: JSON file containing the annotations information of the training set in CrowdHuman dataset. +`crowdhuman_val.json`: JSON file containing the annotations information of the validation set in CrowdHuman dataset. + +#### The folder of annotations in youtube_vis_2019/youtube_vis2021 + +There are 3 JSON files in `data/youtube_vis_2019/annotations` or `data/youtube_vis_2021/annotations`: + +`youtube_vis_2019_train.json`/`youtube_vis_2021_train.json`: JSON file containing the annotations information of the training set in youtube_vis_2019/youtube_vis2021 dataset. + +`youtube_vis_2019_valid.json`/`youtube_vis_2021_valid.json`: JSON file containing the annotations information of the validation set in youtube_vis_2019/youtube_vis2021 dataset. + +`youtube_vis_2019_test.json`/`youtube_vis_2021_test.json`: JSON file containing the annotations information of the testing set in youtube_vis_2019/youtube_vis2021 dataset. diff --git a/mmdetection/docs/en/user_guides/tracking_inference.md b/mmdetection/docs/en/user_guides/tracking_inference.md new file mode 100644 index 0000000..06a6912 --- /dev/null +++ b/mmdetection/docs/en/user_guides/tracking_inference.md @@ -0,0 +1,55 @@ +# Inference + +We provide demo scripts to inference a given video or a folder that contains continuous images. The source codes are available [here](https://github.com/open-mmlab/mmdetection/tree/tracking/demo). + +Note that if you use a folder as the input, the image names there must be **sortable** , which means we can re-order the images according to the numbers contained in the filenames. We now only support reading the images whose filenames end with `.jpg`, `.jpeg` and `.png`. + +## Inference MOT models + +This script can inference an input video / images with a multiple object tracking or video instance segmentation model. + +```shell +python demo/mot_demo.py \ + ${INPUTS} + ${CONFIG_FILE} \ + [--checkpoint ${CHECKPOINT_FILE}] \ + [--detector ${DETECTOR_FILE}] \ + [--reid ${REID_FILE}] \ + [--score-thr ${SCORE_THR}] \ + [--device ${DEVICE}] \ + [--out ${OUTPUT}] \ + [--show] +``` + +The `INPUT` and `OUTPUT` support both _mp4 video_ format and the _folder_ format. 
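If your frames come from another tool with unpadded names such as `1.jpg`, `2.jpg`, ..., `10.jpg`, lexicographic sorting will put them out of order. A small helper sketch (not part of MMDetection; the padding width and folder name are arbitrary) that zero-pads such filenames before running the demo:

```python
# Zero-pad purely numeric image filenames so the folder sorts in frame order.
import os


def pad_frame_names(folder: str, width: int = 6) -> None:
    for name in sorted(os.listdir(folder)):
        stem, ext = os.path.splitext(name)
        if ext.lower() in {'.jpg', '.jpeg', '.png'} and stem.isdigit():
            os.rename(os.path.join(folder, name),
                      os.path.join(folder, f'{int(stem):0{width}d}{ext}'))


pad_frame_names('demo/my_frames')  # hypothetical folder of extracted frames
```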
+ +**Important:** For `DeepSORT`, `SORT`, `StrongSORT`, they need load the weight of the `reid` and the weight of the `detector` separately. Therefore, we use `--detector` and `--reid` to load weights. Other algorithms such as `ByteTrack`, `OCSORT` `QDTrack` `MaskTrackRCNN` and `Mask2Former` use `--checkpoint` to load weights. + +Optional arguments: + +- `CHECKPOINT_FILE`: The checkpoint is optional. +- `DETECTOR_FILE`: The detector is optional. +- `REID_FILE`: The reid is optional. +- `SCORE_THR`: The threshold of score to filter bboxes. +- `DEVICE`: The device for inference. Options are `cpu` or `cuda:0`, etc. +- `OUTPUT`: Output of the visualized demo. If not specified, the `--show` is obligate to show the video on the fly. +- `--show`: Whether show the video on the fly. + +**Examples of running mot model:** + +```shell +# Example 1: do not specify --checkpoint to use --detector +python demo/mot_demo.py \ + demo/demo_mot.mp4 \ + configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ + --detector \ + https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth \ + --out mot.mp4 + +# Example 2: use --checkpoint +python demo/mot_demo.py \ + demo/demo_mot.mp4 \ + configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ + --checkpoint https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth \ + --out mot.mp4 +``` diff --git a/mmdetection/docs/en/user_guides/tracking_train_test.md b/mmdetection/docs/en/user_guides/tracking_train_test.md new file mode 100644 index 0000000..1a6871d --- /dev/null +++ b/mmdetection/docs/en/user_guides/tracking_train_test.md @@ -0,0 +1,229 @@ +# Learn to train and test + +## Train + +This section will show how to train existing models on supported datasets. +The following training environments are supported: + +- CPU +- single GPU +- single node multiple GPUs +- multiple nodes + +You can also manage jobs with Slurm. + +Important: + +- You can change the evaluation interval during training by modifying the `train_cfg` as + `train_cfg = dict(val_interval=10)`. That means evaluating the model every 10 epochs. +- The default learning rate in all config files is for 8 GPUs. + According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), + you need to set the learning rate proportional to the batch size if you use different GPUs or images per GPU, + e.g., `lr=0.01` for 8 GPUs * 1 img/gpu and lr=0.04 for 16 GPUs * 2 imgs/gpu. +- During training, log files and checkpoints will be saved to the working directory, + which is specified by CLI argument `--work-dir`. It uses `./work_dirs/CONFIG_NAME` as default. +- If you want the mixed precision training, simply specify CLI argument `--amp`. + +#### 1. Train on CPU + +The model is default put on cuda device. +Only if there are no cuda devices, the model will be put on cpu. +So if you want to train the model on CPU, you need to `export CUDA_VISIBLE_DEVICES=-1` to disable GPU visibility first. +More details in [MMEngine](https://github.com/open-mmlab/mmengine/blob/ca282aee9e402104b644494ca491f73d93a9544f/mmengine/runner/runner.py#L849-L850). 
+ +```shell script +CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +An example of training the MOT model QDTrack on CPU: + +```shell script +CUDA_VISIBLE_DEVICES=-1 python tools/train.py configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +``` + +#### 2. Train on single GPU + +If you want to train the model on single GPU, you can directly use the `tools/train.py` as follows. + +```shell script +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +You can use `export CUDA_VISIBLE_DEVICES=$GPU_ID` to select the GPU. + +An example of training the MOT model QDTrack on single GPU: + +```shell script +CUDA_VISIBLE_DEVICES=2 python tools/train.py configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +``` + +#### 3. Train on single node multiple GPUs + +We provide `tools/dist_train.sh` to launch training on multiple GPUs. +The basic usage is as follows. + +```shell script +bash ./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] +``` + +If you would like to launch multiple jobs on a single machine, +e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs, +you need to specify different ports (29500 by default) for each job to avoid communication conflict. + +For example, you can set the port in commands as follows. + +```shell script +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4 +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4 +``` + +An example of training the MOT model QDTrack on single node multiple GPUs: + +```shell script +bash ./tools/dist_train.sh configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 +``` + +#### 4. Train on multiple nodes + +If you launch with multiple machines simply connected with ethernet, you can simply run following commands: + +On the first machine: + +```shell script +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +On the second machine: + +```shell script +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +Usually it is slow if you do not have high speed networking like InfiniBand. + +#### 5. Train with Slurm + +[Slurm](https://slurm.schedmd.com/) is a good job scheduling system for computing clusters. +On a cluster managed by Slurm, you can use `slurm_train.sh` to spawn training jobs. +It supports both single-node and multi-node training. + +The basic usage is as follows. + +```shell script +bash ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} ${GPUS} +``` + +An example of training the MOT model QDTrack with Slurm: + +```shell script +PORT=29501 \ +GPUS_PER_NODE=8 \ +SRUN_ARGS="--quotatype=reserved" \ +bash ./tools/slurm_train.sh \ +mypartition \ +mottrack +configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +./work_dirs/QDTrack \ +8 +``` + +## Test + +This section will show how to test existing models on supported datasets. +The following testing environments are supported: + +- CPU +- single GPU +- single node multiple GPUs +- multiple nodes + +You can also manage jobs with Slurm. + +Important: + +- In MOT, some algorithms like `DeepSORT`, `SORT`, `StrongSORT` need load the weight of the `reid` and the weight of the `detector` separately. + Other algorithms such as `ByteTrack`, `OCSORT` and `QDTrack` don't need. 
So we provide `--checkpoint`, `--detector` and `--reid` to load weights. +- We provide two ways to evaluate and test models, video_basede test and image_based test. some algorithms like `StrongSORT`, `Mask2former` only support + video_based test. if your GPU memory can't fit the entire video, you can switch test way by set sampler type. + For example: + video_based test: `sampler=dict(type='DefaultSampler', shuffle=False, round_up=False)` + image_based test: `sampler=dict(type='TrackImgSampler')` +- You can set the results saving path by modifying the key `outfile_prefix` in evaluator. + For example, `val_evaluator = dict(outfile_prefix='results/sort_mot17')`. + Otherwise, a temporal file will be created and will be removed after evaluation. +- If you just want the formatted results without evaluation, you can set `format_only=True`. + For example, `test_evaluator = dict(type='MOTChallengeMetric', metric=['HOTA', 'CLEAR', 'Identity'], outfile_prefix='sort_mot17_results', format_only=True)` + +#### 1. Test on CPU + +The model is default put on cuda device. +Only if there are no cuda devices, the model will be put on cpu. +So if you want to test the model on CPU, you need to `export CUDA_VISIBLE_DEVICES=-1` to disable GPU visibility first. +More details in [MMEngine](https://github.com/open-mmlab/mmengine/blob/ca282aee9e402104b644494ca491f73d93a9544f/mmengine/runner/runner.py#L849-L850). + +```shell script +CUDA_VISIBLE_DEVICES=-1 python tools/test_tracking.py ${CONFIG_FILE} [optional arguments] +``` + +An example of testing the MOT model SORT on CPU: + +```shell script +CUDA_VISIBLE_DEVICES=-1 python tools/test_tracking.py configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py --detector ${CHECKPOINT_FILE} +``` + +#### 2. Test on single GPU + +If you want to test the model on single GPU, you can directly use the `tools/test_tracking.py` as follows. + +```shell script +python tools/test_tracking.py ${CONFIG_FILE} [optional arguments] +``` + +You can use `export CUDA_VISIBLE_DEVICES=$GPU_ID` to select the GPU. + +An example of testing the MOT model QDTrack on single GPU: + +```shell script +CUDA_VISIBLE_DEVICES=2 python tools/test_tracking.py configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py --detector ${CHECKPOINT_FILE} +``` + +#### 3. Test on single node multiple GPUs + +We provide `tools/dist_test_tracking.sh` to launch testing on multiple GPUs. +The basic usage is as follows. + +```shell script +bash ./tools/dist_test_tracking.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] +``` + +An example of testing the MOT model DeepSort on single node multiple GPUs: + +```shell script +bash ./tools/dist_test_tracking.sh configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 --detector ${CHECKPOINT_FILE} --reid ${CHECKPOINT_FILE} +``` + +#### 4. Test on multiple nodes + +You can test on multiple nodes, which is similar with "Train on multiple nodes". + +#### 5. Test with Slurm + +On a cluster managed by Slurm, you can use `slurm_test_tracking.sh` to spawn testing jobs. +It supports both single-node and multi-node testing. + +The basic usage is as follows. 
+ +```shell script +[GPUS=${GPUS}] bash tools/slurm_test_tracking.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} [optional arguments] +``` + +An example of testing the VIS model Mask2former with Slurm: + +```shell script +GPUS=8 +bash tools/slurm_test_tracking.sh \ +mypartition \ +vis \ +configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py \ +--checkpoint ${CHECKPOINT_FILE} +``` diff --git a/mmdetection/docs/en/user_guides/tracking_visualization.md b/mmdetection/docs/en/user_guides/tracking_visualization.md new file mode 100644 index 0000000..2895325 --- /dev/null +++ b/mmdetection/docs/en/user_guides/tracking_visualization.md @@ -0,0 +1,47 @@ +# Learn about Visualization + +## Local Visualization + +This section will present how to visualize the detection/tracking results with local visualizer. + +If you want to draw prediction results, you can turn this feature on by setting `draw=True` in `TrackVisualizationHook` as follows. + +```shell script +default_hooks = dict(visualization=dict(type='TrackVisualizationHook', draw=True)) +``` + +Specifically, the `TrackVisualizationHook` has the following arguments: + +- `draw`: whether to draw prediction results. If it is False, it means that no drawing will be done. Defaults to False. +- `interval`: The interval of visualization. Defaults to 30. +- `score_thr`: The threshold to visualize the bboxes and masks. Defaults to 0.3. +- `show`: Whether to display the drawn image. Default to False. +- `wait_time`: The interval of show (s). Defaults to 0. +- `test_out_dir`: directory where painted images will be saved in testing process. +- `backend_args`: Arguments to instantiate a file client. Defaults to `None`. + +In the `TrackVisualizationHook`, `TrackLocalVisualizer` will be called to implement visualization for MOT and VIS tasks. +We will present the details below. +You can refer to MMEngine for more details about [Visualization](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/visualization.md) and [Hook](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/hook.md). + +#### Tracking Visualization + +We realize the tracking visualization with class `TrackLocalVisualizer`. +You can call it as follows. + +```python +visualizer = dict(type='TrackLocalVisualizer') +``` + +It has the following arguments: + +- `name`: Name of the instance. Defaults to 'visualizer'. +- `image`: The origin image to draw. The format should be RGB. Defaults to None. +- `vis_backends`: Visual backend config list. Defaults to None. +- `save_dir`: Save file dir for all storage backends. If it is None, the backend storage will not save any data. +- `line_width`: The linewidth of lines. Defaults to 3. +- `alpha`: The transparency of bboxes or mask. Defaults to 0.8. + +Here is a visualization example of DeepSORT: + +![test_img_89](https://user-images.githubusercontent.com/99722489/186062929-6d0e4663-0d8e-4045-9ec8-67e0e41da876.png) diff --git a/mmdetection/docs/en/user_guides/train.md b/mmdetection/docs/en/user_guides/train.md new file mode 100644 index 0000000..a68d5e4 --- /dev/null +++ b/mmdetection/docs/en/user_guides/train.md @@ -0,0 +1,456 @@ +# Train predefined models on standard datasets + +MMDetection also provides out-of-the-box tools for training detection models. +This section will show how to train _predefined_ models (under [configs](../../../configs)) on standard datasets i.e. COCO. + +## Prepare datasets + +Preparing datasets is also necessary for training. 
See section [Prepare datasets](#prepare-datasets) above for details. + +**Note**: +Currently, the config files under `configs/cityscapes` use COCO pre-trained weights to initialize. +If your network connection is slow or unavailable, it's advisable to download existing models before beginning training to avoid errors. + +## Learning rate auto scaling + +**Important**: The default learning rate in config files is for 8 GPUs and 2 sample per GPU (batch size = 8 * 2 = 16). And it had been set to `auto_scale_lr.base_batch_size` in `config/_base_/schedules/schedule_1x.py`. The learning rate will be automatically scaled based on the value at a batch size of 16. Meanwhile, to avoid affecting other codebases that use mmdet, the default setting for the `auto_scale_lr.enable` flag is `False`. + +If you want to enable this feature, you need to add argument `--auto-scale-lr`. And you need to check the config name which you want to use before you process the command, because the config name indicates the default batch size. +By default, it is `8 x 2 = 16 batch size`, like `faster_rcnn_r50_caffe_fpn_90k_coco.py` or `pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py`. In other cases, you will see the config file name have `_NxM_` in dictating, like `cornernet_hourglass104_mstest_32x3_210e_coco.py` which batch size is `32 x 3 = 96`, or `scnet_x101_64x4d_fpn_8x1_20e_coco.py` which batch size is `8 x 1 = 8`. + +**Please remember to check the bottom of the specific config file you want to use, it will have `auto_scale_lr.base_batch_size` if the batch size is not `16`. If you can't find those values, check the config file which in `_base_=[xxx]` and you will find it. Please do not modify its values if you want to automatically scale the LR.** + +The basic usage of learning rate auto scaling is as follows. + +```shell +python tools/train.py \ + ${CONFIG_FILE} \ + --auto-scale-lr \ + [optional arguments] +``` + +If you enabled this feature, the learning rate will be automatically scaled according to the number of GPUs on the machine and the batch size of training. See [linear scaling rule](https://arxiv.org/abs/1706.02677) for details. For example, If there are 4 GPUs and 2 pictures on each GPU, `lr = 0.01`, then if there are 16 GPUs and 4 pictures on each GPU, it will automatically scale to `lr = 0.08`. + +If you don't want to use it, you need to calculate the learning rate according to the [linear scaling rule](https://arxiv.org/abs/1706.02677) manually then change `optimizer.lr` in specific config file. + +## Training on a single GPU + +We provide `tools/train.py` to launch training jobs on a single GPU. +The basic usage is as follows. + +```shell +python tools/train.py \ + ${CONFIG_FILE} \ + [optional arguments] +``` + +During training, log files and checkpoints will be saved to the working directory, which is specified by `work_dir` in the config file or via CLI argument `--work-dir`. + +By default, the model is evaluated on the validation set every epoch, the evaluation interval can be specified in the config file as shown below. + +```python +# evaluate the model every 12 epochs. +train_cfg = dict(val_interval=12) +``` + +This tool accepts several optional arguments, including: + +- `--work-dir ${WORK_DIR}`: Override the working directory. +- `--resume`: resume from the latest checkpoint in the work_dir automatically. +- `--resume ${CHECKPOINT_FILE}`: resume from the specific checkpoint. +- `--cfg-options 'Key=value'`: Overrides other settings in the used config. 
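For instance, a plausible invocation that combines these options (the work directory name is arbitrary):

```shell
# Resume from the latest checkpoint in the work directory and evaluate every 6 epochs.
python tools/train.py \
    configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \
    --work-dir work_dirs/faster-rcnn_r50_fpn_1x_coco \
    --resume \
    --cfg-options train_cfg.val_interval=6
```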
+ +**Note:** + +There is a difference between `resume` and `load-from`: + +`resume` loads both the weights of the model and the state of the optimizer, and it inherits the iteration number from the specified checkpoint, so training does not start again from scratch. `load-from`, on the other hand, only loads the weights of the model, and its training starts from scratch. It is often used for fine-tuning a model. `load-from` needs to be written in the config file, while `resume` is passed as a command line argument. + +## Training on CPU + +The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process. + +```shell +export CUDA_VISIBLE_DEVICES=-1 +``` + +And then run the script [above](#training-on-a-single-GPU). + +**Note**: + +We do not recommend users to use the CPU for training because it is too slow. We support this feature to allow users to debug on machines without GPU for convenience. + +## Training on multiple GPUs + +We provide `tools/dist_train.sh` to launch training on multiple GPUs. +The basic usage is as follows. + +```shell +bash ./tools/dist_train.sh \ + ${CONFIG_FILE} \ + ${GPU_NUM} \ + [optional arguments] +``` + +Optional arguments remain the same as stated [above](#training-on-a-single-GPU). + +### Launch multiple jobs simultaneously + +If you would like to launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs, +you need to specify different ports (29500 by default) for each job to avoid communication conflict. + +If you use `dist_train.sh` to launch training jobs, you can set the port in the commands. + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4 +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4 +``` + +## Train with multiple machines + +If you launch with multiple machines simply connected with ethernet, you can simply run the following commands: + +On the first machine: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS +``` + +On the second machine: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS +``` + +Usually, it is slow if you do not have high-speed networking like InfiniBand. + +## Manage jobs with Slurm + +[Slurm](https://slurm.schedmd.com/) is a good job scheduling system for computing clusters. +On a cluster managed by Slurm, you can use `slurm_train.sh` to spawn training jobs. It supports both single-node and multi-node training. + +The basic usage is as follows. + +```shell +[GPUS=${GPUS}] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} +``` + +Below is an example of using 16 GPUs to train Mask R-CNN on a Slurm partition named _dev_, and set the work-dir to some shared file systems. + +```shell +GPUS=16 ./tools/slurm_train.sh dev mask_r50_1x configs/mask-rcnn_r50_fpn_1x_coco.py /nfs/xxxx/mask_rcnn_r50_fpn_1x +``` + +You can check [the source code](../../../tools/slurm_train.sh) to review full arguments and environment variables. + +When using Slurm, the port option needs to be set in one of the following ways: + +1. Set the port through `--options`. This is more recommended since it does not change the original configs. 
+ + ```shell + CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} --cfg-options 'dist_params.port=29500' + CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} --cfg-options 'dist_params.port=29501' + ``` + +2. Modify the config files to set different communication ports. + + In `config1.py`, set + + ```python + dist_params = dict(backend='nccl', port=29500) + ``` + + In `config2.py`, set + + ```python + dist_params = dict(backend='nccl', port=29501) + ``` + + Then you can launch two jobs with `config1.py` and `config2.py`. + + ```shell + CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} + CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} + ``` + +# Train with customized datasets + +In this part, you will know how to train predefined models with customized datasets and then test it. We use the [balloon dataset](https://github.com/matterport/Mask_RCNN/tree/master/samples/balloon) as an example to describe the whole process. + +The basic steps are as below: + +1. Prepare the customized dataset +2. Prepare a config +3. Train, test, and infer models on the customized dataset. + +## Prepare the customized dataset + +There are three ways to support a new dataset in MMDetection: + +1. Reorganize the dataset into COCO format. +2. Reorganize the dataset into a middle format. +3. Implement a new dataset. + +Usually, we recommend using the first two methods which are usually easier than the third. + +In this note, we give an example of converting the data into COCO format. + +**Note**: Datasets and metrics have been decoupled except CityScapes since MMDetection 3.0. Therefore, users can use any kind of evaluation metrics for any format of datasets during validation. For example: evaluate on COCO dataset with VOC metric, or evaluate on OpenImages dataset with both VOC and COCO metrics. + +### COCO annotation format + +The necessary keys of COCO format for instance segmentation are as below, for the complete details, please refer [here](https://cocodataset.org/#format-data). + +```json +{ + "images": [image], + "annotations": [annotation], + "categories": [category] +} + +image = { + "id": int, + "width": int, + "height": int, + "file_name": str, +} + +annotation = { + "id": int, + "image_id": int, + "category_id": int, + "segmentation": RLE or [polygon], + "area": float, + "bbox": [x,y,width,height], # (x, y) are the coordinates of the upper left corner of the bbox + "iscrowd": 0 or 1, +} + +categories = [{ + "id": int, + "name": str, + "supercategory": str, +}] +``` + +Assume we use the balloon dataset. +After downloading the data, we need to implement a function to convert the annotation format into the COCO format. Then we can use implemented `CocoDataset` to load the data and perform training and evaluation. 
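For reference, one way to fetch the balloon data is shown below; the zip archive is hosted on the Mask_RCNN releases page linked above, so verify the URL before relying on it. The paths assume the archive unpacks to a top-level `balloon/` directory containing `train/` and `val/` splits with `via_region_data.json` annotation files.

```shell
# Download and unpack the balloon dataset under data/ (URL assumed from the
# Mask_RCNN releases page; adjust if it has moved).
wget https://github.com/matterport/Mask_RCNN/releases/download/v2.1/balloon_dataset.zip
unzip balloon_dataset.zip -d data/
```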
+ +If you take a look at the dataset, you will find the dataset format is as below: + +```json +{'base64_img_data': '', + 'file_attributes': {}, + 'filename': '34020010494_e5cb88e1c4_k.jpg', + 'fileref': '', + 'regions': {'0': {'region_attributes': {}, + 'shape_attributes': {'all_points_x': [1020, + 1000, + 994, + 1003, + 1023, + 1050, + 1089, + 1134, + 1190, + 1265, + 1321, + 1361, + 1403, + 1428, + 1442, + 1445, + 1441, + 1427, + 1400, + 1361, + 1316, + 1269, + 1228, + 1198, + 1207, + 1210, + 1190, + 1177, + 1172, + 1174, + 1170, + 1153, + 1127, + 1104, + 1061, + 1032, + 1020], + 'all_points_y': [963, + 899, + 841, + 787, + 738, + 700, + 663, + 638, + 621, + 619, + 643, + 672, + 720, + 765, + 800, + 860, + 896, + 942, + 990, + 1035, + 1079, + 1112, + 1129, + 1134, + 1144, + 1153, + 1166, + 1166, + 1150, + 1136, + 1129, + 1122, + 1112, + 1084, + 1037, + 989, + 963], + 'name': 'polygon'}}}, + 'size': 1115004} +``` + +The annotation is a JSON file where each key indicates an image's all annotations. +The code to convert the balloon dataset into coco format is as below. + +```python +import os.path as osp + +import mmcv + +from mmengine.fileio import dump, load +from mmengine.utils import track_iter_progress + + +def convert_balloon_to_coco(ann_file, out_file, image_prefix): + data_infos = load(ann_file) + + annotations = [] + images = [] + obj_count = 0 + for idx, v in enumerate(track_iter_progress(data_infos.values())): + filename = v['filename'] + img_path = osp.join(image_prefix, filename) + height, width = mmcv.imread(img_path).shape[:2] + + images.append( + dict(id=idx, file_name=filename, height=height, width=width)) + + for _, obj in v['regions'].items(): + assert not obj['region_attributes'] + obj = obj['shape_attributes'] + px = obj['all_points_x'] + py = obj['all_points_y'] + poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)] + poly = [p for x in poly for p in x] + + x_min, y_min, x_max, y_max = (min(px), min(py), max(px), max(py)) + + data_anno = dict( + image_id=idx, + id=obj_count, + category_id=0, + bbox=[x_min, y_min, x_max - x_min, y_max - y_min], + area=(x_max - x_min) * (y_max - y_min), + segmentation=[poly], + iscrowd=0) + annotations.append(data_anno) + obj_count += 1 + + coco_format_json = dict( + images=images, + annotations=annotations, + categories=[{ + 'id': 0, + 'name': 'balloon' + }]) + dump(coco_format_json, out_file) + + +if __name__ == '__main__': + convert_balloon_to_coco(ann_file='data/balloon/train/via_region_data.json', + out_file='data/balloon/train/annotation_coco.json', + image_prefix='data/balloon/train') + convert_balloon_to_coco(ann_file='data/balloon/val/via_region_data.json', + out_file='data/balloon/val/annotation_coco.json', + image_prefix='data/balloon/val') + +``` + +Using the function above, users can successfully convert the annotation file into json format, then we can use `CocoDataset` to train and evaluate the model with `CocoMetric`. + +## Prepare a config + +The second step is to prepare a config thus the dataset could be successfully loaded. Assume that we want to use Mask R-CNN with FPN, the config to train the detector on balloon dataset is as below. Assume the config is under directory `configs/balloon/` and named as `mask-rcnn_r50-caffe_fpn_ms-poly-1x_balloon.py`, the config is as below. Please refer [Learn about Configs - MMDetection 3.0.0 documentation](https://mmdetection.readthedocs.io/en/latest/user_guides/config.html) to get detailed information about config files. 
+ +```python +# The new config inherits a base config to highlight the necessary modification +_base_ = '../mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py' + +# We also need to change the num_classes in head to match the dataset's annotation +model = dict( + roi_head=dict( + bbox_head=dict(num_classes=1), mask_head=dict(num_classes=1))) + +# Modify dataset related settings +data_root = 'data/balloon/' +metainfo = { + 'classes': ('balloon', ), + 'palette': [ + (220, 20, 60), + ] +} +train_dataloader = dict( + batch_size=1, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='train/annotation_coco.json', + data_prefix=dict(img='train/'))) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='val/annotation_coco.json', + data_prefix=dict(img='val/'))) +test_dataloader = val_dataloader + +# Modify metric related settings +val_evaluator = dict(ann_file=data_root + 'val/annotation_coco.json') +test_evaluator = val_evaluator + +# We can use the pre-trained Mask RCNN model to obtain higher performance +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' + +``` + +## Train a new model + +To train a model with the new config, you can simply run + +```shell +python tools/train.py configs/balloon/mask-rcnn_r50-caffe_fpn_ms-poly-1x_balloon.py +``` + +For more detailed usages, please refer to the [training guide](https://mmdetection.readthedocs.io/en/latest/user_guides/train.html#train-predefined-models-on-standard-datasets). + +## Test and inference + +To test the trained model, you can simply run + +```shell +python tools/test.py configs/balloon/mask-rcnn_r50-caffe_fpn_ms-poly-1x_balloon.py work_dirs/mask-rcnn_r50-caffe_fpn_ms-poly-1x_balloon/epoch_12.pth +``` + +For more detailed usages, please refer to the [testing guide](https://mmdetection.readthedocs.io/en/latest/user_guides/test.html). diff --git a/mmdetection/docs/en/user_guides/useful_hooks.md b/mmdetection/docs/en/user_guides/useful_hooks.md new file mode 100644 index 0000000..4c30686 --- /dev/null +++ b/mmdetection/docs/en/user_guides/useful_hooks.md @@ -0,0 +1,105 @@ +# Useful Hooks + +MMDetection and MMEngine provide users with various useful hooks including log hooks, `NumClassCheckHook`, etc. This tutorial introduces the functionalities and usages of hooks implemented in MMDetection. For using hooks in MMEngine, please read the [API documentation in MMEngine](https://github.com/open-mmlab/mmengine/tree/main/docs/en/tutorials/hook.md). + +## CheckInvalidLossHook + +## NumClassCheckHook + +## MemoryProfilerHook + +[Memory profiler hook](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/engine/hooks/memory_profiler_hook.py) records memory information including virtual memory, swap memory, and the memory of the current process. This hook helps grasp the memory usage of the system and discover potential memory leak bugs. To use this hook, users should install `memory_profiler` and `psutil` by `pip install memory_profiler psutil` first. + +### Usage + +To use this hook, users should add the following code to the config file. + +```python +custom_hooks = [ + dict(type='MemoryProfilerHook', interval=50) +] +``` + +### Result + +During training, you can see the messages in the log recorded by `MemoryProfilerHook` as below. 
+ +```text +The system has 250 GB (246360 MB + 9407 MB) of memory and 8 GB (5740 MB + 2452 MB) of swap memory in total. Currently 9407 MB (4.4%) of memory and 5740 MB (29.9%) of swap memory were consumed. And the current training process consumed 5434 MB of memory. +``` + +```text +2022-04-21 08:49:56,881 - mmengine - INFO - Memory information available_memory: 246360 MB, used_memory: 9407 MB, memory_utilization: 4.4 %, available_swap_memory: 5740 MB, used_swap_memory: 2452 MB, swap_memory_utilization: 29.9 %, current_process_memory: 5434 MB +``` + +## SetEpochInfoHook + +## SyncNormHook + +## SyncRandomSizeHook + +## YOLOXLrUpdaterHook + +## YOLOXModeSwitchHook + +## How to implement a custom hook + +In general, there are 20 points where hooks can be inserted from the beginning to the end of model training. The users can implement custom hooks and insert them at different points in the process of training to do what they want. + +- global points: `before_run`, `after_run` +- points in training: `before_train`, `before_train_epoch`, `before_train_iter`, `after_train_iter`, `after_train_epoch`, `after_train` +- points in validation: `before_val`, `before_val_epoch`, `before_val_iter`, `after_val_iter`, `after_val_epoch`, `after_val` +- points at testing: `before_test`, `before_test_epoch`, `before_test_iter`, `after_test_iter`, `after_test_epoch`, `after_test` +- other points: `before_save_checkpoint`, `after_save_checkpoint` + +For example, users can implement a hook to check loss and terminate training when loss goes NaN. To achieve that, there are three steps to go: + +1. Implement a new hook that inherits the `Hook` class in MMEngine, and implement `after_train_iter` method which checks whether loss goes NaN after every `n` training iterations. +2. The implemented hook should be registered in `HOOKS` by `@HOOKS.register_module()` as shown in the code below. +3. Add `custom_hooks = [dict(type='MemoryProfilerHook', interval=50)]` in the config file. + +```python +from typing import Optional + +import torch +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class CheckInvalidLossHook(Hook): + """Check invalid loss hook. + + This hook will regularly check whether the loss is valid + during training. + + Args: + interval (int): Checking interval (every k iterations). + Default: 50. + """ + + def __init__(self, interval: int = 50) -> None: + self.interval = interval + + def after_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[dict] = None) -> None: + """Regularly check whether the loss is valid every n iterations. + + Args: + runner (:obj:`Runner`): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict, Optional): Data from dataloader. + Defaults to None. + outputs (dict, Optional): Outputs from model. Defaults to None. + """ + if self.every_n_train_iters(runner, self.interval): + assert torch.isfinite(outputs['loss']), \ + runner.logger.info('loss become infinite or NaN!') +``` + +Please read [customize_runtime](../advanced_guides/customize_runtime.md) for more about implementing a custom hook. 
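For step 3 above, the hook that actually gets enabled should be the one just implemented, not `MemoryProfilerHook`. A minimal sketch of the corresponding config entry (the `interval` value is arbitrary; `CheckInvalidLossHook` is registered in the `HOOKS` registry by the decorator shown in the example, so it can be referenced by its type name):

```python
# Enable the custom hook implemented above; the loss is checked
# every `interval` training iterations.
custom_hooks = [dict(type='CheckInvalidLossHook', interval=50)]
```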
diff --git a/mmdetection/docs/en/user_guides/useful_tools.md b/mmdetection/docs/en/user_guides/useful_tools.md new file mode 100644 index 0000000..8a79f0c --- /dev/null +++ b/mmdetection/docs/en/user_guides/useful_tools.md @@ -0,0 +1,660 @@ +Apart from training/testing scripts, We provide lots of useful tools under the +`tools/` directory. + +## Log Analysis + +`tools/analysis_tools/analyze_logs.py` plots loss/mAP curves given a training +log file. Run `pip install seaborn` first to install the dependency. + +```shell +python tools/analysis_tools/analyze_logs.py plot_curve [--keys ${KEYS}] [--eval-interval ${EVALUATION_INTERVAL}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] +``` + +![loss curve image](../../../resources/loss_curve.png) + +Examples: + +- Plot the classification loss of some run. + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls + ``` + +- Plot the classification and regression loss of some run, and save the figure to a pdf. + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls loss_bbox --out losses.pdf + ``` + +- Compare the bbox mAP of two runs in the same figure. + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log1.json log2.json --keys bbox_mAP --legend run1 run2 + ``` + +- Compute the average training speed. + + ```shell + python tools/analysis_tools/analyze_logs.py cal_train_time log.json [--include-outliers] + ``` + + The output is expected to be like the following. + + ```text + -----Analyze train time of work_dirs/some_exp/20190611_192040.log.json----- + slowest epoch 11, average time is 1.2024 + fastest epoch 1, average time is 1.1909 + time std over epochs is 0.0028 + average iter time: 1.1959 s/iter + ``` + +## Result Analysis + +`tools/analysis_tools/analyze_results.py` calculates single image mAP and saves or shows the topk images with the highest and lowest scores based on prediction results. + +**Usage** + +```shell +python tools/analysis_tools/analyze_results.py \ + ${CONFIG} \ + ${PREDICTION_PATH} \ + ${SHOW_DIR} \ + [--show] \ + [--wait-time ${WAIT_TIME}] \ + [--topk ${TOPK}] \ + [--show-score-thr ${SHOW_SCORE_THR}] \ + [--cfg-options ${CFG_OPTIONS}] +``` + +Description of all arguments: + +- `config` : The path of a model config file. +- `prediction_path`: Output result file in pickle format from `tools/test.py` +- `show_dir`: Directory where painted GT and detection images will be saved +- `--show`: Determines whether to show painted images, If not specified, it will be set to `False` +- `--wait-time`: The interval of show (s), 0 is block +- `--topk`: The number of saved images that have the highest and lowest `topk` scores after sorting. If not specified, it will be set to `20`. +- `--show-score-thr`: Show score threshold. If not specified, it will be set to `0`. +- `--cfg-options`: If specified, the key-value pair optional cfg will be merged into config file + +**Examples**: + +Assume that you have got result file in pickle format from `tools/test.py` in the path './result.pkl'. + +1. Test Faster R-CNN and visualize the results, save images to the directory `results/` + +```shell +python tools/analysis_tools/analyze_results.py \ + configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \ + result.pkl \ + results \ + --show +``` + +2. 
Test Faster R-CNN and specified topk to 50, save images to the directory `results/` + +```shell +python tools/analysis_tools/analyze_results.py \ + configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \ + result.pkl \ + results \ + --topk 50 +``` + +3. If you want to filter the low score prediction results, you can specify the `show-score-thr` parameter + +```shell +python tools/analysis_tools/analyze_results.py \ + configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \ + result.pkl \ + results \ + --show-score-thr 0.3 +``` + +## Fusing results from multiple models + +`tools/analysis_tools/fusion_results.py` can fusing predictions using Weighted Boxes Fusion(WBF) from different object detection models. (Currently support coco format only) + +**Usage** + +```shell +python tools/analysis_tools/fuse_results.py \ + ${PRED_RESULTS} \ + [--annotation ${ANNOTATION}] \ + [--weights ${WEIGHTS}] \ + [--fusion-iou-thr ${FUSION_IOU_THR}] \ + [--skip-box-thr ${SKIP_BOX_THR}] \ + [--conf-type ${CONF_TYPE}] \ + [--eval-single ${EVAL_SINGLE}] \ + [--save-fusion-results ${SAVE_FUSION_RESULTS}] \ + [--out-dir ${OUT_DIR}] +``` + +Description of all arguments: + +- `pred-results`: Paths of detection results from different models.(Currently support coco format only) +- `--annotation`: Path of ground-truth. +- `--weights`: List of weights for each model. Default: `None`, which means weight == 1 for each model. +- `--fusion-iou-thr`: IoU value for boxes to be a match。Default: `0.55`。 +- `--skip-box-thr`: The confidence threshold that needs to be excluded in the WBF algorithm. bboxes whose confidence is less than this value will be excluded.。Default: `0`。 +- `--conf-type`: How to calculate confidence in weighted boxes. + - `avg`: average value,default. + - `max`: maximum value. + - `box_and_model_avg`: box and model wise hybrid weighted average. + - `absent_model_aware_avg`: weighted average that takes into account the absent model. +- `--eval-single`: Whether evaluate every single model. Default: `False`. +- `--save-fusion-results`: Whether save fusion results. Default: `False`. +- `--out-dir`: Path of fusion results. + +**Examples**: +Assume that you have got 3 result files from corresponding models through `tools/test.py`, which paths are './faster-rcnn_r50-caffe_fpn_1x_coco.json', './retinanet_r50-caffe_fpn_1x_coco.json', './cascade-rcnn_r50-caffe_fpn_1x_coco.json' respectively. The ground-truth file path is './annotation.json'. + +1. Fusion of predictions from three models and evaluation of their effectiveness + +```shell +python tools/analysis_tools/fuse_results.py \ + ./faster-rcnn_r50-caffe_fpn_1x_coco.json \ + ./retinanet_r50-caffe_fpn_1x_coco.json \ + ./cascade-rcnn_r50-caffe_fpn_1x_coco.json \ + --annotation ./annotation.json \ + --weights 1 2 3 \ +``` + +2. Simultaneously evaluate each single model and fusion results + +```shell +python tools/analysis_tools/fuse_results.py \ + ./faster-rcnn_r50-caffe_fpn_1x_coco.json \ + ./retinanet_r50-caffe_fpn_1x_coco.json \ + ./cascade-rcnn_r50-caffe_fpn_1x_coco.json \ + --annotation ./annotation.json \ + --weights 1 2 3 \ + --eval-single +``` + +3. 
Fusion of prediction results from three models and save + +```shell +python tools/analysis_tools/fuse_results.py \ + ./faster-rcnn_r50-caffe_fpn_1x_coco.json \ + ./retinanet_r50-caffe_fpn_1x_coco.json \ + ./cascade-rcnn_r50-caffe_fpn_1x_coco.json \ + --annotation ./annotation.json \ + --weights 1 2 3 \ + --save-fusion-results \ + --out-dir outputs/fusion +``` + +## Visualization + +### Visualize Datasets + +`tools/analysis_tools/browse_dataset.py` helps the user to browse a detection dataset (both +images and bounding box annotations) visually, or save the image to a +designated directory. + +```shell +python tools/analysis_tools/browse_dataset.py ${CONFIG} [-h] [--skip-type ${SKIP_TYPE[SKIP_TYPE...]}] [--output-dir ${OUTPUT_DIR}] [--not-show] [--show-interval ${SHOW_INTERVAL}] +``` + +### Visualize Models + +First, convert the model to ONNX as described +[here](#convert-mmdetection-model-to-onnx-experimental). +Note that currently only RetinaNet is supported, support for other models +will be coming in later versions. +The converted model could be visualized by tools like [Netron](https://github.com/lutzroeder/netron). + +### Visualize Predictions + +If you need a lightweight GUI for visualizing the detection results, you can refer [DetVisGUI project](https://github.com/Chien-Hung/DetVisGUI/tree/mmdetection). + +## Error Analysis + +`tools/analysis_tools/coco_error_analysis.py` analyzes COCO results per category and by +different criterion. It can also make a plot to provide useful information. + +```shell +python tools/analysis_tools/coco_error_analysis.py ${RESULT} ${OUT_DIR} [-h] [--ann ${ANN}] [--types ${TYPES[TYPES...]}] +``` + +Example: + +Assume that you have got [Mask R-CNN checkpoint file](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) in the path 'checkpoint'. For other checkpoints, please refer to our [model zoo](./model_zoo.md). + +You can modify the test_evaluator to save the results bbox by: + +1. Find which dataset in 'configs/base/datasets' the current config corresponds to. +2. Replace the original test_evaluator and test_dataloader with test_evaluator and test_dataloader in the comment in dataset config. +3. Use the following command to get the results bbox and segmentation json file. + +```shell +python tools/test.py \ + configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoint/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ +``` + +1. Get COCO bbox error results per category , save analyze result images to the directory(In [config](../../../configs/_base_/datasets/coco_instance.py) the default directory is './work_dirs/coco_instance/test') + +```shell +python tools/analysis_tools/coco_error_analysis.py \ + results.bbox.json \ + results \ + --ann=data/coco/annotations/instances_val2017.json \ +``` + +2. Get COCO segmentation error results per category , save analyze result images to the directory + +```shell +python tools/analysis_tools/coco_error_analysis.py \ + results.segm.json \ + results \ + --ann=data/coco/annotations/instances_val2017.json \ + --types='segm' +``` + +## Model Serving + +In order to serve an `MMDetection` model with [`TorchServe`](https://pytorch.org/serve/), you can follow the steps: + +### 1. Install TorchServe + +Suppose you have a `Python` environment with `PyTorch` and `MMDetection` successfully installed, +then you could run the following command to install `TorchServe` and its dependencies. 
+For more other installation options, please refer to the [quick start](https://github.com/pytorch/serve/blob/master/README.md#serve-a-model). + +```shell +python -m pip install torchserve torch-model-archiver torch-workflow-archiver nvgpu +``` + +**Note**: Please refer to [torchserve docker](https://github.com/pytorch/serve/blob/master/docker/README.md) if you want to use `TorchServe` in docker. + +### 2. Convert model from MMDetection to TorchServe + +```shell +python tools/deployment/mmdet2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \ +--output-folder ${MODEL_STORE} \ +--model-name ${MODEL_NAME} +``` + +### 3. Start `TorchServe` + +```shell +torchserve --start --ncs \ + --model-store ${MODEL_STORE} \ + --models ${MODEL_NAME}.mar +``` + +### 4. Test deployment + +```shell +curl -O curl -O https://raw.githubusercontent.com/pytorch/serve/master/docs/images/3dogs.jpg +curl http://127.0.0.1:8080/predictions/${MODEL_NAME} -T 3dogs.jpg +``` + +You should obtain a response similar to: + +```json +[ + { + "class_label": 16, + "class_name": "dog", + "bbox": [ + 294.63409423828125, + 203.99111938476562, + 417.048583984375, + 281.62744140625 + ], + "score": 0.9987992644309998 + }, + { + "class_label": 16, + "class_name": "dog", + "bbox": [ + 404.26019287109375, + 126.0080795288086, + 574.5091552734375, + 293.6662292480469 + ], + "score": 0.9979367256164551 + }, + { + "class_label": 16, + "class_name": "dog", + "bbox": [ + 197.2144775390625, + 93.3067855834961, + 307.8505554199219, + 276.7560119628906 + ], + "score": 0.993338406085968 + } +] +``` + +#### Compare results + +And you can use `test_torchserver.py` to compare result of `TorchServe` and `PyTorch`, and visualize them. + +```shell +python tools/deployment/test_torchserver.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME} +[--inference-addr ${INFERENCE_ADDR}] [--device ${DEVICE}] [--score-thr ${SCORE_THR}] [--work-dir ${WORK_DIR}] +``` + +Example: + +```shell +python tools/deployment/test_torchserver.py \ +demo/demo.jpg \ +configs/yolo/yolov3_d53_8xb8-320-273e_coco.py \ +checkpoint/yolov3_d53_320_273e_coco-421362b6.pth \ +yolov3 \ +--work-dir ./work-dir +``` + +### 5. Stop `TorchServe` + +```shell +torchserve --stop +``` + +## Model Complexity + +`tools/analysis_tools/get_flops.py` is a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) to compute the FLOPs and params of a given model. + +```shell +python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] +``` + +You will get the results like this. + +```text +============================== +Input shape: (3, 1280, 800) +Flops: 239.32 GFLOPs +Params: 37.74 M +============================== +``` + +**Note**: This tool is still experimental and we do not guarantee that the +number is absolutely correct. You may well use the result for simple +comparisons, but double check it before you adopt it in technical reports or papers. + +1. FLOPs are related to the input shape while parameters are not. The default + input shape is (1, 3, 1280, 800). +2. Some operators are not counted into FLOPs like GN and custom operators. Refer to [`mmcv.cnn.get_model_complexity_info()`](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/cnn/utils/flops_counter.py) for details. +3. The FLOPs of two-stage detectors is dependent on the number of proposals. + +## Model conversion + +### MMDetection model to ONNX + +We provide a script to convert model to [ONNX](https://github.com/onnx/onnx) format. 
We also support comparing the output results between Pytorch and ONNX model for verification. More details can refer to [mmdeploy](https://github.com/open-mmlab/mmdeploy) + +### MMDetection 1.x model to MMDetection 2.x + +`tools/model_converters/upgrade_model_version.py` upgrades a previous MMDetection checkpoint +to the new version. Note that this script is not guaranteed to work as some +breaking changes are introduced in the new version. It is recommended to +directly use the new checkpoints. + +```shell +python tools/model_converters/upgrade_model_version.py ${IN_FILE} ${OUT_FILE} [-h] [--num-classes NUM_CLASSES] +``` + +### RegNet model to MMDetection + +`tools/model_converters/regnet2mmdet.py` convert keys in pycls pretrained RegNet models to +MMDetection style. + +```shell +python tools/model_converters/regnet2mmdet.py ${SRC} ${DST} [-h] +``` + +### Detectron ResNet to Pytorch + +`tools/model_converters/detectron2pytorch.py` converts keys in the original detectron pretrained +ResNet models to PyTorch style. + +```shell +python tools/model_converters/detectron2pytorch.py ${SRC} ${DST} ${DEPTH} [-h] +``` + +### Prepare a model for publishing + +`tools/model_converters/publish_model.py` helps users to prepare their model for publishing. + +Before you upload a model to AWS, you may want to + +1. convert model weights to CPU tensors +2. delete the optimizer states and +3. compute the hash of the checkpoint file and append the hash id to the + filename. + +```shell +python tools/model_converters/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} +``` + +E.g., + +```shell +python tools/model_converters/publish_model.py work_dirs/faster_rcnn/latest.pth faster_rcnn_r50_fpn_1x_20190801.pth +``` + +The final output filename will be `faster_rcnn_r50_fpn_1x_20190801-{hash id}.pth`. + +## Dataset Conversion + +`tools/data_converters/` contains tools to convert the Cityscapes dataset +and Pascal VOC dataset to the COCO format. + +```shell +python tools/dataset_converters/cityscapes.py ${CITYSCAPES_PATH} [-h] [--img-dir ${IMG_DIR}] [--gt-dir ${GT_DIR}] [-o ${OUT_DIR}] [--nproc ${NPROC}] +python tools/dataset_converters/pascal_voc.py ${DEVKIT_PATH} [-h] [-o ${OUT_DIR}] +``` + +## Dataset Download + +`tools/misc/download_dataset.py` supports downloading datasets such as COCO, VOC, and LVIS. + +```shell +python tools/misc/download_dataset.py --dataset-name coco2017 +python tools/misc/download_dataset.py --dataset-name voc2007 +python tools/misc/download_dataset.py --dataset-name lvis +``` + +For users in China, these datasets can also be downloaded from [OpenDataLab](https://opendatalab.com/?source=OpenMMLab%20GitHub) with high speed: + +- [COCO2017](https://opendatalab.com/COCO_2017/download?source=OpenMMLab%20GitHub) +- [VOC2007](https://opendatalab.com/PASCAL_VOC2007/download?source=OpenMMLab%20GitHub) +- [VOC2012](https://opendatalab.com/PASCAL_VOC2012/download?source=OpenMMLab%20GitHub) +- [LVIS](https://opendatalab.com/LVIS/download?source=OpenMMLab%20GitHub) + +## Benchmark + +### Robust Detection Benchmark + +`tools/analysis_tools/test_robustness.py` and`tools/analysis_tools/robustness_eval.py` helps users to evaluate model robustness. The core idea comes from [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484). For more information how to evaluate models on corrupted images and results for a set of standard models please refer to [robustness_benchmarking.md](robustness_benchmarking.md). 
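The headline numbers of this benchmark follow the paper above: mean performance under corruption (mPC), averaged over all corruption types and severity levels, and relative performance under corruption (rPC), the fraction of clean-data performance that is retained. A minimal sketch of the aggregation, using hypothetical per-corruption AP values:

```python
# Hypothetical results: AP for each corruption type at severities 1-5.
results = {
    'gaussian_noise': [30.1, 25.4, 20.2, 15.8, 11.3],
    'motion_blur': [32.5, 28.0, 23.1, 18.7, 14.2],
}
clean_ap = 36.4  # hypothetical AP on uncorrupted validation images

# mPC: mean AP over all corruption types and severity levels.
per_corruption_means = [sum(aps) / len(aps) for aps in results.values()]
mpc = sum(per_corruption_means) / len(per_corruption_means)

# rPC: fraction of clean performance retained under corruption.
rpc = mpc / clean_ap
print(f'mPC = {mpc:.1f} AP, rPC = {rpc:.1%}')
```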
+ +### FPS Benchmark + +`tools/analysis_tools/benchmark.py` helps users to calculate FPS. The FPS value includes model forward and post-processing. In order to get a more accurate value, currently only supports single GPU distributed startup mode. + +```shell +python -m torch.distributed.launch --nproc_per_node=1 --master_port=${PORT} tools/analysis_tools/benchmark.py \ + ${CONFIG} \ + [--checkpoint ${CHECKPOINT}] \ + [--repeat-num ${REPEAT_NUM}] \ + [--max-iter ${MAX_ITER}] \ + [--log-interval ${LOG_INTERVAL}] \ + --launcher pytorch +``` + +Examples: Assuming that you have already downloaded the `Faster R-CNN` model checkpoint to the directory `checkpoints/`. + +```shell +python -m torch.distributed.launch --nproc_per_node=1 --master_port=29500 tools/analysis_tools/benchmark.py \ + configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \ + checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \ + --launcher pytorch +``` + +## Miscellaneous + +### Evaluating a metric + +`tools/analysis_tools/eval_metric.py` evaluates certain metrics of a pkl result file +according to a config file. + +```shell +python tools/analysis_tools/eval_metric.py ${CONFIG} ${PKL_RESULTS} [-h] [--format-only] [--eval ${EVAL[EVAL ...]}] + [--cfg-options ${CFG_OPTIONS [CFG_OPTIONS ...]}] + [--eval-options ${EVAL_OPTIONS [EVAL_OPTIONS ...]}] +``` + +### Print the entire config + +`tools/misc/print_config.py` prints the whole config verbatim, expanding all its +imports. + +```shell +python tools/misc/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}] +``` + +## Hyper-parameter Optimization + +### YOLO Anchor Optimization + +`tools/analysis_tools/optimize_anchors.py` provides two method to optimize YOLO anchors. + +One is k-means anchor cluster which refers from [darknet](https://github.com/AlexeyAB/darknet/blob/master/src/detector.c#L1421). + +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} --algorithm k-means --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} --output-dir ${OUTPUT_DIR} +``` + +Another is using differential evolution to optimize anchors. + +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} --algorithm differential_evolution --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} --output-dir ${OUTPUT_DIR} +``` + +E.g., + +```shell +python tools/analysis_tools/optimize_anchors.py configs/yolo/yolov3_d53_8xb8-320-273e_coco.py --algorithm differential_evolution --input-shape 608 608 --device cuda --output-dir work_dirs +``` + +You will get: + +``` +loading annotations into memory... +Done (t=9.70s) +creating index... +index created! +2021-07-19 19:37:20,951 - mmdet - INFO - Collecting bboxes from annotation... +[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 117266/117266, 15874.5 task/s, elapsed: 7s, ETA: 0s + +2021-07-19 19:37:28,753 - mmdet - INFO - Collected 849902 bboxes. +differential_evolution step 1: f(x)= 0.506055 +differential_evolution step 2: f(x)= 0.506055 +...... + +differential_evolution step 489: f(x)= 0.386625 +2021-07-19 19:46:40,775 - mmdet - INFO Anchor evolution finish. Average IOU: 0.6133754253387451 +2021-07-19 19:46:40,776 - mmdet - INFO Anchor differential evolution result:[[10, 12], [15, 30], [32, 22], [29, 59], [61, 46], [57, 116], [112, 89], [154, 198], [349, 336]] +2021-07-19 19:46:40,798 - mmdet - INFO Result saved in work_dirs/anchor_optimize_result.json +``` + +## Confusion Matrix + +A confusion matrix is a summary of prediction results. 
+ +`tools/analysis_tools/confusion_matrix.py` can analyze the prediction results and plot a confusion matrix table. + +First, run `tools/test.py` to save the `.pkl` detection results. + +Then, run + +``` +python tools/analysis_tools/confusion_matrix.py ${CONFIG} ${DETECTION_RESULTS} ${SAVE_DIR} --show +``` + +And you will get a confusion matrix like this: + +![confusion_matrix_example](https://user-images.githubusercontent.com/12907710/140513068-994cdbf4-3a4a-48f0-8fd8-2830d93fd963.png) + +## COCO Separated & Occluded Mask Metric + +Detecting occluded objects still remains a challenge for state-of-the-art object detectors. +We implemented the metric presented in paper [A Tri-Layer Plugin to Improve Occluded Detection](https://arxiv.org/abs/2210.10046) to calculate the recall of separated and occluded masks. + +There are two ways to use this metric: + +### Offline evaluation + +We provide a script to calculate the metric with a dumped prediction file. + +First, use the `tools/test.py` script to dump the detection results: + +```shell +python tools/test.py ${CONFIG} ${MODEL_PATH} --out results.pkl +``` + +Then, run the `tools/analysis_tools/coco_occluded_separated_recall.py` script to get the recall of separated and occluded masks: + +```shell +python tools/analysis_tools/coco_occluded_separated_recall.py results.pkl --out occluded_separated_recall.json +``` + +The output should be like this: + +``` +loading annotations into memory... +Done (t=0.51s) +creating index... +index created! +processing detection results... +[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 5000/5000, 109.3 task/s, elapsed: 46s, ETA: 0s +computing occluded mask recall... +[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 5550/5550, 780.5 task/s, elapsed: 7s, ETA: 0s +COCO occluded mask recall: 58.79% +COCO occluded mask success num: 3263 +computing separated mask recall... +[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 3522/3522, 778.3 task/s, elapsed: 5s, ETA: 0s +COCO separated mask recall: 31.94% +COCO separated mask success num: 1125 + ++-----------+--------+-------------+ +| mask type | recall | num correct | ++-----------+--------+-------------+ +| occluded | 58.79% | 3263 | +| separated | 31.94% | 1125 | ++-----------+--------+-------------+ +Evaluation results have been saved to occluded_separated_recall.json. +``` + +### Online evaluation + +We implement `CocoOccludedSeparatedMetric` which inherits from the `CocoMetic`. 
+To evaluate the recall of separated and occluded masks during training, just replace the evaluator metric type with `'CocoOccludedSeparatedMetric'` in your config: + +```python +val_evaluator = dict( + type='CocoOccludedSeparatedMetric', # modify this + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False) +test_evaluator = val_evaluator +``` + +Please cite the paper if you use this metric: + +```latex +@article{zhan2022triocc, + title={A Tri-Layer Plugin to Improve Occluded Detection}, + author={Zhan, Guanqi and Xie, Weidi and Zisserman, Andrew}, + journal={British Machine Vision Conference}, + year={2022} +} +``` diff --git a/mmdetection/docs/en/user_guides/visualization.md b/mmdetection/docs/en/user_guides/visualization.md new file mode 100644 index 0000000..dade26e --- /dev/null +++ b/mmdetection/docs/en/user_guides/visualization.md @@ -0,0 +1,91 @@ +# Visualization + +Before reading this tutorial, it is recommended to read MMEngine's [Visualization](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/visualization.md) documentation to get a first glimpse of the `Visualizer` definition and usage. + +In brief, the [`Visualizer`](mmengine.visualization.Visualizer) is implemented in MMEngine to meet the daily visualization needs, and contains three main functions: + +- Implement common drawing APIs, such as [`draw_bboxes`](mmengine.visualization.Visualizer.draw_bboxes) which implements bounding box drawing functions, [`draw_lines`](mmengine.visualization.Visualizer.draw_lines) implements the line drawing function. +- Support writing visualization results, learning rate curves, loss function curves, and verification accuracy curves to various backends, including local disks and common deep learning training logging tools such as [TensorBoard](https://www.tensorflow.org/tensorboard) and [Wandb](https://wandb.ai/site). +- Support calling anywhere in the code to visualize or record intermediate states of the model during training or testing, such as feature maps and validation results. + +Based on MMEngine's Visualizer, MMDet comes with a variety of pre-built visualization tools that can be used by the user by simply modifying the following configuration files. + +- The `tools/analysis_tools/browse_dataset.py` script provides a dataset visualization function that draws images and corresponding annotations after Data Transforms, as described in [`browse_dataset.py`](useful_tools.md#Visualization). +- MMEngine implements `LoggerHook`, which uses `Visualizer` to write the learning rate, loss and evaluation results to the backend set by `Visualizer`. Therefore, by modifying the `Visualizer` backend in the configuration file, for example to ` TensorBoardVISBackend` or `WandbVISBackend`, you can implement logging to common training logging tools such as `TensorBoard` or `WandB`, thus making it easy for users to use these visualization tools to analyze and monitor the training process. +- The `VisualizerHook` is implemented in MMDet, which uses the `Visualizer` to visualize or store the prediction results of the validation or prediction phase into the backend set by the `Visualizer`, so by modifying the `Visualizer` backend in the configuration file, for example, to ` TensorBoardVISBackend` or `WandbVISBackend`, you can implement storing the predicted images to `TensorBoard` or `Wandb`. 
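As a quick illustration of the common drawing APIs mentioned above (`draw_bboxes`, `draw_lines`), the `Visualizer` can also be used directly on an image array. This is a minimal sketch with a dummy image and made-up coordinates, not taken from a real dataset:

```Python
import numpy as np
from mmengine.visualization import Visualizer

# A dummy RGB image, used only for demonstration.
image = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)

visualizer = Visualizer(image=image)
# Draw one bounding box in (x1, y1, x2, y2) format.
visualizer.draw_bboxes(np.array([[30., 40., 150., 180.]]), edge_colors='g')
# Draw a line between two points.
visualizer.draw_lines(x_datas=np.array([30., 150.]), y_datas=np.array([40., 180.]))
drawn_image = visualizer.get_image()
```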
+ +## Configuration + +Thanks to the use of the registration mechanism, in MMDet we can set the behavior of the `Visualizer` by modifying the configuration file. Usually, we define the default configuration for the visualizer in `configs/_base_/default_runtime.py`, see [configuration tutorial](config.md) for details. + +```Python +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') +``` + +Based on the above example, we can see that the configuration of `Visualizer` consists of two main parts, namely, the type of `Visualizer` and the visualization backend `vis_backends` it uses. + +- Users can directly use `DetLocalVisualizer` to visualize labels or predictions for support tasks. +- MMDet sets the visualization backend `vis_backend` to the local visualization backend `LocalVisBackend` by default, saving all visualization results and other training information in a local folder. + +## Storage + +MMDet uses the local visualization backend [`LocalVisBackend`](mmengine.visualization.LocalVisBackend) by default, and the model loss, learning rate, model evaluation accuracy and visualization The information stored in `VisualizerHook` and `LoggerHook`, including loss, learning rate, evaluation accuracy will be saved to the `{work_dir}/{config_name}/{time}/{vis_data}` folder by default. In addition, MMDet also supports other common visualization backends, such as `TensorboardVisBackend` and `WandbVisBackend`, and you only need to change the `vis_backends` type in the configuration file to the corresponding visualization backend. For example, you can store data to `TensorBoard` and `Wandb` by simply inserting the following code block into the configuration file. + +```Python +# https://mmengine.readthedocs.io/en/latest/api/visualization.html +_base_.visualizer.vis_backends = [ + dict(type='LocalVisBackend'), # + dict(type='TensorboardVisBackend'), + dict(type='WandbVisBackend'),] +``` + +## Plot + +### Plot the prediction results + +MMDet mainly uses [`DetVisualizationHook`](mmdet.engine.hooks.DetVisualizationHook) to plot the prediction results of validation and test, by default `DetVisualizationHook` is off, and the default configuration is as follows. + +```Python +visualization=dict( # user visualization of validation and test results + type='DetVisualizationHook', + draw=False, + interval=1, + show=False) +``` + +The following table shows the parameters supported by `DetVisualizationHook`. + +| Parameters | Description | +| :--------: | :-----------------------------------------------------------------------------------------------------------: | +| draw | The DetVisualizationHook is turned on and off by the enable parameter, which is the default state. | +| interval | Controls how much iteration to store or display the results of a val or test if VisualizationHook is enabled. | +| show | Controls whether to visualize the results of val or test. | + +If you want to enable `DetVisualizationHook` related functions and configurations during training or testing, you only need to modify the configuration, take `configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py` as an example, draw annotations and predictions at the same time, and display the images, the configuration can be modified as follows + +```Python +visualization = _base_.default_hooks.visualization +visualization.update(dict(draw=True, show=True)) +``` + +
+ +
    + +The `test.py` procedure is further simplified by providing the `--show` and `--show-dir` parameters to visualize the annotation and prediction results during the test without modifying the configuration. + +```Shell +# Show test results +python tools/test.py configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth --show + +# Specify where to store the prediction results +python tools/test.py configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth --show-dir imgs/ +``` + +
+ +
    diff --git a/mmdetection/docs/zh_cn/Makefile b/mmdetection/docs/zh_cn/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/mmdetection/docs/zh_cn/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/mmdetection/docs/zh_cn/_static/css/readthedocs.css b/mmdetection/docs/zh_cn/_static/css/readthedocs.css new file mode 100644 index 0000000..57ed0ad --- /dev/null +++ b/mmdetection/docs/zh_cn/_static/css/readthedocs.css @@ -0,0 +1,6 @@ +.header-logo { + background-image: url("../image/mmdet-logo.png"); + background-size: 156px 40px; + height: 40px; + width: 156px; +} diff --git a/mmdetection/docs/zh_cn/advanced_guides/conventions.md b/mmdetection/docs/zh_cn/advanced_guides/conventions.md new file mode 100644 index 0000000..9fb1f14 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/conventions.md @@ -0,0 +1,109 @@ +# 默认约定 + +如果你想把 MMDetection 修改为自己的项目,请遵循下面的约定。 + +## 关于图片 shape 顺序的说明 + +在OpenMMLab 2.0中, 为了与 OpenCV 的输入参数相一致,图片处理 pipeline 中关于图像 shape 的输入参数总是以 `(width, height)` 的顺序排列。 +相反,为了计算方便,经过 pipeline 和 model 的字段的顺序是 `(height, width)`。具体来说在每个数据 pipeline 处理的结果中,字段和它们的值含义如下: + +- img_shape: (height, width) +- ori_shape: (height, width) +- pad_shape: (height, width) +- batch_input_shape: (height, width) + +以 `Mosaic` 为例,其初始化参数如下所示: + +```python +@TRANSFORMS.register_module() +class Mosaic(BaseTransform): + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + prob: float = 1.0) -> None: + ... + + # img_scale 顺序应该是 (width, height) + self.img_scale = img_scale + + def transform(self, results: dict) -> dict: + ... + + results['img'] = mosaic_img + # (height, width) + results['img_shape'] = mosaic_img.shape[:2] +``` + +## 损失 + +在 MMDetection 中,`model(**data)` 的返回值是一个字典,包含着所有的损失和评价指标,他们将会由 `model(**data)` 返回。 + +例如,在 bbox head 中, + +```python +class BBoxHead(nn.Module): + ... + def loss(self, ...): + losses = dict() + # 分类损失 + losses['loss_cls'] = self.loss_cls(...) + # 分类准确率 + losses['acc'] = accuracy(...) + # 边界框损失 + losses['loss_bbox'] = self.loss_bbox(...) + return losses +``` + +`'bbox_head.loss()'` 在模型 forward 阶段会被调用。返回的字典中包含了 `'loss_bbox'`,`'loss_cls'`,`'acc'`。只有 `'loss_bbox'`, `'loss_cls'` 会被用于反向传播,`'acc'` 只会被作为评价指标来监控训练过程。 + +我们默认,只有那些键的名称中包含 `'loss'` 的值会被用于反向传播。这个行为可以通过修改 `BaseDetector.train_step()` 来改变。 + +## 空 proposals + +在 MMDetection 中,我们为两阶段方法中空 proposals 的情况增加了特殊处理和单元测试。我们同时需要处理整个 batch 和单一图片中空 proposals 的情况。例如,在 CascadeRoIHead 中, + +```python +# 简单的测试 +... 
+ +# 在整个 batch中 都没有 proposals +if rois.shape[0] == 0: + bbox_results = [[ + np.zeros((0, 5), dtype=np.float32) + for _ in range(self.bbox_head[-1].num_classes) + ]] * num_imgs + if self.with_mask: + mask_classes = self.mask_head[-1].num_classes + segm_results = [[[] for _ in range(mask_classes)] + for _ in range(num_imgs)] + results = list(zip(bbox_results, segm_results)) + else: + results = bbox_results + return results +... + +# 在单张图片中没有 proposals +for i in range(self.num_stages): + ... + if i < self.num_stages - 1: + for j in range(num_imgs): + # 处理空 proposals + if rois[j].shape[0] > 0: + bbox_label = cls_score[j][:, :-1].argmax(dim=1) + refine_roi = self.bbox_head[i].regress_by_class( + rois[j], bbox_label[j], bbox_pred[j], img_metas[j]) + refine_roi_list.append(refine_roi) +``` + +如果你有自定义的 `RoIHead`, 你可以参考上面的方法来处理空 proposals 的情况。 + +## 全景分割数据集 + +在 MMDetection 中,我们支持了 COCO 全景分割数据集 `CocoPanopticDataset`。对于它的实现,我们在这里声明一些默认约定。 + +1. 在 mmdet\<=2.16.0 时,语义分割标注中的前景和背景标签范围与 MMDetection 中的默认规定有所不同。标签 `0` 代表 `VOID` 标签。 + 从 mmdet=2.17.0 开始,为了和框的类别标注保持一致,语义分割标注的类别标签也改为从 `0` 开始,标签 `255` 代表 `VOID` 类。 + 为了达成这一目标,我们在流程 `Pad` 里支持了设置 `seg` 的填充值的功能。 +2. 在评估中,全景分割结果必须是一个与原图大小相同的图。结果图中每个像素的值有如此形式:`instance_id * INSTANCE_OFFSET + category_id`。 diff --git a/mmdetection/docs/zh_cn/advanced_guides/customize_dataset.md b/mmdetection/docs/zh_cn/advanced_guides/customize_dataset.md new file mode 100644 index 0000000..e845f37 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/customize_dataset.md @@ -0,0 +1,425 @@ +# 自定义数据集 + +## 支持新的数据格式 + +为了支持新的数据格式,可以选择将数据转换成现成的格式(COCO 或者 PASCAL)或将其转换成中间格式。当然也可以选择以离线的形式(在训练之前使用脚本转换)或者在线的形式(实现一个新的 dataset 在训练中进行转换)来转换数据。 + +在 MMDetection 中,建议将数据转换成 COCO 格式并以离线的方式进行,因此在完成数据转换后只需修改配置文件中的标注数据的路径和类别即可。 + +### 将新的数据格式转换为现有的数据格式 + +最简单的方法就是将你的数据集转换成现有的数据格式(COCO 或者 PASCAL VOC) + +COCO 格式的 JSON 标注文件有如下必要的字段: + +```python +'images': [ + { + 'file_name': 'COCO_val2014_000000001268.jpg', + 'height': 427, + 'width': 640, + 'id': 1268 + }, + ... +], + +'annotations': [ + { + 'segmentation': [[192.81, + 247.09, + ... + 219.03, + 249.06]], # 如果有 mask 标签且为多边形 XY 点坐标格式,则需要保证至少包括 3 个点坐标,否则为无效多边形 + 'area': 1035.749, + 'iscrowd': 0, + 'image_id': 1268, + 'bbox': [192.81, 224.8, 74.73, 33.43], + 'category_id': 16, + 'id': 42986 + }, + ... +], + +'categories': [ + {'id': 0, 'name': 'car'}, + ] +``` + +在 JSON 文件中有三个必要的键: + +- `images`: 包含多个图片以及它们的信息的数组,例如 `file_name`、`height`、`width` 和 `id`。 +- `annotations`: 包含多个实例标注信息的数组。 +- `categories`: 包含多个类别名字和 ID 的数组。 + +在数据预处理之后,使用现有的数据格式来训练自定义的新数据集有如下两步(以 COCO 为例): + +1. 为自定义数据集修改配置文件。 +2. 检查自定义数据集的标注。 + +这里我们举一个例子来展示上面的两个步骤,这个例子使用包括 5 个类别的 COCO 格式的数据集来训练一个现有的 Cascade Mask R-CNN R50-FPN 检测器 + +#### 1. 为自定义数据集修改配置文件 + +配置文件的修改涉及两个方面: + +1. `dataloaer` 部分。需要在 `train_dataloader.dataset`、`val_dataloader.dataset` 和 `test_dataloader.dataset` 中添加 `metainfo=dict(classes=classes)`, 其中 classes 必须是 tuple 类型。 +2. `model` 部分中的 `num_classes`。需要将默认值(COCO 数据集中为 80)修改为自定义数据集中的类别数。 + +`configs/my_custom_config.py` 内容如下: + +```python + +# 新的配置来自基础的配置以更好地说明需要修改的地方 +_base_ = './cascade_mask_rcnn_r50_fpn_1x_coco.py' + +# 1. 
数据集设定 +dataset_type = 'CocoDataset' +classes = ('a', 'b', 'c', 'd', 'e') +data_root='path/to/your/' + +train_dataloader = dict( + batch_size=2, + num_workers=2, + dataset=dict( + type=dataset_type, + # 将类别名字添加至 `metainfo` 字段中 + metainfo=dict(classes=classes), + data_root=data_root, + ann_file='train/annotation_data', + data_prefix=dict(img='train/image_data') + ) + ) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + test_mode=True, + # 将类别名字添加至 `metainfo` 字段中 + metainfo=dict(classes=classes), + data_root=data_root, + ann_file='val/annotation_data', + data_prefix=dict(img='val/image_data') + ) + +test_dataloader = dict( + batch_size=1, + num_workers=2, + dataset=dict( + type=dataset_type, + test_mode=True, + # 将类别名字添加至 `metainfo` 字段中 + metainfo=dict(classes=classes), + data_root=data_root, + ann_file='test/annotation_data', + data_prefix=dict(img='test/image_data') + ) + ) + +# 2. 模型设置 + +# 将所有的 `num_classes` 默认值修改为 5(原来为80) +model = dict( + roi_head=dict( + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + # 将所有的 `num_classes` 默认值修改为 5(原来为 80) + num_classes=5), + dict( + type='Shared2FCBBoxHead', + # 将所有的 `num_classes` 默认值修改为 5(原来为 80) + num_classes=5), + dict( + type='Shared2FCBBoxHead', + # 将所有的 `num_classes` 默认值修改为 5(原来为 80) + num_classes=5)], + # 将所有的 `num_classes` 默认值修改为 5(原来为 80) + mask_head=dict(num_classes=5))) +``` + +#### 2. 检查自定义数据集的标注 + +假设你自己的数据集是 COCO 格式,那么需要保证数据的标注没有问题: + +1. 标注文件中 `categories` 的长度要与配置中的 `classes` 元组长度相匹配,它们都表示有几类。(如例子中有 5 个类别) +2. 配置文件中 `classes` 字段应与标注文件里 `categories` 下的 `name` 有相同的元素且顺序一致。MMDetection 会自动将 `categories` 中不连续的 `id` 映射成连续的索引,因此 `categories` 下的 `name`的字符串顺序会影响标签的索引。同时,配置文件中的 `classes` 的字符串顺序也会影响到预测框可视化时的标签。 +3. `annotations` 中的 `category_id` 必须是有效的值。比如所有 `category_id` 的值都应该属于 `categories` 中的 `id`。 + +下面是一个有效标注的例子: + +```python + +'annotations': [ + { + 'segmentation': [[192.81, + 247.09, + ... + 219.03, + 249.06]], # 如果有 mask 标签。 + 'area': 1035.749, + 'iscrowd': 0, + 'image_id': 1268, + 'bbox': [192.81, 224.8, 74.73, 33.43], + 'category_id': 16, + 'id': 42986 + }, + ... +], + +# MMDetection 会自动将 `categories` 中不连续的 `id` 映射成连续的索引。 +'categories': [ + {'id': 1, 'name': 'a'}, {'id': 3, 'name': 'b'}, {'id': 4, 'name': 'c'}, {'id': 16, 'name': 'd'}, {'id': 17, 'name': 'e'}, + ] +``` + +我们使用这种方式来支持 CityScapes 数据集。脚本在 [cityscapes.py](https://github.com/open-mmlab/mmdetection/blob/main/tools/dataset_converters/cityscapes.py) 并且我们提供了微调的 [configs](https://github.com/open-mmlab/mmdetection/blob/main/configs/cityscapes). + +**注意** + +1. 对于实例分割数据集, **MMDetection 目前只支持评估 COCO 格式的 mask AP**. +2. 推荐训练之前进行离线转换,这样就可以继续使用 `CocoDataset` 且只需修改标注文件的路径以及训练的种类。 + +### 调整新的数据格式为中间格式 + +如果不想将标注格式转换为 COCO 或者 PASCAL 格式也是可行的。实际上,我们在 MMEngine 的 [BaseDataset](https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/base_dataset.py#L116) 中定义了一种简单的标注格式并且与所有现有的数据格式兼容,也能进行离线或者在线转换。 + +数据集的标注必须为 `json` 或 `yaml`,`yml` 或 `pickle`,`pkl` 格式;标注文件中存储的字典必须包含 `metainfo` 和 `data_list` 两个字段。其中 `metainfo` 是一个字典,里面包含数据集的元信息,例如类别信息;`data_list` 是一个列表,列表中每个元素是一个字典,该字典定义了一个原始数据(raw data),每个原始数据包含一个或若干个训练/测试样本。 + +以下是一个 JSON 标注文件的例子: + +```json +{ + 'metainfo': + { + 'classes': ('person', 'bicycle', 'car', 'motorcycle'), + ... 
+ }, + 'data_list': + [ + { + "img_path": "xxx/xxx_1.jpg", + "height": 604, + "width": 640, + "instances": + [ + { + "bbox": [0, 0, 10, 20], + "bbox_label": 1, + "ignore_flag": 0 + }, + { + "bbox": [10, 10, 110, 120], + "bbox_label": 2, + "ignore_flag": 0 + } + ] + }, + { + "img_path": "xxx/xxx_2.jpg", + "height": 320, + "width": 460, + "instances": + [ + { + "bbox": [10, 0, 20, 20], + "bbox_label": 3, + "ignore_flag": 1 + } + ] + }, + ... + ] +} +``` + +有些数据集可能会提供如:crowd/difficult/ignored bboxes 标注,那么我们使用 `ignore_flag`来包含它们。 + +在得到上述标准的数据标注格式后,可以直接在配置中使用 MMDetection 的 [BaseDetDataset](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/datasets/base_det_dataset.py#L13) ,而无需进行转换。 + +### 自定义数据集例子 + +假设文本文件中表示的是一种全新的标注格式。边界框的标注信息保存在 `annotation.txt` 中,内容如下: + +``` +# +000001.jpg +1280 720 +2 +10 20 40 60 1 +20 40 50 60 2 +# +000002.jpg +1280 720 +3 +50 20 40 60 2 +20 40 30 45 2 +30 40 50 60 3 +``` + +我们可以在 `mmdet/datasets/my_dataset.py` 中创建一个新的 dataset 用以加载数据。 + +```python +import mmengine +from mmdet.base_det_dataset import BaseDetDataset +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class MyDataset(BaseDetDataset): + + METAINFO = { + 'classes': ('person', 'bicycle', 'car', 'motorcycle'), + 'palette': [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230)] + } + + def load_data_list(self, ann_file): + ann_list = mmengine.list_from_file(ann_file) + + data_infos = [] + for i, ann_line in enumerate(ann_list): + if ann_line != '#': + continue + + img_shape = ann_list[i + 2].split(' ') + width = int(img_shape[0]) + height = int(img_shape[1]) + bbox_number = int(ann_list[i + 3]) + + instances = [] + for anns in ann_list[i + 4:i + 4 + bbox_number]: + instance = {} + instance['bbox'] = [float(ann) for ann in anns.split(' ')[:4]] + instance['bbox_label']=int(anns[4]) + instances.append(instance) + + data_infos.append( + dict( + img_path=ann_list[i + 1], + img_id=i, + width=width, + height=height, + instances=instances + )) + + return data_infos +``` + +配置文件中,可以使用 `MyDataset` 进行如下修改 + +```python +dataset_A_train = dict( + type='MyDataset', + ann_file = 'image_list.txt', + pipeline=train_pipeline +) +``` + +## 使用 dataset 包装器自定义数据集 + +MMEngine 也支持非常多的数据集包装器(wrapper)来混合数据集或在训练时修改数据集的分布,其支持如下三种数据集包装: + +- `RepeatDataset`:将整个数据集简单地重复。 +- `ClassBalancedDataset`:以类别均衡的方式重复数据集。 +- `ConcatDataset`:合并数据集。 + +具体使用方式见 [MMEngine 数据集包装器](#TODO)。 + +## 修改数据集的类别 + +根据现有数据集的类型,我们可以修改它们的类别名称来训练其标注的子集。 +例如,如果只想训练当前数据集中的三个类别,那么就可以修改数据集的 `metainfo` 字典,数据集就会自动屏蔽掉其他类别的真实框。 + +```python +classes = ('person', 'bicycle', 'car') +train_dataloader = dict( + dataset=dict( + metainfo=dict(classes=classes)) + ) +val_dataloader = dict( + dataset=dict( + metainfo=dict(classes=classes)) + ) +test_dataloader = dict( + dataset=dict( + metainfo=dict(classes=classes)) + ) +``` + +**注意** + +- 在 MMDetection v2.5.0 之前,如果类别为集合时数据集将自动过滤掉不包含 GT 的图片,且没办法通过修改配置将其关闭。这是一种不可取的行为而且会引起混淆,因为当类别不是集合时数据集时,只有在 `filter_empty_gt=True` 以及 `test_mode=False` 的情况下才会过滤掉不包含 GT 的图片。在 MMDetection v2.5.0 之后,我们将图片的过滤以及类别的修改进行解耦,数据集只有在 `filter_cfg=dict(filter_empty_gt=True)` 和 `test_mode=False` 的情况下才会过滤掉不包含 GT 的图片,无论类别是否为集合。设置类别只会影响用于训练的标注类别,用户可以自行决定是否过滤不包含 GT 的图片。 +- 直接使用 MMEngine 中的 `BaseDataset` 或者 MMDetection 中的 `BaseDetDataset` 时用户不能通过修改配置来过滤不含 GT 的图片,但是可以通过离线的方式来解决。 +- 当设置数据集中的 `classes` 时,记得修改 `num_classes`。从 v2.9.0 (PR#4508) 之后,我们实现了 [NumClassCheckHook](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/engine/hooks/num_class_check_hook.py) 来检查类别数是否一致。 + +## COCO 全景分割数据集 + +现在我们也支持 COCO Panoptic Dataset,全景注释的格式与 COCO 
格式不同,其前景和背景都将存在于注释文件中。COCO Panoptic 格式的注释 JSON 文件具有以下必要的键: + +```python +'images': [ + { + 'file_name': '000000001268.jpg', + 'height': 427, + 'width': 640, + 'id': 1268 + }, + ... +] + +'annotations': [ + { + 'filename': '000000001268.jpg', + 'image_id': 1268, + 'segments_info': [ + { + 'id':8345037, # One-to-one correspondence with the id in the annotation map. + 'category_id': 51, + 'iscrowd': 0, + 'bbox': (x1, y1, w, h), # The bbox of the background is the outer rectangle of its mask. + 'area': 24315 + }, + ... + ] + }, + ... +] + +'categories': [ # including both foreground categories and background categories + {'id': 0, 'name': 'person'}, + ... + ] +``` + +此外,`seg` 必须设置为全景注释图像的路径。 + +```python +dataset_type = 'CocoPanopticDataset' +data_root='path/to/your/' + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='train/image_data/', seg='train/panoptic/image_annotation_data/') + ) +) +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='val/image_data/', seg='val/panoptic/image_annotation_data/') + ) +) +test_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img='test/image_data/', seg='test/panoptic/image_annotation_data/') + ) +) +``` diff --git a/mmdetection/docs/zh_cn/advanced_guides/customize_losses.md b/mmdetection/docs/zh_cn/advanced_guides/customize_losses.md new file mode 100644 index 0000000..07ccccd --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/customize_losses.md @@ -0,0 +1,125 @@ +# 自定义损失函数 + +MMDetection 为用户提供了不同的损失函数。但是默认的配置可能无法适应不同的数据和模型,所以用户可能会希望修改某一个损失函数来适应新的情况。 + +本教程首先详细的解释计算损失的过程然后给出一些关于如何修改每一个步骤的指导。对损失的修改可以被分为微调和加权。 + +## 一个损失的计算过程 + +给定输入(包括预测和目标,以及权重),损失函数会把输入的张量映射到最后的损失标量。映射过程可以分为下面五个步骤: + +1. 设置采样方法为对正负样本进行采样。 + +2. 通过损失核函数获取**元素**或者**样本**损失。 + +3. 通过权重张量来给损失**逐元素**权重。 + +4. 把损失张量归纳为一个**标量**。 + +5. 
用一个**张量**给当前损失一个权重。 + +## 设置采样方法(步骤 1) + +对于一些损失函数,需要采样策略来避免正负样本之间的不平衡。 + +例如,在RPN head中使用`CrossEntropyLoss`时,我们需要在`train_cfg`中设置`RandomSampler` + +```python +train_cfg=dict( + rpn=dict( + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False)) +``` + +对于其他一些具有正负样本平衡机制的损失,例如 Focal Loss、GHMC 和 QualityFocalLoss,不再需要进行采样。 + +## 微调损失 + +微调一个损失主要与步骤 2,4,5 有关,大部分的修改可以在配置文件中指定。这里我们用 [Focal Loss (FL)](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/losses/focal_loss.py) 作为例子。 +下面的代码分别是构建 FL 的方法和它的配置文件,他们是一一对应的。 + +```python +@LOSSES.register_module() +class FocalLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0): +``` + +```python +loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0) +``` + +### 微调超参数(步骤2) + +`gamma` 和 `beta` 是 Focal Loss 中的两个超参数。如果我们想把 `gamma` 的值设为 1.5,把 `alpha` 的值设为 0.5,我们可以在配置文件中按照如下指定: + +```python +loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=1.5, + alpha=0.5, + loss_weight=1.0) +``` + +### 微调归纳方式(步骤4) + +Focal Loss 默认的归纳方式是 `mean`。如果我们想把归纳方式从 `mean` 改成 `sum`,我们可以在配置文件中按照如下指定: + +```python +loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0, + reduction='sum') +``` + +### 微调损失权重(步骤5) + +这里的损失权重是一个标量,他用来控制多任务学习中不同损失的重要程度,例如,分类损失和回归损失。如果我们想把分类损失的权重设为 0.5,我们可以在配置文件中如下指定: + +```python +loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.5) +``` + +## 加权损失(步骤3) + +加权损失就是我们逐元素修改损失权重。更具体来说,我们给损失张量乘以一个与他有相同形状的权重张量。所以,损失中不同的元素可以被赋予不同的比例,所以这里叫做逐元素。损失的权重在不同模型中变化很大,而且与上下文相关,但是总的来说主要有两种损失权重:分类损失的 `label_weights` 和边界框的 `bbox_weights`。你可以在相应的头中的 `get_target` 方法中找到他们。这里我们使用 [ATSSHead](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/dense_heads/atss_head.py#L322) 作为一个例子。它继承了 [AnchorHead](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/dense_heads/anchor_head.py) ,但是我们重写它的 +`get_targets` 方法来产生不同的 `label_weights` 和 `bbox_weights`。 + +``` +class ATSSHead(AnchorHead): + + ... + + def get_targets(self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + unmap_outputs=True): +``` diff --git a/mmdetection/docs/zh_cn/advanced_guides/customize_models.md b/mmdetection/docs/zh_cn/advanced_guides/customize_models.md new file mode 100644 index 0000000..5fa77e4 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/customize_models.md @@ -0,0 +1,412 @@ +# 自定义模型 + +我们简单地把模型的各个组件分为五类: + +- 主干网络 (backbone):通常是一个用来提取特征图 (feature map) 的全卷积网络 (FCN network),例如:ResNet, MobileNet。 +- Neck:主干网络和 Head 之间的连接部分,例如:FPN, PAFPN。 +- Head:用于具体任务的组件,例如:边界框预测和掩码预测。 +- 区域提取器 (roi extractor):从特征图中提取 RoI 特征,例如:RoI Align。 +- 损失 (loss):在 Head 组件中用于计算损失的部分,例如:FocalLoss, L1Loss, GHMLoss. + +## 开发新的组件 + +### 添加一个新的主干网络 + +这里,我们以 MobileNet 为例来展示如何开发新组件。 + +#### 1. 定义一个新的主干网络(以 MobileNet 为例) + +新建一个文件 `mmdet/models/backbones/mobilenet.py` + +```python +import torch.nn as nn + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class MobileNet(nn.Module): + + def __init__(self, arg1, arg2): + pass + + def forward(self, x): # should return a tuple + pass +``` + +#### 2. 
导入该模块 + +你可以添加下述代码到 `mmdet/models/backbones/__init__.py` + +```python +from .mobilenet import MobileNet +``` + +或添加: + +```python +custom_imports = dict( + imports=['mmdet.models.backbones.mobilenet'], + allow_failed_imports=False) +``` + +到配置文件以避免原始代码被修改。 + +#### 3. 在你的配置文件中使用该主干网络 + +```python +model = dict( + ... + backbone=dict( + type='MobileNet', + arg1=xxx, + arg2=xxx), + ... +``` + +### 添加新的 Neck + +#### 1. 定义一个 Neck(以 PAFPN 为例) + +新建一个文件 `mmdet/models/necks/pafpn.py` + +```python +import torch.nn as nn + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class PAFPN(nn.Module): + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False): + pass + + def forward(self, inputs): + # implementation is ignored + pass +``` + +#### 2. 导入该模块 + +你可以添加下述代码到 `mmdet/models/necks/__init__.py` + +```python +from .pafpn import PAFPN +``` + +或添加: + +```python +custom_imports = dict( + imports=['mmdet.models.necks.pafpn'], + allow_failed_imports=False) +``` + +到配置文件以避免原始代码被修改。 + +#### 3. 修改配置文件 + +```python +neck=dict( + type='PAFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5) +``` + +### 添加新的 Head + +我们以 [Double Head R-CNN](https://arxiv.org/abs/1904.06493) 为例来展示如何添加一个新的 Head。 + +首先,添加一个新的 bbox head 到 `mmdet/models/roi_heads/bbox_heads/double_bbox_head.py`。 +Double Head R-CNN 在目标检测上实现了一个新的 bbox head。为了实现 bbox head,我们需要使用如下的新模块中三个函数。 + +```python +from typing import Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList +from torch import Tensor + +from mmdet.models.backbones.resnet import Bottleneck +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, MultiConfig, OptConfigType, OptMultiConfig +from .bbox_head import BBoxHead + + +@MODELS.register_module() +class DoubleConvFCBBoxHead(BBoxHead): + r"""Bbox head used in Double-Head R-CNN + + .. 
code-block:: none + + /-> cls + /-> shared convs -> + \-> reg + roi features + /-> cls + \-> shared fc -> + \-> reg + """ # noqa: W605 + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 0, + conv_out_channels: int = 1024, + fc_out_channels: int = 1024, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: MultiConfig = dict( + type='Normal', + override=[ + dict(type='Normal', name='fc_cls', std=0.01), + dict(type='Normal', name='fc_reg', std=0.001), + dict( + type='Xavier', + name='fc_branch', + distribution='uniform') + ]), + **kwargs) -> None: + kwargs.setdefault('with_avg_pool', True) + super().__init__(init_cfg=init_cfg, **kwargs) + + def forward(self, x_cls: Tensor, x_reg: Tensor) -> Tuple[Tensor]: + +``` + +然后,如有必要,实现一个新的 bbox head。我们打算从 `StandardRoIHead` 来继承新的 `DoubleHeadRoIHead`。我们可以发现 `StandardRoIHead` 已经实现了下述函数。 + +```python +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .base_roi_head import BaseRoIHead + + +@MODELS.register_module() +class StandardRoIHead(BaseRoIHead): + """Simplest base roi head including one bbox head and one mask head.""" + + def init_assigner_sampler(self) -> None: + + def init_bbox_head(self, bbox_roi_extractor: ConfigType, + bbox_head: ConfigType) -> None: + + def init_mask_head(self, mask_roi_extractor: ConfigType, + mask_head: ConfigType) -> None: + + def forward(self, x: Tuple[Tensor], + rpn_results_list: InstanceList) -> tuple: + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + + def mask_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult], bbox_feats: Tensor, + batch_gt_instances: InstanceList) -> dict: + + def _mask_forward(self, + x: Tuple[Tensor], + rois: Tensor = None, + pos_inds: Optional[Tensor] = None, + bbox_feats: Optional[Tensor] = None) -> dict: + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + +``` + +Double Head 的修改主要在 bbox_forward 的逻辑中,且它从 `StandardRoIHead` 中继承了其他逻辑。在 `mmdet/models/roi_heads/double_roi_head.py` 中,我们用下述代码实现新的 bbox head: + +```python +from typing import Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class DoubleHeadRoIHead(StandardRoIHead): + """RoI head for `Double Head RCNN `_. + + Args: + reg_roi_scale_factor (float): The scale factor to extend the rois + used to extract the regression features. + """ + + def __init__(self, reg_roi_scale_factor: float, **kwargs): + super().__init__(**kwargs) + self.reg_roi_scale_factor = reg_roi_scale_factor + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Box head forward function used in both training and testing. 
+ + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_cls_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + bbox_reg_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], + rois, + roi_scale_factor=self.reg_roi_scale_factor) + if self.with_shared_head: + bbox_cls_feats = self.shared_head(bbox_cls_feats) + bbox_reg_feats = self.shared_head(bbox_reg_feats) + cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats) + + bbox_results = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + bbox_feats=bbox_cls_feats) + return bbox_results +``` + +最终,用户需要把该模块添加到 `mmdet/models/bbox_heads/__init__.py` 和 `mmdet/models/roi_heads/__init__.py` 以使相关的注册表可以找到并加载他们。 + +或者,用户可以添加: + +```python +custom_imports=dict( + imports=['mmdet.models.roi_heads.double_roi_head', 'mmdet.models.roi_heads.bbox_heads.double_bbox_head']) +``` + +到配置文件并实现相同的目的。 + +Double Head R-CNN 的配置文件如下: + +```python +_base_ = '../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model = dict( + roi_head=dict( + type='DoubleHeadRoIHead', + reg_roi_scale_factor=1.3, + bbox_head=dict( + _delete_=True, + type='DoubleConvFCBBoxHead', + num_convs=4, + num_fcs=2, + in_channels=256, + conv_out_channels=1024, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=2.0)))) + +``` + +从 MMDetection 2.0 版本起,配置系统支持继承配置以使用户可以专注于修改。 +Double Head R-CNN 主要使用了一个新的 `DoubleHeadRoIHead` 和一个新的 `DoubleConvFCBBoxHead`,参数需要根据每个模块的 `__init__` 函数来设置。 + +### 添加新的损失 + +假设你想添加一个新的损失 `MyLoss` 用于边界框回归。 +为了添加一个新的损失函数,用户需要在 `mmdet/models/losses/my_loss.py` 中实现。 +装饰器 `weighted_loss` 可以使损失每个部分加权。 + +```python +import torch +import torch.nn as nn + +from mmdet.registry import LOSSES +from .utils import weighted_loss + + +@weighted_loss +def my_loss(pred, target): + assert pred.size() == target.size() and target.numel() > 0 + loss = torch.abs(pred - target) + return loss + +@LOSSES.register_module() +class MyLoss(nn.Module): + + def __init__(self, reduction='mean', loss_weight=1.0): + super(MyLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * my_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox +``` + +然后,用户需要把它加到 `mmdet/models/losses/__init__.py`。 + +```python +from .my_loss import MyLoss, my_loss +``` + +或者,你可以添加: + +```python +custom_imports=dict( + imports=['mmdet.models.losses.my_loss']) +``` + +到配置文件来实现相同的目的。 + +如使用,请修改 `loss_xxx` 字段。 +因为 MyLoss 是用于回归的,你需要在 Head 中修改 `loss_xxx` 字段。 + +```python +loss_bbox=dict(type='MyLoss', loss_weight=1.0)) +``` diff --git a/mmdetection/docs/zh_cn/advanced_guides/customize_runtime.md 
b/mmdetection/docs/zh_cn/advanced_guides/customize_runtime.md
new file mode 100644
index 0000000..d4a1909
--- /dev/null
+++ b/mmdetection/docs/zh_cn/advanced_guides/customize_runtime.md
@@ -0,0 +1,387 @@
+# 自定义训练配置
+
+## 自定义优化相关的配置
+
+优化相关的配置现在已全部集成到 `optim_wrapper` 中,通常包含三个域:`optimizer`,`paramwise_cfg`,`clip_grad`,具体细节见 [OptimWrapper](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.md)。下面这个例子中,使用了 `AdamW` 作为优化器,主干部分的学习率缩小到原来的十分之一,以及添加了梯度裁剪。
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    # 优化器
+    optimizer=dict(
+        type='AdamW',
+        lr=0.0001,
+        weight_decay=0.05,
+        eps=1e-8,
+        betas=(0.9, 0.999)),
+
+    # 参数层面的学习率和正则化设置
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+        },
+        norm_decay_mult=0.0),
+
+    # 梯度裁剪
+    clip_grad=dict(max_norm=0.01, norm_type=2))
+```
+
+### 自定义 PyTorch 中优化器设置
+
+我们已经支持了 PyTorch 中实现的所有优化器,要使用这些优化器,唯一要做的就是修改配置文件中 `optim_wrapper` 里的 `optimizer` 域。比如,如果想要使用 `Adam` 作为优化器(可能会导致性能下降),所需要做的修改如下。
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='Adam', lr=0.0003, weight_decay=0.0001))
+```
+
+要修改模型的学习率,用户只需要修改 `optimizer` 中的 `lr` 域。用户可以直接参考 PyTorch 的 [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) 来进行参数的设置。
+
+### 自定义优化器
+
+#### 1. 定义一个新优化器
+
+自定义优化器可以按如下方式定义:
+
+假设你想要添加一个名为 `MyOptimizer` 的优化器,它包含三个参数 `a`,`b`,`c`。你需要新建一个名为 `mmdet/engine/optimizers` 的文件夹,然后在文件(比如,`mmdet/engine/optimizers/my_optimizer.py`)中实现一个新的优化器。
+
+```python
+from mmdet.registry import OPTIMIZERS
+from torch.optim import Optimizer
+
+
+@OPTIMIZERS.register_module()
+class MyOptimizer(Optimizer):
+
+    def __init__(self, a, b, c):
+
+```
+
+#### 2. 导入自定义的优化器
+
+为了能找到上面所定义的模块,这个模块必须要先导入到主命名空间中。有两种方式可以实现这一点。
+
+- 修改 `mmdet/engine/optimizers/__init__.py` 来导入模块。
+
+  新定义的模块必须导入到 `mmdet/engine/optimizers/__init__.py`,这样注册器才能找到该模块并添加它。
+
+```python
+from .my_optimizer import MyOptimizer
+```
+
+- 在配置文件中使用 `custom_imports` 来手动导入模块。
+
+```python
+custom_imports = dict(imports=['mmdet.engine.optimizers.my_optimizer'], allow_failed_imports=False)
+```
+
+`mmdet.engine.optimizers.my_optimizer` 模块将在程序开始时导入,之后 `MyOptimizer` 类会被自动注册。注意:应该导入 `MyOptimizer` 所在的文件,即 `mmdet.engine.optimizers.my_optimizer`,而不是 `mmdet.engine.optimizers.my_optimizer.MyOptimizer`。
+
+实际上,用户也可以在别的目录结构下导入模块,只要该模块可以在 `PYTHONPATH` 中找到。
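+
+上面的 `MyOptimizer` 只给出了参数签名。下面是一个可以运行的最小实现示意(假设 `a`、`b`、`c` 分别表示学习率、动量和权重衰减,仅用于说明写法,并非 MMDetection 的已有实现;注意优化器的第一个参数必须是待优化的 `params`):
+
+```python
+# 示意代码:a 为学习率,b 为动量,c 为权重衰减(均为本示例假设的含义)
+import torch
+from torch.optim import Optimizer
+
+from mmdet.registry import OPTIMIZERS
+
+
+@OPTIMIZERS.register_module()
+class MyOptimizer(Optimizer):
+
+    def __init__(self, params, a=0.01, b=0.9, c=0.0001):
+        defaults = dict(a=a, b=b, c=c)
+        super().__init__(params, defaults)
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        loss = closure() if closure is not None else None
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                d_p = p.grad
+                if group['c'] != 0:
+                    # 权重衰减
+                    d_p = d_p.add(p, alpha=group['c'])
+                buf = self.state[p].get('momentum_buffer')
+                if buf is None:
+                    buf = torch.clone(d_p).detach()
+                    self.state[p]['momentum_buffer'] = buf
+                else:
+                    # 动量累积
+                    buf.mul_(group['b']).add_(d_p)
+                # 按学习率更新参数
+                p.add_(buf, alpha=-group['a'])
+        return loss
+```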
+
+#### 3. 在配置文件中指定优化器
+
+接下来,你可以在配置文件 `optim_wrapper` 域中的 `optimizer` 域中设置你实现的优化器 `MyOptimizer`。在配置文件中,优化器在 `optimizer` 域中的配置方式如下:
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001))
+```
+
+为了使用你的优化器,可以进行如下修改:
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value))
+```
+
+### 自定义优化器包装构造类
+
+一些模型可能存在一些特定参数的优化设置,比如,BN 层的权重衰减。用户可以通过自定义优化器包装构造类来实现这些精细化的参数调整。
+
+```python
+from typing import Optional
+
+import torch.nn as nn
+from mmengine.optim import DefaultOptimWrapperConstructor, OptimWrapper
+
+from mmdet.registry import OPTIM_WRAPPER_CONSTRUCTORS
+from .my_optimizer import MyOptimizer
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class MyOptimizerWrapperConstructor(DefaultOptimWrapperConstructor):
+
+    def __init__(self,
+                 optim_wrapper_cfg: dict,
+                 paramwise_cfg: Optional[dict] = None):
+
+    def __call__(self, model: nn.Module) -> OptimWrapper:
+
+        return optim_wrapper
+
+```
+
+优化器包装构造类的具体实现见[这里](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L18),用户可以以它为模板来实现新的优化器包装构造类。
+
+### 额外的设置
+
+一些优化器没有实现的技巧(比如,参数层面的学习率设置)应该通过优化器包装构造类或者钩子来实现。我们列出了一些常用的、用于稳定训练或者加速训练的设置。欢迎创建 PR,发布更多设置。
+
+- __使用梯度裁剪来稳定训练__:
+  一些模型需要进行梯度裁剪来稳定训练过程,例子如下:
+
+  ```python
+  optim_wrapper = dict(
+      _delete_=True, clip_grad=dict(max_norm=35, norm_type=2))
+  ```
+
+  如果你的配置继承了已经设置 `optim_wrapper` 的基础配置,那么你需要添加 `_delete_=True` 来覆盖掉其中不需要的设置。具体见[配置相关的文档](https://mmdetection.readthedocs.io/en/latest/tutorials/config.html)。
+
+- __使用动量调度加速模型收敛__:
+  我们支持动量调度器根据学习率修改模型的动量,这可以使模型以更快的方式收敛。动量调度器通常与学习率调度器一起使用,例如 [3D 检测](https://github.com/open-mmlab/mmdetection3d/blob/dev-1.x/configs/_base_/schedules/cyclic-20e.py) 中使用以下配置以加速收敛。
+  更多细节请参考 [CosineAnnealingLR](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L43) 和 [CosineAnnealingMomentum](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py#L71) 的具体实现。
+
+  ```python
+  param_scheduler = [
+      # 学习率调度器
+      # 在前 8 个 epoch, 学习率从 0 增大到 lr * 10
+      # 在接下来 12 个 epoch, 学习率从 lr * 10 减小到 lr * 1e-4
+      dict(
+          type='CosineAnnealingLR',
+          T_max=8,
+          eta_min=lr * 10,
+          begin=0,
+          end=8,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      dict(
+          type='CosineAnnealingLR',
+          T_max=12,
+          eta_min=lr * 1e-4,
+          begin=8,
+          end=20,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      # 动量调度器
+      # 在前 8 个 epoch, 动量从 0 增大到 0.85 / 0.95
+      # 在接下来 12 个 epoch, 动量从 0.85 / 0.95 增大到 1
+      dict(
+          type='CosineAnnealingMomentum',
+          T_max=8,
+          eta_min=0.85 / 0.95,
+          begin=0,
+          end=8,
+          by_epoch=True,
+          convert_to_iter_based=True),
+      dict(
+          type='CosineAnnealingMomentum',
+          T_max=12,
+          eta_min=1,
+          begin=8,
+          end=20,
+          by_epoch=True,
+          convert_to_iter_based=True)
+  ]
+  ```
+
+## 自定义训练策略
+
+默认情况下,我们使用 1x 的学习率调整策略,这会调用 MMEngine 中的 [MultiStepLR](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py#L139)。
+我们支持许多其他学习率调整策略,具体见[这里](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py),例如 `CosineAnnealingLR` 和 `PolyLR` 策略。下面是一些例子:
+
+- 多项式学习率调整策略:
+
+  ```python
+  param_scheduler = [
+      dict(
+          type='PolyLR',
+          power=0.9,
+          eta_min=1e-4,
+          begin=0,
+          end=8,
+          by_epoch=True)]
+  ```
+
+- 余弦退火学习率调整策略:
+
+  ```python
+  param_scheduler = [
+      dict(
+          type='CosineAnnealingLR',
+          T_max=8,
+          eta_min=lr * 1e-5,
+          begin=0,
+          end=8,
+          by_epoch=True)]
+
+  ```
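+
+实际使用中,上述调度器通常会与学习率预热一起配置。下面给出一个将线性预热(`LinearLR`)与 `MultiStepLR` 组合的参考写法(与默认 1x 策略的配置基本一致,具体数值仅供参考):
+
+```python
+param_scheduler = [
+    # 线性预热:前 500 个 iteration 将学习率从 lr * 0.001 逐渐提升到 lr
+    dict(type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    # 之后按 epoch 使用 multi step 策略,在第 8 和 11 个 epoch 衰减学习率
+    dict(
+        type='MultiStepLR',
+        by_epoch=True,
+        begin=0,
+        end=12,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+```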
+
+## 自定义训练循环
+
+默认情况下,在 `train_cfg` 中使用 `EpochBasedTrainLoop`,并且在每个 epoch 训练之后进行验证,如下所示。
+
+```python
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_begin=1, val_interval=1)
+```
+
+实际上,[`IterBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L183) 和 [`EpochBasedTrainLoop`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L18) 都支持以动态区间的方式进行验证,见下例。
+
+```python
+# 在第 365001 次迭代之前,我们每 5000 次迭代进行一次评估。
+# 在第 365000 次迭代后,我们每 368750 次迭代进行一次评估,
+# 这意味着我们在训练结束时进行评估。
+
+interval = 5000
+max_iters = 368750
+dynamic_intervals = [(max_iters // interval * interval + 1, max_iters)]
+train_cfg = dict(
+    type='IterBasedTrainLoop',
+    max_iters=max_iters,
+    val_interval=interval,
+    dynamic_intervals=dynamic_intervals)
+```
+
+## 自定义钩子
+
+### 自定义自行实现的钩子
+
+#### 1. 实现一个新的钩子
+
+MMEngine 提供了许多有用的[钩子](https://mmdetection.readthedocs.io/en/latest/tutorials/hooks.html),但在某些情况下用户可能需要实现新的钩子。MMDetection 在 v3.0 中支持自定义钩子。因此,用户可以直接在 mmdet 或其基于 mmdet 的代码库中实现钩子,只需在训练时修改配置即可使用它。
+这里我们给出一个在 mmdet 中创建新钩子并在训练中使用它的例子。
+
+```python
+from typing import Optional, Union
+
+from mmengine.hooks import Hook
+
+from mmdet.registry import HOOKS
+
+DATA_BATCH = Optional[Union[dict, tuple, list]]
+
+
+@HOOKS.register_module()
+class MyHook(Hook):
+
+    def __init__(self, a, b):
+
+    def before_run(self, runner) -> None:
+
+    def after_run(self, runner) -> None:
+
+    def before_train(self, runner) -> None:
+
+    def after_train(self, runner) -> None:
+
+    def before_train_epoch(self, runner) -> None:
+
+    def after_train_epoch(self, runner) -> None:
+
+    def before_train_iter(self,
+                          runner,
+                          batch_idx: int,
+                          data_batch: DATA_BATCH = None) -> None:
+
+    def after_train_iter(self,
+                         runner,
+                         batch_idx: int,
+                         data_batch: DATA_BATCH = None,
+                         outputs: Optional[dict] = None) -> None:
+```
+
+根据钩子的功能,用户需要在 `before_run`、`after_run`、`before_train`、`after_train`、`before_train_epoch`、`after_train_epoch`、`before_train_iter` 和 `after_train_iter` 等方法中指定钩子在训练各个阶段要执行的操作。还有更多可以插入钩子的点,更多细节请参考 [base hook class](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py#L9)。
+
+#### 2. 注册新钩子
+
+然后我们需要导入 `MyHook`。假设该文件位于 `mmdet/engine/hooks/my_hook.py` 中,有两种方法可以做到这一点:
+
+- 修改 `mmdet/engine/hooks/__init__.py` 以导入它。
+
+  新定义的模块应该在 `mmdet/engine/hooks/__init__.py` 中导入,以便注册表找到新模块并添加它:
+
+```python
+from .my_hook import MyHook
+```
+
+- 在配置中使用 `custom_imports` 手动导入它
+
+```python
+custom_imports = dict(imports=['mmdet.engine.hooks.my_hook'], allow_failed_imports=False)
+```
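+
+为了说明这些方法如何填充,下面给出一个最小的实现示意:假设 `a` 表示检查间隔(迭代数),`b` 表示损失阈值(两者的含义仅为本示例的假设),该钩子会周期性地检查训练损失是否异常。
+
+```python
+import torch
+from mmengine.hooks import Hook
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class MyHook(Hook):
+    """示例钩子:每 a 次迭代检查一次损失是否为非有限值或超过阈值 b(仅为示意)。"""
+
+    def __init__(self, a=50, b=100.0):
+        self.a = a
+        self.b = b
+
+    def after_train_iter(self,
+                         runner,
+                         batch_idx,
+                         data_batch=None,
+                         outputs=None):
+        if outputs is None or not self.every_n_train_iters(runner, self.a):
+            return
+        for name, value in outputs.items():
+            if not isinstance(value, torch.Tensor) or value.numel() != 1:
+                continue
+            if not torch.isfinite(value).all() or value.item() > self.b:
+                runner.logger.warning(
+                    f'第 {runner.iter} 次迭代:{name}={value.item():.4f},'
+                    '请检查训练是否稳定。')
+```
+
+#### 3. 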
修改配置 + +```python +custom_hooks = [ + dict(type='MyHook', a=a_value, b=b_value) +] +``` + +你还可以通过修改键 `priority` 的值为 `NORMAL` 或 `HIGHEST` 来设置挂钩的优先级,如下所示 + +```python +custom_hooks = [ + dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL') +] +``` + +默认情况下,钩子的优先级在注册期间设置为 `NORMAL`。 + +### 使用 MMDetection 中实现的钩子 + +如果 MMDetection 中已经实现了该钩子,你可以直接修改配置以使用该钩子,如下所示 + +#### 例子: `NumClassCheckHook` + +我们实现了一个名为 [NumClassCheckHook](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/engine/hooks/num_class_check_hook.py) 的自定义钩子来检查 `num_classes` 是否在 head 中和 `dataset` 中的 `classes` 的长度相匹配。 + +我们在 [default_runtime.py](https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/default_runtime.py) 中设置它。 + +```python +custom_hooks = [dict(type='NumClassCheckHook')] +``` + +### 修改默认运行时钩子 + +有一些常见的钩子是通过 `default_hooks` 注册的,它们是 + +- `IterTimerHook`:记录 “data_time” 用于加载数据和 “time” 用于模型训练步骤的钩子。 +- `LoggerHook`:从`Runner`的不同组件收集日志并将它们写入终端、JSON文件、tensorboard和 wandb 等的钩子。 +- `ParamSchedulerHook`:更新优化器中一些超参数的钩子,例如学习率和动量。 +- `CheckpointHook`:定期保存检查点的钩子。 +- `DistSamplerSeedHook`:为采样器和批处理采样器设置种子的钩子。 +- `DetVisualizationHook`:用于可视化验证和测试过程预测结果的钩子。 + +`IterTimerHook`、`ParamSchedulerHook` 和 `DistSamplerSeedHook` 很简单,通常不需要修改,所以这里我们将展示如何使用 `LoggerHook`、`CheckpointHook` 和 `DetVisualizationHook`。 + +#### CheckpointHook + +除了定期保存检查点,[`CheckpointHook`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L19) 提供了其他选项,例如`max_keep_ckpts`、`save_optimizer ` 等。用户可以设置 `max_keep_ckpts` 只保存少量检查点或通过 `save_optimizer` 决定是否存储优化器的状态字典。参数的更多细节在[这里](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L19)可以找到。 + +```python +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=1, + max_keep_ckpts=3, + save_optimizer=True)) +``` + +#### LoggerHook + +`LoggerHook` 可以设置间隔。详细用法可以在 [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py#L18) 中找到。 + +```python +default_hooks = dict(logger=dict(type='LoggerHook', interval=50)) +``` + +#### DetVisualizationHook + +`DetVisualizationHook` 使用 `DetLocalVisualizer` 来可视化预测结果,`DetLocalVisualizer` 支持不同的后端,例如 `TensorboardVisBackend` 和 `WandbVisBackend` (见 [docstring](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py) 了解更多细节)。用户可以添加多个后端来进行可视化,如下所示。 + +```python +default_hooks = dict( + visualization=dict(type='DetVisualizationHook', draw=True)) + +vis_backends = [dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') +``` diff --git a/mmdetection/docs/zh_cn/advanced_guides/customize_transforms.md b/mmdetection/docs/zh_cn/advanced_guides/customize_transforms.md new file mode 100644 index 0000000..aa40717 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/customize_transforms.md @@ -0,0 +1,47 @@ +# 自定义数据预处理流程 + +1. 在任意文件里写一个新的流程,例如在 `my_pipeline.py`,它以一个字典作为输入并且输出一个字典: + + ```python + import random + from mmcv.transforms import BaseTransform + from mmdet.registry import TRANSFORMS + + + @TRANSFORMS.register_module() + class MyTransform(BaseTransform): + """Add your transform + + Args: + p (float): Probability of shifts. Default 0.5. + """ + + def __init__(self, prob=0.5): + self.prob = prob + + def transform(self, results): + if random.random() > self.prob: + results['dummy'] = True + return results + ``` + +2. 
在配置文件里调用并使用你写的数据处理流程,需要确保你的训练脚本能够正确导入新增模块: + + ```python + custom_imports = dict(imports=['path.to.my_pipeline'], allow_failed_imports=False) + + train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='MyTransform', prob=0.2), + dict(type='PackDetInputs') + ] + ``` + +3. 可视化数据增强处理流程的结果 + + 如果想要可视化数据增强处理流程的结果,可以使用 `tools/misc/browse_dataset.py` 直观 + 地浏览检测数据集(图像和标注信息),或将图像保存到指定目录。 + 使用方法请参考[可视化文档](../user_guides/visualization.md) diff --git a/mmdetection/docs/zh_cn/advanced_guides/data_flow.md b/mmdetection/docs/zh_cn/advanced_guides/data_flow.md new file mode 100644 index 0000000..ccc734f --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/data_flow.md @@ -0,0 +1 @@ +# 数据流(待更新) diff --git a/mmdetection/docs/zh_cn/advanced_guides/datasets.md b/mmdetection/docs/zh_cn/advanced_guides/datasets.md new file mode 100644 index 0000000..16cc9bf --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/datasets.md @@ -0,0 +1 @@ +# 数据集(待更新) diff --git a/mmdetection/docs/zh_cn/advanced_guides/engine.md b/mmdetection/docs/zh_cn/advanced_guides/engine.md new file mode 100644 index 0000000..fa1a256 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/engine.md @@ -0,0 +1 @@ +# 执行引擎(待更新) diff --git a/mmdetection/docs/zh_cn/advanced_guides/evaluation.md b/mmdetection/docs/zh_cn/advanced_guides/evaluation.md new file mode 100644 index 0000000..0b49544 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/evaluation.md @@ -0,0 +1 @@ +# 精度评测(待更新) diff --git a/mmdetection/docs/zh_cn/advanced_guides/how_to.md b/mmdetection/docs/zh_cn/advanced_guides/how_to.md new file mode 100644 index 0000000..6705daf --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/how_to.md @@ -0,0 +1,220 @@ +本教程收集了任何如何使用 MMDetection 进行 xxx 的答案。 如果您遇到有关`如何做`的问题及答案,请随时更新此文档! 
+ +## 使用 MMPretrain 的骨干网络 + +MMDet、MMPretrain、MMSeg 中的模型注册表都继承自 MMEngine 中的根注册表,允许这些存储库直接使用彼此已经实现的模块。 因此用户可以在 MMDetection 中使用来自 MMPretrain 的骨干网络,而无需实现MMPretrain 中已经存在的网络。 + +### 使用在 MMPretrain 中实现的骨干网络 + +假设想将 `MobileNetV3-small` 作为 `RetinaNet` 的骨干网络,则配置文件如下。 + +```python +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict(imports=['mmpretrain.models'], allow_failed_imports=False) +pretrained = 'https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth' +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmpretrain.MobileNetV3', # 使用 mmpretrain 中的 MobileNetV3 + arch='small', + out_indices=(3, 8, 11), # 修改 out_indices + init_cfg=dict( + type='Pretrained', + checkpoint=pretrained, + prefix='backbone.')), # mmpretrain 中骨干网络的预训练权重含义 prefix='backbone.',为了正常加载权重,需要把这个 prefix 去掉。 + # 修改 in_channels + neck=dict(in_channels=[24, 48, 96], start_level=0)) +``` + +### 通过 MMPretrain 使用 TIMM 中实现的骨干网络 + +由于 MMPretrain 提供了 Py**T**orch **Im**age **M**odels (`timm`) 骨干网络的封装,用户也可以通过 MMPretrain 直接使用 `timm` 中的骨干网络。假设想将 [`EfficientNet-B1`](../../../configs/timm_example/retinanet_timm-efficientnet-b1_fpn_1x_coco.py) 作为 `RetinaNet` 的骨干网络,则配置文件如下。 + +```python +# https://github.com/open-mmlab/mmdetection/blob/main/configs/timm_example/retinanet_timm_efficientnet_b1_fpn_1x_coco.py +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] + +# please install mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict(imports=['mmpretrain.models'], allow_failed_imports=False) +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmpretrain.TIMMBackbone', # 使用 mmpretrain 中 timm 骨干网络 + model_name='efficientnet_b1', + features_only=True, + pretrained=True, + out_indices=(1, 2, 3, 4)), # 修改 out_indices + neck=dict(in_channels=[24, 40, 112, 320])) # 修改 in_channels + +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +``` + +`type='mmpretrain.TIMMBackbone'` 表示在 MMDetection 中使用 MMPretrain 中的 `TIMMBackbone` 类,并且使用的模型为` EfficientNet-B1`,其中 `mmpretrain` 表示 MMPretrain 库,而 `TIMMBackbone ` 表示 MMPretrain 中实现的 TIMMBackbone 包装器。 + +关于层次注册器的具体原理可以参考 [MMEngine 文档](https://mmengine.readthedocs.io/zh_cn/latest/tutorials/config.md#跨项目继承配置文件),关于如何使用 MMPretrain 中的其他 backbone,可以参考 [MMPretrain 文档](https://mmpretrain.readthedocs.io/en/latest/user_guides/config.html)。 + +## 使用马赛克数据增强 + +如果你想在训练中使用 `Mosaic`,那么请确保你同时使用 `MultiImageMixDataset`。以 `Faster R-CNN` 算法为例,你可以通过如下做法实现: + +```python +# 直接打开 configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py ,增添如下字段 +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +img_scale=(1333, 800) + +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), # 图像经过马赛克处理后会放大4倍,所以我们使用仿射变换来恢复图像的大小。 + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs')) +] + +train_dataset = dict( + _delete_ = True, # 删除不必要的设置 + type='MultiImageMixDataset', + dataset=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + 
img_prefix=data_root + 'train2017/',
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='LoadAnnotations', with_bbox=True)
+        ],
+        filter_empty_gt=False,
+    ),
+    pipeline=train_pipeline
+    )
+
+data = dict(
+    train=train_dataset
+    )
+```
+
+## 在配置文件中冻结骨干网络后在训练中解冻骨干网络
+
+如果你在配置文件中已经冻结了骨干网络并希望在几个训练周期后解冻它,你可以通过 hook 来实现这个功能。以用 ResNet 为骨干网络的 Faster R-CNN 为例,你可以冻结骨干网络的一个 stage 并在配置文件中添加如下 `custom_hooks`:
+
+```python
+_base_ = [
+    '../_base_/models/faster-rcnn_r50_fpn.py',
+    '../_base_/datasets/coco_detection.py',
+    '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py'
+]
+model = dict(
+    # freeze one stage of the backbone network.
+    backbone=dict(frozen_stages=1),
+)
+custom_hooks = [dict(type="UnfreezeBackboneEpochBasedHook", unfreeze_epoch=1)]
+```
+
+同时在 `mmdet/engine/hooks/unfreeze_backbone_epoch_based_hook.py` 当中实现 `UnfreezeBackboneEpochBasedHook` 类:
+
+```python
+from mmengine.model import is_model_wrapper
+from mmengine.hooks import Hook
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class UnfreezeBackboneEpochBasedHook(Hook):
+    """Unfreeze backbone network Hook.
+
+    Args:
+        unfreeze_epoch (int): The epoch unfreezing the backbone network.
+    """
+
+    def __init__(self, unfreeze_epoch=1):
+        self.unfreeze_epoch = unfreeze_epoch
+
+    def before_train_epoch(self, runner):
+        # Unfreeze the backbone network.
+        # Only valid for resnet.
+        if runner.epoch == self.unfreeze_epoch:
+            model = runner.model
+            if is_model_wrapper(model):
+                model = model.module
+            backbone = model.backbone
+            if backbone.frozen_stages >= 0:
+                if backbone.deep_stem:
+                    backbone.stem.train()
+                    for param in backbone.stem.parameters():
+                        param.requires_grad = True
+                else:
+                    backbone.norm1.train()
+                    for m in [backbone.conv1, backbone.norm1]:
+                        for param in m.parameters():
+                            param.requires_grad = True
+
+                for i in range(1, backbone.frozen_stages + 1):
+                    m = getattr(backbone, f'layer{i}')
+                    m.train()
+                    for param in m.parameters():
+                        param.requires_grad = True
+```
+
+## 获得新的骨干网络的通道数
+
+如果你想获得一个新骨干网络的通道数,你可以单独构建这个骨干网络并输入一个伪造的图片来获取每一个阶段的输出。
+
+以 `ResNet` 为例:
+
+```python
+import torch
+
+from mmdet.models import ResNet
+
+model = ResNet(depth=18)
+model.eval()
+inputs = torch.rand(1, 3, 32, 32)
+level_outputs = model.forward(inputs)
+for level_out in level_outputs:
+    print(tuple(level_out.shape))
+
+```
+
+以上脚本的输出为:
+
+```python
+(1, 64, 8, 8)
+(1, 128, 4, 4)
+(1, 256, 2, 2)
+(1, 512, 1, 1)
+```
+
+用户可以通过将脚本中的 `ResNet(depth=18)` 替换为自己的骨干网络配置来得到新的骨干网络的通道数。
+
+## 在 MMDetection 中训练 Detectron2 的模型
+
+用户可以使用 `Detectron2Wrapper` 从而在 MMDetection 中使用 Detectron2 的模型。
+我们提供了 [Faster R-CNN](../../../configs/misc/d2_faster-rcnn_r50-caffe_fpn_ms-90k_coco.py),
+[Mask R-CNN](../../../configs/misc/d2_mask-rcnn_r50-caffe_fpn_ms-90k_coco.py) 和 [RetinaNet](../../../configs/misc/d2_retinanet_r50-caffe_fpn_ms-90k_coco.py) 的示例来在 MMDetection 中训练/测试 Detectron2 的模型。
+
+使用过程中需要注意配置文件中算法组件要和 Detectron2 中的相同。模型初始化时,我们首先初始化 [Detectron2](https://github.com/facebookresearch/detectron2/blob/main/detectron2/config/defaults.py) 的默认设置,然后配置文件中的设置将覆盖默认设置,模型将基于更新过的设置来建立。
+输入数据首先转换成 Detectron2 的类型并输入进 Detectron2 的模型中。在推理阶段,Detectron2 的模型结果将会转换回 MMDetection 的类型。
+
+### 使用 Detectron2 的预训练权重
+
+`Detectron2Wrapper` 中的权重初始化将不使用 MMDetection 的逻辑。用户可以设置 `model.d2_detector.weights=xxx` 来加载预训练的权重。
+例如,我们可以使用 `model.d2_detector.weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl'` 来加载 ResNet-50 的预训练权重,或者使用
+`model.d2_detector.weights='detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl'` 来加载 Detectron2 中提出的预训练的 Mask 
R-CNN权重。 + +**注意:** 不能直接使用 `load_from` 来加载 Detectron2 的预训练模型,但可以通过 `tools/model_converters/detectron2_to_mmdet.py` 先对该预训练模型进行转换。 + +在测试时,用户应该首先使用 `tools/model_converters/detectron2_to_mmdet.py` 将 Detectron2 的预训练权重转换为 MMDetection 可读取的结构。 + +```shell +python tools/model_converters/detectron2_to_mmdet.py ${Detectron2 ckpt path} ${MMDetectron ckpt path}。 +``` diff --git a/mmdetection/docs/zh_cn/advanced_guides/index.rst b/mmdetection/docs/zh_cn/advanced_guides/index.rst new file mode 100644 index 0000000..8e92539 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/index.rst @@ -0,0 +1,34 @@ +基础概念 +*************** + +.. toctree:: + :maxdepth: 1 + + data_flow.md + structures.md + models.md + datasets.md + transforms.md + evaluation.md + engine.md + conventions.md + +组件定制 +************************ + +.. toctree:: + :maxdepth: 1 + + customize_models.md + customize_losses.md + customize_dataset.md + customize_transforms.md + customize_runtime.md + +How to +************************ + +.. toctree:: + :maxdepth: 1 + + how_to.md diff --git a/mmdetection/docs/zh_cn/advanced_guides/models.md b/mmdetection/docs/zh_cn/advanced_guides/models.md new file mode 100644 index 0000000..c5119d0 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/models.md @@ -0,0 +1 @@ +# 模型(待更新) diff --git a/mmdetection/docs/zh_cn/advanced_guides/structures.md b/mmdetection/docs/zh_cn/advanced_guides/structures.md new file mode 100644 index 0000000..c2118c3 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/structures.md @@ -0,0 +1 @@ +# 数据结构(待更新) diff --git a/mmdetection/docs/zh_cn/advanced_guides/transforms.md b/mmdetection/docs/zh_cn/advanced_guides/transforms.md new file mode 100644 index 0000000..07d7db2 --- /dev/null +++ b/mmdetection/docs/zh_cn/advanced_guides/transforms.md @@ -0,0 +1,43 @@ +# 数据变换(待更新) + +按照惯例,我们使用 `Dataset` 和 `DataLoader` 进行多进程的数据加载。`Dataset` 返回字典类型的数据,数据内容为模型 `forward` 方法的各个参数。由于在目标检测中,输入的图像数据具有不同的大小,我们在 `MMCV` 里引入一个新的 `DataContainer` 类去收集和分发不同大小的输入数据。更多细节请参考[这里](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py)。 + +数据的准备流程和数据集是解耦的。通常一个数据集定义了如何处理标注数据(annotations)信息,而一个数据流程定义了准备一个数据字典的所有步骤。一个流程包括一系列的操作,每个操作都把一个字典作为输入,然后再输出一个新的字典给下一个变换操作。 + +我们在下图展示了一个经典的数据处理流程。蓝色块是数据处理操作,随着数据流程的处理,每个操作都可以在结果字典中加入新的键(标记为绿色)或更新现有的键(标记为橙色)。 + +![pipeline figure](../../../resources/data_pipeline.png) + +这些操作可以分为数据加载(data loading)、预处理(pre-processing)、格式变化(formatting)和测试时数据增强(test-time augmentation)。 + +下面的例子是 `Faster R-CNN` 的一个流程: + +```python +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +``` diff --git a/mmdetection/docs/zh_cn/api.rst b/mmdetection/docs/zh_cn/api.rst new file mode 100644 index 0000000..1b12732 --- /dev/null +++ b/mmdetection/docs/zh_cn/api.rst @@ -0,0 
+1,161 @@ +mmdet.apis +-------------- +.. automodule:: mmdet.apis + :members: + +mmdet.datasets +-------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmdet.datasets + :members: + +api_wrappers +^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.datasets.api_wrappers + :members: + +samplers +^^^^^^^^^^ +.. automodule:: mmdet.datasets.samplers + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmdet.datasets.transforms + :members: + +mmdet.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmdet.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmdet.engine.optimizers + :members: + +runner +^^^^^^^^^^ +.. automodule:: mmdet.engine.runner + :members: + +schedulers +^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.engine.schedulers + :members: + +mmdet.evaluation +-------------------- + +functional +^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.evaluation.functional + :members: + +metrics +^^^^^^^^^^ +.. automodule:: mmdet.evaluation.metrics + :members: + + +mmdet.models +-------------- + +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.models.backbones + :members: + +data_preprocessors +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.models.data_preprocessors + :members: + +dense_heads +^^^^^^^^^^^^^^^ +.. automodule:: mmdet.models.dense_heads + :members: + +detectors +^^^^^^^^^^ +.. automodule:: mmdet.models.detectors + :members: + +layers +^^^^^^^^^^ +.. automodule:: mmdet.models.layers + :members: + +losses +^^^^^^^^^^ +.. automodule:: mmdet.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmdet.models.necks + :members: + +roi_heads +^^^^^^^^^^^^^ +.. automodule:: mmdet.models.roi_heads + :members: + +seg_heads +^^^^^^^^^^^^^ +.. automodule:: mmdet.models.seg_heads + :members: + +task_modules +^^^^^^^^^^^^^ +.. automodule:: mmdet.models.task_modules + :members: + +test_time_augs +^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.models.test_time_augs + :members: + +utils +^^^^^^^^^^ +.. automodule:: mmdet.models.utils + :members: + + +mmdet.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmdet.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmdet.structures.bbox + :members: + +mask +^^^^^^^^^^ +.. automodule:: mmdet.structures.mask + :members: + +mmdet.testing +---------------- +.. automodule:: mmdet.testing + :members: + +mmdet.visualization +-------------------- +.. automodule:: mmdet.visualization + :members: + +mmdet.utils +-------------- +.. 
automodule:: mmdet.utils + :members: diff --git a/mmdetection/docs/zh_cn/article.md b/mmdetection/docs/zh_cn/article.md new file mode 100644 index 0000000..3b69830 --- /dev/null +++ b/mmdetection/docs/zh_cn/article.md @@ -0,0 +1,53 @@ +## 中文解读文案汇总(待更新) + +### 1 官方解读文案(v2.x) + +#### 1.1 框架解读 + +- **[轻松掌握 MMDetection 整体构建流程(一)](https://zhuanlan.zhihu.com/p/337375549)** +- **[轻松掌握 MMDetection 整体构建流程(二)](https://zhuanlan.zhihu.com/p/341954021)** +- **[轻松掌握 MMDetection 中 Head 流程](https://zhuanlan.zhihu.com/p/343433169)** + +#### 1.2 算法解读 + +- **[轻松掌握 MMDetection 中常用算法(一):RetinaNet 及配置详解](https://zhuanlan.zhihu.com/p/346198300)** +- **[轻松掌握 MMDetection 中常用算法(二):Faster R-CNN|Mask R-CNN](https://zhuanlan.zhihu.com/p/349807581)** +- [轻松掌握 MMDetection 中常用算法(三):FCOS](https://zhuanlan.zhihu.com/p/358056615) +- [轻松掌握 MMDetection 中常用算法(四):ATSS](https://zhuanlan.zhihu.com/p/358125611) +- [轻松掌握 MMDetection 中常用算法(五):Cascade R-CNN](https://zhuanlan.zhihu.com/p/360952172) +- [轻松掌握 MMDetection 中常用算法(六):YOLOF](https://zhuanlan.zhihu.com/p/370758213) +- [轻松掌握 MMDetection 中常用算法(七):CenterNet](https://zhuanlan.zhihu.com/p/374891478) +- [轻松掌握 MMDetection 中常用算法(八):YOLACT](https://zhuanlan.zhihu.com/p/376347955) +- [轻松掌握 MMDetection 中常用算法(九):AutoAssign](https://zhuanlan.zhihu.com/p/378581552) +- [YOLOX 在 MMDetection 中复现全流程解析](https://zhuanlan.zhihu.com/p/398545304) +- [喂喂喂!你可以减重了!小模型 - MMDetection 新增SSDLite 、 MobileNetV2YOLOV3 两大经典算法](https://zhuanlan.zhihu.com/p/402781143) + +#### 1.3 工具解读 + +- [OpenMMLab 中混合精度训练 AMP 的正确打开方式](https://zhuanlan.zhihu.com/p/375224982) +- [小白都能看懂!手把手教你使用混淆矩阵分析目标检测](https://zhuanlan.zhihu.com/p/443499860) +- [MMDetection 图像缩放 Resize 详细说明 OpenMMLab](https://zhuanlan.zhihu.com/p/381117525) +- [拿什么拯救我的 4G 显卡](https://zhuanlan.zhihu.com/p/430123077) +- [MMDet居然能用MMCls的Backbone?论配置文件的打开方式](https://zhuanlan.zhihu.com/p/436865195) + +#### 1.4 知乎问答 + +- [COCO数据集上1x模式下为什么不采用多尺度训练?](https://www.zhihu.com/question/462170786/answer/1915119662) +- [MMDetection中SOTA论文源码中将训练过程中BN层的eval打开?](https://www.zhihu.com/question/471189603/answer/2195540892) +- [基于PyTorch的MMDetection中训练的随机性来自何处?](https://www.zhihu.com/question/453511684/answer/1839683634) +- [单阶段、双阶段、anchor-based、anchor-free 这四者之间有什么联系吗?](https://www.zhihu.com/question/428972054/answer/1619925296) +- [目标检测的深度学习方法,有推荐的书籍或资料吗?](https://www.zhihu.com/question/391577080/answer/1612593817) +- [大佬们,刚入学研究生,想入门目标检测,有什么学习路线可以入门的?](https://www.zhihu.com/question/343768934/answer/1612580715) +- [目标检测领域还有什么可以做的?](https://www.zhihu.com/question/280703314/answer/1627885518) +- [如何看待Transformer在CV上的应用前景,未来有可能替代CNN吗?](https://www.zhihu.com/question/437495132/answer/1686380553) +- [MMDetection如何学习源码?](https://www.zhihu.com/question/451585041/answer/1832498963) +- [如何具体上手实现目标检测呢?](https://www.zhihu.com/question/341401981/answer/1848561187) + +#### 1.5 其他 + +- **[不得不知的 MMDetection 学习路线(个人经验版)](https://zhuanlan.zhihu.com/p/369826931)** +- [OpenMMLab 社区专访之 YOLOX 复现篇](https://zhuanlan.zhihu.com/p/405913343) + +### 2 社区解读文案(v2.x) + +- [手把手带你实现经典检测网络 Mask R-CNN 的推理](https://zhuanlan.zhihu.com/p/414082071) diff --git a/mmdetection/docs/zh_cn/conf.py b/mmdetection/docs/zh_cn/conf.py new file mode 100644 index 0000000..e687840 --- /dev/null +++ b/mmdetection/docs/zh_cn/conf.py @@ -0,0 +1,118 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import subprocess +import sys + +import pytorch_sphinx_theme + +sys.path.insert(0, os.path.abspath('../../')) + +# -- Project information ----------------------------------------------------- + +project = 'MMDetection' +copyright = '2018-2021, OpenMMLab' +author = 'MMDetection Authors' +version_file = '../../mmdet/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +# The full version, including alpha/beta/rc tags +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_markdown_tables', + 'sphinx_copybutton', +] + +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + +autodoc_mock_imports = [ + 'matplotlib', 'pycocotools', 'terminaltables', 'mmdet.version', 'mmcv.ops' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# The main toctree document. +master_doc = 'index' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'sphinx_rtd_theme' +html_theme = 'pytorch_sphinx_theme' +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] + +html_theme_options = { + 'menu': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmdetection' + }, + ], + # Specify the language of shared menu + 'menu_lang': + 'cn', +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_css_files = ['css/readthedocs.css'] + +language = 'zh_CN' + +# -- Extension configuration ------------------------------------------------- +# Ignore >>> when copying code +copybutton_prompt_text = r'>>> |\.\.\. 
' +copybutton_prompt_is_regexp = True + + +def builder_inited_handler(app): + subprocess.run(['./stat.py']) + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/mmdetection/docs/zh_cn/get_started.md b/mmdetection/docs/zh_cn/get_started.md new file mode 100644 index 0000000..52d061e --- /dev/null +++ b/mmdetection/docs/zh_cn/get_started.md @@ -0,0 +1,230 @@ +# 开始你的第一步 + +## 依赖 + +本节中,我们将演示如何用 PyTorch 准备一个环境。 + +MMDetection 支持在 Linux,Windows 和 macOS 上运行。它需要 Python 3.7 以上,CUDA 9.2 以上和 PyTorch 1.8 及其以上。 + +```{note} +如果你对 PyTorch 有经验并且已经安装了它,你可以直接跳转到[下一小节](#安装流程)。否则,你可以按照下述步骤进行准备。 +``` + +**步骤 0.** 从[官方网站](https://docs.conda.io/en/latest/miniconda.html)下载并安装 Miniconda。 + +**步骤 1.** 创建并激活一个 conda 环境。 + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +``` + +**步骤 2.** 基于 [PyTorch 官方说明](https://pytorch.org/get-started/locally/)安装 PyTorch。 + +在 GPU 平台上: + +```shell +conda install pytorch torchvision -c pytorch +``` + +在 CPU 平台上: + +```shell +conda install pytorch torchvision cpuonly -c pytorch +``` + +## 安装流程 + +我们推荐用户参照我们的最佳实践安装 MMDetection。不过,整个过程也是可定制化的,更多信息请参考[自定义安装](#自定义安装)章节。 + +### 最佳实践 + +**步骤 0.** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMEngine](https://github.com/open-mmlab/mmengine) 和 [MMCV](https://github.com/open-mmlab/mmcv)。 + +```shell +pip install -U openmim +mim install mmengine +mim install "mmcv>=2.0.0" +``` + +**注意:** 在 MMCV-v2.x 中,`mmcv-full` 改名为 `mmcv`,如果你想安装不包含 CUDA 算子精简版,可以通过 `mim install "mmcv-lite>=2.0.0rc1"` 来安装。 + +**步骤 1.** 安装 MMDetection。 + +方案 a:如果你开发并直接运行 mmdet,从源码安装它: + +```shell +git clone https://github.com/open-mmlab/mmdetection.git +cd mmdetection +pip install -v -e . +# "-v" 指详细说明,或更多的输出 +# "-e" 表示在可编辑模式下安装项目,因此对代码所做的任何本地修改都会生效,从而无需重新安装。 +``` + +方案 b:如果你将 mmdet 作为依赖或第三方 Python 包,使用 MIM 安装: + +```shell +mim install mmdet +``` + +## 验证安装 + +为了验证 MMDetection 是否安装正确,我们提供了一些示例代码来执行模型推理。 + +**步骤 1.** 我们需要下载配置文件和模型权重文件。 + +```shell +mim download mmdet --config rtmdet_tiny_8xb32-300e_coco --dest . 
+``` + +下载将需要几秒钟或更长时间,这取决于你的网络环境。完成后,你会在当前文件夹中发现两个文件 `rtmdet_tiny_8xb32-300e_coco.py` 和 `rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth`。 + +**步骤 2.** 推理验证。 + +方案 a:如果你通过源码安装的 MMDetection,那么直接运行以下命令进行验证: + +```shell +python demo/image_demo.py demo/demo.jpg rtmdet_tiny_8xb32-300e_coco.py --weights rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth --device cpu +``` + +你会在当前文件夹中的 `outputs/vis` 文件夹中看到一个新的图像 `demo.jpg`,图像中包含有网络预测的检测框。 + +方案 b:如果你通过 MIM 安装的 MMDetection,那么可以打开你的 Python 解析器,复制并粘贴以下代码: + +```python +from mmdet.apis import init_detector, inference_detector + +config_file = 'rtmdet_tiny_8xb32-300e_coco.py' +checkpoint_file = 'rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth' +model = init_detector(config_file, checkpoint_file, device='cpu') # or device='cuda:0' +inference_detector(model, 'demo/demo.jpg') +``` + +你将会看到一个包含 `DetDataSample` 的列表,预测结果在 `pred_instance` 里,包含有检测框,类别和得分。 + +### 自定义安装 + +#### CUDA 版本 + +在安装 PyTorch 时,你需要指定 CUDA 的版本。如果你不清楚应该选择哪一个,请遵循我们的建议: + +- 对于 Ampere 架构的 NVIDIA GPU,例如 GeForce 30 系列以及 NVIDIA A100,CUDA 11 是必需的。 +- 对于更早的 NVIDIA GPU,CUDA 11 是向后兼容 (backward compatible) 的,但 CUDA 10.2 能够提供更好的兼容性,也更加轻量。 + +请确保你的 GPU 驱动版本满足最低的版本需求,参阅 NVIDIA 官方的 [CUDA 工具箱和相应的驱动版本关系表](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions)。 + +```{note} +如果按照我们的最佳实践,安装 CUDA 运行时库就足够了,这是因为不需要在本地编译 CUDA 代码。但如果你希望从源码编译 MMCV,或是开发其他 CUDA 算子,那么就必须安装完整的 CUDA 工具链,参见 [NVIDIA 官网](https://developer.nvidia.com/cuda-downloads),另外还需要确保该 CUDA 工具链的版本与 PyTorch 安装时的配置相匹配(如用 `conda install` 安装 PyTorch 时指定的 cudatoolkit 版本)。 +``` + +#### 不使用 MIM 安装 MMEngine + +要使用 pip 而不是 MIM 来安装 MMEngine,请遵照 [MMEngine 安装指南](https://mmengine.readthedocs.io/zh_CN/latest/get_started/installation.html)。 + +例如,你可以通过以下命令安装 MMEngine。 + +```shell +pip install mmengine +``` + +#### 不使用 MIM 安装 MMCV + +MMCV 包含 C++ 和 CUDA 扩展,因此其对 PyTorch 的依赖比较复杂。MIM 会自动解析这些依赖,选择合适的 MMCV 预编译包,使安装更简单,但它并不是必需的。 + +要使用 pip 而不是 MIM 来安装 MMCV,请遵照 [MMCV 安装指南](https://mmcv.readthedocs.io/zh_CN/2.x/get_started/installation.html)。它需要您用指定 url 的形式手动指定对应的 PyTorch 和 CUDA 版本。 + +例如,下述命令将会安装基于 PyTorch 1.12.x 和 CUDA 11.6 编译的 MMCV。 + +```shell +pip install "mmcv>=2.0.0" -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.12.0/index.html +``` + +#### 在 CPU 环境中安装 + +MMDetection 可以在 CPU 环境中构建。在 CPU 模式下,可以进行模型训练(需要 MMCV 版本 >= 2.0.0rc1)、测试或者推理。 + +但是,以下功能在该模式下不能使用: + +- Deformable Convolution +- Modulated Deformable Convolution +- ROI pooling +- Deformable ROI pooling +- CARAFE +- SyncBatchNorm +- CrissCrossAttention +- MaskedConv2d +- Temporal Interlace Shift +- nms_cuda +- sigmoid_focal_loss_cuda +- bbox_overlaps + +因此,如果尝试训练/测试/推理包含上述算子的模型,将会报错。下表列出了将会受影响的相关算法。 + +| 操作 | 模型 | +| :-----------------------------------------------------: | :--------------------------------------------------------------------------------------: | +| Deformable Convolution/Modulated Deformable Convolution | DCN、Guided Anchoring、RepPoints、CentripetalNet、VFNet、CascadeRPN、NAS-FCOS、DetectoRS | +| MaskedConv2d | Guided Anchoring | +| CARAFE | CARAFE | +| SyncBatchNorm | ResNeSt | + +#### 在 Google Colab 中安装 + +[Google Colab](https://colab.research.google.com/) 通常已经包含了 PyTorch 环境,因此我们只需要安装 MMEngine,MMCV 和 MMDetection 即可,命令如下: + +**步骤 1.** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMEngine](https://github.com/open-mmlab/mmengine) 和 [MMCV](https://github.com/open-mmlab/mmcv)。 + +```shell +!pip3 install openmim +!mim install mmengine +!mim install "mmcv>=2.0.0,<2.1.0" +``` + +**步骤 2.** 
使用源码安装 MMDetection。 + +```shell +!git clone https://github.com/open-mmlab/mmdetection.git +%cd mmdetection +!pip install -e . +``` + +**步骤 3.** 验证安装是否成功。 + +```python +import mmdet +print(mmdet.__version__) +# 预期输出:3.0.0 或其他版本号 +``` + +```{note} +在 Jupyter Notebook 中,感叹号 `!` 用于执行外部命令,而 `%cd` 是一个[魔术命令](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd),用于切换 Python 的工作路径。 +``` + +#### 通过 Docker 使用 MMDetection + +我们提供了一个 [Dockerfile](../../docker/Dockerfile) 来构建一个镜像。请确保你的 [docker 版本](https://docs.docker.com/engine/install/) >=19.03。 + +```shell +# 基于 PyTorch 1.9,CUDA 11.1 构建镜像 +# 如果你想要其他版本,只需要修改 Dockerfile +docker build -t mmdetection docker/ +``` + +用以下命令运行 Docker 镜像: + +```shell +docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmdetection/data mmdetection +``` + +### 排除故障 + +如果你在安装过程中遇到一些问题,请先查看 [FAQ](notes/faq.md) 页面。如果没有找到解决方案,你也可以在 GitHub 上[提出一个问题](https://github.com/open-mmlab/mmdetection/issues/new/choose)。 + +### 使用多个 MMDetection 版本进行开发 + +训练和测试的脚本已经在 `PYTHONPATH` 中进行了修改,以确保脚本使用当前目录中的 MMDetection。 + +要使环境中安装默认版本的 MMDetection 而不是当前正在使用的,可以删除出现在相关脚本中的代码: + +```shell +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +``` diff --git a/mmdetection/docs/zh_cn/index.rst b/mmdetection/docs/zh_cn/index.rst new file mode 100644 index 0000000..58a4d8a --- /dev/null +++ b/mmdetection/docs/zh_cn/index.rst @@ -0,0 +1,67 @@ +Welcome to MMDetection's documentation! +======================================= + +.. toctree:: + :maxdepth: 1 + :caption: 开始你的第一步 + + overview.md + get_started.md + +.. toctree:: + :maxdepth: 2 + :caption: 使用指南 + + user_guides/index.rst + +.. toctree:: + :maxdepth: 2 + :caption: 进阶教程 + + advanced_guides/index.rst + +.. toctree:: + :maxdepth: 1 + :caption: 迁移版本 + + migration/migration.md + +.. toctree:: + :maxdepth: 1 + :caption: 接口文档(英文) + + api.rst + +.. toctree:: + :maxdepth: 1 + :caption: 模型仓库 + + model_zoo.md + +.. toctree:: + :maxdepth: 1 + :caption: 说明 + + notes/contribution_guide.md + notes/projects.md + notes/faq.md + notes/compatibility.md + +.. toctree:: + :maxdepth: 1 + :caption: 文章 + + article.md + +.. toctree:: + :caption: 语言切换 + + switch_language.md + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/mmdetection/docs/zh_cn/make.bat b/mmdetection/docs/zh_cn/make.bat new file mode 100644 index 0000000..922152e --- /dev/null +++ b/mmdetection/docs/zh_cn/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/mmdetection/docs/zh_cn/migration/api_and_registry_migration.md b/mmdetection/docs/zh_cn/migration/api_and_registry_migration.md new file mode 100644 index 0000000..66e1c34 --- /dev/null +++ b/mmdetection/docs/zh_cn/migration/api_and_registry_migration.md @@ -0,0 +1 @@ +# 将 API 和注册器从 MMDetection 2.x 迁移至 3.x diff --git a/mmdetection/docs/zh_cn/migration/config_migration.md b/mmdetection/docs/zh_cn/migration/config_migration.md new file mode 100644 index 0000000..c4f9c8e --- /dev/null +++ b/mmdetection/docs/zh_cn/migration/config_migration.md @@ -0,0 +1,814 @@ +# 将配置文件从 MMDetection 2.x 迁移至 3.x + +MMDetection 3.x 的配置文件与 2.x 相比有较大变化,这篇文档将介绍如何将 2.x 的配置文件迁移到 3.x。 + +在前面的[配置文件教程](../user_guides/config.md)中,我们以 Mask R-CNN 为例介绍了 MMDetection 3.x 的配置文件结构,这里我们将按同样的结构介绍如何将 2.x 的配置文件迁移至 3.x。 + +## 模型配置 + +模型的配置与 2.x 相比并没有太大变化,对于模型的 backbone,neck,head,以及 train_cfg 和 test_cfg,它们的参数与 2.x 版本的参数保持一致。 + +不同的是,我们在 3.x 版本的模型中新增了 `DataPreprocessor` 模块。 +`DataPreprocessor` 模块的配置位于 `model.data_preprocessor` 中,它用于对输入数据进行预处理,例如对输入图像进行归一化,将不同大小的图片进行 padding 从而组成 batch,将图像从内存中读取到显存中等。这部分配置取代了原本存在于 train_pipeline 和 test_pipeline 中的 `Normalize` 和 `Pad`。 + + + + + + + + + +
    原配置 + +```python +# 图像归一化参数 +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +pipeline=[ + ..., + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), # 图像 padding 到 32 的倍数 + ... +] +``` + +
    新配置 + +```python +model = dict( + data_preprocessor=dict( + type='DetDataPreprocessor', + # 图像归一化参数 + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + # 图像 padding 参数 + pad_mask=True, # 在实例分割中,需要将 mask 也进行 padding + pad_size_divisor=32) # 图像 padding 到 32 的倍数 +) +``` + +
    + +## 数据集和评测器配置 + +数据集和评测部分的配置相比 2.x 版本有较大的变化。我们将从 Dataloader 和 Dataset,Data transform pipeline,以及评测器配置三个方面介绍如何将 2.x 版本的配置迁移到 3.x 版本。 + +### Dataloader 和 Dataset 配置 + +在新版本中,我们将数据加载的设置与 PyTorch 官方的 DataLoader 保持一致,这样可以使用户更容易理解和上手。 +我们将训练、验证和测试的数据加载设置分别放在 `train_dataloader`,`val_dataloader` 和 `test_dataloader` 中,用户可以分别对这些 dataloader 设置不同的参数,其输入参数与 [PyTorch 的 Dataloader](https://pytorch.org/docs/stable/data.html?highlight=dataloader#torch.utils.data.DataLoader) 所需要的参数基本一致。 + +通过这种方式,我们将 2.x 版本中不可配置的 `sampler`,`batch_sampler`,`persistent_workers` 等参数都放到了配置文件中,使得用户可以更加灵活地设置数据加载的参数。 + +用户可以通过 `train_dataloader.dataset`,`val_dataloader.dataset` 和 `test_dataloader.dataset` 来设置数据集的配置,它们分别对应 2.x 版本中的 `data.train`,`data.val` 和 `data.test`。 + + + + + + + + + +
    原配置 + +```python +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +``` + +
    新配置 + +```python +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, # 避免每次迭代后 dataloader 重新创建子进程 + sampler=dict(type='DefaultSampler', shuffle=True), # 默认的 sampler,同时支持分布式训练和非分布式训练 + batch_sampler=dict(type='AspectRatioBatchSampler'), # 默认的 batch_sampler,用于保证 batch 中的图片具有相似的长宽比,从而可以更好地利用显存 + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) +# 在 3.x 版本中可以独立配置验证和测试的 dataloader +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader # 测试 dataloader 的配置与验证 dataloader 的配置相同,这里省略 +``` + +
    + +### Data transform pipeline 配置 + +上文中提到,我们将图像 normalize 和 padding 的配置从 `train_pipeline` 和 `test_pipeline` 中独立出来,放到了 `model.data_preprocessor` 中,因此在 3.x 版本的 pipeline 中,我们不再需要 `Normalize` 和 `Pad` 这两个 transform。 + +同时,我们也对负责数据格式打包的 transform 进行了重构,将 `Collect` 和 `DefaultFormatBundle` 这两个 transform 合并为了 `PackDetInputs`,它负责将 data pipeline 中的数据打包成模型的输入格式,关于输入格式的转换,详见[数据流文档](../advanced_guides/data_flow.md)。 + +下面以 Mask R-CNN 1x 的 train_pipeline 为例,介绍如何将 2.x 版本的配置迁移到 3.x 版本: + + + + + + + + + +
    原配置 + +```python +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +``` + +
    新配置 + +```python +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +``` + +
    + +对于 test_pipeline,除了将 `Normalize` 和 `Pad` 这两个 transform 去掉之外,我们也将测试时的数据增强(TTA)与普通的测试流程分开,移除了 `MultiScaleFlipAug`。关于新版的 TTA 如何使用,详见[TTA 文档](../advanced_guides/tta.md)。 + +下面同样以 Mask R-CNN 1x 的 test_pipeline 为例,介绍如何将 2.x 版本的配置迁移到 3.x 版本: + + + + + + + + + +
    原配置 + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +``` + +
    新配置 + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +
    + +除此之外,我们还对一些数据增强进行了重构,下表列出了 2.x 版本中的 transform 与 3.x 版本中的 transform 的对应关系: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    名称原配置新配置
    Resize + +```python +dict(type='Resize', + img_scale=(1333, 800), + keep_ratio=True) +``` + + + +```python +dict(type='Resize', + scale=(1333, 800), + keep_ratio=True) +``` + +
    RandomResize + +```python +dict( + type='Resize', + img_scale=[ + (1333, 640), (1333, 800)], + multiscale_mode='range', + keep_ratio=True) +``` + + + +```python +dict( + type='RandomResize', + scale=[ + (1333, 640), (1333, 800)], + keep_ratio=True) +``` + +
    RandomChoiceResize + +```python +dict( + type='Resize', + img_scale=[ + (1333, 640), (1333, 672), + (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + multiscale_mode='value', + keep_ratio=True) +``` + + + +```python +dict( + type='RandomChoiceResize', + scales=[ + (1333, 640), (1333, 672), + (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True) +``` + +
    RandomFlip + +```python +dict(type='RandomFlip', + flip_ratio=0.5) +``` + + + +```python +dict(type='RandomFlip', + prob=0.5) +``` + +
    + +### 评测器配置 + +在 3.x 版本中,模型精度评测不再与数据集绑定,而是通过评测器(Evaluator)来完成。 +评测器配置分为 val_evaluator 和 test_evaluator 两部分,其中 val_evaluator 用于验证集评测,test_evaluator 用于测试集评测,对应 2.x 版本中的 evaluation 字段。 +下表列出了 2.x 版本与 3.x 版本中的评测器的对应关系: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    评测指标名称原配置新配置
    COCO + +```python +data = dict( + val=dict( + type='CocoDataset', + ann_file=data_root + 'annotations/instances_val2017.json')) +evaluation = dict(metric=['bbox', 'segm']) +``` + + + +```python +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False) +``` + +
    Pascal VOC + +```python +data = dict( + val=dict( + type=dataset_type, + ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt')) +evaluation = dict(metric='mAP') +``` + + + +```python +val_evaluator = dict( + type='VOCMetric', + metric='mAP', + eval_mode='11points') +``` + +
    OpenImages + +```python +data = dict( + val=dict( + type='OpenImagesDataset', + ann_file=data_root + 'annotations/validation-annotations-bbox.csv', + img_prefix=data_root + 'OpenImages/validation/', + label_file=data_root + 'annotations/class-descriptions-boxable.csv', + hierarchy_file=data_root + + 'annotations/bbox_labels_600_hierarchy.json', + meta_file=data_root + 'annotations/validation-image-metas.pkl', + image_level_ann_file=data_root + + 'annotations/validation-annotations-human-imagelabels-boxable.csv')) +evaluation = dict(interval=1, metric='mAP') +``` + + + +```python +val_evaluator = dict( + type='OpenImagesMetric', + iou_thrs=0.5, + ioa_thrs=0.5, + use_group_of=True, + get_supercategory=True) +``` + +
    CityScapes + +```python +data = dict( + val=dict( + type='CityScapesDataset', + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + img_prefix=data_root + 'leftImg8bit/val/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) +``` + + + +```python +val_evaluator = [ + dict( + type='CocoMetric', + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + metric=['bbox', 'segm']), + dict( + type='CityScapesMetric', + ann_file=data_root + + 'annotations/instancesonly_filtered_gtFine_val.json', + seg_prefix=data_root + '/gtFine/val', + outfile_prefix='./work_dirs/cityscapes_metric/instance') +] +``` + +
    + +## 训练和测试的配置 + + + + + + + + + +
    原配置 + +```python +runner = dict( + type='EpochBasedRunner', # 训练循环的类型 + max_epochs=12) # 最大训练轮次 +evaluation = dict(interval=2) # 验证间隔。每 2 个 epoch 验证一次 +``` + +
    新配置 + +```python +train_cfg = dict( + type='EpochBasedTrainLoop', # 训练循环的类型,请参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py + max_epochs=12, # 最大训练轮次 + val_interval=2) # 验证间隔。每 2 个 epoch 验证一次 +val_cfg = dict(type='ValLoop') # 验证循环的类型 +test_cfg = dict(type='TestLoop') # 测试循环的类型 +``` + +
    + +## 优化相关配置 + +优化器以及梯度裁剪的配置都移至 optim_wrapper 字段中。下表列出了 2.x 版本与 3.x 版本中的优化器配置的对应关系: + + + + + + + + + +
    原配置 + +```python +optimizer = dict( + type='SGD', # 随机梯度下降优化器 + lr=0.02, # 基础学习率 + momentum=0.9, # 带动量的随机梯度下降 + weight_decay=0.0001) # 权重衰减 +optimizer_config = dict(grad_clip=None) # 梯度裁剪的配置,设置为 None 关闭梯度裁剪 +``` + +
    新配置 + +```python +optim_wrapper = dict( # 优化器封装的配置 + type='OptimWrapper', # 优化器封装的类型。可以切换至 AmpOptimWrapper 来启用混合精度训练 + optimizer=dict( # 优化器配置。支持 PyTorch 的各种优化器。请参考 https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # 随机梯度下降优化器 + lr=0.02, # 基础学习率 + momentum=0.9, # 带动量的随机梯度下降 + weight_decay=0.0001), # 权重衰减 + clip_grad=None, # 梯度裁剪的配置,设置为 None 关闭梯度裁剪。使用方法请见 https://mmengine.readthedocs.io/en/latest/tutorials/optimizer.html + ) +``` + +
    + +学习率的配置也从 lr_config 字段中移至 param_scheduler 字段中。param_scheduler 的配置更贴近 PyTorch 的学习率调整策略,更加灵活。下表列出了 2.x 版本与 3.x 版本中的学习率配置的对应关系: + + + + + + + + + +
    原配置 + +```python +lr_config = dict( + policy='step', # 在训练过程中使用 multi step 学习率策略 + warmup='linear', # 使用线性学习率预热 + warmup_iters=500, # 到第 500 个 iteration 结束预热 + warmup_ratio=0.001, # 学习率预热的系数 + step=[8, 11], # 在哪几个 epoch 进行学习率衰减 + gamma=0.1) # 学习率衰减系数 +``` + +
    新配置 + +```python +param_scheduler = [ + dict( + type='LinearLR', # 使用线性学习率预热 + start_factor=0.001, # 学习率预热的系数 + by_epoch=False, # 按 iteration 更新预热学习率 + begin=0, # 从第一个 iteration 开始 + end=500), # 到第 500 个 iteration 结束 + dict( + type='MultiStepLR', # 在训练过程中使用 multi step 学习率策略 + by_epoch=True, # 按 epoch 更新学习率 + begin=0, # 从第一个 epoch 开始 + end=12, # 到第 12 个 epoch 结束 + milestones=[8, 11], # 在哪几个 epoch 进行学习率衰减 + gamma=0.1) # 学习率衰减系数 +] +``` + +
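2.x 中其他的学习率策略(例如 `policy='CosineAnnealing'`)也可以用同样的方式迁移:保留预热调度器,再替换主调度器的类型。下面是一个假设的示意配置,数值仅作说明:

```python
# 2.x: lr_config = dict(policy='CosineAnnealing', warmup='linear', min_lr=1e-5, ...)
param_scheduler = [
    dict(
        type='LinearLR',  # 线性学习率预热
        start_factor=0.001,
        by_epoch=False,
        begin=0,
        end=500),
    dict(
        type='CosineAnnealingLR',  # 余弦退火学习率策略
        T_max=12,  # 余弦周期长度(示例值)
        eta_min=1e-5,  # 最小学习率,对应 2.x 中的 min_lr
        by_epoch=True,
        begin=0,
        end=12)
]
```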
    + +关于其他的学习率调整策略的迁移,请参考 MMEngine 的[学习率迁移文档](https://mmengine.readthedocs.io/zh_CN/latest/migration/param_scheduler.html)。 + +## 其他配置的迁移 + +### 保存 checkpoint 的配置 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
功能 | 原配置 | 新配置
    设置保存间隔 + +```python +checkpoint_config = dict( + interval=1) +``` + + + +```python +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=1)) +``` + +
    保存最佳模型 + +```python +evaluation = dict( + save_best='auto') +``` + + + +```python +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_best='auto')) +``` + +
    只保留最新的几个模型 + +```python +checkpoint_config = dict( + max_keep_ckpts=3) +``` + + + +```python +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + max_keep_ckpts=3)) +``` + +
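上面三项功能对应的是同一个 CheckpointHook,实际迁移时通常合并为一条配置,例如(示意):

```python
default_hooks = dict(
    checkpoint=dict(
        type='CheckpointHook',
        interval=1,  # 每个 epoch 保存一次权重
        save_best='auto',  # 根据第一个评测指标自动保存最佳模型
        max_keep_ckpts=3))  # 只保留最新的 3 个权重文件
```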
    + +### 日志的配置 + +3.x 版本中,日志的打印和可视化由 MMEngine 中的 logger 和 visualizer 分别完成。下表列出了 2.x 版本与 3.x 版本中的日志配置的对应关系: + + + + + + + + + + + + + + + + + + + + + + + + +
功能 | 原配置 | 新配置
    设置日志打印间隔 + +```python +log_config = dict( + interval=50) +``` + + + +```python +default_hooks = dict( + logger=dict( + type='LoggerHook', + interval=50)) +# 可选: 配置日志打印数值的平滑窗口大小 +log_processor = dict( + type='LogProcessor', + window_size=50) +``` + +
    使用 TensorBoard 或 WandB 可视化日志 + +```python +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + dict(type='MMDetWandbHook', + init_kwargs={ + 'project': 'mmdetection', + 'group': 'maskrcnn-r50-fpn-1x-coco' + }, + interval=50, + log_checkpoint=True, + log_checkpoint_metadata=True, + num_eval_images=100) + ]) +``` + + + +```python +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), + dict(type='WandbVisBackend', + init_kwargs={ + 'project': 'mmdetection', + 'group': 'maskrcnn-r50-fpn-1x-coco' + }) +] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') +``` + +
    + +关于可视化相关的教程,请参考 MMDetection 的[可视化教程](../user_guides/visualization.md)。 + +### Runtime 的配置 + +3.x 版本中 runtime 的配置字段有所调整,具体的对应关系如下: + + + + + + + + + + + + + + + + +
原配置 | 新配置
    + +```python +cudnn_benchmark = False +opencv_num_threads = 0 +mp_start_method = 'fork' +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None + + +``` + + + +```python +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', + opencv_num_threads=0), + dist_cfg=dict(backend='nccl')) +log_level = 'INFO' +load_from = None +resume = False +``` + +
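注意,2.x 中的 `resume_from` 在 3.x 中被拆分为 `load_from` 和 `resume` 两个字段。如果希望从某个权重恢复训练,一种常见写法如下(权重路径为假设的示例):

```python
load_from = 'work_dirs/faster_rcnn/epoch_8.pth'  # 假设的权重路径,仅作示例
resume = True  # 为 True 时会同时恢复优化器状态与训练进度,而不是仅加载权重
```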
    diff --git a/mmdetection/docs/zh_cn/migration/dataset_migration.md b/mmdetection/docs/zh_cn/migration/dataset_migration.md new file mode 100644 index 0000000..c379b9f --- /dev/null +++ b/mmdetection/docs/zh_cn/migration/dataset_migration.md @@ -0,0 +1 @@ +# 将数据集从 MMDetection 2.x 迁移至 3.x diff --git a/mmdetection/docs/zh_cn/migration/migration.md b/mmdetection/docs/zh_cn/migration/migration.md new file mode 100644 index 0000000..d706856 --- /dev/null +++ b/mmdetection/docs/zh_cn/migration/migration.md @@ -0,0 +1,12 @@ +# 从 MMDetection 2.x 迁移至 3.x + +MMDetection 3.x 版本是一个重大更新,包含了许多 API 和配置文件的变化。本文档旨在帮助用户从 MMDetection 2.x 版本迁移到 3.x 版本。 +我们将迁移指南分为以下几个部分: + +- [配置文件迁移](./config_migration.md) +- [API 和 Registry 迁移](./api_and_registry_migration.md) +- [数据集迁移](./dataset_migration.md) +- [模型迁移](./model_migration.md) +- [常见问题](./migration_faq.md) + +如果您在迁移过程中遇到任何问题,欢迎在 issue 中提出。我们也欢迎您为本文档做出贡献。 diff --git a/mmdetection/docs/zh_cn/migration/migration_faq.md b/mmdetection/docs/zh_cn/migration/migration_faq.md new file mode 100644 index 0000000..208a138 --- /dev/null +++ b/mmdetection/docs/zh_cn/migration/migration_faq.md @@ -0,0 +1 @@ +# 迁移 FAQ diff --git a/mmdetection/docs/zh_cn/migration/model_migration.md b/mmdetection/docs/zh_cn/migration/model_migration.md new file mode 100644 index 0000000..d799244 --- /dev/null +++ b/mmdetection/docs/zh_cn/migration/model_migration.md @@ -0,0 +1 @@ +# 将模型从 MMDetection 2.x 迁移至 3.x diff --git a/mmdetection/docs/zh_cn/model_zoo.md b/mmdetection/docs/zh_cn/model_zoo.md new file mode 100644 index 0000000..b537615 --- /dev/null +++ b/mmdetection/docs/zh_cn/model_zoo.md @@ -0,0 +1,333 @@ +# 模型库 + +## 镜像地址 + +从 MMDetection V2.0 起,我们只通过阿里云维护模型库。V1.x 版本的模型已经弃用。 + +## 共同设置 + +- 所有模型都是在 `coco_2017_train` 上训练,在 `coco_2017_val` 上测试。 +- 我们使用分布式训练。 +- 所有 pytorch-style 的 ImageNet 预训练主干网络来自 PyTorch 的模型库,caffe-style 的预训练主干网络来自 detectron2 最新开源的模型。 +- 为了与其他代码库公平比较,文档中所写的 GPU 内存是8个 GPU 的 `torch.cuda.max_memory_allocated()` 的最大值,此值通常小于 nvidia-smi 显示的值。 +- 我们以网络 forward 和后处理的时间加和作为推理时间,不包含数据加载时间。所有结果通过 [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/main/tools/analysis_tools/benchmark.py) 脚本计算所得。该脚本会计算推理 2000 张图像的平均时间。 + +## ImageNet 预训练模型 + +通过 ImageNet 分类任务预训练的主干网络进行初始化是很常见的操作。所有预训练模型的链接都可以在 [open_mmlab](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json) 中找到。根据 `img_norm_cfg` 和原始权重,我们可以将所有 ImageNet 预训练模型分为以下几种情况: + +- TorchVision:torchvision 模型权重,包含 ResNet50, ResNet101。`img_norm_cfg` 为 `dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)`。 +- Pycls:[pycls](https://github.com/facebookresearch/pycls) 模型权重,包含 RegNetX。`img_norm_cfg` 为 `dict( mean=[103.530, 116.280, 123.675], std=[57.375, 57.12, 58.395], to_rgb=False)`。 +- MSRA styles:[MSRA](https://github.com/KaimingHe/deep-residual-networks) 模型权重,包含 ResNet50_Caffe,ResNet101_Caffe。`img_norm_cfg` 为 `dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)`。 +- Caffe2 styles:现阶段只包含 ResNext101_32x8d。`img_norm_cfg` 为 `dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False)`。 +- Other styles: SSD 的 `img_norm_cfg` 为 `dict(mean=[123.675, 116.28, 103.53], std=[1, 1, 1], to_rgb=True)`,YOLOv3 的 `img_norm_cfg` 为 `dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)`。 + +MMdetection 常用到的主干网络细节如下表所示: + +| 模型 | 来源 | 链接 | 描述 | +| ---------------- | ----------- | 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ResNet50 | TorchVision | [torchvision 中的 ResNet-50](https://download.pytorch.org/models/resnet50-19c8e357.pth) | 来自 [torchvision 中的 ResNet-50](https://download.pytorch.org/models/resnet50-19c8e357.pth)。 | +| ResNet101 | TorchVision | [torchvision 中的 ResNet-101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth) | 来自 [torchvision 中的 ResNet-101](https://download.pytorch.org/models/resnet101-5d3b4d8f.pth)。 | +| RegNetX | Pycls | [RegNetX_3.2gf](https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth),[RegNetX_800mf](https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth) 等 | 来自 [pycls](https://github.com/facebookresearch/pycls)。 | +| ResNet50_Caffe | MSRA | [MSRA 中的 ResNet-50](https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth) | 由 [Detectron2 中的 R-50.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl) 转化的副本。原始权重文件来自 [MSRA 中的原始 ResNet-50](https://github.com/KaimingHe/deep-residual-networks)。 | +| ResNet101_Caffe | MSRA | [MSRA 中的 ResNet-101](https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth) | 由 [Detectron2 中的 R-101.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl) 转化的副本。原始权重文件来自 [MSRA 中的原始 ResNet-101](https://github.com/KaimingHe/deep-residual-networks)。 | +| ResNext101_32x8d | Caffe2 | [Caffe2 ResNext101_32x8d](https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth) | 由 [Detectron2 中的 X-101-32x8d.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/FAIR/X-101-32x8d.pkl) 转化的副本。原始 ResNeXt-101-32x8d 由 FB 使用 Caffe2 训练。 | + +## Baselines + +### RPN + +请参考 [RPN](https://github.com/open-mmlab/mmdetection/blob/main/configs/rpn)。 + +### Faster R-CNN + +请参考 [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/faster_rcnn)。 + +### Mask R-CNN + +请参考 [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask_rcnn)。 + +### Fast R-CNN (使用提前计算的 proposals) + +请参考 [Fast R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/fast_rcnn)。 + +### RetinaNet + +请参考 [RetinaNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/retinanet)。 + +### Cascade R-CNN and Cascade Mask R-CNN + +请参考 [Cascade R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/cascade_rcnn)。 + +### Hybrid Task Cascade (HTC) + +请参考 [HTC](https://github.com/open-mmlab/mmdetection/blob/main/configs/htc)。 + +### SSD + +请参考 [SSD](https://github.com/open-mmlab/mmdetection/blob/main/configs/ssd)。 + +### Group Normalization (GN) + +请参考 [Group Normalization](https://github.com/open-mmlab/mmdetection/blob/main/configs/gn)。 + +### Weight Standardization + +请参考 [Weight Standardization](https://github.com/open-mmlab/mmdetection/blob/main/configs/gn+ws)。 + +### Deformable Convolution v2 + +请参考 [Deformable Convolutional Networks](https://github.com/open-mmlab/mmdetection/blob/main/configs/dcn)。 + +### CARAFE: Content-Aware ReAssembly of FEatures + +请参考 [CARAFE](https://github.com/open-mmlab/mmdetection/blob/main/configs/carafe)。 + +### Instaboost + +请参考 
[Instaboost](https://github.com/open-mmlab/mmdetection/blob/main/configs/instaboost)。 + +### Libra R-CNN + +请参考 [Libra R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/libra_rcnn)。 + +### Guided Anchoring + +请参考 [Guided Anchoring](https://github.com/open-mmlab/mmdetection/blob/main/configs/guided_anchoring)。 + +### FCOS + +请参考 [FCOS](https://github.com/open-mmlab/mmdetection/blob/main/configs/fcos)。 + +### FoveaBox + +请参考 [FoveaBox](https://github.com/open-mmlab/mmdetection/blob/main/configs/foveabox)。 + +### RepPoints + +请参考 [RepPoints](https://github.com/open-mmlab/mmdetection/blob/main/configs/reppoints)。 + +### FreeAnchor + +请参考 [FreeAnchor](https://github.com/open-mmlab/mmdetection/blob/main/configs/free_anchor)。 + +### Grid R-CNN (plus) + +请参考 [Grid R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/grid_rcnn)。 + +### GHM + +请参考 [GHM](https://github.com/open-mmlab/mmdetection/blob/main/configs/ghm)。 + +### GCNet + +请参考 [GCNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/gcnet)。 + +### HRNet + +请参考 [HRNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/hrnet)。 + +### Mask Scoring R-CNN + +请参考 [Mask Scoring R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/ms_rcnn)。 + +### Train from Scratch + +请参考 [Rethinking ImageNet Pre-training](https://github.com/open-mmlab/mmdetection/blob/main/configs/scratch)。 + +### NAS-FPN + +请参考 [NAS-FPN](https://github.com/open-mmlab/mmdetection/blob/main/configs/nas_fpn)。 + +### ATSS + +请参考 [ATSS](https://github.com/open-mmlab/mmdetection/blob/main/configs/atss)。 + +### FSAF + +请参考 [FSAF](https://github.com/open-mmlab/mmdetection/blob/main/configs/fsaf)。 + +### RegNetX + +请参考 [RegNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/regnet)。 + +### Res2Net + +请参考 [Res2Net](https://github.com/open-mmlab/mmdetection/blob/main/configs/res2net)。 + +### GRoIE + +请参考 [GRoIE](https://github.com/open-mmlab/mmdetection/blob/main/configs/groie)。 + +### Dynamic R-CNN + +请参考 [Dynamic R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/dynamic_rcnn)。 + +### PointRend + +请参考 [PointRend](https://github.com/open-mmlab/mmdetection/blob/main/configs/point_rend)。 + +### DetectoRS + +请参考 [DetectoRS](https://github.com/open-mmlab/mmdetection/blob/main/configs/detectors)。 + +### Generalized Focal Loss + +请参考 [Generalized Focal Loss](https://github.com/open-mmlab/mmdetection/blob/main/configs/gfl)。 + +### CornerNet + +请参考 [CornerNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/cornernet)。 + +### YOLOv3 + +请参考 [YOLOv3](https://github.com/open-mmlab/mmdetection/blob/main/configs/yolo)。 + +### PAA + +请参考 [PAA](https://github.com/open-mmlab/mmdetection/blob/main/configs/paa)。 + +### SABL + +请参考 [SABL](https://github.com/open-mmlab/mmdetection/blob/main/configs/sabl)。 + +### CentripetalNet + +请参考 [CentripetalNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/centripetalnet)。 + +### ResNeSt + +请参考 [ResNeSt](https://github.com/open-mmlab/mmdetection/blob/main/configs/resnest)。 + +### DETR + +请参考 [DETR](https://github.com/open-mmlab/mmdetection/blob/main/configs/detr)。 + +### Deformable DETR + +请参考 [Deformable DETR](https://github.com/open-mmlab/mmdetection/blob/main/configs/deformable_detr)。 + +### AutoAssign + +请参考 [AutoAssign](https://github.com/open-mmlab/mmdetection/blob/main/configs/autoassign)。 + +### YOLOF + +请参考 [YOLOF](https://github.com/open-mmlab/mmdetection/blob/main/configs/yolof)。 + +### Seesaw Loss + +请参考 [Seesaw 
Loss](https://github.com/open-mmlab/mmdetection/blob/main/configs/seesaw_loss)。 + +### CenterNet + +请参考 [CenterNet](https://github.com/open-mmlab/mmdetection/blob/main/configs/centernet)。 + +### YOLOX + +请参考 [YOLOX](https://github.com/open-mmlab/mmdetection/blob/main/configs/yolox)。 + +### PVT + +请参考 [PVT](https://github.com/open-mmlab/mmdetection/blob/main/configs/pvt)。 + +### SOLO + +请参考 [SOLO](https://github.com/open-mmlab/mmdetection/blob/main/configs/solo)。 + +### QueryInst + +请参考 [QueryInst](https://github.com/open-mmlab/mmdetection/blob/main/configs/queryinst)。 + +### Other datasets + +我们还在 [PASCAL VOC](https://github.com/open-mmlab/mmdetection/blob/main/configs/pascal_voc),[Cityscapes](https://github.com/open-mmlab/mmdetection/blob/main/configs/cityscapes) 和 [WIDER FACE](https://github.com/open-mmlab/mmdetection/blob/main/configs/wider_face) 上对一些方法进行了基准测试。 + +### Pre-trained Models + +我们还通过多尺度训练和更长的训练策略来训练用 ResNet-50 和 [RegNetX-3.2G](https://github.com/open-mmlab/mmdetection/blob/main/configs/regnet) 作为主干网络的 [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/faster_rcnn) 和 [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask_rcnn)。这些模型可以作为下游任务的预训练模型。 + +## 速度基准 + +### 训练速度基准 + +我们提供 [analyze_logs.py](https://github.com/open-mmlab/mmdetection/blob/main/tools/analysis_tools/analyze_logs.py) 来得到训练中每一次迭代的平均时间。示例请参考 [Log Analysis](https://mmdetection.readthedocs.io/en/latest/useful_tools.html#log-analysis)。 + +我们与其他流行框架的 Mask R-CNN 训练速度进行比较(数据是从 [detectron2](https://github.com/facebookresearch/detectron2/blob/main/docs/notes/benchmarks.md/) 复制而来)。在 mmdetection 中,我们使用 [mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_poly-1x_coco_v1.py) 进行基准测试。它与 detectron2 的 [mask_rcnn_R_50_FPN_noaug_1x.yaml](https://github.com/facebookresearch/detectron2/blob/main/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml) 设置完全一样。同时,我们还提供了[模型权重](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug_compare_20200518-10127928.pth)和[训练 log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug/mask_rcnn_r50_caffe_fpn_poly_1x_coco_no_aug_20200518_105755.log.json) 作为参考。为了跳过 GPU 预热时间,吞吐量按照100-500次迭代之间的平均吞吐量来计算。 + +| 框架 | 吞吐量 (img/s) | +| -------------------------------------------------------------------------------------- | -------------- | +| [Detectron2](https://github.com/facebookresearch/detectron2) | 62 | +| [MMDetection](https://github.com/open-mmlab/mmdetection) | 61 | +| [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/) | 53 | +| [tensorpack](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN) | 50 | +| [simpledet](https://github.com/TuSimple/simpledet/) | 39 | +| [Detectron](https://github.com/facebookresearch/Detectron) | 19 | +| [matterport/Mask_RCNN](https://github.com/matterport/Mask_RCNN/) | 14 | + +### 推理时间基准 + +我们提供 [benchmark.py](https://github.com/open-mmlab/mmdetection/blob/main/tools/analysis_tools/benchmark.py) 对推理时间进行基准测试。此脚本将推理 2000 张图片并计算忽略前 5 次推理的平均推理时间。可以通过设置 `LOG-INTERVAL` 来改变 log 输出间隔(默认为 50)。 + +```shell +python tools/benchmark.py ${CONFIG} ${CHECKPOINT} [--log-interval $[LOG-INTERVAL]] [--fuse-conv-bn] +``` + +模型库中,所有模型在基准测量推理时间时都没设置 `fuse-conv-bn`, 此设置可以使推理时间更短。 + +## 与 Detectron2 对比 + +我们在速度和精度方面对 mmdetection 和 
[Detectron2](https://github.com/facebookresearch/detectron2.git) 进行对比。对比所使用的 detectron2 的 commit id 为 [185c27e](https://github.com/facebookresearch/detectron2/tree/185c27e4b4d2d4c68b5627b3765420c6d7f5a659)(30/4/2020)。 +为了公平对比,我们所有的实验都在同一机器下进行。 + +### 硬件 + +- 8 NVIDIA Tesla V100 (32G) GPUs +- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz + +### 软件环境 + +- Python 3.7 +- PyTorch 1.4 +- CUDA 10.1 +- CUDNN 7.6.03 +- NCCL 2.4.08 + +### 精度 + +| 模型 | 训练策略 | Detectron2 | mmdetection | 下载 | +| ------------------------------------------------------------------------------------------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_ms-1x_coco.py) | 1x | [37.9](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml) | 38.0 | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-5324cff8.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco_20200429_234554.log.json) | +| [Mask R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py) | 1x | [38.6 & 35.2](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml) | 38.8 & 35.4 | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco-dbecf295.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco_20200430_054239.log.json) | +| [Retinanet](https://github.com/open-mmlab/mmdetection/blob/main/configs/retinanet/retinanet_r50-caffe_fpn_ms-1x_coco.py) | 1x | [36.5](https://github.com/facebookresearch/detectron2/blob/master/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml) | 37.0 | [model](https://download.openmmlab.com/mmdetection/v2.0/benchmark/retinanet_r50_caffe_fpn_mstrain_1x_coco/retinanet_r50_caffe_fpn_mstrain_1x_coco-586977a0.pth) \| [log](https://download.openmmlab.com/mmdetection/v2.0/benchmark/retinanet_r50_caffe_fpn_mstrain_1x_coco/retinanet_r50_caffe_fpn_mstrain_1x_coco_20200430_014748.log.json) | + +### 训练速度 + +训练速度使用 s/iter 来度量。结果越低越好。 + +| 模型 | Detectron2 | mmdetection | +| ------------ | ---------- | ----------- | +| Faster R-CNN | 0.210 | 0.216 | +| Mask R-CNN | 0.261 | 0.265 | +| Retinanet | 0.200 | 0.205 | + +### 推理速度 + +推理速度通过单张 GPU 下的 fps(img/s) 来度量,越高越好。 +为了与 Detectron2 保持一致,我们所写的推理时间除去了数据加载时间。 +对于 Mask RCNN,我们去除了后处理中 RLE 编码的时间。 +我们在括号中给出了官方给出的速度。由于硬件差异,官方给出的速度会比我们所测试得到的速度快一些。 + +| 模型 | Detectron2 | mmdetection | +| ------------ | ----------- | ----------- | +| Faster R-CNN | 25.6 (26.3) | 22.2 | +| Mask R-CNN | 22.5 (23.3) | 19.6 | +| Retinanet | 17.8 (18.2) 
| 20.6 | + +### 训练内存 + +| 模型 | Detectron2 | mmdetection | +| ------------ | ---------- | ----------- | +| Faster R-CNN | 3.0 | 3.8 | +| Mask R-CNN | 3.4 | 3.9 | +| Retinanet | 3.9 | 3.4 | diff --git a/mmdetection/docs/zh_cn/notes/compatibility.md b/mmdetection/docs/zh_cn/notes/compatibility.md new file mode 100644 index 0000000..e9ebdd9 --- /dev/null +++ b/mmdetection/docs/zh_cn/notes/compatibility.md @@ -0,0 +1,177 @@ +# MMDetection v2.x 兼容性说明 + +## MMDetection 2.25.0 + +为了加入 Mask2Former 实例分割模型,对 Mask2Former 的配置文件进行了重命名 [PR #7571](https://github.com/open-mmlab/mmdetection/pull/7571): + + + + + + + + + + + +
在 v2.25.0 之前 | v2.25.0 及之后
    + +``` +'mask2former_xxx_coco.py' 代表全景分割的配置文件 +``` + + + +``` +'mask2former_xxx_coco.py' 代表实例分割的配置文件 +'mask2former_xxx_coco-panoptic.py' 代表全景分割的配置文件 +``` + +
    + +## MMDetection 2.21.0 + +为了支持 CPU 训练,MMCV 中进行批处理的 scatter 的代码逻辑已经被修改。我们推荐使用 MMCV v1.4.4 或更高版本, +更多信息请参考 [MMCV PR #1621](https://github.com/open-mmlab/mmcv/pull/1621). + +## MMDetection 2.18.1 + +### MMCV compatibility + +为了修复 BaseTransformerLayer 中的权重引用问题, MultiheadAttention 中 batch first 的逻辑有所改变。 +我们推荐使用 MMCV v1.3.17 或更高版本。 更多信息请参考 [MMCV PR #1418](https://github.com/open-mmlab/mmcv/pull/1418) 。 + +## MMDetection 2.18.0 + +### DIIHead 兼容性 + +为了支持 QueryInst,在 DIIHead 的返回元组中加入了 attn_feats。 + +## MMDetection v2.14.0 + +### MMCV 版本 + +为了修复 EvalHook 优先级过低的问题,MMCV v1.3.8 中所有 hook 的优先级都重新进行了调整,因此 MMDetection v2.14.0 需要依赖最新的 MMCV v1.3.8 版本。 相关信息请参考[PR #1120](https://github.com/open-mmlab/mmcv/pull/1120) ,相关问题请参考[#5343](https://github.com/open-mmlab/mmdetection/issues/5343) 。 + +### SSD 兼容性 + +在 v2.14.0 中,为了使 SSD 能够被更灵活地使用,[PR #5291](https://github.com/open-mmlab/mmdetection/pull/5291) 重构了 SSD 的 backbone、neck 和 head。用户可以使用 tools/model_converters/upgrade_ssd_version.py 转换旧版本训练的模型。 + +```shell +python tools/model_converters/upgrade_ssd_version.py ${OLD_MODEL_PATH} ${NEW_MODEL_PATH} + +``` + +- OLD_MODEL_PATH:旧版 SSD 模型的路径。 +- NEW_MODEL_PATH:保存转换后模型权重的路径。 + +## MMDetection v2.12.0 + +在 v2.12.0 到 v2.18.0(或以上)版本的这段时间,为了提升通用性和便捷性,MMDetection 正在进行大规模重构。在升级到 v2.12.0 后 MMDetection 不可避免地带来了一些 BC Breaking,包括 MMCV 的版本依赖、模型初始化方式、模型 registry 和 mask AP 的评估。 + +### MMCV 版本 + +MMDetection v2.12.0 依赖 MMCV v1.3.3 中新增加的功能,包括:使用 `BaseModule` 统一参数初始化,模型 registry,以及[Deformable DETR](https://arxiv.org/abs/2010.04159) 中的 `MultiScaleDeformableAttn` CUDA 算子。 +注意,尽管 MMCV v1.3.2 已经包含了 MMDet 所需的功能,但是存在一些已知的问题。我们建议用户跳过 MMCV v1.3.2 使用 v1.3.3 版本。 + +### 统一模型初始化 + +为了统一 OpenMMLab 项目中的参数初始化方式,MMCV 新增加了 `BaseModule` 类,使用 `init_cfg` 参数对模块进行统一且灵活的初始化配置管理。 +现在用户需要在训练脚本中显式调用 `model.init_weights()` 来初始化模型(例如 [这行代码](https://github.com/open-mmlab/mmdetection/blob/master/tools/train.py#L162) ,在这之前则是在 detector 中进行处理的。 +**下游项目必须相应地更新模型初始化方式才能使用 MMDetection v2.12.0**。请参阅 [PR #4750](https://github.com/open-mmlab/mmdetection/pull/4750) 了解详情。 + +### 统一模型 registry + +为了能够使用在其他 OpenMMLab 项目中实现的 backbone,MMDetection v2.12.0 继承了在 MMCV (#760) 中创建的模型 registry。 +这样,只要 OpenMMLab 项目实现了某个 backbone,并且该项目也使用 MMCV 中的 registry,那么用户只需修改配置即可在 MMDetection 中使用该 backbone,不再需要将代码复制到 MMDetection 中。 更多详细信息,请参阅 [PR #5059](https://github.com/open-mmlab/mmdetection/pull/5059) 。 + +### Mask AP 评估 + +在 [PR #4898](https://github.com/open-mmlab/mmdetection/pull/4898) 和 v2.12.0 之前,对小、中、大目标的 mask AP 的评估是基于其边界框区域而不是真正的 mask 区域。 +这导致 `APs` 和 `APm` 变得更高但 `APl` 变得更低,但是不会影响整体的 mask AP。 [PR #4898](https://github.com/open-mmlab/mmdetection/pull/4898) 删除了 mask AP 计算中的 `bbox` ,改为使用 mask 区域。 +新的计算方式不会影响整体的 mask AP 评估,与 [Detectron2](https://github.com/facebookresearch/detectron2/)一致。 + +## 与 MMDetection v1.x 的兼容性 + +MMDetection v2.0 经过了大规模重构并解决了许多遗留问题。 MMDetection v2.0 不兼容 v1.x 版本,在这两个版本中使用相同的模型权重运行推理会产生不同的结果。 因此,MMDetection v2.0 重新对所有模型进行了 benchmark,并在 model zoo 中提供了新模型的权重和训练记录。 + +新旧版本的主要的区别有四方面:坐标系、代码库约定、训练超参和模块设计。 + +### 坐标系 + +新坐标系与 [Detectron2](https://github.com/facebookresearch/detectron2/) 一致, +将最左上角的像素的中心视为坐标原点 (0, 0) 而不是最左上角像素的左上角。 因此 COCO 边界框和分割标注中的坐标被解析为范围 `[0,width]` 和 `[0,height]` 中的坐标。 这个修改影响了所有与 bbox 及像素选择相关的计算,变得更加自然且更加准确。 + +- 在新坐标系中,左上角和右下角为 (x1, y1) (x2, y2) 的框的宽度及高度计算公式为 `width = x2 - x1` 和 `height = y2 - y1`。 + 在 MMDetection v1.x 和之前的版本中,高度和宽度都多了 `+ 1` 的操作。 + 本次修改包括三部分: + + 1. box 回归中的检测框变换以及编码/解码。 + 2. IoU 计算。这会影响 ground truth 和检测框之间的匹配以及 NMS 。但对兼容性的影响可以忽略不计。 + 3. 
Box 的角点坐标为浮点型,不再取整。这能使得检测结果更为准确,也使得检测框和 RoI 的最小尺寸不再为 1,但影响很小。 + +- Anchor 的中心与特征图的网格点对齐,类型变为 float。 + 在 MMDetection v1.x 和之前的版本中,anchors 是 `int` 类型且没有居中对齐。 + 这会影响 RPN 中的 Anchor 生成和所有基于 Anchor 的方法。 + +- ROIAlign 更好地与图像坐标系对齐。新的实现来自 [Detectron2](https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign) 。 + 当 RoI 用于提取 RoI 特征时,与 MMDetection v1.x 相比默认情况下相差半个像素。 + 能够通过设置 `aligned=False` 而不是 `aligned=True` 来维持旧版本的设置。 + +- Mask 的裁剪和粘贴更准确。 + + 1. 我们使用新的 RoIAlign 来提取 mask 目标。 在 MMDetection v1.x 中,bounding box 在提取 mask 目标之前被取整,裁剪过程是 numpy 实现的。 而在新版本中,裁剪的边界框不经过取整直接输入 RoIAlign。 此实现大大加快了训练速度(每次迭代约加速 0.1 秒,1x schedule 训练 Mask R50 时加速约 2 小时)并且理论上会更准确。 + 2. 在 MMDetection v2.0 中,修改后的 `paste_mask()` 函数应该比之前版本更准确。 此更改参考了 [Detectron2](https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/masks.py) 中的修改,可以将 COCO 上的 mask AP 提高约 0.5%。 + +### 代码库约定 + +- MMDetection v2.0 更改了类别标签的顺序,减少了回归和 mask 分支里的无用参数并使得顺序更加自然(没有 +1 和 -1)。 + 这会影响模型的所有分类层,使其输出的类别标签顺序发生改变。回归分支和 mask head 的最后一层不再为 K 个类别保留 K+1 个通道,类别顺序与分类分支一致。 + + - 在 MMDetection v2.0 中,标签 “K” 表示背景,标签 \[0, K-1\] 对应于 K = num_categories 个对象类别。 + + - 在 MMDetection v1.x 及之前的版本中,标签 “0” 表示背景,标签 \[1, K\] 对应 K 个类别。 + + - **注意**:softmax RPN 的类顺序在 version\<=2.4.0 中仍然和 1.x 中的一样,而 sigmoid RPN 不受影响。从 MMDetection v2.5.0 开始,所有 head 中的类顺序是统一的。 + +- 不使用 R-CNN 中的低质量匹配。在 MMDetection v1.x 和之前的版本中,`max_iou_assigner` 会在 RPN 和 R-CNN 训练时给每个 ground truth 匹配低质量框。我们发现这会导致最佳的 GT 框不会被分配给某些边界框, + 因此,在MMDetection v2.0 的 R-CNN 训练中默认不允许低质量匹配。这有时可能会稍微改善 box AP(约为 0.1%)。 + +- 单独的宽高比例系数。在 MMDetection v1.x 和以前的版本中,`keep_ratio=True` 时比例系数是单个浮点数,这并不准确,因为宽度和高度的比例系数会有一定的差异。 MMDetection v2.0 对宽度和高度使用单独的比例系数,对 AP 的提升约为 0.1%。 + +- 修改了 config 文件名称的规范。 由于 model zoo 中模型不断增多, MMDetection v2.0 采用新的命名规则: + + ```shell + [model]_(model setting)_[backbone]_[neck]_(norm setting)_(misc)_(gpu x batch)_[schedule]_[dataset].py + ``` + + 其中 (`misc`) 包括 DCN 和 GCBlock 等。更多详细信息在 [配置文件说明文档](config.md) 中说明 + +- MMDetection v2.0 使用新的 ResNet Caffe backbone 来减少加载预训练模型时的警告。新 backbone 中的大部分权重与以前的相同,但没有 `conv.bias`,且它们使用不同的 `img_norm_cfg`。因此,新的 backbone 不会报 `unexpected keys` 的警告。 + +### 训练超参 + +训练超参的调整不会影响模型的兼容性,但会略微提高性能。主要有: + +- 通过设置 `nms_post=1000` 和 `max_num=1000`,将 nms 之后的 proposal 数量从 2000 更改为 1000。使 mask AP 和 bbox AP 提高了约 0.2%。 + +- Mask R-CNN、Faster R-CNN 和 RetinaNet 的默认回归损失从 smooth L1 损失更改为 L1 损失,使得 box AP 整体上都有所提升(约 0.6%)。但是,将 L1-loss 用在 Cascade R-CNN 和 HTC 等其他方法上并不能提高性能,因此我们保留这些方法的原始设置。 + +- 为简单起见,RoIAlign 层的 `sampling_ratio` 设置为 0。略微提升了 AP(约 0.2% 绝对值)。 + +- 为了提升训练速度,默认设置在训练过程中不再使用梯度裁剪。大多数模型的性能不会受到影响。对于某些模型(例如 RepPoints),我们依旧使用梯度裁剪来稳定训练过程从而获得更好的性能。 + +- 因为不再默认使用梯度裁剪,默认 warmup 比率从 1/3 更改为 0.001,以使模型训练预热更加平缓。不过我们重新进行基准测试时发现这种影响可以忽略不计。 + +### 将模型从 v1.x 升级至 v2.0 + +用户可以使用脚本 `tools/model_converters/upgrade_model_version.py` 来将 MMDetection 1.x 训练的模型转换为 MMDetection v2.0。转换后的模型可以在 MMDetection v2.0 中运行,但性能略有下降(小于 1% AP)。 +详细信息可以在 `configs/legacy` 中找到。 + +## pycocotools 兼容性 + +`mmpycocotools` 是 OpenMMLab 维护的 `pycocotools` 的复刻版,适用于 MMDetection 和 Detectron2。 +在 [PR #4939](https://github.com/open-mmlab/mmdetection/pull/4939) 之前,由于 `pycocotools` 和 `mmpycocotool` 具有相同的包名,如果用户已经安装了 `pyccocotools`(在相同环境下先安装了 Detectron2 ),那么 MMDetection 的安装过程会跳过安装 `mmpycocotool`。 导致 MMDetection 缺少 `mmpycocotools` 而报错。 +但如果在 Detectron2 之前安装 MMDetection,则可以在相同的环境下工作。 +[PR #4939](https://github.com/open-mmlab/mmdetection/pull/4939) 弃用 mmpycocotools,使用官方 pycocotools。 +在 [PR #4939](https://github.com/open-mmlab/mmdetection/pull/4939) 之后,用户能够在相同环境下安装 MMDetection 和 Detectron2,不再需要关注安装顺序。 diff --git 
a/mmdetection/docs/zh_cn/notes/faq.md b/mmdetection/docs/zh_cn/notes/faq.md new file mode 100644 index 0000000..8268bd1 --- /dev/null +++ b/mmdetection/docs/zh_cn/notes/faq.md @@ -0,0 +1,259 @@ +# 常见问题解答 + +我们在这里列出了使用时的一些常见问题及其相应的解决方案。 如果您发现有一些问题被遗漏,请随时提 PR 丰富这个列表。 如果您无法在此获得帮助,请使用 [issue模板](https://github.com/open-mmlab/mmdetection/blob/main/.github/ISSUE_TEMPLATE/error-report.md/)创建问题,但是请在模板中填写所有必填信息,这有助于我们更快定位问题。 + +## PyTorch 2.0 支持 + +MMDetection 目前绝大部分算法已经支持了 PyTorch 2.0 及其 `torch.compile` 功能, 用户只需要安装 MMDetection 3.0.0rc7 及其以上版本即可。如果你在使用中发现有不支持的算法,欢迎给我们反馈。我们也非常欢迎社区贡献者来 benchmark 对比 `torch.compile` 功能所带来的速度提升。 + +如果你想启动 `torch.compile` 功能,只需要在 `train.py` 或者 `test.py` 后面加上 `--cfg-options compile=True`。 以 RTMDet 为例,你可以使用以下命令启动 `torch.compile` 功能: + +```shell +# 单卡 +python tools/train.py configs/rtmdet/rtmdet_s_8xb32-300e_coco.py --cfg-options compile=True + +# 单机 8 卡 +./tools/dist_train.sh configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 8 --cfg-options compile=True + +# 单机 8 卡 + AMP 混合精度训练 +./tools/dist_train.sh configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 8 --cfg-options compile=True --amp +``` + +需要特别注意的是,PyTorch 2.0 对于动态 shape 支持不是非常完善,目标检测算法中大部分不仅输入 shape 是动态的,而且 loss 计算和后处理过程中也是动态的,这会导致在开启 `torch.compile` 功能后训练速度会变慢。基于此,如果你想启动 `torch.compile` 功能,则应该遵循如下原则: + +1. 输入到网络的图片是固定 shape 的,而非多尺度的 +2. 设置 `torch._dynamo.config.cache_size_limit` 参数。TorchDynamo 会将 Python 字节码转换并缓存,已编译的函数会被存入缓存中。当下一次检查发现需要重新编译时,该函数会被重新编译并缓存。但是如果重编译次数超过预设的最大值(64),则该函数将不再被缓存或重新编译。前面说过目标检测算法中的 loss 计算和后处理部分也是动态计算的,这些函数需要在每次迭代中重新编译。因此将 `torch._dynamo.config.cache_size_limit` 参数设置得更小一些可以有效减少编译时间 + +在 MMDetection 中可以通过环境变量 `DYNAMO_CACHE_SIZE_LIMIT` 设置 `torch._dynamo.config.cache_size_limit` 参数,以 RTMDet 为例,命令如下所示: + +```shell +# 单卡 +export DYNAMO_CACHE_SIZE_LIMIT = 4 +python tools/train.py configs/rtmdet/rtmdet_s_8xb32-300e_coco.py --cfg-options compile=True + +# 单机 8 卡 +export DYNAMO_CACHE_SIZE_LIMIT = 4 +./tools/dist_train.sh configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 8 --cfg-options compile=True +``` + +关于 PyTorch 2.0 的 dynamo 常见问题,可以参考 [这里](https://pytorch.org/docs/stable/dynamo/faq.html) + +## 安装 + +- MMCV 与 MMDetection 的兼容问题: "ConvWS is already registered in conv layer"; "AssertionError: MMCV==xxx is used but incompatible. Please install mmcv>=xxx, \<=xxx." + + MMDetection,MMEngine 和 MMCV 的版本兼容关系如下。请选择合适的版本避免安装错误 。 + + | MMDetection 版本 | MMCV 版本 | MMEngine 版本 | + | :--------------: | :---------------------: | :----------------------: | + | main | mmcv>=2.0.0, \<2.2.0 | mmengine>=0.7.1, \<1.0.0 | + | 3.2.0 | mmcv>=2.0.0, \<2.2.0 | mmengine>=0.7.1, \<1.0.0 | + | 3.1.0 | mmcv>=2.0.0, \<2.1.0 | mmengine>=0.7.1, \<1.0.0 | + | 3.0.0 | mmcv>=2.0.0, \<2.1.0 | mmengine>=0.7.1, \<1.0.0 | + | 3.0.0rc6 | mmcv>=2.0.0rc4, \<2.1.0 | mmengine>=0.6.0, \<1.0.0 | + | 3.0.0rc5 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.3.0, \<1.0.0 | + | 3.0.0rc4 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.3.0, \<1.0.0 | + | 3.0.0rc3 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.3.0, \<1.0.0 | + | 3.0.0rc2 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.1.0, \<1.0.0 | + | 3.0.0rc1 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.1.0, \<1.0.0 | + | 3.0.0rc0 | mmcv>=2.0.0rc1, \<2.1.0 | mmengine>=0.1.0, \<1.0.0 | + + **注意:** + + 1. 如果你希望安装 mmdet-v2.x, MMDetection 和 MMCV 版本兼容表可以在 [这里](https://mmdetection.readthedocs.io/en/stable/faq.html#installation) 找到,请选择合适的版本避免安装错误。 + 2. 在 MMCV-v2.x 中,`mmcv-full` 改名为 `mmcv`,如果你想安装不包含 CUDA 算子的版本,可以选择安装 MMCV 精简版 `mmcv-lite`。 + +- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'". + + 原因是安装了 `mmcv-lite` 而不是 `mmcv`。 + + 1. 
`pip uninstall mmcv-lite` 卸载安装的 `mmcv-lite` + + 2. 安装 `mmcv` 根据 [安装说明](https://mmcv.readthedocs.io/zh_CN/2.x/get_started/installation.html)。 + +- 在 Windows 环境下安装过程中遇到 "Microsoft Visual C++ 14.0 or graeter is required" error . + + 这个错误发生在 pycotools 的 'pycocotools.\_mask' 扩展构建过程,其原因是缺少了对应 C++ 环境依赖。你需要到微软官方下载[对应工具](https://visualstudio.microsoft.com/zh-hans/visual-cpp-build-tools/),选择“使用 C++ 的桌面开发”选项安装最小依赖,随后重新安装 pycocotools。 + +- 使用 albumentations + +如果你希望使用 `albumentations`,我们建议使用 `pip install -r requirements/albu.txt` +或者 `pip install -U albumentations --no-binary qudida,albumentations` 进行安装。 +如果简单地使用 `pip install albumentations>=0.3.2` 进行安装, +则会同时安装 `opencv-python-headless`(即便已经安装了 `opencv-python` 也会再次安装)。 +我们建议在安装 `albumentations` 后检查环境,以确保没有同时安装 `opencv-python` 和 `opencv-python-headless`, +因为同时安装可能会导致一些问题。更多细节请参考[官方文档](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies) 。 + +- 在某些算法中出现 ModuleNotFoundError 错误 + +一些算法或者数据需要额外的依赖,例如 Instaboost、 Panoptic Segmentation、 LVIS dataset 等。请注意错误信息并安装相应的包,例如: + +```shell +# 安装 instaboost 依赖 +pip install instaboostfast +# 安装 panoptic segmentation 依赖 +pip install git+https://github.com/cocodataset/panopticapi.git +# 安装 LVIS dataset 依赖 +pip install git+https://github.com/lvis-dataset/lvis-api.git +``` + +## 代码 + +- 修改一些代码后是否需要重新安装 mmdet + +如果你遵循最佳实践,即使用 `pip install -v -e .` 安装的 mmdet,则对本地代码所作的任何修改都会生效,无需重新安装 + +- 如何使用多个 MMDetection 版本进行开发 + +你可以拥有多个文件夹,例如 mmdet-3.0,mmdet-3.1。 + +要使环境中安装默认的 MMDetection 而不是当前正在在使用的,可以删除出现在相关脚本中的代码: + +```shell +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +``` + +## PyTorch/CUDA 环境相关 + +- "RTX 30 series card fails when building MMCV or MMDet" + + 1. 临时解决方案为使用命令 `MMCV_WITH_OPS=1 MMCV_CUDA_ARGS='-gencode=arch=compute_80,code=sm_80' pip install -e .` 进行编译。 常见报错信息为 `nvcc fatal : Unsupported gpu architecture 'compute_86'` 意思是你的编译器不支持 sm_86 架构(包括英伟达 30 系列的显卡)的优化,至 CUDA toolkit 11.0 依旧未支持. 这个命令是通过增加宏 `MMCV_CUDA_ARGS='-gencode=arch=compute_80,code=sm_80` 让 nvcc 编译器为英伟达 30 系列显卡进行 `sm_80` 的优化,虽然这有可能会无法发挥出显卡所有性能。 + + 2. 有开发者已经在 [pytorch/pytorch#47585](https://github.com/pytorch/pytorch/pull/47585) 更新了 PyTorch 默认的编译 flag, 但是我们对此并没有进行测试。 + +- "invalid device function" 或者 "no kernel image is available for execution". + + 1. 检查您正常安装了 CUDA runtime (一般在`/usr/local/`),或者使用 `nvcc --version` 检查本地版本,有时安装 PyTorch 会顺带安装一个 CUDA runtime,并且实际优先使用 conda 环境中的版本,你可以使用 `conda list cudatoolkit` 查看其版本。 + + 2. 编译 extension 的 CUDA Toolkit 版本与运行时的 CUDA Toolkit 版本是否相符, + + - 如果您从源码自己编译的,使用 `python mmdet/utils/collect_env.py` 检查编译编译 extension 的 CUDA Toolkit 版本,然后使用 `conda list cudatoolkit` 检查当前 conda 环境是否有 CUDA Toolkit,若有检查版本是否匹配, 如不匹配,更换 conda 环境的 CUDA Toolkit,或者使用匹配的 CUDA Toolkit 中的 nvcc 编译即可,如环境中无 CUDA Toolkit,可以使用 `nvcc -V`。 + + 等命令查看当前使用的 CUDA runtime。 + + - 如果您是通过 pip 下载的预编译好的版本,请确保与当前 CUDA runtime 一致。 + + 3. 运行 `python mmdet/utils/collect_env.py` 检查是否为正确的 GPU 架构编译的 PyTorch, torchvision, 与 MMCV。 你或许需要设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV,可以参考 [GPU 架构表](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list), + 例如, 运行 `TORCH_CUDA_ARCH_LIST=7.0 pip install mmcv` 为 Volta GPU 编译 MMCV。这种架构不匹配的问题一般会出现在使用一些旧型号的 GPU 时候出现, 例如, Tesla K80。 + +- "undefined symbol" 或者 "cannot open xxx.so". + + 1. 如果这些 symbol 属于 CUDA/C++ (如 libcudart.so 或者 GLIBCXX),使用 `python mmdet/utils/collect_env.py`检查 CUDA/GCC runtime 与编译 MMCV 的 CUDA 版本是否相同。 + 2. 如果这些 symbols 属于 PyTorch,(例如, symbols containing caffe, aten, and TH), 检查当前 Pytorch 版本是否与编译 MMCV 的版本一致。 + 3. 
运行 `python mmdet/utils/collect_env.py` 检查 PyTorch, torchvision, MMCV 等的编译环境与运行环境一致。 + +- setuptools.sandbox.UnpickleableException: DistutilsSetupError("each element of 'ext_modules' option must be an Extension instance or 2-tuple") + + 1. 如果你在使用 miniconda 而不是 anaconda,检查是否正确的安装了 Cython 如 [#3379](https://github.com/open-mmlab/mmdetection/issues/3379). + 2. 检查环境中的 `setuptools`, `Cython`, and `PyTorch` 相互之间版本是否匹配。 + +- "Segmentation fault". + + 1. 检查 GCC 的版本,通常是因为 PyTorch 版本与 GCC 版本不匹配 (例如 GCC \< 4.9 ),我们推荐用户使用 GCC 5.4,我们也不推荐使用 GCC 5.5, 因为有反馈 GCC 5.5 会导致 "segmentation fault" 并且切换到 GCC 5.4 就可以解决问题。 + + 2. 检查是否正确安装了 CUDA 版本的 PyTorch 。 + + ```shell + python -c 'import torch; print(torch.cuda.is_available())' + ``` + + 是否返回True。 + + 3. 如果 `torch` 的安装是正确的,检查是否正确编译了 MMCV。 + + ```shell + python -c 'import mmcv; import mmcv.ops' + ``` + + 4. 如果 MMCV 与 PyTorch 都被正确安装了,则使用 `ipdb`, `pdb` 设置断点,直接查找哪一部分的代码导致了 `segmentation fault`。 + +## Training 相关 + +- "Loss goes Nan" + + 1. 检查数据的标注是否正常, 长或宽为 0 的框可能会导致回归 loss 变为 nan,一些小尺寸(宽度或高度小于 1)的框在数据增强(例如,instaboost)后也会导致此问题。 因此,可以检查标注并过滤掉那些特别小甚至面积为 0 的框,并关闭一些可能会导致 0 面积框出现数据增强。 + 2. 降低学习率:由于某些原因,例如 batch size 大小的变化, 导致当前学习率可能太大。 您可以降低为可以稳定训练模型的值。 + 3. 延长 warm up 的时间:一些模型在训练初始时对学习率很敏感,您可以把 `warmup_iters` 从 500 更改为 1000 或 2000。 + 4. 添加 gradient clipping: 一些模型需要梯度裁剪来稳定训练过程。 默认的 `grad_clip` 是 `None`, 你可以在 config 设置 `optimizer_config=dict(_delete_=True, grad_clip=dict(max_norm=35, norm_type=2))` 如果你的 config 没有继承任何包含 `optimizer_config=dict(grad_clip=None)`, 你可以直接设置`optimizer_config=dict(grad_clip=dict(max_norm=35, norm_type=2))`. + +- "GPU out of memory" + + 1. 存在大量 ground truth boxes 或者大量 anchor 的场景,可能在 assigner 会 OOM。 您可以在 assigner 的配置中设置 `gpu_assign_thr=N`,这样当超过 N 个 GT boxes 时,assigner 会通过 CPU 计算 IOU。 + + 2. 在 backbone 中设置 `with_cp=True`。 这使用 PyTorch 中的 `sublinear strategy` 来降低 backbone 占用的 GPU 显存。 + + 3. 使用 `config/fp16` 中的示例尝试混合精度训练。`loss_scale` 可能需要针对不同模型进行调整。 + + 4. 你也可以尝试使用 `AvoidCUDAOOM` 来避免该问题。首先它将尝试调用 `torch.cuda.empty_cache()`。如果失败,将会尝试把输入类型转换到 FP16。如果仍然失败,将会把输入从 GPUs 转换到 CPUs 进行计算。这里提供了两个使用的例子: + + ```python + from mmdet.utils import AvoidCUDAOOM + + output = AvoidCUDAOOM.retry_if_cuda_oom(some_function)(input1, input2) + ``` + + 你也可也使用 `AvoidCUDAOOM` 作为装饰器让代码遇到 OOM 的时候继续运行: + + ```python + from mmdet.utils import AvoidCUDAOOM + + @AvoidCUDAOOM.retry_if_cuda_oom + def function(*args, **kwargs): + ... + return xxx + ``` + +- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" + + 1. 这个错误出现在存在参数没有在 forward 中使用,容易在 DDP 中运行不同分支时发生。 + 2. 你可以在 config 设置 `find_unused_parameters = True` 进行训练 (会降低训练速度)。 + 3. 你也可以通过在 config 中的 `optimizer_config` 里设置 `detect_anomalous_params=True` 查找哪些参数没有用到,但是需要 MMCV 的版本 >= 1.4.1。 + +- 训练中保存最好模型 + + 可以通过配置 `default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1, save_best='auto')`开启。在 `auto` 参数情况下会根据返回的验证结果中的第一个 key 作为选择最优模型的依据,你也可以直接设置评估结果中的 key 来手动设置,例如 `save_best='coco/bbox_mAP'`。 + +- 在 Resume 训练中使用 `ExpMomentumEMAHook` + + 如果在训练中使用了 `ExpMomentumEMAHook`,那么 resume 时候不能仅仅通过命令行参数 `--resume-from` 或 `--cfg-options resume_from` 实现恢复模型参数功能例如 `python tools/train.py configs/yolox/yolox_s_8x8_300e_coco.py --resume-from ./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth`。以 `yolox_s` 算法为例,由于 `ExpMomentumEMAHook` 需要重新加载权重,你可以通过如下做法实现: + + ```python + # 直接打开 configs/yolox/yolox_s_8x8_300e_coco.py 修改所有 resume_from 字段 + resume_from=./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth + custom_hooks=[... 
+ dict( + type='ExpMomentumEMAHook', + resume_from=./work_dir/yolox_s_8x8_300e_coco/epoch_x.pth, + momentum=0.0001, + priority=49) + ] + ``` + +## Evaluation 相关 + +- 使用 COCO Dataset 的测评接口时, 测评结果中 AP 或者 AR = -1 + 1. 根据COCO数据集的定义,一张图像中的中等物体与小物体面积的阈值分别为 9216(96\*96)与 1024(32\*32)。 + 2. 如果在某个区间没有检测框 AP 与 AR 认定为 -1. + +## Model 相关 + +- **ResNet style 参数说明** + + ResNet style 可选参数允许 `pytorch` 和 `caffe`,其差别在于 Bottleneck 模块。Bottleneck 是 `1x1-3x3-1x1` 堆叠结构,在 `caffe` 模式模式下 stride=2 参数放置在第一个 `1x1` 卷积处,而 `pyorch` 模式下 stride=2 放在第二个 `3x3` 卷积处。一个简单示例如下: + + ```python + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + ``` + +- **ResNeXt 参数说明** + + ResNeXt 来自论文 [`Aggregated Residual Transformations for Deep Neural Networks`](https://arxiv.org/abs/1611.05431). 其引入分组卷积,并且通过变量基数来控制组的数量达到精度和复杂度的平衡,其有两个超参 `baseWidth` 和 `cardinality `来控制内部 Bottleneck 模块的基本宽度和分组数参数。以 MMDetection 中配置名为 `mask_rcnn_x101_64x4d_fpn_mstrain-poly_3x_coco.py` 为例,其中 `mask_rcnn` 代表算法采用 Mask R-CNN,`x101` 代表骨架网络采用 ResNeXt-101,`64x4d`代表 Bottleneck 一共分成 64 组,每组的基本宽度是 4。 + +- **骨架网络 eval 模式说明** + + 因为检测模型通常比较大且输入图片分辨率很高,这会导致检测模型的 batch 很小,通常是 2,这会使得 BatchNorm 在训练过程计算的统计量方差非常大,不如主干网络预训练时得到的统计量稳定,因此在训练是一般都会使用 `norm_eval=True` 模式,直接使用预训练主干网络中的 BatchNorm 统计量,少数使用大 batch 的算法是 `norm_eval=False` 模式,例如 NASFPN。对于没有 ImageNet 预训练的骨架网络,如果 batch 比较小,可以考虑使用 `SyncBN`。 diff --git a/mmdetection/docs/zh_cn/notes/projects.md b/mmdetection/docs/zh_cn/notes/projects.md new file mode 100644 index 0000000..6b9d300 --- /dev/null +++ b/mmdetection/docs/zh_cn/notes/projects.md @@ -0,0 +1,48 @@ +# 基于 MMDetection 的项目 + +有许多开源项目都是基于 MMDetection 搭建的,我们在这里列举一部分作为样例,展示如何基于 MMDetection 搭建您自己的项目。 +由于这个页面列举的项目并不完全,我们欢迎社区提交 Pull Request 来更新这个文档。 + +## MMDetection 的拓展项目 + +一些项目拓展了 MMDetection 的边界,如将 MMDetection 拓展支持 3D 检测或者将 MMDetection 用于部署。 +它们展示了 MMDetection 的许多可能性,所以我们在这里也列举一些。 + +- [OTEDetection](https://github.com/opencv/mmdetection): OpenVINO training extensions for object detection. +- [MMDetection3d](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. + +## 研究项目 + +同样有许多研究论文是基于 MMDetection 进行的。许多论文都发表在了顶级的会议或期刊上,或者对社区产生了深远的影响。 +为了向社区提供一个可以参考的论文列表,帮助大家开发或者比较新的前沿算法,我们在这里也遵循会议的时间顺序列举了一些论文。 +MMDetection 中已经支持的算法不在此列。 + +- Involution: Inverting the Inherence of Convolution for Visual Recognition, CVPR21. [\[paper\]](https://arxiv.org/abs/2103.06255)[\[github\]](https://github.com/d-li14/involution) +- Multiple Instance Active Learning for Object Detection, CVPR 2021. [\[paper\]](https://openaccess.thecvf.com/content/CVPR2021/papers/Yuan_Multiple_Instance_Active_Learning_for_Object_Detection_CVPR_2021_paper.pdf)[\[github\]](https://github.com/yuantn/MI-AOD) +- Adaptive Class Suppression Loss for Long-Tail Object Detection, CVPR 2021. [\[paper\]](https://arxiv.org/abs/2104.00885)[\[github\]](https://github.com/CASIA-IVA-Lab/ACSL) +- Generalizable Pedestrian Detection: The Elephant In The Room, CVPR2021. [\[paper\]](https://arxiv.org/abs/2003.08799)[\[github\]](https://github.com/hasanirtiza/Pedestron) +- Group Fisher Pruning for Practical Network Compression, ICML2021. [\[paper\]](https://github.com/jshilong/FisherPruning/blob/main/resources/paper.pdf)[\[github\]](https://github.com/jshilong/FisherPruning) +- Overcoming Classifier Imbalance for Long-tail Object Detection with Balanced Group Softmax, CVPR2020. 
[\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/papers/Li_Overcoming_Classifier_Imbalance_for_Long-Tail_Object_Detection_With_Balanced_Group_CVPR_2020_paper.pdf)[\[github\]](https://github.com/FishYuLi/BalancedGroupSoftmax) +- Coherent Reconstruction of Multiple Humans from a Single Image, CVPR2020. [\[paper\]](https://jiangwenpl.github.io/multiperson/)[\[github\]](https://github.com/JiangWenPL/multiperson) +- Look-into-Object: Self-supervised Structure Modeling for Object Recognition, CVPR 2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/papers/Zhou_Look-Into-Object_Self-Supervised_Structure_Modeling_for_Object_Recognition_CVPR_2020_paper.pdf)[\[github\]](https://github.com/JDAI-CV/LIO) +- Video Panoptic Segmentation, CVPR2020. [\[paper\]](https://arxiv.org/abs/2006.11339)[\[github\]](https://github.com/mcahny/vps) +- D2Det: Towards High Quality Object Detection and Instance Segmentation, CVPR2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/html/Cao_D2Det_Towards_High_Quality_Object_Detection_and_Instance_Segmentation_CVPR_2020_paper.html)[\[github\]](https://github.com/JialeCao001/D2Det) +- CentripetalNet: Pursuing High-quality Keypoint Pairs for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.09119)[\[github\]](https://github.com/KiveeDong/CentripetalNet) +- Learning a Unified Sample Weighting Network for Object Detection, CVPR 2020. [\[paper\]](http://openaccess.thecvf.com/content_CVPR_2020/html/Cai_Learning_a_Unified_Sample_Weighting_Network_for_Object_Detection_CVPR_2020_paper.html)[\[github\]](https://github.com/caiqi/sample-weighting-network) +- Scale-equalizing Pyramid Convolution for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2005.03101) [\[github\]](https://github.com/jshilong/SEPC) +- Revisiting the Sibling Head in Object Detector, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.07540)[\[github\]](https://github.com/Sense-X/TSD) +- PolarMask: Single Shot Instance Segmentation with Polar Representation, CVPR2020. [\[paper\]](https://arxiv.org/abs/1909.13226)[\[github\]](https://github.com/xieenze/PolarMask) +- Hit-Detector: Hierarchical Trinity Architecture Search for Object Detection, CVPR2020. [\[paper\]](https://arxiv.org/abs/2003.11818)[\[github\]](https://github.com/ggjy/HitDet.pytorch) +- ZeroQ: A Novel Zero Shot Quantization Framework, CVPR2020. [\[paper\]](https://arxiv.org/abs/2001.00281)[\[github\]](https://github.com/amirgholami/ZeroQ) +- CBNet: A Novel Composite Backbone Network Architecture for Object Detection, AAAI2020. [\[paper\]](https://aaai.org/Papers/AAAI/2020GB/AAAI-LiuY.1833.pdf)[\[github\]](https://github.com/VDIGPKU/CBNet) +- RDSNet: A New Deep Architecture for Reciprocal Object Detection and Instance Segmentation, AAAI2020. [\[paper\]](https://arxiv.org/abs/1912.05070)[\[github\]](https://github.com/wangsr126/RDSNet) +- Training-Time-Friendly Network for Real-Time Object Detection, AAAI2020. [\[paper\]](https://arxiv.org/abs/1909.00700)[\[github\]](https://github.com/ZJULearning/ttfnet) +- Cascade RPN: Delving into High-Quality Region Proposal Network with Adaptive Convolution, NeurIPS 2019. [\[paper\]](https://arxiv.org/abs/1909.06720)[\[github\]](https://github.com/thangvubk/Cascade-RPN) +- Reasoning R-CNN: Unifying Adaptive Global Reasoning into Large-scale Object Detection, CVPR2019. 
[\[paper\]](http://openaccess.thecvf.com/content_CVPR_2019/papers/Xu_Reasoning-RCNN_Unifying_Adaptive_Global_Reasoning_Into_Large-Scale_Object_Detection_CVPR_2019_paper.pdf)[\[github\]](https://github.com/chanyn/Reasoning-RCNN) +- Learning RoI Transformer for Oriented Object Detection in Aerial Images, CVPR2019. [\[paper\]](https://arxiv.org/abs/1812.00155)[\[github\]](https://github.com/dingjiansw101/AerialDetection) +- SOLO: Segmenting Objects by Locations. [\[paper\]](https://arxiv.org/abs/1912.04488)[\[github\]](https://github.com/WXinlong/SOLO) +- SOLOv2: Dynamic, Faster and Stronger. [\[paper\]](https://arxiv.org/abs/2003.10152)[\[github\]](https://github.com/WXinlong/SOLO) +- Dense Peppoints: Representing Visual Objects with Dense Point Sets. [\[paper\]](https://arxiv.org/abs/1912.11473)[\[github\]](https://github.com/justimyhxu/Dense-RepPoints) +- IterDet: Iterative Scheme for Object Detection in Crowded Environments. [\[paper\]](https://arxiv.org/abs/2005.05708)[\[github\]](https://github.com/saic-vul/iterdet) +- Cross-Iteration Batch Normalization. [\[paper\]](https://arxiv.org/abs/2002.05712)[\[github\]](https://github.com/Howal/Cross-iterationBatchNorm) +- A Ranking-based, Balanced Loss Function Unifying Classification and Localisation in Object Detection, NeurIPS2020 [\[paper\]](https://arxiv.org/abs/2009.13592)[\[github\]](https://github.com/kemaloksuz/aLRPLoss) diff --git a/mmdetection/docs/zh_cn/overview.md b/mmdetection/docs/zh_cn/overview.md new file mode 100644 index 0000000..5269aed --- /dev/null +++ b/mmdetection/docs/zh_cn/overview.md @@ -0,0 +1,54 @@ +# 概述 + +本章向您介绍 MMDetection 的整体框架,并提供详细的教程链接。 + +## 什么是 MMDetection + +![图片](https://user-images.githubusercontent.com/12907710/137271636-56ba1cd2-b110-4812-8221-b4c120320aa9.png) + +MMDetection 是一个目标检测工具箱,包含了丰富的目标检测、实例分割、全景分割算法以及相关的组件和模块,下面是它的整体框架: + +MMDetection 由 7 个主要部分组成,apis、structures、datasets、models、engine、evaluation 和 visualization。 + +- **apis** 为模型推理提供高级 API。 +- **structures** 提供 bbox、mask 和 DetDataSample 等数据结构。 +- **datasets** 支持用于目标检测、实例分割和全景分割的各种数据集。 + - **transforms** 包含各种数据增强变换。 + - **samplers** 定义了不同的数据加载器采样策略。 +- **models** 是检测器最重要的部分,包含检测器的不同组件。 + - **detectors** 定义所有检测模型类。 + - **data_preprocessors** 用于预处理模型的输入数据。 + - **backbones** 包含各种骨干网络。 + - **necks** 包含各种模型颈部组件。 + - **dense_heads** 包含执行密集预测的各种检测头。 + - **roi_heads** 包含从 RoI 预测的各种检测头。 + - **seg_heads** 包含各种分割头。 + - **losses** 包含各种损失函数。 + - **task_modules** 为检测任务提供模块,例如 assigners、samplers、box coders 和 prior generators。 + - **layers** 提供了一些基本的神经网络层。 +- **engine** 是运行时组件的一部分。 + - **runner** 为 [MMEngine 的执行器](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html)提供扩展。 + - **schedulers** 提供用于调整优化超参数的调度程序。 + - **optimizers** 提供优化器和优化器封装。 + - **hooks** 提供执行器的各种钩子。 +- **evaluation** 为评估模型性能提供不同的指标。 +- **visualization** 用于可视化检测结果。 + +## 如何使用本指南 + +以下是 MMDetection 的详细指南: + +1. 安装说明见[开始你的第一步](get_started.md)。 + +2. MMDetection 的基本使用方法请参考以下教程。 + + - [训练和测试](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/index.html#train-test) + + - [实用工具](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/index.html#useful-tools) + +3. 参考以下教程深入了解: + + - [基础概念](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/index.html#basic-concepts) + - [组件定制](https://mmdetection.readthedocs.io/zh_CN/latest/advanced_guides/index.html#component-customization) + +4. 
对于 MMDetection 2.x 版本的用户,我们提供了[迁移指南](./migration/migration.md),帮助您完成新版本的适配。 diff --git a/mmdetection/docs/zh_cn/stat.py b/mmdetection/docs/zh_cn/stat.py new file mode 100755 index 0000000..1ea5fbd --- /dev/null +++ b/mmdetection/docs/zh_cn/stat.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +import functools as func +import glob +import os.path as osp +import re + +import numpy as np + +url_prefix = 'https://github.com/open-mmlab/mmdetection/blob/main/' + +files = sorted(glob.glob('../configs/*/README.md')) + +stats = [] +titles = [] +num_ckpts = 0 + +for f in files: + url = osp.dirname(f.replace('../', url_prefix)) + + with open(f, 'r') as content_file: + content = content_file.read() + + title = content.split('\n')[0].replace('# ', '').strip() + ckpts = set(x.lower().strip() + for x in re.findall(r'\[model\]\((https?.*)\)', content)) + + if len(ckpts) == 0: + continue + + _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)] + assert len(_papertype) > 0 + papertype = _papertype[0] + + paper = set([(papertype, title)]) + + titles.append(title) + num_ckpts += len(ckpts) + + statsmsg = f""" +\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts) +""" + stats.append((paper, ckpts, statsmsg)) + +allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _ in stats]) +msglist = '\n'.join(x for _, _, x in stats) + +papertypes, papercounts = np.unique([t for t, _ in allpapers], + return_counts=True) +countstr = '\n'.join( + [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) + +modelzoo = f""" +# Model Zoo Statistics + +* Number of papers: {len(set(titles))} +{countstr} + +* Number of checkpoints: {num_ckpts} + +{msglist} +""" + +with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) diff --git a/mmdetection/docs/zh_cn/switch_language.md b/mmdetection/docs/zh_cn/switch_language.md new file mode 100644 index 0000000..b2c4ad9 --- /dev/null +++ b/mmdetection/docs/zh_cn/switch_language.md @@ -0,0 +1,3 @@ +## English + +## 简体中文 diff --git a/mmdetection/docs/zh_cn/user_guides/config.md b/mmdetection/docs/zh_cn/user_guides/config.md new file mode 100644 index 0000000..3a670bf --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/config.md @@ -0,0 +1,589 @@ +# 学习配置文件 + +MMDetection 和其他 OpenMMLab 仓库使用 [MMEngine 的配置文件系统](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html)。 配置文件使用了模块化和继承设计,以便于进行各类实验。 + +## 配置文件的内容 + +MMDetection 采用模块化设计,所有功能的模块都可以通过配置文件进行配置。 以 Mask R-CNN 为例,我们将根据不同的功能模块介绍配置文件中的各个字段: + +### 模型配置 + +在 mmdetection 的配置中,我们使用 `model` 字段来配置检测算法的组件。 除了 `backbone`、`neck` 等神经网络组件外,还需要 `data_preprocessor`、`train_cfg` 和 `test_cfg`。 `data_preprocessor` 负责对 dataloader 输出的每一批数据进行预处理。 模型配置中的 `train_cfg` 和 `test_cfg` 用于设置训练和测试组件的超参数。 + +```python +model = dict( + type='MaskRCNN', # 检测器名 + data_preprocessor=dict( # 数据预处理器的配置,通常包括图像归一化和 padding + type='DetDataPreprocessor', # 数据预处理器的类型,参考 https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.data_preprocessors.DetDataPreprocessor + mean=[123.675, 116.28, 103.53], # 用于预训练骨干网络的图像归一化通道均值,按 R、G、B 排序 + std=[58.395, 57.12, 57.375], # 用于预训练骨干网络的图像归一化通道标准差,按 R、G、B 排序 + bgr_to_rgb=True, # 是否将图片通道从 BGR 转为 RGB + pad_mask=True, # 是否填充实例分割掩码 + pad_size_divisor=32), # padding 后的图像的大小应该可以被 ``pad_size_divisor`` 整除 + backbone=dict( # 主干网络的配置文件 + type='ResNet', # 主干网络的类别,可用选项请参考 https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.backbones.ResNet + depth=50, # 主干网络的深度,对于 ResNet 和 ResNext 通常设置为 50 或 101 + num_stages=4, # 主干网络状态(stages)的数目,这些状态产生的特征图作为后续的 head 的输入 + out_indices=(0, 1, 2, 3), # 
每个状态产生的特征图输出的索引 + frozen_stages=1, # 第一个状态的权重被冻结 + norm_cfg=dict( # 归一化层(norm layer)的配置项 + type='BN', # 归一化层的类别,通常是 BN 或 GN + requires_grad=True), # 是否训练归一化里的 gamma 和 beta + norm_eval=True, # 是否冻结 BN 里的统计项 + style='pytorch', # 主干网络的风格,'pytorch' 意思是步长为2的层为 3x3 卷积, 'caffe' 意思是步长为2的层为 1x1 卷积 + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), # 加载通过 ImageNet 预训练的模型 + neck=dict( + type='FPN', # 检测器的 neck 是 FPN,我们同样支持 'NASFPN', 'PAFPN' 等,更多细节可以参考 https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.necks.FPN + in_channels=[256, 512, 1024, 2048], # 输入通道数,这与主干网络的输出通道一致 + out_channels=256, # 金字塔特征图每一层的输出通道 + num_outs=5), # 输出的范围(scales) + rpn_head=dict( + type='RPNHead', # rpn_head 的类型是 'RPNHead', 我们也支持 'GARPNHead' 等,更多细节可以参考 https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.dense_heads.RPNHead + in_channels=256, # 每个输入特征图的输入通道,这与 neck 的输出通道一致 + feat_channels=256, # head 卷积层的特征通道 + anchor_generator=dict( # 锚点(Anchor)生成器的配置 + type='AnchorGenerator', # 大多数方法使用 AnchorGenerator 作为锚点生成器, SSD 检测器使用 `SSDAnchorGenerator`。更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/prior_generators/anchor_generator.py#L18 + scales=[8], # 锚点的基本比例,特征图某一位置的锚点面积为 scale * base_sizes + ratios=[0.5, 1.0, 2.0], # 高度和宽度之间的比率 + strides=[4, 8, 16, 32, 64]), # 锚生成器的步幅。这与 FPN 特征步幅一致。 如果未设置 base_sizes,则当前步幅值将被视为 base_sizes + bbox_coder=dict( # 在训练和测试期间对框进行编码和解码 + type='DeltaXYWHBBoxCoder', # 框编码器的类别,'DeltaXYWHBBoxCoder' 是最常用的,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py#L13 + target_means=[0.0, 0.0, 0.0, 0.0], # 用于编码和解码框的目标均值 + target_stds=[1.0, 1.0, 1.0, 1.0]), # 用于编码和解码框的标准差 + loss_cls=dict( # 分类分支的损失函数配置 + type='CrossEntropyLoss', # 分类分支的损失类型,我们也支持 FocalLoss 等,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/losses/cross_entropy_loss.py#L201 + use_sigmoid=True, # RPN 通常进行二分类,所以通常使用 sigmoid 函数 + los_weight=1.0), # 分类分支的损失权重 + loss_bbox=dict( # 回归分支的损失函数配置 + type='L1Loss', # 损失类型,我们还支持许多 IoU Losses 和 Smooth L1-loss 等,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/losses/smooth_l1_loss.py#L56 + loss_weight=1.0)), # 回归分支的损失权重 + roi_head=dict( # RoIHead 封装了两步(two-stage)/级联(cascade)检测器的第二步 + type='StandardRoIHead', # RoI head 的类型,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/roi_heads/standard_roi_head.py#L17 + bbox_roi_extractor=dict( # 用于 bbox 回归的 RoI 特征提取器 + type='SingleRoIExtractor', # RoI 特征提取器的类型,大多数方法使用 SingleRoIExtractor,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py#L13 + roi_layer=dict( # RoI 层的配置 + type='RoIAlign', # RoI 层的类别, 也支持 DeformRoIPoolingPack 和 ModulatedDeformRoIPoolingPack,更多细节请参考 https://mmcv.readthedocs.io/en/latest/api.html#mmcv.ops.RoIAlign + output_size=7, # 特征图的输出大小 + sampling_ratio=0), # 提取 RoI 特征时的采样率。0 表示自适应比率 + out_channels=256, # 提取特征的输出通道 + featmap_strides=[4, 8, 16, 32]), # 多尺度特征图的步幅,应该与主干的架构保持一致 + bbox_head=dict( # RoIHead 中 box head 的配置 + type='Shared2FCBBoxHead', # bbox head 的类别,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py#L220 + in_channels=256, # bbox head 的输入通道。 这与 roi_extractor 中的 out_channels 一致 + fc_out_channels=1024, # FC 层的输出特征通道 + roi_feat_size=7, # 候选区域(Region of Interest)特征的大小 + num_classes=80, # 分类的类别数量 + bbox_coder=dict( # 第二阶段使用的框编码器 + type='DeltaXYWHBBoxCoder', # 框编码器的类别,大多数情况使用 'DeltaXYWHBBoxCoder' + 
target_means=[0.0, 0.0, 0.0, 0.0], # 用于编码和解码框的均值 + target_stds=[0.1, 0.1, 0.2, 0.2]), # 编码和解码的标准差。因为框更准确,所以值更小,常规设置时 [0.1, 0.1, 0.2, 0.2]。 + reg_class_agnostic=False, # 回归是否与类别无关 + loss_cls=dict( # 分类分支的损失函数配 + type='CrossEntropyLoss', # 分类分支的损失类型,我们也支持 FocalLoss 等 + use_sigmoid=False, # 是否使用 sigmoid + loss_weight=1.0), # 分类分支的损失权重 + loss_bbox=dict( # 回归分支的损失函数配置 + type='L1Loss', # 损失类型,我们还支持许多 IoU Losses 和 Smooth L1-loss 等 + loss_weight=1.0)), # 回归分支的损失权重 + mask_roi_extractor=dict( # 用于 mask 生成的 RoI 特征提取器 + type='SingleRoIExtractor', # RoI 特征提取器的类型,大多数方法使用 SingleRoIExtractor + roi_layer=dict( # 提取实例分割特征的 RoI 层配置 + type='RoIAlign', # RoI 层的类型,也支持 DeformRoIPoolingPack 和 ModulatedDeformRoIPoolingPack + output_size=14, # 特征图的输出大小 + sampling_ratio=0), # 提取 RoI 特征时的采样率 + out_channels=256, # 提取特征的输出通道 + featmap_strides=[4, 8, 16, 32]), # 多尺度特征图的步幅 + mask_head=dict( # mask 预测 head 模型 + type='FCNMaskHead', # mask head 的类型,更多细节请参考 https://mmdetection.readthedocs.io/en/latest/api.html#mmdet.models.roi_heads.FCNMaskHead + num_convs=4, # mask head 中的卷积层数 + in_channels=256, # 输入通道,应与 mask roi extractor 的输出通道一致 + conv_out_channels=256, # 卷积层的输出通道 + num_classes=80, # 要分割的类别数 + loss_mask=dict( # mask 分支的损失函数配置 + type='CrossEntropyLoss', # 用于分割的损失类型 + use_mask=True, # 是否只在正确的类中训练 mask + loss_weight=1.0))), # mask 分支的损失权重 + train_cfg = dict( # rpn 和 rcnn 训练超参数的配置 + rpn=dict( # rpn 的训练配置 + assigner=dict( # 分配器(assigner)的配置 + type='MaxIoUAssigner', # 分配器的类型,MaxIoUAssigner 用于许多常见的检测器,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/assigners/max_iou_assigner.py#L14 + pos_iou_thr=0.7, # IoU >= 0.7(阈值) 被视为正样本 + neg_iou_thr=0.3, # IoU < 0.3(阈值) 被视为负样本 + min_pos_iou=0.3, # 将框作为正样本的最小 IoU 阈值 + match_low_quality=True, # 是否匹配低质量的框(更多细节见 API 文档) + ignore_iof_thr=-1), # 忽略 bbox 的 IoF 阈值 + sampler=dict( # 正/负采样器(sampler)的配置 + type='RandomSampler', # 采样器类型,还支持 PseudoSampler 和其他采样器,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/samplers/random_sampler.py#L14 + num=256, # 样本数量。 + pos_fraction=0.5, # 正样本占总样本的比例 + neg_pos_ub=-1, # 基于正样本数量的负样本上限 + add_gt_as_proposals=False), # 采样后是否添加 GT 作为 proposal + allowed_border=-1, # 填充有效锚点后允许的边框 + pos_weight=-1, # 训练期间正样本的权重 + debug=False), # 是否设置调试(debug)模式 + rpn_proposal=dict( # 在训练期间生成 proposals 的配置 + nms_across_levels=False, # 是否对跨层的 box 做 NMS。仅适用于 `GARPNHead` ,naive rpn 不支持 nms cross levels + nms_pre=2000, # NMS 前的 box 数 + nms_post=1000, # NMS 要保留的 box 的数量,只在 GARPNHHead 中起作用 + max_per_img=1000, # NMS 后要保留的 box 数量 + nms=dict( # NMS 的配置 + type='nms', # NMS 的类别 + iou_threshold=0.7 # NMS 的阈值 + ), + min_bbox_size=0), # 允许的最小 box 尺寸 + rcnn=dict( # roi head 的配置。 + assigner=dict( # 第二阶段分配器的配置,这与 rpn 中的不同 + type='MaxIoUAssigner', # 分配器的类型,MaxIoUAssigner 目前用于所有 roi_heads。更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/assigners/max_iou_assigner.py#L14 + pos_iou_thr=0.5, # IoU >= 0.5(阈值)被认为是正样本 + neg_iou_thr=0.5, # IoU < 0.5(阈值)被认为是负样本 + min_pos_iou=0.5, # 将 box 作为正样本的最小 IoU 阈值 + match_low_quality=False, # 是否匹配低质量下的 box(有关更多详细信息,请参阅 API 文档) + ignore_iof_thr=-1), # 忽略 bbox 的 IoF 阈值 + sampler=dict( + type='RandomSampler', # 采样器的类型,还支持 PseudoSampler 和其他采样器,更多细节请参考 https://github.com/open-mmlab/mmdetection/blob/main/mmdet/models/task_modules/samplers/random_sampler.py#L14 + num=512, # 样本数量 + pos_fraction=0.25, # 正样本占总样本的比例 + neg_pos_ub=-1, # 基于正样本数量的负样本上限 + add_gt_as_proposals=True + ), # 采样后是否添加 GT 作为 proposal + mask_size=28, # mask 的大小 + pos_weight=-1, # 训练期间正样本的权重 + debug=False)), 
# 是否设置调试模式 + test_cfg = dict( # 用于测试 rpn 和 rcnn 超参数的配置 + rpn=dict( # 测试阶段生成 proposals 的配置 + nms_across_levels=False, # 是否对跨层的 box 做 NMS。仅适用于 `GARPNHead`,naive rpn 不支持做 NMS cross levels + nms_pre=1000, # NMS 前的 box 数 + nms_post=1000, # NMS 要保留的 box 的数量,只在 `GARPNHHead` 中起作用 + max_per_img=1000, # NMS 后要保留的 box 数量 + nms=dict( # NMS 的配置 + type='nms', # NMS 的类型 + iou_threshold=0.7 # NMS 阈值 + ), + min_bbox_size=0), # box 允许的最小尺寸 + rcnn=dict( # roi heads 的配置 + score_thr=0.05, # bbox 的分数阈值 + nms=dict( # 第二步的 NMS 配置 + type='nms', # NMS 的类型 + iou_thr=0.5), # NMS 的阈值 + max_per_img=100, # 每张图像的最大检测次数 + mask_thr_binary=0.5))) # mask 预处的阈值 +``` + +### 数据集和评测器配置 + +在使用[执行器](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html) 进行训练、测试、验证时,我们需要配置 [Dataloader](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/dataset.html)。构建数据 dataloader 需要设置数据集(dataset)和数据处理流程(data pipeline)。 由于这部分的配置较为复杂,我们使用中间变量来简化 dataloader 配置的编写。 + +```python +dataset_type = 'CocoDataset' # 数据集类型,这将被用来定义数据集。 +data_root = 'data/coco/' # 数据的根路径。 + +train_pipeline = [ # 训练数据处理流程 + dict(type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像。 + dict( + type='LoadAnnotations', # 第 2 个流程,对于当前图像,加载它的注释信息。 + with_bbox=True, # 是否使用标注框(bounding box), 目标检测需要设置为 True。 + with_mask=True, # 是否使用 instance mask,实例分割需要设置为 True。 + poly2mask=False), # 是否将 polygon mask 转化为 instance mask, 设置为 False 以加速和节省内存。 + dict( + type='Resize', # 变化图像和其标注大小的流程。 + scale=(1333, 800), # 图像的最大尺寸 + keep_ratio=True # 是否保持图像的长宽比。 + ), + dict( + type='RandomFlip', # 翻转图像和其标注的数据增广流程。 + prob=0.5), # 翻转图像的概率。 + dict(type='PackDetInputs') # 将数据转换为检测器输入格式的流程 +] +test_pipeline = [ # 测试数据处理流程 + dict(type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像。 + dict(type='Resize', scale=(1333, 800), keep_ratio=True), # 变化图像大小的流程。 + dict( + type='PackDetInputs', # 将数据转换为检测器输入格式的流程 + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( # 训练 dataloader 配置 + batch_size=2, # 单个 GPU 的 batch size + num_workers=2, # 单个 GPU 分配的数据加载线程数 + persistent_workers=True, # 如果设置为 True,dataloader 在迭代完一轮之后不会关闭数据读取的子进程,可以加速训练 + sampler=dict( # 训练数据的采样器 + type='DefaultSampler', # 默认的采样器,同时支持分布式和非分布式训练。请参考 https://mmengine.readthedocs.io/zh_CN/latest/api/generated/mmengine.dataset.DefaultSampler.html#mmengine.dataset.DefaultSampler + shuffle=True), # 随机打乱每个轮次训练数据的顺序 + batch_sampler=dict(type='AspectRatioBatchSampler'), # 批数据采样器,用于确保每一批次内的数据拥有相似的长宽比,可用于节省显存 + dataset=dict( # 训练数据集的配置 + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', # 标注文件路径 + data_prefix=dict(img='train2017/'), # 图片路径前缀 + filter_cfg=dict(filter_empty_gt=True, min_size=32), # 图片和标注的过滤配置 + pipeline=train_pipeline)) # 这是由之前创建的 train_pipeline 定义的数据处理流程。 +val_dataloader = dict( # 验证 dataloader 配置 + batch_size=1, # 单个 GPU 的 Batch size。如果 batch-szie > 1,组成 batch 时的额外填充会影响模型推理精度 + num_workers=2, # 单个 GPU 分配的数据加载线程数 + persistent_workers=True, # 如果设置为 True,dataloader 在迭代完一轮之后不会关闭数据读取的子进程,可以加速训练 + drop_last=False, # 是否丢弃最后未能组成一个批次的数据 + sampler=dict( + type='DefaultSampler', + shuffle=False), # 验证和测试时不打乱数据顺序 + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, # 开启测试模式,避免数据集过滤图片和标注 + pipeline=test_pipeline)) +test_dataloader = val_dataloader # 测试 dataloader 配置 +``` + +[评测器](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/evaluation.html) 用于计算训练模型在验证和测试数据集上的指标。评测器的配置由一个或一组评价指标(Metric)配置组成: + +```python +val_evaluator = dict( # 验证过程使用的评测器 + 
type='CocoMetric', # 用于评估检测和实例分割的 AR、AP 和 mAP 的 coco 评价指标 + ann_file=data_root + 'annotations/instances_val2017.json', # 标注文件路径 + metric=['bbox', 'segm'], # 需要计算的评价指标,`bbox` 用于检测,`segm` 用于实例分割 + format_only=False) +test_evaluator = val_evaluator # 测试过程使用的评测器 +``` + +由于测试数据集没有标注文件,因此 MMDetection 中的 test_dataloader 和 test_evaluator 配置通常等于val。 如果要保存在测试数据集上的检测结果,则可以像这样编写配置: + +```python +# 在测试集上推理, +# 并将检测结果转换格式以用于提交结果 +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'annotations/image_info_test-dev2017.json', + data_prefix=dict(img='test2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/image_info_test-dev2017.json', + metric=['bbox', 'segm'], + format_only=True, # 只将模型输出转换为 coco 的 JSON 格式并保存 + outfile_prefix='./work_dirs/coco_detection/test') # 要保存的 JSON 文件的前缀 +``` + +### 训练和测试的配置 + +MMEngine 的 Runner 使用 Loop 来控制训练,验证和测试过程。 +用户可以使用这些字段设置最大训练轮次和验证间隔。 + +```python +train_cfg = dict( + type='EpochBasedTrainLoop', # 训练循环的类型,请参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py + max_epochs=12, # 最大训练轮次 + val_interval=1) # 验证间隔。每个 epoch 验证一次 +val_cfg = dict(type='ValLoop') # 验证循环的类型 +test_cfg = dict(type='TestLoop') # 测试循环的类型 +``` + +### 优化相关配置 + +`optim_wrapper` 是配置优化相关设置的字段。优化器封装(OptimWrapper)不仅提供了优化器的功能,还支持梯度裁剪、混合精度训练等功能。更多内容请看[优化器封装教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/optim_wrapper.html) 。 + +```python +optim_wrapper = dict( # 优化器封装的配置 + type='OptimWrapper', # 优化器封装的类型。可以切换至 AmpOptimWrapper 来启用混合精度训练 + optimizer=dict( # 优化器配置。支持 PyTorch 的各种优化器。请参考 https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # 随机梯度下降优化器 + lr=0.02, # 基础学习率 + momentum=0.9, # 带动量的随机梯度下降 + weight_decay=0.0001), # 权重衰减 + clip_grad=None, # 梯度裁剪的配置,设置为 None 关闭梯度裁剪。使用方法请见 https://mmengine.readthedocs.io/en/latest/tutorials/optimizer.html + ) +``` + +`param_scheduler` 字段用于配置参数调度器(Parameter Scheduler)来调整优化器的超参数(例如学习率和动量)。 用户可以组合多个调度器来创建所需的参数调整策略。 在 [参数调度器教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/param_scheduler.html) 和 [参数调度器 API 文档](https://mmengine.readthedocs.io/zh_CN/latest/api/generated/mmengine.optim._ParamScheduler.html#mmengine.optim._ParamScheduler) 中查找更多信息。 + +```python +param_scheduler = [ + dict( + type='LinearLR', # 使用线性学习率预热 + start_factor=0.001, # 学习率预热的系数 + by_epoch=False, # 按 iteration 更新预热学习率 + begin=0, # 从第一个 iteration 开始 + end=500), # 到第 500 个 iteration 结束 + dict( + type='MultiStepLR', # 在训练过程中使用 multi step 学习率策略 + by_epoch=True, # 按 epoch 更新学习率 + begin=0, # 从第一个 epoch 开始 + end=12, # 到第 12 个 epoch 结束 + milestones=[8, 11], # 在哪几个 epoch 进行学习率衰减 + gamma=0.1) # 学习率衰减系数 +] +``` + +### 钩子配置 + +用户可以在训练、验证和测试循环上添加钩子,以便在运行期间插入一些操作。配置中有两种不同的钩子字段,一种是 `default_hooks`,另一种是 `custom_hooks`。 + +`default_hooks` 是一个字典,用于配置运行时必须使用的钩子。这些钩子具有默认优先级,如果未设置,runner 将使用默认值。如果要禁用默认钩子,用户可以将其配置设置为 `None`。更多内容请看 [钩子教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/hook.html) 。 + +```python +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='DetVisualizationHook')) +``` + +`custom_hooks` 是一个列表。用户可以在这个字段中加入自定义的钩子。 + +```python +custom_hooks = [] +``` + 
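例如,如果希望在训练中对模型权重做指数移动平均(EMA),可以向 `custom_hooks` 中加入 MMEngine 提供的 `EMAHook`。下面是一个最小示例,其中 `ExpMomentumEMA`、`momentum=0.0002` 等取值参考了 MMDetection 中 RTMDet 等配置的常见写法,实际使用时请按需调整:

```python
custom_hooks = [
    dict(
        type='EMAHook',  # MMEngine 内置的 EMA 钩子
        ema_type='ExpMomentumEMA',  # EMA 的具体实现,MMDetection 中常用的指数动量 EMA
        momentum=0.0002,  # EMA 动量,此处仅为示例取值
        update_buffers=True,  # 是否同时对 BN 等 buffer 做 EMA
        priority=49)  # 数值越小优先级越高,49 表示略高于默认钩子的 NORMAL(50)
]
```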
+### 运行相关配置 + +```python +default_scope = 'mmdet' # 默认的注册器域名,默认从此注册器域中寻找模块。请参考 https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/registry.html + +env_cfg = dict( + cudnn_benchmark=False, # 是否启用 cudnn benchmark + mp_cfg=dict( # 多进程设置 + mp_start_method='fork', # 使用 fork 来启动多进程。'fork' 通常比 'spawn' 更快,但可能存在隐患。请参考 https://github.com/pytorch/pytorch/issues/1355 + opencv_num_threads=0), # 关闭 opencv 的多线程以避免系统超负荷 + dist_cfg=dict(backend='nccl'), # 分布式相关设置 +) + +vis_backends = [dict(type='LocalVisBackend')] # 可视化后端,请参考 https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict( + type='LogProcessor', # 日志处理器用于处理运行时日志 + window_size=50, # 日志数值的平滑窗口 + by_epoch=True) # 是否使用 epoch 格式的日志。需要与训练循环的类型保存一致。 + +log_level = 'INFO' # 日志等级 +load_from = None # 从给定路径加载模型检查点作为预训练模型。这不会恢复训练。 +resume = False # 是否从 `load_from` 中定义的检查点恢复。 如果 `load_from` 为 None,它将恢复 `work_dir` 中的最新检查点。 +``` + +## Iter-based 配置 + +MMEngine 的 Runner 除了基于轮次的训练循环(epoch)外,还提供了基于迭代(iteration)的训练循环。 +要使用基于迭代的训练,用户应该修改 `train_cfg`、`param_scheduler`、`train_dataloader`、`default_hooks` 和 `log_processor`。 +以下是将基于 epoch 的 RetinaNet 配置更改为基于 iteration 的示例:configs/retinanet/retinanet_r50_fpn_90k_coco.py + +```python +# iter-based 训练配置 +train_cfg = dict( + _delete_=True, # 忽略继承的配置文件中的值(可选) + type='IterBasedTrainLoop', # iter-based 训练循环 + max_iters=90000, # 最大迭代次数 + val_interval=10000) # 每隔多少次进行一次验证 + + +# 将参数调度器修改为 iter-based +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=90000, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +# 切换至 InfiniteSampler 来避免 dataloader 重启 +train_dataloader = dict(sampler=dict(type='InfiniteSampler')) + +# 将模型检查点保存间隔设置为按 iter 保存 +default_hooks = dict(checkpoint=dict(by_epoch=False, interval=10000)) + +# 将日志格式修改为 iter-based +log_processor = dict(by_epoch=False) +``` + +## 配置文件继承 + +在 `config/_base_` 文件夹下有 4 个基本组件类型,分别是:数据集(dataset),模型(model),训练策略(schedule)和运行时的默认设置(default runtime)。许多方法,例如 Faster R-CNN、Mask R-CNN、Cascade R-CNN、RPN、SSD 能够很容易地构建出来。由 `_base_` 下的组件组成的配置,被我们称为 _原始配置(primitive)_。 + +对于同一文件夹下的所有配置,推荐**只有一个**对应的**原始配置**文件。所有其他的配置文件都应该继承自这个**原始配置**文件。这样就能保证配置文件的最大继承深度为 3。 + +为了便于理解,我们建议贡献者继承现有方法。例如,如果在 Faster R-CNN 的基础上做了一些修改,用户首先可以通过指定 `_base_ = ../faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py` 来继承基础的 Faster R-CNN 结构,然后修改配置文件中的必要参数以完成继承。 + +如果你在构建一个与任何现有方法不共享结构的全新方法,那么可以在 `configs` 文件夹下创建一个新的例如 `xxx_rcnn` 文件夹。 + +更多细节请参考 [MMEngine 配置文件教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html) 。 + +通过设置 `_base_` 字段,我们可以设置当前配置文件继承自哪些文件。 + +当 `_base_` 为文件路径字符串时,表示继承一个配置文件的内容。 + +```python +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' +``` + +当 `_base_` 是多个文件路径的列表时,表示继承多个文件。 + +```python +_base_ = [ + '../_base_/models/mask-rcnn_r50_fpn.py', + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +``` + +如果需要检查配置文件,可以通过运行 `python tools/misc/print_config.py /PATH/TO/CONFIG` 来查看完整的配置。 + +### 忽略基础配置文件里的部分内容 + +有时,您也许会设置 `_delete_=True` 去忽略基础配置文件里的一些域内容。 您也许可以参照 [MMEngine 配置文件教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html) 来获得一些简单的指导。 + +在 MMDetection 里,例如为了改变 Mask R-CNN 的主干网络的某些内容: + +```python +model = dict( + type='MaskRCNN', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + 
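        # 补充注释:frozen_stages=1 表示冻结 stem 和第 1 个 stage 的参数,设为 -1 则不冻结任何层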
norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict(...), + rpn_head=dict(...), + roi_head=dict(...)) +``` + +基础配置的 `Mask R-CNN` 使用 `ResNet-50`,在需要将主干网络改成 `HRNet` 的时候,因为 `HRNet` 和 `ResNet` 中有不同的字段,需要使用 `_delete_=True` 将新的键去替换 `backbone` 域内所有老的键。 + +```python +_base_ = '../mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='HRNet', + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + init_cfg=dict(type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w32')), + neck=dict(...)) +``` + +### 使用配置文件里的中间变量 + +配置文件里会使用一些中间变量,例如数据集里的 `train_pipeline`/`test_pipeline`。我们在定义新的 `train_pipeline`/`test_pipeline` 之后,需要将它们传递到 `data` 里。例如,我们想在训练或测试时,改变 Mask R-CNN 的多尺度策略 (multi scale strategy),`train_pipeline`/`test_pipeline` 是我们想要修改的中间变量。 + +```python +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +``` + +我们首先定义新的 `train_pipeline`/`test_pipeline` 然后传递到 `data` 里。 + +同样的,如果我们想从 `SyncBN` 切换到 `BN` 或者 `MMSyncBN`,我们需要修改配置文件里的每一个 `norm_cfg`。 + +```python +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + backbone=dict(norm_cfg=norm_cfg), + neck=dict(norm_cfg=norm_cfg), + ...) +``` + +### 复用 \_base\_ 文件中的变量 + +如果用户希望在当前配置中复用 base 文件中的变量,则可以通过使用 `{{_base_.xxx}}` 的方式来获取对应变量的拷贝。例如: + +```python +_base_ = './mask-rcnn_r50_fpn_1x_coco.py' + +a = {{_base_.model}} # 变量 a 等于 _base_ 中定义的 model +``` + +## 通过脚本参数修改配置 + +当运行 `tools/train.py` 和 `tools/test.py` 时,可以通过 `--cfg-options` 来修改配置文件。 + +- 更新字典链中的配置 + + 可以按照原始配置文件中的 dict 键顺序地指定配置预选项。例如,使用 `--cfg-options model.backbone.norm_eval=False` 将模型主干网络中的所有 BN 模块都改为 `train` 模式。 + +- 更新配置列表中的键 + + 在配置文件里,一些字典型的配置被包含在列表中。例如,数据训练流程 `data.train.pipeline` 通常是一个列表,比如 `[dict(type='LoadImageFromFile'), ...]`。如果需要将 `'LoadImageFromFile'` 改成 `'LoadImageFromWebcam'`,需要写成下述形式: `--cfg-options data.train.pipeline.0.type=LoadImageFromNDArray`. + +- 更新列表或元组的值 + + 如果要更新的值是列表或元组。例如,配置文件通常设置 `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`. 
如果需要改变这个键,可以通过 `--cfg-options model.data_preprocessor.mean="[127,127,127]"` 来重新设置。需要注意,引号 " 是支持列表或元组数据类型所必需的,并且在指定值的引号内**不允许**有空格。 + +## 配置文件名称风格 + +我们遵循以下样式来命名配置文件。建议贡献者遵循相同的风格。 + +``` +{algorithm name}_{model component names [component1]_[component2]_[...]}_{training settings}_{training dataset information}_{testing dataset information}.py +``` + +文件名分为五个部分。 每个部分用`_`连接,每个部分内的单词应该用`-`连接。 + +- `{algorithm name}`: 算法的名称。 它可以是检测器名称,例如 `faster-rcnn`、`mask-rcnn` 等。也可以是半监督或知识蒸馏算法,例如 `soft-teacher`、`lad` 等等 +- `{component names}`: 算法中使用的组件名称,如 backbone、neck 等。例如 `r50-caffe_fpn_gn-head` 表示在算法中使用 caffe 版本的 ResNet50、FPN 和 使用了 Group Norm 的检测头。 +- `{training settings}`: 训练设置的信息,例如 batch 大小、数据增强、损失、参数调度方式和训练最大轮次/迭代。 例如:`4xb4-mixup-giou-coslr-100e` 表示使用 8 个 gpu 每个 gpu 4 张图、mixup 数据增强、GIoU loss、余弦退火学习率,并训练 100 个 epoch。 + 缩写介绍: + - `{gpu x batch_per_gpu}`: GPU 数和每个 GPU 的样本数。`bN` 表示每个 GPU 上的 batch 大小为 N。例如 `4x4b` 是 4 个 GPU 每个 GPU 4 张图的缩写。如果没有注明,默认为 8 卡每卡 2 张图。 + - `{schedule}`: 训练方案,选项是 `1x`、 `2x`、 `20e` 等。`1x` 和 `2x` 分别代表 12 epoch 和 24 epoch,`20e` 在级联模型中使用,表示 20 epoch。对于 `1x`/`2x`,初始学习率在第 8/16 和第 11/22 epoch 衰减 10 倍;对于 `20e` ,初始学习率在第 16 和第 19 epoch 衰减 10 倍。 +- `{training dataset information}`: 训练数据集,例如 `coco`, `coco-panoptic`, `cityscapes`, `voc-0712`, `wider-face`。 +- `{testing dataset information}` (可选): 测试数据集,用于训练和测试在不同数据集上的模型配置。 如果没有注明,则表示训练和测试的数据集类型相同。 diff --git a/mmdetection/docs/zh_cn/user_guides/dataset_prepare.md b/mmdetection/docs/zh_cn/user_guides/dataset_prepare.md new file mode 100644 index 0000000..a8bf320 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/dataset_prepare.md @@ -0,0 +1,307 @@ +## 数据集准备 + +### 基础检测数据集准备 + +MMDetection 支持多个公共数据集,包括 [COCO](https://cocodataset.org/), [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC), [Cityscapes](https://www.cityscapes-dataset.com/) 和 [其他更多数据集](https://github.com/open-mmlab/mmdetection/tree/main/configs/_base_/datasets)。 + +一些公共数据集,比如 Pascal VOC 及其镜像数据集,或者 COCO 等数据集都可以从官方网站或者镜像网站获取。注意:在检测任务中,Pascal VOC 2012 是 Pascal VOC 2007 的无交集扩展,我们通常将两者一起使用。 我们建议将数据集下载,然后解压到项目外部的某个文件夹内,然后通过符号链接的方式,将数据集根目录链接到 `$MMDETECTION/data` 文件夹下, 如果你的文件夹结构和下方不同的话,你需要在配置文件中改变对应的路径。 + +我们提供了下载 COCO 等数据集的脚本,你可以运行 `python tools/misc/download_dataset.py --dataset-name coco2017` 下载 COCO 数据集。 对于中国境内的用户,我们也推荐通过开源数据平台 [OpenDataLab](https://opendatalab.com/?source=OpenMMLab%20GitHub) 来下载数据,以获得更好的下载体验。 + +更多用法请参考[数据集下载](./useful_tools.md#dataset-download) + +```text +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ ├── cityscapes +│ │ ├── annotations +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +│ ├── VOCdevkit +│ │ ├── VOC2007 +│ │ ├── VOC2012 +``` + +有些模型需要额外的 [COCO-stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip) 数据集,比如 HTC,DetectoRS 和 SCNet,你可以下载并解压它们到 `coco` 文件夹下。文件夹会是如下结构: + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── stuffthingmaps +``` + +PanopticFPN 等全景分割模型需要额外的 [COCO Panoptic](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip) 数据集,你可以下载并解压它们到 `coco/annotations` 文件夹下。文件夹会是如下结构: + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +Cityscape 
数据集的标注格式需要转换,以与 COCO 数据集标注格式保持一致,使用 `tools/dataset_converters/cityscapes.py` 来完成转换: + +```shell +pip install cityscapesscripts + +python tools/dataset_converters/cityscapes.py \ + ./data/cityscapes \ + --nproc 8 \ + --out-dir ./data/cityscapes/annotations +``` + +### COCO Caption 数据集准备 + +COCO Caption 采用的是 COCO2014 数据集作为图片,并且使用了 karpathy 的标注, + +首先你需要下载 COCO2014 数据集 + +```shell +python tools/misc/download_dataset.py --dataset-name coco2014 --unzip +``` + +数据集会下载到当前路径的 `data/coco` 下。然后下载 karpathy 的标注 + +```shell +cd data/coco/annotations +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json +wget https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json +``` + +最终直接可用于训练和测试的数据集文件夹结构如下: + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── coco_karpathy_train.json +│ │ │ ├── coco_karpathy_test.json +│ │ │ ├── coco_karpathy_val.json +│ │ │ ├── coco_karpathy_val_gt.json +│ │ │ ├── coco_karpathy_test_gt.json +│ │ ├── train2014 +│ │ ├── val2014 +│ │ ├── test2014 +``` + +### COCO semantic 数据集准备 + +COCO 语义分割有两种类型标注,主要差别在于类别名定义不一样,因此处理方式也有两种,第一种是直接使用 stuffthingmaps 数据集,第二种是使用 panoptic 数据集。 + +**(1) 使用 stuffthingmaps 数据集** + +该数据集的下载地址为 [stuffthingmaps_trainval2017](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip),请下载后解压到 `data/coco` 文件夹下。 + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── stuffthingmaps +``` + +该数据集不同于标准的 COCO 类别标注,其包括 172 个类: 80 thing 类、91 stuff 类和 1 个 'unlabeled',其每个类别的说明见 https://github.com/nightrome/cocostuff/blob/master/labels.md + +虽然只标注了 172 个类别,但是 `stuffthingmaps` 中最大标签 id 是 182,中间有些类别是没有标注的,并且第 0 类的 `unlabeled` 类别被移除。因此最终的 `stuffthingmaps` 图片中每个位置的值对应的类别关系见 https://github.com/kazuto1011/deeplab-pytorch/blob/master/data/datasets/cocostuff/labels.txt + +考虑到训练高效和方便用户,在开启训练或者评估前,我们需要将没有标注的 12 个类移除,这 12 个类的名字为: `street sign、hat、shoe、eye glasses、plate、mirror、window、desk、door、blender、hair brush`,最终可用于训练和评估的类别信息见 `mmdet/datasets/coco_semantic.py` + +你可以使用 `tools/dataset_converters/coco_stuff164k.py` 来完成将下载的 `stuffthingmaps` 转换为直接可以训练和评估的数据集,转换后的数据集文件夹结构如下: + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── stuffthingmaps +│ │ ├── stuffthingmaps_semseg +``` + +`stuffthingmaps_semseg` 即为新生成的可以直接训练和测试的 COCO 语义分割数据集。 + +**(2) 使用 panoptic 数据集** + +通过 panoptic 标注生成的语义分割数据集类别数相比使用 `stuffthingmaps` 数据集生成的会少一些。首先你需要准备全景分割标注,然后使用如下脚本完成转换 + +```shell +python tools/dataset_converters/prepare_coco_semantic_annos_from_panoptic_annos.py data/coco +``` + +转换后的数据集文件夹结构如下: + +```text +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── panoptic_train2017.json +│ │ │ ├── panoptic_train2017 +│ │ │ ├── panoptic_val2017.json +│ │ │ ├── panoptic_val2017 +│ │ │ ├── panoptic_semseg_train2017 +│ │ │ ├── panoptic_semseg_val2017 +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +``` + +`panoptic_semseg_train2017` 和 `panoptic_semseg_val2017` 即为新生成的可以直接训练和测试的 COCO 语义分割数据集。注意其类别信息就是 COCO 全景分割的类别信息,包括 thing 和 stuff。 + +### RefCOCO 数据集准备 + 
+[RefCOCO](https://github.com/lichengunc/refer)系列数据集的图像和注释可以通过运行 `tools/misc/download_dataset.py` 下载: + +```shell +python tools/misc/download_dataset.py --dataset-name refcoco --save-dir data/coco --unzip +``` + +然后,目录应该是这样的: + +```text +data +├── coco +│ ├── refcoco +│   │   ├── instances.json +│   │   ├── refs(google).p +│   │   └── refs(unc).p +│   ├── refcoco+ +│   │   ├── instances.json +│   │   └── refs(unc).p +│   ├── refcocog +│   │   ├── instances.json +│   │   ├── refs(google).p +│   │   └── refs(umd).p +| |── train2014 +``` + +### ADE20K 数据集准备 + +[ADE20K](http://groups.csail.mit.edu/vision/datasets/ADE20K/)数据集的图像和注释可以通过运行 `tools/misc/download_dataset.py` 下载: + +```shell +python tools/misc/download_dataset.py --dataset-name ade20k_2016 --save-dir data --unzip +``` + +然后将注释移至`data/ADEChallengeData2016`目录,并运行预处理脚本以产生coco格式注释: + +```shell +mv data/annotations_instance data/ADEChallengeData2016/ +mv data/categoryMapping.txt data/ADEChallengeData2016/ +mv data/imgCatIds.json data/ADEChallengeData2016/ +python tools/dataset_converters/ade20k2coco.py data/ADEChallengeData2016 --task panoptic +python tools/dataset_converters/ade20k2coco.py data/ADEChallengeData2016 --task instance +``` + +然后,目录应该是这样的: + +```text +data +├── ADEChallengeData2016 +│   ├── ade20k_instance_train.json +│   ├── ade20k_instance_val.json +│   ├── ade20k_panoptic_train +| | ├── ADE_train_00000001.png +| | ├── ADE_train_00000002.png +| | ├── ... +│   ├── ade20k_panoptic_train.json +│   ├── ade20k_panoptic_val +| | ├── ADE_val_00000001.png +| | ├── ADE_val_00000002.png +| | ├── ... +│   ├── ade20k_panoptic_val.json +│   ├── annotations +| | ├── training +| | | ├── ADE_train_00000001.png +| | | ├── ADE_train_00000002.png +| | | ├── ... +| | ├── validation +| | | ├── ADE_val_00000001.png +| | | ├── ADE_val_00000002.png +| | | ├── ... +│   ├── annotations_instance +| | ├── training +| | | ├── ADE_train_00000001.png +| | | ├── ADE_train_00000002.png +| | | ├── ... +| | ├── validation +| | | ├── ADE_val_00000001.png +| | | ├── ADE_val_00000002.png +| | | ├── ... +│   ├── categoryMapping.txt +│   ├── images +│   | ├── training +| | | ├── ADE_train_00000001.jpg +| | | ├── ADE_train_00000002.jpg +| | | ├── ... +| | ├── validation +| | | ├── ADE_val_00000001.jpg +| | | ├── ADE_val_00000002.jpg +| | | ├── ... 
+│   ├── imgCatIds.json +│   ├── objectInfo150.txt +| |── sceneCategories.txt +``` + +上述文件夹包括ADE20K的语义分割、实例分割和泛在分割的所有数据。 + +### 从 OpenDataLab 中下载 + +[OpenDataLab](https://opendatalab.com/) 为人工智能研究者提供免费开源的数据集,通过 OpenDataLab,研究者可以获得格式统一的各领域经典数据集。通过平台的搜索功能,研究者可以迅速便捷地找到自己所需数据集;通过平台的统一格式,研究者可以便捷地对跨数据集任务进行开发。 + +目前,MIM 支持使用一条命令行从 OpenDataLab 中下载 VOC 和 COCO 数据集,后续将支持更多数据集。你也可以直接访问 OpenDataLab 平台下载你所需的数据集,然后将其转化为 MMDetection 所要求的格式。 + +如果使用 MIM 下载,请确保版本大于 v0.3.8,你可以使用如下命令更新: + +```Bash +pip install -U openmim +``` + +```Bash +# install OpenXLab CLI tools +pip install -U openxlab +# log in OpenXLab, registry +openxlab login + +# download voc2007 and preprocess by MIM +mim download mmdet --dataset voc2007 + +# download voc2012 and preprocess by MIM +mim download mmdet --dataset voc2012 + +# download coco2017 and preprocess by MIM +mim download mmdet --dataset coco2017 +``` diff --git a/mmdetection/docs/zh_cn/user_guides/deploy.md b/mmdetection/docs/zh_cn/user_guides/deploy.md new file mode 100644 index 0000000..f796b00 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/deploy.md @@ -0,0 +1,174 @@ +# 模型部署 + +[MMDeploy](https://github.com/open-mmlab/mmdeploy) 是 OpenMMLab 的部署仓库,负责包括 MMPretrain、MMDetection 等在内的各算法库的部署工作。 +你可以从[这里](https://mmdeploy.readthedocs.io/zh_CN/1.x/04-supported-codebases/mmdet.html)获取 MMDeploy 对 MMDetection 部署支持的最新文档。 + +本文的结构如下: + +- [安装](#安装) +- [模型转换](#模型转换) +- [模型规范](#模型规范) +- [模型推理](#模型推理) + - [后端模型推理](#后端模型推理) + - [SDK 模型推理](#sdk-模型推理) +- [模型支持列表](#模型支持列表) +- + +## 安装 + +请参考[此处](https://mmdetection.readthedocs.io/en/latest/get_started.html)安装 mmdet。然后,按照[说明](https://mmdeploy.readthedocs.io/zh_CN/1.x/get_started.html#mmdeploy)安装 mmdeploy。 + +```{note} +如果安装的是 mmdeploy 预编译包,那么也请通过 'git clone https://github.com/open-mmlab/mmdeploy.git --depth=1' 下载 mmdeploy 源码。因为它包含了部署时要用到的配置文件 +``` + +## 模型转换 + +假设在安装步骤中,mmdetection 和 mmdeploy 代码库在同级目录下,并且当前的工作目录为 mmdetection 的根目录,那么以 [Faster R-CNN](https://github.com/open-mmlab/mmdetection/blob/main/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py) 模型为例,你可以从[此处](https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth)下载对应的 checkpoint,并使用以下代码将之转换为 onnx 模型: + +```python +from mmdeploy.apis import torch2onnx +from mmdeploy.backend.sdk.export_info import export2SDK + +img = 'demo/demo.jpg' +work_dir = 'mmdeploy_models/mmdet/onnx' +save_file = 'end2end.onnx' +deploy_cfg = '../mmdeploy/configs/mmdet/detection/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +model_checkpoint = 'faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth' +device = 'cpu' + +# 1. convert model to onnx +torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, + model_checkpoint, device) + +# 2. 
extract pipeline info for inference by MMDeploy SDK +export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, + device=device) +``` + +转换的关键之一是使用正确的配置文件。项目中已内置了各后端部署[配置文件](https://github.com/open-mmlab/mmdeploy/tree/1.x/configs/mmdet)。 +文件的命名模式是: + +``` +{task}/{task}_{backend}-{precision}_{static | dynamic}_{shape}.py +``` + +其中: + +- **{task}:** mmdet 中的任务 + + mmdet 任务有2种:物体检测(detection)、实例分割(instance-seg)。例如,`RetinaNet`、`Faster R-CNN`、`DETR`等属于前者。`Mask R-CNN`、`SOLO`等属于后者。更多`模型-任务`的划分,请参考章节[模型支持列表](#模型支持列表)。 + + **请务必**使用 `detection/detection_*.py` 转换检测模型,使用 `instance-seg/instance-seg_*.py` 转换实例分割模型。 + +- **{backend}:** 推理后端名称。比如,onnxruntime、tensorrt、pplnn、ncnn、openvino、coreml 等等 + +- **{precision}:** 推理精度。比如,fp16、int8。不填表示 fp32 + +- **{static | dynamic}:** 动态、静态 shape + +- **{shape}:** 模型输入的 shape 或者 shape 范围 + +在上例中,你也可以把 `Faster R-CNN` 转为其他后端模型。比如使用`detection_tensorrt-fp16_dynamic-320x320-1344x1344.py`,把模型转为 tensorrt-fp16 模型。 + +```{tip} +当转 tensorrt 模型时, --device 需要被设置为 "cuda" +``` + +## 模型规范 + +在使用转换后的模型进行推理之前,有必要了解转换结果的结构。 它存放在 `--work-dir` 指定的路路径下。 + +上例中的`mmdeploy_models/mmdet/onnx`,结构如下: + +``` +mmdeploy_models/mmdet/onnx +├── deploy.json +├── detail.json +├── end2end.onnx +└── pipeline.json +``` + +重要的是: + +- **end2end.onnx**: 推理引擎文件。可用 ONNX Runtime 推理 +- ***xxx*.json**: mmdeploy SDK 推理所需的 meta 信息 + +整个文件夹被定义为**mmdeploy SDK model**。换言之,**mmdeploy SDK model**既包括推理引擎,也包括推理 meta 信息。 + +## 模型推理 + +## 后端模型推理 + +以上述模型转换后的 `end2end.onnx` 为例,你可以使用如下代码进行推理: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = '../mmdeploy/configs/mmdet/detection/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py' +device = 'cpu' +backend_model = ['mmdeploy_models/mmdet/onnx/end2end.onnx'] +image = 'demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='output_detection.png') +``` + +## SDK 模型推理 + +你也可以参考如下代码,对 SDK model 进行推理: + +```python +from mmdeploy_python import Detector +import cv2 + +img = cv2.imread('demo/demo.jpg') + +# create a detector +detector = Detector(model_path='mmdeploy_models/mmdet/onnx', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('output_detection.png', img) +``` + +除了python API,mmdeploy SDK 还提供了诸如 C、C++、C#、Java等多语言接口。 +你可以参考[样例](https://github.com/open-mmlab/mmdeploy/tree/1.x/demo)学习其他语言接口的使用方法。 + +## 模型支持列表 + +请参考[这里](https://mmdeploy.readthedocs.io/zh_CN/1.x/04-supported-codebases/mmdet.html#id6) diff --git a/mmdetection/docs/zh_cn/user_guides/finetune.md 
b/mmdetection/docs/zh_cn/user_guides/finetune.md new file mode 100644 index 0000000..66bad94 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/finetune.md @@ -0,0 +1,96 @@ +# 模型微调 + +在 COCO 数据集上预训练的检测器可以作为其他数据集(例如 CityScapes 和 KITTI 数据集)优质的预训练模型。 +本教程将指导用户如何把 [ModelZoo](../model_zoo.md) 中提供的模型用于其他数据集中并使得当前所训练的模型获得更好性能。 + +以下是在新数据集中微调模型需要的两个步骤。 + +- 按 [教程2:自定义数据集](../advanced_guides/customize_dataset.md) 中的方法对新数据集添加支持中的方法对新数据集添加支持 +- 按照本教程中所讨论方法,修改配置信息 + +接下来将会以 Cityscapes Dataset 上的微调过程作为例子,具体讲述用户需要在配置中修改的五个部分。 + +## 继承基础配置 + +为了减轻编写整个配置的负担并减少漏洞的数量, MMDetection V3.0 支持从多个现有配置中继承配置信息。微调 MaskRCNN 模型的时候,新的配置信息需要使用从 `_base_/models/mask_rcnn_r50_fpn.py` 中继承的配置信息来构建模型的基本结构。当使用 Cityscapes 数据集时,新的配置信息可以简便地从`_base_/datasets/cityscapes_instance.py` 中继承。对于训练过程的运行设置部分,例如 `logger settings`,配置文件可以从 `_base_/default_runtime.py` 中继承。对于训练计划的配置则可以从`_base_/schedules/schedule_1x.py` 中继承。这些配置文件存放于 `configs` 目录下,用户可以选择全部内容的重新编写而不是使用继承方法。 + +```python +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_1x.py' +] +``` + +## Head 的修改 + +接下来新的配置还需要根据新数据集的类别数量对 Head 进行修改。只需要对 roi_head 中的 `num_classes`进行修改。修改后除了最后的预测模型的 Head 之外,预训练模型的权重的大部分都会被重新使用。 + +```python +model = dict( + roi_head=dict( + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=8, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) +``` + +## 数据集的修改 + +用户可能还需要准备数据集并编写有关数据集的配置,可在 [Customize Datasets](../advanced_guides/customize_dataset.md) 中获取更多信息。目前 MMDetection V3.0 的配置文件已经支持 VOC、WIDERFACE、COCO、LIVS、OpenImages、DeepFashion、Objects365 和 Cityscapes Dataset 的数据集信息。 + +## 训练策略的修改 + +微调超参数与默认的训练策略不同。它通常需要更小的学习率和更少的训练回合。 + +```python +# 优化器 +# batch size 为 8 时的 lr 配置 +optim_wrapper = dict(optimizer=dict(lr=0.01)) + +# 学习率 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=8, + by_epoch=True, + milestones=[7], + gamma=0.1) +] + +# 设置 max epoch +train_cfg = dict(max_epochs=8) + +# 设置 log config +default_hooks = dict(logger=dict(interval=100)), + +``` + +## 使用预训练模型 + +如果要使用预训练模型,可以在 `load_from` 中查阅新的配置信息,用户需要在训练开始之前下载好需要的模型权重,从而避免在训练过程中浪费了宝贵时间。 + +```python +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa +``` diff --git a/mmdetection/docs/zh_cn/user_guides/index.rst b/mmdetection/docs/zh_cn/user_guides/index.rst new file mode 100644 index 0000000..5abc50a --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/index.rst @@ -0,0 +1,34 @@ +训练 & 测试 +************** + +MMDetection 在 `Model Zoo `_ 中提供了数百个预训练的检测模型, +并支持多种标准数据集格式,包括 Pascal VOC、COCO、CityScapes、LVIS 等。本文档将展示如何使用这些模型和数据集来执行常见的训练和测试任务: + +.. 
toctree:: + :maxdepth: 1 + + config.md + inference.md + dataset_prepare.md + test.md + train.md + new_model.md + finetune.md + test_results_submission.md + init_cfg.md + single_stage_as_rpn.md + semi_det.md + + +实用工具 +************ + +.. toctree:: + :maxdepth: 1 + + useful_tools.md + useful_hooks.md + visualization.md + robustness_benchmarking.md + deploy.md + label_studio.md diff --git a/mmdetection/docs/zh_cn/user_guides/inference.md b/mmdetection/docs/zh_cn/user_guides/inference.md new file mode 100644 index 0000000..a0fb08f --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/inference.md @@ -0,0 +1,438 @@ +# 使用已有模型在标准数据集上进行推理 + +MMDetection 提供了许多预训练好的检测模型,可以在 [Model Zoo](https://mmdetection.readthedocs.io/zh_CN/latest/model_zoo.html) 查看具体有哪些模型。 + +推理具体指使用训练好的模型来检测图像上的目标,本文将会展示具体步骤。 + +在 MMDetection 中,一个模型被定义为一个[配置文件](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/config.html) 和对应被存储在 checkpoint 文件内的模型参数的集合。 + +首先,我们建议从 [RTMDet](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet) 开始,其 [配置](https://github.com/open-mmlab/mmdetection/blob/main/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py) 文件和 [checkpoint](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_l_8xb32-300e_coco/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth) 文件在此。 +我们建议将 checkpoint 文件下载到 `checkpoints` 文件夹内。 + +## 推理的高层编程接口——推理器 + +在 OpenMMLab 中,所有的推理操作都被统一到了推理器 `Inferencer` 中。推理器被设计成为一个简洁易用的 API,它在不同的 OpenMMLab 库中都有着非常相似的接口。 +下面介绍的演示样例都放在 [demo/inference_demo.ipynb](https://github.com/open-mmlab/mmdetection/blob/main/demo/inference_demo.ipynb) 中方便大家尝试。 + +### 基础用法 + +使用 `DetInferencer`,您只需 3 行代码就可以获得推理结果。 + +```python +from mmdet.apis import DetInferencer + +# 初始化模型 +inferencer = DetInferencer('rtmdet_tiny_8xb32-300e_coco') + +# 推理示例图片 +inferencer('demo/demo.jpg', show=True) +``` + +可视化结果将被显示在一个新窗口中: + +
    + +
    + +```{note} +如果你在没有 GUI 的服务器上,或者通过禁用 X11 转发的 SSH 隧道运行以上命令,`show` 选项将不起作用。然而,你仍然可以通过设置 `out_dir` 参数将可视化数据保存到文件。阅读 [储存结果](#储存结果) 了解详情。 +``` + +### 初始化 + +每个推理器必须使用一个模型进行初始化。初始化时,可以手动选择推理设备。 + +#### 模型初始化 + +- 要用 MMDetection 的预训练模型进行推理,只需要把它的名字传给参数 `model`,权重将自动从 OpenMMLab 的模型库中下载和加载。 + + ```python + inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco') + ``` + + 在 MMDetection 中有一个非常容易的方法,可以列出所有模型名称。 + + ```python + # models 是一个模型名称列表,它们将自动打印 + models = DetInferencer.list_models('mmdet') + ``` + + 你可以通过将权重的路径或 URL 传递给 `weights` 来让推理器加载自定义的权重。 + + ```python + inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco', weights='path/to/rtmdet.pth') + ``` + +- 要加载自定义的配置和权重,你可以把配置文件的路径传给 `model`,把权重的路径传给 `weights`。 + + ```python + inferencer = DetInferencer(model='path/to/rtmdet_config.py', weights='path/to/rtmdet.pth') + ``` + +- 默认情况下,[MMEngine](https://github.com/open-mmlab/mmengine/) 会在训练模型时自动将配置文件转储到权重文件中。如果你有一个在 MMEngine 上训练的权重,你也可以将权重文件的路径传递给 `weights`,而不需要指定 `model`: + + ```python + # 如果无法在权重中找到配置文件,则会引发错误。目前 MMDetection 模型库中只有 ddq-detr-4scale_r50 的权重可以这样加载。 + inferencer = DetInferencer(weights='https://download.openmmlab.com/mmdetection/v3.0/ddq/ddq-detr-4scale_r50_8xb2-12e_coco/ddq-detr-4scale_r50_8xb2-12e_coco_20230809_170711-42528127.pth') + ``` + +- 传递配置文件到 `model` 而不指定 `weights` 则会产生一个随机初始化的模型。 + +#### 推理设备 + +每个推理器实例都会跟一个设备绑定。默认情况下,最佳设备是由 [MMEngine](https://github.com/open-mmlab/mmengine/) 自动决定的。你也可以通过指定 `device` 参数来改变设备。例如,你可以使用以下代码在 GPU 1 上创建一个推理器。 + +```python +inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco', device='cuda:1') +``` + +如要在 CPU 上创建一个推理器: + +```python +inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco', device='cpu') +``` + +请参考 [torch.device](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device) 了解 `device` 参数支持的所有形式。 + +### 推理 + +当推理器初始化后,你可以直接传入要推理的原始数据,从返回值中获取推理结果。 + +#### 输入 + +输入可以是以下任意一种格式: + +- str: 图像的路径/URL。 + + ```python + inferencer('demo/demo.jpg') + ``` + +- array: 图像的 numpy 数组。它应该是 BGR 格式。 + + ```python + import mmcv + array = mmcv.imread('demo/demo.jpg') + inferencer(array) + ``` + +- list: 基本类型的列表。列表中的每个元素都将单独处理。 + + ```python + inferencer(['img_1.jpg', 'img_2.jpg]) + # 列表内混合类型也是允许的 + inferencer(['img_1.jpg', array]) + ``` + +- str: 目录的路径。目录中的所有图像都将被处理。 + + ```python + inferencer('path/to/your_imgs/') + ``` + +#### 输出 + +默认情况下,每个推理器都以字典格式返回预测结果。 + +- `visualization` 包含可视化的预测结果。但默认情况下,它是一个空列表,除非 `return_vis=True`。 + +- `predictions` 包含以 json-可序列化格式返回的预测结果。 + +```python +{ + 'predictions' : [ + # 每个实例都对应于一个输入图像 + { + 'labels': [...], # 整数列表,长度为 (N, ) + 'scores': [...], # 浮点列表,长度为 (N, ) + 'bboxes': [...], # 2d 列表,形状为 (N, 4),格式为 [min_x, min_y, max_x, max_y] + }, + ... 
+ ], + 'visualization' : [ + array(..., dtype=uint8), + ] + } +``` + +如果你想要从模型中获取原始输出,可以将 `return_datasamples` 设置为 `True` 来获取原始的 [DataSample](advanced_guides/structures.md),它将存储在 `predictions` 中。 + +#### 储存结果 + +除了从返回值中获取预测结果,你还可以通过设置 `out_dir` 和 `no_save_pred`/`no_save_vis` 参数将预测结果和可视化结果导出到文件中。 + +```python +inferencer('demo/demo.jpg', out_dir='outputs/', no_save_pred=False) +``` + +结果目录结构如下: + +```text +outputs +├── preds +│ └── demo.json +└── vis + └── demo.jpg +``` + +#### 批量推理 + +你可以通过设置 `batch_size` 来自定义批量推理的批大小。默认批大小为 1。 + +### API + +这里列出了推理器详尽的参数列表。 + +- **DetInferencer.\_\_init\_\_():** + +| 参数 | 类型 | 默认值 | 描述 | +| --------------- | ---------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | str , 可选 | None | 配置文件的路径或 metafile 中定义的模型名称。例如,可以是 'rtmdet-s' 或 'rtmdet_s_8xb32-300e_coco' 或 'configs/rtmdet/rtmdet_s_8xb32-300e_coco.py'。如果未指定模型,用户必须提供 MMEngine 保存的包含配置字符串的 "weights"。 | +| `weights` | str, 可选 | None | 模型权重文件的路径。如果未指定且 `model` 是 metafile 中的模型名称,权重将从 metafile 中加载。 | +| `device` | str, 可选 | None | 推理使用的设备,接受 `torch.device` 允许的所有字符串。例如,'cuda:0' 或 'cpu'。如果为 None,将自动使用可用设备。 默认为 None。 | +| `scope` | str, 可选 | 'mmdet' | 模型的”域名“。 | +| `palette` | str | 'none' | 用于可视化的配色。优先顺序为 palette -> config -> checkpoint。 | +| `show_progress` | bool | True | 控制是否在推理过程中显示进度条。 | + +- **DetInferencer.\_\_call\_\_()** + +| 参数 | 类型 | 默认值 | 描述 | +| -------------------- | ----------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `inputs` | str/list/tuple/np.array | **必需** | 它可以是一个图片/文件夹的路径,一个 numpy 数组,或者是一个包含图片路径或 numpy 数组的列表/元组 | +| `batch_size` | int | 1 | 推理的批大小。 | +| `return_vis` | bool | False | 是否返回可视化结果。 | +| `show` | bool | False | 是否在弹出窗口中显示可视化结果。 | +| `wait_time` | float | 0 | 弹窗展示可视化结果的时间间隔。 | +| `no_save_vis` | bool | False | 是否将可视化结果保存到 `out_dir`。默认为保存。 | +| `draw_pred` | bool | True | 是否绘制预测的边界框。 | +| `pred_score_thr` | float | 0.3 | 显示预测框的最低置信度。 | +| `return_datasamples` | bool | False | 是否将结果作为 `DetDataSample` 返回。 如果为 False,则结果将被打包到一个 dict 中。 | +| `print_result` | bool | False | 是否将推理结果打印到控制台。 | +| `no_save_pred` | bool | True | 是否将推理结果保存到 `out_dir`。默认为不保存。 | +| `out_dir` | str | '' | 结果的输出目录。 | +| `texts` | str/list\[str\],可选 | None | 文本提示词。 | +| `stuff_texts` | str/list\[str\],可选 | None | 物体文本提示词。 | +| `custom_entities` | bool | False | 是否使用自定义实体。只用于 GLIP 算法。 | +| \*\*kwargs | | | 传递给 :meth:`preprocess`、:meth:`forward`、:meth:`visualize` 和 :meth:`postprocess` 的其他关键字参数。kwargs 中的每个关键字都应在相应的 `preprocess_kwargs`、`forward_kwargs`、`visualize_kwargs` 和 `postprocess_kwargs` 中。 | + +## 演示脚本样例 + +我们还提供了四个演示脚本,它们是使用高层编程接口实现的。[源码在此](https://github.com/open-mmlab/mmdetection/blob/main/demo) 。 + +### 图片样例 + +这是在单张图片上进行推理的脚本。 + +```shell +python demo/image_demo.py \ + ${IMAGE_FILE} \ + ${CONFIG_FILE} \ + [--weights ${WEIGHTS}] \ + [--device ${GPU_ID}] \ + [--pred-score-thr ${SCORE_THR}] +``` + +运行样例: + +```shell +python demo/image_demo.py demo/demo.jpg \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + --weights checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --device cpu +``` + +### 摄像头样例 + +这是使用摄像头实时图片的推理脚本。 + +```shell +python demo/webcam_demo.py \ + ${CONFIG_FILE} \ + 
${CHECKPOINT_FILE} \ + [--device ${GPU_ID}] \ + [--camera-id ${CAMERA-ID}] \ + [--score-thr ${SCORE_THR}] +``` + +运行样例: + +```shell +python demo/webcam_demo.py \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth +``` + +### 视频样例 + +这是在视频样例上进行推理的脚本。 + +```shell +python demo/video_demo.py \ + ${VIDEO_FILE} \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--device ${GPU_ID}] \ + [--score-thr ${SCORE_THR}] \ + [--out ${OUT_FILE}] \ + [--show] \ + [--wait-time ${WAIT_TIME}] +``` + +运行样例: + +```shell +python demo/video_demo.py demo/demo.mp4 \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --out result.mp4 +``` + +#### 视频样例,显卡加速版本 + +这是在视频样例上进行推理的脚本,使用显卡加速。 + +```shell +python demo/video_gpuaccel_demo.py \ + ${VIDEO_FILE} \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--device ${GPU_ID}] \ + [--score-thr ${SCORE_THR}] \ + [--nvdecode] \ + [--out ${OUT_FILE}] \ + [--show] \ + [--wait-time ${WAIT_TIME}] + +``` + +运行样例: + +```shell +python demo/video_gpuaccel_demo.py demo/demo.mp4 \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --nvdecode --out result.mp4 +``` + +### 大图推理样例 + +这是在大图上进行切片推理的脚本。 + +```shell +python demo/large_image_demo.py \ + ${IMG_PATH} \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + --device ${GPU_ID} \ + --show \ + --tta \ + --score-thr ${SCORE_THR} \ + --patch-size ${PATCH_SIZE} \ + --patch-overlap-ratio ${PATCH_OVERLAP_RATIO} \ + --merge-iou-thr ${MERGE_IOU_THR} \ + --merge-nms-type ${MERGE_NMS_TYPE} \ + --batch-size ${BATCH_SIZE} \ + --debug \ + --save-patch +``` + +运行样例: + +```shell +# inferecnce without tta +wget -P checkpoint https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r101_fpn_2x_coco/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth + +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/faster_rcnn/faster-rcnn_r101_fpn_2x_coco.py \ + checkpoint/faster_rcnn_r101_fpn_2x_coco_bbox_mAP-0.398_20200504_210455-1d2dac9c.pth + +# inference with tta +wget -P checkpoint https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r50_fpn_1x_coco/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth + +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/retinanet/retinanet_r50_fpn_1x_coco.py \ + checkpoint/retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth --tta +``` + +## 多模态算法的推理和验证 + +随着多模态视觉算法的不断发展,MMDetection 也完成了对这类算法的支持。这一小节我们通过 GLIP 算法和模型来演示如何使用对应多模态算法的 demo 和 eval 脚本。同时 MMDetection 也在 projects 下完成了 [gradio_demo 项目](../../../projects/gradio_demo/),用户可以参照[文档](../../../projects/gradio_demo/README.md)在本地快速体验 MMDetection 中支持的各类图片输入的任务。 + +### 模型准备 + +首先需要安装多模态依赖: + +```shell +# if source +pip install -r requirements/multimodal.txt + +# if wheel +mim install mmdet[multimodal] +``` + +MMDetection 已经集成了 glip 算法和模型,可以直接使用链接下载使用: + +```shell +cd mmdetection +wget https://download.openmmlab.com/mmdetection/v3.0/glip/glip_tiny_a_mmdet-b3654169.pth +``` + +### 推理演示 + +下载完成后我们就可以利用 `demo` 下的多模态推理脚本完成推理: + +```shell +python demo/image_demo.py demo/demo.jpg glip_tiny_a_mmdet-b3654169.pth --texts bench +``` + +demo 效果如下图所示: + +
    + +
    + +如果想进行多种类型的识别,需要使用 `xx. xx` 的格式在 `--texts` 字段后声明目标类型: + +```shell +python demo/image_demo.py demo/demo.jpg glip_tiny_a_mmdet-b3654169.pth --texts 'bench. car' +``` + +结果如下图所示: + +
    + +
    + +推理脚本还支持输入一个句子作为 `--texts` 字段的输入: + +```shell +python demo/image_demo.py demo/demo.jpg glip_tiny_a_mmdet-b3654169.pth --texts 'There are a lot of cars here.' +``` + +结果可以参考下图: + +
    + +
    + +### 验证演示 + +MMDetection 支持后的 GLIP 算法对比官方版本没有精度上的损失, benchmark 如下所示: + +| Model | official mAP | mmdet mAP | +| ----------------------- | :----------: | :-------: | +| glip_A_Swin_T_O365.yaml | 42.9 | 43.0 | +| glip_Swin_T_O365.yaml | 44.9 | 44.9 | +| glip_Swin_L.yaml | 51.4 | 51.3 | + +用户可以使用 `test.py` 脚本对模型精度进行验证,使用如下所示: + +```shell +# 1 gpu +python tools/test.py configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365.py glip_tiny_a_mmdet-b3654169.pth + +# 8 GPU +./tools/dist_test.sh configs/glip/glip_atss_swin-t_fpn_dyhead_pretrain_obj365.py glip_tiny_a_mmdet-b3654169.pth 8 +``` diff --git a/mmdetection/docs/zh_cn/user_guides/init_cfg.md b/mmdetection/docs/zh_cn/user_guides/init_cfg.md new file mode 100644 index 0000000..b58b19d --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/init_cfg.md @@ -0,0 +1,161 @@ +# 权重初始化 + +在训练过程中,适当的初始化策略有利于加快训练速度或获得更⾼的性能。 [MMCV](https://github.com/open-mmlab/mmcv/blob/master/mmcv/cnn/utils/weight_init.py) 提供了一些常⽤的初始化模块的⽅法,如 `nn.Conv2d`。 MMdetection 中的模型初始化主要使⽤ `init_cfg`。⽤⼾可以通过以下两个步骤来初始化模型: + +1. 在 `model_cfg` 中为模型或其组件定义 `init_cfg`,但⼦组件的 `init_cfg` 优先级更⾼,会覆盖⽗模块的 `init_cfg` 。 +2. 像往常一样构建模型,然后显式调⽤ `model.init_weights()` ⽅法,此时模型参数将会被按照配置文件写法进行初始化。 + +MMdetection 初始化工作流的高层 API 调用流程是: + +model_cfg(init_cfg) -> build_from_cfg -> model -> init_weight() -> initialize(self, self.init_cfg) -> children's init_weight() + +### 描述 + +它的数据类型是 dict 或者 list\[dict\],包含了下列键值: + +- `type` (str),包含 `INTIALIZERS` 中的初始化器名称,后面跟着初始化器的参数。 +- `layer`(str 或 list\[str\]),包含 Pytorch 或 MMCV 中基本层的名称,以及将被初始化的可学习参数,例如 `'Conv2d'`,`'DeformConv2d'`。 +- `override` (dict 或 list\[dict\]),包含不继承⾃ `BaseModule` 且其初始化配置与 `layer` 键中的其他层不同的⼦模块。 `type` 中定义的初始化器将适⽤于 `layer` 中定义的所有层,因此如果⼦模块不是 `BaseModule` 的派⽣类但可以与 `layer` 中的层相同的⽅式初始化,则不需要使⽤ `override`。`override` 包含了: + - `type` 后跟初始化器的参数; + - `name` 用以指⽰将被初始化的⼦模块。 + +### 初始化参数 + +从 `mmcv.runner.BaseModule` 或 `mmdet.models` 继承一个新模型。这里我们用 FooModel 来举个例子。 + +```python +import torch.nn as nn +from mmcv.runner import BaseModule + +class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=None): + super(FooModel, self).__init__(init_cfg) + ... +``` + +- 直接在代码中使⽤ `init_cfg` 初始化模型 + + ```python + import torch.nn as nn + from mmcv.runner import BaseModule + # or directly inherit mmdet models + + class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=XXX): + super(FooModel, self).__init__(init_cfg) + ... + ``` + +- 在 `mmcv.Sequential` 或 `mmcv.ModuleList` 代码中直接使⽤ `init_cfg` 初始化模型 + + ```python + from mmcv.runner import BaseModule, ModuleList + + class FooModel(BaseModule) + def __init__(self, + arg1, + arg2, + init_cfg=None): + super(FooModel, self).__init__(init_cfg) + ... + self.conv1 = ModuleList(init_cfg=XXX) + ``` + +- 使⽤配置⽂件中的 `init_cfg` 初始化模型 + + ```python + model = dict( + ... + model = dict( + type='FooModel', + arg1=XXX, + arg2=XXX, + init_cfg=XXX), + ... + ``` + +### init_cfg 的使用 + +1. 
用 `layer` 键初始化模型 + + 如果我们只定义了 `layer`, 它只会在 `layer` 键中初始化网络层。 + + 注意: `layer` 键对应的值是 Pytorch 的带有 weights 和 bias 属性的类名(因此不⽀持 `MultiheadAttention` 层)。 + +- 定义⽤于初始化具有相同配置的模块的 `layer` 键。 + + ```python + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # ⽤相同的配置初始化整个模块 + ``` + +- 定义⽤于初始化具有不同配置的层的 `layer` 键。 + + ```python + init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] + # nn.Conv1d 将被初始化为 dict(type='Constant', val=1) + # nn.Conv2d 将被初始化为 dict(type='Constant', val=2) + # nn.Linear 将被初始化为 dict(type='Constant', val=3) + ``` + +2. 使⽤ `override` 键初始化模型 + +- 当使⽤属性名初始化某些特定部分时,我们可以使⽤ `override` 键, `override` 中的值将忽略 init_cfg 中的值。 + + ```python + # layers: + # self.feat = nn.Conv1d(3, 1, 3) + # self.reg = nn.Conv2d(3, 3, 3) + # self.cls = nn.Linear(1,2) + + init_cfg = dict(type='Constant', + layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # self.feat and self.cls 将被初始化为 dict(type='Constant', val=1, bias=2) + # 叫 'reg' 的模块将被初始化为 dict(type='Constant', val=3, bias=4) + ``` + +- 如果 init_cfg 中的 `layer` 为 None,则只会初始化 override 中有 name 的⼦模块,⽽ override 中的 type 和其他参数可以省略。 + + ```python + # layers: + # self.feat = nn.Conv1d(3, 1, 3) + # self.reg = nn.Conv2d(3, 3, 3) + # self.cls = nn.Linear(1,2) + + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + + # self.feat and self.cls 将被 Pytorch 初始化 + # 叫 'reg' 的模块将被 dict(type='Constant', val=1, bias=2) 初始化 + ``` + +- 如果我们不定义 `layer` 或 `override` 键,它不会初始化任何东西。 + +- 无效的使用 + + ```python + # override 没有 name 键的话是无效的 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # override 有 name 键和其他参数但是没有 type 键也是无效的 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. 
使⽤预训练模型初始化模型 + + ```python + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + ``` + +更多细节可以参考 [MMEngine](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/initialize.html) 的文档 diff --git a/mmdetection/docs/zh_cn/user_guides/label_studio.md b/mmdetection/docs/zh_cn/user_guides/label_studio.md new file mode 100644 index 0000000..202122f --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/label_studio.md @@ -0,0 +1,255 @@ +# 使用 MMDetection 和 Label-Studio 进行半自动化目标检测标注 + +标注数据是一个费时费力的任务,本文介绍了如何使用 MMDetection 中的 RTMDet 算法联合 Label-Studio 软件进行半自动化标注。具体来说,使用 RTMDet 预测图片生成标注,然后使用 Label-Studio 进行微调标注,社区用户可以参考此流程和方法,将其应用到其他领域。 + +- RTMDet:RTMDet 是 OpenMMLab 自研的高精度单阶段的目标检测算法,开源于 MMDetection 目标检测工具箱中,其开源协议为 Apache 2.0,工业界的用户可以不受限的免费使用。 +- [Label Studio](https://github.com/heartexlabs/label-studio) 是一款优秀的标注软件,覆盖图像分类、目标检测、分割等领域数据集标注的功能。 + +本文将使用[喵喵数据集](https://download.openmmlab.com/mmyolo/data/cat_dataset.zip)的图片,进行半自动化标注。 + +## 环境配置 + +首先需要创建一个虚拟环境,然后安装 PyTorch 和 MMCV。在本文中,我们将指定 PyTorch 和 MMCV 的版本。接下来安装 MMDetection、Label-Studio 和 label-studio-ml-backend,具体步骤如下: + +创建虚拟环境: + +```shell +conda create -n rtmdet python=3.9 -y +conda activate rtmdet +``` + +安装 PyTorch + +```shell +# Linux and Windows CPU only +pip install torch==1.10.1+cpu torchvision==0.11.2+cpu torchaudio==0.10.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html +# Linux and Windows CUDA 11.3 +pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu113/torch_stable.html +# OSX +pip install torch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 +``` + +安装 MMCV + +```shell +pip install -U openmim +mim install "mmcv>=2.0.0" +# 安装 mmcv 的过程中会自动安装 mmengine +``` + +安装 MMDetection + +```shell +git clone https://github.com/open-mmlab/mmdetection +cd mmdetection +pip install -v -e . 
+``` + +安装 Label-Studio 和 label-studio-ml-backend + +```shell +# 安装 label-studio 需要一段时间,如果找不到版本请使用官方源 +pip install label-studio==1.7.2 +pip install label-studio-ml==1.0.9 +``` + +下载rtmdet权重 + +```shell +cd path/to/mmetection +mkdir work_dirs +cd work_dirs +wget https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_m_8xb32-300e_coco/rtmdet_m_8xb32-300e_coco_20220719_112220-229f527c.pth +``` + +## 启动服务 + +启动 RTMDet 后端推理服务: + +```shell +cd path/to/mmetection + +label-studio-ml start projects/LabelStudio/backend_template --with \ +config_file=configs/rtmdet/rtmdet_m_8xb32-300e_coco.py \ +checkpoint_file=./work_dirs/rtmdet_m_8xb32-300e_coco_20220719_112220-229f527c.pth \ +device=cpu \ +--port 8003 +# device=cpu 为使用 CPU 推理,如果使用 GPU 推理,将 cpu 替换为 cuda:0 +``` + +![](https://cdn.vansin.top/picgo20230330131601.png) + +此时,RTMDet 后端推理服务已经启动,后续在 Label-Studio Web 系统中配置 http://localhost:8003 后端推理服务即可。 + +现在启动 Label-Studio 网页服务: + +```shell +label-studio start +``` + +![](https://cdn.vansin.top/picgo20230330132913.png) + +打开浏览器访问 [http://localhost:8080/](http://localhost:8080/) 即可看到 Label-Studio 的界面。 + +![](https://cdn.vansin.top/picgo20230330133118.png) + +我们注册一个用户,然后创建一个 RTMDet-Semiautomatic-Label 项目。 + +![](https://cdn.vansin.top/picgo20230330133333.png) + +我们通过下面的方式下载好示例的喵喵图片,点击 Data Import 导入需要标注的猫图片。 + +```shell +cd path/to/mmetection +mkdir data && cd data + +wget https://download.openmmlab.com/mmyolo/data/cat_dataset.zip && unzip cat_dataset.zip +``` + +![](https://cdn.vansin.top/picgo20230330133628.png) + +![](https://cdn.vansin.top/picgo20230330133715.png) + +然后选择 Object Detection With Bounding Boxes 模板 + +![](https://cdn.vansin.top/picgo20230330133807.png) + +```shell +airplane +apple +backpack +banana +baseball_bat +baseball_glove +bear +bed +bench +bicycle +bird +boat +book +bottle +bowl +broccoli +bus +cake +car +carrot +cat +cell_phone +chair +clock +couch +cow +cup +dining_table +dog +donut +elephant +fire_hydrant +fork +frisbee +giraffe +hair_drier +handbag +horse +hot_dog +keyboard +kite +knife +laptop +microwave +motorcycle +mouse +orange +oven +parking_meter +person +pizza +potted_plant +refrigerator +remote +sandwich +scissors +sheep +sink +skateboard +skis +snowboard +spoon +sports_ball +stop_sign +suitcase +surfboard +teddy_bear +tennis_racket +tie +toaster +toilet +toothbrush +traffic_light +train +truck +tv +umbrella +vase +wine_glass +zebra +``` + +然后将上述类别复制添加到 Label-Studio,然后点击 Save。 + +![](https://cdn.vansin.top/picgo20230330134027.png) + +然后在设置中点击 Add Model 添加 RTMDet 后端推理服务。 + +![](https://cdn.vansin.top/picgo20230330134320.png) + +点击 Validate and Save,然后点击 Start Labeling。 + +![](https://cdn.vansin.top/picgo20230330134424.png) + +看到如下 Connected 就说明后端推理服务添加成功。 + +![](https://cdn.vansin.top/picgo20230330134554.png) + +## 开始半自动化标注 + +点击 Label 开始标注 + +![](https://cdn.vansin.top/picgo20230330134804.png) + +我们可以看到 RTMDet 后端推理服务已经成功返回了预测结果并显示在图片上,我们可以发现这个喵喵预测的框有点大。 + +![](https://cdn.vansin.top/picgo20230403104419.png) + +我们手工拖动框,修正一下框的位置,得到以下修正过后的标注,然后点击 Submit,本张图片就标注完毕了。 + +![](https://cdn.vansin.top/picgo/20230403105923.png) + +我们 submit 完毕所有图片后,点击 exprot 导出 COCO 格式的数据集,就能把标注好的数据集的压缩包导出来了。 + +![](https://cdn.vansin.top/picgo20230330135921.png) + +用 vscode 打开解压后的文件夹,可以看到标注好的数据集,包含了图片和 json 格式的标注文件。 + +![](https://cdn.vansin.top/picgo20230330140321.png) + +到此半自动化标注就完成了,我们可以用这个数据集在 MMDetection 训练精度更高的模型了,训练出更好的模型,然后再用这个模型继续半自动化标注新采集的图片,这样就可以不断迭代,扩充高质量数据集,提高模型的精度。 + +## 使用 MMYOLO 作为后端推理服务 + +如果想在 MMYOLO 中使用 Label-Studio,可以参考在启动后端推理服务时,将 config_file 和 checkpoint_file 替换为 MMYOLO 
的配置文件和权重文件即可。 + +```shell +cd path/to/mmetection + +label-studio-ml start projects/LabelStudio/backend_template --with \ +config_file= path/to/mmyolo_config.py \ +checkpoint_file= path/to/mmyolo_weights.pth \ +device=cpu \ +--port 8003 +# device=cpu 为使用 CPU 推理,如果使用 GPU 推理,将 cpu 替换为 cuda:0 +``` + +旋转目标检测和实例分割还在支持中,敬请期待。 diff --git a/mmdetection/docs/zh_cn/user_guides/new_model.md b/mmdetection/docs/zh_cn/user_guides/new_model.md new file mode 100644 index 0000000..424c4f9 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/new_model.md @@ -0,0 +1,289 @@ +# 在标准数据集上训练自定义模型(待更新) + +在本文中,你将知道如何在标准数据集上训练、测试和推理自定义模型。我们将在 cityscapes 数据集上以自定义 Cascade Mask R-CNN R50 模型为例演示整个过程,为了方便说明,我们将 neck 模块中的 `FPN` 替换为 `AugFPN`,并且在训练中的自动增强类中增加 `Rotate` 或 `TranslateX`。 + +基本步骤如下所示: + +1. 准备标准数据集 +2. 准备你的自定义模型 +3. 准备配置文件 +4. 在标准数据集上对模型进行训练、测试和推理 + +## 准备标准数据集 + +在本文中,我们使用 cityscapes 标准数据集为例进行说明。 + +推荐将数据集根路径采用符号链接方式链接到 `$MMDETECTION/data`。 + +如果你的文件结构不同,你可能需要在配置文件中进行相应的路径更改。标准的文件组织格式如下所示: + +```none +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ ├── cityscapes +│ │ ├── annotations +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +│ ├── VOCdevkit +│ │ ├── VOC2007 +│ │ ├── VOC2012 +``` + +你也可以通过如下方式设定数据集根路径 + +```bash +export MMDET_DATASETS=$data_root +``` + +我们将会使用环境便变量 `$MMDET_DATASETS` 作为数据集的根目录,因此你无需再修改相应配置文件的路径信息。 + +你需要使用脚本 `tools/dataset_converters/cityscapes.py` 将 cityscapes 标注转化为 coco 标注格式。 + +```shell +pip install cityscapesscripts +python tools/dataset_converters/cityscapes.py ./data/cityscapes --nproc 8 --out-dir ./data/cityscapes/annotations +``` + +目前在 `cityscapes `文件夹中的配置文件所对应模型是采用 COCO 预训练权重进行初始化的。 + +如果你的网络不可用或者比较慢,建议你先手动下载对应的预训练权重,否则可能在训练开始时候出现错误。 + +## 准备你的自定义模型 + +第二步是准备你的自定义模型或者训练相关配置。假设你想在已有的 Cascade Mask R-CNN R50 检测模型基础上,新增一个新的 neck 模块 `AugFPN` 去代替默认的 `FPN`,以下是具体实现: + +### 1 定义新的 neck (例如 AugFPN) + +首先创建新文件 `mmdet/models/necks/augfpn.py`. 
+ +```python +import torch.nn as nn +from mmdet.registry import MODELS + +@MODELS.register_module() +class AugFPN(nn.Module): + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False): + pass + + def forward(self, inputs): + # implementation is ignored + pass +``` + +### 2 导入模块 + +你可以采用两种方式导入模块,第一种是在 `mmdet/models/necks/__init__.py` 中添加如下内容 + +```python +from .augfpn import AugFPN +``` + +第二种是增加如下代码到对应配置中,这种方式的好处是不需要改动代码 + +```python +custom_imports = dict( + imports=['mmdet.models.necks.augfpn'], + allow_failed_imports=False) +``` + +### 3 修改配置 + +```python +neck=dict( + type='AugFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5) +``` + +关于自定义模型其余相关细节例如实现新的骨架网络,头部网络、损失函数,以及运行时训练配置例如定义新的优化器、使用梯度裁剪、定制训练调度策略和钩子等,请参考文档 [自定义模型](tutorials/customize_models.md) 和 [自定义运行时训练配置](tutorials/customize_runtime.md)。 + +## 准备配置文件 + +第三步是准备训练配置所需要的配置文件。假设你打算基于 cityscapes 数据集,在 Cascade Mask R-CNN R50 中新增 `AugFPN` 模块,同时增加 `Rotate` 或者 `Translate` 数据增强策略,假设你的配置文件位于 `configs/cityscapes/` 目录下,并且取名为 `cascade-mask-rcnn_r50_augfpn_autoaug-10e_cityscapes.py`,则配置信息如下: + +```python +# 继承 base 配置,然后进行针对性修改 +_base_ = [ + '../_base_/models/cascade-mask-rcnn_r50_fpn.py', + '../_base_/datasets/cityscapes_instance.py', '../_base_/default_runtime.py' +] + +model = dict( + # 设置 `init_cfg` 为 None,表示不加载 ImageNet 预训练权重, + # 后续可以设置 `load_from` 参数用来加载 COCO 预训练权重 + backbone=dict(init_cfg=None), + # 使用新增的 `AugFPN` 模块代替默认的 `FPN` + neck=dict( + type='AugFPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + # 我们也需要将 num_classes 从 80 修改为 8 来匹配 cityscapes 数据集标注 + # 这个修改包括 `bbox_head` 和 `mask_head`. + roi_head=dict( + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + # 将 COCO 类别修改为 cityscapes 类别 + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + # 将 COCO 类别修改为 cityscapes 类别 + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + # 将 COCO 类别修改为 cityscapes 类别 + num_classes=8, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + # 将 COCO 类别修改为 cityscapes 类别 + num_classes=8, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))) + +# 覆写 `train_pipeline`,然后新增 `AutoAugment` 训练配置 +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='AutoAugment', + policies=[ + [dict( + type='Rotate', + level=5, + 
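+                # 注:level 的取值范围为 0~10,数值越大旋转幅度越大;
+                # img_border_value 为旋转后图像边界空白区域的填充像素值。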
img_border_value=(124, 116, 104), + prob=0.5) + ], + [dict(type='Rotate', level=7, img_border_value=(124, 116, 104)), + dict( + type='TranslateX', + level=5, + prob=0.5, + img_border_value=(124, 116, 104)) + ], + ]), + dict( + type='RandomResize', + scale=[(2048, 800), (2048, 1024)], + keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs'), +] + +# 设置每张显卡的批处理大小,同时设置新的训练 pipeline +data = dict( + samples_per_gpu=1, + workers_per_gpu=3, + train=dict(dataset=dict(pipeline=train_pipeline))) + +# 设置优化器 +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)) + +# 设置定制的学习率策略 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type='MultiStepLR', + begin=0, + end=10, + by_epoch=True, + milestones=[8], + gamma=0.1) +] + +# 训练,验证,测试配置 +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# 我们采用 COCO 预训练过的 Cascade Mask R-CNN R50 模型权重作为初始化权重,可以得到更加稳定的性能 +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco/cascade_mask_rcnn_r50_fpn_1x_coco_20200203-9d4dcb24.pth' +``` + +## 训练新模型 + +为了能够使用新增配置来训练模型,你可以运行如下命令: + +```shell +python tools/train.py configs/cityscapes/cascade-mask-rcnn_r50_augfpn_autoaug-10e_cityscapes.py +``` + +如果想了解更多用法,可以参考 [例子1](1_exist_data_model.md)。 + +## 测试和推理 + +为了能够测试训练好的模型,你可以运行如下命令: + +```shell +python tools/test.py configs/cityscapes/cascade-mask-rcnn_r50_augfpn_autoaug-10e_cityscapes.py work_dirs/cascade-mask-rcnn_r50_augfpn_autoaug-10e_cityscapes/epoch_10.pth +``` + +如果想了解更多用法,可以参考 [例子1](1_exist_data_model.md)。 diff --git a/mmdetection/docs/zh_cn/user_guides/robustness_benchmarking.md b/mmdetection/docs/zh_cn/user_guides/robustness_benchmarking.md new file mode 100644 index 0000000..e95c79a --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/robustness_benchmarking.md @@ -0,0 +1,109 @@ +# 检测器鲁棒性检查 + +## 介绍 + +我们提供了在 [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484) 中定义的「图像损坏基准测试」上测试目标检测和实例分割模型的工具。 +此页面提供了如何使用该基准测试的基本教程。 + +```latex +@article{michaelis2019winter, + title={Benchmarking Robustness in Object Detection: + Autonomous Driving when Winter is Coming}, + author={Michaelis, Claudio and Mitzkus, Benjamin and + Geirhos, Robert and Rusak, Evgenia and + Bringmann, Oliver and Ecker, Alexander S. 
and + Bethge, Matthias and Brendel, Wieland}, + journal={arXiv:1907.07484}, + year={2019} +} +``` + +![image corruption example](../../../resources/corruptions_sev_3.png) + +## 关于基准测试 + +要将结果提交到基准测试,请访问[基准测试主页](https://github.com/bethgelab/robust-detection-benchmark) + +基准测试是仿照 [imagenet-c 基准测试](https://github.com/hendrycks/robustness),由 Dan Hendrycks 和 Thomas Dietterich 在[Benchmarking Neural Network Robustness to Common Corruptions and Perturbations](https://arxiv.org/abs/1903.12261)(ICLR 2019)中发表。 + +图像损坏变换功能包含在此库中,但可以使用以下方法单独安装: + +```shell +pip install imagecorruptions +``` + +与 imagenet-c 相比,我们必须进行一些更改以处理任意大小的图像和灰度图像。 +我们还修改了“运动模糊”和“雪”损坏,以解除对于 linux 特定库的依赖, +否则必须单独安装这些库。有关详细信息,请参阅 [imagecorruptions](https://github.com/bethgelab/imagecorruptions)。 + +## 使用预训练模型进行推理 + +我们提供了一个测试脚本来评估模型在基准测试中提供的各种损坏变换组合下的性能。 + +### 在数据集上测试 + +- [x] 单张 GPU 测试 +- [ ] 多张 GPU 测试 +- [ ] 可视化检测结果 + +您可以使用以下命令在基准测试中使用 15 种损坏变换来测试模型性能。 + +```shell +# single-gpu testing +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] +``` + +也可以选择其它不同类型的损坏变换。 + +```shell +# noise +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions noise + +# blur +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions blur + +# wetaher +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions weather + +# digital +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions digital +``` + +或者使用一组自定义的损坏变换,例如: + +```shell +# gaussian noise, zoom blur and snow +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --corruptions gaussian_noise zoom_blur snow +``` + +最后,我们也可以选择施加在图像上的损坏变换的严重程度。 +严重程度从 1 到 5 逐级增强,0 表示不对图像施加损坏变换,即原始图像数据。 + +```shell +# severity 1 +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 1 + +# severities 0,2,4 +python tools/analysis_tools/test_robustness.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] --severities 0 2 4 +``` + +## 模型测试结果 + +下表是各模型在 COCO 2017val 上的测试结果。 + +| Model | Backbone | Style | Lr schd | box AP clean | box AP corr. | box % | mask AP clean | mask AP corr. 
| mask % | +| :-----------------: | :-----------------: | :-----: | :-----: | :----------: | :----------: | :---: | :-----------: | :-----------: | :----: | +| Faster R-CNN | R-50-FPN | pytorch | 1x | 36.3 | 18.2 | 50.2 | - | - | - | +| Faster R-CNN | R-101-FPN | pytorch | 1x | 38.5 | 20.9 | 54.2 | - | - | - | +| Faster R-CNN | X-101-32x4d-FPN | pytorch | 1x | 40.1 | 22.3 | 55.5 | - | - | - | +| Faster R-CNN | X-101-64x4d-FPN | pytorch | 1x | 41.3 | 23.4 | 56.6 | - | - | - | +| Faster R-CNN | R-50-FPN-DCN | pytorch | 1x | 40.0 | 22.4 | 56.1 | - | - | - | +| Faster R-CNN | X-101-32x4d-FPN-DCN | pytorch | 1x | 43.4 | 26.7 | 61.6 | - | - | - | +| Mask R-CNN | R-50-FPN | pytorch | 1x | 37.3 | 18.7 | 50.1 | 34.2 | 16.8 | 49.1 | +| Mask R-CNN | R-50-FPN-DCN | pytorch | 1x | 41.1 | 23.3 | 56.7 | 37.2 | 20.7 | 55.7 | +| Cascade R-CNN | R-50-FPN | pytorch | 1x | 40.4 | 20.1 | 49.7 | - | - | - | +| Cascade Mask R-CNN | R-50-FPN | pytorch | 1x | 41.2 | 20.7 | 50.2 | 35.7 | 17.6 | 49.3 | +| RetinaNet | R-50-FPN | pytorch | 1x | 35.6 | 17.8 | 50.1 | - | - | - | +| Hybrid Task Cascade | X-101-64x4d-FPN-DCN | pytorch | 1x | 50.6 | 32.7 | 64.7 | 43.8 | 28.1 | 64.0 | + +由于对图像的损坏变换存在随机性,测试结果可能略有不同。 diff --git a/mmdetection/docs/zh_cn/user_guides/semi_det.md b/mmdetection/docs/zh_cn/user_guides/semi_det.md new file mode 100644 index 0000000..a223523 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/semi_det.md @@ -0,0 +1,320 @@ +# 半监督目标检测 + +半监督目标检测同时利用标签数据和无标签数据进行训练,一方面可以减少模型对检测框数量的依赖,另一方面也可以利用大量的未标记数据进一步提高模型。 + +按照以下流程进行半监督目标检测: + +- [半监督目标检测](#半监督目标检测) + - [准备和拆分数据集](#准备和拆分数据集) + - [配置多分支数据流程](#配置多分支数据流程) + - [配置半监督数据加载](#配置半监督数据加载) + - [配置半监督模型](#配置半监督模型) + - [配置MeanTeacherHook](#配置meanteacherhook) + - [配置TeacherStudentValLoop](#配置teacherstudentvalloop) + +## 准备和拆分数据集 + +我们提供了数据集下载脚本,默认下载 coco2017 数据集,并且自动解压。 + +```shell +python tools/misc/download_dataset.py +``` + +解压后的数据集目录如下: + +```plain +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── image_info_unlabeled2017.json +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── test2017 +│ │ ├── train2017 +│ │ ├── unlabeled2017 +│ │ ├── val2017 +``` + +半监督目标检测在 coco 数据集上有两种比较通用的实验设置: + +(1)将 `train2017` 按照固定百分比(1%,2%,5% 和 10%)划分出一部分数据作为标签数据集,剩余的训练集数据作为无标签数据集,同时考虑划分不同的训练集数据作为标签数据集对半监督训练的结果影响较大,所以采用五折交叉验证来评估算法性能。我们提供了数据集划分脚本: + +```shell +python tools/misc/split_coco.py +``` + +该脚本默认会按照 1%,2%,5% 和 10% 的标签数据占比划分 `train2017`,每一种划分会随机重复 5 次,用于交叉验证。生成的半监督标注文件名称格式如下: + +- 标签数据集标注名称格式:`instances_train2017.{fold}@{percent}.json` + +- 无标签数据集名称标注:`instances_train2017.{fold}@{percent}-unlabeled.json` + +其中,`fold` 用于交叉验证,`percent` 表示标签数据的占比。 划分后的数据集目录结构如下: + +```plain +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── image_info_unlabeled2017.json +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── semi_anns +│ │ │ ├── instances_train2017.1@1.json +│ │ │ ├── instances_train2017.1@1-unlabeled.json +│ │ │ ├── instances_train2017.1@2.json +│ │ │ ├── instances_train2017.1@2-unlabeled.json +│ │ │ ├── instances_train2017.1@5.json +│ │ │ ├── instances_train2017.1@5-unlabeled.json +│ │ │ ├── instances_train2017.1@10.json +│ │ │ ├── instances_train2017.1@10-unlabeled.json +│ │ │ ├── instances_train2017.2@1.json +│ │ │ ├── instances_train2017.2@1-unlabeled.json +│ │ ├── test2017 +│ │ ├── train2017 +│ │ ├── unlabeled2017 +│ │ ├── val2017 +``` + +(2)将 `train2017` 作为标签数据集,`unlabeled2017` 作为无标签数据集。由于 `image_info_unlabeled2017.json` 没有 `categories` 信息,无法初始化 `CocoDataset` ,所以需要将 
`instances_train2017.json` 的 `categories` 写入 `image_info_unlabeled2017.json` ,另存为 `instances_unlabeled2017.json`,相关脚本如下: + +```python +from mmengine.fileio import load, dump + +anns_train = load('instances_train2017.json') +anns_unlabeled = load('image_info_unlabeled2017.json') +anns_unlabeled['categories'] = anns_train['categories'] +dump(anns_unlabeled, 'instances_unlabeled2017.json') +``` + +处理后的数据集目录如下: + +```plain +mmdetection +├── data +│ ├── coco +│ │ ├── annotations +│ │ │ ├── image_info_unlabeled2017.json +│ │ │ ├── instances_train2017.json +│ │ │ ├── instances_unlabeled2017.json +│ │ │ ├── instances_val2017.json +│ │ ├── test2017 +│ │ ├── train2017 +│ │ ├── unlabeled2017 +│ │ ├── val2017 +``` + +## 配置多分支数据流程 + +半监督学习有两个主要的方法,分别是 +[一致性正则化](https://research.nvidia.com/sites/default/files/publications/laine2017iclr_paper.pdf) +和[伪标签](https://www.researchgate.net/profile/Dong-Hyun-Lee/publication/280581078_Pseudo-Label_The_Simple_and_Efficient_Semi-Supervised_Learning_Method_for_Deep_Neural_Networks/links/55bc4ada08ae092e9660b776/Pseudo-Label-The-Simple-and-Efficient-Semi-Supervised-Learning-Method-for-Deep-Neural-Networks.pdf) 。 +一致性正则化往往需要一些精心的设计,而伪标签的形式比较简单,更容易拓展到下游任务。我们主要采用了基于伪标签的教师学生联合训练的半监督目标检测框架,对于标签数据和无标签数据需要配置不同的数据流程: +(1)标签数据的数据流程: + +```python +# pipeline used to augment labeled data, +# which will be sent to student model for supervised training. +sup_pipeline = [ + dict(type='LoadImageFromFile',backend_args = backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='RandAugment', aug_space=color_space, aug_num=1), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='MultiBranch', sup=dict(type='PackDetInputs')) +] +``` + +(2)无标签的数据流程: + +```python +# pipeline used to augment unlabeled data weakly, +# which will be sent to teacher model for predicting pseudo instances. +weak_pipeline = [ + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', + 'homography_matrix')), +] + +# pipeline used to augment unlabeled data strongly, +# which will be sent to student model for unsupervised training. 
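+# 注:强增强在弱增强(随机缩放、随机翻转)的基础上,额外加入了颜色/几何 RandAugment 与 RandomErasing;
+# 教师模型基于弱增强视图生成伪标签,用于监督学生模型在强增强视图上的训练。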
+strong_pipeline = [ + dict(type='RandomResize', scale=scale, keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomOrder', + transforms=[ + dict(type='RandAugment', aug_space=color_space, aug_num=1), + dict(type='RandAugment', aug_space=geometric, aug_num=1), + ]), + dict(type='RandomErasing', n_patches=(1, 5), ratio=(0, 0.2)), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', + 'homography_matrix')), +] + +# pipeline used to augment unlabeled data into different views +unsup_pipeline = [ + dict(type='LoadImageFromFile', backend_args = backend_args), + dict(type='LoadEmptyAnnotations'), + dict( + type='MultiBranch', + unsup_teacher=weak_pipeline, + unsup_student=strong_pipeline, + ) +] +``` + +## 配置半监督数据加载 + +(1)构建半监督数据集。使用 `ConcatDataset` 拼接标签数据集和无标签数据集。 + +```python +labeled_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=sup_pipeline) + +unlabeled_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_unlabeled2017.json', + data_prefix=dict(img='unlabeled2017/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=unsup_pipeline) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=num_workers, + persistent_workers=True, + sampler=dict( + type='GroupMultiSourceSampler', + batch_size=batch_size, + source_ratio=[1, 4]), + dataset=dict( + type='ConcatDataset', datasets=[labeled_dataset, unlabeled_dataset])) +``` + +(2)使用多源数据集采样器。 使用 `GroupMultiSourceSampler` 从 `labeled_dataset` 和 `labeled_dataset` 采样数据组成 batch , `source_ratio` 控制 batch 中标签数据和无标签数据的占比。`GroupMultiSourceSampler` 还保证了同一个 batch 中的图片具有相近的长宽比例,如果不需要保证batch内图片的长宽比例,可以使用 `MultiSourceSampler`。`GroupMultiSourceSampler` 采样示意图如下: + +
+(图:`GroupMultiSourceSampler` 采样示意图)
    + +`sup=1000` 表示标签数据集的规模为 1000 ,`sup_h=200` 表示标签数据集中长宽比大于等于1的图片规模为 200,`sup_w=800` 表示标签数据集中长宽比小于1的图片规模为 800 ,`unsup=9000` 表示无标签数据集的规模为 9000 ,`unsup_h=1800` 表示无标签数据集中长宽比大于等于1的图片规模为 1800,`unsup_w=7200` 表示标签数据集中长宽比小于1的图片规模为 7200 ,`GroupMultiSourceSampler` 每次按照标签数据集和无标签数据集的图片的总体长宽比分布随机选择一组,然后按照 `source_ratio` 从两个数据集中采样组成 batch ,因此标签数据集和无标签数据集重复采样次数不同。 + +## 配置半监督模型 + +我们选择 `Faster R-CNN` 作为 `detector` 进行半监督训练,以半监督目标检测算法 `SoftTeacher` 为例,模型的配置可以继承 `_base_/models/faster-rcnn_r50_fpn.py`,将检测器的骨干网络替换成 `caffe` 风格。 +注意,与监督训练的配置文件不同的是,`Faster R-CNN` 作为 `detector`,是作为 `model`的一个属性,而不是 `model` 。此外,还需要将`data_preprocessor`设置为`MultiBranchDataPreprocessor`,用于处理不同数据流程图片的填充和归一化。 +最后,可以通过 `semi_train_cfg` 和 `semi_test_cfg` 配置半监督训练和测试需要的参数。 + +```python +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', '../_base_/default_runtime.py', + '../_base_/datasets/semi_coco_detection.py' +] + +detector = _base_.model +detector.data_preprocessor = dict( + type='DetDataPreprocessor', + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32) +detector.backbone = dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')) + +model = dict( + _delete_=True, + type='SoftTeacher', + detector=detector, + data_preprocessor=dict( + type='MultiBranchDataPreprocessor', + data_preprocessor=detector.data_preprocessor), + semi_train_cfg=dict( + freeze_teacher=True, + sup_weight=1.0, + unsup_weight=4.0, + pseudo_label_initial_score_thr=0.5, + rpn_pseudo_thr=0.9, + cls_pseudo_thr=0.9, + reg_pseudo_thr=0.02, + jitter_times=10, + jitter_scale=0.06, + min_pseudo_bbox_wh=(1e-2, 1e-2)), + semi_test_cfg=dict(predict_on='teacher')) +``` + +此外,我们也支持其他检测模型进行半监督训练,比如,`RetinaNet` 和 `Cascade R-CNN`。由于 `SoftTeacher` 仅支持 `Faster R-CNN`,所以需要将其替换为 `SemiBaseDetector`,示例如下: + +```python +_base_ = [ + '../_base_/models/retinanet_r50_fpn.py', '../_base_/default_runtime.py', + '../_base_/datasets/semi_coco_detection.py' +] + +detector = _base_.model + +model = dict( + _delete_=True, + type='SemiBaseDetector', + detector=detector, + data_preprocessor=dict( + type='MultiBranchDataPreprocessor', + data_preprocessor=detector.data_preprocessor), + semi_train_cfg=dict( + freeze_teacher=True, + sup_weight=1.0, + unsup_weight=1.0, + cls_pseudo_thr=0.9, + min_pseudo_bbox_wh=(1e-2, 1e-2)), + semi_test_cfg=dict(predict_on='teacher')) +``` + +沿用 `SoftTeacher` 的半监督训练配置,将 `batch_size` 改为 2 ,`source_ratio` 改为 `[1, 1]`,`RetinaNet`,`Faster R-CNN`, `Cascade R-CNN` 以及 `SoftTeacher` 在 10% coco 训练集上的监督训练和半监督训练的实验结果如下: + +| Model | Detector | BackBone | Style | sup-0.1-coco mAP | semi-0.1-coco mAP | +| :--------------: | :-----------: | :------: | :---: | :--------------: | :---------------: | +| SemiBaseDetector | RetinaNet | R-50-FPN | caffe | 23.5 | 27.7 | +| SemiBaseDetector | Faster R-CNN | R-50-FPN | caffe | 26.7 | 28.4 | +| SemiBaseDetector | Cascade R-CNN | R-50-FPN | caffe | 28.0 | 29.7 | +| SoftTeacher | Faster R-CNN | R-50-FPN | caffe | 26.7 | 31.1 | + +## 配置MeanTeacherHook + +通常,教师模型采用对学生模型指数滑动平均(EMA)的方式进行更新,进而教师模型随着学生模型的优化而优化,可以通过配置 `custom_hooks` 实现: + +```python +custom_hooks = [dict(type='MeanTeacherHook')] +``` + +## 配置TeacherStudentValLoop + +由于教师学生联合训练框架存在两个模型,我们可以用 `TeacherStudentValLoop` 替换 `ValLoop`,在训练的过程中同时检验两个模型的精度。 + +```python +val_cfg = dict(type='TeacherStudentValLoop') +``` diff 
--git a/mmdetection/docs/zh_cn/user_guides/single_stage_as_rpn.md b/mmdetection/docs/zh_cn/user_guides/single_stage_as_rpn.md new file mode 100644 index 0000000..39db35c --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/single_stage_as_rpn.md @@ -0,0 +1,171 @@ +# 将单阶段检测器作为 RPN + +候选区域网络 (Region Proposal Network, RPN) 作为 [Faster R-CNN](https://arxiv.org/abs/1506.01497) 的一个子模块,将为 Faster R-CNN 的第二阶段产生候选区域。在 MMDetection 里大多数的二阶段检测器使用 [`RPNHead`](../../../mmdet/models/dense_heads/rpn_head.py)作为候选区域网络来产生候选区域。然而,任何的单阶段检测器都可以作为候选区域网络,是因为他们对边界框的预测可以被视为是一种候选区域,并且因此能够在 R-CNN 中得到改进。因此在 MMDetection v3.0 中会支持将单阶段检测器作为 RPN 使用。 + +接下来我们通过一个例子,即如何在 [Faster R-CNN](../../../configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py) 中使用一个无锚框的单阶段的检测器模型 [FCOS](../../../configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py) 作为 RPN ,详细阐述具体的全部流程。 + +主要流程如下: + +1. 在 Faster R-CNN 中使用 `FCOSHead` 作为 `RPNHead` +2. 评估候选区域 +3. 用预先训练的 FCOS 训练定制的 Faster R-CNN + +## 在 Faster R-CNN 中使用 `FCOSHead` 作为` RPNHead` + +为了在 Faster R-CNN 中使用 `FCOSHead` 作为 `RPNHead` ,我们应该创建一个名为 `configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py` 的配置文件,并且在 `configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py` 中将 `rpn_head` 的设置替换为 `bbox_head` 的设置,此外我们仍然使用 FCOS 的瓶颈设置,步幅为`[8,16,32,64,128]`,并且更新 `bbox_roi_extractor` 的 `featmap_stride` 为 ` [8,16,32,64,128]`。为了避免损失变慢,我们在前1000次迭代而不是前500次迭代中应用预热,这意味着 lr 增长得更慢。相关配置如下: + +```python +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + # 从 configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py 复制 + neck=dict( + start_level=1, + add_extra_convs='on_output', # 使用 P5 + relu_before_extra_convs=True), + rpn_head=dict( + _delete_=True, # 忽略未使用的旧设置 + type='FCOSHead', + num_classes=1, # 对于 rpn, num_classes = 1,如果 num_classes > 1,它将在 TwoStageDetector 中自动设置为1 + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + roi_head=dict( # featmap_strides 的更新取决于于颈部的步伐 + bbox_roi_extractor=dict(featmap_strides=[8, 16, 32, 64, 128]))) +# 学习率 +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), # 慢慢增加 lr,否则损失变成 NAN + dict( + type='MultiStepLR', + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] +``` + +然后,我们可以使用下面的命令来训练我们的定制模型。更多训练命令,请参考[这里](train.md)。 + +```python +# 使用8个 GPU 进行训练 +bash +tools/dist_train.sh +configs/faster_rcnn/faster-rcnn_r50_fpn_fcos-rpn_1x_coco.py +--work-dir /work_dirs/faster-rcnn_r50_fpn_fcos-rpn_1x_coco +``` + +## 评估候选区域 + +候选区域的质量对检测器的性能有重要影响,因此,我们也提供了一种评估候选区域的方法。和上面一样创建一个新的名为 `configs/rpn/fcos-rpn_r50_fpn_1x_coco.py` 的配置文件,并且在 `configs/rpn/fcos-rpn_r50_fpn_1x_coco.py` 中将 `rpn_head` 的设置替换为 `bbox_head` 的设置。 + +```python +_base_ = [ + '../_base_/models/rpn_r50_fpn.py', '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +val_evaluator = dict(metric='proposal_fast') +test_evaluator = val_evaluator +model = dict( + # 从 configs/fcos/fcos_r50-caffe_fpn_gn-head_1x_coco.py 复制 + neck=dict( + start_level=1, + add_extra_convs='on_output', # 使用 P5 + relu_before_extra_convs=True), + rpn_head=dict( + _delete_=True, # 忽略未使用的旧设置 + type='FCOSHead', + 
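+        # 注:以下超参数沿用 FCOS 检测头(bbox_head)的原始设置,仅将 num_classes 改为 1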
num_classes=1, # 对于 rpn, num_classes = 1,如果 num_classes >为1,它将在 rpn 中自动设置为1 + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0))) +``` + +假设我们在训练之后有检查点 `./work_dirs/faster-rcnn_r50_fpn_fcos-rpn_1x_coco/epoch_12.pth` ,然后,我们可以使用下面的命令来评估建议的质量。 + +```python +# 使用8个 GPU 进行测试 +bash +tools/dist_test.sh +configs/rpn/fcos-rpn_r50_fpn_1x_coco.py +--work_dirs /faster-rcnn_r50_fpn_fcos-rpn_1x_coco/epoch_12.pth +``` + +## 用预先训练的 FCOS 训练定制的 Faster R-CNN + +预训练不仅加快了训练的收敛速度,而且提高了检测器的性能。因此,我们在这里给出一个例子来说明如何使用预先训练的 FCOS 作为 RPN 来加速训练和提高精度。假设我们想在 Faster R-CNN 中使用 `FCOSHead` 作为 `rpn_head`,并加载预先训练权重来进行训练 [`fcos_r50-caffe_fpn_gn-head_1x_coco`](https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth)。 配置文件 `configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_fcos- rpn_1x_copy .py` 的内容如下所示。注意,`fcos_r50-caffe_fpn_gn-head_1x_coco` 使用 ResNet50 的 caffe 版本,因此需要更新 `data_preprocessor` 中的像素平均值和 std。 + +```python +_base_ = [ + '../_base_/models/faster-rcnn_r50_fpn.py', + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +model = dict( + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(type='BN', requires_grad=False), + style='caffe', + init_cfg=None), # the checkpoint in ``load_from`` contains the weights of backbone + neck=dict( + start_level=1, + add_extra_convs='on_output', # 使用 P5 + relu_before_extra_convs=True), + rpn_head=dict( + _delete_=True, # 忽略未使用的旧设置 + type='FCOSHead', + num_classes=1, # 对于 rpn, num_classes = 1,如果 num_classes > 1,它将在 TwoStageDetector 中自动设置为1 + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + roi_head=dict( # update featmap_strides due to the strides in neck + bbox_roi_extractor=dict(featmap_strides=[8, 16, 32, 64, 128]))) +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth' +``` + +训练命令如下。 + +```python +bash +tools/dist_train.sh +configs/faster_rcnn/faster-rcnn_r50-caffe_fpn_fcos-rpn_1x_coco.py \ +--work-dir /work_dirs/faster-rcnn_r50-caffe_fpn_fcos-rpn_1x_coco +``` diff --git a/mmdetection/docs/zh_cn/user_guides/test.md b/mmdetection/docs/zh_cn/user_guides/test.md new file mode 100644 index 0000000..2ada04d --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/test.md @@ -0,0 +1,285 @@ +# 测试现有模型 + +我们提供了测试脚本,能够测试一个现有模型在所有数据集(COCO,Pascal VOC,Cityscapes 等)上的性能。我们支持在如下环境下测试: + +- 单 GPU 测试 +- CPU 测试 +- 单节点多 GPU 测试 +- 多节点测试 + +根据以上测试环境,选择合适的脚本来执行测试过程。 + +```shell +# 单 GPU 测试 +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--out ${RESULT_FILE}] \ + [--show] + +# CPU 测试:禁用 GPU 并运行单 GPU 测试脚本 +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--out ${RESULT_FILE}] \ + [--show] + +# 单节点多 GPU 测试 +bash tools/dist_test.sh \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ 
+ ${GPU_NUM} \ + [--out ${RESULT_FILE}] +``` + +`tools/dist_test.sh` 也支持多节点测试,不过需要依赖 PyTorch 的 [启动工具](https://pytorch.org/docs/stable/distributed.html#launch-utility) 。 + +可选参数: + +- `RESULT_FILE`: 结果文件名称,需以 .pkl 形式存储。如果没有声明,则不将结果存储到文件。 +- `--show`: 如果开启,检测结果将被绘制在图像上,以一个新窗口的形式展示。它只适用于单 GPU 的测试,是用于调试和可视化的。请确保使用此功能时,你的 GUI 可以在环境中打开。否则,你可能会遇到这么一个错误 `cannot connect to X server`。 +- `--show-dir`: 如果指明,检测结果将会被绘制在图像上并保存到指定目录。它只适用于单 GPU 的测试,是用于调试和可视化的。即使你的环境中没有 GUI,这个选项也可使用。 +- `--cfg-options`: 如果指明,这里的键值对将会被合并到配置文件中。 + +### 样例 + +假设你已经下载了 checkpoint 文件到 `checkpoints/` 文件下了。 + +1. 测试 RTMDet 并可视化其结果。按任意键继续下张图片的测试。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet) 。 + + ```shell + python tools/test.py \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --show + ``` + +2. 测试 RTMDet,并为了之后的可视化保存绘制的图像。配置文件和 checkpoint 文件 [在此](https://github.com/open-mmlab/mmdetection/tree/main/configs/rtmdet) 。 + + ```shell + python tools/test.py \ + configs/rtmdet/rtmdet_l_8xb32-300e_coco.py \ + checkpoints/rtmdet_l_8xb32-300e_coco_20220719_112030-5a0be7c4.pth \ + --show-dir rtmdet_l_8xb32-300e_coco_results + ``` + +3. 在 Pascal VOC 数据集上测试 Faster R-CNN,不保存测试结果,测试 `mAP`。配置文件和 checkpoint 文件 [在此](../../../configs/pascal_voc) 。 + + ```shell + python tools/test.py \ + configs/pascal_voc/faster-rcnn_r50_fpn_1x_voc0712.py \ + checkpoints/faster_rcnn_r50_fpn_1x_voc0712_20200624-c9895d40.pth + ``` + +4. 使用 8 块 GPU 测试 Mask R-CNN,测试 `bbox` 和 `mAP` 。配置文件和 checkpoint 文件 [在此](../../../configs/mask_rcnn) 。 + + ```shell + ./tools/dist_test.sh \ + configs/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ + 8 \ + --out results.pkl + ``` + +5. 使用 8 块 GPU 测试 Mask R-CNN,测试**每类**的 `bbox` 和 `mAP`。配置文件和 checkpoint 文件 [在此](../../../configs/mask_rcnn) 。 + + ```shell + ./tools/dist_test.sh \ + configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ + 8 + ``` + + 该命令生成两个JSON文件 `./work_dirs/coco_instance/test.bbox.json` 和 `./work_dirs/coco_instance/test.segm.json`。 + +6. 在 COCO test-dev 数据集上,使用 8 块 GPU 测试 Mask R-CNN,并生成 JSON 文件提交到官方评测服务器,配置文件和 checkpoint 文件 [在此](../../../configs/mask_rcnnn) 。你可以在 [config](./././configs/_base_/datasets/coco_instance.py) 的注释中用 test_evaluator 和 test_dataloader 替换原来的 test_evaluator 和 test_dataloader,然后运行: + + ```shell + ./tools/dist_test.sh \ + configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py \ + checkpoints/mask_rcnn_r50_fpn_1x_cityscapes_20200227-afe51d5a.pth \ + 8 + ``` + + 这行命令生成两个 JSON 文件 `mask_rcnn_test-dev_results.bbox.json` 和 `mask_rcnn_test-dev_results.segm.json`。 + +7. 
在 Cityscapes 数据集上,使用 8 块 GPU 测试 Mask R-CNN,生成 txt 和 png 文件,并上传到官方评测服务器。配置文件和 checkpoint 文件 [在此](../../../configs/cityscapes) 。 你可以在 [config](./././configs/_base_/datasets/cityscapes_instance.py) 的注释中用 test_evaluator 和 test_dataloader 替换原来的 test_evaluator 和 test_dataloader,然后运行: + + ```shell + ./tools/dist_test.sh \ + configs/cityscapes/mask-rcnn_r50_fpn_1x_cityscapes.py \ + checkpoints/mask_rcnn_r50_fpn_1x_cityscapes_20200227-afe51d5a.pth \ + 8 + ``` + + 生成的 png 和 txt 文件在 `./work_dirs/cityscapes_metric` 文件夹下。 + +### 不使用 Ground Truth 标注进行测试 + +MMDetection 支持在不使用 ground-truth 标注的情况下对模型进行测试,这需要用到 `CocoDataset`。如果你的数据集格式不是 COCO 格式的,请将其转化成 COCO 格式。如果你的数据集格式是 VOC 或者 Cityscapes,你可以使用 [tools/dataset_converters](https://github.com/open-mmlab/mmdetection/tree/main/tools/dataset_converters) 内的脚本直接将其转化成 COCO 格式。如果是其他格式,可以使用 [images2coco 脚本](https://github.com/open-mmlab/mmdetection/tree/master/tools/dataset_converters/images2coco.py) 进行转换。 + +```shell +python tools/dataset_converters/images2coco.py \ + ${IMG_PATH} \ + ${CLASSES} \ + ${OUT} \ + [--exclude-extensions] +``` + +参数: + +- `IMG_PATH`: 图片根路径。 +- `CLASSES`: 类列表文本文件名。文本中每一行存储一个类别。 +- `OUT`: 输出 json 文件名。 默认保存目录和 `IMG_PATH` 在同一级。 +- `exclude-extensions`: 待排除的文件后缀名。 + +在转换完成后,使用如下命令进行测试 + +```shell +# 单 GPU 测试 +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--show] + +# CPU 测试:禁用 GPU 并运行单 GPU 测试脚本 +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--out ${RESULT_FILE}] \ + [--show] + +# 单节点多 GPU 测试 +bash tools/dist_test.sh \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + ${GPU_NUM} \ + [--show] +``` + +假设 [model zoo](https://mmdetection.readthedocs.io/en/latest/modelzoo_statistics.html) 中的 checkpoint 文件被下载到了 `checkpoints/` 文件夹下, +我们可以使用以下命令,用 8 块 GPU 在 COCO test-dev 数据集上测试 Mask R-CNN,并且生成 JSON 文件。 + +```sh +./tools/dist_test.sh \ + configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoints/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ + 8 +``` + +这行命令生成两个 JSON 文件 `./work_dirs/coco_instance/test.bbox.json` 和 `./work_dirs/coco_instance/test.segm.json`。 + +### 批量推理 + +MMDetection 在测试模式下,既支持单张图片的推理,也支持对图像进行批量推理。默认情况下,我们使用单张图片的测试,你可以通过修改测试数据配置文件中的 `samples_per_gpu` 来开启批量测试。 +开启批量推理的配置文件修改方法为: + +```shell +data = dict(train_dataloader=dict(...), val_dataloader=dict(...), test_dataloader=dict(batch_size=2, ...)) +``` + +或者你可以通过将 `--cfg-options` 设置为 `--cfg-options test_dataloader.batch_size=` 来开启它。 + +## 测试时增强 (TTA) + +测试时增强 (TTA) 是一种在测试阶段使用的数据增强策略。它对同一张图片应用不同的增强,例如翻转和缩放,用于模型推理,然后将每个增强后的图像的预测结果合并,以获得更准确的预测结果。为了让用户更容易使用 TTA,MMEngine 提供了 [BaseTTAModel](https://mmengine.readthedocs.io/en/latest/api/generated/mmengine.model.BaseTTAModel.html#mmengine.model.BaseTTAModel) 类,允许用户根据自己的需求通过简单地扩展 BaseTTAModel 类来实现不同的 TTA 策略。 + +在 MMDetection 中,我们提供了 [DetTTAModel](../../../mmdet/models/test_time_augs/det_tta.py) 类,它继承自 BaseTTAModel。 + +### 使用案例 + +使用 TTA 需要两个步骤。首先,你需要在配置文件中添加 `tta_model` 和 `tta_pipeline`: + +```shell +tta_model = dict( + type='DetTTAModel', + tta_cfg=dict(nms=dict( + type='nms', + iou_threshold=0.5), + max_per_img=100)) + +tta_pipeline = [ + dict(type='LoadImageFromFile', + backend_args=None), + dict( + type='TestTimeAug', + transforms=[[ + dict(type='Resize', scale=(1333, 800), keep_ratio=True) + ], [ # It uses 2 flipping transformations (flipping and not flipping). + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) 
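+            # 注:翻转与不翻转两个视图的预测结果,最终由 DetTTAModel 按 tta_cfg 中的 NMS 配置融合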
+ ], [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', + 'img_shape', 'scale_factor', 'flip', + 'flip_direction')) + ]])] +``` + +第二步,运行测试脚本时,设置 `--tta` 参数,如下所示: + +```shell +# 单 GPU 测试 +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--tta] + +# CPU 测试:禁用 GPU 并运行单 GPU 测试脚本 +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + [--out ${RESULT_FILE}] \ + [--tta] + +# 多 GPU 测试 +bash tools/dist_test.sh \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + ${GPU_NUM} \ + [--tta] +``` + +你也可以自己修改 TTA 配置,例如添加缩放增强: + +```shell +tta_model = dict( + type='DetTTAModel', + tta_cfg=dict(nms=dict( + type='nms', + iou_threshold=0.5), + max_per_img=100)) + +img_scales = [(1333, 800), (666, 400), (2000, 1200)] +tta_pipeline = [ + dict(type='LoadImageFromFile', + backend_args=None), + dict( + type='TestTimeAug', + transforms=[[ + dict(type='Resize', scale=s, keep_ratio=True) for s in img_scales + ], [ + dict(type='RandomFlip', prob=1.), + dict(type='RandomFlip', prob=0.) + ], [ + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', + 'img_shape', 'scale_factor', 'flip', + 'flip_direction')) + ]])] +``` + +以上数据增强管道将首先对图像执行 3 个多尺度转换,然后执行 2 个翻转转换(翻转和不翻转),最后使用 PackDetInputs 将图像打包到最终结果中。 +这里有更多的 TTA 使用案例供您参考: + +- [RetinaNet](../../../configs/retinanet/retinanet_tta.py) +- [CenterNet](../../../configs/centernet/centernet_tta.py) +- [YOLOX](../../../configs/rtmdet/rtmdet_tta.py) +- [RTMDet](../../../configs/yolox/yolox_tta.py) + +更多高级用法和 TTA 的数据流,请参考 [MMEngine](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/test_time_augmentation.html#data-flow)。我们将在后续支持实例分割 TTA。 diff --git a/mmdetection/docs/zh_cn/user_guides/test_results_submission.md b/mmdetection/docs/zh_cn/user_guides/test_results_submission.md new file mode 100644 index 0000000..7a07658 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/test_results_submission.md @@ -0,0 +1,174 @@ +# 提交测试结果 + +## 全景分割测试结果提交 + +下面几节介绍如何在 COCO 测试开发集上生成泛视分割模型的预测结果,并将预测提交到 [COCO评估服务器](https://competitions.codalab.org/competitions/19507) + +### 前提条件 + +- 下载 [COCO测试数据集图像](http://images.cocodataset.org/zips/test2017.zip),[测试图像信息](http://images.cocodataset.org/annotations/image_info_test2017.zip),和[全景训练/相关注释](http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip),然后解压缩它们,把 `test2017` 放到 `data/coco/`,把 json 文件和注释文件放到 `data/coco/annotations/` 。 + +```shell +# 假设 data/coco/ 不存在 +mkdir -pv data/coco/ +# 下载 test2017 +wget -P data/coco/ http://images.cocodataset.org/zips/test2017.zip +wget -P data/coco/ http://images.cocodataset.org/annotations/image_info_test2017.zip +wget -P data/coco/ http://images.cocodataset.org/annotations/panoptic_annotations_trainval2017.zip +# 解压缩它们 +unzip data/coco/test2017.zip -d data/coco/ +unzip data/coco/image_info_test2017.zip -d data/coco/ +unzip data/coco/panoptic_annotations_trainval2017.zip -d data/coco/ +# 删除 zip 文件(可选) +rm -rf data/coco/test2017.zip data/coco/image_info_test2017.zip data/coco/panoptic_annotations_trainval2017.zip +``` + +- 运行以下代码更新测试图像信息中的类别信息。由于 `image_info_test-dev2017.json` 的类别信息中缺少属性 `isthing` ,我们需要用 `panoptic_val2017.json` 中的类别信息更新它。 + +```shell +python tools/misc/gen_coco_panoptic_test_info.py data/coco/annotations +``` + +在完成上述准备之后,你的 `data` 目录结构应该是这样: + +```text +data +`-- coco + |-- annotations + | |-- image_info_test-dev2017.json + | |-- image_info_test2017.json + | |-- panoptic_image_info_test-dev2017.json + | |-- panoptic_train2017.json + | |-- 
panoptic_train2017.zip + | |-- panoptic_val2017.json + | `-- panoptic_val2017.zip + `-- test2017 +``` + +### coco 测试开发的推理 + +要在 coco test-dev 上进行推断,我们应该首先更新 `test_dataloder` 和 `test_evaluator` 的设置。有两种方法可以做到这一点:1. 在配置文件中更新它们;2. 在命令行中更新它们。 + +#### 在配置文件中更新它们 + +相关的设置在 `configs/_base_/datasets/ coco_panoptical .py` 的末尾,如下所示。 + +```python +test_dataloader = dict( + batch_size=1, + num_workers=1, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/panoptic_image_info_test-dev2017.json', + data_prefix=dict(img='test2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_evaluator = dict( + type='CocoPanopticMetric', + format_only=True, + ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', + outfile_prefix='./work_dirs/coco_panoptic/test') +``` + +以下任何一种方法都可以用于更新 coco test-dev 集上的推理设置 + +情况1:直接取消注释 `configs/_base_/datasets/ coco_panoptical .py` 中的设置。 + +情况2:将以下设置复制到您现在使用的配置文件中。 + +```python +test_dataloader = dict( + dataset=dict( + ann_file='annotations/panoptic_image_info_test-dev2017.json', + data_prefix=dict(img='test2017/', _delete_=True))) +test_evaluator = dict( + format_only=True, + ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', + outfile_prefix='./work_dirs/coco_panoptic/test') +``` + +然后通过以下命令对 coco test-dev et 进行推断。 + +```shell +python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} +``` + +#### 在命令行中更新它们 + +coco test-dev 上更新相关设置和推理的命令如下所示。 + +```shell +# 用一个 gpu 测试 +CUDA_VISIBLE_DEVICES=0 python tools/test.py \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + --cfg-options \ + test_dataloader.dataset.ann_file=annotations/panoptic_image_info_test-dev2017.json \ + test_dataloader.dataset.data_prefix.img=test2017 \ + test_dataloader.dataset.data_prefix._delete_=True \ + test_evaluator.format_only=True \ + test_evaluator.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json \ + test_evaluator.outfile_prefix=${WORK_DIR}/results +# 用四个 gpu 测试 +CUDA_VISIBLE_DEVICES=0,1,3,4 bash tools/dist_test.sh \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + 8 \ # eights gpus + --cfg-options \ + test_dataloader.dataset.ann_file=annotations/panoptic_image_info_test-dev2017.json \ + test_dataloader.dataset.data_prefix.img=test2017 \ + test_dataloader.dataset.data_prefix._delete_=True \ + test_evaluator.format_only=True \ + test_evaluator.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json \ + test_evaluator.outfile_prefix=${WORK_DIR}/results +# 用 slurm 测试 +GPUS=8 tools/slurm_test.sh \ + ${Partition} \ + ${JOB_NAME} \ + ${CONFIG_FILE} \ + ${CHECKPOINT_FILE} \ + --cfg-options \ + test_dataloader.dataset.ann_file=annotations/panoptic_image_info_test-dev2017.json \ + test_dataloader.dataset.data_prefix.img=test2017 \ + test_dataloader.dataset.data_prefix._delete_=True \ + test_evaluator.format_only=True \ + test_evaluator.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json \ + test_evaluator.outfile_prefix=${WORK_DIR}/results +``` + +例子:假设我们使用预先训练的带有 ResNet-50 骨干网的 MaskFormer 对 `test2017` 执行推断。 + +```shell +# 单 gpu 测试 +CUDA_VISIBLE_DEVICES=0 python tools/test.py \ + configs/maskformer/maskformer_r50_mstrain_16x1_75e_coco.py \ + checkpoints/maskformer_r50_mstrain_16x1_75e_coco_20220221_141956-bc2699cb.pth \ + --cfg-options \ + test_dataloader.dataset.ann_file=annotations/panoptic_image_info_test-dev2017.json \ + test_dataloader.dataset.data_prefix.img=test2017 \ + 
test_dataloader.dataset.data_prefix._delete_=True \ + test_evaluator.format_only=True \ + test_evaluator.ann_file=data/coco/annotations/panoptic_image_info_test-dev2017.json \ + test_evaluator.outfile_prefix=work_dirs/maskformer/results +``` + +### 重命名文件并压缩结果 + +推理之后,全景分割结果(一个 json 文件和一个存储掩码的目录)将在 `WORK_DIR` 中。我们应该按照 [COCO's Website](https://cocodataset.org/#upload)上的命名约定重新命名它们。最后,我们需要将 json 和存储掩码的目录压缩到 zip 文件中,并根据命名约定重命名该 zip 文件。注意, zip 文件应该**直接**包含上述两个文件。 + +重命名文件和压缩结果的命令: + +```shell +# 在 WORK_DIR 中,我们有 panoptic 分割结果: 'panoptic' 和 'results. panoptical .json'。 +cd ${WORK_DIR} +# 将 '[algorithm_name]' 替换为您使用的算法名称 +mv ./panoptic ./panoptic_test-dev2017_[algorithm_name]_results +mv ./results.panoptic.json ./panoptic_test-dev2017_[algorithm_name]_results.json +zip panoptic_test-dev2017_[algorithm_name]_results.zip -ur panoptic_test-dev2017_[algorithm_name]_results panoptic_test-dev2017_[algorithm_name]_results.json +``` diff --git a/mmdetection/docs/zh_cn/user_guides/tracking_analysis_tools.md b/mmdetection/docs/zh_cn/user_guides/tracking_analysis_tools.md new file mode 100644 index 0000000..5330af1 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/tracking_analysis_tools.md @@ -0,0 +1,87 @@ +**我们在 `tools/` 目录下提供了很多有用的工具。** + +## MOT 测试时参数搜索 + +`tools/analysis_tools/mot/mot_param_search.py` 可以搜索 MOT 模型中 `tracker` 的参数。 +它与 `tools/test.py` 的使用方式相同,但配置上**有所不同**。 + +下面是修改配置的示例: + +1. 定义要记录的期望评估指标。 + + 例如,你可以将 `evaluator` 定义为: + + ```python + test_evaluator=dict(type='MOTChallengeMetrics', metric=['HOTA', 'CLEAR', 'Identity']) + ``` + + 当然,你也可以自定义 `test_evaluator` 中 `metric` 的内容。你可以自由选择 `['HOTA', 'CLEAR', 'Identity']` 中的一个或多个指标。 + +2. 定义要搜索的参数及其取值。 + + 假设你有一个 `tracker` 的配置如下: + + ```python + model=dict( + tracker=dict( + type='BaseTracker', + obj_score_thr=0.5, + match_iou_thr=0.5 + ) + ) + ``` + + 如果你想要搜索 `tracker` 的参数,只需将其值改为一个列表,如下所示: + + ```python + model=dict( + tracker=dict( + type='BaseTracker', + obj_score_thr=[0.4, 0.5, 0.6], + match_iou_thr=[0.4, 0.5, 0.6, 0.7] + ) + ) + ``` + + 然后,脚本将测试一共12种情况并且记录结果。 + +## MOT 误差可视化 + +`tools/analysis_tools/mot/mot_error_visualize.py` 可以为多目标跟踪可视化错误。 + +该脚本需要推断的结果作为输入。默认情况下,**红色**边界框表示误检(false positive),**黄色**边界框表示漏检(false negative),**蓝色**边界框表示ID切换(ID switch)。 + +``` +python tools/analysis_tools/mot/mot_error_visualize.py \ + ${CONFIG_FILE}\ + --input ${INPUT} \ + --result-dir ${RESULT_DIR} \ + [--output-dir ${OUTPUT}] \ + [--fps ${FPS}] \ + [--show] \ + [--backend ${BACKEND}] +``` + +`RESULT_DIR` 中包含了所有视频的推断结果,推断结果是一个 `txt` 文件。 + +可选参数: + +- `OUTPUT`:可视化演示的输出。如果未指定,`--show` 是必选的,用于即时显示视频。 +- `FPS`:输出视频的帧率。 +- `--show`:是否即时显示视频。 +- `BACKEND`:用于可视化边界框的后端。选项包括 `cv2` 和 `plt`。 + +## 浏览数据集 + +`tools/analysis_tools/mot/browse_dataset.py` 可以可视化训练数据集,以检查数据集配置是否正确。 + +**示例:** + +```shell +python tools/analysis_tools/browse_dataset.py ${CONFIG_FILE} [--show-interval ${SHOW_INTERVAL}] +``` + +可选参数: + +- `SHOW_INTERVAL`: 显示的间隔时间(秒)。 +- `--show`: 是否即时显示图像。 diff --git a/mmdetection/docs/zh_cn/user_guides/tracking_config.md b/mmdetection/docs/zh_cn/user_guides/tracking_config.md new file mode 100644 index 0000000..4a20da7 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/tracking_config.md @@ -0,0 +1,109 @@ +# 学习更多与配置相关的事 + +我们用 python 文档作为我们的配置系统。你可以在 `MMDetection/configs` 底下找到所有已提供的配置文件。 + +我们把模块化和继承化设计融入我们的配置系统,这使我们很方便去进行各种实验。如果你想查看相关的配置文件,你可以跑 `python tools/misc/print_config.py /PATH/TO/CONFIG` 去看完整的详细配置。 + +## 完整配置的简要说明 + +一个完整的配置通常包含以下主要的字段: + +`model`:一个模型的基本配置,包含 `data_preprocessor`、`detector`、`motion` 之类的模块,还有 `train_cfg`、`test_cfg` 等等; + 
+`train_dataloader`:训练数据集的配置,通常包含 `batch_size`、 `num_workers`、 `sampler`、 `dataset` 等等; + +`val_dataloader`:验证数据集的配置,与训练数据集的配置类似; + +`test_dataloader`:测试数据集的配置,与训练数据集的配置类似; + +`val_evaluator`:验证评估器的配置,例如 `type='MOTChallengeMetrics'` 是 MOT 任务里面的测量标准; + +`test_evaluator`:测试评估器的配置,与验证评估器的配置类似; + +`train_cfg`:训练循环的配置,例如 `type='EpochBasedTrainLoop'` ; + +`val_cfg`:验证循环的配置,例如 `type='VideoValLoop'` ; + +`test_cfg`:测试循环的配置,例如 `type='VideoTestLoop'` ; + +`default_hooks`:默认鱼钩的配置,包含计时器、日志、参数调度程序、检查点、样本种子、可视化; + +`vis_backends`:可视化后端的配置,默认使用 `type='LocalVisBackend'` ; + +`visualizer`:可视化工具的配置,例如MOT任务使用 `type='TrackLocalVisualizer'` ; + +`param_scheduler`:参数调度程序的配置,通常里面设置学习率调度程序; + +`optim_wrapper`:优化器封装的配置,包含优化相关的信息,例如优化器、梯度剪裁等; + +`load_from`:加载预训练模型的路径; + +`resume`:布尔值,如果是 `True` ,会从 `load_from` 加载模型的检查点,训练会恢复至检查点的迭代次数。 + +## 通过脚本参数修改配置 + +当使用 `tools/train.py` 或 `tools/test_trackin.py` 执行任务时,可以指定 `--cfg-options` 来就地修改配置。我们举几个例子如下。有关更多详细信息,请参阅[MMEngine](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html)。 + +### 更新 dict 链的配置键 + +可以按照原始配置中 `dict` 键的顺序指定配置选项,例如,设置 `--cfg-options model.detector.backbone.norm_eval=False` 会将模型主干中的所有 `BN` 模块更改为训练模式。 + +### 更新配置列表中的关键字 + +一些配置的 `dict` 关键字会以列表的形式组成,例如,测试管道中的 `test_dataloader.dataset.pipeline` 以列表形式出现,即 `[dict(type='LoadImageFromFile'), ...]`。如果你想在测试管道中将 `LoadImageFromFile` 更改为 `LoadImageFromWebcam`,可以设置 `--cfg-options test_dataloader.dataset.pipeline.0.type=LoadImageFromWebcam`。 + +### 更新列表/元组的值 + +要被更新的可能是一个列表或一个元组,例如,你可以通过指定 `--cfg options model.data_processor.mean=[0,0,0]` 来更改 `data_preprocessor` 的平均值的关键字。请注意,指定值内不允许有空格。 + +## 配置文件结构 + +`config/_base_` 下有三种基本组件类型,即数据集、模型和默认运行时间。可以用它们来轻松构建许多方法,例如 `SORT`,`DeepSORT`。由 `_base_` 中的组件组成的配置称为基元。 + +对于同一文件夹下的配置文件,建议只有一个基元配置文件。其他配置文件都应该从基元配置文件继承基本结构,这样,继承级别的最大值为 3。 + +为了便于理解,我们建议贡献者继承现有的方法。例如,如果在 `Faster R-CNN` 的基础上进行了一些修改,用户可以首先通过指定 `_base_ = ../_base_/models/faster-rcnn_r50-dc5.py` 来继承基本的 `Faster R-CNN` 结构,然后修改配置文件中的必要字段。 + +如果你正在构建一个与任何现有方法都不共享结构的全新方法,则可以在 `configs` 下创建一个新文件夹 method_name。 + +有关详细文档,请参阅[MMEngine](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html)。 + +## 配置命名风格 + +我们根据以下风格去命名配置文件,建议贡献者遵从相同风格。 + +`{method}_{module}_{train_cfg}_{train_data}_{test_data}` + +`{method}`: 方法名称,例如 `sort`; + +`{module}`: 方法的基本模块,例如 `faster-rcnn_r50_fpn`; + +`{train_cfg}`: 训练配置通常包含批量大小、迭代次数等,例如 `8xb4-80e`; + +`{train_data}`: 训练数据集,例如 `mot17halftrain`; + +`{test_data}`: 测试数据集,例如 `test-mot17halfval`。 + +## 常问问题 + +### 忽略基本配置中的某些字段 + +有时候你可以设置 `_delete_=True` 去忽略基本配置中的一些字段,你可以参考[MMEngine](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html)进行简单说明。 + +### 跟踪数据结构介绍 + +#### 优点和新功能 + +在 `mmdetection` 跟踪任务中,我们使用视频来组织数据集,并使用 `TrackDataSample` 来描述数据集信息。 + +基于视频组织,我们提供了 `transform UniformRefFrameSample` 来对关键帧和参考帧进行采样,并使用 `TransformBroadcaster` 进行剪辑训练。 + +在某种程度上,`TrackDataSample` 可以被视为多个 `DetDataSample` 的包装器。它包含一个 `video_data_samples`,这是一个以 `DetDataSample` 组成的列表,里面每个 `DetDataSample` 对应一个帧。此外,它的元信息包括关键帧的索引和参考帧的索引,用与剪辑训练。 + +得益于基于视频的数据组织,整个视频可以直接被测试。这种方式更简洁直观。如果你的 GPU 内存无法容纳整个视频,我们还提供基于图像的测试方法。 + +## 要做的事 + +`StrongSORT`、`Mask2Former` 等算法不支持基于视频的测试,这些算法对 GPU 内存提出了挑战,我们将来会优化这个问题。 + +现在,我们不支持像 `MOT Challenge dataset` 这样的基于视频的数据集和像 `Crowdhuman` 用于 `QDTrack` 算法这样的基于图像的数据集进行联合训练。我们将来会优化这个问题。 diff --git a/mmdetection/docs/zh_cn/user_guides/tracking_dataset_prepare.md b/mmdetection/docs/zh_cn/user_guides/tracking_dataset_prepare.md new file mode 100644 index 0000000..c99f188 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/tracking_dataset_prepare.md @@ -0,0 
+1,245 @@ +## 数据集准备 + +本页面提供了现有基准数据集的准备说明,包括: + +- 多目标跟踪 + + - [MOT Challenge](https://motchallenge.net/) + - [CrowdHuman](https://www.crowdhuman.org/) + +- 视频实例分割 + + - [YouTube-VIS](https://youtube-vos.org/dataset/vis/) + +### 1. 下载数据集 + +请从官方网站下载数据集,并将数据集的根目录建立软链接到 `$MMDETECTION/data` 目录下。 + +#### 1.1 多目标跟踪 + +- 对于多目标跟踪任务的训练和测试,需要下载MOT Challenge数据集之一(例如MOT17、MOT20),CrowdHuman数据集可以作为补充数据集。 + +- 对于中国的用户,可以从 [OpenDataLab](https://opendatalab.com/) 上高速下载如下数据集: + + - [MOT17](https://opendatalab.com/MOT17/download) + - [MOT20](https://opendatalab.com/MOT20/download) + - [CrowdHuman](https://opendatalab.com/CrowdHuman/download) + +#### 1.2 视频实例分割 + +- 对于视频实例分割任务的训练和测试,只需要选择一个YouTube-VIS数据集(例如YouTube-VIS 2019、YouTube-VIS 2021)即可。 +- 可以从 [YouTubeVOS](https://codalab.lisn.upsaclay.fr/competitions/6064) 上下载YouTube-VIS 2019数据集。 +- 可以从 [YouTubeVOS](https://codalab.lisn.upsaclay.fr/competitions/7680) 上下载YouTube-VIS 2021数据集。 + +#### 1.3 数据结构 + +如果您的文件夹结构与以下结构不同,则可能需要在配置文件中更改相应的路径。 + +``` +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── annotations +│ │ +| ├── MOT15/MOT16/MOT17/MOT20 +| | ├── train +| | | ├── MOT17-02-DPM +| | | | ├── det +| │ │ │ ├── gt +| │ │ │ ├── img1 +| │ │ │ ├── seqinfo.ini +│ │ │ ├── ...... +| | ├── test +| | | ├── MOT17-01-DPM +| | | | ├── det +| │ │ │ ├── img1 +| │ │ │ ├── seqinfo.ini +│ │ │ ├── ...... +│ │ +│ ├── crowdhuman +│ │ ├── annotation_train.odgt +│ │ ├── annotation_val.odgt +│ │ ├── train +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_train01.zip +│ │ │ ├── CrowdHuman_train02.zip +│ │ │ ├── CrowdHuman_train03.zip +│ │ ├── val +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_val.zip +│ │ +``` + +### 2. 转换注释 + +在这种情况下,您需要将官方注释(Annotations)转换为COCO格式。我们提供了相应的脚本,使用方法如下: + +```shell +# MOT17 +# 其他 MOT Challenge 数据集的处理方式与 MOT17 相同。 +python ./tools/dataset_converters/mot2coco.py -i ./data/MOT17/ -o ./data/MOT17/annotations --split-train --convert-det +python ./tools/dataset_converters/mot2reid.py -i ./data/MOT17/ -o ./data/MOT17/reid --val-split 0.2 --vis-threshold 0.3 + +# CrowdHuman +python ./tools/dataset_converters/crowdhuman2coco.py -i ./data/crowdhuman -o ./data/crowdhuman/annotations + +# YouTube-VIS 2019 +python ./tools/dataset_converters/youtubevis/youtubevis2coco.py -i ./data/youtube_vis_2019 -o ./data/youtube_vis_2019/annotations --version 2019 + +# YouTube-VIS 2021 +python ./tools/dataset_converters/youtubevis/youtubevis2coco.py -i ./data/youtube_vis_2021 -o ./data/youtube_vis_2021/annotations --version 2021 + +``` + +运行这些脚本后,文件夹结构将如下所示: + +``` +mmdetection +├── mmtrack +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +│ │ ├── annotations +│ │ +| ├── MOT15/MOT16/MOT17/MOT20 +| | ├── train +| | | ├── MOT17-02-DPM +| | | | ├── det +| │ │ │ ├── gt +| │ │ │ ├── img1 +| │ │ │ ├── seqinfo.ini +│ │ │ ├── ...... +| | ├── test +| | | ├── MOT17-01-DPM +| | | | ├── det +| │ │ │ ├── img1 +| │ │ │ ├── seqinfo.ini +│ │ │ ├── ...... +| | ├── annotations +| | ├── reid +│ │ │ ├── imgs +│ │ │ ├── meta +│ │ +│ ├── crowdhuman +│ │ ├── annotation_train.odgt +│ │ ├── annotation_val.odgt +│ │ ├── train +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_train01.zip +│ │ │ ├── CrowdHuman_train02.zip +│ │ │ ├── CrowdHuman_train03.zip +│ │ ├── val +│ │ │ ├── Images +│ │ │ ├── CrowdHuman_val.zip +│ │ ├── annotations +│ │ │ ├── crowdhuman_train.json +│ │ │ ├── crowdhuman_val.json +│ │ +│ ├── youtube_vis_2019 +│ │ │── train +│ │ │ │── JPEGImages +│ │ │ │── ...... 
+│ │ │── valid +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── test +│ │ │ │── JPEGImages +│ │ │ │── ...... +│ │ │── train.json (the official annotation files) +│ │ │── valid.json (the official annotation files) +│ │ │── test.json (the official annotation files) +│ │ │── annotations (the converted annotation file) +│ │ +│ ├── youtube_vis_2021 +│ │ │── train +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── valid +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── test +│ │ │ │── JPEGImages +│ │ │ │── instances.json (the official annotation files) +│ │ │ │── ...... +│ │ │── annotations (the converted annotation file) +``` + +#### MOT15/MOT16/MOT17/MOT20中的注释和reid文件夹 + +以 MOT17 数据集为例,其他数据集的结构类似。 + +在 `data/MOT17/annotations` 文件夹中有8个JSON文件: + +`train_cocoformat.json`: 包含MOT17数据集训练集的注释信息的JSON文件。 + +`train_detections.pkl`: 包含MOT17数据集训练集的公共检测结果的Pickle文件。 + +`test_cocoformat.json`: 包含MOT17数据集测试集的注释信息的JSON文件。 + +`test_detections.pkl`: 包含MOT17数据集测试集的公共检测结果的Pickle文件。 + +`half-train_cocoformat.json`、`half-train_detections.pkl`、`half-val_cocoformat.json` 和 `half-val_detections.pkl` 与 `train_cocoformat.json` 和 `train_detections.pkl` 具有类似的含义。`half` 表示将训练集中的每个视频分成两半。前一半的视频被标记为 `half-train` 集,后一半的视频被标记为 `half-val` 集。 + +`data/MOT17/reid` 文件夹的结构如下所示: + +``` +reid +├── imgs +│ ├── MOT17-02-FRCNN_000002 +│ │ ├── 000000.jpg +│ │ ├── 000001.jpg +│ │ ├── ... +│ ├── MOT17-02-FRCNN_000003 +│ │ ├── 000000.jpg +│ │ ├── 000001.jpg +│ │ ├── ... +├── meta +│ ├── train_80.txt +│ ├── val_20.txt +``` + +`train_80.txt` 中的 `80` 表示训练数据集在整个ReID数据集中的比例为80%。而验证数据集的比例为20%。 + +关于训练,我们提供了一个注释列表 `train_80.txt`。列表中的每一行包含一个文件名及其对应的真实标签。格式如下所示: + +``` +MOT17-05-FRCNN_000110/000018.jpg 0 +MOT17-13-FRCNN_000146/000014.jpg 1 +MOT17-05-FRCNN_000088/000004.jpg 2 +MOT17-02-FRCNN_000009/000081.jpg 3 +``` + +`MOT17-05-FRCNN_000110` 表示 `MOT17-05-FRCNN` 视频中的第110个人。 + +对于验证集,注释列表 `val_20.txt` 的格式与上述相同。 + +`reid/imgs` 中的图像是通过相应的 `gt.txt` 从 `MOT17/train` 中的原始图像中裁剪而来。真实标签的值应在 `[0, num_classes - 1]` 的范围内。 + +#### CrowdHuman 中的 annotations 文件夹 + +`data/crowdhuman/annotations` 文件夹下有两个JSON文件: + +`crowdhuman_train.json`:包含 CrowdHuman 数据集训练集的注释信息的JSON文件。 +`crowdhuman_val.json`:包含 CrowdHuman 数据集验证集的注释信息的JSON文件。 + +#### youtube_vis_2019/youtube_vis2021 中的 annotations 文件夹 + +There are 3 JSON files in `data/youtube_vis_2019/annotations` or `data/youtube_vis_2021/annotations`: + +`youtube_vis_2019_train.json`/`youtube_vis_2021_train.json`:包含 youtube_vis_2019/youtube_vis2021 数据集训练集的注释信息的JSON文件。 + +`youtube_vis_2019_valid.json`/`youtube_vis_2021_valid.json`:包含 youtube_vis_2019/youtube_vis2021 数据集验证集的注释信息的JSON文件。 + +`youtube_vis_2019_test.json`/`youtube_vis_2021_test.json`:包含 youtube_vis_2019/youtube_vis2021 数据集测试集的注释信息的JSON文件。 diff --git a/mmdetection/docs/zh_cn/user_guides/tracking_interference.md b/mmdetection/docs/zh_cn/user_guides/tracking_interference.md new file mode 100644 index 0000000..1b1fc08 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/tracking_interference.md @@ -0,0 +1,55 @@ +# 推理 + +我们提供了一些演示脚本去推理一个给出的视频,或者是推理包含一系列连续照片的文件夹。想要获取该代码资源,请点击 [这里](https://github.com/open-mmlab/mmdetection/tree/tracking/demo)。 + +若输入为文件夹格式,你需要标明这点。并且,图片命名应该**易于整理**,以便于你根据文件名字中包含的数字信息来重新调整图片的顺序。我们现在只支持 `.jpg`,`.jpeg` 和 `.png` 格式的图片。 + +## MOT models 的推理 + +该脚本能够使用多任务跟踪或者视频实例分割方法来推理一段输入的视频/一张图片。 + +```shell +python demo/mot_demo.py \ + ${INPUTS} + ${CONFIG_FILE} \ + [--checkpoint ${CHECKPOINT_FILE}] \ + [--detector ${DETECTOR_FILE}] \ + [--reid 
${REID_FILE}] \ + [--score-thr ${SCORE_THR}] \ + [--device ${DEVICE}] \ + [--out ${OUTPUT}] \ + [--show] +``` + +`INPUTS` 和 `OUTPUT` 参数支持 _mp4 视频_ 格式和_文件夹_格式。 + +**特别注意**:对于 `DeepSORT`、`SORT`、`StrongSORT`,他们需要单独加载 `reid` 和 `detector` 的权重。因此,我们会使用 `--detector` 和 `--reid` 来加载权重参数。其他的例如 `ByteTrack`、`OCSORT`、`QDTrack`、`MaskTrackRCNN` 以及 `Mask2Former` 这样的算法则使用 `--checkpoint` 来加载权重参数。 + +可选参数: + +- `CHECKPOINT_FILE`: 可选择 checkpoint。 +- `DETECTOR_FILE`: 可选择 detector。 +- `REID_FILE`: 可选择 reid。 +- `SCORE_THR`: bboxes 的得分阈值。 +- `DEVICE`: 推理所需配置。可以选择 `cpu`,`cuda:0`,或者其他。 +- `OUTPUT`: 输出结果可视化的示例。如果未指定, `--show` 将强制显示动态视频。 +- `--show`: 是否即时显示视频。 + +**运行 mot model 的示例:** + +```shell +# 示例 1:不指定 --checkpoint 使用 --detector +python demo/mot_demo.py \ + demo/demo_mot.mp4 \ + configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ + --detector \ + https://download.openmmlab.com/mmtracking/mot/faster_rcnn/faster-rcnn_r50_fpn_4e_mot17-half-64ee2ed4.pth \ + --out mot.mp4 + +# 示例 2:使用 --checkpoint +python demo/mot_demo.py \ + demo/demo_mot.mp4 \ + configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py \ + --checkpoint https://download.openmmlab.com/mmtracking/mot/qdtrack/mot_dataset/qdtrack_faster-rcnn_r50_fpn_4e_mot17_20220315_145635-76f295ef.pth \ + --out mot.mp4 +``` diff --git a/mmdetection/docs/zh_cn/user_guides/tracking_train_test_zh_cn.md b/mmdetection/docs/zh_cn/user_guides/tracking_train_test_zh_cn.md new file mode 100644 index 0000000..0542b9a --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/tracking_train_test_zh_cn.md @@ -0,0 +1,229 @@ +# 学习训练和测试 + +## 训练 + +本节将介绍如何在支持的数据集上训练现有模型。 +支持以下训练环境: + +- CPU +- 单 GPU +- 单节点多 GPU +- 多节点 + +您还可以使用 Slurm 管理作业。 + +重要: + +- 在训练过程中,您可以通过修改 `train_cfg` 来改变评估间隔。 + `train_cfg = dict(val_interval=10)`。这意味着每 10 个 epoch 对模型进行一次评估。 +- 所有配置文件中的默认学习率为 8 个 GPU。 + 根据[线性扩展规则](https://arxiv.org/abs/1706.02677)、 + 如果在每个 GPU 上使用不同的 GPU 或图像,则需要设置与批次大小成比例的学习率、 + 例如,8 个 GPU * 1 个图像/GPU 的学习率为 `lr=0.01`,16 个 GPU * 2 个图像/GPU 的学习率为 lr=0.04。 +- 在训练过程中,日志文件和检查点将保存到工作目录、 + 该目录由 CLI 参数 `--work-dir`指定。它默认使用 `./work_dirs/CONFIG_NAME`。 +- 如果需要混合精度训练,只需指定 CLI 参数 `--amp`。 + +#### 1.在 CPU 上训练 + +该模型默认放在 cuda 设备上。 +仅当没有 cuda 设备时,该模型才会放在 CPU 上。 +因此,如果要在 CPU 上训练模型,则需要先 `export CUDA_VISIBLE_DEVICES=-1` 以禁用 GPU 可见性。 +更多细节参见 [MMEngine](https://github.com/open-mmlab/mmengine/blob/ca282aee9e402104b644494ca491f73d93a9544f/mmengine/runner/runner.py#L849-L850). + +```shell 脚本 +CUDA_VISIBLE_DEVICES=-1 python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +在 CPU 上训练 MOT 模型 QDTrack 的示例: + +```shell 脚本 +CUDA_VISIBLE_DEVICES=-1 python tools/train.py configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +``` + +#### 2. 在单 GPU 上训练 + +如果您想在单 GPU 上训练模型, 您可以按照如下方法直接使用 `tools/train.py`. + +```shell 脚本 +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +您可以使用 `export CUDA_VISIBLE_DEVICES=$GPU_ID` 命令选择GPU. + +在单 GPU 上训练 MOT 模型 QDTrack 的示例: + +```shell 脚本 +CUDA_VISIBLE_DEVICES=2 python tools/train.py configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +``` + +#### 3. 
在单节点多 GPU 上进行训练 + +我们提供了 `tools/dist_train.sh`,用于在多个 GPU 上启动训练。 +基本用法如下。 + +```shell 脚本 +bash ./tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] +``` + +如果您想在一台机器上启动多个作业、 +例如,在拥有 8 个 GPU 的机器上启动 2 个 4-GPU 训练作业、 +需要为每个作业指定不同的端口(默认为 29500),以避免通信冲突。 + +例如,可以在命令中设置端口如下。 + +```shell 脚本 +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4 +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4 +``` + +在单节点多 GPU 上训练 MOT 模型 QDTrack 的示例: + +```shell脚本 +bash ./tools/dist_train.sh configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 +``` + +#### 4. 在多个节点上训练 + +如果使用以太网连接多台机器,只需运行以下命令即可: + +在第一台机器上 + +```shell 脚本 +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +在第二台机器上: + +```shell script +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS +``` + +如果没有 InfiniBand 等高速网络,速度通常会很慢。 + +#### 5. 使用 Slurm 进行训练 + +[Slurm](https://slurm.schedmd.com/)是一个用于计算集群的优秀作业调度系统。 +在 Slurm 管理的集群上,您可以使用 `slurm_train.sh` 生成训练作业。 +它支持单节点和多节点训练。 + +基本用法如下。 + +```shell 脚本 +bash ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} ${GPUS} +``` + +使用 Slurm 训练 MOT 模型 QDTrack 的示例: + +```shell脚本 +PORT=29501 \ +GPUS_PER_NODE=8 \ +SRUN_ARGS="--quotatype=reserved" \ +bash ./tools/slurm_train.sh \ +mypartition \ +mottrack +configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py +./work_dirs/QDTrack \ +8 +``` + +## 测试 + +本节将介绍如何在支持的数据集上测试现有模型。 +支持以下测试环境: + +- CPU +- 单 GPU +- 单节点多 GPU +- 多节点 + +您还可以使用 Slurm 管理作业。 + +重要: + +- 在 MOT 中,某些算法(如 `DeepSORT`、`SORT`、`StrongSORT`)需要分别加载 `reid` 的权重和 `detector` 的权重。 + 其他算法,如`ByteTrack`、`OCSORT`和`QDTrack`则不需要。因此,我们提供了 `--checkpoint`、`--detector` 和 `--reid`来加载权重。 +- 我们提供了两种评估和测试模型的方法,即基于视频的测试和基于图像的测试。 有些算法如 `StrongSORT`, `Mask2former` 只支持基于视频的测试. 如果您的 GPU 内存无法容纳整个视频,您可以通过设置采样器类型来切换测试方式。 + 例如 + 基于视频的测试:`sampler=dict(type='DefaultSampler', shuffle=False, round_up=False)` + 基于图像的测试:`sampler=dict(type='TrackImgSampler')` +- 您可以通过修改 evaluator 中的关键字 `outfile_prefix` 来设置结果保存路径。 + 例如,`val_evaluator = dict(outfile_prefix='results/sort_mot17')`。 + 否则,将创建一个临时文件,并在评估后删除。 +- 如果您只想要格式化的结果而不需要评估,可以设置 `format_only=True`。 + 例如,`test_evaluator = dict(type='MOTChallengeMetric', metric=['HOTA', 'CLEAR', 'Identity'], outfile_prefix='sort_mot17_results', format_only=True)` + +#### 1. 在 CPU 上测试 + +模型默认在 cuda 设备上运行。 +只有在没有 cuda 设备的情况下,模型才会在 CPU 上运行。 +因此,如果要在 CPU 上测试模型,您需要 `export CUDA_VISIBLE_DEVICES=-1` 先禁用 GPU 可见性。 + +更多细节请参考[MMEngine](https://github.com/open-mmlab/mmengine/blob/ca282aee9e402104b644494ca491f73d93a9544f/mmengine/runner/runner.py#L849-L850). + +```shell 脚本 +CUDA_VISIBLE_DEVICES=-1 python tools/test_tracking.py ${CONFIG_FILE} [optional arguments] +``` + +在 CPU 上测试 MOT 模型 SORT 的示例: + +```shell 脚本 +CUDA_VISIBLE_DEVICES=-1 python tools/test_tracking.py configs/sort/sort_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py --detector ${CHECKPOINT_FILE} +``` + +#### 2. 在单 GPU 上测试 + +如果您想在单 GPU 上测试模型,可以直接使用 `tools/test_tracking.py`,如下所示。 + +```shell 脚本 +python tools/test_tracking.py ${CONFIG_FILE} [optional arguments] +``` + +您可以使用 `export CUDA_VISIBLE_DEVICES=$GPU_ID` 来选择 GPU。 + +在单 GPU 上测试 MOT 模型 QDTrack 的示例: + +```shell 脚本 +CUDA_VISIBLE_DEVICES=2 python tools/test_tracking.py configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py --detector ${CHECKPOINT_FILE} +``` + +#### 3. 
在单节点多 GPU 上进行测试 + +我们提供了 `tools/dist_test_tracking.sh`,用于在多个 GPU 上启动测试。 +基本用法如下。 + +```shell 脚本 +bash ./tools/dist_test_tracking.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] +``` + +在单节点多 GPU 上测试 MOT 模型 DeepSort 的示例: + +```shell 脚本 +bash ./tools/dist_test_tracking.sh configs/qdtrack/qdtrack_faster-rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py 8 --detector ${CHECKPOINT_FILE} --reid ${CHECKPOINT_FILE} +``` + +#### 4. 在多个节点上测试 + +您可以在多个节点上进行测试,这与 "在多个节点上进行训练 "类似。 + +#### 5. 使用 Slurm 进行测试 + +在 Slurm 管理的集群上,您可以使用 `slurm_test_tracking.sh` 生成测试作业。 +它支持单节点和多节点测试。 + +基本用法如下。 + +```shell 脚本 +[GPUS=${GPUS}] bash tools/slurm_test_tracking.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} [optional arguments] +``` + +使用 Slurm 测试 VIS 模型 Mask2former 的示例: + +```shell 脚本 +GPUS=8 +bash tools/slurm_test_tracking.sh \ +mypartition \ +vis \ +configs/mask2former_vis/mask2former_r50_8xb2-8e_youtubevis2021.py \ +--checkpoint ${CHECKPOINT_FILE} +``` diff --git a/mmdetection/docs/zh_cn/user_guides/tracking_visualization.md b/mmdetection/docs/zh_cn/user_guides/tracking_visualization.md new file mode 100644 index 0000000..0d10952 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/tracking_visualization.md @@ -0,0 +1,51 @@ +# 了解可视化 + +## 本地的可视化 + +这一节将会展示如何使用本地的工具可视化 detection/tracking 的运行结果。 + +如果你想要画出预测结果的图像,你可以如下示例,将 `TrackVisualizationHook` 中的 draw 的参数设置为 `draw=True`。 + +```shell +default_hooks = dict(visualization=dict(type='TrackVisualizationHook', draw=True)) +``` + +`TrackVisualizationHook` 共有如下参数: + +- `draw`: 是否绘制预测结果。如果选择 False,将不会显示图像。该参数默认设置为 False。 +- `interval`: 可视化的间隔。默认值为 30。 +- `score_thr`: 确定是否可视化边界框和掩码的阈值。默认值是 0.3。 +- `show`: 是否展示绘制的图像。默认不显示。 +- `wait_time`: 展示的时间间隔(秒)。默认为 0。 +- `test_out_dir`: 测试过程中绘制图像保存的目录。 +- `backend_args`: 用于实例化文件客户端的参数。默认值为 `None `。 + +在 `TrackVisualizationHook` 中,将调用 `TrackLocalVisualizer` 来实现 MOT 和 VIS 任务的可视化。具体细节如下。 + +你可以通过 MMEngine 获取 [Visualization](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/advanced_tutorials/visualization.md) 和 [Hook](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/hook.md) 的更多细节。 + +### Tracking 的可视化 + +我们使用 `TrackLocalVisualizer` 这个类以实现跟踪任务可视化。调用方式如下: + +```python +visualizer = dict(type='TrackLocalVisualizer') +``` + +visualizer 共有如下的参数: + +- `name`: 所选实例的名称。默认值为 ‘visualizer’。 + +- `image`: 用于绘制的原始图像。格式需要为 RGB。默认为 None。 + +- `vis_backends`: 可视化后端配置列表。默认为 None。 + +- `save_dir`: 所有后端存储的保存文件目录。如果为 None,后端将不会保存任何数据。 + +- `line_width`: 边框宽度。默认值为 3。 + +- `alpha`: 边界框和掩码的透明度。默认为 0.8。 + +这里提供了一个 DeepSORT 的可视化示例: + +![test_img_89](https://user-images.githubusercontent.com/99722489/186062929-6d0e4663-0d8e-4045-9ec8-67e0e41da876.png) diff --git a/mmdetection/docs/zh_cn/user_guides/train.md b/mmdetection/docs/zh_cn/user_guides/train.md new file mode 100644 index 0000000..8feb1aa --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/train.md @@ -0,0 +1,451 @@ +# 在标准数据集上训练预定义的模型 + +MMDetection 也为训练检测模型提供了开盖即食的工具。本节将展示在标准数据集(比如 COCO)上如何训练一个预定义的模型。 + +### 数据集 + +训练需要准备好数据集,细节请参考 [数据集准备](#%E6%95%B0%E6%8D%AE%E9%9B%86%E5%87%86%E5%A4%87) 。 + +**注意**: +目前,`configs/cityscapes` 文件夹下的配置文件都是使用 COCO 预训练权值进行初始化的。如果网络连接不可用或者速度很慢,你可以提前下载现存的模型。否则可能在训练的开始会有错误发生。 + +### 学习率自动缩放 + +**注意**:在配置文件中的学习率是在 8 块 GPU,每块 GPU 有 2 张图像(批大小为 8\*2=16)的情况下设置的。其已经设置在 `config/_base_/schedules/schedule_1x.py` 中的 `auto_scale_lr.base_batch_size`。学习率会基于批次大小为 `16`时的值进行自动缩放。同时,为了不影响其他基于 mmdet 的 codebase,启用自动缩放标志 `auto_scale_lr.enable` 默认设置为 `False`。 + +如果要启用此功能,需在命令添加参数 `--auto-scale-lr`。并且在启动命令之前,请检查下即将使用的配置文件的名称,因为配置名称指示默认的批处理大小。 +在默认情况下,批次大小是 `8 x 
2 = 16`,例如:`faster_rcnn_r50_caffe_fpn_90k_coco.py` 或者 `pisa_faster_rcnn_x101_32x4d_fpn_1x_coco.py`;若不是默认批次,你可以在配置文件看到像 `_NxM_` 字样的,例如:`cornernet_hourglass104_mstest_32x3_210e_coco.py` 的批次大小是 `32 x 3 = 96`, 或者 `scnet_x101_64x4d_fpn_8x1_20e_coco.py` 的批次大小是 `8 x 1 = 8`。 + +**请记住:如果使用不是默认批次大小为 `16`的配置文件,请检查配置文件中的底部,会有 `auto_scale_lr.base_batch_size`。如果找不到,可以在其继承的 `_base_=[xxx]` 文件中找到。另外,如果想使用自动缩放学习率的功能,请不要修改这些值。** + +学习率自动缩放基本用法如下: + +```shell +python tools/train.py \ + ${CONFIG_FILE} \ + --auto-scale-lr \ + [optional arguments] +``` + +执行命令之后,会根据机器的GPU数量和训练的批次大小对学习率进行自动缩放,缩放方式详见 [线性扩展规则](https://arxiv.org/abs/1706.02677) ,比如:在 4 块 GPU 并且每张 GPU 上有 2 张图片的情况下 `lr=0.01`,那么在 16 块 GPU 并且每张 GPU 上有 4 张图片的情况下, LR 会自动缩放至 `lr=0.08`。 + +如果不启用该功能,则需要根据 [线性扩展规则](https://arxiv.org/abs/1706.02677) 来手动计算并修改配置文件里面 `optimizer.lr` 的值。 + +### 使用单 GPU 训练 + +我们提供了 `tools/train.py` 来开启在单张 GPU 上的训练任务。基本使用如下: + +```shell +python tools/train.py \ + ${CONFIG_FILE} \ + [optional arguments] +``` + +在训练期间,日志文件和 checkpoint 文件将会被保存在工作目录下,它需要通过配置文件中的 `work_dir` 或者 CLI 参数中的 `--work-dir` 来指定。 + +默认情况下,模型将在每轮训练之后在 validation 集上进行测试,测试的频率可以通过设置配置文件来指定: + +```python +# 每 12 轮迭代进行一次测试评估 +train_cfg = dict(val_interval=12) +``` + +这个工具接受以下参数: + +- `--work-dir ${WORK_DIR}`: 覆盖工作目录. +- `--resume`:自动从work_dir中的最新检查点恢复. +- `--resume ${CHECKPOINT_FILE}`: 从某个 checkpoint 文件继续训练. +- `--cfg-options 'Key=value'`: 覆盖使用的配置文件中的其他设置. + +**注意**: +`resume` 和 `load-from` 的区别: + +`resume` 既加载了模型的权重和优化器的状态,也会继承指定 checkpoint 的迭代次数,不会重新开始训练。`load-from` 则是只加载模型的权重,它的训练是从头开始的,经常被用于微调模型。其中load-from需要写入配置文件中,而resume作为命令行参数传入。 + +### 使用 CPU 训练 + +使用 CPU 训练的流程和使用单 GPU 训练的流程一致,我们仅需要在训练流程开始前禁用 GPU。 + +```shell +export CUDA_VISIBLE_DEVICES=-1 +``` + +之后运行单 GPU 训练脚本即可。 + +**注意**: + +我们不推荐用户使用 CPU 进行训练,这太过缓慢。我们支持这个功能是为了方便用户在没有 GPU 的机器上进行调试。 + +### 在多 GPU 上训练 + +我们提供了 `tools/dist_train.sh` 来开启在多 GPU 上的训练。基本使用如下: + +```shell +bash ./tools/dist_train.sh \ + ${CONFIG_FILE} \ + ${GPU_NUM} \ + [optional arguments] +``` + +可选参数和单 GPU 训练的可选参数一致。 + +#### 同时启动多个任务 + +如果你想在一台机器上启动多个任务的话,比如在一个有 8 块 GPU 的机器上启动 2 个需要 4 块GPU的任务,你需要给不同的训练任务指定不同的端口(默认为 29500)来避免冲突。 + +如果你使用 `dist_train.sh` 来启动训练任务,你可以使用命令来设置端口。 + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4 +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4 +``` + +### 使用多台机器训练 + +如果您想使用由 ethernet 连接起来的多台机器, 您可以使用以下命令: + +在第一台机器上: + +```shell +NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS +``` + +在第二台机器上: + +```shell +NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS +``` + +但是,如果您不使用高速网路连接这几台机器的话,训练将会非常慢。 + +### 使用 Slurm 来管理任务 + +Slurm 是一个常见的计算集群调度系统。在 Slurm 管理的集群上,你可以使用 `slurm.sh` 来开启训练任务。它既支持单节点训练也支持多节点训练。 + +基本使用如下: + +```shell +[GPUS=${GPUS}] ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${WORK_DIR} +``` + +以下是在一个名称为 _dev_ 的 Slurm 分区上,使用 16 块 GPU 来训练 Mask R-CNN 的例子,并且将 `work-dir` 设置在了某些共享文件系统下。 + +```shell +GPUS=16 ./tools/slurm_train.sh dev mask_r50_1x configs/mask_rcnn_r50_fpn_1x_coco.py /nfs/xxxx/mask_rcnn_r50_fpn_1x +``` + +你可以查看 [源码](https://github.com/open-mmlab/mmdetection/blob/main/tools/slurm_train.sh) 来检查全部的参数和环境变量. + +在使用 Slurm 时,端口需要以下方的某个方法之一来设置。 + +1. 
通过 `--options` 来设置端口。我们非常建议用这种方法,因为它无需改变原始的配置文件。 + + ```shell + CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} --cfg-options 'dist_params.port=29500' + CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} --cfg-options 'dist_params.port=29501' + ``` + +2. 修改配置文件来设置不同的交流端口。 + + 在 `config1.py` 中,设置: + + ```python + dist_params = dict(backend='nccl', port=29500) + ``` + + 在 `config2.py` 中,设置: + + ```python + dist_params = dict(backend='nccl', port=29501) + ``` + + 然后你可以使用 `config1.py` 和 `config2.py` 来启动两个任务了。 + + ```shell + CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} + CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 ./tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} + ``` + +# 在自定义数据集上进行训练 + +通过本文档,你将会知道如何使用自定义数据集对预先定义好的模型进行推理,测试以及训练。我们使用 [balloon dataset](https://github.com/matterport/Mask_RCNN/tree/master/samples/balloon) 作为例子来描述整个过程。 + +基本步骤如下: + +1. 准备自定义数据集 +2. 准备配置文件 +3. 在自定义数据集上进行训练,测试和推理。 + +## 准备自定义数据集 + +MMDetection 一共支持三种形式应用新数据集: + +1. 将数据集重新组织为 COCO 格式。 +2. 将数据集重新组织为一个中间格式。 +3. 实现一个新的数据集。 + +我们通常建议使用前面两种方法,因为它们通常来说比第三种方法要简单。 + +在本文档中,我们展示一个例子来说明如何将数据转化为 COCO 格式。 + +**注意**:在 MMDetection 3.0 之后,数据集和指标已经解耦(除了 CityScapes)。因此,用户在验证阶段使用任意的评价指标来评价模型在任意数据集上的性能。比如,用 VOC 评价指标来评价模型在 COCO 数据集的性能,或者同时使用 VOC 评价指标和 COCO 评价指标来评价模型在 OpenImages 数据集上的性能。 + +### COCO标注格式 + +用于实例分割的 COCO 数据集格式如下所示,其中的键(key)都是必要的,参考[这里](https://cocodataset.org/#format-data)来获取更多细节。 + +```json +{ + "images": [image], + "annotations": [annotation], + "categories": [category] +} + + +image = { + "id": int, + "width": int, + "height": int, + "file_name": str, +} + +annotation = { + "id": int, + "image_id": int, + "category_id": int, + "segmentation": RLE or [polygon], + "area": float, + "bbox": [x,y,width,height], # (x, y) 为 bbox 左上角的坐标 + "iscrowd": 0 or 1, +} + +categories = [{ + "id": int, + "name": str, + "supercategory": str, +}] +``` + +现在假设我们使用 balloon dataset。 + +下载了数据集之后,我们需要实现一个函数将标注格式转化为 COCO 格式。然后我们就可以使用已经实现的 `CocoDataset` 类来加载数据并进行训练以及评测。 + +如果你浏览过新数据集,你会发现格式如下: + +```json +{'base64_img_data': '', + 'file_attributes': {}, + 'filename': '34020010494_e5cb88e1c4_k.jpg', + 'fileref': '', + 'regions': {'0': {'region_attributes': {}, + 'shape_attributes': {'all_points_x': [1020, + 1000, + 994, + 1003, + 1023, + 1050, + 1089, + 1134, + 1190, + 1265, + 1321, + 1361, + 1403, + 1428, + 1442, + 1445, + 1441, + 1427, + 1400, + 1361, + 1316, + 1269, + 1228, + 1198, + 1207, + 1210, + 1190, + 1177, + 1172, + 1174, + 1170, + 1153, + 1127, + 1104, + 1061, + 1032, + 1020], + 'all_points_y': [963, + 899, + 841, + 787, + 738, + 700, + 663, + 638, + 621, + 619, + 643, + 672, + 720, + 765, + 800, + 860, + 896, + 942, + 990, + 1035, + 1079, + 1112, + 1129, + 1134, + 1144, + 1153, + 1166, + 1166, + 1150, + 1136, + 1129, + 1122, + 1112, + 1084, + 1037, + 989, + 963], + 'name': 'polygon'}}}, + 'size': 1115004} +``` + +标注文件时是 JSON 格式的,其中所有键(key)组成了一张图片的所有标注。 + +其中将 balloon dataset 转化为 COCO 格式的代码如下所示。 + +```python +import os.path as osp + +import mmcv + +from mmengine.fileio import dump, load +from mmengine.utils import track_iter_progress + + +def convert_balloon_to_coco(ann_file, out_file, image_prefix): + data_infos = load(ann_file) + + annotations = [] + images = [] + obj_count = 0 + for idx, v in enumerate(track_iter_progress(data_infos.values())): + filename = v['filename'] + img_path = osp.join(image_prefix, filename) + height, width = 
mmcv.imread(img_path).shape[:2] + + images.append( + dict(id=idx, file_name=filename, height=height, width=width)) + + for _, obj in v['regions'].items(): + assert not obj['region_attributes'] + obj = obj['shape_attributes'] + px = obj['all_points_x'] + py = obj['all_points_y'] + poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)] + poly = [p for x in poly for p in x] + + x_min, y_min, x_max, y_max = (min(px), min(py), max(px), max(py)) + + data_anno = dict( + image_id=idx, + id=obj_count, + category_id=0, + bbox=[x_min, y_min, x_max - x_min, y_max - y_min], + area=(x_max - x_min) * (y_max - y_min), + segmentation=[poly], + iscrowd=0) + annotations.append(data_anno) + obj_count += 1 + + coco_format_json = dict( + images=images, + annotations=annotations, + categories=[{ + 'id': 0, + 'name': 'balloon' + }]) + dump(coco_format_json, out_file) + + +if __name__ == '__main__': + convert_balloon_to_coco(ann_file='data/balloon/train/via_region_data.json', + out_file='data/balloon/train/annotation_coco.json', + image_prefix='data/balloon/train') + convert_balloon_to_coco(ann_file='data/balloon/val/via_region_data.json', + out_file='data/balloon/val/annotation_coco.json', + image_prefix='data/balloon/val') +``` + +使用如上的函数,用户可以成功将标注文件转化为 JSON 格式,之后可以使用 `CocoDataset` 对模型进行训练,并用 `CocoMetric` 评测。 + +## 准备配置文件 + +第二步需要准备一个配置文件来成功加载数据集。假设我们想要用 balloon dataset 来训练配备了 FPN 的 Mask R-CNN ,如下是我们的配置文件。假设配置文件命名为 `mask-rcnn_r50-caffe_fpn_ms-poly-1x_balloon.py`,相应保存路径为 `configs/balloon/`,配置文件内容如下所示。详细的配置文件方法可以参考[学习配置文件 — MMDetection 3.0.0 文档](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/config.html#base)。 + +```python +# 新配置继承了基本配置,并做了必要的修改 +_base_ = '../mask_rcnn/mask-rcnn_r50-caffe_fpn_ms-poly-1x_coco.py' + +# 我们还需要更改 head 中的 num_classes 以匹配数据集中的类别数 +model = dict( + roi_head=dict( + bbox_head=dict(num_classes=1), mask_head=dict(num_classes=1))) + +# 修改数据集相关配置 +data_root = 'data/balloon/' +metainfo = { + 'classes': ('balloon', ), + 'palette': [ + (220, 20, 60), + ] +} +train_dataloader = dict( + batch_size=1, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='train/annotation_coco.json', + data_prefix=dict(img='train/'))) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='val/annotation_coco.json', + data_prefix=dict(img='val/'))) +test_dataloader = val_dataloader + +# 修改评价指标相关配置 +val_evaluator = dict(ann_file=data_root + 'val/annotation_coco.json') +test_evaluator = val_evaluator + +# 使用预训练的 Mask R-CNN 模型权重来做初始化,可以提高模型性能 +load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' + +``` + +## 训练一个新的模型 + +为了使用新的配置方法来对模型进行训练,你只需要运行如下命令。 + +```shell +python tools/train.py configs/balloon/mask-rcnn_r50-caffe_fpn_ms-poly-1x_balloon.py +``` + +参考 [在标准数据集上训练预定义的模型](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/train.html#id1) 来获取更多详细的使用方法。 + +## 测试以及推理 + +为了测试训练完毕的模型,你只需要运行如下命令。 + +```shell +python tools/test.py configs/balloon/mask-rcnn_r50-caffe_fpn_ms-poly-1x_balloon.py work_dirs/mask-rcnn_r50-caffe_fpn_ms-poly-1x_balloon/epoch_12.pth +``` + +参考 [测试现有模型](https://mmdetection.readthedocs.io/zh_CN/latest/user_guides/test.html) 来获取更多详细的使用方法。 diff --git a/mmdetection/docs/zh_cn/user_guides/useful_hooks.md b/mmdetection/docs/zh_cn/user_guides/useful_hooks.md new file mode 100644 index 0000000..07a59df --- /dev/null +++ 
b/mmdetection/docs/zh_cn/user_guides/useful_hooks.md @@ -0,0 +1,107 @@ +# 实用的钩子 + +MMDetection 和 MMEngine 为用户提供了多种多样实用的钩子(Hook),包括 `MemoryProfilerHook`、`NumClassCheckHook` 等等。 +这篇教程介绍了 MMDetection 中实现的钩子功能及使用方式。若使用 MMEngine 定义的钩子请参考 [MMEngine 的钩子API文档](https://github.com/open-mmlab/mmengine/tree/main/docs/en/tutorials/hook.md). + +## CheckInvalidLossHook + +## NumClassCheckHook + +## MemoryProfilerHook + +[内存分析钩子](https://github.com/open-mmlab/mmdetection/blob/main/mmdet/engine/hooks/memory_profiler_hook.py) +记录了包括虚拟内存、交换内存、当前进程在内的所有内存信息,它能够帮助捕捉系统的使用状况与发现隐藏的内存泄露问题。为了使用这个钩子,你需要先通过 `pip install memory_profiler psutil` 命令安装 `memory_profiler` 和 `psutil`。 + +### 使用 + +为了使用这个钩子,使用者需要添加如下代码至 config 文件 + +```python +custom_hooks = [ + dict(type='MemoryProfilerHook', interval=50) +] +``` + +### 结果 + +在训练中,你会看到 `MemoryProfilerHook` 记录的如下信息: + +```text +The system has 250 GB (246360 MB + 9407 MB) of memory and 8 GB (5740 MB + 2452 MB) of swap memory in total. Currently 9407 MB (4.4%) of memory and 5740 MB (29.9%) of swap memory were consumed. And the current training process consumed 5434 MB of memory. +``` + +```text +2022-04-21 08:49:56,881 - mmengine - INFO - Memory information available_memory: 246360 MB, used_memory: 9407 MB, memory_utilization: 4.4 %, available_swap_memory: 5740 MB, used_swap_memory: 2452 MB, swap_memory_utilization: 29.9 %, current_process_memory: 5434 MB +``` + +## SetEpochInfoHook + +## SyncNormHook + +## SyncRandomSizeHook + +## YOLOXLrUpdaterHook + +## YOLOXModeSwitchHook + +## 如何实现自定义钩子 + +通常,从模型训练的开始到结束,共有20个点位可以执行钩子。我们可以实现自定义钩子在不同点位执行,以便在训练中实现自定义操作。 + +- global points: `before_run`, `after_run` +- points in training: `before_train`, `before_train_epoch`, `before_train_iter`, `after_train_iter`, `after_train_epoch`, `after_train` +- points in validation: `before_val`, `before_val_epoch`, `before_val_iter`, `after_val_iter`, `after_val_epoch`, `after_val` +- points at testing: `before_test`, `before_test_epoch`, `before_test_iter`, `after_test_iter`, `after_test_epoch`, `after_test` +- other points: `before_save_checkpoint`, `after_save_checkpoint` + +比如,我们要实现一个检查 loss 的钩子,当损失为 NaN 时自动结束训练。我们可以把这个过程分为三步: + +1. 在 MMEngine 实现一个继承于 `Hook` 类的新钩子,并实现 `after_train_iter` 方法用于检查每 `n` 次训练迭代后损失是否变为 NaN 。 +2. 使用 `@HOOKS.register_module()` 注册实现好了的自定义钩子,如下列代码所示。 +3. 在配置文件中添加 `custom_hooks = [dict(type='MemoryProfilerHook', interval=50)]` + +```python +from typing import Optional + +import torch +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class CheckInvalidLossHook(Hook): + """Check invalid loss hook. + + This hook will regularly check whether the loss is valid + during training. + + Args: + interval (int): Checking interval (every k iterations). + Default: 50. + """ + + def __init__(self, interval: int = 50) -> None: + self.interval = interval + + def after_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[dict] = None) -> None: + """Regularly check whether the loss is valid every n iterations. + + Args: + runner (:obj:`Runner`): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict, Optional): Data from dataloader. + Defaults to None. + outputs (dict, Optional): Outputs from model. Defaults to None. 
+ """ + if self.every_n_train_iters(runner, self.interval): + assert torch.isfinite(outputs['loss']), \ + runner.logger.info('loss become infinite or NaN!') +``` + +请参考 [自定义训练配置](../advanced_guides/customize_runtime.md) 了解更多与自定义钩子相关的内容。 diff --git a/mmdetection/docs/zh_cn/user_guides/useful_tools.md b/mmdetection/docs/zh_cn/user_guides/useful_tools.md new file mode 100644 index 0000000..8416472 --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/useful_tools.md @@ -0,0 +1,636 @@ +除了训练和测试脚本,我们还在 `tools/` 目录下提供了许多有用的工具。 + +## 日志分析 + +`tools/analysis_tools/analyze_logs.py` 可利用指定的训练 log 文件绘制 loss/mAP 曲线图, +第一次运行前请先运行 `pip install seaborn` 安装必要依赖. + +```shell +python tools/analysis_tools/analyze_logs.py plot_curve [--keys ${KEYS}] [--eval-interval ${EVALUATION_INTERVAL}] [--title ${TITLE}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] +``` + +![loss curve image](../../../resources/loss_curve.png) + +样例: + +- 绘制分类损失曲线图 + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls --legend loss_cls + ``` + +- 绘制分类损失、回归损失曲线图,保存图片为对应的 pdf 文件 + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log.json --keys loss_cls loss_bbox --out losses.pdf + ``` + +- 在相同图像中比较两次运行结果的 bbox mAP + + ```shell + python tools/analysis_tools/analyze_logs.py plot_curve log1.json log2.json --keys bbox_mAP --legend run1 run2 + ``` + +- 计算平均训练速度 + + ```shell + python tools/analysis_tools/analyze_logs.py cal_train_time log.json [--include-outliers] + ``` + + 输出以如下形式展示 + + ```text + -----Analyze train time of work_dirs/some_exp/20190611_192040.log.json----- + slowest epoch 11, average time is 1.2024 + fastest epoch 1, average time is 1.1909 + time std over epochs is 0.0028 + average iter time: 1.1959 s/iter + ``` + +## 结果分析 + +使用 `tools/analysis_tools/analyze_results.py` 可计算每个图像 mAP,随后根据真实标注框与预测框的比较结果,展示或保存最高与最低 top-k 得分的预测图像。 + +**使用方法** + +```shell +python tools/analysis_tools/analyze_results.py \ + ${CONFIG} \ + ${PREDICTION_PATH} \ + ${SHOW_DIR} \ + [--show] \ + [--wait-time ${WAIT_TIME}] \ + [--topk ${TOPK}] \ + [--show-score-thr ${SHOW_SCORE_THR}] \ + [--cfg-options ${CFG_OPTIONS}] +``` + +各个参数选项的作用: + +- `config`: model config 文件的路径。 +- `prediction_path`: 使用 `tools/test.py` 输出的 pickle 格式结果文件。 +- `show_dir`: 绘制真实标注框与预测框的图像存放目录。 +- `--show`:决定是否展示绘制 box 后的图片,默认值为 `False`。 +- `--wait-time`: show 时间的间隔,若为 0 表示持续显示。 +- `--topk`: 根据最高或最低 `topk` 概率排序保存的图片数量,若不指定,默认设置为 `20`。 +- `--show-score-thr`: 能够展示的概率阈值,默认为 `0`。 +- `--cfg-options`: 如果指定,可根据指定键值对覆盖更新配置文件的对应选项 + +**样例**: +假设你已经通过 `tools/test.py` 得到了 pickle 格式的结果文件,路径为 './result.pkl'。 + +1. 测试 Faster R-CNN 并可视化结果,保存图片至 `results/` + +```shell +python tools/analysis_tools/analyze_results.py \ + configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \ + result.pkl \ + results \ + --show +``` + +2. 测试 Faster R-CNN 并指定 top-k 参数为 50,保存结果图片至 `results/` + +```shell +python tools/analysis_tools/analyze_results.py \ + configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \ + result.pkl \ + results \ + --topk 50 +``` + +3. 
如果你想过滤低概率的预测结果,指定 `show-score-thr` 参数 + +```shell +python tools/analysis_tools/analyze_results.py \ + configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \ + result.pkl \ + results \ + --show-score-thr 0.3 +``` + +## 多模型检测结果融合 + +`tools/analysis_tools/fuse_results.py` 可使用 Weighted Boxes Fusion(WBF) 方法将多个模型的检测结果进行融合。(当前仅支持 COCO 格式) + +**使用方法** + +```shell +python tools/analysis_tools/fuse_results.py \ + ${PRED_RESULTS} \ + [--annotation ${ANNOTATION}] \ + [--weights ${WEIGHTS}] \ + [--fusion-iou-thr ${FUSION_IOU_THR}] \ + [--skip-box-thr ${SKIP_BOX_THR}] \ + [--conf-type ${CONF_TYPE}] \ + [--eval-single ${EVAL_SINGLE}] \ + [--save-fusion-results ${SAVE_FUSION_RESULTS}] \ + [--out-dir ${OUT_DIR}] +``` + +各个参数选项的作用: + +- `pred-results`: 多模型测试结果的保存路径。(目前仅支持 json 格式) +- `--annotation`: 真实标注框的保存路径。 +- `--weights`: 模型融合权重。默认设置下,每个模型的权重均为1。 +- `--fusion-iou-thr`: 在WBF算法中,匹配成功的 IoU 阈值,默认值为`0.55`。 +- `--skip-box-thr`: WBF算法中需剔除的置信度阈值,置信度小于该值的 bbox 会被剔除,默认值为`0`。 +- `--conf-type`: 如何计算融合后 bbox 的置信度。有以下四种选项: + - `avg`: 取平均值,默认为此选项。 + - `max`: 取最大值。 + - `box_and_model_avg`: box和模型尺度的加权平均值。 + - `absent_model_aware_avg`: 考虑缺失模型的加权平均值。 +- `--eval-single`: 是否评估每个单一模型,默认值为`False`。 +- `--save-fusion-results`: 是否保存融合结果,默认值为`False`。 +- `--out-dir`: 融合结果保存的路径。 + +**样例**: +假设你已经通过 `tools/test.py` 得到了3个模型的 json 格式的结果文件,路径分别为 './faster-rcnn_r50-caffe_fpn_1x_coco.json', './retinanet_r50-caffe_fpn_1x_coco.json', './cascade-rcnn_r50-caffe_fpn_1x_coco.json',真实标注框的文件路径为'./annotation.json'。 + +1. 融合三个模型的预测结果并评估其效果 + +```shell +python tools/analysis_tools/fuse_results.py \ + ./faster-rcnn_r50-caffe_fpn_1x_coco.json \ + ./retinanet_r50-caffe_fpn_1x_coco.json \ + ./cascade-rcnn_r50-caffe_fpn_1x_coco.json \ + --annotation ./annotation.json \ + --weights 1 2 3 \ +``` + +2. 同时评估每个单一模型与融合结果 + +```shell +python tools/analysis_tools/fuse_results.py \ + ./faster-rcnn_r50-caffe_fpn_1x_coco.json \ + ./retinanet_r50-caffe_fpn_1x_coco.json \ + ./cascade-rcnn_r50-caffe_fpn_1x_coco.json \ + --annotation ./annotation.json \ + --weights 1 2 3 \ + --eval-single +``` + +3. 融合三个模型的预测结果并保存 + +```shell +python tools/analysis_tools/fuse_results.py \ + ./faster-rcnn_r50-caffe_fpn_1x_coco.json \ + ./retinanet_r50-caffe_fpn_1x_coco.json \ + ./cascade-rcnn_r50-caffe_fpn_1x_coco.json \ + --annotation ./annotation.json \ + --weights 1 2 3 \ + --save-fusion-results \ + --out-dir outputs/fusion +``` + +## 可视化 + +### 可视化数据集 + +`tools/analysis_tools/browse_dataset.py` 可帮助使用者检查所使用的检测数据集(包括图像和标注),或保存图像至指定目录。 + +```shell +python tools/analysis_tools/browse_dataset.py ${CONFIG} [-h] [--skip-type ${SKIP_TYPE[SKIP_TYPE...]}] [--output-dir ${OUTPUT_DIR}] [--not-show] [--show-interval ${SHOW_INTERVAL}] +``` + +### 可视化模型 + +在可视化之前,需要先转换模型至 ONNX 格式,[可参考此处](#convert-mmdetection-model-to-onnx-experimental)。 +注意,现在只支持 RetinaNet,之后的版本将会支持其他模型 +转换后的模型可以被其他工具可视化[Netron](https://github.com/lutzroeder/netron)。 + +### 可视化预测结果 + +如果你想要一个轻量 GUI 可视化检测结果,你可以参考 [DetVisGUI project](https://github.com/Chien-Hung/DetVisGUI/tree/mmdetection)。 + +## 误差分析 + +`tools/analysis_tools/coco_error_analysis.py` 使用不同标准分析每个类别的 COCO 评估结果。同时将一些有帮助的信息体现在图表上。 + +```shell +python tools/analysis_tools/coco_error_analysis.py ${RESULT} ${OUT_DIR} [-h] [--ann ${ANN}] [--types ${TYPES[TYPES...]}] +``` + +样例: + +假设你已经把 [Mask R-CNN checkpoint file](https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_1x_coco/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth) 放置在文件夹 'checkpoint' 中(其他模型请在 [model zoo](./model_zoo.md) 中获取)。 + +为了保存 bbox 结果信息,我们需要用下列方式修改 `test_evaluator` : + +1. 
查找当前 config 文件相对应的 'configs/base/datasets' 数据集信息。 +2. 用当前数据集 config 中的 test_evaluator 以及 test_dataloader 替换原始文件的 test_evaluator 以及 test_dataloader。 +3. 使用以下命令得到 bbox 或 segmentation 的 json 格式文件。 + +```shell +python tools/test.py \ + configs/mask_rcnn/mask-rcnn_r50_fpn_1x_coco.py \ + checkpoint/mask_rcnn_r50_fpn_1x_coco_20200205-d4b0c5d6.pth \ +``` + +1. 得到每一类的 COCO bbox 误差结果,并保存分析结果图像至指定目录。(在 [config](../../../configs/_base_/datasets/coco_instance.py) 中默认目录是 './work_dirs/coco_instance/test') + +```shell +python tools/analysis_tools/coco_error_analysis.py \ + results.bbox.json \ + results \ + --ann=data/coco/annotations/instances_val2017.json \ +``` + +2. 得到每一类的 COCO 分割误差结果,并保存分析结果图像至指定目录。 + +```shell +python tools/analysis_tools/coco_error_analysis.py \ + results.segm.json \ + results \ + --ann=data/coco/annotations/instances_val2017.json \ + --types='segm' +``` + +## 模型服务部署 + +如果你想使用 [`TorchServe`](https://pytorch.org/serve/) 搭建一个 `MMDetection` 模型服务,可以参考以下步骤: + +### 1. 安装 TorchServe + +假设你已经成功安装了包含 `PyTorch` 和 `MMDetection` 的 `Python` 环境,那么你可以运行以下命令来安装 `TorchServe` 及其依赖项。有关更多其他安装选项,请参考[快速入门](https://github.com/pytorch/serve/blob/master/README.md#serve-a-model)。 + +```shell +python -m pip install torchserve torch-model-archiver torch-workflow-archiver nvgpu +``` + +**注意**: 如果你想在 docker 中使用`TorchServe`,请参考[torchserve docker](https://github.com/pytorch/serve/blob/master/docker/README.md)。 + +### 2. 把 MMDetection 模型转换至 TorchServe + +```shell +python tools/deployment/mmdet2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \ +--output-folder ${MODEL_STORE} \ +--model-name ${MODEL_NAME} +``` + +### 3. 启动 `TorchServe` + +```shell +torchserve --start --ncs \ + --model-store ${MODEL_STORE} \ + --models ${MODEL_NAME}.mar +``` + +### 4. 测试部署效果 + +```shell +curl -O curl -O https://raw.githubusercontent.com/pytorch/serve/master/docs/images/3dogs.jpg +curl http://127.0.0.1:8080/predictions/${MODEL_NAME} -T 3dogs.jpg +``` + +你可以得到下列 json 信息: + +```json +[ + { + "class_label": 16, + "class_name": "dog", + "bbox": [ + 294.63409423828125, + 203.99111938476562, + 417.048583984375, + 281.62744140625 + ], + "score": 0.9987992644309998 + }, + { + "class_label": 16, + "class_name": "dog", + "bbox": [ + 404.26019287109375, + 126.0080795288086, + 574.5091552734375, + 293.6662292480469 + ], + "score": 0.9979367256164551 + }, + { + "class_label": 16, + "class_name": "dog", + "bbox": [ + 197.2144775390625, + 93.3067855834961, + 307.8505554199219, + 276.7560119628906 + ], + "score": 0.993338406085968 + } +] +``` + +#### 结果对比 + +你也可以使用 `test_torchserver.py` 来比较 `TorchServe` 和 `PyTorch` 的结果,并可视化: + +```shell +python tools/deployment/test_torchserver.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME} +[--inference-addr ${INFERENCE_ADDR}] [--device ${DEVICE}] [--score-thr ${SCORE_THR}] [--work-dir ${WORK_DIR}] +``` + +样例: + +```shell +python tools/deployment/test_torchserver.py \ +demo/demo.jpg \ +configs/yolo/yolov3_d53_8xb8-320-273e_coco.py \ +checkpoint/yolov3_d53_320_273e_coco-421362b6.pth \ +yolov3 \ +--work-dir ./work-dir +``` + +### 5. 
停止 `TorchServe` + +```shell +torchserve --stop +``` + +## 模型复杂度 + +`tools/analysis_tools/get_flops.py` 工具可用于计算指定模型的 FLOPs、参数量大小(改编自 [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) )。 + +```shell +python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] +``` + +获得的结果如下: + +```text +============================== +Input shape: (3, 1280, 800) +Flops: 239.32 GFLOPs +Params: 37.74 M +============================== +``` + +**注意**:这个工具还只是实验性质,我们不保证这个数值是绝对正确的。你可以将他用于简单的比较,但如果用于科技论文报告需要再三检查确认。 + +1. FLOPs 与输入的形状大小相关,参数量没有这个关系,默认的输入形状大小为 (1, 3, 1280, 800) 。 +2. 一些算子并不计入 FLOPs,比如 GN 或其他自定义的算子。你可以参考 [`mmcv.cnn.get_model_complexity_info()`](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/cnn/utils/flops_counter.py) 查看更详细的说明。 +3. 两阶段检测的 FLOPs 大小取决于 proposal 的数量。 + +## 模型转换 + +### MMDetection 模型转换至 ONNX 格式 + +我们提供了一个脚本用于转换模型至 [ONNX](https://github.com/onnx/onnx) 格式。同时还支持比较 Pytorch 与 ONNX 模型的输出结果以便对照。更详细的内容可以参考 [mmdeploy](https://github.com/open-mmlab/mmdeploy)。 + +### MMDetection 1.x 模型转换至 MMDetection 2.x 模型 + +`tools/model_converters/upgrade_model_version.py` 可将旧版本的 MMDetection checkpoints 转换至新版本。但要注意此脚本不保证在新版本加入非兼容更新后还能正常转换,建议您直接使用新版本的 checkpoints。 + +```shell +python tools/model_converters/upgrade_model_version.py ${IN_FILE} ${OUT_FILE} [-h] [--num-classes NUM_CLASSES] +``` + +### RegNet 模型转换至 MMDetection 模型 + +`tools/model_converters/regnet2mmdet.py` 将 pycls 编码的预训练 RegNet 模型转换为 MMDetection 风格。 + +```shell +python tools/model_converters/regnet2mmdet.py ${SRC} ${DST} [-h] +``` + +### Detectron ResNet 模型转换至 Pytorch 模型 + +`tools/model_converters/detectron2pytorch.py` 将 detectron 的原始预训练 RegNet 模型转换为 MMDetection 风格。 + +```shell +python tools/model_converters/detectron2pytorch.py ${SRC} ${DST} ${DEPTH} [-h] +``` + +### 制作发布用模型 + +`tools/model_converters/publish_model.py` 可用来制作一个发布用的模型。 + +在发布模型至 AWS 之前,你可能需要: + +1. 将模型转换至 CPU 张量 +2. 删除优化器状态 +3. 计算 checkpoint 文件的 hash 值,并将 hash 号码记录至文件名。 + +```shell +python tools/model_converters/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} +``` + +样例: + +```shell +python tools/model_converters/publish_model.py work_dirs/faster_rcnn/latest.pth faster_rcnn_r50_fpn_1x_20190801.pth +``` + +最后输出的文件名如下所示: `faster_rcnn_r50_fpn_1x_20190801-{hash id}.pth`. 
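+
+上述三个步骤大致可以用下面这段 Python 代码来示意(这只是帮助理解的最小示例,并非 `publish_model.py` 的真实实现;函数名与路径均为假设的示例):
+
+```python
+import hashlib
+import os
+
+import torch
+
+
+def publish_checkpoint(in_file: str, out_file: str) -> str:
+    # 1. 加载 checkpoint,并把权重映射到 CPU 张量
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # 2. 删除优化器状态,减小发布文件的体积
+    checkpoint.pop('optimizer', None)
+    torch.save(checkpoint, out_file)
+    # 3. 计算文件的 hash 值,并把前 8 位追加到文件名中
+    with open(out_file, 'rb') as f:
+        sha = hashlib.sha256(f.read()).hexdigest()
+    final_file = out_file.replace('.pth', f'-{sha[:8]}.pth')
+    os.rename(out_file, final_file)
+    return final_file
+
+
+# 示例用法(路径仅为示意):
+# publish_checkpoint('work_dirs/faster_rcnn/latest.pth',
+#                    'faster_rcnn_r50_fpn_1x_20190801.pth')
+```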
+ +## 数据集转换 + +`tools/data_converters/` 提供了将 Cityscapes 数据集与 Pascal VOC 数据集转换至 COCO 数据集格式的工具 + +```shell +python tools/dataset_converters/cityscapes.py ${CITYSCAPES_PATH} [-h] [--img-dir ${IMG_DIR}] [--gt-dir ${GT_DIR}] [-o ${OUT_DIR}] [--nproc ${NPROC}] +python tools/dataset_converters/pascal_voc.py ${DEVKIT_PATH} [-h] [-o ${OUT_DIR}] +``` + +## 数据集下载 + +`tools/misc/download_dataset.py` 可以下载各类形如 COCO, VOC, LVIS 数据集。 + +```shell +python tools/misc/download_dataset.py --dataset-name coco2017 +python tools/misc/download_dataset.py --dataset-name voc2007 +python tools/misc/download_dataset.py --dataset-name lvis +``` + +对于中国境内的用户,我们也推荐使用开源数据平台 [OpenDataLab](https://opendatalab.com/?source=OpenMMLab%20GitHub) 来获取这些数据集,以获得更好的下载体验: + +- [COCO2017](https://opendatalab.com/COCO_2017/download?source=OpenMMLab%20GitHub) +- [VOC2007](https://opendatalab.com/PASCAL_VOC2007/download?source=OpenMMLab%20GitHub) +- [VOC2012](https://opendatalab.com/PASCAL_VOC2012/download?source=OpenMMLab%20GitHub) +- [LVIS](https://opendatalab.com/LVIS/download?source=OpenMMLab%20GitHub) + +## 基准测试 + +### 鲁棒性测试基准 + +`tools/analysis_tools/test_robustness.py` 及 `tools/analysis_tools/robustness_eval.py` 帮助使用者衡量模型的鲁棒性。其核心思想来源于 [Benchmarking Robustness in Object Detection: Autonomous Driving when Winter is Coming](https://arxiv.org/abs/1907.07484)。如果你想了解如何在污损图像上评估模型的效果,以及参考该基准的一组标准模型,请参照 [robustness_benchmarking.md](robustness_benchmarking.md)。 + +### FPS 测试基准 + +`tools/analysis_tools/benchmark.py` 可帮助使用者计算 FPS,FPS 计算包括了模型向前传播与后处理过程。为了得到更精确的计算值,现在的分布式计算模式只支持一个 GPU。 + +```shell +python -m torch.distributed.launch --nproc_per_node=1 --master_port=${PORT} tools/analysis_tools/benchmark.py \ + ${CONFIG} \ + [--checkpoint ${CHECKPOINT}] \ + [--repeat-num ${REPEAT_NUM}] \ + [--max-iter ${MAX_ITER}] \ + [--log-interval ${LOG_INTERVAL}] \ + --launcher pytorch +``` + +样例:假设你已经下载了 `Faster R-CNN` 模型 checkpoint 并放置在 `checkpoints/` 目录下。 + +```shell +python -m torch.distributed.launch --nproc_per_node=1 --master_port=29500 tools/analysis_tools/benchmark.py \ + configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \ + checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \ + --launcher pytorch +``` + +## 更多工具 + +### 以某个评估标准进行评估 + +`tools/analysis_tools/eval_metric.py` 根据配置文件中的评估方式对 pkl 结果文件进行评估。 + +```shell +python tools/analysis_tools/eval_metric.py ${CONFIG} ${PKL_RESULTS} [-h] [--format-only] [--eval ${EVAL[EVAL ...]}] + [--cfg-options ${CFG_OPTIONS [CFG_OPTIONS ...]}] + [--eval-options ${EVAL_OPTIONS [EVAL_OPTIONS ...]}] +``` + +### 打印全部 config + +`tools/misc/print_config.py` 可将所有配置继承关系展开,完全打印相应的配置文件。 + +```shell +python tools/misc/print_config.py ${CONFIG} [-h] [--options ${OPTIONS [OPTIONS...]}] +``` + +## 超参数优化 + +### YOLO Anchor 优化 + +`tools/analysis_tools/optimize_anchors.py` 提供了两种方法优化 YOLO 的 anchors。 + +其中一种方法使用 K 均值 anchor 聚类(k-means anchor cluster),源自 [darknet](https://github.com/AlexeyAB/darknet/blob/master/src/detector.c#L1421)。 + +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} --algorithm k-means --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} --output-dir ${OUTPUT_DIR} +``` + +另一种方法使用差分进化算法优化 anchors。 + +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} --algorithm differential_evolution --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} --output-dir ${OUTPUT_DIR} +``` + +样例: + +```shell +python tools/analysis_tools/optimize_anchors.py configs/yolo/yolov3_d53_8xb8-320-273e_coco.py --algorithm differential_evolution --input-shape 608 608 --device cuda --output-dir work_dirs +``` + +你可能会看到如下结果: 
+ +``` +loading annotations into memory... +Done (t=9.70s) +creating index... +index created! +2021-07-19 19:37:20,951 - mmdet - INFO - Collecting bboxes from annotation... +[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 117266/117266, 15874.5 task/s, elapsed: 7s, ETA: 0s + +2021-07-19 19:37:28,753 - mmdet - INFO - Collected 849902 bboxes. +differential_evolution step 1: f(x)= 0.506055 +differential_evolution step 2: f(x)= 0.506055 +...... + +differential_evolution step 489: f(x)= 0.386625 +2021-07-19 19:46:40,775 - mmdet - INFO Anchor evolution finish. Average IOU: 0.6133754253387451 +2021-07-19 19:46:40,776 - mmdet - INFO Anchor differential evolution result:[[10, 12], [15, 30], [32, 22], [29, 59], [61, 46], [57, 116], [112, 89], [154, 198], [349, 336]] +2021-07-19 19:46:40,798 - mmdet - INFO Result saved in work_dirs/anchor_optimize_result.json +``` + +## 混淆矩阵 + +混淆矩阵是对检测结果的概览。 +`tools/analysis_tools/confusion_matrix.py` 可对预测结果进行分析,绘制成混淆矩阵表。 +首先,运行 `tools/test.py` 保存 `.pkl` 预测结果。 +之后再运行: + +``` +python tools/analysis_tools/confusion_matrix.py ${CONFIG} ${DETECTION_RESULTS} ${SAVE_DIR} --show +``` + +最后你可以得到如图的混淆矩阵: + +![confusion_matrix_example](https://user-images.githubusercontent.com/12907710/140513068-994cdbf4-3a4a-48f0-8fd8-2830d93fd963.png) + +## COCO 分离和遮挡实例分割性能评估 + +对于最先进的目标检测器来说,检测被遮挡的物体仍然是一个挑战。 +我们实现了论文 [A Tri-Layer Plugin to Improve Occluded Detection](https://arxiv.org/abs/2210.10046) 中提出的指标来计算分离和遮挡目标的召回率。 + +使用此评价指标有两种方法: + +### 离线评测 + +我们提供了一个脚本对存储后的检测结果文件计算指标。 + +首先,使用 `tools/test.py` 脚本存储检测结果: + +```shell +python tools/test.py ${CONFIG} ${MODEL_PATH} --out results.pkl +``` + +然后,运行 `tools/analysis_tools/coco_occluded_separated_recall.py` 脚本来计算分离和遮挡目标的掩码的召回率: + +```shell +python tools/analysis_tools/coco_occluded_separated_recall.py results.pkl --out occluded_separated_recall.json +``` + +输出如下: + +``` +loading annotations into memory... +Done (t=0.51s) +creating index... +index created! +processing detection results... +[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 5000/5000, 109.3 task/s, elapsed: 46s, ETA: 0s +computing occluded mask recall... +[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 5550/5550, 780.5 task/s, elapsed: 7s, ETA: 0s +COCO occluded mask recall: 58.79% +COCO occluded mask success num: 3263 +computing separated mask recall... +[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 3522/3522, 778.3 task/s, elapsed: 5s, ETA: 0s +COCO separated mask recall: 31.94% +COCO separated mask success num: 1125 + ++-----------+--------+-------------+ +| mask type | recall | num correct | ++-----------+--------+-------------+ +| occluded | 58.79% | 3263 | +| separated | 31.94% | 1125 | ++-----------+--------+-------------+ +Evaluation results have been saved to occluded_separated_recall.json. 
+``` + +### 在线评测 + +我们实现继承自 `CocoMetic` 的 `CocoOccludedSeparatedMetric`。 +要在训练期间评估分离和遮挡掩码的召回率,只需在配置中将 evaluator 类型替换为 `CocoOccludedSeparatedMetric`: + +```python +val_evaluator = dict( + type='CocoOccludedSeparatedMetric', # 修改此处 + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False) +test_evaluator = val_evaluator +``` + +如果您使用了此指标,请引用论文: + +```latex +@article{zhan2022triocc, + title={A Tri-Layer Plugin to Improve Occluded Detection}, + author={Zhan, Guanqi and Xie, Weidi and Zisserman, Andrew}, + journal={British Machine Vision Conference}, + year={2022} +} +``` diff --git a/mmdetection/docs/zh_cn/user_guides/visualization.md b/mmdetection/docs/zh_cn/user_guides/visualization.md new file mode 100644 index 0000000..f90ab6d --- /dev/null +++ b/mmdetection/docs/zh_cn/user_guides/visualization.md @@ -0,0 +1,93 @@ +# 可视化 + +在阅读本教程之前,建议先阅读 MMEngine 的 [Visualization](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/visualization.md) 文档,以对 `Visualizer` 的定义和用法有一个初步的了解。 + +简而言之,`Visualizer` 在 MMEngine 中实现以满足日常可视化需求,并包含以下三个主要功能: + +- 实现通用的绘图 API,例如 [`draw_bboxes`](mmengine.visualization.Visualizer.draw_bboxes) 实现了绘制边界框的功能,[`draw_lines`](mmengine.visualization.Visualizer.draw_lines) 实现了绘制线条的功能。 +- 支持将可视化结果、学习率曲线、损失函数曲线以及验证精度曲线写入到各种后端中,包括本地磁盘以及常见的深度学习训练日志工具,例如 [TensorBoard](https://www.tensorflow.org/tensorboard) 和 [Wandb](https://wandb.ai/site)。 +- 支持在代码的任何位置调用以可视化或记录模型在训练或测试期间的中间状态,例如特征图和验证结果。 + +基于 MMEngine 的 `Visualizer`,MMDet 提供了各种预构建的可视化工具,用户可以通过简单地修改以下配置文件来使用它们。 + +- `tools/analysis_tools/browse_dataset.py` 脚本提供了一个数据集可视化功能,可以在数据经过数据转换后绘制图像和相应的注释,具体描述请参见[`browse_dataset.py`](useful_tools.md#Visualization)。 + +- MMEngine实现了`LoggerHook`,使用`Visualizer`将学习率、损失和评估结果写入由`Visualizer`设置的后端。因此,通过修改配置文件中的`Visualizer`后端,例如修改为`TensorBoardVISBackend`或`WandbVISBackend`,可以实现日志记录到常用的训练日志工具,如`TensorBoard`或`WandB`,从而方便用户使用这些可视化工具来分析和监控训练过程。 + +- 在MMDet中实现了`VisualizerHook`,它使用`Visualizer`将验证或预测阶段的预测结果可视化或存储到由`Visualizer`设置的后端。因此,通过修改配置文件中的`Visualizer`后端,例如修改为`TensorBoardVISBackend`或`WandbVISBackend`,可以将预测图像存储到`TensorBoard`或`Wandb`中。 + +## 配置 + +由于使用了注册机制,在MMDet中我们可以通过修改配置文件来设置`Visualizer`的行为。通常,我们会在`configs/_base_/default_runtime.py`中为可视化器定义默认配置,详细信息请参见[配置教程](config.md)。 + +```Python +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='DetLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') +``` + +基于上面的例子,我们可以看到`Visualizer`的配置由两个主要部分组成,即`Visualizer`类型和其使用的可视化后端`vis_backends`。 + +- 用户可直接使用`DetLocalVisualizer`来可视化支持任务的标签或预测结果。 +- MMDet默认将可视化后端`vis_backend`设置为本地可视化后端`LocalVisBackend`,将所有可视化结果和其他训练信息保存在本地文件夹中。 + +## 存储 + +MMDet默认使用本地可视化后端[`LocalVisBackend`](mmengine.visualization.LocalVisBackend),`VisualizerHook`和`LoggerHook`中存储的模型损失、学习率、模型评估精度和可视化信息,包括损失、学习率、评估精度将默认保存到`{work_dir}/{config_name}/{time}/{vis_data}`文件夹中。此外,MMDet还支持其他常见的可视化后端,例如`TensorboardVisBackend`和`WandbVisBackend`,您只需要在配置文件中更改`vis_backends`类型为相应的可视化后端即可。例如,只需在配置文件中插入以下代码块即可将数据存储到`TensorBoard`和`Wandb`中。 + +```Python +# https://mmengine.readthedocs.io/en/latest/api/visualization.html +_base_.visualizer.vis_backends = [ + dict(type='LocalVisBackend'), # + dict(type='TensorboardVisBackend'), + dict(type='WandbVisBackend'),] +``` + +## 绘图 + +### 绘制预测结果 + +MMDet主要使用[`DetVisualizationHook`](mmdet.engine.hooks.DetVisualizationHook)来绘制验证和测试的预测结果,默认情况下`DetVisualizationHook`是关闭的,其默认配置如下。 + +```Python +visualization=dict( #用户可视化验证和测试结果 + type='DetVisualizationHook', + draw=False, + interval=1, + show=False) +``` + 
+以下表格展示了`DetVisualizationHook`支持的参数。 + +| 参数 | 描述 | +| :------: | :------------------------------------------------------------------------------: | +| draw | DetVisualizationHook通过enable参数打开和关闭,默认状态为关闭。 | +| interval | 控制在DetVisualizationHook启用时存储或显示验证或测试结果的间隔,单位为迭代次数。 | +| show | 控制是否可视化验证或测试的结果。 | + +如果您想在训练或测试期间启用 `DetVisualizationHook` 相关功能和配置,您只需要修改配置文件,以 `configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py` 为例,同时绘制注释和预测,并显示图像,配置文件可以修改如下: + +```Python +visualization = _base_.default_hooks.visualization +visualization.update(dict(draw=True, show=True)) +``` + +
    + +`test.py`程序提供了`--show`和`--show-dir`参数,可以在测试过程中可视化注释和预测结果,而不需要修改配置文件,从而进一步简化了测试过程。 + +```Shell +# 展示测试结果 +python tools/test.py configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth --show + +# 指定存储预测结果的位置 +python tools/test.py configs/rtmdet/rtmdet_tiny_8xb32-300e_coco.py https://download.openmmlab.com/mmdetection/v3.0/rtmdet/rtmdet_tiny_8xb32-300e_coco/rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth --show-dir imgs/ +``` + +
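+另外,如果按照上文「存储」一节把 `vis_backends` 切换为 `TensorboardVisBackend`,训练和测试期间记录的标量与可视化结果通常会写入 `Visualizer` 对应后端的保存目录(默认位于 `work_dir` 下的 `vis_data` 中),此时可以直接用 TensorBoard 查看,例如(日志目录仅为示意,实际位置取决于你的 `work_dir` 设置):
+
+```shell
+tensorboard --logdir work_dirs/rtmdet_tiny_8xb32-300e_coco
+```
+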
    diff --git a/mmdetection/mmdet/__init__.py b/mmdetection/mmdet/__init__.py new file mode 100644 index 0000000..3ac884a --- /dev/null +++ b/mmdetection/mmdet/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import mmengine +from mmengine.utils import digit_version + +from .version import __version__, version_info + +mmcv_minimum_version = '2.0.0rc4' +mmcv_maximum_version = '2.2.0' +mmcv_version = digit_version(mmcv.__version__) + +mmengine_minimum_version = '0.7.1' +mmengine_maximum_version = '1.0.0' +mmengine_version = digit_version(mmengine.__version__) + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version < digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.' + +assert (mmengine_version >= digit_version(mmengine_minimum_version) + and mmengine_version < digit_version(mmengine_maximum_version)), \ + f'MMEngine=={mmengine.__version__} is used but incompatible. ' \ + f'Please install mmengine>={mmengine_minimum_version}, ' \ + f'<{mmengine_maximum_version}.' + +__all__ = ['__version__', 'version_info', 'digit_version'] diff --git a/mmdetection/mmdet/apis/__init__.py b/mmdetection/mmdet/apis/__init__.py new file mode 100644 index 0000000..c89dc72 --- /dev/null +++ b/mmdetection/mmdet/apis/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .det_inferencer import DetInferencer +from .inference import (async_inference_detector, inference_detector, + inference_mot, init_detector, init_track_model) + +__all__ = [ + 'init_detector', 'async_inference_detector', 'inference_detector', + 'DetInferencer', 'inference_mot', 'init_track_model' +] diff --git a/mmdetection/mmdet/apis/det_inferencer.py b/mmdetection/mmdet/apis/det_inferencer.py new file mode 100644 index 0000000..9efbb00 --- /dev/null +++ b/mmdetection/mmdet/apis/det_inferencer.py @@ -0,0 +1,644 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import warnings +from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union + +import mmcv +import mmengine +import numpy as np +import torch.nn as nn +from mmcv.transforms import LoadImageFromFile +from mmengine.dataset import Compose +from mmengine.fileio import (get_file_backend, isdir, join_path, + list_dir_or_file) +from mmengine.infer.infer import BaseInferencer, ModelType +from mmengine.model.utils import revert_sync_batchnorm +from mmengine.registry import init_default_scope +from mmengine.runner.checkpoint import _load_checkpoint_to_model +from mmengine.visualization import Visualizer +from rich.progress import track + +from mmdet.evaluation import INSTANCE_OFFSET +from mmdet.registry import DATASETS +from mmdet.structures import DetDataSample +from mmdet.structures.mask import encode_mask_results, mask2bbox +from mmdet.utils import ConfigType +from ..evaluation import get_classes + +try: + from panopticapi.evaluation import VOID + from panopticapi.utils import id2rgb +except ImportError: + id2rgb = None + VOID = None + +InputType = Union[str, np.ndarray] +InputsType = Union[InputType, Sequence[InputType]] +PredType = List[DetDataSample] +ImgType = Union[np.ndarray, Sequence[np.ndarray]] + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +class DetInferencer(BaseInferencer): + """Object Detection Inferencer. 
+ + Args: + model (str, optional): Path to the config file or the model name + defined in metafile. For example, it could be + "rtmdet-s" or 'rtmdet_s_8xb32-300e_coco' or + "configs/rtmdet/rtmdet_s_8xb32-300e_coco.py". + If model is not specified, user must provide the + `weights` saved by MMEngine which contains the config string. + Defaults to None. + weights (str, optional): Path to the checkpoint. If it is not specified + and model is a model name of metafile, the weights will be loaded + from metafile. Defaults to None. + device (str, optional): Device to run inference. If None, the available + device will be automatically used. Defaults to None. + scope (str, optional): The scope of the model. Defaults to mmdet. + palette (str): Color palette used for visualization. The order of + priority is palette -> config -> checkpoint. Defaults to 'none'. + show_progress (bool): Control whether to display the progress + bar during the inference process. Defaults to True. + """ + + preprocess_kwargs: set = set() + forward_kwargs: set = set() + visualize_kwargs: set = { + 'return_vis', + 'show', + 'wait_time', + 'draw_pred', + 'pred_score_thr', + 'img_out_dir', + 'no_save_vis', + } + postprocess_kwargs: set = { + 'print_result', + 'pred_out_dir', + 'return_datasamples', + 'no_save_pred', + } + + def __init__(self, + model: Optional[Union[ModelType, str]] = None, + weights: Optional[str] = None, + device: Optional[str] = None, + scope: Optional[str] = 'mmdet', + palette: str = 'none', + show_progress: bool = True) -> None: + # A global counter tracking the number of images processed, for + # naming of the output images + self.num_visualized_imgs = 0 + self.num_predicted_imgs = 0 + self.palette = palette + init_default_scope(scope) + super().__init__( + model=model, weights=weights, device=device, scope=scope) + self.model = revert_sync_batchnorm(self.model) + self.show_progress = show_progress + + def _load_weights_to_model(self, model: nn.Module, + checkpoint: Optional[dict], + cfg: Optional[ConfigType]) -> None: + """Loading model weights and meta information from cfg and checkpoint. + + Args: + model (nn.Module): Model to load weights and meta information. + checkpoint (dict, optional): The loaded checkpoint. + cfg (Config or ConfigDict, optional): The loaded config. + """ + + if checkpoint is not None: + _load_checkpoint_to_model(model, checkpoint) + checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmdet 3.x, all keys should be lowercase + model.dataset_meta = { + k.lower(): v + for k, v in checkpoint_meta['dataset_meta'].items() + } + elif 'CLASSES' in checkpoint_meta: + # < mmdet 3.x + classes = checkpoint_meta['CLASSES'] + model.dataset_meta = {'classes': classes} + else: + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use COCO classes by default.') + model.dataset_meta = {'classes': get_classes('coco')} + else: + warnings.warn('Checkpoint is not loaded, and the inference ' + 'result is calculated by the randomly initialized ' + 'model!') + warnings.warn('weights is None, use COCO classes by default.') + model.dataset_meta = {'classes': get_classes('coco')} + + # Priority: args.palette -> config -> checkpoint + if self.palette != 'none': + model.dataset_meta['palette'] = self.palette + else: + test_dataset_cfg = copy.deepcopy(cfg.test_dataloader.dataset) + # lazy init. We only need the metainfo. 
+ test_dataset_cfg['lazy_init'] = True + metainfo = DATASETS.build(test_dataset_cfg).metainfo + cfg_palette = metainfo.get('palette', None) + if cfg_palette is not None: + model.dataset_meta['palette'] = cfg_palette + else: + if 'palette' not in model.dataset_meta: + warnings.warn( + 'palette does not exist, random is used by default. ' + 'You can also set the palette to customize.') + model.dataset_meta['palette'] = 'random' + + def _init_pipeline(self, cfg: ConfigType) -> Compose: + """Initialize the test pipeline.""" + pipeline_cfg = cfg.test_dataloader.dataset.pipeline + + # For inference, the key of ``img_id`` is not used. + if 'meta_keys' in pipeline_cfg[-1]: + pipeline_cfg[-1]['meta_keys'] = tuple( + meta_key for meta_key in pipeline_cfg[-1]['meta_keys'] + if meta_key != 'img_id') + + load_img_idx = self._get_transform_idx( + pipeline_cfg, ('LoadImageFromFile', LoadImageFromFile)) + if load_img_idx == -1: + raise ValueError( + 'LoadImageFromFile is not found in the test pipeline') + pipeline_cfg[load_img_idx]['type'] = 'mmdet.InferencerLoader' + return Compose(pipeline_cfg) + + def _get_transform_idx(self, pipeline_cfg: ConfigType, + name: Union[str, Tuple[str, type]]) -> int: + """Returns the index of the transform in a pipeline. + + If the transform is not found, returns -1. + """ + for i, transform in enumerate(pipeline_cfg): + if transform['type'] in name: + return i + return -1 + + def _init_visualizer(self, cfg: ConfigType) -> Optional[Visualizer]: + """Initialize visualizers. + + Args: + cfg (ConfigType): Config containing the visualizer information. + + Returns: + Visualizer or None: Visualizer initialized with config. + """ + visualizer = super()._init_visualizer(cfg) + visualizer.dataset_meta = self.model.dataset_meta + return visualizer + + def _inputs_to_list(self, inputs: InputsType) -> list: + """Preprocess the inputs to a list. + + Preprocess inputs to a list according to its type: + + - list or tuple: return inputs + - str: + - Directory path: return all files in the directory + - other cases: return a list containing the string. The string + could be a path to file, a url or other types of string according + to the task. + + Args: + inputs (InputsType): Inputs for the inferencer. + + Returns: + list: List of input for the :meth:`preprocess`. + """ + if isinstance(inputs, str): + backend = get_file_backend(inputs) + if hasattr(backend, 'isdir') and isdir(inputs): + # Backends like HttpsBackend do not implement `isdir`, so only + # those backends that implement `isdir` could accept the inputs + # as a directory + filename_list = list_dir_or_file( + inputs, list_dir=False, suffix=IMG_EXTENSIONS) + inputs = [ + join_path(inputs, filename) for filename in filename_list + ] + + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + return list(inputs) + + def preprocess(self, inputs: InputsType, batch_size: int = 1, **kwargs): + """Process the inputs into a model-feedable format. + + Customize your preprocess by overriding this method. Preprocess should + return an iterable object, of which each item will be used as the + input of ``model.test_step``. + + ``BaseInferencer.preprocess`` will return an iterable chunked data, + which will be used in __call__ like this: + + .. code-block:: python + + def __call__(self, inputs, batch_size=1, **kwargs): + chunked_data = self.preprocess(inputs, batch_size, **kwargs) + for batch in chunked_data: + preds = self.forward(batch, **kwargs) + + Args: + inputs (InputsType): Inputs given by user. + batch_size (int): batch size. 
Defaults to 1. + + Yields: + Any: Data processed by the ``pipeline`` and ``collate_fn``. + """ + chunked_data = self._get_chunk_data(inputs, batch_size) + yield from map(self.collate_fn, chunked_data) + + def _get_chunk_data(self, inputs: Iterable, chunk_size: int): + """Get batch data from inputs. + + Args: + inputs (Iterable): An iterable dataset. + chunk_size (int): Equivalent to batch size. + + Yields: + list: batch data. + """ + inputs_iter = iter(inputs) + while True: + try: + chunk_data = [] + for _ in range(chunk_size): + inputs_ = next(inputs_iter) + if isinstance(inputs_, dict): + if 'img' in inputs_: + ori_inputs_ = inputs_['img'] + else: + ori_inputs_ = inputs_['img_path'] + chunk_data.append( + (ori_inputs_, + self.pipeline(copy.deepcopy(inputs_)))) + else: + chunk_data.append((inputs_, self.pipeline(inputs_))) + yield chunk_data + except StopIteration: + if chunk_data: + yield chunk_data + break + + # TODO: Video and Webcam are currently not supported and + # may consume too much memory if your input folder has a lot of images. + # We will be optimized later. + def __call__( + self, + inputs: InputsType, + batch_size: int = 1, + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + no_save_vis: bool = False, + draw_pred: bool = True, + pred_score_thr: float = 0.3, + return_datasamples: bool = False, + print_result: bool = False, + no_save_pred: bool = True, + out_dir: str = '', + # by open image task + texts: Optional[Union[str, list]] = None, + # by open panoptic task + stuff_texts: Optional[Union[str, list]] = None, + # by GLIP + custom_entities: bool = False, + **kwargs) -> dict: + """Call the inferencer. + + Args: + inputs (InputsType): Inputs for the inferencer. + batch_size (int): Inference batch size. Defaults to 1. + show (bool): Whether to display the visualization results in a + popup window. Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + no_save_vis (bool): Whether to force not to save prediction + vis results. Defaults to False. + draw_pred (bool): Whether to draw predicted bounding boxes. + Defaults to True. + pred_score_thr (float): Minimum score of bboxes to draw. + Defaults to 0.3. + return_datasamples (bool): Whether to return results as + :obj:`DetDataSample`. Defaults to False. + print_result (bool): Whether to print the inference result w/o + visualization to the console. Defaults to False. + no_save_pred (bool): Whether to force not to save prediction + results. Defaults to True. + out_dir: Dir to save the inference results or + visualization. If left as empty, no file will be saved. + Defaults to ''. + texts (str | list[str]): Text prompts. Defaults to None. + stuff_texts (str | list[str]): Stuff text prompts of open + panoptic task. Defaults to None. + custom_entities (bool): Whether to use custom entities. + Defaults to False. Only used in GLIP. + **kwargs: Other keyword arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` + and ``postprocess_kwargs``. + + Returns: + dict: Inference and visualization results. 
+ """ + ( + preprocess_kwargs, + forward_kwargs, + visualize_kwargs, + postprocess_kwargs, + ) = self._dispatch_kwargs(**kwargs) + + ori_inputs = self._inputs_to_list(inputs) + + if texts is not None and isinstance(texts, str): + texts = [texts] * len(ori_inputs) + if stuff_texts is not None and isinstance(stuff_texts, str): + stuff_texts = [stuff_texts] * len(ori_inputs) + if texts is not None: + assert len(texts) == len(ori_inputs) + for i in range(len(texts)): + if isinstance(ori_inputs[i], str): + ori_inputs[i] = { + 'text': texts[i], + 'img_path': ori_inputs[i], + 'custom_entities': custom_entities + } + else: + ori_inputs[i] = { + 'text': texts[i], + 'img': ori_inputs[i], + 'custom_entities': custom_entities + } + if stuff_texts is not None: + assert len(stuff_texts) == len(ori_inputs) + for i in range(len(stuff_texts)): + ori_inputs[i]['stuff_text'] = stuff_texts[i] + + inputs = self.preprocess( + ori_inputs, batch_size=batch_size, **preprocess_kwargs) + + results_dict = {'predictions': [], 'visualization': []} + for ori_imgs, data in (track(inputs, description='Inference') + if self.show_progress else inputs): + preds = self.forward(data, **forward_kwargs) + visualization = self.visualize( + ori_imgs, + preds, + return_vis=return_vis, + show=show, + wait_time=wait_time, + draw_pred=draw_pred, + pred_score_thr=pred_score_thr, + no_save_vis=no_save_vis, + img_out_dir=out_dir, + **visualize_kwargs) + results = self.postprocess( + preds, + visualization, + return_datasamples=return_datasamples, + print_result=print_result, + no_save_pred=no_save_pred, + pred_out_dir=out_dir, + **postprocess_kwargs) + results_dict['predictions'].extend(results['predictions']) + if results['visualization'] is not None: + results_dict['visualization'].extend(results['visualization']) + return results_dict + + def visualize(self, + inputs: InputsType, + preds: PredType, + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + draw_pred: bool = True, + pred_score_thr: float = 0.3, + no_save_vis: bool = False, + img_out_dir: str = '', + **kwargs) -> Union[List[np.ndarray], None]: + """Visualize predictions. + + Args: + inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer. + preds (List[:obj:`DetDataSample`]): Predictions of the model. + return_vis (bool): Whether to return the visualization result. + Defaults to False. + show (bool): Whether to display the image in a popup window. + Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + draw_pred (bool): Whether to draw predicted bounding boxes. + Defaults to True. + pred_score_thr (float): Minimum score of bboxes to draw. + Defaults to 0.3. + no_save_vis (bool): Whether to force not to save prediction + vis results. Defaults to False. + img_out_dir (str): Output directory of visualization results. + If left as empty, no file will be saved. Defaults to ''. + + Returns: + List[np.ndarray] or None: Returns visualization results only if + applicable. 
+        """
+        if no_save_vis is True:
+            img_out_dir = ''
+
+        if not show and img_out_dir == '' and not return_vis:
+            return None
+
+        if self.visualizer is None:
+            raise ValueError('Visualization needs the "visualizer" term '
+                             'defined in the config, but got None.')
+
+        results = []
+
+        for single_input, pred in zip(inputs, preds):
+            if isinstance(single_input, str):
+                img_bytes = mmengine.fileio.get(single_input)
+                img = mmcv.imfrombytes(img_bytes)
+                img = img[:, :, ::-1]
+                img_name = osp.basename(single_input)
+            elif isinstance(single_input, np.ndarray):
+                img = single_input.copy()
+                img_num = str(self.num_visualized_imgs).zfill(8)
+                img_name = f'{img_num}.jpg'
+            else:
+                raise ValueError('Unsupported input type: '
+                                 f'{type(single_input)}')
+
+            out_file = osp.join(img_out_dir, 'vis',
+                                img_name) if img_out_dir != '' else None
+
+            self.visualizer.add_datasample(
+                img_name,
+                img,
+                pred,
+                show=show,
+                wait_time=wait_time,
+                draw_gt=False,
+                draw_pred=draw_pred,
+                pred_score_thr=pred_score_thr,
+                out_file=out_file,
+            )
+            results.append(self.visualizer.get_image())
+            self.num_visualized_imgs += 1
+
+        return results
+
+    def postprocess(
+        self,
+        preds: PredType,
+        visualization: Optional[List[np.ndarray]] = None,
+        return_datasamples: bool = False,
+        print_result: bool = False,
+        no_save_pred: bool = False,
+        pred_out_dir: str = '',
+        **kwargs,
+    ) -> Dict:
+        """Process the predictions and visualization results from ``forward``
+        and ``visualize``.
+
+        This method should be responsible for the following tasks:
+
+        1. Convert datasamples into a json-serializable dict if needed.
+        2. Pack the predictions and visualization results and return them.
+        3. Dump or log the predictions.
+
+        Args:
+            preds (List[:obj:`DetDataSample`]): Predictions of the model.
+            visualization (Optional[List[np.ndarray]]): Visualized
+                predictions.
+            return_datasamples (bool): Whether to use DataSample to store
+                inference results. If False, dict will be used.
+            print_result (bool): Whether to print the inference result w/o
+                visualization to the console. Defaults to False.
+            no_save_pred (bool): Whether to force not to save prediction
+                results. Defaults to False.
+            pred_out_dir: Dir to save the inference results w/o
+                visualization. If left as empty, no file will be saved.
+                Defaults to ''.
+
+        Returns:
+            dict: Inference and visualization results with key ``predictions``
+            and ``visualization``.
+
+            - ``visualization`` (Any): Returned by :meth:`visualize`.
+            - ``predictions`` (dict or DataSample): Returned by
+              :meth:`forward` and processed in :meth:`postprocess`.
+              If ``return_datasamples=False``, it usually should be a
+              json-serializable dict containing only basic data elements such
+              as strings and numbers.
+        """
+        if no_save_pred is True:
+            pred_out_dir = ''
+
+        result_dict = {}
+        results = preds
+        if not return_datasamples:
+            results = []
+            for pred in preds:
+                result = self.pred2dict(pred, pred_out_dir)
+                results.append(result)
+        elif pred_out_dir != '':
+            warnings.warn('Currently does not support saving datasample '
+                          'when return_datasamples is set to True. '
+                          'Prediction results are not saved!')
+        # Add img to the results after printing and dumping
+        result_dict['predictions'] = results
+        if print_result:
+            print(result_dict)
+        result_dict['visualization'] = visualization
+        return result_dict
+
+    # TODO: The data format and fields saved in json need further discussion.
+    # Maybe should include model name, timestamp, filename, image info etc.
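For orientation, a minimal usage sketch of the high-level inferencer whose __call__, visualize and postprocess methods are defined above (presumably mmdet's DetInferencer). The model alias, image path and output directory are illustrative placeholders, not values taken from this repository.

from mmdet.apis import DetInferencer

# 'rtmdet_tiny_8xb32-300e_coco' is one of mmdet's metafile aliases; any
# (config, weights) pair accepted by DetInferencer would work as well.
inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco', device='cpu')
results = inferencer(
    'demo/demo.jpg',       # str, ndarray, a directory, or a list of these
    out_dir='outputs/',    # predictions -> outputs/preds, drawings -> outputs/vis
    no_save_pred=False,    # override the default of not saving predictions
    pred_score_thr=0.3)
print(results['predictions'][0]['labels'][:5])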
+ def pred2dict(self, + data_sample: DetDataSample, + pred_out_dir: str = '') -> Dict: + """Extract elements necessary to represent a prediction into a + dictionary. + + It's better to contain only basic data elements such as strings and + numbers in order to guarantee it's json-serializable. + + Args: + data_sample (:obj:`DetDataSample`): Predictions of the model. + pred_out_dir: Dir to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Prediction results. + """ + is_save_pred = True + if pred_out_dir == '': + is_save_pred = False + + if is_save_pred and 'img_path' in data_sample: + img_path = osp.basename(data_sample.img_path) + img_path = osp.splitext(img_path)[0] + out_img_path = osp.join(pred_out_dir, 'preds', + img_path + '_panoptic_seg.png') + out_json_path = osp.join(pred_out_dir, 'preds', img_path + '.json') + elif is_save_pred: + out_img_path = osp.join( + pred_out_dir, 'preds', + f'{self.num_predicted_imgs}_panoptic_seg.png') + out_json_path = osp.join(pred_out_dir, 'preds', + f'{self.num_predicted_imgs}.json') + self.num_predicted_imgs += 1 + + result = {} + if 'pred_instances' in data_sample: + masks = data_sample.pred_instances.get('masks') + pred_instances = data_sample.pred_instances.numpy() + result = { + 'labels': pred_instances.labels.tolist(), + 'scores': pred_instances.scores.tolist() + } + if 'bboxes' in pred_instances: + result['bboxes'] = pred_instances.bboxes.tolist() + if masks is not None: + if 'bboxes' not in pred_instances or pred_instances.bboxes.sum( + ) == 0: + # Fake bbox, such as the SOLO. + bboxes = mask2bbox(masks.cpu()).numpy().tolist() + result['bboxes'] = bboxes + encode_masks = encode_mask_results(pred_instances.masks) + for encode_mask in encode_masks: + if isinstance(encode_mask['counts'], bytes): + encode_mask['counts'] = encode_mask['counts'].decode() + result['masks'] = encode_masks + + if 'pred_panoptic_seg' in data_sample: + if VOID is None: + raise RuntimeError( + 'panopticapi is not installed, please install it by: ' + 'pip install git+https://github.com/cocodataset/' + 'panopticapi.git.') + + pan = data_sample.pred_panoptic_seg.sem_seg.cpu().numpy()[0] + pan[pan % INSTANCE_OFFSET == len( + self.model.dataset_meta['classes'])] = VOID + pan = id2rgb(pan).astype(np.uint8) + + if is_save_pred: + mmcv.imwrite(pan[:, :, ::-1], out_img_path) + result['panoptic_seg_path'] = out_img_path + else: + result['panoptic_seg'] = pan + + if is_save_pred: + mmengine.dump(result, out_json_path) + + return result diff --git a/mmdetection/mmdet/apis/inference.py b/mmdetection/mmdet/apis/inference.py new file mode 100644 index 0000000..7e6f914 --- /dev/null +++ b/mmdetection/mmdet/apis/inference.py @@ -0,0 +1,372 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
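Before the generic inference helpers that follow, a hedged sketch of reading back the JSON that pred2dict writes above. The file path is a placeholder and pycocotools is assumed to be installed for decoding the RLE-encoded masks.

import json

from pycocotools import mask as mask_utils

# Placeholder path for a file written by pred2dict via mmengine.dump().
with open('outputs/preds/demo.json') as f:
    pred = json.load(f)

print(pred['labels'][:3], pred['scores'][:3])
if 'masks' in pred:
    rle = dict(pred['masks'][0])
    rle['counts'] = rle['counts'].encode()  # counts were decoded to str on save
    binary_mask = mask_utils.decode(rle)    # H x W uint8 array
    print(binary_mask.shape)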
+import copy +import warnings +from pathlib import Path +from typing import Optional, Sequence, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.ops import RoIPool +from mmcv.transforms import Compose +from mmengine.config import Config +from mmengine.dataset import default_collate +from mmengine.model.utils import revert_sync_batchnorm +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint + +from mmdet.registry import DATASETS +from mmdet.utils import ConfigType +from ..evaluation import get_classes +from ..registry import MODELS +from ..structures import DetDataSample, SampleList +from ..utils import get_test_pipeline_cfg + + +def init_detector( + config: Union[str, Path, Config], + checkpoint: Optional[str] = None, + palette: str = 'none', + device: str = 'cuda:0', + cfg_options: Optional[dict] = None, +) -> nn.Module: + """Initialize a detector from config file. + + Args: + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + palette (str): Color palette used for visualization. If palette + is stored in checkpoint, use checkpoint's palette first, otherwise + use externally passed palette. Currently, supports 'coco', 'voc', + 'citys' and 'random'. Defaults to none. + device (str): The device where the anchors will be put on. + Defaults to cuda:0. + cfg_options (dict, optional): Options to override some settings in + the used config. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if cfg_options is not None: + config.merge_from_dict(cfg_options) + elif 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + scope = config.get('default_scope', 'mmdet') + if scope is not None: + init_default_scope(config.get('default_scope', 'mmdet')) + + model = MODELS.build(config.model) + model = revert_sync_batchnorm(model) + if checkpoint is None: + warnings.simplefilter('once') + warnings.warn('checkpoint is None, use COCO classes by default.') + model.dataset_meta = {'classes': get_classes('coco')} + else: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + # Weights converted from elsewhere may not have meta fields. + checkpoint_meta = checkpoint.get('meta', {}) + + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmdet 3.x, all keys should be lowercase + model.dataset_meta = { + k.lower(): v + for k, v in checkpoint_meta['dataset_meta'].items() + } + elif 'CLASSES' in checkpoint_meta: + # < mmdet 3.x + classes = checkpoint_meta['CLASSES'] + model.dataset_meta = {'classes': classes} + else: + warnings.simplefilter('once') + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use COCO classes by default.') + model.dataset_meta = {'classes': get_classes('coco')} + + # Priority: args.palette -> config -> checkpoint + if palette != 'none': + model.dataset_meta['palette'] = palette + else: + test_dataset_cfg = copy.deepcopy(config.test_dataloader.dataset) + # lazy init. We only need the metainfo. 
+ test_dataset_cfg['lazy_init'] = True + metainfo = DATASETS.build(test_dataset_cfg).metainfo + cfg_palette = metainfo.get('palette', None) + if cfg_palette is not None: + model.dataset_meta['palette'] = cfg_palette + else: + if 'palette' not in model.dataset_meta: + warnings.warn( + 'palette does not exist, random is used by default. ' + 'You can also set the palette to customize.') + model.dataset_meta['palette'] = 'random' + + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model + + +ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]] + + +def inference_detector( + model: nn.Module, + imgs: ImagesType, + test_pipeline: Optional[Compose] = None, + text_prompt: Optional[str] = None, + custom_entities: bool = False, +) -> Union[DetDataSample, SampleList]: + """Inference image(s) with the detector. + + Args: + model (nn.Module): The loaded detector. + imgs (str, ndarray, Sequence[str/ndarray]): + Either image files or loaded images. + test_pipeline (:obj:`Compose`): Test pipeline. + + Returns: + :obj:`DetDataSample` or list[:obj:`DetDataSample`]: + If imgs is a list or tuple, the same length list type results + will be returned, otherwise return the detection results directly. + """ + + if isinstance(imgs, (list, tuple)): + is_batch = True + else: + imgs = [imgs] + is_batch = False + + cfg = model.cfg + + if test_pipeline is None: + cfg = cfg.copy() + test_pipeline = get_test_pipeline_cfg(cfg) + if isinstance(imgs[0], np.ndarray): + # Calling this method across libraries will result + # in module unregistered error if not prefixed with mmdet. + test_pipeline[0].type = 'mmdet.LoadImageFromNDArray' + + test_pipeline = Compose(test_pipeline) + + if model.data_preprocessor.device.type == 'cpu': + for m in model.modules(): + assert not isinstance( + m, RoIPool + ), 'CPU inference with RoIPool is not supported currently.' + + result_list = [] + for i, img in enumerate(imgs): + # prepare data + if isinstance(img, np.ndarray): + # TODO: remove img_id. + data_ = dict(img=img, img_id=0) + else: + # TODO: remove img_id. + data_ = dict(img_path=img, img_id=0) + + if text_prompt: + data_['text'] = text_prompt + data_['custom_entities'] = custom_entities + + # build the data pipeline + data_ = test_pipeline(data_) + + data_['inputs'] = [data_['inputs']] + data_['data_samples'] = [data_['data_samples']] + + # forward the model + with torch.no_grad(): + results = model.test_step(data_)[0] + + result_list.append(results) + + if not is_batch: + return result_list[0] + else: + return result_list + + +# TODO: Awaiting refactoring +async def async_inference_detector(model, imgs): + """Async inference image(s) with the detector. + + Args: + model (nn.Module): The loaded detector. + img (str | ndarray): Either image files or loaded images. + + Returns: + Awaitable detection results. 
+    """
+    if not isinstance(imgs, (list, tuple)):
+        imgs = [imgs]
+
+    cfg = model.cfg
+
+    if isinstance(imgs[0], np.ndarray):
+        cfg = cfg.copy()
+        # set loading pipeline type
+        cfg.data.test.pipeline[0].type = 'LoadImageFromNDArray'
+
+    # cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
+    test_pipeline = Compose(cfg.data.test.pipeline)
+
+    datas = []
+    for img in imgs:
+        # prepare data
+        if isinstance(img, np.ndarray):
+            # directly add img
+            data = dict(img=img)
+        else:
+            # add information into dict
+            data = dict(img_info=dict(filename=img), img_prefix=None)
+        # build the data pipeline
+        data = test_pipeline(data)
+        datas.append(data)
+
+    for m in model.modules():
+        assert not isinstance(
+            m,
+            RoIPool), 'CPU inference with RoIPool is not supported currently.'
+
+    # We don't restore `torch.is_grad_enabled()` value during concurrent
+    # inference since execution can overlap
+    torch.set_grad_enabled(False)
+    # Pass the collected batch, not only the last sample; this legacy API is
+    # still awaiting refactoring (see the TODO above).
+    results = await model.aforward_test(datas, rescale=True)
+    return results
+
+
+def build_test_pipeline(cfg: ConfigType) -> ConfigType:
+    """Build test_pipeline for mot/vis demo. In MOT/VIS inference, the
+    original test_pipeline should drop "LoadImageFromFile" and
+    "LoadTrackAnnotations".
+
+    Args:
+        cfg (ConfigDict): The loaded config.
+    Returns:
+        ConfigType: new test_pipeline
+    """
+    # remove the "LoadImageFromFile" and "LoadTrackAnnotations" in pipeline
+    transform_broadcaster = cfg.test_dataloader.dataset.pipeline[0].copy()
+    for transform in transform_broadcaster['transforms']:
+        if transform['type'] == 'Resize':
+            transform_broadcaster['transforms'] = transform
+    pack_track_inputs = cfg.test_dataloader.dataset.pipeline[-1].copy()
+    test_pipeline = Compose([transform_broadcaster, pack_track_inputs])
+
+    return test_pipeline
+
+
+def inference_mot(model: nn.Module, img: np.ndarray, frame_id: int,
+                  video_len: int) -> SampleList:
+    """Inference a single image with the MOT model.
+
+    Args:
+        model (nn.Module): The loaded mot model.
+        img (np.ndarray): Loaded image.
+        frame_id (int): Frame id.
+        video_len (int): Demo video length.
+    Returns:
+        SampleList: The tracking data samples.
+    """
+    cfg = model.cfg
+    data = dict(
+        img=[img.astype(np.float32)],
+        frame_id=[frame_id],
+        ori_shape=[img.shape[:2]],
+        img_id=[frame_id + 1],
+        ori_video_length=[video_len])
+
+    test_pipeline = build_test_pipeline(cfg)
+    data = test_pipeline(data)
+
+    if not next(model.parameters()).is_cuda:
+        for m in model.modules():
+            assert not isinstance(
+                m, RoIPool
+            ), 'CPU inference with RoIPool is not supported currently.'
+
+    # forward the model
+    with torch.no_grad():
+        data = default_collate([data])
+        result = model.test_step(data)[0]
+    return result
+
+
+def init_track_model(config: Union[str, Config],
+                     checkpoint: Optional[str] = None,
+                     detector: Optional[str] = None,
+                     reid: Optional[str] = None,
+                     device: str = 'cuda:0',
+                     cfg_options: Optional[dict] = None) -> nn.Module:
+    """Initialize a model from config file.
+
+    Args:
+        config (str or :obj:`mmengine.Config`): Config file path or the config
+            object.
+        checkpoint (Optional[str], optional): Checkpoint path. Defaults to
+            None.
+        detector (Optional[str], optional): Detector checkpoint path, used in
+            some tracking algorithms like SORT. Defaults to None.
+        reid (Optional[str], optional): ReID checkpoint path, used in
+            some tracking algorithms like SORT. Defaults to None.
+        device (str, optional): The device that the model inferences on.
+            Defaults to `cuda:0`.
+ cfg_options (Optional[dict], optional): Options to override some + settings in the used config. Defaults to None. + + Returns: + nn.Module: The constructed model. + """ + if isinstance(config, str): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if cfg_options is not None: + config.merge_from_dict(cfg_options) + + model = MODELS.build(config.model) + + if checkpoint is not None: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + # Weights converted from elsewhere may not have meta fields. + checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + if 'CLASSES' in checkpoint_meta['dataset_meta']: + value = checkpoint_meta['dataset_meta'].pop('CLASSES') + checkpoint_meta['dataset_meta']['classes'] = value + model.dataset_meta = checkpoint_meta['dataset_meta'] + + if detector is not None: + assert not (checkpoint and detector), \ + 'Error: checkpoint and detector checkpoint cannot both exist' + load_checkpoint(model.detector, detector, map_location='cpu') + + if reid is not None: + assert not (checkpoint and reid), \ + 'Error: checkpoint and reid checkpoint cannot both exist' + load_checkpoint(model.reid, reid, map_location='cpu') + + # Some methods don't load checkpoints or checkpoints don't contain + # 'dataset_meta' + # VIS need dataset_meta, MOT don't need dataset_meta + if not hasattr(model, 'dataset_meta'): + warnings.warn('dataset_meta or class names are missed, ' + 'use None by default.') + model.dataset_meta = {'classes': None} + + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model diff --git a/mmdetection/mmdet/configs/_base_/datasets/coco_detection.py b/mmdetection/mmdet/configs/_base_/datasets/coco_detection.py new file mode 100644 index 0000000..45041f6 --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/datasets/coco_detection.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
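Before the COCO detection dataset config below, a minimal sketch of the init_detector / inference_detector pair defined above. The config and checkpoint paths are placeholders; substitute any detection config and matching weights.

from mmdet.apis import inference_detector, init_detector

config_file = 'configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py'  # placeholder
checkpoint_file = 'checkpoints/faster_rcnn_r50_fpn_1x_coco.pth'     # placeholder

model = init_detector(config_file, checkpoint_file, device='cpu')
result = inference_detector(model, 'demo/demo.jpg')  # a single DetDataSample
print(result.pred_instances.bboxes.shape, result.pred_instances.scores[:5])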
+from mmcv.transforms import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs, + RandomFlip, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. +# test_dataloader = dict( +# batch_size=1, +# num_workers=2, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type=CocoMetric, +# metric='bbox', +# format_only=True, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_detection/test') diff --git a/mmdetection/mmdet/configs/_base_/datasets/coco_instance.py b/mmdetection/mmdet/configs/_base_/datasets/coco_instance.py new file mode 100644 index 0000000..b957543 --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
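Because the coco_detection.py base above stores classes rather than registry strings, the same dicts can be instantiated directly for a quick smoke test before the instance-segmentation variant that follows. This sketch assumes COCO is unpacked under data/coco/ exactly as in the config.

from mmcv.transforms import LoadImageFromFile
from mmdet.datasets import CocoDataset
from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs,
                                       RandomFlip, Resize)

pipeline = [
    dict(type=LoadImageFromFile),
    dict(type=LoadAnnotations, with_bbox=True),
    dict(type=Resize, scale=(1333, 800), keep_ratio=True),
    dict(type=RandomFlip, prob=0.5),
    dict(type=PackDetInputs),
]
dataset = CocoDataset(
    data_root='data/coco/',
    ann_file='annotations/instances_train2017.json',
    data_prefix=dict(img='train2017/'),
    filter_cfg=dict(filter_empty_gt=True, min_size=32),
    pipeline=pipeline)
print(len(dataset), dataset.metainfo['classes'][:5])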
+from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco import CocoDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_metric import CocoMetric + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. 
+# test_dataloader = dict( +# batch_size=1, +# num_workers=2, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict( +# type=CocoDataset, +# data_root=data_root, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type=CocoMetric, +# metric=['bbox', 'segm'], +# format_only=True, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_instance/test') diff --git a/mmdetection/mmdet/configs/_base_/datasets/coco_instance_semantic.py b/mmdetection/mmdet/configs/_base_/datasets/coco_instance_semantic.py new file mode 100644 index 0000000..7cf5b2c --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/datasets/coco_instance_semantic.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco import CocoDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_metric import CocoMetric + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True, with_seg=True), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type=LoadAnnotations, with_bbox=True, with_mask=True, with_seg=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/', seg='stuffthingmaps/train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) + 
+test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/mmdetection/mmdet/configs/_base_/datasets/coco_panoptic.py b/mmdetection/mmdet/configs/_base_/datasets/coco_panoptic.py new file mode 100644 index 0000000..29d655f --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/datasets/coco_panoptic.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco_panoptic import CocoPanopticDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadPanopticAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_panoptic_metric import CocoPanopticMetric + +# dataset settings +dataset_type = 'CocoPanopticDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadPanopticAnnotations, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadPanopticAnnotations, backend_args=backend_args), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoPanopticDataset, + data_root=data_root, + ann_file='annotations/panoptic_train2017.json', + data_prefix=dict( + img='train2017/', seg='annotations/panoptic_train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoPanopticDataset, + data_root=data_root, + ann_file='annotations/panoptic_val2017.json', + data_prefix=dict(img='val2017/', seg='annotations/panoptic_val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoPanopticMetric, + ann_file=data_root + 'annotations/panoptic_val2017.json', + seg_prefix=data_root + 'annotations/panoptic_val2017/', + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. 
+# test_dataloader = dict( +# batch_size=1, +# num_workers=1, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict( +# type=CocoPanopticDataset, +# data_root=data_root, +# ann_file='annotations/panoptic_image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type=CocoPanopticMetric, +# format_only=True, +# ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_panoptic/test') diff --git a/mmdetection/mmdet/configs/_base_/datasets/mot_challenge.py b/mmdetection/mmdet/configs/_base_/datasets/mot_challenge.py new file mode 100644 index 0000000..a71520a --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/datasets/mot_challenge.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms import (LoadImageFromFile, RandomResize, + TransformBroadcaster) + +from mmdet.datasets import MOTChallengeDataset +from mmdet.datasets.samplers import TrackImgSampler +from mmdet.datasets.transforms import (LoadTrackAnnotations, PackTrackInputs, + PhotoMetricDistortion, RandomCrop, + RandomFlip, Resize, + UniformRefFrameSample) +from mmdet.evaluation import MOTChallengeMetric + +# dataset settings +dataset_type = MOTChallengeDataset +data_root = 'data/MOT17/' +img_scale = (1088, 1088) + +backend_args = None +# data pipeline +train_pipeline = [ + dict( + type=UniformRefFrameSample, + num_ref_imgs=1, + frame_range=10, + filter_key_img=True), + dict( + type=TransformBroadcaster, + share_random_params=True, + transforms=[ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadTrackAnnotations), + dict( + type=RandomResize, + scale=img_scale, + ratio_range=(0.8, 1.2), + keep_ratio=True, + clip_object_border=False), + dict(type=PhotoMetricDistortion) + ]), + dict( + type=TransformBroadcaster, + # different cropped positions for different frames + share_random_params=False, + transforms=[ + dict(type=RandomCrop, crop_size=img_scale, bbox_clip_border=False) + ]), + dict( + type=TransformBroadcaster, + share_random_params=True, + transforms=[ + dict(type=RandomFlip, prob=0.5), + ]), + dict(type=PackTrackInputs) +] + +test_pipeline = [ + dict( + type=TransformBroadcaster, + transforms=[ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=img_scale, keep_ratio=True), + dict(type=LoadTrackAnnotations) + ]), + dict(type=PackTrackInputs) +] + +# dataloader +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=TrackImgSampler), # image-based sampling + dataset=dict( + type=dataset_type, + data_root=data_root, + visibility_thr=-1, + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img_path='train'), + metainfo=dict(classes=('pedestrian', )), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + # Now we support two ways to test, image_based and video_based + # if you want to use video_based sampling, you can use as follows + # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + sampler=dict(type=TrackImgSampler), # image-based sampling + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = 
dict( + type=MOTChallengeMetric, metric=['HOTA', 'CLEAR', 'Identity']) +test_evaluator = val_evaluator diff --git a/mmdetection/mmdet/configs/_base_/default_runtime.py b/mmdetection/mmdet/configs/_base_/default_runtime.py new file mode 100644 index 0000000..ff96dbf --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/default_runtime.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.runner import LogProcessor +from mmengine.visualization import LocalVisBackend + +from mmdet.engine.hooks import DetVisualizationHook +from mmdet.visualization import DetLocalVisualizer + +default_scope = None + +default_hooks = dict( + timer=dict(type=IterTimerHook), + logger=dict(type=LoggerHook, interval=50), + param_scheduler=dict(type=ParamSchedulerHook), + checkpoint=dict(type=CheckpointHook, interval=1), + sampler_seed=dict(type=DistSamplerSeedHook), + visualization=dict(type=DetVisualizationHook)) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type=LocalVisBackend)] +visualizer = dict( + type=DetLocalVisualizer, vis_backends=vis_backends, name='visualizer') +log_processor = dict(type=LogProcessor, window_size=50, by_epoch=True) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/mmdetection/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/mmdetection/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 0000000..b9132ac --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
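Before the Cascade Mask R-CNN base model below, a brief sketch of how these new-style _base_ modules (the dataset and runtime files above, plus the model files that follow) are typically composed into a runnable config via mmengine's read_base. The child file name and the schedule import are illustrative and not part of this patch.

# Hypothetical child config, e.g. cascade_mask_rcnn_r50_fpn_1x_coco.py
from mmengine.config import read_base

with read_base():
    from .._base_.datasets.coco_instance import *            # noqa: F401,F403
    from .._base_.default_runtime import *                    # noqa: F401,F403
    from .._base_.models.cascade_mask_rcnn_r50_fpn import *   # noqa: F401,F403
    # a schedule base (e.g. schedule_1x) would normally be imported here too

# After the imports, overrides are plain Python assignments, for example:
# train_dataloader.update(batch_size=4)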
+from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.cascade_rcnn import CascadeRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import SmoothL1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.cascade_roi_head import CascadeRoIHead +from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=CascadeRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type=CascadeRoIHead, + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + 
reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type=FCNMaskHead, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type=CrossEntropyLoss, use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/mmdetection/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py b/mmdetection/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py new file mode 100644 index 0000000..8e6654f --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
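Before the box-only Cascade R-CNN variant below, a small illustration of the stage-wise target_stds used by the cascade bbox heads above: later stages divide the same geometric offset by smaller stds, producing sharper regression targets. The toy boxes are made up.

import torch

from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \
    DeltaXYWHBBoxCoder

rois = torch.tensor([[0., 0., 10., 10.]])
gts = torch.tensor([[1., 1., 11., 11.]])
for stds in ([0.1, 0.1, 0.2, 0.2],
             [0.05, 0.05, 0.1, 0.1],
             [0.033, 0.033, 0.067, 0.067]):
    coder = DeltaXYWHBBoxCoder(target_means=[0., 0., 0., 0.], target_stds=stds)
    # The same one-pixel shift yields larger normalized deltas at later stages.
    print(stds, coder.encode(rois, gts))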
+from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.cascade_rcnn import CascadeRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import SmoothL1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.cascade_roi_head import CascadeRoIHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=CascadeRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type=CascadeRoIHead, + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, 
loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100))) diff --git a/mmdetection/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py b/mmdetection/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py new file mode 100644 index 0000000..7e18de2 --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
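Before the Faster R-CNN base model below, a short illustration of the RPN anchor settings shared by these configs (one scale, three aspect ratios, five FPN strides), exercised directly through mmdet's AnchorGenerator. The tiny feature-map sizes are invented for the demo.

from mmdet.models.task_modules.prior_generators.anchor_generator import \
    AnchorGenerator

gen = AnchorGenerator(
    scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64])
featmap_sizes = [(2, 2)] * 5  # one small feature map per FPN level
anchors = gen.grid_priors(featmap_sizes, device='cpu')
# 1 scale x 3 ratios = 3 anchors per location -> 12 anchors per 2x2 level.
print([a.shape for a in anchors])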
+from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.faster_rcnn import FasterRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=FasterRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + roi_head=dict( + type=StandardRoIHead, + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + 
rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) diff --git a/mmdetection/mmdet/configs/_base_/models/mask_rcnn_r50_caffe_c4.py b/mmdetection/mmdet/configs/_base_/models/mask_rcnn_r50_caffe_c4.py new file mode 100644 index 0000000..3054818 --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/models/mask_rcnn_r50_caffe_c4.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.ops import RoIAlign, nms +from mmengine.model.weight_init import PretrainedInit +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.mask_rcnn import MaskRCNN +from mmdet.models.layers import ResLayer +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead +from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +norm_cfg = dict(type=BatchNorm2d, requires_grad=False) +# model settings +model = dict( + type=MaskRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + type=RPNHead, + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type=AnchorGenerator, + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + roi_head=dict( + type=StandardRoIHead, + shared_head=dict( + type=ResLayer, + depth=50, + stage=3, + stride=2, + dilation=1, + style='caffe', + norm_cfg=norm_cfg, + norm_eval=True), + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=14, sampling_ratio=0), + out_channels=1024, + featmap_strides=[16]), + bbox_head=dict( + type=BBoxHead, + with_avg_pool=True, + roi_feat_size=7, + in_channels=2048, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + 
type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + mask_roi_extractor=None, + mask_head=dict( + type=FCNMaskHead, + num_convs=0, + in_channels=2048, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type=CrossEntropyLoss, use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=14, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=6000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/mmdetection/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py b/mmdetection/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 0000000..c8a0b03 --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,154 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.ops import RoIAlign, nms +from mmengine.model.weight_init import PretrainedInit +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.mask_rcnn import MaskRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=MaskRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + 
rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + roi_head=dict( + type=StandardRoIHead, + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + mask_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type=FCNMaskHead, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type=CrossEntropyLoss, use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/mmdetection/mmdet/configs/_base_/models/retinanet_r50_fpn.py b/mmdetection/mmdet/configs/_base_/models/retinanet_r50_fpn.py new file mode 100644 index 0000000..33e5cc4 --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/models/retinanet_r50_fpn.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
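The Mask R-CNN bases above (the C4 and FPN variants) are written in mmengine's pure-Python config style and are meant to be composed rather than edited in place. Below is a minimal sketch of a hypothetical downstream config that inherits the FPN base and retargets it to a 10-class dataset; the file name, the class count and the override are illustrative assumptions, and the update pattern mirrors the model.update(...) overrides used by the DETR configs later in this patch. The sketch is a config module intended to be parsed with mmengine's Config.fromfile from a sibling directory of these _base_ files, not executed directly.

# my_mask_rcnn_r50_fpn_1x_custom.py (hypothetical, not part of this patch)
from mmengine.config import read_base

with read_base():
    from .._base_.datasets.coco_instance import *
    from .._base_.default_runtime import *
    from .._base_.models.mask_rcnn_r50_fpn import *
    from .._base_.schedules.schedule_1x import *

# Override only what differs from the base: both heads predict 10 classes
# (assumed custom dataset size); everything else is inherited unchanged.
model.update(
    dict(
        roi_head=dict(
            bbox_head=dict(num_classes=10),
            mask_head=dict(num_classes=10))))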
+from mmcv.ops import nms +from torch.nn import BatchNorm2d + +from mmdet.models import (FPN, DetDataPreprocessor, FocalLoss, L1Loss, ResNet, + RetinaHead, RetinaNet) +from mmdet.models.task_modules import (AnchorGenerator, DeltaXYWHBBoxCoder, + MaxIoUAssigner, PseudoSampler) + +# model settings +model = dict( + type=RetinaNet, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + bbox_head=dict( + type=RetinaHead, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=FocalLoss, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + sampler=dict( + type=PseudoSampler), # Focal loss should use PseudoSampler + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100)) diff --git a/mmdetection/mmdet/configs/_base_/schedules/schedule_1x.py b/mmdetection/mmdet/configs/_base_/schedules/schedule_1x.py new file mode 100644 index 0000000..47d1fa6 --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/schedules/schedule_1x.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim.sgd import SGD + +# training schedule for 1x +train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=12, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=12, + by_epoch=True, + milestones=[8, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/mmdet/configs/_base_/schedules/schedule_2x.py b/mmdetection/mmdet/configs/_base_/schedules/schedule_2x.py new file mode 100644 index 0000000..51ba09a --- /dev/null +++ b/mmdetection/mmdet/configs/_base_/schedules/schedule_2x.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
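The 1x schedule above amounts to a 500-iteration linear warmup from 0.1% of the base learning rate, followed by 10x decays after epochs 8 and 11 of 12. Below is a rough, self-contained sanity check of those numbers; the ~7,300 iterations per COCO epoch is an assumed figure for the 16-image base batch, not part of the config.

def lr_at(epoch, iter_in_epoch, iters_per_epoch=7300, base_lr=0.02):
    """Approximate LR under LinearLR(start_factor=0.001, end=500)
    followed by MultiStepLR(milestones=[8, 11], gamma=0.1)."""
    global_iter = epoch * iters_per_epoch + iter_in_epoch
    warmup = 0.001 + (1 - 0.001) * min(global_iter, 500) / 500
    decay = 0.1 ** sum(epoch >= m for m in (8, 11))
    return base_lr * warmup * decay

print(lr_at(0, 0))    # ~2e-05, start of warmup
print(lr_at(1, 0))    # ~0.02, warmup finished
print(lr_at(8, 0))    # ~0.002, after the first decay
print(lr_at(11, 0))   # ~0.0002, after the second decay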
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim.sgd import SGD + +# training schedule for 1x +train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=24, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py b/mmdetection/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..a81c25a --- /dev/null +++ b/mmdetection/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.cascade_mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdetection/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py b/mmdetection/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..883f09b --- /dev/null +++ b/mmdetection/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + from .._base_.models.cascade_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdetection/mmdet/configs/common/lsj_100e_coco_detection.py b/mmdetection/mmdet/configs/common/lsj_100e_coco_detection.py new file mode 100644 index 0000000..ea2d6ba --- /dev/null +++ b/mmdetection/mmdet/configs/common/lsj_100e_coco_detection.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
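The cascade configs above are complete, trainable configs once their read_base() imports are resolved. Below is a minimal launch sketch, assuming the repository layout used in this patch and a placeholder work_dir; it mirrors what mmdetection's tools/train.py does internally when no custom runner type is set.

from mmengine.config import Config
from mmengine.runner import Runner

# mmengine >= 0.8.0 resolves the read_base() imports while parsing the file.
cfg = Config.fromfile(
    'mmdetection/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py')
cfg.work_dir = './work_dirs/cascade_mask_rcnn_r50_fpn_1x_coco'  # placeholder

runner = Runner.from_cfg(cfg)  # builds model, dataloaders and loops from the dict
runner.train()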
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmengine.dataset.sampler import DefaultSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import CocoDataset, RepeatDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +image_size = (1024, 1024) + +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=RandomResize, + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +# Use RepeatDataset to speed up training +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=RepeatDataset, + times=4, # simply change this from 2 to 16 for 50e - 400e training. 
+ dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +max_epochs = 25 + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=5) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# optimizer assumes bs=64 +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=0.00004)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.067, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks.update(dict(checkpoint=dict(max_keep_ckpts=2))) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/mmdet/configs/common/lsj_100e_coco_instance.py b/mmdetection/mmdet/configs/common/lsj_100e_coco_instance.py new file mode 100644 index 0000000..90104ee --- /dev/null +++ b/mmdetection/mmdet/configs/common/lsj_100e_coco_instance.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
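The "100e" in the file name above comes from the RepeatDataset wrapper rather than from train_cfg: 25 epochs over a dataset repeated 4 times is 100 passes over COCO, and the inline comment's "change this from 2 to 16 for 50e - 400e training" follows the same rule. A purely illustrative check:

max_epochs = 25  # from train_cfg in this config
for times in (2, 4, 8, 16):
    # RepeatDataset makes one training epoch cover the annotations `times`
    # times, so the effective number of passes over COCO is times * max_epochs.
    print(f'times={times:2d} -> {times * max_epochs}e')
# times= 2 -> 50e
# times= 4 -> 100e
# times= 8 -> 200e
# times=16 -> 400e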
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmengine.dataset.sampler import DefaultSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import CocoDataset, RepeatDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +image_size = (1024, 1024) + +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=RandomResize, + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +# Use RepeatDataset to speed up training +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=RepeatDataset, + times=4, # simply change this from 2 to 16 for 50e - 400e training. 
+ dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +max_epochs = 25 + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=5) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# optimizer assumes bs=64 +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=0.00004)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.067, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks.update(dict(checkpoint=dict(max_keep_ckpts=2))) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/mmdet/configs/common/lsj_200e_coco_detection.py b/mmdetection/mmdet/configs/common/lsj_200e_coco_detection.py new file mode 100644 index 0000000..5759499 --- /dev/null +++ b/mmdetection/mmdet/configs/common/lsj_200e_coco_detection.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .lsj_100e_coco_detection import * + +# 8x25=200e +train_dataloader.update(dict(dataset=dict(times=8))) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.067, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] diff --git a/mmdetection/mmdet/configs/common/lsj_200e_coco_instance.py b/mmdetection/mmdet/configs/common/lsj_200e_coco_instance.py new file mode 100644 index 0000000..77c5cdd --- /dev/null +++ b/mmdetection/mmdet/configs/common/lsj_200e_coco_instance.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
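The optimizer in these LSJ configs is tuned for a total batch of 64 images (32 GPUs x 2 per GPU), which is why lr=0.1 sits well above the usual 16-image-batch default of 0.02, and auto_scale_lr records that reference batch size. Below is a hedged sketch of the linear scaling rule this enables; the 16-image batch is an assumed example, and in practice mmengine only rescales when LR auto-scaling is switched on (for example via the --auto-scale-lr flag of tools/train.py).

base_lr = 0.1           # optimizer lr above
base_batch_size = 64    # auto_scale_lr.base_batch_size above

actual_batch_size = 16  # assumed: 8 GPUs x 2 images per GPU
scaled_lr = base_lr * actual_batch_size / base_batch_size
print(scaled_lr)  # 0.025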
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .lsj_100e_coco_instance import * + +# 8x25=200e +train_dataloader.update(dict(dataset=dict(times=8))) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.067, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] diff --git a/mmdetection/mmdet/configs/common/ms_3x_coco.py b/mmdetection/mmdet/configs/common/ms_3x_coco.py new file mode 100644 index 0000000..c32b24d --- /dev/null +++ b/mmdetection/mmdet/configs/common/ms_3x_coco.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=RandomResize, scale=[(1333, 640), (1333, 800)], keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=RepeatDataset, + times=3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + 
persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +# training schedule for 3x with `RepeatDataset` +train_cfg = dict(type=EpochBasedTrainLoop, max_iters=12, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=12, + by_epoch=False, + milestones=[9, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001)) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/mmdet/configs/common/ms_3x_coco_instance.py b/mmdetection/mmdet/configs/common/ms_3x_coco_instance.py new file mode 100644 index 0000000..3c78909 --- /dev/null +++ b/mmdetection/mmdet/configs/common/ms_3x_coco_instance.py @@ -0,0 +1,136 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + 
dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=RepeatDataset, + times=3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)))) +val_dataloader.update( + dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args))) +test_dataloader = val_dataloader + +val_evaluator.update( + dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args)) +test_evaluator = val_evaluator + +# training schedule for 3x with `RepeatDataset` +train_cfg.update(dict(type=EpochBasedTrainLoop, max_epochs=12, val_interval=1)) +val_cfg.update(dict(type=ValLoop)) +test_cfg.update(dict(type=TestLoop)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=12, + by_epoch=False, + milestones=[9, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper.update( + dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=16)) diff --git a/mmdetection/mmdet/configs/common/ms_90k_coco.py b/mmdetection/mmdet/configs/common/ms_90k_coco.py new file mode 100644 index 0000000..3abf1d4 --- /dev/null +++ b/mmdetection/mmdet/configs/common/ms_90k_coco.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
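The "3x" in the two configs above works the same way: train_cfg runs 12 epochs over a dataset wrapped in RepeatDataset(times=3), which amounts to the classic 36-epoch "3x" COCO schedule, and the [9, 11] milestones land at roughly 27 and 33 passes in un-repeated terms. A quick illustrative check:

times = 3
max_epochs = 12
milestones = [9, 11]

print(times * max_epochs)               # 36 -> the "3x" schedule
print([times * m for m in milestones])  # [27, 33] effective decay points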
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Align with Detectron2 +backend = 'pillow' +train_pipeline = [ + dict( + type=LoadImageFromFile, + backend_args=backend_args, + imdecode_backend=backend), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True, + backend=backend), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict( + type=LoadImageFromFile, + backend_args=backend_args, + imdecode_backend=backend), + dict(type=Resize, scale=(1333, 800), keep_ratio=True, backend=backend), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type=InfiniteSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader.update( + dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args))) +test_dataloader = val_dataloader + +val_evaluator.update( + dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=backend_args)) +test_evaluator = val_evaluator + +# training schedule for 90k +max_iter = 90000 +train_cfg.update( + dict(type=IterBasedTrainLoop, 
max_iters=max_iter, val_interval=10000)) +val_cfg.update(dict(type=ValLoop)) +test_cfg.update(dict(type=TestLoop)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=max_iter, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +# optimizer +optim_wrapper.update( + dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=16)) + +default_hooks.update(dict(checkpoint=dict(by_epoch=False, interval=10000))) +log_processor.update(dict(by_epoch=False)) diff --git a/mmdetection/mmdet/configs/common/ms_poly_3x_coco_instance.py b/mmdetection/mmdet/configs/common/ms_poly_3x_coco_instance.py new file mode 100644 index 0000000..53913a0 --- /dev/null +++ b/mmdetection/mmdet/configs/common/ms_poly_3x_coco_instance.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=PackDetInputs, + meta_keys=('img_id', 
'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=RepeatDataset, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args))) +test_dataloader = val_dataloader + +val_evaluator.update( + dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + backend_args=backend_args)) +test_evaluator = val_evaluator + +# training schedule for 3x with `RepeatDataset` +train_cfg.update(dict(type=EpochBasedTrainLoop, max_iters=12, val_interval=1)) +val_cfg.update(dict(type=ValLoop)) +test_cfg.update(dict(type=TestLoop)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=12, + by_epoch=False, + milestones=[9, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper.update( + dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=16)) diff --git a/mmdetection/mmdet/configs/common/ms_poly_90k_coco_instance.py b/mmdetection/mmdet/configs/common/ms_poly_90k_coco_instance.py new file mode 100644 index 0000000..5236735 --- /dev/null +++ b/mmdetection/mmdet/configs/common/ms_poly_90k_coco_instance.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
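The 90k-iteration schedule used by these configs (with decays at 60k and 80k) counts iterations in the Detectron2 style instead of epochs. At the 16-image base batch, 90k iterations work out to roughly 12 passes over COCO train2017, i.e. about a 1x schedule. A back-of-the-envelope conversion follows; the 118,287-image count is COCO train2017 and is an assumption, not part of the config.

batch_size = 16              # 8 GPUs x 2 images per GPU (base_batch_size above)
num_train_images = 118_287   # COCO train2017, assumed for the estimate

iters_per_epoch = num_train_images / batch_size
for iters in (60_000, 80_000, 90_000):
    print(f'{iters} iters ~ {iters / iters_per_epoch:.1f} epochs')
# 60000 iters ~ 8.1 epochs
# 80000 iters ~ 10.8 epochs
# 90000 iters ~ 12.2 epochs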
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Align with Detectron2 +backend = 'pillow' +train_pipeline = [ + dict( + type=LoadImageFromFile, + backend_args=backend_args, + imdecode_backend=backend), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True, + backend=backend), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict( + type=LoadImageFromFile, + backend_args=backend_args, + imdecode_backend=backend), + dict(type=Resize, scale=(1333, 800), keep_ratio=True, backend=backend), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type=InfiniteSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader.update( + dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args))) +test_dataloader = val_dataloader + +val_evaluator.update( + dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args)) +test_evaluator = val_evaluator + +# training schedule for 90k 
+max_iter = 90000 +train_cfg.update( + dict(type=IterBasedTrainLoop, max_iters=max_iter, val_interval=10000)) +val_cfg.update(dict(type=ValLoop)) +test_cfg.update(dict(type=TestLoop)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=max_iter, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +# optimizer +optim_wrapper.update( + dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=16)) + +default_hooks.update(dict(checkpoint=dict(by_epoch=False, interval=10000))) +log_processor.update(dict(by_epoch=False)) diff --git a/mmdetection/mmdet/configs/common/ssj_270_coco_instance.py b/mmdetection/mmdet/configs/common/ssj_270_coco_instance.py new file mode 100644 index 0000000..ee86fda --- /dev/null +++ b/mmdetection/mmdet/configs/common/ssj_270_coco_instance.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Standard Scale Jittering (SSJ) resizes and crops an image +# with a resize range of 0.8 to 1.25 of the original image size. 
+train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=RandomResize, + scale=image_size, + ratio_range=(0.8, 1.25), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=InfiniteSampler), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader.update( + dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args))) +test_dataloader = val_dataloader + +val_evaluator.update( + dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args)) +test_evaluator = val_evaluator + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# The model is trained by 270k iterations with batch_size 64, +# which is roughly equivalent to 144 epochs. + +max_iter = 270000 +train_cfg.update( + dict(type=IterBasedTrainLoop, max_iters=max_iter, val_interval=10000)) +val_cfg.update(dict(type=ValLoop)) +test_cfg.update(dict(type=TestLoop)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=max_iter, + by_epoch=False, + milestones=[243000, 256500, 263250], + gamma=0.1) +] + +# optimizer +optim_wrapper.update( + dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=0.00004))) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr.update(dict(base_batch_size=64)) + +default_hooks.update(dict(checkpoint=dict(by_epoch=False, interval=10000))) +log_processor.update(dict(by_epoch=False)) diff --git a/mmdetection/mmdet/configs/common/ssj_scp_270k_coco_instance.py b/mmdetection/mmdet/configs/common/ssj_scp_270k_coco_instance.py new file mode 100644 index 0000000..68bb1f0 --- /dev/null +++ b/mmdetection/mmdet/configs/common/ssj_scp_270k_coco_instance.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
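The comment above notes that 270k iterations at batch size 64 are roughly equivalent to 144 epochs; the same kind of back-of-the-envelope arithmetic confirms the order of magnitude and shows that the decay milestones sit at 90%, 95% and 97.5% of training. An illustrative check, with the COCO train2017 image count again assumed:

batch_size = 64
max_iter = 270_000
num_train_images = 118_287  # COCO train2017, assumed

print(f'{max_iter * batch_size / num_train_images:.0f} effective epochs')  # ~146
for m in (243_000, 256_500, 263_250):
    print(f'milestone {m} at {100 * m / max_iter:.1f}% of training')
# milestone 243000 at 90.0% of training
# milestone 256500 at 95.0% of training
# milestone 263250 at 97.5% of training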
+ +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .ssj_270_coco_instance import * + +from mmdet.datasets import MultiImageMixDataset +from mmdet.datasets.transforms import CopyPaste + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +image_size = (1024, 1024) +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Standard Scale Jittering (SSJ) resizes and crops an image +# with a resize range of 0.8 to 1.25 of the original image size. +load_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=RandomResize, + scale=image_size, + ratio_range=(0.8, 1.25), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=image_size), +] +train_pipeline = [ + dict(type=CopyPaste, max_num_pasted=100), + dict(type=PackDetInputs) +] + +train_dataloader.update( + dict( + type=MultiImageMixDataset, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=load_pipeline, + backend_args=backend_args), + pipeline=train_pipeline)) diff --git a/mmdetection/mmdet/configs/deformable_detr/deformable_detr_r50_16xb2_50e_coco.py b/mmdetection/mmdet/configs/deformable_detr/deformable_detr_r50_16xb2_50e_coco.py new file mode 100644 index 0000000..ee2a416 --- /dev/null +++ b/mmdetection/mmdet/configs/deformable_detr/deformable_detr_r50_16xb2_50e_coco.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + +from mmcv.transforms import LoadImageFromFile, RandomChoice, RandomChoiceResize +from mmengine.optim.optimizer import OptimWrapper +from mmengine.optim.scheduler import MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs, + RandomCrop, RandomFlip, Resize) +from mmdet.models.backbones import ResNet +from mmdet.models.data_preprocessors import DetDataPreprocessor +from mmdet.models.dense_heads import DeformableDETRHead +from mmdet.models.detectors import DeformableDETR +from mmdet.models.losses import FocalLoss, GIoULoss, L1Loss +from mmdet.models.necks import ChannelMapper +from mmdet.models.task_modules import (BBoxL1Cost, FocalLossCost, + HungarianAssigner, IoUCost) + +model = dict( + type=DeformableDETR, + num_queries=300, + num_feature_levels=4, + with_box_refine=False, + as_two_stage=False, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=ChannelMapper, + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.1))), + decoder=dict( # DeformableDetrTransformerDecoder + num_layers=6, + return_intermediate=True, + layer_cfg=dict( # DeformableDetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.1)), + post_norm_cfg=None), + positional_encoding=dict(num_feats=128, normalize=True, offset=-0.5), + bbox_head=dict( + type=DeformableDETRHead, + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type=FocalLoss, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type=L1Loss, loss_weight=5.0), + loss_iou=dict(type=GIoULoss, loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type=HungarianAssigner, + match_costs=[ + dict(type=FocalLossCost, weight=2.0), + dict(type=BBoxL1Cost, weight=5.0, box_format='xywh'), + dict(type=IoUCost, iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=100)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. 
+train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=RandomFlip, prob=0.5), + dict( + type=RandomChoice, + transforms=[ + [ + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + resize_type=Resize, + keep_ratio=True) + ], + [ + dict( + type=RandomChoiceResize, + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + resize_type=Resize, + keep_ratio=True) + ] + ]), + dict(type=PackDetInputs) +] +train_dataloader.update( + dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline))) + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=AdamW, lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1) + })) + +# learning policy +max_epochs = 50 +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +param_scheduler = [ + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[40], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=32) diff --git a/mmdetection/mmdet/configs/deformable_detr/deformable_detr_refine_r50_16xb2_50e_coco.py b/mmdetection/mmdet/configs/deformable_detr/deformable_detr_refine_r50_16xb2_50e_coco.py new file mode 100644 index 0000000..4f232d6 --- /dev/null +++ b/mmdetection/mmdet/configs/deformable_detr/deformable_detr_refine_r50_16xb2_50e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .deformable_detr_r50_16xb2_50e_coco import * + +model.update(dict(with_box_refine=True)) diff --git a/mmdetection/mmdet/configs/deformable_detr/deformable_detr_refine_twostage_r50_16xb2_50e_coco.py b/mmdetection/mmdet/configs/deformable_detr/deformable_detr_refine_twostage_r50_16xb2_50e_coco.py new file mode 100644 index 0000000..1fac4d8 --- /dev/null +++ b/mmdetection/mmdet/configs/deformable_detr/deformable_detr_refine_twostage_r50_16xb2_50e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .deformable_detr_refine_r50_16xb2_50e_coco import * + +model.update(dict(as_two_stage=True)) diff --git a/mmdetection/mmdet/configs/detr/detr_r101_8xb2_500e_coco.py b/mmdetection/mmdet/configs/detr/detr_r101_8xb2_500e_coco.py new file mode 100644 index 0000000..b961468 --- /dev/null +++ b/mmdetection/mmdet/configs/detr/detr_r101_8xb2_500e_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit + +with read_base(): + from .detr_r50_8xb2_500e_coco import * + +model.update( + dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101')))) diff --git a/mmdetection/mmdet/configs/detr/detr_r18_8xb2_500e_coco.py b/mmdetection/mmdet/configs/detr/detr_r18_8xb2_500e_coco.py new file mode 100644 index 0000000..11360af --- /dev/null +++ b/mmdetection/mmdet/configs/detr/detr_r18_8xb2_500e_coco.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit + +with read_base(): + from .detr_r50_8xb2_500e_coco import * + +model.update( + dict( + backbone=dict( + depth=18, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[512]))) diff --git a/mmdetection/mmdet/configs/detr/detr_r50_8xb2_150e_coco.py b/mmdetection/mmdet/configs/detr/detr_r50_8xb2_150e_coco.py new file mode 100644 index 0000000..c50726c --- /dev/null +++ b/mmdetection/mmdet/configs/detr/detr_r50_8xb2_150e_coco.py @@ -0,0 +1,182 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
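# How these pure-Python configs compose: inside `with read_base():` every
# variable of the base config is imported into the module, and the variant
# files then patch only what differs -- e.g. detr_r18_8xb2_500e_coco.py above
# reduces to
#   with read_base():
#       from .detr_r50_8xb2_500e_coco import *
#   model.update(dict(backbone=dict(depth=18, ...), neck=dict(in_channels=[512])))
# while everything left untouched keeps the value inherited from the base config.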
+from mmcv.transforms import RandomChoice, RandomChoiceResize +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.nn.modules.activation import ReLU +from torch.nn.modules.batchnorm import BatchNorm2d +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs, + RandomCrop, RandomFlip, Resize) +from mmdet.models import (DETR, ChannelMapper, DetDataPreprocessor, DETRHead, + ResNet) +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.task_modules import (BBoxL1Cost, ClassificationCost, + HungarianAssigner, IoUCost) + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + +model = dict( + type=DETR, + num_queries=100, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(3, ), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + neck=dict( + type=ChannelMapper, + in_channels=[2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=None, + num_outs=1), + encoder=dict( # DetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type=ReLU, inplace=True)))), + decoder=dict( # DetrTransformerDecoder + num_layers=6, + layer_cfg=dict( # DetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type=ReLU, inplace=True))), + return_intermediate=True), + positional_encoding=dict(num_feats=128, normalize=True), + bbox_head=dict( + type=DETRHead, + num_classes=80, + embed_dims=256, + loss_cls=dict( + type=CrossEntropyLoss, + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=5.0), + loss_iou=dict(type=GIoULoss, loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type=HungarianAssigner, + match_costs=[ + dict(type=ClassificationCost, weight=1.), + dict(type=BBoxL1Cost, weight=5.0, box_format='xywh'), + dict(type=IoUCost, iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=100)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. 
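# The Hungarian matching costs above (ClassificationCost 1.0, BBoxL1Cost 5.0,
# IoUCost 2.0) mirror the head's loss weights (loss_cls 1.0, loss_bbox 5.0,
# loss_iou 2.0), so the assignment found during matching optimises the same
# weighted objective that the losses then minimise; the Deformable DETR config
# earlier in this patch follows the same convention.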
+train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=RandomFlip, prob=0.5), + dict( + type=RandomChoice, + transforms=[[ + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type=PackDetInputs) +] +train_dataloader.update(dataset=dict(pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=AdamW, lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) + +# learning policy +max_epochs = 150 +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +param_scheduler = [ + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[100], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/mmdet/configs/detr/detr_r50_8xb2_500e_coco.py b/mmdetection/mmdet/configs/detr/detr_r50_8xb2_500e_coco.py new file mode 100644 index 0000000..d7d0817 --- /dev/null +++ b/mmdetection/mmdet/configs/detr/detr_r50_8xb2_500e_coco.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.optim.scheduler.lr_scheduler import MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop + +with read_base(): + from .detr_r50_8xb2_150e_coco import * + +# learning policy +max_epochs = 500 +train_cfg.update( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=10) + +param_scheduler = [ + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[334], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks.update(checkpoint=dict(max_keep_ckpts=2)) diff --git a/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_12e_coco.py b/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_12e_coco.py new file mode 100644 index 0000000..ab8e95a --- /dev/null +++ b/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_12e_coco.py @@ -0,0 +1,190 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
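# DINO, configured below, builds on Deformable DETR: box refinement and
# two-stage query selection are on by default, the query count grows to 900,
# dropout is removed, and denoising queries are added via `dn_cfg`; the inline
# "... for DeformDETR" comments mark each value that differs from the
# Deformable DETR config earlier in this patch.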
+from mmcv.transforms import RandomChoice, RandomChoiceResize +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.nn.modules.batchnorm import BatchNorm2d +from torch.nn.modules.normalization import GroupNorm +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs, + RandomCrop, RandomFlip, Resize) +from mmdet.models import (DINO, ChannelMapper, DetDataPreprocessor, DINOHead, + ResNet) +from mmdet.models.losses.focal_loss import FocalLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.task_modules import (BBoxL1Cost, FocalLossCost, + HungarianAssigner, IoUCost) + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + +model = dict( + type=DINO, + num_queries=900, # num_matching_queries + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + neck=dict( + type=ChannelMapper, + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type=GroupNorm, num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0))), # 0.1 for DeformDETR + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, + dropout=0.0), # 0.1 for DeformDETR + cross_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0)), # 0.1 for DeformDETR + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, + normalize=True, + offset=0.0, # -0.5 for DeformDETR + temperature=20), # 10000 for DeformDETR + bbox_head=dict( + type=DINOHead, + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type=FocalLoss, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), # 2.0 in DeformDETR + loss_bbox=dict(type=L1Loss, loss_weight=5.0), + loss_iou=dict(type=GIoULoss, loss_weight=2.0)), + dn_cfg=dict( # TODO: Move to model.train_cfg ? 
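        # Roughly: `label_noise_scale` is the fraction of ground-truth labels
        # randomly flipped when the denoising queries are built,
        # `box_noise_scale` scales the jitter applied to ground-truth boxes,
        # and `group_cfg` with dynamic=True uses `num_dn_queries` as the
        # per-image budget of denoising queries (per the DN-DETR/DINO scheme).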
+ label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, + num_dn_queries=100)), # TODO: half num_dn_queries + # training and testing settings + train_cfg=dict( + assigner=dict( + type=HungarianAssigner, + match_costs=[ + dict(type=FocalLossCost, weight=2.0), + dict(type=BBoxL1Cost, weight=5.0, box_format='xywh'), + dict(type=IoUCost, iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) # 100 for DeformDETR + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=RandomFlip, prob=0.5), + dict( + type=RandomChoice, + transforms=[ + [ + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type=RandomChoiceResize, + resize_type=Resize, + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type=PackDetInputs) +] +train_dataloader.update( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict( + type=AdamW, + lr=0.0001, # 0.0002 for DeformDETR + weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) +) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa + +# learning policy +max_epochs = 12 +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +param_scheduler = [ + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_24e_coco.py b/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_24e_coco.py new file mode 100644 index 0000000..c10cc21 --- /dev/null +++ b/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_24e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
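# The 24e and 36e variants below leave the model untouched: they reuse the
# 12-epoch config above, stretch `max_epochs`, and move the single MultiStepLR
# drop to shortly before the end of training, e.g.
#   param_scheduler[0].update(dict(milestones=[20]))
# simply edits the inherited scheduler entry in place.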
+from mmengine.config import read_base +from mmengine.runner.loops import EpochBasedTrainLoop + +with read_base(): + from .dino_4scale_r50_8xb2_12e_coco import * + +max_epochs = 24 +train_cfg.update( + dict(type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1)) + +param_scheduler[0].update(dict(milestones=[20])) diff --git a/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_36e_coco.py b/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_36e_coco.py new file mode 100644 index 0000000..3779744 --- /dev/null +++ b/mmdetection/mmdet/configs/dino/dino_4scale_r50_8xb2_36e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.runner.loops import EpochBasedTrainLoop + +with read_base(): + from .dino_4scale_r50_8xb2_12e_coco import * + +max_epochs = 36 +train_cfg.update( + dict(type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1)) + +param_scheduler[0].update(dict(milestones=[30])) diff --git a/mmdetection/mmdet/configs/dino/dino_4scale_r50_improved_8xb2_12e_coco.py b/mmdetection/mmdet/configs/dino/dino_4scale_r50_improved_8xb2_12e_coco.py new file mode 100644 index 0000000..43c0720 --- /dev/null +++ b/mmdetection/mmdet/configs/dino/dino_4scale_r50_improved_8xb2_12e_coco.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from .dino_4scale_r50_8xb2_12e_coco import * + +# from deformable detr hyper +model.update( + dict( + backbone=dict(frozen_stages=-1), + bbox_head=dict(loss_cls=dict(loss_weight=2.0)), + positional_encoding=dict(offset=-0.5, temperature=10000), + dn_cfg=dict(group_cfg=dict(num_dn_queries=300)))) + +# optimizer +optim_wrapper.update( + dict( + optimizer=dict(lr=0.0002), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1) + }))) diff --git a/mmdetection/mmdet/configs/dino/dino_5scale_swin_l_8xb2_12e_coco.py b/mmdetection/mmdet/configs/dino/dino_5scale_swin_l_8xb2_12e_coco.py new file mode 100644 index 0000000..25aac01 --- /dev/null +++ b/mmdetection/mmdet/configs/dino/dino_5scale_swin_l_8xb2_12e_coco.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
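# The Swin-L variant below swaps the backbone rather than patching it:
# `_delete_=True` tells the config merger to discard the inherited ResNet
# block entirely instead of merging into it, and num_levels = 5 is propagated
# to the neck (num_outs) and to the encoder/decoder attention settings so
# every stage agrees on five feature levels.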
+from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models import SwinTransformer + +with read_base(): + from .dino_4scale_r50_8xb2_12e_coco import * + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +num_levels = 5 +model.merge( + dict( + num_feature_levels=num_levels, + backbone=dict( + _delete_=True, + type=SwinTransformer, + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=True, + convert_weights=True, + init_cfg=dict(type=PretrainedInit, checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels), + encoder=dict( + layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))), + decoder=dict( + layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))) diff --git a/mmdetection/mmdet/configs/dino/dino_5scale_swin_l_8xb2_36e_coco.py b/mmdetection/mmdet/configs/dino/dino_5scale_swin_l_8xb2_36e_coco.py new file mode 100644 index 0000000..494acf5 --- /dev/null +++ b/mmdetection/mmdet/configs/dino/dino_5scale_swin_l_8xb2_36e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.runner.loops import EpochBasedTrainLoop + +with read_base(): + from .dino_5scale_swin_l_8xb2_12e_coco import * + +max_epochs = 36 +train_cfg.update( + dict(type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1)) + +param_scheduler[0].update(dict(milestones=[27, 33])) diff --git a/mmdetection/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py b/mmdetection/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..f0a6d5a --- /dev/null +++ b/mmdetection/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + from .._base_.models.faster_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 0000000..2780f4a --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_poly_1x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_ms_poly_3x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000..8a1badf --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_ms_poly_3x_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + depth=101, + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000..6770cec --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py new file mode 100644 index 0000000..fd2aafb --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_2x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_8xb8_amp_lsj_200e_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_8xb8_amp_lsj_200e_coco.py new file mode 100644 index 0000000..665808d --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_8xb8_amp_lsj_200e_coco.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_ms_poly_3x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000..1468879 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_ms_poly_3x_coco.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco.py new file mode 100644 index 0000000..67bd86f --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py new file mode 100644 index 0000000..494e6ba --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_caffe_c4 import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 0000000..6481fcf --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
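# The "caffe" Mask R-CNN variants that follow share one recipe: switch the data
# preprocessor to caffe-style normalisation (keep BGR order, per-channel mean
# subtraction, std left at 1.0), freeze the backbone norm layers
# (requires_grad=False), and load the matching detectron2 caffe checkpoint,
# since those pretrained weights expect exactly this preprocessing.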
+ +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_1x_coco.py new file mode 100644 index 0000000..5952ed5 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_1x_coco.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.model.weight_init import PretrainedInit + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args={{_base_.backend_args}}), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs), +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco.py new file mode 100644 index 0000000..d62b9eb --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.model.weight_init import PretrainedInit + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args={{_base_.backend_args}}), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_2x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_2x_coco.py new file mode 100644 index 0000000..fa41b7e --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_2x_coco.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco import * + +train_cfg = dict(max_epochs=24) +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_3x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000..c5f9b97 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_3x_coco.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco import * + +train_cfg = dict(max_epochs=36) +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=24, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py new file mode 100644 index 0000000..28ba7c7 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models.losses import SmoothL1Loss + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + loss_bbox=dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + bbox_roi_extractor=dict( + roi_layer=dict( + type=RoIAlign, output_size=7, sampling_ratio=2, + aligned=False)), + bbox_head=dict( + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + roi_layer=dict( + type=RoIAlign, output_size=14, sampling_ratio=2, + aligned=False)))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..8145d08 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py new file mode 100644 index 0000000..d2c0876 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * + +from mmengine.visualization import LocalVisBackend, WandbVisBackend + +vis_backends.update(dict(type=WandbVisBackend)) +vis_backends.update(dict(type=LocalVisBackend)) +visualizer.update(dict(vis_backends=vis_backends)) + +# MMEngine support the following two ways, users can choose +# according to convenience +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +default_hooks.update(dict(checkpoint=dict(interval=4))) + +train_cfg.update(dict(val_interval=2)) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py new file mode 100644 index 0000000..6be010b --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_2x import * diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco.py new file mode 100644 index 0000000..ef101fe --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_amp_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_amp_1x_coco.py new file mode 100644 index 0000000..110c3c4 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_amp_1x_coco.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmengine.optim.optimizer.amp_optimizer_wrapper import AmpOptimWrapper + +optim_wrapper.update(dict(type=AmpOptimWrapper)) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_ms_poly_-3x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_ms_poly_-3x_coco.py new file mode 100644 index 0000000..ff4eec6 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_ms_poly_-3x_coco.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.mask_rcnn_r50_fpn import * + from ..common.ms_poly_3x_coco_instance import * diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py new file mode 100644 index 0000000..012e711 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs), +] +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 0000000..5429b1b --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r101_fpn_1x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models.backbones.resnext import ResNeXt + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py new file mode 100644 index 0000000..ebae6c1 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_2x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models import ResNeXt + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_ms_poly_3x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000..aff45d8 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_ms_poly_3x_coco.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models.backbones import ResNeXt + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py new file mode 100644 index 0000000..d9f2095 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_x101_32x4d_fpn_1x_coco import * + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_1x_coco.py new file mode 100644 index 0000000..8eded94 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_1x_coco.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r101_fpn_1x_coco import * + +from mmcv.transforms import RandomChoiceResize, RandomFlip +from mmcv.transforms.loading import LoadImageFromFile + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.models.backbones import ResNeXt + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. 
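    # Unlike the ResNet caffe variants earlier in this patch, which keep
    # std=[1.0, 1.0, 1.0], the Caffe2 ResNeXt-101-32x8d weights also expect
    # per-channel std, hence the [57.375, 57.120, 58.395] values below.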
+ data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) + +backend_args = None +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_3x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000..b3f5846 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_3x_coco.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmdet.models.backbones import ResNeXt + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64_4d_fpn_1x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64_4d_fpn_1x_coco.py new file mode 100644 index 0000000..8bb6f63 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64_4d_fpn_1x_coco.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_x101_32x4d_fpn_1x_coco import * + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py new file mode 100644 index 0000000..d661076 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_x101_32x4d_fpn_2x_coco import * + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_ms_poly_3x_coco.py b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000..d9ab364 --- /dev/null +++ b/mmdetection/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_ms_poly_3x_coco.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmdet.models.backbones import ResNeXt + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/mmdetection/mmdet/configs/maskformer/maskformer_r50_ms_16xb1_75e_coco.py b/mmdetection/mmdet/configs/maskformer/maskformer_r50_ms_16xb1_75e_coco.py new file mode 100644 index 0000000..7074401 --- /dev/null +++ b/mmdetection/mmdet/configs/maskformer/maskformer_r50_ms_16xb1_75e_coco.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
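# MaskFormer, configured below, casts panoptic segmentation as per-query mask
# classification over 80 "thing" plus 53 "stuff" classes; as in the DETR-style
# detectors above, its Hungarian match costs (cls 1.0, focal mask 20.0,
# dice 1.0) mirror the corresponding loss weights.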
+from mmcv.transforms import RandomChoice, RandomChoiceResize +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit +from mmengine.optim.optimizer import OptimWrapper +from mmengine.optim.scheduler import MultiStepLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.nn.modules.activation import ReLU +from torch.nn.modules.batchnorm import BatchNorm2d +from torch.nn.modules.normalization import GroupNorm +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms.transforms import RandomCrop +from mmdet.models import MaskFormer +from mmdet.models.backbones import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.maskformer_head import MaskFormerHead +from mmdet.models.layers.pixel_decoder import TransformerEncoderPixelDecoder +from mmdet.models.losses import CrossEntropyLoss, DiceLoss, FocalLoss +from mmdet.models.seg_heads.panoptic_fusion_heads import MaskFormerFusionHead +from mmdet.models.task_modules.assigners.hungarian_assigner import \ + HungarianAssigner +from mmdet.models.task_modules.assigners.match_cost import (ClassificationCost, + DiceCost, + FocalLossCost) +from mmdet.models.task_modules.samplers import MaskPseudoSampler + +with read_base(): + from .._base_.datasets.coco_panoptic import * + from .._base_.default_runtime import * + +data_preprocessor = dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255) + +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type=MaskFormer, + data_preprocessor=data_preprocessor, + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type=MaskFormerHead, + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + pixel_decoder=dict( + type=TransformerEncoderPixelDecoder, + norm_cfg=dict(type=GroupNorm, num_groups=32), + act_cfg=dict(type=ReLU), + encoder=dict( # DetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type=ReLU, inplace=True)))), + positional_encoding=dict(num_feats=128, normalize=True)), + enforce_decoder_input_project=False, + positional_encoding=dict(num_feats=128, normalize=True), + transformer_decoder=dict( # DetrTransformerDecoder + num_layers=6, + layer_cfg=dict( # DetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type=ReLU, inplace=True))), + return_intermediate=True), + loss_cls=dict( + 
type=CrossEntropyLoss, + use_sigmoid=False, + loss_weight=1.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type=FocalLoss, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=20.0), + loss_dice=dict( + type=DiceLoss, + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=1.0)), + panoptic_fusion_head=dict( + type=MaskFormerFusionHead, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + assigner=dict( + type=HungarianAssigner, + match_costs=[ + dict(type=ClassificationCost, weight=1.0), + dict(type=FocalLossCost, weight=20.0, binary_input=True), + dict(type=DiceCost, weight=1.0, pred_act=True, eps=1.0) + ]), + sampler=dict(type=MaskPseudoSampler)), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=False, + # max_per_image is for instance segmentation. + max_per_image=100, + object_mask_thr=0.8, + iou_thr=0.8, + # In MaskFormer's panoptic postprocessing, + # it will not filter masks whose score is smaller than 0.5 . + filter_low_score=False), + init_cfg=None) + +# dataset settings +train_pipeline = [ + dict(type=LoadImageFromFile), + dict( + type=LoadPanopticAnnotations, + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type=RandomFlip, prob=0.5), + # dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict( + type=RandomChoice, + transforms=[[ + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + resize_type=Resize, + keep_ratio=True) + ], + [ + dict( + type=RandomChoiceResize, + scales=[(400, 1333), (500, 1333), (600, 1333)], + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + resize_type=Resize, + keep_ratio=True) + ]]), + dict(type=PackDetInputs) +] + +train_dataloader.update( + dict(batch_size=1, num_workers=1, dataset=dict(pipeline=train_pipeline))) + +val_dataloader.update(dict(batch_size=1, num_workers=1)) + +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict( + type=AdamW, + lr=0.0001, + weight_decay=0.0001, + eps=1e-8, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': dict(lr_mult=1.0, decay_mult=0.0) + }, + norm_decay_mult=0.0), + clip_grad=dict(max_norm=0.01, norm_type=2)) + +max_epochs = 75 + +# learning rate +param_scheduler = dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[50], + gamma=0.1) + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (1 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/mmdetection/mmdet/configs/maskformer/maskformer_swin_l_p4_w12_64xb1_ms_300e_coco.py b/mmdetection/mmdet/configs/maskformer/maskformer_swin_l_p4_w12_64xb1_ms_300e_coco.py new file mode 100644 index 0000000..2affe52 --- /dev/null +++ b/mmdetection/mmdet/configs/maskformer/maskformer_swin_l_p4_w12_64xb1_ms_300e_coco.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.optim.scheduler import LinearLR + +from mmdet.models.backbones import SwinTransformer +from mmdet.models.layers import PixelDecoder + +with read_base(): + from .maskformer_r50_ms_16xb1_75e_coco import * + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +depths = [2, 2, 18, 2] +model.update( + dict( + backbone=dict( + _delete_=True, + type=SwinTransformer, + pretrain_img_size=384, + embed_dims=192, + patch_size=4, + window_size=12, + mlp_ratio=4, + depths=depths, + num_heads=[6, 12, 24, 48], + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type=PretrainedInit, checkpoint=pretrained)), + panoptic_head=dict( + in_channels=[192, 384, 768, 1536], # pass to pixel_decoder inside + pixel_decoder=dict( + _delete_=True, + type=PixelDecoder, + norm_cfg=dict(type=GroupNorm, num_groups=32), + act_cfg=dict(type=ReLU)), + enforce_decoder_input_project=True))) + +# optimizer + +# weight_decay = 0.01 +# norm_weight_decay = 0.0 +# embed_weight_decay = 0.0 +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +norm_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'norm': norm_multi, + 'absolute_pos_embed': embed_multi, + 'relative_position_bias_table': embed_multi, + 'query_embed': embed_multi +} + +optim_wrapper.update( + dict( + optimizer=dict(lr=6e-5, weight_decay=0.01), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))) + +max_epochs = 300 + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[250], + gamma=0.1) +] + +train_cfg.update(dict(max_epochs=max_epochs)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (1 samples per GPU) +auto_scale_lr.update(dict(base_batch_size=64)) diff --git a/mmdetection/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py b/mmdetection/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py new file mode 100644 index 0000000..fc89328 --- /dev/null +++ b/mmdetection/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.datasets.coco_panoptic import * + from .._base_.schedules.schedule_1x import * + from .._base_.default_runtime import * + +from mmcv.ops import nms +from torch.nn import GroupNorm + +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.detectors.panoptic_fpn import PanopticFPN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.seg_heads.panoptic_fpn_head import PanopticFPNHead +from mmdet.models.seg_heads.panoptic_fusion_heads import HeuristicFusionHead + +model.update( + dict( + type=PanopticFPN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255), + semantic_head=dict( + type=PanopticFPNHead, + num_things_classes=80, + num_stuff_classes=53, + in_channels=256, + inner_channels=128, + start_level=0, + end_level=4, + norm_cfg=dict(type=GroupNorm, num_groups=32, requires_grad=True), + conv_cfg=None, + loss_seg=dict( + type=CrossEntropyLoss, ignore_index=255, loss_weight=0.5)), + panoptic_fusion_head=dict( + type=HeuristicFusionHead, + num_things_classes=80, + num_stuff_classes=53), + test_cfg=dict( + rcnn=dict( + score_thr=0.6, + nms=dict(type=nms, iou_threshold=0.5, class_agnostic=True), + max_per_img=100, + mask_thr_binary=0.5), + # used in HeuristicFusionHead + panoptic=dict(mask_overlap=0.5, stuff_area_limit=4096)))) + +# Forced to remove NumClassCheckHook +custom_hooks = [] diff --git a/mmdetection/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_4e_base.py b/mmdetection/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_4e_base.py new file mode 100644 index 0000000..c672e82 --- /dev/null +++ b/mmdetection/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_4e_base.py @@ -0,0 +1,141 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from .._base_.models.faster_rcnn_r50_fpn import * + from .._base_.models.faster_rcnn_r50_fpn import model + from .._base_.default_runtime import * + +from mmcv.ops import RoIAlign +from mmengine.hooks import LoggerHook, SyncBuffersHook +from mmengine.model.weight_init import PretrainedInit +from mmengine.optim import MultiStepLR, OptimWrapper +from mmengine.runner.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.nn.modules.batchnorm import BatchNorm2d +from torch.nn.modules.normalization import GroupNorm +from torch.optim import SGD + +from mmdet.engine.hooks import TrackVisualizationHook +from mmdet.models import (QDTrack, QuasiDenseEmbedHead, QuasiDenseTracker, + QuasiDenseTrackHead, SingleRoIExtractor, + TrackDataPreprocessor) +from mmdet.models.losses import (L1Loss, MarginL2Loss, + MultiPosCrossEntropyLoss, SmoothL1Loss) +from mmdet.models.task_modules import (CombinedSampler, + InstanceBalancedPosSampler, + MaxIoUAssigner, RandomSampler) +from mmdet.visualization import TrackLocalVisualizer + +detector = model +detector.pop('data_preprocessor') + +detector['backbone'].update( + dict( + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +detector.rpn_head.loss_bbox.update( + dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)) +detector.rpn_head.bbox_coder.update(dict(clip_border=False)) +detector.roi_head.bbox_head.update(dict(num_classes=1)) +detector.roi_head.bbox_head.bbox_coder.update(dict(clip_border=False)) +detector['init_cfg'] = dict( + type=PretrainedInit, + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_1x_coco-person/' + 'faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' + # noqa: E501 +) +del model + +model = dict( + type=QDTrack, + data_preprocessor=dict( + type=TrackDataPreprocessor, + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + detector=detector, + track_head=dict( + type=QuasiDenseTrackHead, + roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + embed_head=dict( + type=QuasiDenseEmbedHead, + num_convs=4, + num_fcs=1, + embed_channels=256, + norm_cfg=dict(type=GroupNorm, num_groups=32), + loss_track=dict(type=MultiPosCrossEntropyLoss, loss_weight=0.25), + loss_track_aux=dict( + type=MarginL2Loss, + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0)), + loss_bbox=dict(type=L1Loss, loss_weight=1.0), + train_cfg=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=CombinedSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=3, + add_gt_as_proposals=True, + pos_sampler=dict(type=InstanceBalancedPosSampler), + neg_sampler=dict(type=RandomSampler)))), + tracker=dict( + type=QuasiDenseTracker, + init_score_thr=0.9, + obj_score_thr=0.5, + match_score_thr=0.5, + memo_tracklet_frames=30, + memo_backdrop_frames=1, + memo_momentum=0.8, + nms_conf_thr=0.5, + nms_backdrop_iou_thr=0.3, + nms_class_iou_thr=0.7, + with_cats=True, + match_metric='bisoftmax')) +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, 
weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) +# learning policy +param_scheduler = [ + dict(type=MultiStepLR, begin=0, end=4, by_epoch=True, milestones=[3]) +] + +# runtime settings +train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=4, val_interval=4) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +default_hooks.update( + logger=dict(type=LoggerHook, interval=50), + visualization=dict(type=TrackVisualizationHook, draw=False)) + +visualizer.update( + type=TrackLocalVisualizer, vis_backends=vis_backends, name='visualizer') + +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type=SyncBuffersHook) +] diff --git a/mmdetection/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/mmdetection/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000..2fa715e --- /dev/null +++ b/mmdetection/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.mot_challenge import * + from .qdtrack_faster_rcnn_r50_fpn_4e_base import * + +from mmdet.evaluation import CocoVideoMetric, MOTChallengeMetric + +# evaluator +val_evaluator = [ + dict(type=CocoVideoMetric, metric=['bbox'], classwise=True), + dict(type=MOTChallengeMetric, metric=['HOTA', 'CLEAR', 'Identity']) +] diff --git a/mmdetection/mmdet/configs/retinanet/retinanet_r50_fpn_1x_coco.py b/mmdetection/mmdet/configs/retinanet/retinanet_r50_fpn_1x_coco.py new file mode 100644 index 0000000..847600e --- /dev/null +++ b/mmdetection/mmdet/configs/retinanet/retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.retinanet_r50_fpn import * + from .._base_.datasets.coco_detection import * + from .._base_.schedules.schedule_1x import * + from .._base_.default_runtime import * + from .retinanet_tta import * + +from torch.optim.sgd import SGD + +# optimizer +optim_wrapper.update( + dict(optimizer=dict(type=SGD, lr=0.01, momentum=0.9, weight_decay=0.0001))) diff --git a/mmdetection/mmdet/configs/retinanet/retinanet_tta.py b/mmdetection/mmdet/configs/retinanet/retinanet_tta.py new file mode 100644 index 0000000..4e340e5 --- /dev/null +++ b/mmdetection/mmdet/configs/retinanet/retinanet_tta.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import TestTimeAug + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.models.test_time_augs.det_tta import DetTTAModel + +tta_model = dict( + type=DetTTAModel, + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.5), max_per_img=100)) + +img_scales = [(1333, 800), (666, 400), (2000, 1200)] +tta_pipeline = [ + dict(type=LoadImageFromFile, backend_args=None), + dict( + type=TestTimeAug, + transforms=[ + [dict(type=Resize, scale=s, keep_ratio=True) for s in img_scales], + [dict(type=RandomFlip, prob=1.), + dict(type=RandomFlip, prob=0.)], + [dict(type=LoadAnnotations, with_bbox=True)], + [ + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_l_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_l_8xb32_300e_coco.py new file mode 100644 index 0000000..302d7cd --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_l_8xb32_300e_coco.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook +from torch.nn.modules.activation import SiLU + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.dense_heads.rtmdet_ins_head import RTMDetInsSepBNHead +from mmdet.models.layers.ema import ExpMomentumEMA +from mmdet.models.losses.dice_loss import DiceLoss +from mmdet.models.losses.gfocal_loss import QualityFocalLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.task_modules.coders.distance_point_bbox_coder import \ + DistancePointBBoxCoder +from mmdet.models.task_modules.prior_generators.point_generator import \ + MlvlPointGenerator + +model.merge( + dict( + bbox_head=dict( + _delete_=True, + type=RTMDetInsSepBNHead, + num_classes=80, + in_channels=256, + stacked_convs=2, + share_conv=True, + pred_kernel_size=1, + feat_channels=256, + act_cfg=dict(type=SiLU, inplace=True), + norm_cfg=dict(type='SyncBN', requires_grad=True), + anchor_generator=dict( + type=MlvlPointGenerator, offset=0, strides=[8, 16, 32]), + bbox_coder=dict(type=DistancePointBBoxCoder), + loss_cls=dict( + type=QualityFocalLoss, + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type=GIoULoss, loss_weight=2.0), + loss_mask=dict( + type=DiceLoss, loss_weight=2.0, eps=5e-6, reduction='mean')), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100, + mask_thr_binary=0.5), + )) + +train_pipeline = [ + dict(type=LoadImageFromFile, 
backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=PackDetInputs) +] + +train_dataloader.update( + dict(pin_memory=True, dataset=dict(pipeline=train_pipeline))) + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator.update(dict(metric=['bbox', 'segm'])) +test_evaluator = val_evaluator diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_m_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_m_8xb32_300e_coco.py new file mode 100644 index 0000000..d90be92 --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_m_8xb32_300e_coco.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_ins_l_8xb32_300e_coco import * + +model.update( + dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + neck=dict( + in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + bbox_head=dict(in_channels=192, feat_channels=192))) diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_s_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_s_8xb32_300e_coco.py new file mode 100644 index 0000000..58b5b1a --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_s_8xb32_300e_coco.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_ins_l_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.layers.ema import ExpMomentumEMA + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa +model.update( + dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.5, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), + bbox_head=dict(in_channels=128, feat_channels=128))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=PackDetInputs) +] + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) + +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_tiny_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_tiny_8xb32_300e_coco.py new file mode 100644 index 0000000..0356b19 --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_tiny_8xb32_300e_coco.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_ins_s_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +model.update( + dict( + backbone=dict( + deepen_factor=0.167, + widen_factor=0.375, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[96, 192, 384], out_channels=96, num_csp_blocks=1), + bbox_head=dict(in_channels=96, feat_channels=96))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=CachedMosaic, + img_scale=(640, 640), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=10, + random_pop=False, + pad_val=(114, 114, 114), + prob=0.5), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_x_8xb16_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_x_8xb16_300e_coco.py new file mode 100644 index 0000000..555b101 --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_ins_x_8xb16_300e_coco.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_ins_l_8xb32_300e_coco import * +from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR, LinearLR + +model.update( + dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + neck=dict( + in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4), + bbox_head=dict(in_channels=320, feat_channels=320))) + +base_lr = 0.002 + +# optimizer +optim_wrapper.update(dict(optimizer=dict(lr=base_lr))) + +# learning rate +param_scheduler = [ + dict( + type=LinearLR, start_factor=1.0e-5, by_epoch=False, begin=0, end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type=CosineAnnealingLR, + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py new file mode 100644 index 0000000..5dcda7b --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + from .._base_.schedules.schedule_1x import * + from .._base_.datasets.coco_detection import * + from .rtmdet_tta import * + +from mmcv.ops import nms +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR, LinearLR +from torch.nn import SyncBatchNorm +from torch.nn.modules.activation import SiLU +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.backbones.cspnext import CSPNeXt +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rtmdet_head import RTMDetSepBNHead +from mmdet.models.detectors.rtmdet import RTMDet +from mmdet.models.layers.ema import ExpMomentumEMA +from mmdet.models.losses.gfocal_loss import QualityFocalLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.necks.cspnext_pafpn import CSPNeXtPAFPN +from mmdet.models.task_modules.assigners.dynamic_soft_label_assigner import \ + DynamicSoftLabelAssigner +from mmdet.models.task_modules.coders.distance_point_bbox_coder import \ + DistancePointBBoxCoder +from mmdet.models.task_modules.prior_generators.point_generator import \ + MlvlPointGenerator + +model = dict( + type=RTMDet, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False, + batch_augments=None), + backbone=dict( + type=CSPNeXt, + arch='P5', + expand_ratio=0.5, + deepen_factor=1, + widen_factor=1, + channel_attention=True, + 
norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + neck=dict( + type=CSPNeXtPAFPN, + in_channels=[256, 512, 1024], + out_channels=256, + num_csp_blocks=3, + expand_ratio=0.5, + norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + bbox_head=dict( + type=RTMDetSepBNHead, + num_classes=80, + in_channels=256, + stacked_convs=2, + feat_channels=256, + anchor_generator=dict( + type=MlvlPointGenerator, offset=0, strides=[8, 16, 32]), + bbox_coder=dict(type=DistancePointBBoxCoder), + loss_cls=dict( + type=QualityFocalLoss, use_sigmoid=True, beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type=GIoULoss, loss_weight=2.0), + with_objectness=False, + exp_on_reg=True, + share_conv=True, + pred_kernel_size=1, + norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + train_cfg=dict( + assigner=dict(type=DynamicSoftLabelAssigner, topk=13), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type=nms, iou_threshold=0.65), + max_per_img=300), +) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=PackDetInputs) +] + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] + +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(640, 640), keep_ratio=True), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader.update( + dict( + batch_size=32, + num_workers=10, + batch_sampler=None, + pin_memory=True, + dataset=dict(pipeline=train_pipeline))) +val_dataloader.update( + dict(batch_size=5, num_workers=10, dataset=dict(pipeline=test_pipeline))) +test_dataloader = val_dataloader + +max_epochs = 300 +stage2_num_epochs = 20 +base_lr = 0.004 +interval = 10 + +train_cfg.update( + dict( + max_epochs=max_epochs, + val_interval=interval, + dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)])) + +val_evaluator.update(dict(proposal_nums=(100, 1, 10))) +test_evaluator = val_evaluator + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=AdamW, lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type=LinearLR, start_factor=1.0e-5, by_epoch=False, begin=0, end=1000), + dict( + # use cosine lr from 150 to 300 
epoch + type=CosineAnnealingLR, + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks.update( + dict( + checkpoint=dict( + interval=interval, + max_keep_ckpts=3 # only keep latest 3 checkpoints + ))) + +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_m_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_m_8xb32_300e_coco.py new file mode 100644 index 0000000..e741d82 --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_m_8xb32_300e_coco.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +model.update( + dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + neck=dict( + in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + bbox_head=dict(in_channels=192, feat_channels=192))) diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py new file mode 100644 index 0000000..db21b74 --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.layers.ema import ExpMomentumEMA + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa +model.update( + dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.5, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), + bbox_head=dict(in_channels=128, feat_channels=128, exp_on_reg=False))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=PackDetInputs) +] + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) + +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_tiny_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_tiny_8xb32_300e_coco.py new file mode 100644 index 0000000..949d056 --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_tiny_8xb32_300e_coco.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_s_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +model.update( + dict( + backbone=dict( + deepen_factor=0.167, + widen_factor=0.375, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[96, 192, 384], out_channels=96, num_csp_blocks=1), + bbox_head=dict(in_channels=96, feat_channels=96, exp_on_reg=False))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=CachedMosaic, + img_scale=(640, 640), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=10, + random_pop=False, + pad_val=(114, 114, 114), + prob=0.5), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_tta.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_tta.py new file mode 100644 index 0000000..f27b7aa --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_tta.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import TestTimeAug + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import Pad, RandomFlip, Resize +from mmdet.models.test_time_augs.det_tta import DetTTAModel + +tta_model = dict( + type=DetTTAModel, + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.6), max_per_img=100)) + +img_scales = [(640, 640), (320, 320), (960, 960)] + +tta_pipeline = [ + dict(type=LoadImageFromFile, backend_args=None), + dict( + type=TestTimeAug, + transforms=[ + [dict(type=Resize, scale=s, keep_ratio=True) for s in img_scales], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type=RandomFlip, prob=1.), + dict(type=RandomFlip, prob=0.) 
+ ], + [ + dict( + type=Pad, + size=(960, 960), + pad_val=dict(img=(114, 114, 114))), + ], + [dict(type=LoadAnnotations, with_bbox=True)], + [ + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/mmdetection/mmdet/configs/rtmdet/rtmdet_x_8xb32_300e_coco.py b/mmdetection/mmdet/configs/rtmdet/rtmdet_x_8xb32_300e_coco.py new file mode 100644 index 0000000..04d67d0 --- /dev/null +++ b/mmdetection/mmdet/configs/rtmdet/rtmdet_x_8xb32_300e_coco.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +model.update( + dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + neck=dict( + in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4), + bbox_head=dict(in_channels=320, feat_channels=320))) diff --git a/mmdetection/mmdet/datasets/__init__.py b/mmdetection/mmdet/datasets/__init__.py new file mode 100644 index 0000000..044efe4 --- /dev/null +++ b/mmdetection/mmdet/datasets/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ade20k import (ADE20KInstanceDataset, ADE20KPanopticDataset, + ADE20KSegDataset) +from .base_det_dataset import BaseDetDataset +from .base_semseg_dataset import BaseSegDataset +from .base_video_dataset import BaseVideoDataset +from .cityscapes import CityscapesDataset +from .coco import CocoDataset +from .coco_caption import CocoCaptionDataset +from .coco_panoptic import CocoPanopticDataset +from .coco_semantic import CocoSegDataset +from .crowdhuman import CrowdHumanDataset +from .dataset_wrappers import ConcatDataset, MultiImageMixDataset +from .deepfashion import DeepFashionDataset +from .dsdl import DSDLDetDataset +from .isaid import iSAIDDataset +from .lvis import LVISDataset, LVISV1Dataset, LVISV05Dataset +from .mot_challenge_dataset import MOTChallengeDataset +from .objects365 import Objects365V1Dataset, Objects365V2Dataset +from .openimages import OpenImagesChallengeDataset, OpenImagesDataset +from .refcoco import RefCocoDataset +from .reid_dataset import ReIDDataset +from .samplers import (AspectRatioBatchSampler, ClassAwareSampler, + GroupMultiSourceSampler, MultiSourceSampler, + TrackAspectRatioBatchSampler, TrackImgSampler) +from .utils import get_loading_pipeline +from .v3det import V3DetDataset +from .voc import VOCDataset +from .wider_face import WIDERFaceDataset +from .xml_style import XMLDataset +from .youtube_vis_dataset import YouTubeVISDataset + +__all__ = [ + 'XMLDataset', 'CocoDataset', 'DeepFashionDataset', 'VOCDataset', + 'CityscapesDataset', 'LVISDataset', 'LVISV05Dataset', 'LVISV1Dataset', + 'WIDERFaceDataset', 'get_loading_pipeline', 'CocoPanopticDataset', + 'MultiImageMixDataset', 'OpenImagesDataset', 'OpenImagesChallengeDataset', + 'AspectRatioBatchSampler', 'ClassAwareSampler', 'MultiSourceSampler', + 'GroupMultiSourceSampler', 'BaseDetDataset', 'CrowdHumanDataset', + 'Objects365V1Dataset', 'Objects365V2Dataset', 'DSDLDetDataset', + 'BaseVideoDataset', 'MOTChallengeDataset', 'TrackImgSampler', + 'ReIDDataset', 'YouTubeVISDataset', 'TrackAspectRatioBatchSampler', + 'ADE20KPanopticDataset', 'CocoCaptionDataset', 'RefCocoDataset', + 'BaseSegDataset', 'ADE20KSegDataset', 
'CocoSegDataset', + 'ADE20KInstanceDataset', 'iSAIDDataset', 'V3DetDataset', 'ConcatDataset' +] diff --git a/mmdetection/mmdet/datasets/ade20k.py b/mmdetection/mmdet/datasets/ade20k.py new file mode 100644 index 0000000..573271c --- /dev/null +++ b/mmdetection/mmdet/datasets/ade20k.py @@ -0,0 +1,260 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List + +from mmengine import fileio + +from mmdet.registry import DATASETS +from .base_semseg_dataset import BaseSegDataset +from .coco import CocoDataset +from .coco_panoptic import CocoPanopticDataset + +ADE_PALETTE = [(120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50), + (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255), + (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7), + (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82), + (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3), + (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255), + (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220), + (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224), + (255, 184, 6), (10, 255, 71), (255, 41, 10), (7, 255, 255), + (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), + (255, 122, 8), (0, 255, 20), (255, 8, 41), (255, 5, 153), + (6, 51, 255), (235, 12, 255), (160, 150, 20), (0, 163, 255), + (140, 140, 140), (250, 10, 15), (20, 255, 0), (31, 255, 0), + (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255), + (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255), + (11, 200, 200), (255, 82, 0), (0, 255, 245), (0, 61, 255), + (0, 255, 112), (0, 255, 133), (255, 0, 0), (255, 163, 0), + (255, 102, 0), (194, 255, 0), (0, 143, 255), (51, 255, 0), + (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255), + (173, 255, 0), (0, 255, 153), (255, 92, 0), (255, 0, 255), + (255, 0, 245), (255, 0, 102), (255, 173, 0), (255, 0, 20), + (255, 184, 184), (0, 31, 255), (0, 255, 61), (0, 71, 255), + (255, 0, 204), (0, 255, 194), (0, 255, 82), (0, 10, 255), + (0, 112, 255), (51, 0, 255), (0, 194, 255), (0, 122, 255), + (0, 255, 163), (255, 153, 0), (0, 255, 10), (255, 112, 0), + (143, 255, 0), (82, 0, 255), (163, 255, 0), (255, 235, 0), + (8, 184, 170), (133, 0, 255), (0, 255, 92), (184, 0, 255), + (255, 0, 31), (0, 184, 255), (0, 214, 255), (255, 0, 112), + (92, 255, 0), (0, 224, 255), (112, 224, 255), (70, 184, 160), + (163, 0, 255), (153, 0, 255), (71, 255, 0), (255, 0, 163), + (255, 204, 0), (255, 0, 143), (0, 255, 235), (133, 255, 0), + (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, 0), + (10, 190, 212), (214, 255, 0), (0, 204, 255), (20, 0, 255), + (255, 255, 0), (0, 153, 255), (0, 41, 255), (0, 255, 204), + (41, 0, 255), (41, 255, 0), (173, 0, 255), (0, 245, 255), + (71, 0, 255), (122, 0, 255), (0, 255, 184), (0, 92, 255), + (184, 255, 0), (0, 133, 255), (255, 214, 0), (25, 194, 194), + (102, 255, 0), (92, 0, 255)] + + +@DATASETS.register_module() +class ADE20KPanopticDataset(CocoPanopticDataset): + METAINFO = { + 'classes': + ('bed', 'window', 'cabinet', 'person', 'door', 'table', 'curtain', + 'chair', 'car', 'painting, picture', 'sofa', 'shelf', 'mirror', + 'armchair', 'seat', 'fence', 'desk', 'wardrobe, closet, press', + 'lamp', 'tub', 'rail', 'cushion', 'box', 'column, pillar', + 'signboard, sign', 'chest of drawers, chest, bureau, dresser', + 'counter', 'sink', 'fireplace', 'refrigerator, icebox', 'stairs', + 'case, display case, showcase, vitrine', + 'pool table, billiard table, snooker table', 'pillow', + 'screen door, screen', 
'bookcase', 'coffee table', + 'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower', + 'book', 'bench', 'countertop', 'stove', 'palm, palm tree', + 'kitchen island', 'computer', 'swivel chair', 'boat', + 'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier', + 'awning, sunshade, sunblind', 'street lamp', 'booth', 'tv', + 'airplane', 'clothes', 'pole', + 'bannister, banister, balustrade, balusters, handrail', + 'ottoman, pouf, pouffe, puff, hassock', 'bottle', 'van', 'ship', + 'fountain', 'washer, automatic washer, washing machine', + 'plaything, toy', 'stool', 'barrel, cask', 'basket, handbasket', + 'bag', 'minibike, motorbike', 'oven', 'ball', 'food, solid food', + 'step, stair', 'trade name', 'microwave', 'pot', 'animal', 'bicycle', + 'dishwasher', 'screen', 'sculpture', 'hood, exhaust hood', 'sconce', + 'vase', 'traffic light', 'tray', 'trash can', 'fan', 'plate', + 'monitor', 'bulletin board', 'radiator', 'glass, drinking glass', + 'clock', 'flag', 'wall', 'building', 'sky', 'floor', 'tree', + 'ceiling', 'road, route', 'grass', 'sidewalk, pavement', + 'earth, ground', 'mountain, mount', 'plant', 'water', 'house', 'sea', + 'rug', 'field', 'rock, stone', 'base, pedestal, stand', 'sand', + 'skyscraper', 'grandstand, covered stand', 'path', 'runway', + 'stairway, staircase', 'river', 'bridge, span', 'blind, screen', + 'hill', 'bar', 'hovel, hut, hutch, shack, shanty', 'tower', + 'dirt track', 'land, ground, soil', + 'escalator, moving staircase, moving stairway', + 'buffet, counter, sideboard', + 'poster, posting, placard, notice, bill, card', 'stage', + 'conveyer belt, conveyor belt, conveyer, conveyor, transporter', + 'canopy', 'pool', 'falls', 'tent', 'cradle', 'tank, storage tank', + 'lake', 'blanket, cover', 'pier', 'crt screen', 'shower'), + 'thing_classes': + ('bed', 'window', 'cabinet', 'person', 'door', 'table', 'curtain', + 'chair', 'car', 'painting, picture', 'sofa', 'shelf', 'mirror', + 'armchair', 'seat', 'fence', 'desk', 'wardrobe, closet, press', + 'lamp', 'tub', 'rail', 'cushion', 'box', 'column, pillar', + 'signboard, sign', 'chest of drawers, chest, bureau, dresser', + 'counter', 'sink', 'fireplace', 'refrigerator, icebox', 'stairs', + 'case, display case, showcase, vitrine', + 'pool table, billiard table, snooker table', 'pillow', + 'screen door, screen', 'bookcase', 'coffee table', + 'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower', + 'book', 'bench', 'countertop', 'stove', 'palm, palm tree', + 'kitchen island', 'computer', 'swivel chair', 'boat', + 'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier', + 'awning, sunshade, sunblind', 'street lamp', 'booth', 'tv', + 'airplane', 'clothes', 'pole', + 'bannister, banister, balustrade, balusters, handrail', + 'ottoman, pouf, pouffe, puff, hassock', 'bottle', 'van', 'ship', + 'fountain', 'washer, automatic washer, washing machine', + 'plaything, toy', 'stool', 'barrel, cask', 'basket, handbasket', + 'bag', 'minibike, motorbike', 'oven', 'ball', 'food, solid food', + 'step, stair', 'trade name', 'microwave', 'pot', 'animal', 'bicycle', + 'dishwasher', 'screen', 'sculpture', 'hood, exhaust hood', 'sconce', + 'vase', 'traffic light', 'tray', 'trash can', 'fan', 'plate', + 'monitor', 'bulletin board', 'radiator', 'glass, drinking glass', + 'clock', 'flag'), + 'stuff_classes': + ('wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road, route', + 'grass', 'sidewalk, pavement', 'earth, ground', 'mountain, mount', + 'plant', 'water', 'house', 'sea', 'rug', 'field', 'rock, 
stone', + 'base, pedestal, stand', 'sand', 'skyscraper', + 'grandstand, covered stand', 'path', 'runway', 'stairway, staircase', + 'river', 'bridge, span', 'blind, screen', 'hill', 'bar', + 'hovel, hut, hutch, shack, shanty', 'tower', 'dirt track', + 'land, ground, soil', 'escalator, moving staircase, moving stairway', + 'buffet, counter, sideboard', + 'poster, posting, placard, notice, bill, card', 'stage', + 'conveyer belt, conveyor belt, conveyer, conveyor, transporter', + 'canopy', 'pool', 'falls', 'tent', 'cradle', 'tank, storage tank', + 'lake', 'blanket, cover', 'pier', 'crt screen', 'shower'), + 'palette': + ADE_PALETTE + } + + +@DATASETS.register_module() +class ADE20KInstanceDataset(CocoDataset): + METAINFO = { + 'classes': + ('bed', 'windowpane', 'cabinet', 'person', 'door', 'table', 'curtain', + 'chair', 'car', 'painting', 'sofa', 'shelf', 'mirror', 'armchair', + 'seat', 'fence', 'desk', 'wardrobe', 'lamp', 'bathtub', 'railing', + 'cushion', 'box', 'column', 'signboard', 'chest of drawers', + 'counter', 'sink', 'fireplace', 'refrigerator', 'stairs', 'case', + 'pool table', 'pillow', 'screen door', 'bookcase', 'coffee table', + 'toilet', 'flower', 'book', 'bench', 'countertop', 'stove', 'palm', + 'kitchen island', 'computer', 'swivel chair', 'boat', + 'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier', + 'awning', 'streetlight', 'booth', 'television receiver', 'airplane', + 'apparel', 'pole', 'bannister', 'ottoman', 'bottle', 'van', 'ship', + 'fountain', 'washer', 'plaything', 'stool', 'barrel', 'basket', 'bag', + 'minibike', 'oven', 'ball', 'food', 'step', 'trade name', 'microwave', + 'pot', 'animal', 'bicycle', 'dishwasher', 'screen', 'sculpture', + 'hood', 'sconce', 'vase', 'traffic light', 'tray', 'ashcan', 'fan', + 'plate', 'monitor', 'bulletin board', 'radiator', 'glass', 'clock', + 'flag'), + 'palette': [(204, 5, 255), (230, 230, 230), (224, 5, 255), + (150, 5, 61), (8, 255, 51), (255, 6, 82), (255, 51, 7), + (204, 70, 3), (0, 102, 200), (255, 6, 51), (11, 102, 255), + (255, 7, 71), (220, 220, 220), (8, 255, 214), + (7, 255, 224), (255, 184, 6), (10, 255, 71), (7, 255, 255), + (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), + (0, 255, 20), (255, 8, 41), (255, 5, 153), (6, 51, 255), + (235, 12, 255), (0, 163, 255), (250, 10, 15), (20, 255, 0), + (255, 224, 0), (0, 0, 255), (255, 71, 0), (0, 235, 255), + (0, 173, 255), (0, 255, 245), (0, 255, 112), (0, 255, 133), + (255, 0, 0), (255, 163, 0), (194, 255, 0), (0, 143, 255), + (51, 255, 0), (0, 82, 255), (0, 255, 41), (0, 255, 173), + (10, 0, 255), (173, 255, 0), (255, 92, 0), (255, 0, 245), + (255, 0, 102), (255, 173, 0), (255, 0, 20), (0, 31, 255), + (0, 255, 61), (0, 71, 255), (255, 0, 204), (0, 255, 194), + (0, 255, 82), (0, 112, 255), (51, 0, 255), (0, 122, 255), + (255, 153, 0), (0, 255, 10), (163, 255, 0), (255, 235, 0), + (8, 184, 170), (184, 0, 255), (255, 0, 31), (0, 214, 255), + (255, 0, 112), (92, 255, 0), (70, 184, 160), (163, 0, 255), + (71, 255, 0), (255, 0, 163), (255, 204, 0), (255, 0, 143), + (133, 255, 0), (255, 0, 235), (245, 0, 255), (255, 0, 122), + (255, 245, 0), (214, 255, 0), (0, 204, 255), (255, 255, 0), + (0, 153, 255), (0, 41, 255), (0, 255, 204), (41, 0, 255), + (41, 255, 0), (173, 0, 255), (0, 245, 255), (0, 255, 184), + (0, 92, 255), (184, 255, 0), (255, 214, 0), (25, 194, 194), + (102, 255, 0), (92, 0, 255)], + } + + +@DATASETS.register_module() +class ADE20KSegDataset(BaseSegDataset): + """ADE20K dataset. 
+ + In segmentation map annotation for ADE20K, 0 stands for background, which + is not included in 150 categories. The ``img_suffix`` is fixed to '.jpg', + and ``seg_map_suffix`` is fixed to '.png'. + """ + METAINFO = dict( + classes=('wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', + 'bed ', 'windowpane', 'grass', 'cabinet', 'sidewalk', + 'person', 'earth', 'door', 'table', 'mountain', 'plant', + 'curtain', 'chair', 'car', 'water', 'painting', 'sofa', + 'shelf', 'house', 'sea', 'mirror', 'rug', 'field', 'armchair', + 'seat', 'fence', 'desk', 'rock', 'wardrobe', 'lamp', + 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', + 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', + 'path', 'stairs', 'runway', 'case', 'pool table', 'pillow', + 'screen door', 'stairway', 'river', 'bridge', 'bookcase', + 'blind', 'coffee table', 'toilet', 'flower', 'book', 'hill', + 'bench', 'countertop', 'stove', 'palm', 'kitchen island', + 'computer', 'swivel chair', 'boat', 'bar', 'arcade machine', + 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', + 'chandelier', 'awning', 'streetlight', 'booth', + 'television receiver', 'airplane', 'dirt track', 'apparel', + 'pole', 'land', 'bannister', 'escalator', 'ottoman', 'bottle', + 'buffet', 'poster', 'stage', 'van', 'ship', 'fountain', + 'conveyer belt', 'canopy', 'washer', 'plaything', + 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', + 'tent', 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', + 'step', 'tank', 'trade name', 'microwave', 'pot', 'animal', + 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket', + 'sculpture', 'hood', 'sconce', 'vase', 'traffic light', + 'tray', 'ashcan', 'fan', 'pier', 'crt screen', 'plate', + 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', + 'clock', 'flag'), + palette=ADE_PALETTE) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + return_classes=False, + **kwargs) -> None: + self.return_classes = return_classes + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation from directory or annotation file. + + Returns: + List[dict]: All data info of dataset. + """ + data_list = [] + img_dir = self.data_prefix.get('img_path', None) + ann_dir = self.data_prefix.get('seg_map_path', None) + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + data_info = dict(img_path=osp.join(img_dir, img)) + if ann_dir is not None: + seg_map = img.replace(self.img_suffix, self.seg_map_suffix) + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + if self.return_classes: + data_info['text'] = list(self._metainfo['classes']) + data_list.append(data_info) + return data_list diff --git a/mmdetection/mmdet/datasets/api_wrappers/__init__.py b/mmdetection/mmdet/datasets/api_wrappers/__init__.py new file mode 100644 index 0000000..8e3c41a --- /dev/null +++ b/mmdetection/mmdet/datasets/api_wrappers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
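[For reference, a rough sketch of what `ADE20KSegDataset.load_data_list` above produces: one entry per image, with a segmentation-map path attached when an annotation directory is configured. The helper name and directory arguments below are placeholders for illustration only, and the simple `os.listdir` call stands in for the recursive `fileio.list_dir_or_file` walk used in the real implementation:

    import os
    import os.path as osp

    def pair_ade20k_samples(img_dir, ann_dir=None,
                            img_suffix='.jpg', seg_map_suffix='.png'):
        # Mirrors load_data_list: every image gets an entry, and the matching
        # '.png' seg map path is added when ann_dir is provided.
        data_list = []
        for img in sorted(os.listdir(img_dir)):  # simplified, non-recursive listing
            if not img.endswith(img_suffix):
                continue
            info = dict(img_path=osp.join(img_dir, img))
            if ann_dir is not None:
                info['seg_map_path'] = osp.join(
                    ann_dir, img.replace(img_suffix, seg_map_suffix))
            data_list.append(info)
        return data_list
]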
+from .coco_api import COCO, COCOeval, COCOPanoptic +from .cocoeval_mp import COCOevalMP + +__all__ = ['COCO', 'COCOeval', 'COCOPanoptic', 'COCOevalMP'] diff --git a/mmdetection/mmdet/datasets/api_wrappers/coco_api.py b/mmdetection/mmdet/datasets/api_wrappers/coco_api.py new file mode 100644 index 0000000..40f7f2c --- /dev/null +++ b/mmdetection/mmdet/datasets/api_wrappers/coco_api.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This file add snake case alias for coco api + +import warnings +from collections import defaultdict +from typing import List, Optional, Union + +import pycocotools +from pycocotools.coco import COCO as _COCO +from pycocotools.cocoeval import COCOeval as _COCOeval + + +class COCO(_COCO): + """This class is almost the same as official pycocotools package. + + It implements some snake case function aliases. So that the COCO class has + the same interface as LVIS class. + """ + + def __init__(self, annotation_file=None): + if getattr(pycocotools, '__version__', '0') >= '12.0.2': + warnings.warn( + 'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"', # noqa: E501 + UserWarning) + super().__init__(annotation_file=annotation_file) + self.img_ann_map = self.imgToAnns + self.cat_img_map = self.catToImgs + + def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None): + return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd) + + def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]): + return self.getCatIds(cat_names, sup_names, cat_ids) + + def get_img_ids(self, img_ids=[], cat_ids=[]): + return self.getImgIds(img_ids, cat_ids) + + def load_anns(self, ids): + return self.loadAnns(ids) + + def load_cats(self, ids): + return self.loadCats(ids) + + def load_imgs(self, ids): + return self.loadImgs(ids) + + +# just for the ease of import +COCOeval = _COCOeval + + +class COCOPanoptic(COCO): + """This wrapper is for loading the panoptic style annotation file. + + The format is shown in the CocoPanopticDataset class. + + Args: + annotation_file (str, optional): Path of annotation file. + Defaults to None. + """ + + def __init__(self, annotation_file: Optional[str] = None) -> None: + super(COCOPanoptic, self).__init__(annotation_file) + + def createIndex(self) -> None: + """Create index.""" + # create index + print('creating index...') + # anns stores 'segment_id -> annotation' + anns, cats, imgs = {}, {}, {} + img_to_anns, cat_to_imgs = defaultdict(list), defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + for seg_ann in ann['segments_info']: + # to match with instance.json + seg_ann['image_id'] = ann['image_id'] + img_to_anns[ann['image_id']].append(seg_ann) + # segment_id is not unique in coco dataset orz... 
+ # annotations from different images but + # may have same segment_id + if seg_ann['id'] in anns.keys(): + anns[seg_ann['id']].append(seg_ann) + else: + anns[seg_ann['id']] = [seg_ann] + + # filter out annotations from other images + img_to_anns_ = defaultdict(list) + for k, v in img_to_anns.items(): + img_to_anns_[k] = [x for x in v if x['image_id'] == k] + img_to_anns = img_to_anns_ + + if 'images' in self.dataset: + for img_info in self.dataset['images']: + img_info['segm_file'] = img_info['file_name'].replace( + 'jpg', 'png') + imgs[img_info['id']] = img_info + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + for seg_ann in ann['segments_info']: + cat_to_imgs[seg_ann['category_id']].append(ann['image_id']) + + print('index created!') + + self.anns = anns + self.imgToAnns = img_to_anns + self.catToImgs = cat_to_imgs + self.imgs = imgs + self.cats = cats + + def load_anns(self, + ids: Union[List[int], int] = []) -> Optional[List[dict]]: + """Load anns with the specified ids. + + ``self.anns`` is a list of annotation lists instead of a + list of annotations. + + Args: + ids (Union[List[int], int]): Integer ids specifying anns. + + Returns: + anns (List[dict], optional): Loaded ann objects. + """ + anns = [] + + if hasattr(ids, '__iter__') and hasattr(ids, '__len__'): + # self.anns is a list of annotation lists instead of + # a list of annotations + for id in ids: + anns += self.anns[id] + return anns + elif type(ids) == int: + return self.anns[ids] diff --git a/mmdetection/mmdet/datasets/api_wrappers/cocoeval_mp.py b/mmdetection/mmdet/datasets/api_wrappers/cocoeval_mp.py new file mode 100644 index 0000000..b3673ea --- /dev/null +++ b/mmdetection/mmdet/datasets/api_wrappers/cocoeval_mp.py @@ -0,0 +1,296 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
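Editor's note: the COCO wrapper defined in coco_api.py above only adds snake_case aliases on top of pycocotools, so it can be used like the upstream class. A short usage sketch, assuming pycocotools is installed and using a hypothetical annotation path:

    from mmdet.datasets.api_wrappers import COCO

    coco = COCO('data/coco/annotations/instances_val2017.json')  # hypothetical path
    img_ids = coco.get_img_ids()                    # alias of getImgIds
    ann_ids = coco.get_ann_ids(img_ids=img_ids[:1]) # alias of getAnnIds
    anns = coco.load_anns(ann_ids)                  # alias of loadAnns
    cat_ids = coco.get_cat_ids(cat_names=['person'])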
+import copy +import itertools +import time +from collections import defaultdict + +import numpy as np +import torch.multiprocessing as mp +from mmengine.logging import MMLogger +from pycocotools.cocoeval import COCOeval +from tqdm import tqdm + + +class COCOevalMP(COCOeval): + + def _prepare(self): + ''' + Prepare ._gts and ._dts for evaluation based on params + :return: None + ''' + + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + rle = coco.annToRLE(ann) + ann['segmentation'] = rle + + p = self.params + if p.useCats: + gts = [] + dts = [] + img_ids = set(p.imgIds) + cat_ids = set(p.catIds) + for gt in self.cocoGt.dataset['annotations']: + if (gt['category_id'] in cat_ids) and (gt['image_id'] + in img_ids): + gts.append(gt) + for dt in self.cocoDt.dataset['annotations']: + if (dt['category_id'] in cat_ids) and (dt['image_id'] + in img_ids): + dts.append(dt) + # gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # noqa + # dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # noqa + # gts=self.cocoGt.dataset['annotations'] + # dts=self.cocoDt.dataset['annotations'] + else: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + + # convert ground truth to mask if iouType == 'segm' + if p.iouType == 'segm': + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + # set ignore flag + for gt in gts: + gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 + gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] + if p.iouType == 'keypoints': + gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + for gt in gts: + self._gts[gt['image_id'], gt['category_id']].append(gt) + for dt in dts: + self._dts[dt['image_id'], dt['category_id']].append(dt) + self.evalImgs = defaultdict( + list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + """Run per image evaluation on given images and store results (a list + of dict) in self.evalImgs. + + :return: None + """ + tic = time.time() + print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'. 
+ format(p.iouType)) + print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + nproc = 8 + split_size = len(catIds) // nproc + mp_params = [] + for i in range(nproc): + begin = i * split_size + end = (i + 1) * split_size + if i == nproc - 1: + end = len(catIds) + mp_params.append((catIds[begin:end], )) + + MMLogger.get_current_instance().info( + 'start multi processing evaluation ...') + with mp.Pool(nproc) as pool: + self.evalImgs = pool.starmap(self._evaluateImg, mp_params) + + self.evalImgs = list(itertools.chain(*self.evalImgs)) + + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc - tic)) + + def _evaluateImg(self, catids_chunk): + self._prepare() + p = self.params + maxDet = max(p.maxDets) + all_params = [] + for catId in catids_chunk: + for areaRng in p.areaRng: + for imgId in p.imgIds: + all_params.append((catId, areaRng, imgId)) + evalImgs = [ + self.evaluateImg(imgId, catId, areaRng, maxDet) + for catId, areaRng, imgId in tqdm(all_params) + ] + return evalImgs + + def evaluateImg(self, imgId, catId, aRng, maxDet): + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return None + + for g in gt: + if g['ignore'] or (g['area'] < aRng[0] or g['area'] > aRng[1]): + g['_ignore'] = 1 + else: + g['_ignore'] = 0 + + # sort dt highest score first, sort gt ignore last + gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in dtind[0:maxDet]] + iscrowd = [int(o['iscrowd']) for o in gt] + # load computed ious + # ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] # noqa + ious = self.computeIoU(imgId, catId) + ious = ious[:, gtind] if len(ious) > 0 else ious + + T = len(p.iouThrs) + G = len(gt) + D = len(dt) + gtm = np.zeros((T, G)) + dtm = np.zeros((T, D)) + gtIg = np.array([g['_ignore'] for g in gt]) + dtIg = np.zeros((T, D)) + if not len(ious) == 0: + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t, 1 - 1e-10]) + m = -1 + for gind, g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1: + break + # continue to next gt unless better match made + if ious[dind, gind] < iou: + continue + # if match successful and best so far, + # store appropriately + iou = ious[dind, gind] + m = gind + # if match made store id of match for both dt and gt + if m == -1: + continue + dtIg[tind, dind] = gtIg[m] + dtm[tind, dind] = gt[m]['id'] + gtm[tind, m] = d['id'] + # set unmatched detections outside of area range to ignore + a = np.array([d['area'] < aRng[0] or d['area'] > aRng[1] + for d in dt]).reshape((1, len(dt))) + dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, + 0))) + # store results for given image and category + + 
return { + 'image_id': imgId, + 'category_id': catId, + 'aRng': aRng, + 'maxDet': maxDet, + 'dtIds': [d['id'] for d in dt], + 'gtIds': [g['id'] for g in gt], + 'dtMatches': dtm, + 'gtMatches': gtm, + 'dtScores': [d['score'] for d in dt], + 'gtIgnore': gtIg, + 'dtIgnore': dtIg, + } + + def summarize(self): + """Compute and display summary metrics for evaluation results. + + Note this function can *only* be applied on the default parameter + setting + """ + + def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100): + p = self.params + iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' # noqa + titleStr = 'Average Precision' if ap == 1 else 'Average Recall' + typeStr = '(AP)' if ap == 1 else '(AR)' + iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ + if iouThr is None else '{:0.2f}'.format(iouThr) + + aind = [ + i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng + ] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval['precision'] + # IoU + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, :, aind, mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval['recall'] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + if len(s[s > -1]) == 0: + mean_s = -1 + else: + mean_s = np.mean(s[s > -1]) + print( + iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, + mean_s)) + return mean_s + + def _summarizeDets(): + stats = [] + stats.append(_summarize(1, maxDets=self.params.maxDets[-1])) + stats.append( + _summarize(1, iouThr=.5, maxDets=self.params.maxDets[-1])) + stats.append( + _summarize(1, iouThr=.75, maxDets=self.params.maxDets[-1])) + for area_rng in ('small', 'medium', 'large'): + stats.append( + _summarize( + 1, areaRng=area_rng, maxDets=self.params.maxDets[-1])) + for max_det in self.params.maxDets: + stats.append(_summarize(0, maxDets=max_det)) + for area_rng in ('small', 'medium', 'large'): + stats.append( + _summarize( + 0, areaRng=area_rng, maxDets=self.params.maxDets[-1])) + stats = np.array(stats) + return stats + + def _summarizeKps(): + stats = np.zeros((10, )) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=.5) + stats[2] = _summarize(1, maxDets=20, iouThr=.75) + stats[3] = _summarize(1, maxDets=20, areaRng='medium') + stats[4] = _summarize(1, maxDets=20, areaRng='large') + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=.5) + stats[7] = _summarize(0, maxDets=20, iouThr=.75) + stats[8] = _summarize(0, maxDets=20, areaRng='medium') + stats[9] = _summarize(0, maxDets=20, areaRng='large') + return stats + + if not self.eval: + raise Exception('Please run accumulate() first') + iouType = self.params.iouType + if iouType == 'segm' or iouType == 'bbox': + summarize = _summarizeDets + elif iouType == 'keypoints': + summarize = _summarizeKps + self.stats = summarize() diff --git a/mmdetection/mmdet/datasets/base_det_dataset.py b/mmdetection/mmdet/datasets/base_det_dataset.py new file mode 100644 index 0000000..57bc709 --- /dev/null +++ b/mmdetection/mmdet/datasets/base_det_dataset.py @@ -0,0 +1,124 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
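Editor's note: COCOevalMP above keeps the pycocotools COCOeval interface and only changes evaluate() to split the category list across a multiprocessing pool. A minimal usage sketch with hypothetical ground-truth and result files:

    from mmdet.datasets.api_wrappers import COCO, COCOevalMP

    coco_gt = COCO('data/coco/annotations/instances_val2017.json')  # hypothetical
    coco_dt = coco_gt.loadRes('work_dirs/results.bbox.json')        # hypothetical
    coco_eval = COCOevalMP(coco_gt, coco_dt, iouType='bbox')
    coco_eval.params.imgIds = coco_gt.get_img_ids()
    coco_eval.evaluate()    # categories are distributed over worker processes
    coco_eval.accumulate()
    coco_eval.summarize()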
+import os.path as osp +from typing import List, Optional + +from mmengine.dataset import BaseDataset +from mmengine.fileio import load +from mmengine.utils import is_abs + +from ..registry import DATASETS + + +@DATASETS.register_module() +class BaseDetDataset(BaseDataset): + """Base dataset for detection. + + Args: + proposal_file (str, optional): Proposals file path. Defaults to None. + file_client_args (dict): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + return_classes (bool): Whether to return class information + for open vocabulary-based algorithms. Defaults to False. + """ + + def __init__(self, + *args, + seg_map_suffix: str = '.png', + proposal_file: Optional[str] = None, + file_client_args: dict = None, + backend_args: dict = None, + return_classes: bool = False, + **kwargs) -> None: + self.seg_map_suffix = seg_map_suffix + self.proposal_file = proposal_file + self.backend_args = backend_args + self.return_classes = return_classes + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + super().__init__(*args, **kwargs) + + def full_init(self) -> None: + """Load annotation file and set ``BaseDataset._fully_initialized`` to + True. + + If ``lazy_init=False``, ``full_init`` will be called during the + instantiation and ``self._fully_initialized`` will be set to True. If + ``obj._fully_initialized=False``, the class method decorated by + ``force_full_init`` will call ``full_init`` automatically. + + Several steps to initialize annotation: + + - load_data_list: Load annotations from annotation file. + - load_proposals: Load proposals from proposal file, if + `self.proposal_file` is not None. + - filter data information: Filter annotations according to + filter_cfg. + - slice_data: Slice dataset according to ``self._indices`` + - serialize_data: Serialize ``self.data_list`` if + ``self.serialize_data`` is True. + """ + if self._fully_initialized: + return + # load data information + self.data_list = self.load_data_list() + # get proposals from file + if self.proposal_file is not None: + self.load_proposals() + # filter illegal data, such as data that has no annotations. + self.data_list = self.filter_data() + + # Get subset data according to indices. + if self._indices is not None: + self.data_list = self._get_unserialized_subset(self._indices) + + # serialize data_list + if self.serialize_data: + self.data_bytes, self.data_address = self._serialize_data() + + self._fully_initialized = True + + def load_proposals(self) -> None: + """Load proposals from proposals file. + + The `proposals_list` should be a dict[img_path: proposals] + with the same length as `data_list`. And the `proposals` should be + a `dict` or :obj:`InstanceData` usually contains following keys. + + - bboxes (np.ndarry): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - scores (np.ndarry): Classification scores, has a shape + (num_instance, ). 
+ """ + # TODO: Add Unit Test after fully support Dump-Proposal Metric + if not is_abs(self.proposal_file): + self.proposal_file = osp.join(self.data_root, self.proposal_file) + proposals_list = load( + self.proposal_file, backend_args=self.backend_args) + assert len(self.data_list) == len(proposals_list) + for data_info in self.data_list: + img_path = data_info['img_path'] + # `file_name` is the key to obtain the proposals from the + # `proposals_list`. + file_name = osp.join( + osp.split(osp.split(img_path)[0])[-1], + osp.split(img_path)[-1]) + proposals = proposals_list[file_name] + data_info['proposals'] = proposals + + def get_cat_ids(self, idx: int) -> List[int]: + """Get COCO category ids by index. + + Args: + idx (int): Index of data. + + Returns: + List[int]: All categories in the image of specified index. + """ + instances = self.get_data_info(idx)['instances'] + return [instance['bbox_label'] for instance in instances] diff --git a/mmdetection/mmdet/datasets/base_semseg_dataset.py b/mmdetection/mmdet/datasets/base_semseg_dataset.py new file mode 100644 index 0000000..d10f762 --- /dev/null +++ b/mmdetection/mmdet/datasets/base_semseg_dataset.py @@ -0,0 +1,265 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from typing import Callable, Dict, List, Optional, Sequence, Union + +import mmengine +import mmengine.fileio as fileio +import numpy as np +from mmengine.dataset import BaseDataset, Compose + +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class BaseSegDataset(BaseDataset): + """Custom dataset for semantic segmentation. An example of file structure + is as followed. + + .. code-block:: none + + ├── data + │ ├── my_dataset + │ │ ├── img_dir + │ │ │ ├── train + │ │ │ │ ├── xxx{img_suffix} + │ │ │ │ ├── yyy{img_suffix} + │ │ │ │ ├── zzz{img_suffix} + │ │ │ ├── val + │ │ ├── ann_dir + │ │ │ ├── train + │ │ │ │ ├── xxx{seg_map_suffix} + │ │ │ │ ├── yyy{seg_map_suffix} + │ │ │ │ ├── zzz{seg_map_suffix} + │ │ │ ├── val + + The img/gt_semantic_seg pair of BaseSegDataset should be of the same + except suffix. A valid img/gt_semantic_seg filename pair should be like + ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (extension is also included + in the suffix). If split is given, then ``xxx`` is specified in txt file. + Otherwise, all files in ``img_dir/``and ``ann_dir`` will be loaded. + Please refer to ``docs/en/tutorials/new_dataset.md`` for more details. + + + Args: + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as + specify classes to load. Defaults to None. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to None. + data_prefix (dict, optional): Prefix for training data. Defaults to + dict(img_path=None, seg_map_path=None). + img_suffix (str): Suffix of images. Default: '.jpg' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + filter_cfg (dict, optional): Config for filter data. Defaults to None. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Defaults to None which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. Defaults + to True. + pipeline (list, optional): Processing pipeline. Defaults to []. 
+ test_mode (bool, optional): ``test_mode=True`` means in test phase. + Defaults to False. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=True``. Defaults to False. + use_label_map (bool, optional): Whether to use label map. + Defaults to False. + max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Defaults to 1000. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4 required. + """ + METAINFO: dict = dict() + + def __init__(self, + ann_file: str = '', + img_suffix='.jpg', + seg_map_suffix='.png', + metainfo: Optional[dict] = None, + data_root: Optional[str] = None, + data_prefix: dict = dict(img_path='', seg_map_path=''), + filter_cfg: Optional[dict] = None, + indices: Optional[Union[int, Sequence[int]]] = None, + serialize_data: bool = True, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + lazy_init: bool = False, + use_label_map: bool = False, + max_refetch: int = 1000, + backend_args: Optional[dict] = None) -> None: + + self.img_suffix = img_suffix + self.seg_map_suffix = seg_map_suffix + self.backend_args = backend_args.copy() if backend_args else None + + self.data_root = data_root + self.data_prefix = copy.copy(data_prefix) + self.ann_file = ann_file + self.filter_cfg = copy.deepcopy(filter_cfg) + self._indices = indices + self.serialize_data = serialize_data + self.test_mode = test_mode + self.max_refetch = max_refetch + self.data_list: List[dict] = [] + self.data_bytes: np.ndarray + + # Set meta information. + self._metainfo = self._load_metainfo(copy.deepcopy(metainfo)) + + # Get label map for custom classes + new_classes = self._metainfo.get('classes', None) + self.label_map = self.get_label_map( + new_classes) if use_label_map else None + self._metainfo.update(dict(label_map=self.label_map)) + + # Update palette based on label map or generate palette + # if it is not defined + updated_palette = self._update_palette() + self._metainfo.update(dict(palette=updated_palette)) + + # Join paths. + if self.data_root is not None: + self._join_prefix() + + # Build pipeline. + self.pipeline = Compose(pipeline) + # Full initialize the dataset. + if not lazy_init: + self.full_init() + + if test_mode: + assert self._metainfo.get('classes') is not None, \ + 'dataset metainfo `classes` should be specified when testing' + + @classmethod + def get_label_map(cls, + new_classes: Optional[Sequence] = None + ) -> Union[Dict, None]: + """Require label mapping. + + The ``label_map`` is a dictionary, its keys are the old label ids and + its values are the new label ids, and is used for changing pixel + labels in load_annotations. If and only if old classes in cls.METAINFO + is not equal to new classes in self._metainfo and nether of them is not + None, `label_map` is not None. + + Args: + new_classes (list, tuple, optional): The new classes name from + metainfo. Default to None. 
+ + + Returns: + dict, optional: The mapping from old classes in cls.METAINFO to + new classes in self._metainfo + """ + old_classes = cls.METAINFO.get('classes', None) + if (new_classes is not None and old_classes is not None + and list(new_classes) != list(old_classes)): + + label_map = {} + if not set(new_classes).issubset(cls.METAINFO['classes']): + raise ValueError( + f'new classes {new_classes} is not a ' + f'subset of classes {old_classes} in METAINFO.') + for i, c in enumerate(old_classes): + if c not in new_classes: + # 0 is background + label_map[i] = 0 + else: + label_map[i] = new_classes.index(c) + return label_map + else: + return None + + def _update_palette(self) -> list: + """Update palette after loading metainfo. + + If length of palette is equal to classes, just return the palette. + If palette is not defined, it will randomly generate a palette. + If classes is updated by customer, it will return the subset of + palette. + + Returns: + Sequence: Palette for current dataset. + """ + palette = self._metainfo.get('palette', []) + classes = self._metainfo.get('classes', []) + # palette does match classes + if len(palette) == len(classes): + return palette + + if len(palette) == 0: + # Get random state before set seed, and restore + # random state later. + # It will prevent loss of randomness, as the palette + # may be different in each iteration if not specified. + # See: https://github.com/open-mmlab/mmdetection/issues/5844 + state = np.random.get_state() + np.random.seed(42) + # random palette + new_palette = np.random.randint( + 0, 255, size=(len(classes), 3)).tolist() + np.random.set_state(state) + elif len(palette) >= len(classes) and self.label_map is not None: + new_palette = [] + # return subset of palette + for old_id, new_id in sorted( + self.label_map.items(), key=lambda x: x[1]): + # 0 is background + if new_id != 0: + new_palette.append(palette[old_id]) + new_palette = type(palette)(new_palette) + elif len(palette) >= len(classes): + # Allow palette length is greater than classes. + return palette + else: + raise ValueError('palette does not match classes ' + f'as metainfo is {self._metainfo}.') + return new_palette + + def load_data_list(self) -> List[dict]: + """Load annotation from directory or annotation file. + + Returns: + list[dict]: All data info of dataset. 
+ """ + data_list = [] + img_dir = self.data_prefix.get('img_path', None) + ann_dir = self.data_prefix.get('seg_map_path', None) + if not osp.isdir(self.ann_file) and self.ann_file: + assert osp.isfile(self.ann_file), \ + f'Failed to load `ann_file` {self.ann_file}' + lines = mmengine.list_from_file( + self.ann_file, backend_args=self.backend_args) + for line in lines: + img_name = line.strip() + data_info = dict( + img_path=osp.join(img_dir, img_name + self.img_suffix)) + if ann_dir is not None: + seg_map = img_name + self.seg_map_suffix + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_list.append(data_info) + else: + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + data_info = dict(img_path=osp.join(img_dir, img)) + if ann_dir is not None: + seg_map = img.replace(self.img_suffix, self.seg_map_suffix) + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_list.append(data_info) + data_list = sorted(data_list, key=lambda x: x['img_path']) + return data_list diff --git a/mmdetection/mmdet/datasets/base_video_dataset.py b/mmdetection/mmdet/datasets/base_video_dataset.py new file mode 100644 index 0000000..0a4a7a2 --- /dev/null +++ b/mmdetection/mmdet/datasets/base_video_dataset.py @@ -0,0 +1,304 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from collections import defaultdict +from typing import Any, List, Tuple + +import mmengine.fileio as fileio +from mmengine.dataset import BaseDataset +from mmengine.logging import print_log + +from mmdet.datasets.api_wrappers import COCO +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class BaseVideoDataset(BaseDataset): + """Base video dataset for VID, MOT and VIS tasks.""" + + META = dict(classes=None) + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def __init__(self, *args, backend_args: dict = None, **kwargs): + self.backend_args = backend_args + super().__init__(*args, **kwargs) + + def load_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from an annotation file named as ``self.ann_file``. + + Returns: + tuple(list[dict], list): A list of annotation and a list of + valid data indices. + """ + with fileio.get_local_path(self.ann_file) as local_path: + self.coco = COCO(local_path) + # The order of returned `cat_ids` will not + # change with the order of the classes + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + # used in `filter_data` + self.img_ids_with_ann = set() + + img_ids = self.coco.get_img_ids() + total_ann_ids = [] + # if ``video_id`` is not in the annotation file, we will assign a big + # unique video_id for this video. 
+ single_video_id = 100000 + videos = {} + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + if 'video_id' not in raw_img_info: + single_video_id = single_video_id + 1 + video_id = single_video_id + else: + video_id = raw_img_info['video_id'] + + if video_id not in videos: + videos[video_id] = { + 'video_id': video_id, + 'images': [], + 'video_length': 0 + } + + videos[video_id]['video_length'] += 1 + ann_ids = self.coco.get_ann_ids( + img_ids=[img_id], cat_ids=self.cat_ids) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + parsed_data_info = self.parse_data_info( + dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info)) + + if len(parsed_data_info['instances']) > 0: + self.img_ids_with_ann.add(parsed_data_info['img_id']) + + videos[video_id]['images'].append(parsed_data_info) + + data_list = [v for v in videos.values()] + + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + + return data_list + + def parse_data_info(self, raw_data_info: dict) -> dict: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information loaded from + ``ann_file``. + + Returns: + dict: Parsed annotation. + """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + data_info = {} + + data_info.update(img_info) + if self.data_prefix.get('img_path', None) is not None: + img_path = osp.join(self.data_prefix['img_path'], + img_info['file_name']) + else: + img_path = img_info['file_name'] + data_info['img_path'] = img_path + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + if ann.get('segmentation', None): + instance['mask'] = ann['segmentation'] + if ann.get('instance_id', None): + instance['instance_id'] = ann['instance_id'] + else: + # image dataset usually has no `instance_id`. + # Therefore, we set it to `i`. + instance['instance_id'] = i + instances.append(instance) + data_info['instances'] = instances + return data_info + + def filter_data(self) -> List[int]: + """Filter image annotations according to filter_cfg. + + Returns: + list[int]: Filtered results. 
+ """ + if self.test_mode: + return self.data_list + + num_imgs_before_filter = sum( + [len(info['images']) for info in self.data_list]) + num_imgs_after_filter = 0 + + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= self.img_ids_with_ann + + new_data_list = [] + for video_data_info in self.data_list: + imgs_data_info = video_data_info['images'] + valid_imgs_data_info = [] + + for data_info in imgs_data_info: + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + # TODO: simplify these conditions + if self.filter_cfg is None: + if img_id not in ids_in_cat: + video_data_info['video_length'] -= 1 + continue + if min(width, height) >= 32: + valid_imgs_data_info.append(data_info) + num_imgs_after_filter += 1 + else: + video_data_info['video_length'] -= 1 + else: + if self.filter_cfg.get('filter_empty_gt', + True) and img_id not in ids_in_cat: + video_data_info['video_length'] -= 1 + continue + if min(width, height) >= self.filter_cfg.get( + 'min_size', 32): + valid_imgs_data_info.append(data_info) + num_imgs_after_filter += 1 + else: + video_data_info['video_length'] -= 1 + video_data_info['images'] = valid_imgs_data_info + new_data_list.append(video_data_info) + + print_log( + 'The number of samples before and after filtering: ' + f'{num_imgs_before_filter} / {num_imgs_after_filter}', 'current') + return new_data_list + + def prepare_data(self, idx) -> Any: + """Get date processed by ``self.pipeline``. Note that ``idx`` is a + video index in default since the base element of video dataset is a + video. However, in some cases, we need to specific both the video index + and frame index. For example, in traing mode, we may want to sample the + specific frames and all the frames must be sampled once in a epoch; in + test mode, we may want to output data of a single image rather than the + whole video for saving memory. + + Args: + idx (int): The index of ``data_info``. + + Returns: + Any: Depends on ``self.pipeline``. 
+ """ + if isinstance(idx, tuple): + assert len(idx) == 2, 'The length of idx must be 2: ' + '(video_index, frame_index)' + video_idx, frame_idx = idx[0], idx[1] + else: + video_idx, frame_idx = idx, None + + data_info = self.get_data_info(video_idx) + if self.test_mode: + # Support two test_mode: frame-level and video-level + final_data_info = defaultdict(list) + if frame_idx is None: + frames_idx_list = list(range(data_info['video_length'])) + else: + frames_idx_list = [frame_idx] + for index in frames_idx_list: + frame_ann = data_info['images'][index] + frame_ann['video_id'] = data_info['video_id'] + # Collate data_list (list of dict to dict of list) + for key, value in frame_ann.items(): + final_data_info[key].append(value) + # copy the info in video-level into img-level + # TODO: the value of this key is the same as that of + # `video_length` in test mode + final_data_info['ori_video_length'].append( + data_info['video_length']) + + final_data_info['video_length'] = [len(frames_idx_list) + ] * len(frames_idx_list) + return self.pipeline(final_data_info) + else: + # Specify `key_frame_id` for the frame sampling in the pipeline + if frame_idx is not None: + data_info['key_frame_id'] = frame_idx + return self.pipeline(data_info) + + def get_cat_ids(self, index) -> List[int]: + """Following image detection, we provide this interface function. Get + category ids by video index and frame index. + + Args: + index: The index of the dataset. It support two kinds of inputs: + Tuple: + video_idx (int): Index of video. + frame_idx (int): Index of frame. + Int: Index of video. + + Returns: + List[int]: All categories in the image of specified video index + and frame index. + """ + if isinstance(index, tuple): + assert len( + index + ) == 2, f'Expect the length of index is 2, but got {len(index)}' + video_idx, frame_idx = index + instances = self.get_data_info( + video_idx)['images'][frame_idx]['instances'] + return [instance['bbox_label'] for instance in instances] + else: + cat_ids = [] + for img in self.get_data_info(index)['images']: + for instance in img['instances']: + cat_ids.append(instance['bbox_label']) + return cat_ids + + @property + def num_all_imgs(self): + """Get the number of all the images in this video dataset.""" + return sum( + [len(self.get_data_info(i)['images']) for i in range(len(self))]) + + def get_len_per_video(self, idx): + """Get length of one video. + + Args: + idx (int): Index of video. + + Returns: + int (int): The length of the video. + """ + return len(self.get_data_info(idx)['images']) diff --git a/mmdetection/mmdet/datasets/cityscapes.py b/mmdetection/mmdet/datasets/cityscapes.py new file mode 100644 index 0000000..09755eb --- /dev/null +++ b/mmdetection/mmdet/datasets/cityscapes.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/datasets/cityscapes.py # noqa +# and https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa + +from typing import List + +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class CityscapesDataset(CocoDataset): + """Dataset for Cityscapes.""" + + METAINFO = { + 'classes': ('person', 'rider', 'car', 'truck', 'bus', 'train', + 'motorcycle', 'bicycle'), + 'palette': [(220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), + (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)] + } + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. + """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + all_is_crowd = all([ + instance['ignore_flag'] == 1 + for instance in data_info['instances'] + ]) + if filter_empty_gt and (img_id not in ids_in_cat or all_is_crowd): + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos diff --git a/mmdetection/mmdet/datasets/coco.py b/mmdetection/mmdet/datasets/coco.py new file mode 100644 index 0000000..277b759 --- /dev/null +++ b/mmdetection/mmdet/datasets/coco.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from typing import List, Union + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .api_wrappers import COCO +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class CocoDataset(BaseDetDataset): + """Dataset for COCO.""" + + METAINFO = { + 'classes': + ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush'), + # palette is a list of color tuples, which is used for visualization. 
+ 'palette': + [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228), + (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30), + (100, 170, 30), (220, 220, 0), (175, 116, 175), (250, 0, 30), + (165, 42, 42), (255, 77, 255), (0, 226, 252), (182, 182, 255), + (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255), + (199, 100, 0), (72, 0, 118), (255, 179, 240), (0, 125, 92), + (209, 0, 151), (188, 208, 182), (0, 220, 176), (255, 99, 164), + (92, 0, 73), (133, 129, 255), (78, 180, 255), (0, 228, 0), + (174, 255, 243), (45, 89, 255), (134, 134, 103), (145, 148, 174), + (255, 208, 186), (197, 226, 255), (171, 134, 1), (109, 63, 54), + (207, 138, 255), (151, 0, 95), (9, 80, 61), (84, 105, 51), + (74, 65, 105), (166, 196, 102), (208, 195, 210), (255, 109, 65), + (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0), + (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161), + (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120), + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208)] + } + COCOAPI = COCO + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. 
+ """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + + data_info = {} + + # TODO: need to change data_prefix['img'] to data_prefix['img_path'] + img_path = osp.join(self.data_prefix['img'], img_info['file_name']) + if self.data_prefix.get('seg', None): + seg_map_path = osp.join( + self.data_prefix['seg'], + img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) + else: + seg_map_path = None + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['seg_map_path'] = seg_map_path + data_info['height'] = img_info['height'] + data_info['width'] = img_info['width'] + + if self.return_classes: + data_info['text'] = self.metainfo['classes'] + data_info['custom_entities'] = True + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + + if ann.get('segmentation', None): + instance['mask'] = ann['segmentation'] + + instances.append(instance) + data_info['instances'] = instances + return data_info + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. + """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and img_id not in ids_in_cat: + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos diff --git a/mmdetection/mmdet/datasets/coco_caption.py b/mmdetection/mmdet/datasets/coco_caption.py new file mode 100644 index 0000000..ee695fe --- /dev/null +++ b/mmdetection/mmdet/datasets/coco_caption.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from pathlib import Path +from typing import List + +import mmengine +from mmengine.dataset import BaseDataset +from mmengine.fileio import get_file_backend + +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class CocoCaptionDataset(BaseDataset): + """COCO2014 Caption dataset.""" + + def load_data_list(self) -> List[dict]: + """Load data list.""" + img_prefix = self.data_prefix['img_path'] + annotations = mmengine.load(self.ann_file) + file_backend = get_file_backend(img_prefix) + + data_list = [] + for ann in annotations: + data_info = { + 'img_id': Path(ann['image']).stem.split('_')[-1], + 'img_path': file_backend.join_path(img_prefix, ann['image']), + 'gt_caption': ann['caption'], + } + + data_list.append(data_info) + + return data_list diff --git a/mmdetection/mmdet/datasets/coco_panoptic.py b/mmdetection/mmdet/datasets/coco_panoptic.py new file mode 100644 index 0000000..d5ca785 --- /dev/null +++ b/mmdetection/mmdet/datasets/coco_panoptic.py @@ -0,0 +1,292 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Callable, List, Optional, Sequence, Union + +from mmdet.registry import DATASETS +from .api_wrappers import COCOPanoptic +from .coco import CocoDataset + + +@DATASETS.register_module() +class CocoPanopticDataset(CocoDataset): + """Coco dataset for Panoptic segmentation. + + The annotation format is shown as follows. The `ann` field is optional + for testing. + + .. code-block:: none + + [ + { + 'filename': f'{image_id:012}.png', + 'image_id':9 + 'segments_info': + [ + { + 'id': 8345037, (segment_id in panoptic png, + convert from rgb) + 'category_id': 51, + 'iscrowd': 0, + 'bbox': (x1, y1, w, h), + 'area': 24315 + }, + ... + ] + }, + ... + ] + + Args: + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to None. + data_prefix (dict, optional): Prefix for training data. Defaults to + ``dict(img=None, ann=None, seg=None)``. The prefix ``seg`` which is + for panoptic segmentation map must be not None. + filter_cfg (dict, optional): Config for filter data. Defaults to None. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Defaults to None which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. Defaults + to True. + pipeline (list, optional): Processing pipeline. Defaults to []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Defaults to False. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=False``. Defaults to False. + max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Defaults to 1000. 
+ """ + + METAINFO = { + 'classes': + ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner', + 'blanket', 'bridge', 'cardboard', 'counter', 'curtain', 'door-stuff', + 'floor-wood', 'flower', 'fruit', 'gravel', 'house', 'light', + 'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield', + 'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow', + 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile', + 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', + 'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged', + 'paper-merged', 'food-other-merged', 'building-other-merged', + 'rock-merged', 'wall-other-merged', 'rug-merged'), + 'thing_classes': + ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush'), + 'stuff_classes': + ('banner', 'blanket', 'bridge', 'cardboard', 'counter', 'curtain', + 'door-stuff', 'floor-wood', 'flower', 'fruit', 'gravel', 'house', + 'light', 'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield', + 'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow', + 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile', + 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', + 'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged', + 'paper-merged', 'food-other-merged', 'building-other-merged', + 'rock-merged', 'wall-other-merged', 'rug-merged'), + 'palette': + [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228), + (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30), + (100, 170, 30), (220, 220, 0), (175, 116, 175), (250, 0, 30), + (165, 42, 42), (255, 
77, 255), (0, 226, 252), (182, 182, 255), + (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255), + (199, 100, 0), (72, 0, 118), (255, 179, 240), (0, 125, 92), + (209, 0, 151), (188, 208, 182), (0, 220, 176), (255, 99, 164), + (92, 0, 73), (133, 129, 255), (78, 180, 255), (0, 228, 0), + (174, 255, 243), (45, 89, 255), (134, 134, 103), (145, 148, 174), + (255, 208, 186), (197, 226, 255), (171, 134, 1), (109, 63, 54), + (207, 138, 255), (151, 0, 95), (9, 80, 61), (84, 105, 51), + (74, 65, 105), (166, 196, 102), (208, 195, 210), (255, 109, 65), + (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0), + (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161), + (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120), + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208), (255, 255, 128), (147, 211, 203), + (150, 100, 100), (168, 171, 172), (146, 112, 198), (210, 170, 100), + (92, 136, 89), (218, 88, 184), (241, 129, 0), (217, 17, 255), + (124, 74, 181), (70, 70, 70), (255, 228, 255), (154, 208, 0), + (193, 0, 92), (76, 91, 113), (255, 180, 195), (106, 154, 176), + (230, 150, 140), (60, 143, 255), (128, 64, 128), (92, 82, 55), + (254, 212, 124), (73, 77, 174), (255, 160, 98), (255, 255, 255), + (104, 84, 109), (169, 164, 131), (225, 199, 255), (137, 54, 74), + (135, 158, 223), (7, 246, 231), (107, 255, 200), (58, 41, 149), + (183, 121, 142), (255, 73, 97), (107, 142, 35), (190, 153, 153), + (146, 139, 141), (70, 130, 180), (134, 199, 156), (209, 226, 140), + (96, 36, 108), (96, 96, 96), (64, 170, 64), (152, 251, 152), + (208, 229, 228), (206, 186, 171), (152, 161, 64), (116, 112, 0), + (0, 114, 143), (102, 102, 156), (250, 141, 255)] + } + COCOAPI = COCOPanoptic + # ann_id is not unique in coco panoptic dataset. + ANN_ID_UNIQUE = False + + def __init__(self, + ann_file: str = '', + metainfo: Optional[dict] = None, + data_root: Optional[str] = None, + data_prefix: dict = dict(img=None, ann=None, seg=None), + filter_cfg: Optional[dict] = None, + indices: Optional[Union[int, Sequence[int]]] = None, + serialize_data: bool = True, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + lazy_init: bool = False, + max_refetch: int = 1000, + backend_args: dict = None, + **kwargs) -> None: + super().__init__( + ann_file=ann_file, + metainfo=metainfo, + data_root=data_root, + data_prefix=data_prefix, + filter_cfg=filter_cfg, + indices=indices, + serialize_data=serialize_data, + pipeline=pipeline, + test_mode=test_mode, + lazy_init=lazy_init, + max_refetch=max_refetch, + backend_args=backend_args, + **kwargs) + + def parse_data_info(self, raw_data_info: dict) -> dict: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file``. + + Returns: + dict: Parsed annotation. 
+ """
+ img_info = raw_data_info['raw_img_info']
+ ann_info = raw_data_info['raw_ann_info']
+ # filter out unmatched annotations which have the
+ # same segment_id but belong to another image
+ ann_info = [
+ ann for ann in ann_info if ann['image_id'] == img_info['img_id']
+ ]
+ data_info = {}
+
+ img_path = osp.join(self.data_prefix['img'], img_info['file_name'])
+ if self.data_prefix.get('seg', None):
+ seg_map_path = osp.join(
+ self.data_prefix['seg'],
+ img_info['file_name'].replace('jpg', 'png'))
+ else:
+ seg_map_path = None
+ data_info['img_path'] = img_path
+ data_info['img_id'] = img_info['img_id']
+ data_info['seg_map_path'] = seg_map_path
+ data_info['height'] = img_info['height']
+ data_info['width'] = img_info['width']
+
+ if self.return_classes:
+ data_info['text'] = self.metainfo['thing_classes']
+ data_info['stuff_text'] = self.metainfo['stuff_classes']
+ data_info['custom_entities'] = True # not important
+
+ instances = []
+ segments_info = []
+ for ann in ann_info:
+ instance = {}
+ x1, y1, w, h = ann['bbox']
+ if ann['area'] <= 0 or w < 1 or h < 1:
+ continue
+ bbox = [x1, y1, x1 + w, y1 + h]
+ category_id = ann['category_id']
+ contiguous_cat_id = self.cat2label[category_id]
+
+ is_thing = self.coco.load_cats(ids=category_id)[0]['isthing']
+ if is_thing:
+ is_crowd = ann.get('iscrowd', False)
+ instance['bbox'] = bbox
+ instance['bbox_label'] = contiguous_cat_id
+ if not is_crowd:
+ instance['ignore_flag'] = 0
+ else:
+ instance['ignore_flag'] = 1
+ is_thing = False
+
+ segment_info = {
+ 'id': ann['id'],
+ 'category': contiguous_cat_id,
+ 'is_thing': is_thing
+ }
+ segments_info.append(segment_info)
+ if len(instance) > 0 and is_thing:
+ instances.append(instance)
+ data_info['instances'] = instances
+ data_info['segments_info'] = segments_info
+ return data_info
+
+ def filter_data(self) -> List[dict]:
+ """Filter out images that are too small or have no ground truth.
+
+ Returns:
+ List[dict]: ``self.data_list`` after filtering.
+ """
+ if self.test_mode:
+ return self.data_list
+
+ if self.filter_cfg is None:
+ return self.data_list
+
+ filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False)
+ min_size = self.filter_cfg.get('min_size', 0)
+
+ ids_with_ann = set()
+ # collect the ids of images that have valid thing annotations.
+ for data_info in self.data_list:
+ for segment_info in data_info['segments_info']:
+ if not segment_info['is_thing']:
+ continue
+ ids_with_ann.add(data_info['img_id'])
+
+ valid_data_list = []
+ for data_info in self.data_list:
+ img_id = data_info['img_id']
+ width = data_info['width']
+ height = data_info['height']
+ if filter_empty_gt and img_id not in ids_with_ann:
+ continue
+ if min(width, height) >= min_size:
+ valid_data_list.append(data_info)
+
+ return valid_data_list
diff --git a/mmdetection/mmdet/datasets/coco_semantic.py b/mmdetection/mmdet/datasets/coco_semantic.py
new file mode 100644
index 0000000..7525684
--- /dev/null
+++ b/mmdetection/mmdet/datasets/coco_semantic.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import DATASETS
+from .ade20k import ADE20KSegDataset
+
+
+@DATASETS.register_module()
+class CocoSegDataset(ADE20KSegDataset):
+ """COCO dataset for semantic segmentation.
+
+ In the segmentation map annotations for COCO, ``img_suffix`` is fixed to
+ '.jpg' and ``seg_map_suffix`` is fixed to '.png'.
+ """ + + METAINFO = dict( + classes=( + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', + 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner', + 'blanket', 'branch', 'bridge', 'building-other', 'bush', 'cabinet', + 'cage', 'cardboard', 'carpet', 'ceiling-other', 'ceiling-tile', + 'cloth', 'clothes', 'clouds', 'counter', 'cupboard', 'curtain', + 'desk-stuff', 'dirt', 'door-stuff', 'fence', 'floor-marble', + 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood', 'flower', + 'fog', 'food-other', 'fruit', 'furniture-other', 'grass', 'gravel', + 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat', 'metal', + 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net', + 'paper', 'pavement', 'pillow', 'plant-other', 'plastic', + 'platform', 'playingfield', 'railing', 'railroad', 'river', 'road', + 'rock', 'roof', 'rug', 'salad', 'sand', 'sea', 'shelf', + 'sky-other', 'skyscraper', 'snow', 'solid-other', 'stairs', + 'stone', 'straw', 'structural-other', 'table', 'tent', + 'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick', + 'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone', + 'wall-tile', 'wall-wood', 'water-other', 'waterdrops', + 'window-blind', 'window-other', 'wood'), + palette=[(120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50), + (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255), + (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7), + (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82), + (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3), + (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255), + (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220), + (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224), + (255, 184, 6), (10, 255, 71), (255, 41, 10), (7, 255, 255), + (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), + (255, 122, 8), (0, 255, 20), (255, 8, 41), (255, 5, 153), + (6, 51, 255), (235, 12, 255), (160, 150, 20), (0, 163, 255), + (140, 140, 140), (250, 10, 15), (20, 255, 0), (31, 255, 0), + (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255), + (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255), + (11, 200, 200), (255, 82, 0), (0, 255, 245), (0, 61, 255), + (0, 255, 112), (0, 255, 133), (255, 0, 0), (255, 163, 0), + (255, 102, 0), (194, 255, 0), (0, 143, 255), (51, 255, 0), + (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255), + (173, 255, 0), (0, 255, 153), (255, 92, 0), (255, 0, 255), + (255, 0, 245), (255, 0, 102), (255, 173, 0), (255, 0, 20), + (255, 184, 184), (0, 31, 255), (0, 255, 61), (0, 71, 255), + (255, 0, 204), (0, 255, 194), (0, 255, 82), (0, 10, 255), + (0, 112, 255), (51, 0, 255), (0, 194, 255), (0, 122, 255), + (0, 255, 163), (255, 153, 0), (0, 255, 10), (255, 112, 0), + (143, 
255, 0), (82, 0, 255), (163, 255, 0), (255, 235, 0), + (8, 184, 170), (133, 0, 255), (0, 255, 92), (184, 0, 255), + (255, 0, 31), (0, 184, 255), (0, 214, 255), (255, 0, 112), + (92, 255, 0), (0, 224, 255), (112, 224, 255), (70, 184, 160), + (163, 0, 255), (153, 0, 255), (71, 255, 0), (255, 0, 163), + (255, 204, 0), (255, 0, 143), (0, 255, 235), (133, 255, 0), + (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, 0), + (10, 190, 212), (214, 255, 0), (0, 204, 255), (20, 0, 255), + (255, 255, 0), (0, 153, 255), (0, 41, 255), (0, 255, 204), + (41, 0, 255), (41, 255, 0), (173, 0, 255), (0, 245, 255), + (71, 0, 255), (122, 0, 255), (0, 255, 184), (0, 92, 255), + (184, 255, 0), (0, 133, 255), (255, 214, 0), (25, 194, 194), + (102, 255, 0), (92, 0, 255), (107, 255, 200), (58, 41, 149), + (183, 121, 142), (255, 73, 97), (107, 142, 35), + (190, 153, 153), (146, 139, 141), (70, 130, 180), + (134, 199, 156), (209, 226, 140), (96, 36, 108), (96, 96, 96), + (64, 170, 64), (152, 251, 152), (208, 229, 228), + (206, 186, 171), (152, 161, 64), (116, 112, 0), (0, 114, 143), + (102, 102, 156), (250, 141, 255)]) diff --git a/mmdetection/mmdet/datasets/crowdhuman.py b/mmdetection/mmdet/datasets/crowdhuman.py new file mode 100644 index 0000000..650176e --- /dev/null +++ b/mmdetection/mmdet/datasets/crowdhuman.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import logging +import os.path as osp +import warnings +from typing import List, Union + +import mmcv +from mmengine.dist import get_rank +from mmengine.fileio import dump, get, get_text, load +from mmengine.logging import print_log +from mmengine.utils import ProgressBar + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class CrowdHumanDataset(BaseDetDataset): + r"""Dataset for CrowdHuman. + + Args: + data_root (str): The root directory for + ``data_prefix`` and ``ann_file``. + ann_file (str): Annotation file path. + extra_ann_file (str | optional):The path of extra image metas + for CrowdHuman. It can be created by CrowdHumanDataset + automatically or by tools/misc/get_crowdhuman_id_hw.py + manually. Defaults to None. + """ + + METAINFO = { + 'classes': ('person', ), + # palette is a list of color tuples, which is used for visualization. + 'palette': [(220, 20, 60)] + } + + def __init__(self, data_root, ann_file, extra_ann_file=None, **kwargs): + # extra_ann_file record the size of each image. This file is + # automatically created when you first load the CrowdHuman + # dataset by mmdet. + if extra_ann_file is not None: + self.extra_ann_exist = True + self.extra_anns = load(extra_ann_file) + else: + ann_file_name = osp.basename(ann_file) + if 'train' in ann_file_name: + self.extra_ann_file = osp.join(data_root, 'id_hw_train.json') + elif 'val' in ann_file_name: + self.extra_ann_file = osp.join(data_root, 'id_hw_val.json') + self.extra_ann_exist = False + if not osp.isfile(self.extra_ann_file): + print_log( + 'extra_ann_file does not exist, prepare to collect ' + 'image height and width...', + level=logging.INFO) + self.extra_anns = {} + else: + self.extra_ann_exist = True + self.extra_anns = load(self.extra_ann_file) + super().__init__(data_root=data_root, ann_file=ann_file, **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. 
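+ Each line of the annotation file is a standalone JSON string that
+ describes one image and its ``gtboxes``.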
+ """ # noqa: E501 + anno_strs = get_text( + self.ann_file, backend_args=self.backend_args).strip().split('\n') + print_log('loading CrowdHuman annotation...', level=logging.INFO) + data_list = [] + prog_bar = ProgressBar(len(anno_strs)) + for i, anno_str in enumerate(anno_strs): + anno_dict = json.loads(anno_str) + parsed_data_info = self.parse_data_info(anno_dict) + data_list.append(parsed_data_info) + prog_bar.update() + if not self.extra_ann_exist and get_rank() == 0: + # TODO: support file client + try: + dump(self.extra_anns, self.extra_ann_file, file_format='json') + except: # noqa + warnings.warn( + 'Cache files can not be saved automatically! To speed up' + 'loading the dataset, please manually generate the cache' + ' file by file tools/misc/get_crowdhuman_id_hw.py') + + print_log( + f'\nsave extra_ann_file in {self.data_root}', + level=logging.INFO) + + del self.extra_anns + print_log('\nDone', level=logging.INFO) + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + data_info = {} + img_path = osp.join(self.data_prefix['img'], + f"{raw_data_info['ID']}.jpg") + data_info['img_path'] = img_path + data_info['img_id'] = raw_data_info['ID'] + + if not self.extra_ann_exist: + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, backend='cv2') + data_info['height'], data_info['width'] = img.shape[:2] + self.extra_anns[raw_data_info['ID']] = img.shape[:2] + del img, img_bytes + else: + data_info['height'], data_info['width'] = self.extra_anns[ + raw_data_info['ID']] + + instances = [] + for i, ann in enumerate(raw_data_info['gtboxes']): + instance = {} + if ann['tag'] not in self.metainfo['classes']: + instance['bbox_label'] = -1 + instance['ignore_flag'] = 1 + else: + instance['bbox_label'] = self.metainfo['classes'].index( + ann['tag']) + instance['ignore_flag'] = 0 + if 'extra' in ann: + if 'ignore' in ann['extra']: + if ann['extra']['ignore'] != 0: + instance['bbox_label'] = -1 + instance['ignore_flag'] = 1 + + x1, y1, w, h = ann['fbox'] + bbox = [x1, y1, x1 + w, y1 + h] + instance['bbox'] = bbox + + # Record the full bbox(fbox), head bbox(hbox) and visible + # bbox(vbox) as additional information. If you need to use + # this information, you just need to design the pipeline + # instead of overriding the CrowdHumanDataset. + instance['fbox'] = bbox + hbox = ann['hbox'] + instance['hbox'] = [ + hbox[0], hbox[1], hbox[0] + hbox[2], hbox[1] + hbox[3] + ] + vbox = ann['vbox'] + instance['vbox'] = [ + vbox[0], vbox[1], vbox[0] + vbox[2], vbox[1] + vbox[3] + ] + + instances.append(instance) + + data_info['instances'] = instances + return data_info diff --git a/mmdetection/mmdet/datasets/dataset_wrappers.py b/mmdetection/mmdet/datasets/dataset_wrappers.py new file mode 100644 index 0000000..e651e2b --- /dev/null +++ b/mmdetection/mmdet/datasets/dataset_wrappers.py @@ -0,0 +1,252 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections +import copy +from typing import List, Sequence, Union + +from mmengine.dataset import BaseDataset +from mmengine.dataset import ConcatDataset as MMENGINE_ConcatDataset +from mmengine.dataset import force_full_init + +from mmdet.registry import DATASETS, TRANSFORMS + + +@DATASETS.register_module() +class MultiImageMixDataset: + """A wrapper of multiple images mixed dataset. 
+ + Suitable for training on multiple images mixed data augmentation like + mosaic and mixup. For the augmentation pipeline of mixed image data, + the `get_indexes` method needs to be provided to obtain the image + indexes, and you can set `skip_flags` to change the pipeline running + process. At the same time, we provide the `dynamic_scale` parameter + to dynamically change the output image size. + + Args: + dataset (:obj:`CustomDataset`): The dataset to be mixed. + pipeline (Sequence[dict]): Sequence of transform object or + config dict to be composed. + dynamic_scale (tuple[int], optional): The image scale can be changed + dynamically. Default to None. It is deprecated. + skip_type_keys (list[str], optional): Sequence of type string to + be skip pipeline. Default to None. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Default: 15. + """ + + def __init__(self, + dataset: Union[BaseDataset, dict], + pipeline: Sequence[str], + skip_type_keys: Union[Sequence[str], None] = None, + max_refetch: int = 15, + lazy_init: bool = False) -> None: + assert isinstance(pipeline, collections.abc.Sequence) + if skip_type_keys is not None: + assert all([ + isinstance(skip_type_key, str) + for skip_type_key in skip_type_keys + ]) + self._skip_type_keys = skip_type_keys + + self.pipeline = [] + self.pipeline_types = [] + for transform in pipeline: + if isinstance(transform, dict): + self.pipeline_types.append(transform['type']) + transform = TRANSFORMS.build(transform) + self.pipeline.append(transform) + else: + raise TypeError('pipeline must be a dict') + + self.dataset: BaseDataset + if isinstance(dataset, dict): + self.dataset = DATASETS.build(dataset) + elif isinstance(dataset, BaseDataset): + self.dataset = dataset + else: + raise TypeError( + 'elements in datasets sequence should be config or ' + f'`BaseDataset` instance, but got {type(dataset)}') + + self._metainfo = self.dataset.metainfo + if hasattr(self.dataset, 'flag'): + self.flag = self.dataset.flag + self.num_samples = len(self.dataset) + self.max_refetch = max_refetch + + self._fully_initialized = False + if not lazy_init: + self.full_init() + + @property + def metainfo(self) -> dict: + """Get the meta information of the multi-image-mixed dataset. + + Returns: + dict: The meta information of multi-image-mixed dataset. + """ + return copy.deepcopy(self._metainfo) + + def full_init(self): + """Loop to ``full_init`` each dataset.""" + if self._fully_initialized: + return + + self.dataset.full_init() + self._ori_len = len(self.dataset) + self._fully_initialized = True + + @force_full_init + def get_data_info(self, idx: int) -> dict: + """Get annotation by index. + + Args: + idx (int): Global index of ``ConcatDataset``. + + Returns: + dict: The idx-th annotation of the datasets. + """ + return self.dataset.get_data_info(idx) + + @force_full_init + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + results = copy.deepcopy(self.dataset[idx]) + for (transform, transform_type) in zip(self.pipeline, + self.pipeline_types): + if self._skip_type_keys is not None and \ + transform_type in self._skip_type_keys: + continue + + if hasattr(transform, 'get_indexes'): + for i in range(self.max_refetch): + # Make sure the results passed the loading pipeline + # of the original dataset is not None. 
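+ # Mixing transforms such as Mosaic and MixUp implement
+ # `get_indexes` to pick the extra samples that are blended
+ # with the current one (e.g. Mosaic typically draws several
+ # random indexes from the wrapped dataset).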
+ indexes = transform.get_indexes(self.dataset)
+ if not isinstance(indexes, collections.abc.Sequence):
+ indexes = [indexes]
+ mix_results = [
+ copy.deepcopy(self.dataset[index]) for index in indexes
+ ]
+ if None not in mix_results:
+ results['mix_results'] = mix_results
+ break
+ else:
+ raise RuntimeError(
+ 'The loading pipeline of the original dataset'
+ ' always returns None. Please check the correctness '
+ 'of the dataset and its pipeline.')
+
+ for i in range(self.max_refetch):
+ # Make sure the results returned by the training pipeline
+ # of the wrapper are not None.
+ updated_results = transform(copy.deepcopy(results))
+ if updated_results is not None:
+ results = updated_results
+ break
+ else:
+ raise RuntimeError(
+ 'The training pipeline of the dataset wrapper'
+ ' always returns None. Please check the correctness '
+ 'of the dataset and its pipeline.')
+
+ if 'mix_results' in results:
+ results.pop('mix_results')
+
+ return results
+
+ def update_skip_type_keys(self, skip_type_keys):
+ """Update skip_type_keys. It is called by an external hook.
+
+ Args:
+ skip_type_keys (list[str], optional): Sequence of transform
+ type strings whose pipeline steps will be skipped.
+ """
+ assert all([
+ isinstance(skip_type_key, str) for skip_type_key in skip_type_keys
+ ])
+ self._skip_type_keys = skip_type_keys
+
+
+@DATASETS.register_module()
+class ConcatDataset(MMENGINE_ConcatDataset):
+ """A wrapper of concatenated dataset.
+
+ Same as ``torch.utils.data.dataset.ConcatDataset``, but supports
+ ``lazy_init`` and ``get_dataset_source``.
+
+ Note:
+ ``ConcatDataset`` should not inherit from ``BaseDataset`` since
+ ``get_subset`` and ``get_subset_`` could produce a sub-dataset whose
+ meaning is ambiguous and conflicts with the original dataset. If you
+ want to use a sub-dataset of ``ConcatDataset``, set the ``indices``
+ argument on the wrapped datasets, which inherit from ``BaseDataset``.
+
+ Args:
+ datasets (Sequence[BaseDataset] or Sequence[dict]): A list of datasets
+ which will be concatenated.
+ lazy_init (bool, optional): Whether to load annotation during
+ instantiation. Defaults to False.
+ ignore_keys (List[str] or str): Ignore the keys that can be
+ unequal in `dataset.metainfo`. Defaults to None.
+ `New in version 0.3.0.` + """ + + def __init__(self, + datasets: Sequence[Union[BaseDataset, dict]], + lazy_init: bool = False, + ignore_keys: Union[str, List[str], None] = None): + self.datasets: List[BaseDataset] = [] + for i, dataset in enumerate(datasets): + if isinstance(dataset, dict): + self.datasets.append(DATASETS.build(dataset)) + elif isinstance(dataset, BaseDataset): + self.datasets.append(dataset) + else: + raise TypeError( + 'elements in datasets sequence should be config or ' + f'`BaseDataset` instance, but got {type(dataset)}') + if ignore_keys is None: + self.ignore_keys = [] + elif isinstance(ignore_keys, str): + self.ignore_keys = [ignore_keys] + elif isinstance(ignore_keys, list): + self.ignore_keys = ignore_keys + else: + raise TypeError('ignore_keys should be a list or str, ' + f'but got {type(ignore_keys)}') + + meta_keys: set = set() + for dataset in self.datasets: + meta_keys |= dataset.metainfo.keys() + # if the metainfo of multiple datasets are the same, use metainfo + # of the first dataset, else the metainfo is a list with metainfo + # of all the datasets + is_all_same = True + self._metainfo_first = self.datasets[0].metainfo + for i, dataset in enumerate(self.datasets, 1): + for key in meta_keys: + if key in self.ignore_keys: + continue + if key not in dataset.metainfo: + is_all_same = False + break + if self._metainfo_first[key] != dataset.metainfo[key]: + is_all_same = False + break + + if is_all_same: + self._metainfo = self.datasets[0].metainfo + else: + self._metainfo = [dataset.metainfo for dataset in self.datasets] + + self._fully_initialized = False + if not lazy_init: + self.full_init() + + def get_dataset_source(self, idx: int) -> int: + dataset_idx, _ = self._get_ori_dataset_idx(idx) + return dataset_idx diff --git a/mmdetection/mmdet/datasets/deepfashion.py b/mmdetection/mmdet/datasets/deepfashion.py new file mode 100644 index 0000000..f853fc6 --- /dev/null +++ b/mmdetection/mmdet/datasets/deepfashion.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class DeepFashionDataset(CocoDataset): + """Dataset for DeepFashion.""" + + METAINFO = { + 'classes': ('top', 'skirt', 'leggings', 'dress', 'outer', 'pants', + 'bag', 'neckwear', 'headwear', 'eyeglass', 'belt', + 'footwear', 'hair', 'skin', 'face'), + # palette is a list of color tuples, which is used for visualization. + 'palette': [(0, 192, 64), (0, 64, 96), (128, 192, 192), (0, 64, 64), + (0, 192, 224), (0, 192, 192), (128, 192, 64), (0, 192, 96), + (128, 32, 192), (0, 0, 224), (0, 0, 64), (0, 160, 192), + (128, 0, 96), (128, 0, 192), (0, 32, 192)] + } diff --git a/mmdetection/mmdet/datasets/dsdl.py b/mmdetection/mmdet/datasets/dsdl.py new file mode 100644 index 0000000..75570a2 --- /dev/null +++ b/mmdetection/mmdet/datasets/dsdl.py @@ -0,0 +1,192 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from typing import List + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + +try: + from dsdl.dataset import DSDLDataset +except ImportError: + DSDLDataset = None + + +@DATASETS.register_module() +class DSDLDetDataset(BaseDetDataset): + """Dataset for dsdl detection. + + Args: + with_bbox(bool): Load bbox or not, defaults to be True. + with_polygon(bool): Load polygon or not, defaults to be False. + with_mask(bool): Load seg map mask or not, defaults to be False. 
+ with_imagelevel_label(bool): Load image level label or not, + defaults to be False. + with_hierarchy(bool): Load hierarchy information or not, + defaults to be False. + specific_key_path(dict): Path of specific key which can not + be loaded by it's field name. + pre_transform(dict): pre-transform functions before loading. + """ + + METAINFO = {} + + def __init__(self, + with_bbox: bool = True, + with_polygon: bool = False, + with_mask: bool = False, + with_imagelevel_label: bool = False, + with_hierarchy: bool = False, + specific_key_path: dict = {}, + pre_transform: dict = {}, + **kwargs) -> None: + + if DSDLDataset is None: + raise RuntimeError( + 'Package dsdl is not installed. Please run "pip install dsdl".' + ) + + self.with_hierarchy = with_hierarchy + self.specific_key_path = specific_key_path + + loc_config = dict(type='LocalFileReader', working_dir='') + if kwargs.get('data_root'): + kwargs['ann_file'] = os.path.join(kwargs['data_root'], + kwargs['ann_file']) + self.required_fields = ['Image', 'ImageShape', 'Label', 'ignore_flag'] + if with_bbox: + self.required_fields.append('Bbox') + if with_polygon: + self.required_fields.append('Polygon') + if with_mask: + self.required_fields.append('LabelMap') + if with_imagelevel_label: + self.required_fields.append('image_level_labels') + assert 'image_level_labels' in specific_key_path.keys( + ), '`image_level_labels` not specified in `specific_key_path` !' + + self.extra_keys = [ + key for key in self.specific_key_path.keys() + if key not in self.required_fields + ] + + self.dsdldataset = DSDLDataset( + dsdl_yaml=kwargs['ann_file'], + location_config=loc_config, + required_fields=self.required_fields, + specific_key_path=specific_key_path, + transform=pre_transform, + ) + + BaseDetDataset.__init__(self, **kwargs) + + def load_data_list(self) -> List[dict]: + """Load data info from an dsdl yaml file named as ``self.ann_file`` + + Returns: + List[dict]: A list of data info. + """ + if self.with_hierarchy: + # get classes_names and relation_matrix + classes_names, relation_matrix = \ + self.dsdldataset.class_dom.get_hierarchy_info() + self._metainfo['classes'] = tuple(classes_names) + self._metainfo['RELATION_MATRIX'] = relation_matrix + + else: + self._metainfo['classes'] = tuple(self.dsdldataset.class_names) + + data_list = [] + + for i, data in enumerate(self.dsdldataset): + # basic image info, including image id, path and size. 
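+ # Each DSDL field is a list of field objects, e.g.
+ # `data['Image'][0].location` is the image path and
+ # `data['ImageShape'][0]` carries its width/height.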
+ datainfo = dict( + img_id=i, + img_path=os.path.join(self.data_prefix['img_path'], + data['Image'][0].location), + width=data['ImageShape'][0].width, + height=data['ImageShape'][0].height, + ) + + # get image label info + if 'image_level_labels' in data.keys(): + if self.with_hierarchy: + # get leaf node name when using hierarchy classes + datainfo['image_level_labels'] = [ + self._metainfo['classes'].index(i.leaf_node_name) + for i in data['image_level_labels'] + ] + else: + datainfo['image_level_labels'] = [ + self._metainfo['classes'].index(i.name) + for i in data['image_level_labels'] + ] + + # get semantic segmentation info + if 'LabelMap' in data.keys(): + datainfo['seg_map_path'] = data['LabelMap'] + + # load instance info + instances = [] + if 'Bbox' in data.keys(): + for idx in range(len(data['Bbox'])): + bbox = data['Bbox'][idx] + if self.with_hierarchy: + # get leaf node name when using hierarchy classes + label = data['Label'][idx].leaf_node_name + label_index = self._metainfo['classes'].index(label) + else: + label = data['Label'][idx].name + label_index = self._metainfo['classes'].index(label) + + instance = {} + instance['bbox'] = bbox.xyxy + instance['bbox_label'] = label_index + + if 'ignore_flag' in data.keys(): + # get ignore flag + instance['ignore_flag'] = data['ignore_flag'][idx] + else: + instance['ignore_flag'] = 0 + + if 'Polygon' in data.keys(): + # get polygon info + polygon = data['Polygon'][idx] + instance['mask'] = polygon.openmmlabformat + + for key in self.extra_keys: + # load extra instance info + instance[key] = data[key][idx] + + instances.append(instance) + + datainfo['instances'] = instances + # append a standard sample in data list + if len(datainfo['instances']) > 0: + data_list.append(datainfo) + + return data_list + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. + """ + if self.test_mode: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) \ + if self.filter_cfg is not None else False + min_size = self.filter_cfg.get('min_size', 0) \ + if self.filter_cfg is not None else 0 + + valid_data_list = [] + for i, data_info in enumerate(self.data_list): + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and len(data_info['instances']) == 0: + continue + if min(width, height) >= min_size: + valid_data_list.append(data_info) + + return valid_data_list diff --git a/mmdetection/mmdet/datasets/isaid.py b/mmdetection/mmdet/datasets/isaid.py new file mode 100644 index 0000000..87067d8 --- /dev/null +++ b/mmdetection/mmdet/datasets/isaid.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class iSAIDDataset(CocoDataset): + """Dataset for iSAID instance segmentation. + + iSAID: A Large-scale Dataset for Instance Segmentation + in Aerial Images. 
+ + For more detail, please refer to "projects/iSAID/README.md" + """ + + METAINFO = dict( + classes=('background', 'ship', 'store_tank', 'baseball_diamond', + 'tennis_court', 'basketball_court', 'Ground_Track_Field', + 'Bridge', 'Large_Vehicle', 'Small_Vehicle', 'Helicopter', + 'Swimming_pool', 'Roundabout', 'Soccer_ball_field', 'plane', + 'Harbor'), + palette=[(0, 0, 0), (0, 0, 63), (0, 63, 63), (0, 63, 0), (0, 63, 127), + (0, 63, 191), (0, 63, 255), (0, 127, 63), (0, 127, 127), + (0, 0, 127), (0, 0, 191), (0, 0, 255), (0, 191, 127), + (0, 127, 191), (0, 127, 255), (0, 100, 155)]) diff --git a/mmdetection/mmdet/datasets/lvis.py b/mmdetection/mmdet/datasets/lvis.py new file mode 100644 index 0000000..b9629f5 --- /dev/null +++ b/mmdetection/mmdet/datasets/lvis.py @@ -0,0 +1,638 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from typing import List + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class LVISV05Dataset(CocoDataset): + """LVIS v0.5 dataset for detection.""" + + METAINFO = { + 'classes': + ('acorn', 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', + 'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', + 'antenna', 'apple', 'apple_juice', 'applesauce', 'apricot', 'apron', + 'aquarium', 'armband', 'armchair', 'armoire', 'armor', 'artichoke', + 'trash_can', 'ashtray', 'asparagus', 'atomizer', 'avocado', 'award', + 'awning', 'ax', 'baby_buggy', 'basketball_backboard', 'backpack', + 'handbag', 'suitcase', 'bagel', 'bagpipe', 'baguet', 'bait', 'ball', + 'ballet_skirt', 'balloon', 'bamboo', 'banana', 'Band_Aid', 'bandage', + 'bandanna', 'banjo', 'banner', 'barbell', 'barge', 'barrel', + 'barrette', 'barrow', 'baseball_base', 'baseball', 'baseball_bat', + 'baseball_cap', 'baseball_glove', 'basket', 'basketball_hoop', + 'basketball', 'bass_horn', 'bat_(animal)', 'bath_mat', 'bath_towel', + 'bathrobe', 'bathtub', 'batter_(food)', 'battery', 'beachball', + 'bead', 'beaker', 'bean_curd', 'beanbag', 'beanie', 'bear', 'bed', + 'bedspread', 'cow', 'beef_(food)', 'beeper', 'beer_bottle', + 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', 'belt_buckle', + 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', 'binder', + 'binoculars', 'bird', 'birdfeeder', 'birdbath', 'birdcage', + 'birdhouse', 'birthday_cake', 'birthday_card', 'biscuit_(bread)', + 'pirate_flag', 'black_sheep', 'blackboard', 'blanket', 'blazer', + 'blender', 'blimp', 'blinker', 'blueberry', 'boar', 'gameboard', + 'boat', 'bobbin', 'bobby_pin', 'boiled_egg', 'bolo_tie', 'deadbolt', + 'bolt', 'bonnet', 'book', 'book_bag', 'bookcase', 'booklet', + 'bookmark', 'boom_microphone', 'boot', 'bottle', 'bottle_opener', + 'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)', 'bow-tie', + 'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'bowling_pin', + 'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere', + 'bread-bin', 'breechcloth', 'bridal_gown', 'briefcase', + 'bristle_brush', 'broccoli', 'broach', 'broom', 'brownie', + 'brussels_sprouts', 'bubble_gum', 'bucket', 'horse_buggy', 'bull', + 'bulldog', 'bulldozer', 'bullet_train', 'bulletin_board', + 'bulletproof_vest', 'bullhorn', 'corned_beef', 'bun', 'bunk_bed', + 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butcher_knife', + 'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', + 'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf', + 'camcorder', 'camel', 'camera', 
'camera_lens', 'camper_(vehicle)', + 'can', 'can_opener', 'candelabrum', 'candle', 'candle_holder', + 'candy_bar', 'candy_cane', 'walking_cane', 'canister', 'cannon', + 'canoe', 'cantaloup', 'canteen', 'cap_(headwear)', 'bottle_cap', + 'cape', 'cappuccino', 'car_(automobile)', 'railcar_(part_of_a_train)', + 'elevator_car', 'car_battery', 'identity_card', 'card', 'cardigan', + 'cargo_ship', 'carnation', 'horse_carriage', 'carrot', 'tote_bag', + 'cart', 'carton', 'cash_register', 'casserole', 'cassette', 'cast', + 'cat', 'cauliflower', 'caviar', 'cayenne_(spice)', 'CD_player', + 'celery', 'cellular_telephone', 'chain_mail', 'chair', + 'chaise_longue', 'champagne', 'chandelier', 'chap', 'checkbook', + 'checkerboard', 'cherry', 'chessboard', + 'chest_of_drawers_(furniture)', 'chicken_(animal)', 'chicken_wire', + 'chickpea', 'Chihuahua', 'chili_(vegetable)', 'chime', 'chinaware', + 'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar', + 'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker', + 'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider', + 'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet', + 'clasp', 'cleansing_agent', 'clementine', 'clip', 'clipboard', + 'clock', 'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag', + 'coaster', 'coat', 'coat_hanger', 'coatrack', 'cock', 'coconut', + 'coffee_filter', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil', + 'coin', 'colander', 'coleslaw', 'coloring_material', + 'combination_lock', 'pacifier', 'comic_book', 'computer_keyboard', + 'concrete_mixer', 'cone', 'control', 'convertible_(automobile)', + 'sofa_bed', 'cookie', 'cookie_jar', 'cooking_utensil', + 'cooler_(for_food)', 'cork_(bottle_plug)', 'corkboard', 'corkscrew', + 'edible_corn', 'cornbread', 'cornet', 'cornice', 'cornmeal', 'corset', + 'romaine_lettuce', 'costume', 'cougar', 'coverall', 'cowbell', + 'cowboy_hat', 'crab_(animal)', 'cracker', 'crape', 'crate', 'crayon', + 'cream_pitcher', 'credit_card', 'crescent_roll', 'crib', 'crock_pot', + 'crossbar', 'crouton', 'crow', 'crown', 'crucifix', 'cruise_ship', + 'police_cruiser', 'crumb', 'crutch', 'cub_(animal)', 'cube', + 'cucumber', 'cufflink', 'cup', 'trophy_cup', 'cupcake', 'hair_curler', + 'curling_iron', 'curtain', 'cushion', 'custard', 'cutting_tool', + 'cylinder', 'cymbal', 'dachshund', 'dagger', 'dartboard', + 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', + 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', + 'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', + 'dishwasher_detergent', 'diskette', 'dispenser', 'Dixie_cup', 'dog', + 'dog_collar', 'doll', 'dollar', 'dolphin', 'domestic_ass', 'eye_mask', + 'doorbell', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly', + 'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit', + 'dresser', 'drill', 'drinking_fountain', 'drone', 'dropper', + 'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling', + 'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', + 'Dutch_oven', 'eagle', 'earphone', 'earplug', 'earring', 'easel', + 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater', + 'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk', + 'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan', + 'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)', + 'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', + 'fire_alarm', 'fire_engine', 'fire_extinguisher', 'fire_hose', + 'fireplace', 'fireplug', 'fish', 'fish_(food)', 'fishbowl', + 
'fishing_boat', 'fishing_rod', 'flag', 'flagpole', 'flamingo', + 'flannel', 'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)', + 'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal', + 'folding_chair', 'food_processor', 'football_(American)', + 'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car', + 'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice', + 'fruit_salad', 'frying_pan', 'fudge', 'funnel', 'futon', 'gag', + 'garbage', 'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', + 'garlic', 'gasmask', 'gazelle', 'gelatin', 'gemstone', 'giant_panda', + 'gift_wrap', 'ginger', 'giraffe', 'cincture', + 'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles', + 'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose', + 'gorilla', 'gourd', 'surgical_gown', 'grape', 'grasshopper', 'grater', + 'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle', + 'grillroom', 'grinder_(tool)', 'grits', 'grizzly', 'grocery_bag', + 'guacamole', 'guitar', 'gull', 'gun', 'hair_spray', 'hairbrush', + 'hairnet', 'hairpin', 'ham', 'hamburger', 'hammer', 'hammock', + 'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel', + 'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw', + 'hardback_book', 'harmonium', 'hat', 'hatbox', 'hatch', 'veil', + 'headband', 'headboard', 'headlight', 'headscarf', 'headset', + 'headstall_(for_horses)', 'hearing_aid', 'heart', 'heater', + 'helicopter', 'helmet', 'heron', 'highchair', 'hinge', 'hippopotamus', + 'hockey_stick', 'hog', 'home_plate_(baseball)', 'honey', 'fume_hood', + 'hook', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', + 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', + 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', + 'ice_tea', 'igniter', 'incense', 'inhaler', 'iPod', + 'iron_(for_clothing)', 'ironing_board', 'jacket', 'jam', 'jean', + 'jeep', 'jelly_bean', 'jersey', 'jet_plane', 'jewelry', 'joystick', + 'jumpsuit', 'kayak', 'keg', 'kennel', 'kettle', 'key', 'keycard', + 'kilt', 'kimono', 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', + 'kiwi_fruit', 'knee_pad', 'knife', 'knight_(chess_piece)', + 'knitting_needle', 'knob', 'knocker_(on_a_door)', 'koala', 'lab_coat', + 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', 'lamb-chop', 'lamp', + 'lamppost', 'lampshade', 'lantern', 'lanyard', 'laptop_computer', + 'lasagna', 'latch', 'lawn_mower', 'leather', 'legging_(clothing)', + 'Lego', 'lemon', 'lemonade', 'lettuce', 'license_plate', 'life_buoy', + 'life_jacket', 'lightbulb', 'lightning_rod', 'lime', 'limousine', + 'linen_paper', 'lion', 'lip_balm', 'lipstick', 'liquor', 'lizard', + 'Loafer_(type_of_shoe)', 'log', 'lollipop', 'lotion', + 'speaker_(stereo_equipment)', 'loveseat', 'machine_gun', 'magazine', + 'magnet', 'mail_slot', 'mailbox_(at_home)', 'mallet', 'mammoth', + 'mandarin_orange', 'manger', 'manhole', 'map', 'marker', 'martini', + 'mascot', 'mashed_potato', 'masher', 'mask', 'mast', + 'mat_(gym_equipment)', 'matchbox', 'mattress', 'measuring_cup', + 'measuring_stick', 'meatball', 'medicine', 'melon', 'microphone', + 'microscope', 'microwave_oven', 'milestone', 'milk', 'minivan', + 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', 'money', + 'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor', + 'motor_scooter', 'motor_vehicle', 'motorboat', 'motorcycle', + 'mound_(baseball)', 'mouse_(animal_rodent)', + 'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom', + 'music_stool', 'musical_instrument', 
'nailfile', 'nameplate', + 'napkin', 'neckerchief', 'necklace', 'necktie', 'needle', 'nest', + 'newsstand', 'nightshirt', 'nosebag_(for_animals)', + 'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker', + 'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil', + 'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'oregano', + 'ostrich', 'ottoman', 'overalls_(clothing)', 'owl', 'packet', + 'inkpad', 'pad', 'paddle', 'padlock', 'paintbox', 'paintbrush', + 'painting', 'pajamas', 'palette', 'pan_(for_cooking)', + 'pan_(metal_container)', 'pancake', 'pantyhose', 'papaya', + 'paperclip', 'paper_plate', 'paper_towel', 'paperback_book', + 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', + 'parchment', 'parka', 'parking_meter', 'parrot', + 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', + 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', + 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'pegboard', + 'pelican', 'pen', 'pencil', 'pencil_box', 'pencil_sharpener', + 'pendulum', 'penguin', 'pennant', 'penny_(coin)', 'pepper', + 'pepper_mill', 'perfume', 'persimmon', 'baby', 'pet', 'petfood', + 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', + 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', + 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', + 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', + 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', + 'plate', 'platter', 'playing_card', 'playpen', 'pliers', + 'plow_(farm_equipment)', 'pocket_watch', 'pocketknife', + 'poker_(fire_stirring_tool)', 'pole', 'police_van', 'polo_shirt', + 'poncho', 'pony', 'pool_table', 'pop_(soda)', 'portrait', + 'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', + 'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', + 'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune', + 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher', + 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit', + 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish', + 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat', + 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', + 'recliner', 'record_player', 'red_cabbage', 'reflector', + 'remote_control', 'rhinoceros', 'rib_(food)', 'rifle', 'ring', + 'river_boat', 'road_map', 'robe', 'rocking_chair', 'roller_skate', + 'Rollerblade', 'rolling_pin', 'root_beer', + 'router_(computer_equipment)', 'rubber_band', 'runner_(carpet)', + 'plastic_bag', 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', + 'safety_pin', 'sail', 'salad', 'salad_plate', 'salami', + 'salmon_(fish)', 'salmon_(food)', 'salsa', 'saltshaker', + 'sandal_(type_of_shoe)', 'sandwich', 'satchel', 'saucepan', 'saucer', + 'sausage', 'sawhorse', 'saxophone', 'scale_(measuring_instrument)', + 'scarecrow', 'scarf', 'school_bus', 'scissors', 'scoreboard', + 'scrambled_eggs', 'scraper', 'scratcher', 'screwdriver', + 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', + 'seashell', 'seedling', 'serving_dish', 'sewing_machine', 'shaker', + 'shampoo', 'shark', 'sharpener', 'Sharpie', 'shaver_(electric)', + 'shaving_cream', 'shawl', 'shears', 'sheep', 'shepherd_dog', + 'sherbert', 'shield', 'shirt', 'shoe', 'shopping_bag', + 'shopping_cart', 'short_pants', 'shot_glass', 'shoulder_bag', + 'shovel', 'shower_head', 'shower_curtain', 'shredder_(for_paper)', + 'sieve', 'signboard', 'silo', 
'sink', 'skateboard', 'skewer', 'ski', + 'ski_boot', 'ski_parka', 'ski_pole', 'skirt', 'sled', 'sleeping_bag', + 'sling_(bandage)', 'slipper_(footwear)', 'smoothie', 'snake', + 'snowboard', 'snowman', 'snowmobile', 'soap', 'soccer_ball', 'sock', + 'soda_fountain', 'carbonated_water', 'sofa', 'softball', + 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', + 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', + 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'sponge', + 'spoon', 'sportswear', 'spotlight', 'squirrel', + 'stapler_(stapling_machine)', 'starfish', 'statue_(sculpture)', + 'steak_(food)', 'steak_knife', 'steamer_(kitchen_appliance)', + 'steering_wheel', 'stencil', 'stepladder', 'step_stool', + 'stereo_(sound_system)', 'stew', 'stirrer', 'stirrup', + 'stockings_(leg_wear)', 'stool', 'stop_sign', 'brake_light', 'stove', + 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry', + 'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer', + 'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', + 'sunglasses', 'sunhat', 'sunscreen', 'surfboard', 'sushi', 'mop', + 'sweat_pants', 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', + 'swimsuit', 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', + 'table', 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', + 'taillight', 'tambourine', 'army_tank', 'tank_(storage_vessel)', + 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', + 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', + 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', + 'telephone_pole', 'telephoto_lens', 'television_camera', + 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', + 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', + 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', + 'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', + 'toaster_oven', 'toilet', 'toilet_tissue', 'tomato', 'tongs', + 'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover', + 'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy', + 'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike', + 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray', + 'tree_house', 'trench_coat', 'triangle_(musical_instrument)', + 'tricycle', 'tripod', 'trousers', 'truck', 'truffle_(chocolate)', + 'trunk', 'vat', 'turban', 'turkey_(bird)', 'turkey_(food)', 'turnip', + 'turtle', 'turtleneck_(clothing)', 'typewriter', 'umbrella', + 'underwear', 'unicycle', 'urinal', 'urn', 'vacuum_cleaner', 'valve', + 'vase', 'vending_machine', 'vent', 'videotape', 'vinegar', 'violin', + 'vodka', 'volleyball', 'vulture', 'waffle', 'waffle_iron', 'wagon', + 'wagon_wheel', 'walking_stick', 'wall_clock', 'wall_socket', 'wallet', + 'walrus', 'wardrobe', 'wasabi', 'automatic_washer', 'watch', + 'water_bottle', 'water_cooler', 'water_faucet', 'water_filter', + 'water_heater', 'water_jug', 'water_gun', 'water_scooter', + 'water_ski', 'water_tower', 'watering_can', 'watermelon', + 'weathervane', 'webcam', 'wedding_cake', 'wedding_ring', 'wet_suit', + 'wheel', 'wheelchair', 'whipped_cream', 'whiskey', 'whistle', 'wick', + 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)', + 'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket', + 'wineglass', 'wing_chair', 'blinder_(for_horses)', 'wok', 'wolf', + 'wooden_spoon', 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', + 'yak', 'yogurt', 'yoke_(animal_equipment)', 'zebra', 'zucchini'), + 
'palette':
+ None
+ }
+
+ def load_data_list(self) -> List[dict]:
+ """Load annotations from an annotation file named as ``self.ann_file``
+
+ Returns:
+ List[dict]: A list of annotation.
+ """ # noqa: E501
+ try:
+ import lvis
+ if getattr(lvis, '__version__', '0') >= '10.5.3':
+ warnings.warn(
+ 'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"', # noqa: E501
+ UserWarning)
+ from lvis import LVIS
+ except ImportError:
+ raise ImportError(
+ 'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".' # noqa: E501
+ )
+ with get_local_path(
+ self.ann_file, backend_args=self.backend_args) as local_path:
+ self.lvis = LVIS(local_path)
+ self.cat_ids = self.lvis.get_cat_ids()
+ self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
+ self.cat_img_map = copy.deepcopy(self.lvis.cat_img_map)
+
+ img_ids = self.lvis.get_img_ids()
+ data_list = []
+ total_ann_ids = []
+ for img_id in img_ids:
+ raw_img_info = self.lvis.load_imgs([img_id])[0]
+ raw_img_info['img_id'] = img_id
+ if raw_img_info['file_name'].startswith('COCO'):
+ # Convert from the COCO 2014 file naming convention of
+ # COCO_[train/val/test]2014_000000000000.jpg to the 2017
+ # naming convention of 000000000000.jpg
+ # (LVIS v1 will fix this naming issue)
+ raw_img_info['file_name'] = raw_img_info['file_name'][-16:]
+ ann_ids = self.lvis.get_ann_ids(img_ids=[img_id])
+ raw_ann_info = self.lvis.load_anns(ann_ids)
+ total_ann_ids.extend(ann_ids)
+
+ parsed_data_info = self.parse_data_info({
+ 'raw_ann_info':
+ raw_ann_info,
+ 'raw_img_info':
+ raw_img_info
+ })
+ data_list.append(parsed_data_info)
+ if self.ANN_ID_UNIQUE:
+ assert len(set(total_ann_ids)) == len(
+ total_ann_ids
+ ), f"Annotation ids in '{self.ann_file}' are not unique!"
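+ # `self.lvis` is only needed while building `data_list`;
+ # it is deleted below so the full LVIS API object is not
+ # kept in memory.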
+ + del self.lvis + + return data_list + + +LVISDataset = LVISV05Dataset +DATASETS.register_module(name='LVISDataset', module=LVISDataset) + + +@DATASETS.register_module() +class LVISV1Dataset(LVISDataset): + """LVIS v1 dataset for detection.""" + + METAINFO = { + 'classes': + ('aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', + 'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', + 'antenna', 'apple', 'applesauce', 'apricot', 'apron', 'aquarium', + 'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor', + 'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer', + 'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy', + 'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel', + 'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon', + 'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo', + 'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow', + 'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap', + 'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)', + 'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)', + 'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie', + 'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper', + 'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', + 'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', + 'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath', + 'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card', + 'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket', + 'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry', + 'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg', + 'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase', + 'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle', + 'bottle_opener', 'bouquet', 'bow_(weapon)', + 'bow_(decorative_ribbons)', 'bow-tie', 'bowl', 'pipe_bowl', + 'bowler_hat', 'bowling_ball', 'box', 'boxing_glove', 'suspenders', + 'bracelet', 'brass_plaque', 'brassiere', 'bread-bin', 'bread', + 'breechcloth', 'bridal_gown', 'briefcase', 'broccoli', 'broach', + 'broom', 'brownie', 'brussels_sprouts', 'bubble_gum', 'bucket', + 'horse_buggy', 'bull', 'bulldog', 'bulldozer', 'bullet_train', + 'bulletin_board', 'bulletproof_vest', 'bullhorn', 'bun', 'bunk_bed', + 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butter', + 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', 'cabinet', + 'locker', 'cake', 'calculator', 'calendar', 'calf', 'camcorder', + 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', 'can', + 'can_opener', 'candle', 'candle_holder', 'candy_bar', 'candy_cane', + 'walking_cane', 'canister', 'canoe', 'cantaloup', 'canteen', + 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino', + 'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car', + 'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship', + 'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton', + 'cash_register', 'casserole', 'cassette', 'cast', 'cat', + 'cauliflower', 'cayenne_(spice)', 'CD_player', 'celery', + 'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue', + 'chalice', 'chandelier', 'chap', 'checkbook', 'checkerboard', + 'cherry', 'chessboard', 'chicken_(animal)', 'chickpea', + 'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)', + 'poker_chip', 'chocolate_bar', 'chocolate_cake', 'chocolate_milk', + 'chocolate_mousse', 'choker', 
'chopping_board', 'chopstick', + 'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette', + 'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent', + 'cleat_(for_securing_rope)', 'clementine', 'clip', 'clipboard', + 'clippers_(for_plants)', 'cloak', 'clock', 'clock_tower', + 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat', + 'coat_hanger', 'coatrack', 'cock', 'cockroach', 'cocoa_(beverage)', + 'coconut', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil', + 'coin', 'colander', 'coleslaw', 'coloring_material', + 'combination_lock', 'pacifier', 'comic_book', 'compass', + 'computer_keyboard', 'condiment', 'cone', 'control', + 'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie', + 'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)', + 'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet', + 'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall', + 'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker', + 'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib', + 'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown', + 'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch', + 'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup', + 'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain', + 'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard', + 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', + 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', + 'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', + 'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup', + 'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin', + 'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', + 'dragonfly', 'drawer', 'underdrawers', 'dress', 'dress_hat', + 'dress_suit', 'dresser', 'drill', 'drone', 'dropper', + 'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling', + 'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', 'eagle', + 'earphone', 'earplug', 'earring', 'easel', 'eclair', 'eel', 'egg', + 'egg_roll', 'egg_yolk', 'eggbeater', 'eggplant', 'electric_chair', + 'refrigerator', 'elephant', 'elk', 'envelope', 'eraser', 'escargot', + 'eyepatch', 'falcon', 'fan', 'faucet', 'fedora', 'ferret', + 'Ferris_wheel', 'ferry', 'fig_(fruit)', 'fighter_jet', 'figurine', + 'file_cabinet', 'file_(tool)', 'fire_alarm', 'fire_engine', + 'fire_extinguisher', 'fire_hose', 'fireplace', 'fireplug', + 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl', 'fishing_rod', + 'flag', 'flagpole', 'flamingo', 'flannel', 'flap', 'flash', + 'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)', + 'flower_arrangement', 'flute_glass', 'foal', 'folding_chair', + 'food_processor', 'football_(American)', 'football_helmet', + 'footstool', 'fork', 'forklift', 'freight_car', 'French_toast', + 'freshener', 'frisbee', 'frog', 'fruit_juice', 'frying_pan', 'fudge', + 'funnel', 'futon', 'gag', 'garbage', 'garbage_truck', 'garden_hose', + 'gargle', 'gargoyle', 'garlic', 'gasmask', 'gazelle', 'gelatin', + 'gemstone', 'generator', 'giant_panda', 'gift_wrap', 'ginger', + 'giraffe', 'cincture', 'glass_(drink_container)', 'globe', 'glove', + 'goat', 'goggles', 'goldfish', 'golf_club', 'golfcart', + 'gondola_(boat)', 'goose', 'gorilla', 'gourd', 'grape', 'grater', + 'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle', + 'grill', 'grits', 'grizzly', 'grocery_bag', 'guitar', 'gull', 'gun', + 'hairbrush', 'hairnet', 
'hairpin', 'halter_top', 'ham', 'hamburger', + 'hammer', 'hammock', 'hamper', 'hamster', 'hair_dryer', 'hand_glass', + 'hand_towel', 'handcart', 'handcuff', 'handkerchief', 'handle', + 'handsaw', 'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', + 'headband', 'headboard', 'headlight', 'headscarf', 'headset', + 'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet', + 'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog', + 'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah', + 'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', + 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', + 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', + 'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board', + 'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey', + 'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak', + 'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono', + 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit', + 'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)', + 'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', + 'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard', + 'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather', + 'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', + 'lettuce', 'license_plate', 'life_buoy', 'life_jacket', 'lightbulb', + 'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor', + 'lizard', 'log', 'lollipop', 'speaker_(stereo_equipment)', 'loveseat', + 'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)', + 'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', + 'manger', 'manhole', 'map', 'marker', 'martini', 'mascot', + 'mashed_potato', 'masher', 'mask', 'mast', 'mat_(gym_equipment)', + 'matchbox', 'mattress', 'measuring_cup', 'measuring_stick', + 'meatball', 'medicine', 'melon', 'microphone', 'microscope', + 'microwave_oven', 'milestone', 'milk', 'milk_can', 'milkshake', + 'minivan', 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', + 'money', 'monitor_(computer_equipment) computer_monitor', 'monkey', + 'motor', 'motor_scooter', 'motor_vehicle', 'motorcycle', + 'mound_(baseball)', 'mouse_(computer_equipment)', 'mousepad', + 'muffin', 'mug', 'mushroom', 'music_stool', 'musical_instrument', + 'nailfile', 'napkin', 'neckerchief', 'necklace', 'necktie', 'needle', + 'nest', 'newspaper', 'newsstand', 'nightshirt', + 'nosebag_(for_animals)', 'noseband_(for_animals)', 'notebook', + 'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)', + 'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion', + 'orange_(fruit)', 'orange_juice', 'ostrich', 'ottoman', 'oven', + 'overalls_(clothing)', 'owl', 'packet', 'inkpad', 'pad', 'paddle', + 'padlock', 'paintbrush', 'painting', 'pajamas', 'palette', + 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose', + 'papaya', 'paper_plate', 'paper_towel', 'paperback_book', + 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', + 'parasol', 'parchment', 'parka', 'parking_meter', 'parrot', + 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', + 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', + 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg', + 'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box', + 'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)', + 'pepper', 'pepper_mill', 'perfume', 
'persimmon', 'person', 'pet', + 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', + 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', + 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', + 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', + 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', + 'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)', + 'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)', + 'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)', + 'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', + 'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', + 'pretzel', 'printer', 'projectile_(weapon)', 'projector', 'propeller', + 'prune', 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', + 'puncher', 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', + 'rabbit', 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', + 'radish', 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', + 'rat', 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', + 'recliner', 'record_player', 'reflector', 'remote_control', + 'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map', + 'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade', + 'rolling_pin', 'root_beer', 'router_(computer_equipment)', + 'rubber_band', 'runner_(carpet)', 'plastic_bag', + 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin', + 'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)', + 'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)', + 'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse', + 'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf', + 'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver', + 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', + 'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark', + 'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl', + 'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt', + 'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass', + 'shoulder_bag', 'shovel', 'shower_head', 'shower_cap', + 'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink', + 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole', + 'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)', + 'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman', + 'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball', + 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', + 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', + 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish', + 'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)', + 'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish', + 'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel', + 'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', + 'stirrer', 'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', + 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry', + 'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer', + 'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', + 'sunglasses', 'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', + 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', + 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', 
'table', + 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight', + 'tambourine', 'army_tank', 'tank_(storage_vessel)', + 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', + 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', + 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', + 'telephone_pole', 'telephoto_lens', 'television_camera', + 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', + 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', + 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', + 'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', + 'toaster_oven', 'toilet', 'toilet_tissue', 'tomato', 'tongs', + 'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover', + 'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy', + 'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike', + 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray', + 'trench_coat', 'triangle_(musical_instrument)', 'tricycle', 'tripod', + 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat', 'turban', + 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)', + 'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn', + 'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest', + 'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture', + 'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick', + 'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe', + 'washbasin', 'automatic_washer', 'watch', 'water_bottle', + 'water_cooler', 'water_faucet', 'water_heater', 'water_jug', + 'water_gun', 'water_scooter', 'water_ski', 'water_tower', + 'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake', + 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream', + 'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)', + 'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket', + 'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', + 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt', + 'yoke_(animal_equipment)', 'zebra', 'zucchini'), + 'palette': + None + } + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + try: + import lvis + if getattr(lvis, '__version__', '0') >= '10.5.3': + warnings.warn( + 'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"', # noqa: E501 + UserWarning) + from lvis import LVIS + except ImportError: + raise ImportError( + 'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".' # noqa: E501 + ) + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.lvis = LVIS(local_path) + self.cat_ids = self.lvis.get_cat_ids() + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.lvis.cat_img_map) + + img_ids = self.lvis.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.lvis.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + # coco_url is used in LVISv1 instead of file_name + # e.g. 
http://images.cocodataset.org/train2017/000000391895.jpg + # train/val split in specified in url + raw_img_info['file_name'] = raw_img_info['coco_url'].replace( + 'http://images.cocodataset.org/', '') + ann_ids = self.lvis.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.lvis.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.lvis + + return data_list diff --git a/mmdetection/mmdet/datasets/mot_challenge_dataset.py b/mmdetection/mmdet/datasets/mot_challenge_dataset.py new file mode 100644 index 0000000..ffbdc48 --- /dev/null +++ b/mmdetection/mmdet/datasets/mot_challenge_dataset.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List, Union + +from mmdet.registry import DATASETS +from .base_video_dataset import BaseVideoDataset + + +@DATASETS.register_module() +class MOTChallengeDataset(BaseVideoDataset): + """Dataset for MOTChallenge. + + Args: + visibility_thr (float, optional): The minimum visibility + for the objects during training. Default to -1. + """ + + METAINFO = { + 'classes': + ('pedestrian', 'person_on_vehicle', 'car', 'bicycle', 'motorbike', + 'non_mot_vehicle', 'static_person', 'distractor', 'occluder', + 'occluder_on_ground', 'occluder_full', 'reflection', 'crowd') + } + + def __init__(self, visibility_thr: float = -1, *args, **kwargs): + self.visibility_thr = visibility_thr + super().__init__(*args, **kwargs) + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. The difference between this + function and the one in ``BaseVideoDataset`` is that the parsing here + adds ``visibility`` and ``mot_conf``. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. 
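The LVIS loader above derives file_name by stripping the COCO image host from coco_url, so the train/val split encoded in the URL survives as the leading directory. A minimal standalone sketch of that mapping, reusing the example URL from the comment above:

    # Sketch of the coco_url -> file_name mapping used by the LVIS loader above.
    def coco_url_to_file_name(coco_url: str) -> str:
        """Keep only the 'train2017/xxx.jpg'-style suffix of a COCO image URL."""
        return coco_url.replace('http://images.cocodataset.org/', '')

    print(coco_url_to_file_name(
        'http://images.cocodataset.org/train2017/000000391895.jpg'))
    # -> train2017/000000391895.jpg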
+ """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + data_info = {} + + data_info.update(img_info) + if self.data_prefix.get('img_path', None) is not None: + img_path = osp.join(self.data_prefix['img_path'], + img_info['file_name']) + else: + img_path = img_info['file_name'] + data_info['img_path'] = img_path + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if (not self.test_mode) and (ann['visibility'] < + self.visibility_thr): + continue + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + instance['instance_id'] = ann['instance_id'] + instance['category_id'] = ann['category_id'] + instance['mot_conf'] = ann['mot_conf'] + instance['visibility'] = ann['visibility'] + if len(instance) > 0: + instances.append(instance) + if not self.test_mode: + assert len(instances) > 0, f'No valid instances found in ' \ + f'image {data_info["img_path"]}!' + data_info['instances'] = instances + return data_info diff --git a/mmdetection/mmdet/datasets/objects365.py b/mmdetection/mmdet/datasets/objects365.py new file mode 100644 index 0000000..e99869b --- /dev/null +++ b/mmdetection/mmdet/datasets/objects365.py @@ -0,0 +1,284 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from typing import List + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .api_wrappers import COCO +from .coco import CocoDataset + +# images exist in annotations but not in image folder. 
+objv2_ignore_list = [ + osp.join('patch16', 'objects365_v2_00908726.jpg'), + osp.join('patch6', 'objects365_v1_00320532.jpg'), + osp.join('patch6', 'objects365_v1_00320534.jpg'), +] + + +@DATASETS.register_module() +class Objects365V1Dataset(CocoDataset): + """Objects365 v1 dataset for detection.""" + + METAINFO = { + 'classes': + ('person', 'sneakers', 'chair', 'hat', 'lamp', 'bottle', + 'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk', + 'handbag', 'street lights', 'book', 'plate', 'helmet', + 'leather shoes', 'pillow', 'glove', 'potted plant', 'bracelet', + 'flower', 'tv', 'storage box', 'vase', 'bench', 'wine glass', 'boots', + 'bowl', 'dining table', 'umbrella', 'boat', 'flag', 'speaker', + 'trash bin/can', 'stool', 'backpack', 'couch', 'belt', 'carpet', + 'basket', 'towel/napkin', 'slippers', 'barrel/bucket', 'coffee table', + 'suv', 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', + 'microphone', 'sandals', 'canned', 'necklace', 'mirror', 'faucet', + 'bicycle', 'bread', 'high heels', 'ring', 'van', 'watch', 'sink', + 'horse', 'fish', 'apple', 'camera', 'candle', 'teddy bear', 'cake', + 'motorcycle', 'wild bird', 'laptop', 'knife', 'traffic sign', + 'cell phone', 'paddle', 'truck', 'cow', 'power outlet', 'clock', + 'drum', 'fork', 'bus', 'hanger', 'nightstand', 'pot/pan', 'sheep', + 'guitar', 'traffic cone', 'tea pot', 'keyboard', 'tripod', 'hockey', + 'fan', 'dog', 'spoon', 'blackboard/whiteboard', 'balloon', + 'air conditioner', 'cymbal', 'mouse', 'telephone', 'pickup truck', + 'orange', 'banana', 'airplane', 'luggage', 'skis', 'soccer', + 'trolley', 'oven', 'remote', 'baseball glove', 'paper towel', + 'refrigerator', 'train', 'tomato', 'machinery vehicle', 'tent', + 'shampoo/shower gel', 'head phone', 'lantern', 'donut', + 'cleaning products', 'sailboat', 'tangerine', 'pizza', 'kite', + 'computer box', 'elephant', 'toiletries', 'gas stove', 'broccoli', + 'toilet', 'stroller', 'shovel', 'baseball bat', 'microwave', + 'skateboard', 'surfboard', 'surveillance camera', 'gun', 'life saver', + 'cat', 'lemon', 'liquid soap', 'zebra', 'duck', 'sports car', + 'giraffe', 'pumpkin', 'piano', 'stop sign', 'radiator', 'converter', + 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies', + 'cutting/chopping board', 'tennis racket', 'candy', + 'skating and skiing shoes', 'scissors', 'folder', 'baseball', + 'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine', + 'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear', + 'american football', 'basketball', 'potato', 'paint brush', 'printer', + 'billiards', 'fire hydrant', 'goose', 'projector', 'sausage', + 'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball', + 'chopsticks', 'electronic stove and gas stove', 'pie', 'frisbee', + 'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender', + 'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango', + 'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion', + 'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale', + 'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple', + 'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle', + 'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar', + 'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD', + 'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado', + 'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear', + 'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn', + 'key', 'screwdriver', 
'globe', 'broom', 'pliers', 'volleyball', + 'hammer', 'eggplant', 'trophy', 'dates', 'board eraser', 'rice', + 'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel', + 'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', + 'antelope', 'shrimp', 'rickshaw', 'trombone', 'pomegranate', + 'coconut', 'jellyfish', 'mushroom', 'calculator', 'treadmill', + 'butterfly', 'egg tart', 'cheese', 'pig', 'pomelo', 'race car', + 'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 'hair drier', + 'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', + 'electric drill', 'spring rolls', 'tortoise/turtle', 'parrot', + 'flute', 'measuring cup', 'shark', 'steak', 'poker card', + 'binoculars', 'llama', 'radish', 'noodles', 'yak', 'mop', 'crab', + 'microscope', 'barbell', 'bread/bun', 'baozi', 'lion', 'red cabbage', + 'polar bear', 'lighter', 'seal', 'mangosteen', 'comb', 'eraser', + 'pitaya', 'scallop', 'pencil case', 'saw', 'table tennis paddle', + 'okra', 'starfish', 'eagle', 'monkey', 'durian', 'game board', + 'rabbit', 'french horn', 'ambulance', 'asparagus', 'hoverboard', + 'pasta', 'target', 'hotair balloon', 'chainsaw', 'lobster', 'iron', + 'flashlight'), + 'palette': + None + } + + COCOAPI = COCO + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + + # 'categories' list in objects365_train.json and objects365_val.json + # is inconsistent, need sort list(or dict) before get cat_ids. + cats = self.coco.cats + sorted_cats = {i: cats[i] for i in sorted(cats)} + self.coco.cats = sorted_cats + categories = self.coco.dataset['categories'] + sorted_categories = sorted(categories, key=lambda i: i['id']) + self.coco.dataset['categories'] = sorted_categories + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" 
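Objects365V1Dataset above sorts both coco.cats and the raw categories list by id before resolving cat_ids, because the two Objects365 v1 annotation files list categories in different orders. The same normalization on toy data (ids and names invented):

    # Toy illustration of the category sorting above; not real Objects365 data.
    cats = {3: {'id': 3, 'name': 'chair'},
            1: {'id': 1, 'name': 'person'},
            2: {'id': 2, 'name': 'sneakers'}}
    categories = list(cats.values())

    sorted_cats = {i: cats[i] for i in sorted(cats)}
    sorted_categories = sorted(categories, key=lambda c: c['id'])

    print(list(sorted_cats))                       # [1, 2, 3]
    print([c['name'] for c in sorted_categories])  # ['person', 'sneakers', 'chair']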
+ + del self.coco + + return data_list + + +@DATASETS.register_module() +class Objects365V2Dataset(CocoDataset): + """Objects365 v2 dataset for detection.""" + METAINFO = { + 'classes': + ('Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp', + 'Glasses', 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf', + 'Handbag/Satchel', 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet', + 'Book', 'Gloves', 'Storage box', 'Boat', 'Leather Shoes', 'Flower', + 'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', 'Pillow', 'Boots', + 'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', 'Belt', + 'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker', + 'Watch', 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool', + 'Barrel/bucket', 'Van', 'Couch', 'Sandals', 'Bakset', 'Drum', + 'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', 'Motorcycle', + 'Guitar', 'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned', + 'Truck', 'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel', + 'Stuffed Toy', 'Candle', 'Sailboat', 'Laptop', 'Awning', 'Bed', + 'Faucet', 'Tent', 'Horse', 'Mirror', 'Power outlet', 'Sink', 'Apple', + 'Air Conditioner', 'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck', + 'Fork', 'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon', 'Clock', + 'Pot', 'Cow', 'Cake', 'Dinning Table', 'Sheep', 'Hanger', + 'Blackboard/Whiteboard', 'Napkin', 'Other Fish', 'Orange/Tangerine', + 'Toiletry', 'Keyboard', 'Tomato', 'Lantern', 'Machinery Vehicle', + 'Fan', 'Green Vegetables', 'Banana', 'Baseball Glove', 'Airplane', + 'Mouse', 'Train', 'Pumpkin', 'Soccer', 'Skiboard', 'Luggage', + 'Nightstand', 'Tea pot', 'Telephone', 'Trolley', 'Head Phone', + 'Sports Car', 'Stop Sign', 'Dessert', 'Scooter', 'Stroller', 'Crane', + 'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck', 'Baseball Bat', + 'Surveillance Camera', 'Cat', 'Jug', 'Broccoli', 'Piano', 'Pizza', + 'Elephant', 'Skateboard', 'Surfboard', 'Gun', + 'Skating and Skiing shoes', 'Gas stove', 'Donut', 'Bow Tie', 'Carrot', + 'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', 'Pepper', + 'Computer Box', 'Toilet Paper', 'Cleaning Products', 'Chopsticks', + 'Microwave', 'Pigeon', 'Baseball', 'Cutting/chopping Board', + 'Coffee Table', 'Side Table', 'Scissors', 'Marker', 'Pie', 'Ladder', + 'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball', + 'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle', + 'Violin', 'Egg', 'Fire Extinguisher', 'Candy', 'Fire Truck', + 'Billards', 'Converter', 'Bathtub', 'Wheelchair', 'Golf Club', + 'Briefcase', 'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', 'Pear', + 'Heavy Truck', 'Hamburger', 'Extractor', 'Extention Cord', 'Tong', + 'Tennis Racket', 'Folder', 'American Football', 'earphone', 'Mask', + 'Kettle', 'Tennis', 'Ship', 'Swing', 'Coffee Machine', 'Slide', + 'Carriage', 'Onion', 'Green beans', 'Projector', 'Frisbee', + 'Washing Machine/Drying Machine', 'Chicken', 'Printer', 'Watermelon', + 'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', 'Hotair ballon', + 'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog', + 'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer', + 'Goose', 'Tape', 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple', + 'Golf Ball', 'Ambulance', 'Parking meter', 'Mango', 'Key', 'Hurdle', + 'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', 'Megaphone', + 'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion', + 'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom', + 'Trombone', 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit', 
+ 'Router/modem', 'Poker Card', 'Toaster', 'Shrimp', 'Sushi', 'Cheese', + 'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', 'Cue', + 'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap', + 'Recorder', 'Bear', 'Eggplant', 'Board Eraser', 'Coconut', + 'Tape Measur/ Ruler', 'Pig', 'Showerhead', 'Globe', 'Chips', 'Steak', + 'Crosswalk Sign', 'Stapler', 'Campel', 'Formula 1 ', 'Pomegranate', + 'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', 'Rice Cooker', + 'Tuba', 'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal', + 'Buttefly', 'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin', + 'Electric Drill', 'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill', + 'Lighter', 'Grapefruit', 'Game board', 'Mop', 'Radish', 'Baozi', + 'Target', 'French', 'Spring Rolls', 'Monkey', 'Rabbit', 'Pencil Case', + 'Yak', 'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', 'Scallop', + 'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle', + 'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 'Eraser', 'Lobster', + 'Durian', 'Okra', 'Lipstick', 'Cosmetics Mirror', 'Curling', + 'Table Tennis '), + 'palette': + None + } + + COCOAPI = COCO + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + # file_name should be `patchX/xxx.jpg` + file_name = osp.join( + osp.split(osp.split(raw_img_info['file_name'])[0])[-1], + osp.split(raw_img_info['file_name'])[-1]) + + if file_name in objv2_ignore_list: + continue + + raw_img_info['file_name'] = file_name + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + + return data_list diff --git a/mmdetection/mmdet/datasets/openimages.py b/mmdetection/mmdet/datasets/openimages.py new file mode 100644 index 0000000..a3c6c8e --- /dev/null +++ b/mmdetection/mmdet/datasets/openimages.py @@ -0,0 +1,484 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import csv +import os.path as osp +from collections import defaultdict +from typing import Dict, List, Optional + +import numpy as np +from mmengine.fileio import get_local_path, load +from mmengine.utils import is_abs + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class OpenImagesDataset(BaseDetDataset): + """Open Images dataset for detection. + + Args: + ann_file (str): Annotation file path. 
+ label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + meta_file (str): File path to get image metas. + hierarchy_file (str): The file path of the class hierarchy. + image_level_ann_file (str): Human-verified image level annotation, + which is used in evaluation. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + """ + + METAINFO: dict = dict(dataset_type='oid_v6') + + def __init__(self, + label_file: str, + meta_file: str, + hierarchy_file: str, + image_level_ann_file: Optional[str] = None, + **kwargs) -> None: + self.label_file = label_file + self.meta_file = meta_file + self.hierarchy_file = hierarchy_file + self.image_level_ann_file = image_level_ann_file + super().__init__(**kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ + classes_names, label_id_mapping = self._parse_label_file( + self.label_file) + self._metainfo['classes'] = classes_names + self.label_id_mapping = label_id_mapping + + if self.image_level_ann_file is not None: + img_level_anns = self._parse_img_level_ann( + self.image_level_ann_file) + else: + img_level_anns = None + + # OpenImagesMetric can get the relation matrix from the dataset meta + relation_matrix = self._get_relation_matrix(self.hierarchy_file) + self._metainfo['RELATION_MATRIX'] = relation_matrix + + data_list = [] + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + last_img_id = None + instances = [] + for i, line in enumerate(reader): + if i == 0: + continue + img_id = line[0] + if last_img_id is None: + last_img_id = img_id + label_id = line[2] + assert label_id in self.label_id_mapping + label = int(self.label_id_mapping[label_id]) + bbox = [ + float(line[4]), # xmin + float(line[6]), # ymin + float(line[5]), # xmax + float(line[7]) # ymax + ] + is_occluded = True if int(line[8]) == 1 else False + is_truncated = True if int(line[9]) == 1 else False + is_group_of = True if int(line[10]) == 1 else False + is_depiction = True if int(line[11]) == 1 else False + is_inside = True if int(line[12]) == 1 else False + + instance = dict( + bbox=bbox, + bbox_label=label, + ignore_flag=0, + is_occluded=is_occluded, + is_truncated=is_truncated, + is_group_of=is_group_of, + is_depiction=is_depiction, + is_inside=is_inside) + last_img_path = osp.join(self.data_prefix['img'], + f'{last_img_id}.jpg') + if img_id != last_img_id: + # switch to a new image, record previous image's data. 
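Each row of the box CSV parsed above carries the image id in column 0, the MID label in column 2, normalized XMin/XMax/YMin/YMax in columns 4-7 and the occluded/truncated/group-of/depiction/inside flags in columns 8-12. A condensed sketch of turning one such row into an instance dict; the row values and the MID-to-label mapping are fabricated, only the column layout follows the parser above:

    # Fabricated CSV row following the column layout the parser above expects.
    label_id_mapping = {'/m/01g317': 0}   # toy MID -> contiguous label mapping

    row = ['0001eeaf4aed83f9', 'xclick', '/m/01g317', '1',
           '0.10', '0.60', '0.20', '0.90', '0', '0', '1', '0', '0']

    instance = dict(
        bbox=[float(row[4]), float(row[6]), float(row[5]), float(row[7])],
        bbox_label=label_id_mapping[row[2]],
        ignore_flag=0,
        is_occluded=int(row[8]) == 1,
        is_truncated=int(row[9]) == 1,
        is_group_of=int(row[10]) == 1,
        is_depiction=int(row[11]) == 1,
        is_inside=int(row[12]) == 1)
    print(instance['bbox'])  # [0.1, 0.2, 0.6, 0.9], still normalized here

The boxes stay normalized at this point; load_data_list rescales them by the image width and height read from the meta file afterwards.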
+ data_info = dict( + img_path=last_img_path, + img_id=last_img_id, + instances=instances, + ) + data_list.append(data_info) + instances = [] + instances.append(instance) + last_img_id = img_id + data_list.append( + dict( + img_path=last_img_path, + img_id=last_img_id, + instances=instances, + )) + + # add image metas to data list + img_metas = load( + self.meta_file, file_format='pkl', backend_args=self.backend_args) + assert len(img_metas) == len(data_list) + for i, meta in enumerate(img_metas): + img_id = data_list[i]['img_id'] + assert f'{img_id}.jpg' == osp.split(meta['filename'])[-1] + h, w = meta['ori_shape'][:2] + data_list[i]['height'] = h + data_list[i]['width'] = w + # denormalize bboxes + for j in range(len(data_list[i]['instances'])): + data_list[i]['instances'][j]['bbox'][0] *= w + data_list[i]['instances'][j]['bbox'][2] *= w + data_list[i]['instances'][j]['bbox'][1] *= h + data_list[i]['instances'][j]['bbox'][3] *= h + # add image-level annotation + if img_level_anns is not None: + img_labels = [] + confidences = [] + img_ann_list = img_level_anns.get(img_id, []) + for ann in img_ann_list: + img_labels.append(int(ann['image_level_label'])) + confidences.append(float(ann['confidence'])) + data_list[i]['image_level_labels'] = np.array( + img_labels, dtype=np.int64) + data_list[i]['confidences'] = np.array( + confidences, dtype=np.float32) + return data_list + + def _parse_label_file(self, label_file: str) -> tuple: + """Get classes name and index mapping from cls-label-description file. + + Args: + label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + + Returns: + tuple: Class name of OpenImages. + """ + + index_list = [] + classes_names = [] + with get_local_path( + label_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + for line in reader: + # self.cat2label[line[0]] = line[1] + classes_names.append(line[1]) + index_list.append(line[0]) + index_mapping = {index: i for i, index in enumerate(index_list)} + return classes_names, index_mapping + + def _parse_img_level_ann(self, + img_level_ann_file: str) -> Dict[str, List[dict]]: + """Parse image level annotations from csv style ann_file. + + Args: + img_level_ann_file (str): CSV style image level annotation + file path. + + Returns: + Dict[str, List[dict]]: Annotations where item of the defaultdict + indicates an image, each of which has (n) dicts. + Keys of dicts are: + + - `image_level_label` (int): Label id. + - `confidence` (float): Labels that are human-verified to be + present in an image have confidence = 1 (positive labels). + Labels that are human-verified to be absent from an image + have confidence = 0 (negative labels). Machine-generated + labels have fractional confidences, generally >= 0.5. + The higher the confidence, the smaller the chance for + the label to be a false positive. + """ + + item_lists = defaultdict(list) + with get_local_path( + img_level_ann_file, + backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + for i, line in enumerate(reader): + if i == 0: + continue + img_id = line[0] + item_lists[img_id].append( + dict( + image_level_label=int( + self.label_id_mapping[line[2]]), + confidence=float(line[3]))) + return item_lists + + def _get_relation_matrix(self, hierarchy_file: str) -> np.ndarray: + """Get the matrix of class hierarchy from the hierarchy file. 
Hierarchy + for 600 classes can be found at https://storage.googleapis.com/openimag + es/2018_04/bbox_labels_600_hierarchy_visualizer/circle.html. + + Args: + hierarchy_file (str): File path to the hierarchy for classes. + + Returns: + np.ndarray: The matrix of the corresponding relationship between + the parent class and the child class, of shape + (class_num, class_num). + """ # noqa + + hierarchy = load( + hierarchy_file, file_format='json', backend_args=self.backend_args) + class_num = len(self._metainfo['classes']) + relation_matrix = np.eye(class_num, class_num) + relation_matrix = self._convert_hierarchy_tree(hierarchy, + relation_matrix) + return relation_matrix + + def _convert_hierarchy_tree(self, + hierarchy_map: dict, + relation_matrix: np.ndarray, + parents: list = [], + get_all_parents: bool = True) -> np.ndarray: + """Get matrix of the corresponding relationship between the parent + class and the child class. + + Args: + hierarchy_map (dict): Including label name and corresponding + subcategory. Keys of dicts are: + + - `LabeName` (str): Name of the label. + - `Subcategory` (dict | list): Corresponding subcategory(ies). + relation_matrix (ndarray): The matrix of the corresponding + relationship between the parent class and the child class, + of shape (class_num, class_num). + parents (list): Corresponding parent class. + get_all_parents (bool): Whether get all parent names. + Default: True + + Returns: + ndarray: The matrix of the corresponding relationship between + the parent class and the child class, of shape + (class_num, class_num). + """ + + if 'Subcategory' in hierarchy_map: + for node in hierarchy_map['Subcategory']: + if 'LabelName' in node: + children_name = node['LabelName'] + children_index = self.label_id_mapping[children_name] + children = [children_index] + else: + continue + if len(parents) > 0: + for parent_index in parents: + if get_all_parents: + children.append(parent_index) + relation_matrix[children_index, parent_index] = 1 + relation_matrix = self._convert_hierarchy_tree( + node, relation_matrix, parents=children) + return relation_matrix + + def _join_prefix(self): + """Join ``self.data_root`` with annotation path.""" + super()._join_prefix() + if not is_abs(self.label_file) and self.label_file: + self.label_file = osp.join(self.data_root, self.label_file) + if not is_abs(self.meta_file) and self.meta_file: + self.meta_file = osp.join(self.data_root, self.meta_file) + if not is_abs(self.hierarchy_file) and self.hierarchy_file: + self.hierarchy_file = osp.join(self.data_root, self.hierarchy_file) + if self.image_level_ann_file and not is_abs(self.image_level_ann_file): + self.image_level_ann_file = osp.join(self.data_root, + self.image_level_ann_file) + + +@DATASETS.register_module() +class OpenImagesChallengeDataset(OpenImagesDataset): + """Open Images Challenge dataset for detection. + + Args: + ann_file (str): Open Images Challenge box annotation in txt format. + """ + + METAINFO: dict = dict(dataset_type='oid_challenge') + + def __init__(self, ann_file: str, **kwargs) -> None: + if not ann_file.endswith('txt'): + raise TypeError('The annotation file of Open Images Challenge ' + 'should be a txt file.') + + super().__init__(ann_file=ann_file, **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. 
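_convert_hierarchy_tree above starts from an identity matrix and, while walking the Subcategory tree, writes relation_matrix[child, parent] = 1 for every ancestor of each class, which is what OpenImagesMetric later reads from the dataset meta. A toy illustration with an invented three-class hierarchy (unlike the real Open Images hierarchy, the toy root is itself a class):

    import numpy as np

    # Invented toy hierarchy: vehicle -> car -> taxi.
    label_id_mapping = {'vehicle': 0, 'car': 1, 'taxi': 2}
    hierarchy = {'LabelName': 'vehicle',
                 'Subcategory': [{'LabelName': 'car',
                                  'Subcategory': [{'LabelName': 'taxi'}]}]}

    relation = np.eye(3)

    def walk(node, parents=()):
        for child in node.get('Subcategory', []):
            idx = label_id_mapping[child['LabelName']]
            for parent in parents:
                relation[idx, parent] = 1    # mark every ancestor of this class
            walk(child, parents + (idx,))

    walk(hierarchy, parents=(label_id_mapping[hierarchy['LabelName']],))
    print(relation[2])  # taxi row covers vehicle, car and taxi -> [1. 1. 1.]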
+ """ + classes_names, label_id_mapping = self._parse_label_file( + self.label_file) + self._metainfo['classes'] = classes_names + self.label_id_mapping = label_id_mapping + + if self.image_level_ann_file is not None: + img_level_anns = self._parse_img_level_ann( + self.image_level_ann_file) + else: + img_level_anns = None + + # OpenImagesMetric can get the relation matrix from the dataset meta + relation_matrix = self._get_relation_matrix(self.hierarchy_file) + self._metainfo['RELATION_MATRIX'] = relation_matrix + + data_list = [] + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + lines = f.readlines() + i = 0 + while i < len(lines): + instances = [] + filename = lines[i].rstrip() + i += 2 + img_gt_size = int(lines[i]) + i += 1 + for j in range(img_gt_size): + sp = lines[i + j].split() + instances.append( + dict( + bbox=[ + float(sp[1]), + float(sp[2]), + float(sp[3]), + float(sp[4]) + ], + bbox_label=int(sp[0]) - 1, # labels begin from 1 + ignore_flag=0, + is_group_ofs=True if int(sp[5]) == 1 else False)) + i += img_gt_size + data_list.append( + dict( + img_path=osp.join(self.data_prefix['img'], filename), + instances=instances, + )) + + # add image metas to data list + img_metas = load( + self.meta_file, file_format='pkl', backend_args=self.backend_args) + assert len(img_metas) == len(data_list) + for i, meta in enumerate(img_metas): + img_id = osp.split(data_list[i]['img_path'])[-1][:-4] + assert img_id == osp.split(meta['filename'])[-1][:-4] + h, w = meta['ori_shape'][:2] + data_list[i]['height'] = h + data_list[i]['width'] = w + data_list[i]['img_id'] = img_id + # denormalize bboxes + for j in range(len(data_list[i]['instances'])): + data_list[i]['instances'][j]['bbox'][0] *= w + data_list[i]['instances'][j]['bbox'][2] *= w + data_list[i]['instances'][j]['bbox'][1] *= h + data_list[i]['instances'][j]['bbox'][3] *= h + # add image-level annotation + if img_level_anns is not None: + img_labels = [] + confidences = [] + img_ann_list = img_level_anns.get(img_id, []) + for ann in img_ann_list: + img_labels.append(int(ann['image_level_label'])) + confidences.append(float(ann['confidence'])) + data_list[i]['image_level_labels'] = np.array( + img_labels, dtype=np.int64) + data_list[i]['confidences'] = np.array( + confidences, dtype=np.float32) + return data_list + + def _parse_label_file(self, label_file: str) -> tuple: + """Get classes name and index mapping from cls-label-description file. + + Args: + label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + + Returns: + tuple: Class name of OpenImages. + """ + label_list = [] + id_list = [] + index_mapping = {} + with get_local_path( + label_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + for line in reader: + label_name = line[0] + label_id = int(line[2]) + label_list.append(line[1]) + id_list.append(label_id) + index_mapping[label_name] = label_id - 1 + indexes = np.argsort(id_list) + classes_names = [] + for index in indexes: + classes_names.append(label_list[index]) + return classes_names, index_mapping + + def _parse_img_level_ann(self, image_level_ann_file): + """Parse image level annotations from csv style ann_file. + + Args: + image_level_ann_file (str): CSV style image level annotation + file path. 
+ + Returns: + defaultdict[list[dict]]: Annotations where item of the defaultdict + indicates an image, each of which has (n) dicts. + Keys of dicts are: + + - `image_level_label` (int): of shape 1. + - `confidence` (float): of shape 1. + """ + + item_lists = defaultdict(list) + with get_local_path( + image_level_ann_file, + backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + i = -1 + for line in reader: + i += 1 + if i == 0: + continue + else: + img_id = line[0] + label_id = line[1] + assert label_id in self.label_id_mapping + image_level_label = int( + self.label_id_mapping[label_id]) + confidence = float(line[2]) + item_lists[img_id].append( + dict( + image_level_label=image_level_label, + confidence=confidence)) + return item_lists + + def _get_relation_matrix(self, hierarchy_file: str) -> np.ndarray: + """Get the matrix of class hierarchy from the hierarchy file. + + Args: + hierarchy_file (str): File path to the hierarchy for classes. + + Returns: + np.ndarray: The matrix of the corresponding + relationship between the parent class and the child class, + of shape (class_num, class_num). + """ + with get_local_path( + hierarchy_file, backend_args=self.backend_args) as local_path: + class_label_tree = np.load(local_path, allow_pickle=True) + return class_label_tree[1:, 1:] diff --git a/mmdetection/mmdet/datasets/recycle.py b/mmdetection/mmdet/datasets/recycle.py new file mode 100644 index 0000000..eb8a07f --- /dev/null +++ b/mmdetection/mmdet/datasets/recycle.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from typing import List, Union + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .api_wrappers import COCO +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class RecycleDataset(BaseDetDataset): + """Dataset for COCO.""" + + METAINFO = { + 'classes': + ('General trash', 'Paper', 'Paper pack', 'Metal', 'Glass', + 'Plastic', 'Styrofoam', 'Plastic bag', 'Battery', 'Clothing',), + # palette is a list of color tuples, which is used for visualization. + 'palette': + [(220, 20, 60), (119, 11, 32), (0, 0, 230), (106, 0, 228), (60, 20, 220), + (0, 80, 100), (0, 0, 70), (50, 0, 192), (250, 170, 30), (255, 0, 0)] + } + COCOAPI = COCO + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. 
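RecycleDataset above registers the project's ten trash classes and a matching palette on top of the standard COCO-style loader. A hypothetical config fragment showing how it could be wired into a dataloader; every path below is a placeholder and the train pipeline is omitted:

    # Hypothetical usage sketch; paths and dataloader settings are placeholders.
    train_dataloader = dict(
        batch_size=4,
        num_workers=2,
        dataset=dict(
            type='RecycleDataset',
            data_root='dataset/',       # placeholder
            ann_file='train.json',      # placeholder
            data_prefix=dict(img=''),   # joined with file_name in parse_data_info
            filter_cfg=dict(filter_empty_gt=True, min_size=32),
            pipeline=[]))               # fill in the usual train pipeline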
+ """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + + data_info = {} + + # TODO: need to change data_prefix['img'] to data_prefix['img_path'] + img_path = osp.join(self.data_prefix['img'], img_info['file_name']) + if self.data_prefix.get('seg', None): + seg_map_path = osp.join( + self.data_prefix['seg'], + img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) + else: + seg_map_path = None + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['seg_map_path'] = seg_map_path + data_info['height'] = img_info['height'] + data_info['width'] = img_info['width'] + + if self.return_classes: + data_info['text'] = self.metainfo['classes'] + data_info['custom_entities'] = True + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + + if ann.get('segmentation', None): + instance['mask'] = ann['segmentation'] + + instances.append(instance) + data_info['instances'] = instances + return data_info + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. 
+ """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and img_id not in ids_in_cat: + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos diff --git a/mmdetection/mmdet/datasets/refcoco.py b/mmdetection/mmdet/datasets/refcoco.py new file mode 100644 index 0000000..0dae75f --- /dev/null +++ b/mmdetection/mmdet/datasets/refcoco.py @@ -0,0 +1,163 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections +import os.path as osp +import random +from typing import Dict, List + +import mmengine +from mmengine.dataset import BaseDataset + +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class RefCocoDataset(BaseDataset): + """RefCOCO dataset. + + The `Refcoco` and `Refcoco+` dataset is based on + `ReferItGame: Referring to Objects in Photographs of Natural Scenes + `_. + + The `Refcocog` dataset is based on + `Generation and Comprehension of Unambiguous Object Descriptions + `_. + + Args: + ann_file (str): Annotation file path. + data_root (str): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to ''. + data_prefix (str): Prefix for training data. + split_file (str): Split file path. + split (str): Split name. Defaults to 'train'. + text_mode (str): Text mode. Defaults to 'random'. + **kwargs: Other keyword arguments in :class:`BaseDataset`. 
+ """ + + def __init__(self, + data_root: str, + ann_file: str, + split_file: str, + data_prefix: Dict, + split: str = 'train', + text_mode: str = 'random', + **kwargs): + self.split_file = split_file + self.split = split + + assert text_mode in ['original', 'random', 'concat', 'select_first'] + self.text_mode = text_mode + super().__init__( + data_root=data_root, + data_prefix=data_prefix, + ann_file=ann_file, + **kwargs, + ) + + def _join_prefix(self): + if not mmengine.is_abs(self.split_file) and self.split_file: + self.split_file = osp.join(self.data_root, self.split_file) + + return super()._join_prefix() + + def _init_refs(self): + """Initialize the refs for RefCOCO.""" + anns, imgs = {}, {} + for ann in self.instances['annotations']: + anns[ann['id']] = ann + for img in self.instances['images']: + imgs[img['id']] = img + + refs, ref_to_ann = {}, {} + for ref in self.splits: + # ids + ref_id = ref['ref_id'] + ann_id = ref['ann_id'] + # add mapping related to ref + refs[ref_id] = ref + ref_to_ann[ref_id] = anns[ann_id] + + self.refs = refs + self.ref_to_ann = ref_to_ann + + def load_data_list(self) -> List[dict]: + """Load data list.""" + self.splits = mmengine.load(self.split_file, file_format='pkl') + self.instances = mmengine.load(self.ann_file, file_format='json') + self._init_refs() + img_prefix = self.data_prefix['img_path'] + + ref_ids = [ + ref['ref_id'] for ref in self.splits if ref['split'] == self.split + ] + full_anno = [] + for ref_id in ref_ids: + ref = self.refs[ref_id] + ann = self.ref_to_ann[ref_id] + ann.update(ref) + full_anno.append(ann) + + image_id_list = [] + final_anno = {} + for anno in full_anno: + image_id_list.append(anno['image_id']) + final_anno[anno['ann_id']] = anno + annotations = [value for key, value in final_anno.items()] + + coco_train_id = [] + image_annot = {} + for i in range(len(self.instances['images'])): + coco_train_id.append(self.instances['images'][i]['id']) + image_annot[self.instances['images'][i] + ['id']] = self.instances['images'][i] + + images = [] + for image_id in list(set(image_id_list)): + images += [image_annot[image_id]] + + data_list = [] + + grounding_dict = collections.defaultdict(list) + for anno in annotations: + image_id = int(anno['image_id']) + grounding_dict[image_id].append(anno) + + join_path = mmengine.fileio.get_file_backend(img_prefix).join_path + for image in images: + img_id = image['id'] + instances = [] + sentences = [] + for grounding_anno in grounding_dict[img_id]: + texts = [x['raw'].lower() for x in grounding_anno['sentences']] + # random select one text + if self.text_mode == 'random': + idx = random.randint(0, len(texts) - 1) + text = [texts[idx]] + # concat all texts + elif self.text_mode == 'concat': + text = [''.join(texts)] + # select the first text + elif self.text_mode == 'select_first': + text = [texts[0]] + # use all texts + elif self.text_mode == 'original': + text = texts + else: + raise ValueError(f'Invalid text mode "{self.text_mode}".') + ins = [{ + 'mask': grounding_anno['segmentation'], + 'ignore_flag': 0 + }] * len(text) + instances.extend(ins) + sentences.extend(text) + data_info = { + 'img_path': join_path(img_prefix, image['file_name']), + 'img_id': img_id, + 'instances': instances, + 'text': sentences + } + data_list.append(data_info) + + if len(data_list) == 0: + raise ValueError(f'No sample in split "{self.split}".') + + return data_list diff --git a/mmdetection/mmdet/datasets/reid_dataset.py b/mmdetection/mmdet/datasets/reid_dataset.py new file mode 100644 index 0000000..1eed3ee 
--- /dev/null +++ b/mmdetection/mmdet/datasets/reid_dataset.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from collections import defaultdict +from typing import Any, Dict, List + +import numpy as np +from mmengine.dataset import BaseDataset +from mmengine.utils import check_file_exist + +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class ReIDDataset(BaseDataset): + """Dataset for ReID. + + Args: + triplet_sampler (dict, optional): The sampler for hard mining + triplet loss. Defaults to None. + keys: num_ids (int): The number of person ids. + ins_per_id (int): The number of image for each person. + """ + + def __init__(self, triplet_sampler: dict = None, *args, **kwargs): + self.triplet_sampler = triplet_sampler + super().__init__(*args, **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ''self.ann_file''. + + Returns: + list[dict]: A list of annotation. + """ + assert isinstance(self.ann_file, str) + check_file_exist(self.ann_file) + data_list = [] + with open(self.ann_file) as f: + samples = [x.strip().split(' ') for x in f.readlines()] + for filename, gt_label in samples: + info = dict(img_prefix=self.data_prefix) + if self.data_prefix['img_path'] is not None: + info['img_path'] = osp.join(self.data_prefix['img_path'], + filename) + else: + info['img_path'] = filename + info['gt_label'] = np.array(gt_label, dtype=np.int64) + data_list.append(info) + self._parse_ann_info(data_list) + return data_list + + def _parse_ann_info(self, data_list: List[dict]): + """Parse person id annotations.""" + index_tmp_dic = defaultdict(list) # pid->[idx1,...,idxN] + self.index_dic = dict() # pid->array([idx1,...,idxN]) + for idx, info in enumerate(data_list): + pid = info['gt_label'] + index_tmp_dic[int(pid)].append(idx) + for pid, idxs in index_tmp_dic.items(): + self.index_dic[pid] = np.asarray(idxs, dtype=np.int64) + self.pids = np.asarray(list(self.index_dic.keys()), dtype=np.int64) + + def prepare_data(self, idx: int) -> Any: + """Get data processed by ''self.pipeline''. + + Args: + idx (int): The index of ''data_info'' + + Returns: + Any: Depends on ''self.pipeline'' + """ + data_info = self.get_data_info(idx) + if self.triplet_sampler is not None: + img_info = self.triplet_sampling(data_info['gt_label'], + **self.triplet_sampler) + data_info = copy.deepcopy(img_info) # triplet -> list + else: + data_info = copy.deepcopy(data_info) # no triplet -> dict + return self.pipeline(data_info) + + def triplet_sampling(self, + pos_pid, + num_ids: int = 8, + ins_per_id: int = 4) -> Dict: + """Triplet sampler for hard mining triplet loss. First, for one + pos_pid, random sample ins_per_id images with same person id. + + Then, random sample num_ids - 1 images for each negative id. + Finally, random sample ins_per_id images for each negative id. + + Args: + pos_pid (ndarray): The person id of the anchor. + num_ids (int): The number of person ids. + ins_per_id (int): The number of images for each person. + + Returns: + Dict: Annotation information of num_ids X ins_per_id images. + """ + assert len(self.pids) >= num_ids, \ + 'The number of person ids in the training set must ' \ + 'be greater than the number of person ids in the sample.' 
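triplet_sampling above assembles num_ids x ins_per_id images per anchor: ins_per_id positives sharing pos_pid plus ins_per_id images for each of num_ids - 1 sampled negative ids, using the pid-to-indices mapping built in _parse_ann_info. A dataset-free sketch of just the index selection performed below, with a toy mapping in place of a real annotation file:

    import numpy as np

    # Toy pid -> dataset-index mapping standing in for self.index_dic.
    index_dic = {0: np.array([0, 1, 2]),
                 1: np.array([3, 4]),
                 2: np.array([5, 6, 7])}
    pids = np.array(list(index_dic))

    def sample_triplet_indices(pos_pid, num_ids=2, ins_per_id=4):
        # positives: ins_per_id images of the anchor identity
        idxs = list(index_dic[pos_pid][np.random.choice(
            index_dic[pos_pid].shape[0], ins_per_id, replace=True)])
        # negatives: ins_per_id images for each of num_ids - 1 other identities
        neg_pids = np.random.choice([p for p in pids if p != pos_pid],
                                    num_ids - 1, replace=False)
        for neg_pid in neg_pids:
            idxs.extend(index_dic[neg_pid][np.random.choice(
                index_dic[neg_pid].shape[0], ins_per_id, replace=True)])
        return idxs  # num_ids * ins_per_id indices into the data list

    print(len(sample_triplet_indices(pos_pid=0)))  # 8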
+ + pos_idxs = self.index_dic[int( + pos_pid)] # all positive idxs for pos_pid + idxs_list = [] + # select positive samplers + idxs_list.extend(pos_idxs[np.random.choice( + pos_idxs.shape[0], ins_per_id, replace=True)]) + # select negative ids + neg_pids = np.random.choice( + [i for i, _ in enumerate(self.pids) if i != pos_pid], + num_ids - 1, + replace=False) + # select negative samplers for each negative id + for neg_pid in neg_pids: + neg_idxs = self.index_dic[neg_pid] + idxs_list.extend(neg_idxs[np.random.choice( + neg_idxs.shape[0], ins_per_id, replace=True)]) + # return the final triplet batch + triplet_img_infos = [] + for idx in idxs_list: + triplet_img_infos.append(copy.deepcopy(self.get_data_info(idx))) + # Collect data_list scatters (list of dict -> dict of list) + out = dict() + for key in triplet_img_infos[0].keys(): + out[key] = [_info[key] for _info in triplet_img_infos] + return out diff --git a/mmdetection/mmdet/datasets/samplers/__init__.py b/mmdetection/mmdet/datasets/samplers/__init__.py new file mode 100644 index 0000000..a942ff2 --- /dev/null +++ b/mmdetection/mmdet/datasets/samplers/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .batch_sampler import (AspectRatioBatchSampler, + MultiDataAspectRatioBatchSampler, + TrackAspectRatioBatchSampler) +from .class_aware_sampler import ClassAwareSampler +from .multi_data_sampler import MultiDataSampler +from .multi_source_sampler import GroupMultiSourceSampler, MultiSourceSampler +from .track_img_sampler import TrackImgSampler + +__all__ = [ + 'ClassAwareSampler', 'AspectRatioBatchSampler', 'MultiSourceSampler', + 'GroupMultiSourceSampler', 'TrackImgSampler', + 'TrackAspectRatioBatchSampler', 'MultiDataSampler', + 'MultiDataAspectRatioBatchSampler' +] diff --git a/mmdetection/mmdet/datasets/samplers/batch_sampler.py b/mmdetection/mmdet/datasets/samplers/batch_sampler.py new file mode 100644 index 0000000..c17789c --- /dev/null +++ b/mmdetection/mmdet/datasets/samplers/batch_sampler.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +from torch.utils.data import BatchSampler, Sampler + +from mmdet.datasets.samplers.track_img_sampler import TrackImgSampler +from mmdet.registry import DATA_SAMPLERS + + +# TODO: maybe replace with a data_loader wrapper +@DATA_SAMPLERS.register_module() +class AspectRatioBatchSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio (< 1 or. + + >= 1) into a same batch. + + Args: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. 
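AspectRatioBatchSampler's __iter__ just below keeps two buckets, w < h and w >= h, yields a bucket as a batch once it reaches batch_size, and finally flushes whatever is left unless drop_last is set. A dataset-free sketch of that grouping over a list of invented (width, height) pairs standing in for data_info['width'] and data_info['height']:

    # Invented image sizes; list indices play the role of sampler indices.
    sizes = [(640, 480), (480, 640), (800, 600), (600, 800), (1024, 768)]
    batch_size = 2
    buckets = [[], []]          # 0: w < h, 1: w >= h
    batches = []
    for idx, (w, h) in enumerate(sizes):
        bucket = buckets[0 if w < h else 1]
        bucket.append(idx)
        if len(bucket) == batch_size:
            batches.append(bucket[:])
            del bucket[:]
    leftover = buckets[0] + buckets[1]
    if leftover:                # drop_last=False behaviour
        batches.append(leftover)
    print(batches)              # [[0, 2], [1, 3], [4]]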
+ """ + + def __init__(self, + sampler: Sampler, + batch_size: int, + drop_last: bool = False) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError('batch_size should be a positive integer value, ' + f'but got batch_size={batch_size}') + self.sampler = sampler + self.batch_size = batch_size + self.drop_last = drop_last + # two groups for w < h and w >= h + self._aspect_ratio_buckets = [[] for _ in range(2)] + + def __iter__(self) -> Sequence[int]: + for idx in self.sampler: + data_info = self.sampler.dataset.get_data_info(idx) + width, height = data_info['width'], data_info['height'] + bucket_id = 0 if width < height else 1 + bucket = self._aspect_ratio_buckets[bucket_id] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] + + # yield the rest data and reset the bucket + left_data = self._aspect_ratio_buckets[0] + self._aspect_ratio_buckets[ + 1] + self._aspect_ratio_buckets = [[] for _ in range(2)] + while len(left_data) > 0: + if len(left_data) <= self.batch_size: + if not self.drop_last: + yield left_data[:] + left_data = [] + else: + yield left_data[:self.batch_size] + left_data = left_data[self.batch_size:] + + def __len__(self) -> int: + if self.drop_last: + return len(self.sampler) // self.batch_size + else: + return (len(self.sampler) + self.batch_size - 1) // self.batch_size + + +@DATA_SAMPLERS.register_module() +class TrackAspectRatioBatchSampler(AspectRatioBatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio (< 1 or. + + >= 1) into a same batch. + + Args: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + """ + + def __iter__(self) -> Sequence[int]: + for idx in self.sampler: + # hard code to solve TrackImgSampler + if isinstance(self.sampler, TrackImgSampler): + video_idx, _ = idx + else: + video_idx = idx + # video_idx + data_info = self.sampler.dataset.get_data_info(video_idx) + # data_info {video_id, images, video_length} + img_data_info = data_info['images'][0] + width, height = img_data_info['width'], img_data_info['height'] + bucket_id = 0 if width < height else 1 + bucket = self._aspect_ratio_buckets[bucket_id] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] + + # yield the rest data and reset the bucket + left_data = self._aspect_ratio_buckets[0] + self._aspect_ratio_buckets[ + 1] + self._aspect_ratio_buckets = [[] for _ in range(2)] + while len(left_data) > 0: + if len(left_data) <= self.batch_size: + if not self.drop_last: + yield left_data[:] + left_data = [] + else: + yield left_data[:self.batch_size] + left_data = left_data[self.batch_size:] + + +@DATA_SAMPLERS.register_module() +class MultiDataAspectRatioBatchSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio (< 1 or. + + >= 1) into a same batch for multi-source datasets. + + Args: + sampler (Sampler): Base sampler. + batch_size (Sequence(int)): Size of mini-batch for multi-source + datasets. + num_datasets(int): Number of multi-source datasets. 
+ drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + """ + + def __init__(self, + sampler: Sampler, + batch_size: Sequence[int], + num_datasets: int, + drop_last: bool = True) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + self.sampler = sampler + self.batch_size = batch_size + self.num_datasets = num_datasets + self.drop_last = drop_last + # two groups for w < h and w >= h for each dataset --> 2 * num_datasets + self._buckets = [[] for _ in range(2 * self.num_datasets)] + + def __iter__(self) -> Sequence[int]: + for idx in self.sampler: + data_info = self.sampler.dataset.get_data_info(idx) + width, height = data_info['width'], data_info['height'] + dataset_source_idx = self.sampler.dataset.get_dataset_source(idx) + aspect_ratio_bucket_id = 0 if width < height else 1 + bucket_id = dataset_source_idx * 2 + aspect_ratio_bucket_id + bucket = self._buckets[bucket_id] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size[dataset_source_idx]: + yield bucket[:] + del bucket[:] + + # yield the rest data and reset the bucket + for i in range(self.num_datasets): + left_data = self._buckets[i * 2 + 0] + self._buckets[i * 2 + 1] + while len(left_data) > 0: + if len(left_data) <= self.batch_size[i]: + if not self.drop_last: + yield left_data[:] + left_data = [] + else: + yield left_data[:self.batch_size[i]] + left_data = left_data[self.batch_size[i]:] + + self._buckets = [[] for _ in range(2 * self.num_datasets)] + + def __len__(self) -> int: + sizes = [0 for _ in range(self.num_datasets)] + for idx in self.sampler: + dataset_source_idx = self.sampler.dataset.get_dataset_source(idx) + sizes[dataset_source_idx] += 1 + + if self.drop_last: + lens = 0 + for i in range(self.num_datasets): + lens += sizes[i] // self.batch_size[i] + return lens + else: + lens = 0 + for i in range(self.num_datasets): + lens += (sizes[i] + self.batch_size[i] - + 1) // self.batch_size[i] + return lens diff --git a/mmdetection/mmdet/datasets/samplers/class_aware_sampler.py b/mmdetection/mmdet/datasets/samplers/class_aware_sampler.py new file mode 100644 index 0000000..6ca2f9b --- /dev/null +++ b/mmdetection/mmdet/datasets/samplers/class_aware_sampler.py @@ -0,0 +1,192 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Dict, Iterator, Optional, Union + +import numpy as np +import torch +from mmengine.dataset import BaseDataset +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmdet.registry import DATA_SAMPLERS + + +@DATA_SAMPLERS.register_module() +class ClassAwareSampler(Sampler): + r"""Sampler that restricts data loading to the label of the dataset. + + A class-aware sampling strategy to effectively tackle the + non-uniform class distribution. The length of the training data is + consistent with source data. Simple improvements based on `Relay + Backpropagation for Effective Learning of Deep Convolutional + Neural Networks `_ + + The implementation logic is referred to + https://github.com/Sense-X/TSD/blob/master/mmdet/datasets/samplers/distributed_classaware_sampler.py + + Args: + dataset: Dataset used for sampling. + seed (int, optional): random seed used to shuffle the sampler. + This number should be identical across all + processes in the distributed group. Defaults to None. 
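``MultiDataAspectRatioBatchSampler`` above extends the same idea to multiple source datasets: each sample lands in bucket ``dataset_source_idx * 2 + (0 if w < h else 1)`` and each dataset has its own batch size, so ``__len__`` is a per-dataset floor or ceiling division summed over sources. A quick arithmetic check with assumed toy numbers:

import math

# Assumed toy numbers: two source datasets (103 and 50 samples) with per-source
# batch sizes [4, 2], matching the Sequence[int] batch_size above.
sizes, batch_size = [103, 50], [4, 2]

batches_drop_last = sum(s // b for s, b in zip(sizes, batch_size))            # 25 + 25 = 50
batches_keep_last = sum(math.ceil(s / b) for s, b in zip(sizes, batch_size))  # 26 + 25 = 51
print(batches_drop_last, batches_keep_last)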
+ num_sample_class (int): The number of samples taken from each + per-label list. Defaults to 1. + """ + + def __init__(self, + dataset: BaseDataset, + seed: Optional[int] = None, + num_sample_class: int = 1) -> None: + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.epoch = 0 + # Must be the same across all workers. If None, will use a + # random seed shared among workers + # (require synchronization among all workers) + if seed is None: + seed = sync_random_seed() + self.seed = seed + + # The number of samples taken from each per-label list + assert num_sample_class > 0 and isinstance(num_sample_class, int) + self.num_sample_class = num_sample_class + # Get per-label image list from dataset + self.cat_dict = self.get_cat2imgs() + + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / world_size)) + self.total_size = self.num_samples * self.world_size + + # get number of images containing each category + self.num_cat_imgs = [len(x) for x in self.cat_dict.values()] + # filter labels without images + self.valid_cat_inds = [ + i for i, length in enumerate(self.num_cat_imgs) if length != 0 + ] + self.num_classes = len(self.valid_cat_inds) + + def get_cat2imgs(self) -> Dict[int, list]: + """Get a dict with class as key and img_ids as values. + + Returns: + dict[int, list]: A dict of per-label image list, + the item of the dict indicates a label index, + corresponds to the image index that contains the label. + """ + classes = self.dataset.metainfo.get('classes', None) + if classes is None: + raise ValueError('dataset metainfo must contain `classes`') + # sort the label index + cat2imgs = {i: [] for i in range(len(classes))} + for i in range(len(self.dataset)): + cat_ids = set(self.dataset.get_cat_ids(i)) + for cat in cat_ids: + cat2imgs[cat].append(i) + return cat2imgs + + def __iter__(self) -> Iterator[int]: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + # initialize label list + label_iter_list = RandomCycleIter(self.valid_cat_inds, generator=g) + # initialize each per-label image list + data_iter_dict = dict() + for i in self.valid_cat_inds: + data_iter_dict[i] = RandomCycleIter(self.cat_dict[i], generator=g) + + def gen_cat_img_inds(cls_list, data_dict, num_sample_cls): + """Traverse the categories and extract `num_sample_cls` image + indexes of the corresponding categories one by one.""" + id_indices = [] + for _ in range(len(cls_list)): + cls_idx = next(cls_list) + for _ in range(num_sample_cls): + id = next(data_dict[cls_idx]) + id_indices.append(id) + return id_indices + + # deterministically shuffle based on epoch + num_bins = int( + math.ceil(self.total_size * 1.0 / self.num_classes / + self.num_sample_class)) + indices = [] + for i in range(num_bins): + indices += gen_cat_img_inds(label_iter_list, data_iter_dict, + self.num_sample_class) + + # fix extra samples to make it evenly divisible + if len(indices) >= self.total_size: + indices = indices[:self.total_size] + else: + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self) -> int: + """The number of samples in this rank.""" + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + """Sets the epoch for this sampler. 
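``ClassAwareSampler`` above builds a class-to-images table and then alternates over the non-empty classes, drawing ``num_sample_class`` images from each class's cycling list, which is what re-balances rare classes. A simplified, self-contained sketch (using ``itertools.cycle`` instead of ``RandomCycleIter``, so unlike the real sampler it does not reshuffle between passes):

import itertools
import random

def class_aware_indices(cat2imgs, total_size, num_sample_class=1, seed=0):
    # Cycle over the (shuffled) valid classes and pull `num_sample_class`
    # image ids from that class's cycling list each time.
    rng = random.Random(seed)
    valid = [c for c, imgs in cat2imgs.items() if imgs]
    rng.shuffle(valid)
    class_cycle = itertools.cycle(valid)
    img_cycles = {c: itertools.cycle(rng.sample(cat2imgs[c], len(cat2imgs[c])))
                  for c in valid}
    indices = []
    while len(indices) < total_size:
        c = next(class_cycle)
        indices.extend(next(img_cycles[c]) for _ in range(num_sample_class))
    return indices[:total_size]

cat2imgs = {0: [0, 1, 2, 3, 4, 5], 1: [6], 2: []}  # class 1 is rare, class 2 is empty
print(class_aware_indices(cat2imgs, total_size=8))
# image 6 (the only sample of class 1) appears 4 times despite the imbalance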
+ + When :attr:`shuffle=True`, this ensures all replicas use a different + random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch + + +class RandomCycleIter: + """Shuffle the list and do it again after the list have traversed. + + The implementation logic is referred to + https://github.com/wutong16/DistributionBalancedLoss/blob/master/mllt/datasets/loader/sampler.py + + Example: + >>> label_list = [0, 1, 2, 4, 5] + >>> g = torch.Generator() + >>> g.manual_seed(0) + >>> label_iter_list = RandomCycleIter(label_list, generator=g) + >>> index = next(label_iter_list) + Args: + data (list or ndarray): The data that needs to be shuffled. + generator: An torch.Generator object, which is used in setting the seed + for generating random numbers. + """ # noqa: W605 + + def __init__(self, + data: Union[list, np.ndarray], + generator: torch.Generator = None) -> None: + self.data = data + self.length = len(data) + self.index = torch.randperm(self.length, generator=generator).numpy() + self.i = 0 + self.generator = generator + + def __iter__(self) -> Iterator: + return self + + def __len__(self) -> int: + return len(self.data) + + def __next__(self): + if self.i == self.length: + self.index = torch.randperm( + self.length, generator=self.generator).numpy() + self.i = 0 + idx = self.data[self.index[self.i]] + self.i += 1 + return idx diff --git a/mmdetection/mmdet/datasets/samplers/multi_data_sampler.py b/mmdetection/mmdet/datasets/samplers/multi_data_sampler.py new file mode 100644 index 0000000..c3a4b60 --- /dev/null +++ b/mmdetection/mmdet/datasets/samplers/multi_data_sampler.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Iterator, Optional, Sequence, Sized + +import torch +from mmengine.dist import get_dist_info, sync_random_seed +from mmengine.registry import DATA_SAMPLERS +from torch.utils.data import Sampler + + +@DATA_SAMPLERS.register_module() +class MultiDataSampler(Sampler): + """The default data sampler for both distributed and non-distributed + environment. + + It has several differences from the PyTorch ``DistributedSampler`` as + below: + + 1. This sampler supports non-distributed environment. + + 2. The round up behaviors are a little different. + + - If ``round_up=True``, this sampler will add extra samples to make the + number of samples is evenly divisible by the world size. And + this behavior is the same as the ``DistributedSampler`` with + ``drop_last=False``. + - If ``round_up=False``, this sampler won't remove or add any samples + while the ``DistributedSampler`` with ``drop_last=True`` will remove + tail samples. + + Args: + dataset (Sized): The dataset. + dataset_ratio (Sequence(int)) The ratios of different datasets. + seed (int, optional): Random seed used to shuffle the sampler if + :attr:`shuffle=True`. This number should be identical across all + processes in the distributed group. Defaults to None. + round_up (bool): Whether to add extra samples to make the number of + samples evenly divisible by the world size. Defaults to True. 
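``MultiDataSampler`` (its ``__init__`` continues below) weights every sample by ``max(sizes) / s * r / sum(dataset_ratio)``, so each source dataset receives total probability mass proportional to its ratio regardless of its size. A small check of that property with assumed sizes and ratios:

import torch

# Assumed toy setup: two concatenated datasets with 1000 and 200 samples,
# dataset_ratio = [1, 1], matching the weighting built in __init__ below.
sizes, dataset_ratio = [1000, 200], [1, 1]

weights = torch.cat([
    torch.ones(s) * max(sizes) / s * r / sum(dataset_ratio)
    for r, s in zip(dataset_ratio, sizes)
])
print(weights[:sizes[0]].sum().item(), weights[sizes[0]:].sum().item())
# 500.0 500.0 -> torch.multinomial draws both datasets equally often,
# even though the second one is five times smaller.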
+ """ + + def __init__(self, + dataset: Sized, + dataset_ratio: Sequence[int], + seed: Optional[int] = None, + round_up: bool = True) -> None: + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.dataset_ratio = dataset_ratio + + if seed is None: + seed = sync_random_seed() + self.seed = seed + self.epoch = 0 + self.round_up = round_up + + if self.round_up: + self.num_samples = math.ceil(len(self.dataset) / world_size) + self.total_size = self.num_samples * self.world_size + else: + self.num_samples = math.ceil( + (len(self.dataset) - rank) / world_size) + self.total_size = len(self.dataset) + + self.sizes = [len(dataset) for dataset in self.dataset.datasets] + + dataset_weight = [ + torch.ones(s) * max(self.sizes) / s * r / sum(self.dataset_ratio) + for i, (r, s) in enumerate(zip(self.dataset_ratio, self.sizes)) + ] + self.weights = torch.cat(dataset_weight) + + def __iter__(self) -> Iterator[int]: + """Iterate the indices.""" + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + + indices = torch.multinomial( + self.weights, len(self.weights), generator=g, + replacement=True).tolist() + + # add extra samples to make it evenly divisible + if self.round_up: + indices = ( + indices * + int(self.total_size / len(indices) + 1))[:self.total_size] + + # subsample + indices = indices[self.rank:self.total_size:self.world_size] + + return iter(indices) + + def __len__(self) -> int: + """The number of samples in this rank.""" + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + """Sets the epoch for this sampler. + + When :attr:`shuffle=True`, this ensures all replicas use a different + random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch diff --git a/mmdetection/mmdet/datasets/samplers/multi_source_sampler.py b/mmdetection/mmdet/datasets/samplers/multi_source_sampler.py new file mode 100644 index 0000000..6efcde3 --- /dev/null +++ b/mmdetection/mmdet/datasets/samplers/multi_source_sampler.py @@ -0,0 +1,214 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import itertools +from typing import Iterator, List, Optional, Sized, Union + +import numpy as np +import torch +from mmengine.dataset import BaseDataset +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmdet.registry import DATA_SAMPLERS + + +@DATA_SAMPLERS.register_module() +class MultiSourceSampler(Sampler): + r"""Multi-Source Infinite Sampler. + + According to the sampling ratio, sample data from different + datasets to form batches. + + Args: + dataset (Sized): The dataset. + batch_size (int): Size of mini-batch. + source_ratio (list[int | float]): The sampling ratio of different + source datasets in a mini-batch. + shuffle (bool): Whether shuffle the dataset or not. Defaults to True. + seed (int, optional): Random seed. If None, set a random seed. + Defaults to None. 
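The ``__iter__`` above then draws indices with ``torch.multinomial``, pads them by repetition when ``round_up=True`` so the total is divisible by ``world_size``, and gives each rank an interleaved slice. A toy illustration of the padding and the per-rank slicing (numbers assumed):

import math

# Assumed toy numbers: 10 sampled indices, world_size=4, round_up=True.
indices = list(range(10))
world_size = 4
num_samples = math.ceil(len(indices) / world_size)      # 3 per rank
total_size = num_samples * world_size                   # 12

# pad by repetition, then interleave across ranks exactly as in __iter__ above
indices = (indices * (total_size // len(indices) + 1))[:total_size]
per_rank = [indices[rank:total_size:world_size] for rank in range(world_size)]
print(per_rank)  # [[0, 4, 8], [1, 5, 9], [2, 6, 0], [3, 7, 1]]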
+ + Examples: + >>> dataset_type = 'ConcatDataset' + >>> sub_dataset_type = 'CocoDataset' + >>> data_root = 'data/coco/' + >>> sup_ann = '../coco_semi_annos/instances_train2017.1@10.json' + >>> unsup_ann = '../coco_semi_annos/' \ + >>> 'instances_train2017.1@10-unlabeled.json' + >>> dataset = dict(type=dataset_type, + >>> datasets=[ + >>> dict( + >>> type=sub_dataset_type, + >>> data_root=data_root, + >>> ann_file=sup_ann, + >>> data_prefix=dict(img='train2017/'), + >>> filter_cfg=dict(filter_empty_gt=True, min_size=32), + >>> pipeline=sup_pipeline), + >>> dict( + >>> type=sub_dataset_type, + >>> data_root=data_root, + >>> ann_file=unsup_ann, + >>> data_prefix=dict(img='train2017/'), + >>> filter_cfg=dict(filter_empty_gt=True, min_size=32), + >>> pipeline=unsup_pipeline), + >>> ]) + >>> train_dataloader = dict( + >>> batch_size=5, + >>> num_workers=5, + >>> persistent_workers=True, + >>> sampler=dict(type='MultiSourceSampler', + >>> batch_size=5, source_ratio=[1, 4]), + >>> batch_sampler=None, + >>> dataset=dataset) + """ + + def __init__(self, + dataset: Sized, + batch_size: int, + source_ratio: List[Union[int, float]], + shuffle: bool = True, + seed: Optional[int] = None) -> None: + + assert hasattr(dataset, 'cumulative_sizes'),\ + f'The dataset must be ConcatDataset, but get {dataset}' + assert isinstance(batch_size, int) and batch_size > 0, \ + 'batch_size must be a positive integer value, ' \ + f'but got batch_size={batch_size}' + assert isinstance(source_ratio, list), \ + f'source_ratio must be a list, but got source_ratio={source_ratio}' + assert len(source_ratio) == len(dataset.cumulative_sizes), \ + 'The length of source_ratio must be equal to ' \ + f'the number of datasets, but got source_ratio={source_ratio}' + + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.cumulative_sizes = [0] + dataset.cumulative_sizes + self.batch_size = batch_size + self.source_ratio = source_ratio + + self.num_per_source = [ + int(batch_size * sr / sum(source_ratio)) for sr in source_ratio + ] + self.num_per_source[0] = batch_size - sum(self.num_per_source[1:]) + + assert sum(self.num_per_source) == batch_size, \ + 'The sum of num_per_source must be equal to ' \ + f'batch_size, but get {self.num_per_source}' + + self.seed = sync_random_seed() if seed is None else seed + self.shuffle = shuffle + self.source2inds = { + source: self._indices_of_rank(len(ds)) + for source, ds in enumerate(dataset.datasets) + } + + def _infinite_indices(self, sample_size: int) -> Iterator[int]: + """Infinitely yield a sequence of indices.""" + g = torch.Generator() + g.manual_seed(self.seed) + while True: + if self.shuffle: + yield from torch.randperm(sample_size, generator=g).tolist() + else: + yield from torch.arange(sample_size).tolist() + + def _indices_of_rank(self, sample_size: int) -> Iterator[int]: + """Slice the infinite indices by rank.""" + yield from itertools.islice( + self._infinite_indices(sample_size), self.rank, None, + self.world_size) + + def __iter__(self) -> Iterator[int]: + batch_buffer = [] + while True: + for source, num in enumerate(self.num_per_source): + batch_buffer_per_source = [] + for idx in self.source2inds[source]: + idx += self.cumulative_sizes[source] + batch_buffer_per_source.append(idx) + if len(batch_buffer_per_source) == num: + batch_buffer += batch_buffer_per_source + break + yield from batch_buffer + batch_buffer = [] + + def __len__(self) -> int: + return len(self.dataset) + + def set_epoch(self, epoch: int) -> 
None: + """Not supported in `epoch-based runner.""" + pass + + +@DATA_SAMPLERS.register_module() +class GroupMultiSourceSampler(MultiSourceSampler): + r"""Group Multi-Source Infinite Sampler. + + According to the sampling ratio, sample data from different + datasets but the same group to form batches. + + Args: + dataset (Sized): The dataset. + batch_size (int): Size of mini-batch. + source_ratio (list[int | float]): The sampling ratio of different + source datasets in a mini-batch. + shuffle (bool): Whether shuffle the dataset or not. Defaults to True. + seed (int, optional): Random seed. If None, set a random seed. + Defaults to None. + """ + + def __init__(self, + dataset: BaseDataset, + batch_size: int, + source_ratio: List[Union[int, float]], + shuffle: bool = True, + seed: Optional[int] = None) -> None: + super().__init__( + dataset=dataset, + batch_size=batch_size, + source_ratio=source_ratio, + shuffle=shuffle, + seed=seed) + + self._get_source_group_info() + self.group_source2inds = [{ + source: + self._indices_of_rank(self.group2size_per_source[source][group]) + for source in range(len(dataset.datasets)) + } for group in range(len(self.group_ratio))] + + def _get_source_group_info(self) -> None: + self.group2size_per_source = [{0: 0, 1: 0}, {0: 0, 1: 0}] + self.group2inds_per_source = [{0: [], 1: []}, {0: [], 1: []}] + for source, dataset in enumerate(self.dataset.datasets): + for idx in range(len(dataset)): + data_info = dataset.get_data_info(idx) + width, height = data_info['width'], data_info['height'] + group = 0 if width < height else 1 + self.group2size_per_source[source][group] += 1 + self.group2inds_per_source[source][group].append(idx) + + self.group_sizes = np.zeros(2, dtype=np.int64) + for group2size in self.group2size_per_source: + for group, size in group2size.items(): + self.group_sizes[group] += size + self.group_ratio = self.group_sizes / sum(self.group_sizes) + + def __iter__(self) -> Iterator[int]: + batch_buffer = [] + while True: + group = np.random.choice( + list(range(len(self.group_ratio))), p=self.group_ratio) + for source, num in enumerate(self.num_per_source): + batch_buffer_per_source = [] + for idx in self.group_source2inds[group][source]: + idx = self.group2inds_per_source[source][group][ + idx] + self.cumulative_sizes[source] + batch_buffer_per_source.append(idx) + if len(batch_buffer_per_source) == num: + batch_buffer += batch_buffer_per_source + break + yield from batch_buffer + batch_buffer = [] diff --git a/mmdetection/mmdet/datasets/samplers/track_img_sampler.py b/mmdetection/mmdet/datasets/samplers/track_img_sampler.py new file mode 100644 index 0000000..d7db629 --- /dev/null +++ b/mmdetection/mmdet/datasets/samplers/track_img_sampler.py @@ -0,0 +1,146 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import random +from typing import Iterator, Optional, Sized + +import numpy as np +from mmengine.dataset import ClassBalancedDataset, ConcatDataset +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmdet.registry import DATA_SAMPLERS +from ..base_video_dataset import BaseVideoDataset + + +@DATA_SAMPLERS.register_module() +class TrackImgSampler(Sampler): + """Sampler that providing image-level sampling outputs for video datasets + in tracking tasks. It could be both used in both distributed and + non-distributed environment. 
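``MultiSourceSampler`` and ``GroupMultiSourceSampler`` above fix the per-source composition of every batch from ``source_ratio`` and shift each source's local indices into ``ConcatDataset``'s flat index space via ``cumulative_sizes``. A short numeric sketch using the docstring's batch_size=5, source_ratio=[1, 4] setup (dataset sizes are assumed):

# Same numbers as the docstring example above: batch_size=5, source_ratio=[1, 4].
batch_size, source_ratio = 5, [1, 4]
num_per_source = [int(batch_size * r / sum(source_ratio)) for r in source_ratio]
num_per_source[0] = batch_size - sum(num_per_source[1:])   # absorb rounding into source 0
print(num_per_source)  # [1, 4] -> every batch holds 1 labelled + 4 unlabelled images

# Per-source indices are shifted into ConcatDataset's flat index space:
cumulative_sizes = [0, 1000, 6000]       # assumed: 1000 + 5000 images in the two sources
print(42 + cumulative_sizes[1])          # local index 42 of source 1 -> flat index 1042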
+ If using the default sampler in pytorch, the subsequent data receiver will + get one video, which is not desired in some cases: + (Take a non-distributed environment as an example) + 1. In test mode, we want only one image is fed into the data pipeline. This + is in consideration of memory usage since feeding the whole video commonly + requires a large amount of memory (>=20G on MOTChallenge17 dataset), which + is not available in some machines. + 2. In training mode, we may want to make sure all the images in one video + are randomly sampled once in one epoch and this can not be guaranteed in + the default sampler in pytorch. + + Args: + dataset (Sized): Dataset used for sampling. + seed (int, optional): random seed used to shuffle the sampler. This + number should be identical across all processes in the distributed + group. Defaults to None. + """ + + def __init__( + self, + dataset: Sized, + seed: Optional[int] = None, + ) -> None: + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + self.epoch = 0 + if seed is None: + self.seed = sync_random_seed() + else: + self.seed = seed + + self.dataset = dataset + self.indices = [] + # Hard code here to handle different dataset wrapper + if isinstance(self.dataset, ConcatDataset): + cat_datasets = self.dataset.datasets + assert isinstance( + cat_datasets[0], BaseVideoDataset + ), f'expected BaseVideoDataset, but got {type(cat_datasets[0])}' + self.test_mode = cat_datasets[0].test_mode + assert not self.test_mode, "'ConcatDataset' should not exist in " + 'test mode' + for dataset in cat_datasets: + num_videos = len(dataset) + for video_ind in range(num_videos): + self.indices.extend([ + (video_ind, frame_ind) for frame_ind in range( + dataset.get_len_per_video(video_ind)) + ]) + elif isinstance(self.dataset, ClassBalancedDataset): + ori_dataset = self.dataset.dataset + assert isinstance( + ori_dataset, BaseVideoDataset + ), f'expected BaseVideoDataset, but got {type(ori_dataset)}' + self.test_mode = ori_dataset.test_mode + assert not self.test_mode, "'ClassBalancedDataset' should not " + 'exist in test mode' + video_indices = self.dataset.repeat_indices + for index in video_indices: + self.indices.extend([(index, frame_ind) for frame_ind in range( + ori_dataset.get_len_per_video(index))]) + else: + assert isinstance( + self.dataset, BaseVideoDataset + ), 'TrackImgSampler is only supported in BaseVideoDataset or ' + 'dataset wrapper: ClassBalancedDataset and ConcatDataset, but ' + f'got {type(self.dataset)} ' + self.test_mode = self.dataset.test_mode + num_videos = len(self.dataset) + + if self.test_mode: + # in test mode, the images belong to the same video must be put + # on the same device. 
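``TrackImgSampler`` above flattens each video dataset into image-level ``(video_idx, frame_idx)`` pairs for training, while in test mode (the branch that follows) whole videos are split across ranks so a single video never straddles two GPUs. A toy illustration of both steps with assumed frame counts:

import numpy as np

frames_per_video = [3, 2, 4]                  # stand-in for dataset.get_len_per_video(v)

# Training mode: every (video_idx, frame_idx) pair becomes one image-level sample.
flat = [(v, f) for v, n in enumerate(frames_per_video) for f in range(n)]
print(flat)       # [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (2, 0), (2, 1), (2, 2), (2, 3)]

# Test mode (see the branch below): whole videos are assigned to ranks.
world_size = 2
chunks = np.array_split(range(len(frames_per_video)), world_size)
print([c.tolist() for c in chunks])           # [[0, 1], [2]]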
+ if num_videos < self.world_size: + raise ValueError(f'only {num_videos} videos loaded,' + f'but {self.world_size} gpus were given.') + chunks = np.array_split( + list(range(num_videos)), self.world_size) + for videos_inds in chunks: + indices_chunk = [] + for video_ind in videos_inds: + indices_chunk.extend([ + (video_ind, frame_ind) for frame_ind in range( + self.dataset.get_len_per_video(video_ind)) + ]) + self.indices.append(indices_chunk) + else: + for video_ind in range(num_videos): + self.indices.extend([ + (video_ind, frame_ind) for frame_ind in range( + self.dataset.get_len_per_video(video_ind)) + ]) + + if self.test_mode: + self.num_samples = len(self.indices[self.rank]) + self.total_size = sum( + [len(index_list) for index_list in self.indices]) + else: + self.num_samples = int( + math.ceil(len(self.indices) * 1.0 / self.world_size)) + self.total_size = self.num_samples * self.world_size + + def __iter__(self) -> Iterator: + if self.test_mode: + # in test mode, the order of frames can not be shuffled. + indices = self.indices[self.rank] + else: + # deterministically shuffle based on epoch + rng = random.Random(self.epoch + self.seed) + indices = rng.sample(self.indices, len(self.indices)) + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.world_size] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/mmdetection/mmdet/datasets/transforms/__init__.py b/mmdetection/mmdet/datasets/transforms/__init__.py new file mode 100644 index 0000000..1f30d6c --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .augment_wrappers import AutoAugment, RandAugment +from .colorspace import (AutoContrast, Brightness, Color, ColorTransform, + Contrast, Equalize, Invert, Posterize, Sharpness, + Solarize, SolarizeAdd) +from .formatting import (ImageToTensor, PackDetInputs, PackReIDInputs, + PackTrackInputs, ToTensor, Transpose) +from .frame_sampling import BaseFrameSample, UniformRefFrameSample +from .geometric import (GeomTransform, Rotate, ShearX, ShearY, TranslateX, + TranslateY) +from .instaboost import InstaBoost +from .loading import (FilterAnnotations, InferencerLoader, LoadAnnotations, + LoadEmptyAnnotations, LoadImageFromNDArray, + LoadMultiChannelImageFromFiles, LoadPanopticAnnotations, + LoadProposals, LoadTrackAnnotations) +from .transformers_glip import GTBoxSubOne_GLIP, RandomFlip_GLIP +from .transforms import (Albu, CachedMixUp, CachedMosaic, CopyPaste, CutOut, + Expand, FixScaleResize, FixShapeResize, + MinIoURandomCrop, MixUp, Mosaic, Pad, + PhotoMetricDistortion, RandomAffine, + RandomCenterCropPad, RandomCrop, RandomErasing, + RandomFlip, RandomShift, Resize, ResizeShortestEdge, + SegRescale, YOLOXHSVRandomAug) +from .wrappers import MultiBranch, ProposalBroadcaster, RandomOrder + +__all__ = [ + 'PackDetInputs', 'ToTensor', 'ImageToTensor', 'Transpose', + 'LoadImageFromNDArray', 'LoadAnnotations', 'LoadPanopticAnnotations', + 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'Resize', 'RandomFlip', + 'RandomCrop', 'SegRescale', 'MinIoURandomCrop', 'Expand', + 'PhotoMetricDistortion', 'Albu', 'InstaBoost', 'RandomCenterCropPad', + 'AutoAugment', 'CutOut', 'ShearX', 'ShearY', 'Rotate', 'Color', 'Equalize', + 'Brightness', 'Contrast', 'TranslateX', 'TranslateY', 'RandomShift', + 'Mosaic', 'MixUp', 'RandomAffine', 'YOLOXHSVRandomAug', 'CopyPaste', + 'FilterAnnotations', 'Pad', 'GeomTransform', 'ColorTransform', + 'RandAugment', 'Sharpness', 'Solarize', 'SolarizeAdd', 'Posterize', + 'AutoContrast', 'Invert', 'MultiBranch', 'RandomErasing', + 'LoadEmptyAnnotations', 'RandomOrder', 'CachedMosaic', 'CachedMixUp', + 'FixShapeResize', 'ProposalBroadcaster', 'InferencerLoader', + 'LoadTrackAnnotations', 'BaseFrameSample', 'UniformRefFrameSample', + 'PackTrackInputs', 'PackReIDInputs', 'FixScaleResize', + 'ResizeShortestEdge', 'GTBoxSubOne_GLIP', 'RandomFlip_GLIP' +] diff --git a/mmdetection/mmdet/datasets/transforms/augment_wrappers.py b/mmdetection/mmdet/datasets/transforms/augment_wrappers.py new file mode 100644 index 0000000..19fae6e --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/augment_wrappers.py @@ -0,0 +1,264 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import numpy as np +from mmcv.transforms import RandomChoice +from mmcv.transforms.utils import cache_randomness +from mmengine.config import ConfigDict + +from mmdet.registry import TRANSFORMS + +# AutoAugment uses reinforcement learning to search for +# some widely useful data augmentation strategies, +# here we provide AUTOAUG_POLICIES_V0. +# For AUTOAUG_POLICIES_V0, each tuple is an augmentation +# operation of the form (operation, probability, magnitude). +# Each element in policies is a policy that will be applied +# sequentially on the image. + +# RandAugment defines a data augmentation search space, RANDAUG_SPACE, +# sampling 1~3 data augmentations each time, and +# setting the magnitude of each data augmentation randomly, +# which will be applied sequentially on the image. 
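The ``AutoAugment`` and ``RandAugment`` wrappers exported above are meant to be dropped into a detection pipeline like any other transform. A hypothetical pipeline snippet (not taken from this repo's configs) showing where they would typically sit:

# Hypothetical training-pipeline snippet; keys and ordering are illustrative only.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandAugment', aug_num=2),   # sample 2 ops per image from RANDAUG_SPACE
    # dict(type='AutoAugment'),            # or: pick one AUTOAUG_POLICIES_V0 policy per image
    dict(type='PackDetInputs'),
]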
+ +_MAX_LEVEL = 10 + +AUTOAUG_POLICIES_V0 = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], +] + + +def policies_v0(): + """Autoaugment policies that was used in AutoAugment Paper.""" + policies = list() + for policy_args in AUTOAUG_POLICIES_V0: + policy = list() + for args in policy_args: + policy.append(dict(type=args[0], prob=args[1], level=args[2])) + policies.append(policy) + return policies + + +RANDAUG_SPACE = [[dict(type='AutoContrast')], [dict(type='Equalize')], + [dict(type='Invert')], [dict(type='Rotate')], + [dict(type='Posterize')], [dict(type='Solarize')], + [dict(type='SolarizeAdd')], [dict(type='Color')], + [dict(type='Contrast')], [dict(type='Brightness')], + [dict(type='Sharpness')], [dict(type='ShearX')], + [dict(type='ShearY')], [dict(type='TranslateX')], + [dict(type='TranslateY')]] + + +def level_to_mag(level: Optional[int], min_mag: float, + max_mag: float) -> float: + """Map from level to magnitude.""" + if level is None: + return round(np.random.rand() * (max_mag - min_mag) + min_mag, 1) + else: + return round(level / _MAX_LEVEL * (max_mag - min_mag) + min_mag, 1) + + +@TRANSFORMS.register_module() +class AutoAugment(RandomChoice): + """Auto augmentation. + + This data augmentation is proposed in `AutoAugment: Learning + Augmentation Policies from Data `_ + and in `Learning Data Augmentation Strategies for Object Detection + `_. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_ignore_flags (bool) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes + - gt_bboxes_labels + - gt_masks + - gt_ignore_flags + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + policies (List[List[Union[dict, ConfigDict]]]): + The policies of auto augmentation.Each policy in ``policies`` + is a specific augmentation policy, and is composed by several + augmentations. When AutoAugment is called, a random policy in + ``policies`` will be selected to augment images. + Defaults to policy_v0(). + prob (list[float], optional): The probabilities associated + with each policy. The length should be equal to the policy + number and the sum should be 1. If not given, a uniform + distribution will be assumed. Defaults to None. 
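``level_to_mag`` above turns the discrete ``level`` into a concrete magnitude, either randomly (``level=None``) or linearly in ``level``. Evaluating the same formula at a few points shows that the midpoint level 5 of the default [0.1, 1.9] range gives magnitude 1.0, i.e. the identity setting for the colour-style transforms defined later:

# The level -> magnitude mapping defined above, evaluated at a few points:
_MAX_LEVEL = 10

def mag(level, min_mag, max_mag):            # same formula as level_to_mag with level given
    return round(level / _MAX_LEVEL * (max_mag - min_mag) + min_mag, 1)

print(mag(0, 0.1, 1.9), mag(5, 0.1, 1.9), mag(10, 0.1, 1.9))
# 0.1 1.0 1.9 -- level 5 lands on magnitude 1.0, which keeps the original image
# for Color/Brightness/Contrast/Sharpness.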
+ + Examples: + >>> policies = [ + >>> [ + >>> dict(type='Sharpness', prob=0.0, level=8), + >>> dict(type='ShearX', prob=0.4, level=0,) + >>> ], + >>> [ + >>> dict(type='Rotate', prob=0.6, level=10), + >>> dict(type='Color', prob=1.0, level=6) + >>> ] + >>> ] + >>> augmentation = AutoAugment(policies) + >>> img = np.ones(100, 100, 3) + >>> gt_bboxes = np.ones(10, 4) + >>> results = dict(img=img, gt_bboxes=gt_bboxes) + >>> results = augmentation(results) + """ + + def __init__(self, + policies: List[List[Union[dict, ConfigDict]]] = policies_v0(), + prob: Optional[List[float]] = None) -> None: + assert isinstance(policies, list) and len(policies) > 0, \ + 'Policies must be a non-empty list.' + for policy in policies: + assert isinstance(policy, list) and len(policy) > 0, \ + 'Each policy in policies must be a non-empty list.' + for augment in policy: + assert isinstance(augment, dict) and 'type' in augment, \ + 'Each specific augmentation must be a dict with key' \ + ' "type".' + super().__init__(transforms=policies, prob=prob) + self.policies = policies + + def __repr__(self) -> str: + return f'{self.__class__.__name__}(policies={self.policies}, ' \ + f'prob={self.prob})' + + +@TRANSFORMS.register_module() +class RandAugment(RandomChoice): + """Rand augmentation. + + This data augmentation is proposed in `RandAugment: + Practical automated data augmentation with a reduced + search space `_. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_ignore_flags (bool) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes + - gt_bboxes_labels + - gt_masks + - gt_ignore_flags + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + aug_space (List[List[Union[dict, ConfigDict]]]): The augmentation space + of rand augmentation. Each augmentation transform in ``aug_space`` + is a specific transform, and is composed by several augmentations. + When RandAugment is called, a random transform in ``aug_space`` + will be selected to augment images. Defaults to aug_space. + aug_num (int): Number of augmentation to apply equentially. + Defaults to 2. + prob (list[float], optional): The probabilities associated with + each augmentation. The length should be equal to the + augmentation space and the sum should be 1. If not given, + a uniform distribution will be assumed. Defaults to None. + + Examples: + >>> aug_space = [ + >>> dict(type='Sharpness'), + >>> dict(type='ShearX'), + >>> dict(type='Color'), + >>> ], + >>> augmentation = RandAugment(aug_space) + >>> img = np.ones(100, 100, 3) + >>> gt_bboxes = np.ones(10, 4) + >>> results = dict(img=img, gt_bboxes=gt_bboxes) + >>> results = augmentation(results) + """ + + def __init__(self, + aug_space: List[Union[dict, ConfigDict]] = RANDAUG_SPACE, + aug_num: int = 2, + prob: Optional[List[float]] = None) -> None: + assert isinstance(aug_space, list) and len(aug_space) > 0, \ + 'Augmentation space must be a non-empty list.' + for aug in aug_space: + assert isinstance(aug, list) and len(aug) == 1, \ + 'Each augmentation in aug_space must be a list.' + for transform in aug: + assert isinstance(transform, dict) and 'type' in transform, \ + 'Each specific transform must be a dict with key' \ + ' "type".' 
+ super().__init__(transforms=aug_space, prob=prob) + self.aug_space = aug_space + self.aug_num = aug_num + + @cache_randomness + def random_pipeline_index(self): + indices = np.arange(len(self.transforms)) + return np.random.choice( + indices, self.aug_num, p=self.prob, replace=False) + + def transform(self, results: dict) -> dict: + """Transform function to use RandAugment. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with RandAugment. + """ + for idx in self.random_pipeline_index(): + results = self.transforms[idx](results) + return results + + def __repr__(self) -> str: + return f'{self.__class__.__name__}(' \ + f'aug_space={self.aug_space}, '\ + f'aug_num={self.aug_num}, ' \ + f'prob={self.prob})' diff --git a/mmdetection/mmdet/datasets/transforms/colorspace.py b/mmdetection/mmdet/datasets/transforms/colorspace.py new file mode 100644 index 0000000..e0ba2e9 --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/colorspace.py @@ -0,0 +1,493 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmcv.transforms.utils import cache_randomness + +from mmdet.registry import TRANSFORMS +from .augment_wrappers import _MAX_LEVEL, level_to_mag + + +@TRANSFORMS.register_module() +class ColorTransform(BaseTransform): + """Base class for color transformations. All color transformations need to + inherit from this base class. ``ColorTransform`` unifies the class + attributes and class functions of color transformations (Color, Brightness, + Contrast, Sharpness, Solarize, SolarizeAdd, Equalize, AutoContrast, Invert, + and Posterize), and only distort color channels, without impacting the + locations of the instances. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing the geometric + transformation and should be in range [0, 1]. Defaults to 1.0. + level (int, optional): The level should be in range [0, _MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for color transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for color transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0 <= prob <= 1.0, f'The probability of the transformation ' \ + f'should be in range [0,1], got {prob}.' + assert level is None or isinstance(level, int), \ + f'The level should be None or type int, got {type(level)}.' + assert level is None or 0 <= level <= _MAX_LEVEL, \ + f'The level should be in range [0,{_MAX_LEVEL}], got {level}.' + assert isinstance(min_mag, float), \ + f'min_mag should be type float, got {type(min_mag)}.' + assert isinstance(max_mag, float), \ + f'max_mag should be type float, got {type(max_mag)}.' 
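``RandAugment.random_pipeline_index`` earlier in this hunk picks ``aug_num`` distinct transforms per call with ``np.random.choice(..., replace=False)`` and applies them in sequence. A minimal sketch of that selection step (sizes assumed):

import numpy as np

# Choose `aug_num` distinct transforms from the augmentation space
# (uniformly here, since prob=None).
aug_space_size, aug_num = 15, 2
picked = np.random.choice(np.arange(aug_space_size), aug_num, replace=False)
print(picked)  # e.g. [13  2] -- these two transforms are then applied in sequence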
+ assert min_mag <= max_mag, \ + f'min_mag should smaller than max_mag, ' \ + f'got min_mag={min_mag} and max_mag={max_mag}' + self.prob = prob + self.level = level + self.min_mag = min_mag + self.max_mag = max_mag + + def _transform_img(self, results: dict, mag: float) -> None: + """Transform the image.""" + pass + + @cache_randomness + def _random_disable(self): + """Randomly disable the transform.""" + return np.random.rand() > self.prob + + @cache_randomness + def _get_mag(self): + """Get the magnitude of the transform.""" + return level_to_mag(self.level, self.min_mag, self.max_mag) + + def transform(self, results: dict) -> dict: + """Transform function for images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Transformed results. + """ + + if self._random_disable(): + return results + mag = self._get_mag() + self._transform_img(results, mag) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'level={self.level}, ' + repr_str += f'min_mag={self.min_mag}, ' + repr_str += f'max_mag={self.max_mag})' + return repr_str + + +@TRANSFORMS.register_module() +class Color(ColorTransform): + """Adjust the color balance of the image, in a manner similar to the + controls on a colour TV set. A magnitude=0 gives a black & white image, + whereas magnitude=1 gives the original image. The bboxes, masks and + segmentations are not modified. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Color transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Color transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for Color transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0. <= min_mag <= 2.0, \ + f'min_mag for Color should be in range [0,2], got {min_mag}.' + assert 0. <= max_mag <= 2.0, \ + f'max_mag for Color should be in range [0,2], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Apply Color transformation to image.""" + # NOTE defaultly the image should be BGR format + img = results['img'] + results['img'] = mmcv.adjust_color(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Brightness(ColorTransform): + """Adjust the brightness of the image. A magnitude=0 gives a black image, + whereas magnitude=1 gives the original image. The bboxes, masks and + segmentations are not modified. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Brightness transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Brightness transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for Brightness transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0. 
<= min_mag <= 2.0, \ + f'min_mag for Brightness should be in range [0,2], got {min_mag}.' + assert 0. <= max_mag <= 2.0, \ + f'max_mag for Brightness should be in range [0,2], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Adjust the brightness of image.""" + img = results['img'] + results['img'] = mmcv.adjust_brightness(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Contrast(ColorTransform): + """Control the contrast of the image. A magnitude=0 gives a gray image, + whereas magnitude=1 gives the original imageThe bboxes, masks and + segmentations are not modified. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Contrast transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Contrast transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for Contrast transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0. <= min_mag <= 2.0, \ + f'min_mag for Contrast should be in range [0,2], got {min_mag}.' + assert 0. <= max_mag <= 2.0, \ + f'max_mag for Contrast should be in range [0,2], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Adjust the image contrast.""" + img = results['img'] + results['img'] = mmcv.adjust_contrast(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Sharpness(ColorTransform): + """Adjust images sharpness. A positive magnitude would enhance the + sharpness and a negative magnitude would make the image blurry. A + magnitude=0 gives the origin img. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Sharpness transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Sharpness transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for Sharpness transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0. <= min_mag <= 2.0, \ + f'min_mag for Sharpness should be in range [0,2], got {min_mag}.' + assert 0. <= max_mag <= 2.0, \ + f'max_mag for Sharpness should be in range [0,2], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Adjust the image sharpness.""" + img = results['img'] + results['img'] = mmcv.adjust_sharpness(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Solarize(ColorTransform): + """Solarize images (Invert all pixels above a threshold value of + magnitude.). + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Solarize transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. 
+ If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Solarize transformation. + Defaults to 0.0. + max_mag (float): The maximum magnitude for Solarize transformation. + Defaults to 256.0. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 256.0) -> None: + assert 0. <= min_mag <= 256.0, f'min_mag for Solarize should be ' \ + f'in range [0, 256], got {min_mag}.' + assert 0. <= max_mag <= 256.0, f'max_mag for Solarize should be ' \ + f'in range [0, 256], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Invert all pixel values above magnitude.""" + img = results['img'] + results['img'] = mmcv.solarize(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class SolarizeAdd(ColorTransform): + """SolarizeAdd images. For each pixel in the image that is less than 128, + add an additional amount to it decided by the magnitude. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing SolarizeAdd + transformation. Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for SolarizeAdd transformation. + Defaults to 0.0. + max_mag (float): The maximum magnitude for SolarizeAdd transformation. + Defaults to 110.0. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 110.0) -> None: + assert 0. <= min_mag <= 110.0, f'min_mag for SolarizeAdd should be ' \ + f'in range [0, 110], got {min_mag}.' + assert 0. <= max_mag <= 110.0, f'max_mag for SolarizeAdd should be ' \ + f'in range [0, 110], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """SolarizeAdd the image.""" + img = results['img'] + img_solarized = np.where(img < 128, np.minimum(img + mag, 255), img) + results['img'] = img_solarized.astype(img.dtype) + + +@TRANSFORMS.register_module() +class Posterize(ColorTransform): + """Posterize images (reduce the number of bits for each color channel). + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Posterize + transformation. Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Posterize transformation. + Defaults to 0.0. + max_mag (float): The maximum magnitude for Posterize transformation. + Defaults to 4.0. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 4.0) -> None: + assert 0. <= min_mag <= 8.0, f'min_mag for Posterize should be ' \ + f'in range [0, 8], got {min_mag}.' + assert 0. <= max_mag <= 8.0, f'max_mag for Posterize should be ' \ + f'in range [0, 8], got {max_mag}.' 
+ super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Posterize the image.""" + img = results['img'] + results['img'] = mmcv.posterize(img, math.ceil(mag)).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Equalize(ColorTransform): + """Equalize the image histogram. The bboxes, masks and segmentations are + not modified. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Equalize transformation. + Defaults to 1.0. + level (int, optional): No use for Equalize transformation. + Defaults to None. + min_mag (float): No use for Equalize transformation. Defaults to 0.1. + max_mag (float): No use for Equalize transformation. Defaults to 1.9. + """ + + def _transform_img(self, results: dict, mag: float) -> None: + """Equalizes the histogram of one image.""" + img = results['img'] + results['img'] = mmcv.imequalize(img).astype(img.dtype) + + +@TRANSFORMS.register_module() +class AutoContrast(ColorTransform): + """Auto adjust image contrast. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing AutoContrast should + be in range [0, 1]. Defaults to 1.0. + level (int, optional): No use for AutoContrast transformation. + Defaults to None. + min_mag (float): No use for AutoContrast transformation. + Defaults to 0.1. + max_mag (float): No use for AutoContrast transformation. + Defaults to 1.9. + """ + + def _transform_img(self, results: dict, mag: float) -> None: + """Auto adjust image contrast.""" + img = results['img'] + results['img'] = mmcv.auto_contrast(img).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Invert(ColorTransform): + """Invert images. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing invert therefore should + be in range [0, 1]. Defaults to 1.0. + level (int, optional): No use for Invert transformation. + Defaults to None. + min_mag (float): No use for Invert transformation. Defaults to 0.1. + max_mag (float): No use for Invert transformation. Defaults to 1.9. + """ + + def _transform_img(self, results: dict, mag: float) -> None: + """Invert the image.""" + img = results['img'] + results['img'] = mmcv.iminvert(img).astype(img.dtype) diff --git a/mmdetection/mmdet/datasets/transforms/formatting.py b/mmdetection/mmdet/datasets/transforms/formatting.py new file mode 100644 index 0000000..0526380 --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/formatting.py @@ -0,0 +1,512 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence + +import numpy as np +from mmcv.transforms import to_tensor +from mmcv.transforms.base import BaseTransform +from mmengine.structures import InstanceData, PixelData + +from mmdet.registry import TRANSFORMS +from mmdet.structures import DetDataSample, ReIDDataSample, TrackDataSample +from mmdet.structures.bbox import BaseBoxes + + +@TRANSFORMS.register_module() +class PackDetInputs(BaseTransform): + """Pack the inputs data for the detection / semantic segmentation / + panoptic segmentation. + + The ``img_meta`` item is always populated. The contents of the + ``img_meta`` dictionary depends on ``meta_keys``. 
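``Solarize``, ``SolarizeAdd`` and ``Posterize`` above are all simple per-pixel threshold or bit operations delegated to mmcv. The toy NumPy math below illustrates the intended effect; it is an approximation and mmcv's implementations may differ in detail:

import numpy as np

img = np.array([[10, 100, 200]], dtype=np.uint8)

solarize = np.where(img < 128, img, 255 - img)            # invert pixels >= threshold 128
solarize_add = np.where(img < 128,
                        np.minimum(img.astype(np.int32) + 64, 255),
                        img).astype(np.uint8)             # brighten only the dark pixels
posterize_3bit = (img >> 5) << 5                          # keep the 3 most significant bits
print(solarize)        # [[ 10 100  55]]
print(solarize_add)    # [[ 74 164 200]]
print(posterize_3bit)  # [[  0  96 192]]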
By default this includes: + + - ``img_id``: id of the image + + - ``img_path``: path to the image file + + - ``ori_shape``: original shape of the image as a tuple (h, w) + + - ``img_shape``: shape of the image input to the network as a tuple \ + (h, w). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. + + - ``scale_factor``: a float indicating the preprocessing scale + + - ``flip``: a boolean indicating if image flip transform was used + + - ``flip_direction``: the flipping direction + + Args: + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ``('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')`` + """ + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_bboxes_labels': 'labels', + 'gt_masks': 'masks' + } + + def __init__(self, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')): + self.meta_keys = meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + + - 'inputs' (obj:`torch.Tensor`): The forward data of models. + - 'data_sample' (obj:`DetDataSample`): The annotation info of the + sample. + """ + packed_results = dict() + if 'img' in results: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # To improve the computational speed by by 3-5 times, apply: + # If image is not contiguous, use + # `numpy.transpose()` followed by `numpy.ascontiguousarray()` + # If image is already contiguous, use + # `torch.permute()` followed by `torch.contiguous()` + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if not img.flags.c_contiguous: + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + img = to_tensor(img) + else: + img = to_tensor(img).permute(2, 0, 1).contiguous() + + packed_results['inputs'] = img + + if 'gt_ignore_flags' in results: + valid_idx = np.where(results['gt_ignore_flags'] == 0)[0] + ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0] + + data_sample = DetDataSample() + instance_data = InstanceData() + ignore_instance_data = InstanceData() + + for key in self.mapping_table.keys(): + if key not in results: + continue + if key == 'gt_masks' or isinstance(results[key], BaseBoxes): + if 'gt_ignore_flags' in results: + instance_data[ + self.mapping_table[key]] = results[key][valid_idx] + ignore_instance_data[ + self.mapping_table[key]] = results[key][ignore_idx] + else: + instance_data[self.mapping_table[key]] = results[key] + else: + if 'gt_ignore_flags' in results: + instance_data[self.mapping_table[key]] = to_tensor( + results[key][valid_idx]) + ignore_instance_data[self.mapping_table[key]] = to_tensor( + results[key][ignore_idx]) + else: + instance_data[self.mapping_table[key]] = to_tensor( + results[key]) + data_sample.gt_instances = instance_data + data_sample.ignored_instances = ignore_instance_data + + if 'proposals' in results: + proposals = InstanceData( + bboxes=to_tensor(results['proposals']), + scores=to_tensor(results['proposals_scores'])) + data_sample.proposals = proposals + + if 'gt_seg_map' in results: + gt_sem_seg_data = dict( + sem_seg=to_tensor(results['gt_seg_map'][None, ...].copy())) + gt_sem_seg_data = PixelData(**gt_sem_seg_data) + if 'ignore_index' in results: + metainfo = dict(ignore_index=results['ignore_index']) + 
gt_sem_seg_data.set_metainfo(metainfo) + data_sample.gt_sem_seg = gt_sem_seg_data + + img_meta = {} + for key in self.meta_keys: + if key in results: + img_meta[key] = results[key] + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class ToTensor: + """Convert some results to :obj:`torch.Tensor` by given keys. + + Args: + keys (Sequence[str]): Keys that need to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert data in results to :obj:`torch.Tensor`. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data converted + to :obj:`torch.Tensor`. + """ + for key in self.keys: + results[key] = to_tensor(results[key]) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@TRANSFORMS.register_module() +class ImageToTensor: + """Convert image to :obj:`torch.Tensor` by given keys. + + The dimension order of input image is (H, W, C). The pipeline will convert + it to (C, H, W). If only 2 dimension (H, W) is given, the output would be + (1, H, W). + + Args: + keys (Sequence[str]): Key of images to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert image in results to :obj:`torch.Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. + + Returns: + dict: The result dict contains the image converted + to :obj:`torch.Tensor` and permuted to (C, H, W) order. + """ + for key in self.keys: + img = results[key] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + results[key] = to_tensor(img).permute(2, 0, 1).contiguous() + + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@TRANSFORMS.register_module() +class Transpose: + """Transpose some results by given keys. + + Args: + keys (Sequence[str]): Keys of results to be transposed. + order (Sequence[int]): Order of transpose. + """ + + def __init__(self, keys, order): + self.keys = keys + self.order = order + + def __call__(self, results): + """Call function to transpose the channel order of data in results. + + Args: + results (dict): Result dict contains the data to transpose. + + Returns: + dict: The result dict contains the data transposed to \ + ``self.order``. + """ + for key in self.keys: + results[key] = results[key].transpose(self.order) + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, order={self.order})' + + +@TRANSFORMS.register_module() +class WrapFieldsToLists: + """Wrap fields of the data dictionary into lists for evaluation. + + This class can be used as a last step of a test or validation + pipeline for single image evaluation or inference. 
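``PackDetInputs.transform`` above converts the HWC image to a CHW tensor, choosing between ``ascontiguousarray`` + ``transpose`` and ``permute`` + ``contiguous`` depending on whether the array is already contiguous (the speed trick referenced in the comment). A minimal sketch of that branch, approximating ``to_tensor`` with ``torch.from_numpy``:

import numpy as np
import torch

img = np.zeros((480, 640, 3), dtype=np.uint8)   # toy HWC image

if not img.flags.c_contiguous:
    inputs = torch.from_numpy(np.ascontiguousarray(img.transpose(2, 0, 1)))
else:
    inputs = torch.from_numpy(img).permute(2, 0, 1).contiguous()
print(inputs.shape)  # torch.Size([3, 480, 640])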
+ + Example: + >>> test_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + >>> dict(type='Pad', size_divisor=32), + >>> dict(type='ImageToTensor', keys=['img']), + >>> dict(type='Collect', keys=['img']), + >>> dict(type='WrapFieldsToLists') + >>> ] + """ + + def __call__(self, results): + """Call function to wrap fields into lists. + + Args: + results (dict): Result dict contains the data to wrap. + + Returns: + dict: The result dict where value of ``self.keys`` are wrapped \ + into list. + """ + + # Wrap dict fields into lists + for key, val in results.items(): + results[key] = [val] + return results + + def __repr__(self): + return f'{self.__class__.__name__}()' + + +@TRANSFORMS.register_module() +class PackTrackInputs(BaseTransform): + """Pack the inputs data for the multi object tracking and video instance + segmentation. All the information of images are packed to ``inputs``. All + the information except images are packed to ``data_samples``. In order to + get the original annotaiton and meta info, we add `instances` key into meta + keys. + + Args: + meta_keys (Sequence[str]): Meta keys to be collected in + ``data_sample.metainfo``. Defaults to None. + default_meta_keys (tuple): Default meta keys. Defaults to ('img_id', + 'img_path', 'ori_shape', 'img_shape', 'scale_factor', + 'flip', 'flip_direction', 'frame_id', 'is_video_data', + 'video_id', 'video_length', 'instances'). + """ + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_bboxes_labels': 'labels', + 'gt_masks': 'masks', + 'gt_instances_ids': 'instances_ids' + } + + def __init__(self, + meta_keys: Optional[dict] = None, + default_meta_keys: tuple = ('img_id', 'img_path', 'ori_shape', + 'img_shape', 'scale_factor', + 'flip', 'flip_direction', + 'frame_id', 'video_id', + 'video_length', + 'ori_video_length', 'instances')): + self.meta_keys = default_meta_keys + if meta_keys is not None: + if isinstance(meta_keys, str): + meta_keys = (meta_keys, ) + else: + assert isinstance(meta_keys, tuple), \ + 'meta_keys must be str or tuple' + self.meta_keys += meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + Args: + results (dict): Result dict from the data pipeline. + Returns: + dict: + - 'inputs' (dict[Tensor]): The forward data of models. + - 'data_samples' (obj:`TrackDataSample`): The annotation info of + the samples. + """ + packed_results = dict() + packed_results['inputs'] = dict() + + # 1. Pack images + if 'img' in results: + imgs = results['img'] + imgs = np.stack(imgs, axis=0) + imgs = imgs.transpose(0, 3, 1, 2) + packed_results['inputs'] = to_tensor(imgs) + + # 2. 
Pack InstanceData + if 'gt_ignore_flags' in results: + gt_ignore_flags_list = results['gt_ignore_flags'] + valid_idx_list, ignore_idx_list = [], [] + for gt_ignore_flags in gt_ignore_flags_list: + valid_idx = np.where(gt_ignore_flags == 0)[0] + ignore_idx = np.where(gt_ignore_flags == 1)[0] + valid_idx_list.append(valid_idx) + ignore_idx_list.append(ignore_idx) + + assert 'img_id' in results, "'img_id' must contained in the results " + 'for counting the number of images' + + num_imgs = len(results['img_id']) + instance_data_list = [InstanceData() for _ in range(num_imgs)] + ignore_instance_data_list = [InstanceData() for _ in range(num_imgs)] + + for key in self.mapping_table.keys(): + if key not in results: + continue + if key == 'gt_masks': + mapped_key = self.mapping_table[key] + gt_masks_list = results[key] + if 'gt_ignore_flags' in results: + for i, gt_mask in enumerate(gt_masks_list): + valid_idx, ignore_idx = valid_idx_list[ + i], ignore_idx_list[i] + instance_data_list[i][mapped_key] = gt_mask[valid_idx] + ignore_instance_data_list[i][mapped_key] = gt_mask[ + ignore_idx] + + else: + for i, gt_mask in enumerate(gt_masks_list): + instance_data_list[i][mapped_key] = gt_mask + + else: + anns_list = results[key] + if 'gt_ignore_flags' in results: + for i, ann in enumerate(anns_list): + valid_idx, ignore_idx = valid_idx_list[ + i], ignore_idx_list[i] + instance_data_list[i][ + self.mapping_table[key]] = to_tensor( + ann[valid_idx]) + ignore_instance_data_list[i][ + self.mapping_table[key]] = to_tensor( + ann[ignore_idx]) + else: + for i, ann in enumerate(anns_list): + instance_data_list[i][ + self.mapping_table[key]] = to_tensor(ann) + + det_data_samples_list = [] + for i in range(num_imgs): + det_data_sample = DetDataSample() + det_data_sample.gt_instances = instance_data_list[i] + det_data_sample.ignored_instances = ignore_instance_data_list[i] + det_data_samples_list.append(det_data_sample) + + # 3. Pack metainfo + for key in self.meta_keys: + if key not in results: + continue + img_metas_list = results[key] + for i, img_meta in enumerate(img_metas_list): + det_data_samples_list[i].set_metainfo({f'{key}': img_meta}) + + track_data_sample = TrackDataSample() + track_data_sample.video_data_samples = det_data_samples_list + if 'key_frame_flags' in results: + key_frame_flags = np.asarray(results['key_frame_flags']) + key_frames_inds = np.where(key_frame_flags)[0].tolist() + ref_frames_inds = np.where(~key_frame_flags)[0].tolist() + track_data_sample.set_metainfo( + dict(key_frames_inds=key_frames_inds)) + track_data_sample.set_metainfo( + dict(ref_frames_inds=ref_frames_inds)) + + packed_results['data_samples'] = track_data_sample + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'meta_keys={self.meta_keys}, ' + repr_str += f'default_meta_keys={self.default_meta_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class PackReIDInputs(BaseTransform): + """Pack the inputs data for the ReID. The ``meta_info`` item is always + populated. The contents of the ``meta_info`` dictionary depends on + ``meta_keys``. By default this includes: + + - ``img_path``: path to the image file. + - ``ori_shape``: original shape of the image as a tuple (H, W). + - ``img_shape``: shape of the image input to the network as a tuple + (H, W). Note that images may be zero padded on the bottom/right + if the batch tensor is larger than this shape. + - ``scale``: scale of the image as a tuple (W, H). 
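# Sketch of the valid / ignored split driven by `gt_ignore_flags`, as used by
# both PackDetInputs and PackTrackInputs above; plain numpy with toy values.
import numpy as np

gt_ignore_flags = np.array([0, 1, 0, 0, 1])
gt_bboxes_labels = np.array([3, 7, 1, 2, 5])

valid_idx = np.where(gt_ignore_flags == 0)[0]
ignore_idx = np.where(gt_ignore_flags == 1)[0]

print(gt_bboxes_labels[valid_idx])   # [3 1 2] -> routed to gt_instances
print(gt_bboxes_labels[ignore_idx])  # [7 5]   -> routed to ignored_instances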
+ - ``scale_factor``: a float indicating the pre-processing scale. + - ``flip``: a boolean indicating if image flip transform was used. + - ``flip_direction``: the flipping direction. + Args: + meta_keys (Sequence[str], optional): The meta keys to saved in the + ``metainfo`` of the packed ``data_sample``. + """ + default_meta_keys = ('img_path', 'ori_shape', 'img_shape', 'scale', + 'scale_factor') + + def __init__(self, meta_keys: Sequence[str] = ()) -> None: + self.meta_keys = self.default_meta_keys + if meta_keys is not None: + if isinstance(meta_keys, str): + meta_keys = (meta_keys, ) + else: + assert isinstance(meta_keys, tuple), \ + 'meta_keys must be str or tuple.' + self.meta_keys += meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + Args: + results (dict): Result dict from the data pipeline. + Returns: + dict: + - 'inputs' (dict[Tensor]): The forward data of models. + - 'data_samples' (obj:`ReIDDataSample`): The meta info of the + sample. + """ + packed_results = dict(inputs=dict(), data_samples=None) + assert 'img' in results, 'Missing the key ``img``.' + _type = type(results['img']) + label = results['gt_label'] + + if _type == list: + img = results['img'] + label = np.stack(label, axis=0) # (N,) + assert all([type(v) == _type for v in results.values()]), \ + 'All items in the results must have the same type.' + else: + img = [results['img']] + + img = np.stack(img, axis=3) # (H, W, C, N) + img = img.transpose(3, 2, 0, 1) # (N, C, H, W) + img = np.ascontiguousarray(img) + + packed_results['inputs'] = to_tensor(img) + + data_sample = ReIDDataSample() + data_sample.set_gt_label(label) + + meta_info = dict() + for key in self.meta_keys: + meta_info[key] = results[key] + data_sample.set_metainfo(meta_info) + packed_results['data_samples'] = data_sample + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str diff --git a/mmdetection/mmdet/datasets/transforms/frame_sampling.py b/mmdetection/mmdet/datasets/transforms/frame_sampling.py new file mode 100644 index 0000000..a91f1e7 --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/frame_sampling.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from collections import defaultdict +from typing import Dict, List, Optional, Union + +from mmcv.transforms import BaseTransform + +from mmdet.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class BaseFrameSample(BaseTransform): + """Directly get the key frame, no reference frames. + + Args: + collect_video_keys (list[str]): The keys of video info to be + collected. + """ + + def __init__(self, + collect_video_keys: List[str] = ['video_id', 'video_length']): + self.collect_video_keys = collect_video_keys + + def prepare_data(self, video_infos: dict, + sampled_inds: List[int]) -> Dict[str, List]: + """Prepare data for the subsequent pipeline. + + Args: + video_infos (dict): The whole video information. + sampled_inds (list[int]): The sampled frame indices. + + Returns: + dict: The processed data information. 
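# Sketch of the image stacking done in PackReIDInputs.transform above: a list
# of (H, W, C) crops is stacked to (H, W, C, N) and then moved to (N, C, H, W).
import numpy as np

crops = [np.zeros((64, 32, 3), dtype=np.uint8) for _ in range(4)]
batch = np.stack(crops, axis=3)        # (64, 32, 3, 4)
batch = batch.transpose(3, 2, 0, 1)    # (4, 3, 64, 32)
batch = np.ascontiguousarray(batch)
print(batch.shape)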
+ """ + frames_anns = video_infos['images'] + final_data_info = defaultdict(list) + # for data in frames_anns: + for index in sampled_inds: + data = frames_anns[index] + # copy the info in video-level into img-level + for key in self.collect_video_keys: + if key == 'video_length': + data['ori_video_length'] = video_infos[key] + data['video_length'] = len(sampled_inds) + else: + data[key] = video_infos[key] + # Collate data_list (list of dict to dict of list) + for key, value in data.items(): + final_data_info[key].append(value) + + return final_data_info + + def transform(self, video_infos: dict) -> Optional[Dict[str, List]]: + """Transform the video information. + + Args: + video_infos (dict): The whole video information. + + Returns: + dict: The data information of the key frames. + """ + if 'key_frame_id' in video_infos: + key_frame_id = video_infos['key_frame_id'] + assert isinstance(video_infos['key_frame_id'], int) + else: + key_frame_id = random.sample( + list(range(video_infos['video_length'])), 1)[0] + results = self.prepare_data(video_infos, [key_frame_id]) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(collect_video_keys={self.collect_video_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class UniformRefFrameSample(BaseFrameSample): + """Uniformly sample reference frames. + + Args: + num_ref_imgs (int): Number of reference frames to be sampled. + frame_range (int | list[int]): Range of frames to be sampled around + key frame. If int, the range is [-frame_range, frame_range]. + Defaults to 10. + filter_key_img (bool): Whether to filter the key frame when + sampling reference frames. Defaults to True. + collect_video_keys (list[str]): The keys of video info to be + collected. + """ + + def __init__(self, + num_ref_imgs: int = 1, + frame_range: Union[int, List[int]] = 10, + filter_key_img: bool = True, + collect_video_keys: List[str] = ['video_id', 'video_length']): + self.num_ref_imgs = num_ref_imgs + self.filter_key_img = filter_key_img + if isinstance(frame_range, int): + assert frame_range >= 0, 'frame_range can not be a negative value.' + frame_range = [-frame_range, frame_range] + elif isinstance(frame_range, list): + assert len(frame_range) == 2, 'The length must be 2.' + assert frame_range[0] <= 0 and frame_range[1] >= 0 + for i in frame_range: + assert isinstance(i, int), 'Each element must be int.' + else: + raise TypeError('The type of frame_range must be int or list.') + self.frame_range = frame_range + super().__init__(collect_video_keys=collect_video_keys) + + def sampling_frames(self, video_length: int, key_frame_id: int): + """Sampling frames. + + Args: + video_length (int): The length of the video. + key_frame_id (int): The key frame id. + + Returns: + list[int]: The sampled frame indices. 
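# Sketch of the "list of per-frame dicts -> dict of lists" collation performed
# in BaseFrameSample.prepare_data above; pure Python, toy frame records.
from collections import defaultdict

frames = [
    {'img_path': 'f0.jpg', 'frame_id': 0},
    {'img_path': 'f2.jpg', 'frame_id': 2},
]
final_data_info = defaultdict(list)
for data in frames:
    for key, value in data.items():
        final_data_info[key].append(value)
print(dict(final_data_info))
# {'img_path': ['f0.jpg', 'f2.jpg'], 'frame_id': [0, 2]}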
+ """ + if video_length > 1: + left = max(0, key_frame_id + self.frame_range[0]) + right = min(key_frame_id + self.frame_range[1], video_length - 1) + frame_ids = list(range(0, video_length)) + + valid_ids = frame_ids[left:right + 1] + if self.filter_key_img and key_frame_id in valid_ids: + valid_ids.remove(key_frame_id) + assert len( + valid_ids + ) > 0, 'After filtering key frame, there are no valid frames' + if len(valid_ids) < self.num_ref_imgs: + valid_ids = valid_ids * self.num_ref_imgs + ref_frame_ids = random.sample(valid_ids, self.num_ref_imgs) + else: + ref_frame_ids = [key_frame_id] * self.num_ref_imgs + + sampled_frames_ids = [key_frame_id] + ref_frame_ids + sampled_frames_ids = sorted(sampled_frames_ids) + + key_frames_ind = sampled_frames_ids.index(key_frame_id) + key_frame_flags = [False] * len(sampled_frames_ids) + key_frame_flags[key_frames_ind] = True + return sampled_frames_ids, key_frame_flags + + def transform(self, video_infos: dict) -> Optional[Dict[str, List]]: + """Transform the video information. + + Args: + video_infos (dict): The whole video information. + + Returns: + dict: The data information of the sampled frames. + """ + if 'key_frame_id' in video_infos: + key_frame_id = video_infos['key_frame_id'] + assert isinstance(video_infos['key_frame_id'], int) + else: + key_frame_id = random.sample( + list(range(video_infos['video_length'])), 1)[0] + + (sampled_frames_ids, key_frame_flags) = self.sampling_frames( + video_infos['video_length'], key_frame_id=key_frame_id) + results = self.prepare_data(video_infos, sampled_frames_ids) + results['key_frame_flags'] = key_frame_flags + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(num_ref_imgs={self.num_ref_imgs}, ' + repr_str += f'frame_range={self.frame_range}, ' + repr_str += f'filter_key_img={self.filter_key_img}, ' + repr_str += f'collect_video_keys={self.collect_video_keys})' + return repr_str diff --git a/mmdetection/mmdet/datasets/transforms/geometric.py b/mmdetection/mmdet/datasets/transforms/geometric.py new file mode 100644 index 0000000..d2cd6be --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/geometric.py @@ -0,0 +1,754 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Optional, Union + +import cv2 +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmcv.transforms.utils import cache_randomness + +from mmdet.registry import TRANSFORMS +from mmdet.structures.bbox import autocast_box_type +from .augment_wrappers import _MAX_LEVEL, level_to_mag + + +@TRANSFORMS.register_module() +class GeomTransform(BaseTransform): + """Base class for geometric transformations. All geometric transformations + need to inherit from this base class. ``GeomTransform`` unifies the class + attributes and class functions of geometric transformations (ShearX, + ShearY, Rotate, TranslateX, and TranslateY), and records the homography + matrix. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for performing the geometric + transformation and should be in range [0, 1]. Defaults to 1.0. + level (int, optional): The level should be in range [0, _MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. 
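# Toy end-to-end sketch of UniformRefFrameSample, assuming it is importable
# from mmdet.datasets.transforms as in upstream mmdetection; `video_infos` is
# a minimal fake video record in the format expected by transform().
from mmdet.datasets.transforms import UniformRefFrameSample

video_infos = {
    'video_id': 0,
    'video_length': 5,
    'key_frame_id': 2,
    'images': [dict(img_path=f'frame_{i}.jpg', frame_id=i) for i in range(5)],
}
sampler = UniformRefFrameSample(num_ref_imgs=1, frame_range=2)
out = sampler(video_infos)
print(out['img_path'])         # key frame plus one reference frame, sorted
print(out['key_frame_flags'])  # e.g. [True, False] or [False, True]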
+        min_mag (float): The minimum magnitude for geometric transformation.
+            Defaults to 0.0.
+        max_mag (float): The maximum magnitude for geometric transformation.
+            Defaults to 1.0.
+        reversal_prob (float): The probability that reverses the geometric
+            transformation magnitude. Should be in range [0,1].
+            Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should be 3 elements.
+            Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equal ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 1.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0 <= prob <= 1.0, f'The probability of the transformation ' \
+                                 f'should be in range [0,1], got {prob}.'
+        assert level is None or isinstance(level, int), \
+            f'The level should be None or type int, got {type(level)}.'
+        assert level is None or 0 <= level <= _MAX_LEVEL, \
+            f'The level should be in range [0,{_MAX_LEVEL}], got {level}.'
+        assert isinstance(min_mag, float), \
+            f'min_mag should be type float, got {type(min_mag)}.'
+        assert isinstance(max_mag, float), \
+            f'max_mag should be type float, got {type(max_mag)}.'
+        assert min_mag <= max_mag, \
+            f'min_mag should be smaller than max_mag, ' \
+            f'got min_mag={min_mag} and max_mag={max_mag}'
+        assert isinstance(reversal_prob, float), \
+            f'reversal_prob should be type float, got {type(reversal_prob)}.'
+        assert 0 <= reversal_prob <= 1.0, \
+            f'The reversal probability of the transformation magnitude ' \
+            f'should be in range [0,1], got {reversal_prob}.'
+        if isinstance(img_border_value, (float, int)):
+            img_border_value = tuple([float(img_border_value)] * 3)
+        elif isinstance(img_border_value, tuple):
+            assert len(img_border_value) == 3, \
+                f'img_border_value as tuple must have 3 elements, ' \
+                f'got {len(img_border_value)}.'
+            img_border_value = tuple([float(val) for val in img_border_value])
+        else:
+            raise ValueError(
+                'img_border_value must be float or tuple with 3 elements.')
+        assert np.all([0 <= val <= 255 for val in img_border_value]), 'all ' \
+            'elements of img_border_value should be between range [0,255], ' \
+            f'got {img_border_value}.'
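# Minimal sketch of how the random magnitude is drawn in _get_mag below,
# assuming the level_to_mag helper imported from .augment_wrappers at the top
# of this file (level=None falls back to a random level in [0, _MAX_LEVEL]).
import numpy as np
from mmdet.datasets.transforms.augment_wrappers import level_to_mag

min_mag, max_mag, reversal_prob = 0.0, 30.0, 0.5
mag = level_to_mag(None, min_mag, max_mag)
mag = -mag if np.random.rand() > reversal_prob else mag
print(mag)  # magnitude in [-30, 30]; the sign is flipped half the time here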
+ self.prob = prob + self.level = level + self.min_mag = min_mag + self.max_mag = max_mag + self.reversal_prob = reversal_prob + self.img_border_value = img_border_value + self.mask_border_value = mask_border_value + self.seg_ignore_label = seg_ignore_label + self.interpolation = interpolation + + def _transform_img(self, results: dict, mag: float) -> None: + """Transform the image.""" + pass + + def _transform_masks(self, results: dict, mag: float) -> None: + """Transform the masks.""" + pass + + def _transform_seg(self, results: dict, mag: float) -> None: + """Transform the segmentation map.""" + pass + + def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray: + """Get the homography matrix for the geometric transformation.""" + return np.eye(3, dtype=np.float32) + + def _transform_bboxes(self, results: dict, mag: float) -> None: + """Transform the bboxes.""" + results['gt_bboxes'].project_(self.homography_matrix) + results['gt_bboxes'].clip_(results['img_shape']) + + def _record_homography_matrix(self, results: dict) -> None: + """Record the homography matrix for the geometric transformation.""" + if results.get('homography_matrix', None) is None: + results['homography_matrix'] = self.homography_matrix + else: + results['homography_matrix'] = self.homography_matrix @ results[ + 'homography_matrix'] + + @cache_randomness + def _random_disable(self): + """Randomly disable the transform.""" + return np.random.rand() > self.prob + + @cache_randomness + def _get_mag(self): + """Get the magnitude of the transform.""" + mag = level_to_mag(self.level, self.min_mag, self.max_mag) + return -mag if np.random.rand() > self.reversal_prob else mag + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function for images, bounding boxes, masks and semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Transformed results. + """ + + if self._random_disable(): + return results + mag = self._get_mag() + self.homography_matrix = self._get_homography_matrix(results, mag) + self._record_homography_matrix(results) + self._transform_img(results, mag) + if results.get('gt_bboxes', None) is not None: + self._transform_bboxes(results, mag) + if results.get('gt_masks', None) is not None: + self._transform_masks(results, mag) + if results.get('gt_seg_map', None) is not None: + self._transform_seg(results, mag) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'level={self.level}, ' + repr_str += f'min_mag={self.min_mag}, ' + repr_str += f'max_mag={self.max_mag}, ' + repr_str += f'reversal_prob={self.reversal_prob}, ' + repr_str += f'img_border_value={self.img_border_value}, ' + repr_str += f'mask_border_value={self.mask_border_value}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label}, ' + repr_str += f'interpolation={self.interpolation})' + return repr_str + + +@TRANSFORMS.register_module() +class ShearX(GeomTransform): + """Shear the images, bboxes, masks and segmentation map horizontally. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for performing Shear and should be in + range [0, 1]. Defaults to 1.0. 
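# Numeric sketch of the homography accumulation in _record_homography_matrix
# above: the matrix of the newest transform is left-multiplied onto whatever
# is already stored, so the stored matrix maps original -> current coordinates.
import numpy as np

shear = np.array([[1, 0.2, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
translate = np.array([[1, 0, 10], [0, 1, 0], [0, 0, 1]], dtype=np.float32)

results = {'homography_matrix': shear}                 # first transform recorded
results['homography_matrix'] = translate @ results['homography_matrix']

corner = np.array([5.0, 7.0, 1.0], dtype=np.float32)   # homogeneous (x, y, 1)
print(results['homography_matrix'] @ corner)           # [16.4  7.   1. ]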
+ level (int, optional): The level should be in range [0, _MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum angle for the horizontal shear. + Defaults to 0.0. + max_mag (float): The maximum angle for the horizontal shear. + Defaults to 30.0. + reversal_prob (float): The probability that reverses the horizontal + shear magnitude. Should be in range [0,1]. Defaults to 0.5. + img_border_value (int | float | tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 30.0, + reversal_prob: float = 0.5, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + interpolation: str = 'bilinear') -> None: + assert 0. <= min_mag <= 90., \ + f'min_mag angle for ShearX should be ' \ + f'in range [0, 90], got {min_mag}.' + assert 0. <= max_mag <= 90., \ + f'max_mag angle for ShearX should be ' \ + f'in range [0, 90], got {max_mag}.' + super().__init__( + prob=prob, + level=level, + min_mag=min_mag, + max_mag=max_mag, + reversal_prob=reversal_prob, + img_border_value=img_border_value, + mask_border_value=mask_border_value, + seg_ignore_label=seg_ignore_label, + interpolation=interpolation) + + @cache_randomness + def _get_mag(self): + """Get the magnitude of the transform.""" + mag = level_to_mag(self.level, self.min_mag, self.max_mag) + mag = np.tan(mag * np.pi / 180) + return -mag if np.random.rand() > self.reversal_prob else mag + + def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray: + """Get the homography matrix for ShearX.""" + return np.array([[1, mag, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32) + + def _transform_img(self, results: dict, mag: float) -> None: + """Shear the image horizontally.""" + results['img'] = mmcv.imshear( + results['img'], + mag, + direction='horizontal', + border_value=self.img_border_value, + interpolation=self.interpolation) + + def _transform_masks(self, results: dict, mag: float) -> None: + """Shear the masks horizontally.""" + results['gt_masks'] = results['gt_masks'].shear( + results['img_shape'], + mag, + direction='horizontal', + border_value=self.mask_border_value, + interpolation=self.interpolation) + + def _transform_seg(self, results: dict, mag: float) -> None: + """Shear the segmentation map horizontally.""" + results['gt_seg_map'] = mmcv.imshear( + results['gt_seg_map'], + mag, + direction='horizontal', + border_value=self.seg_ignore_label, + interpolation='nearest') + + +@TRANSFORMS.register_module() +class ShearY(GeomTransform): + """Shear the images, bboxes, masks and segmentation map vertically. 
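# ShearX/ShearY turn the sampled angle (degrees) into a shear factor with
# tan(), as in _get_mag above; a quick numeric check of that mapping.
import numpy as np

for angle in (0.0, 15.0, 30.0):
    print(angle, np.tan(angle * np.pi / 180))
# 0.0 -> 0.0, 15.0 -> ~0.27, 30.0 -> ~0.58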
+ + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for performing ShearY and should be in + range [0, 1]. Defaults to 1.0. + level (int, optional): The level should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum angle for the vertical shear. + Defaults to 0.0. + max_mag (float): The maximum angle for the vertical shear. + Defaults to 30.0. + reversal_prob (float): The probability that reverses the vertical + shear magnitude. Should be in range [0,1]. Defaults to 0.5. + img_border_value (int | float | tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 30., + reversal_prob: float = 0.5, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + interpolation: str = 'bilinear') -> None: + assert 0. <= min_mag <= 90., \ + f'min_mag angle for ShearY should be ' \ + f'in range [0, 90], got {min_mag}.' + assert 0. <= max_mag <= 90., \ + f'max_mag angle for ShearY should be ' \ + f'in range [0, 90], got {max_mag}.' 
+ super().__init__( + prob=prob, + level=level, + min_mag=min_mag, + max_mag=max_mag, + reversal_prob=reversal_prob, + img_border_value=img_border_value, + mask_border_value=mask_border_value, + seg_ignore_label=seg_ignore_label, + interpolation=interpolation) + + @cache_randomness + def _get_mag(self): + """Get the magnitude of the transform.""" + mag = level_to_mag(self.level, self.min_mag, self.max_mag) + mag = np.tan(mag * np.pi / 180) + return -mag if np.random.rand() > self.reversal_prob else mag + + def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray: + """Get the homography matrix for ShearY.""" + return np.array([[1, 0, 0], [mag, 1, 0], [0, 0, 1]], dtype=np.float32) + + def _transform_img(self, results: dict, mag: float) -> None: + """Shear the image vertically.""" + results['img'] = mmcv.imshear( + results['img'], + mag, + direction='vertical', + border_value=self.img_border_value, + interpolation=self.interpolation) + + def _transform_masks(self, results: dict, mag: float) -> None: + """Shear the masks vertically.""" + results['gt_masks'] = results['gt_masks'].shear( + results['img_shape'], + mag, + direction='vertical', + border_value=self.mask_border_value, + interpolation=self.interpolation) + + def _transform_seg(self, results: dict, mag: float) -> None: + """Shear the segmentation map vertically.""" + results['gt_seg_map'] = mmcv.imshear( + results['gt_seg_map'], + mag, + direction='vertical', + border_value=self.seg_ignore_label, + interpolation='nearest') + + +@TRANSFORMS.register_module() +class Rotate(GeomTransform): + """Rotate the images, bboxes, masks and segmentation map. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for perform transformation and + should be in range 0 to 1. Defaults to 1.0. + level (int, optional): The level should be in range [0, _MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The maximum angle for rotation. + Defaults to 0.0. + max_mag (float): The maximum angle for rotation. + Defaults to 30.0. + reversal_prob (float): The probability that reverses the rotation + magnitude. Should be in range [0,1]. Defaults to 0.5. + img_border_value (int | float | tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 30.0, + reversal_prob: float = 0.5, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + interpolation: str = 'bilinear') -> None: + assert 0. 
<= min_mag <= 180., \ + f'min_mag for Rotate should be in range [0,180], got {min_mag}.' + assert 0. <= max_mag <= 180., \ + f'max_mag for Rotate should be in range [0,180], got {max_mag}.' + super().__init__( + prob=prob, + level=level, + min_mag=min_mag, + max_mag=max_mag, + reversal_prob=reversal_prob, + img_border_value=img_border_value, + mask_border_value=mask_border_value, + seg_ignore_label=seg_ignore_label, + interpolation=interpolation) + + def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray: + """Get the homography matrix for Rotate.""" + img_shape = results['img_shape'] + center = ((img_shape[1] - 1) * 0.5, (img_shape[0] - 1) * 0.5) + cv2_rotation_matrix = cv2.getRotationMatrix2D(center, -mag, 1.0) + return np.concatenate( + [cv2_rotation_matrix, + np.array([0, 0, 1]).reshape((1, 3))]).astype(np.float32) + + def _transform_img(self, results: dict, mag: float) -> None: + """Rotate the image.""" + results['img'] = mmcv.imrotate( + results['img'], + mag, + border_value=self.img_border_value, + interpolation=self.interpolation) + + def _transform_masks(self, results: dict, mag: float) -> None: + """Rotate the masks.""" + results['gt_masks'] = results['gt_masks'].rotate( + results['img_shape'], + mag, + border_value=self.mask_border_value, + interpolation=self.interpolation) + + def _transform_seg(self, results: dict, mag: float) -> None: + """Rotate the segmentation map.""" + results['gt_seg_map'] = mmcv.imrotate( + results['gt_seg_map'], + mag, + border_value=self.seg_ignore_label, + interpolation='nearest') + + +@TRANSFORMS.register_module() +class TranslateX(GeomTransform): + """Translate the images, bboxes, masks and segmentation map horizontally. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for perform transformation and + should be in range 0 to 1. Defaults to 1.0. + level (int, optional): The level should be in range [0, _MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum pixel's offset ratio for horizontal + translation. Defaults to 0.0. + max_mag (float): The maximum pixel's offset ratio for horizontal + translation. Defaults to 0.1. + reversal_prob (float): The probability that reverses the horizontal + translation magnitude. Should be in range [0,1]. Defaults to 0.5. + img_border_value (int | float | tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. 
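# Sketch of the 3x3 homography built in Rotate._get_homography_matrix above,
# assuming only cv2 and numpy; (H, W) and the angle are made-up values.
import cv2
import numpy as np

img_shape = (480, 640)                                   # (H, W)
mag = 30.0                                               # degrees
center = ((img_shape[1] - 1) * 0.5, (img_shape[0] - 1) * 0.5)
rot2x3 = cv2.getRotationMatrix2D(center, -mag, 1.0)      # 2x3 affine matrix
homography = np.concatenate(
    [rot2x3, np.array([0, 0, 1]).reshape((1, 3))]).astype(np.float32)
print(homography.shape)                                  # (3, 3)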
+ """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 0.1, + reversal_prob: float = 0.5, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + interpolation: str = 'bilinear') -> None: + assert 0. <= min_mag <= 1., \ + f'min_mag ratio for TranslateX should be ' \ + f'in range [0, 1], got {min_mag}.' + assert 0. <= max_mag <= 1., \ + f'max_mag ratio for TranslateX should be ' \ + f'in range [0, 1], got {max_mag}.' + super().__init__( + prob=prob, + level=level, + min_mag=min_mag, + max_mag=max_mag, + reversal_prob=reversal_prob, + img_border_value=img_border_value, + mask_border_value=mask_border_value, + seg_ignore_label=seg_ignore_label, + interpolation=interpolation) + + def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray: + """Get the homography matrix for TranslateX.""" + mag = int(results['img_shape'][1] * mag) + return np.array([[1, 0, mag], [0, 1, 0], [0, 0, 1]], dtype=np.float32) + + def _transform_img(self, results: dict, mag: float) -> None: + """Translate the image horizontally.""" + mag = int(results['img_shape'][1] * mag) + results['img'] = mmcv.imtranslate( + results['img'], + mag, + direction='horizontal', + border_value=self.img_border_value, + interpolation=self.interpolation) + + def _transform_masks(self, results: dict, mag: float) -> None: + """Translate the masks horizontally.""" + mag = int(results['img_shape'][1] * mag) + results['gt_masks'] = results['gt_masks'].translate( + results['img_shape'], + mag, + direction='horizontal', + border_value=self.mask_border_value, + interpolation=self.interpolation) + + def _transform_seg(self, results: dict, mag: float) -> None: + """Translate the segmentation map horizontally.""" + mag = int(results['img_shape'][1] * mag) + results['gt_seg_map'] = mmcv.imtranslate( + results['gt_seg_map'], + mag, + direction='horizontal', + border_value=self.seg_ignore_label, + interpolation='nearest') + + +@TRANSFORMS.register_module() +class TranslateY(GeomTransform): + """Translate the images, bboxes, masks and segmentation map vertically. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for perform transformation and + should be in range 0 to 1. Defaults to 1.0. + level (int, optional): The level should be in range [0, _MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum pixel's offset ratio for vertical + translation. Defaults to 0.0. + max_mag (float): The maximum pixel's offset ratio for vertical + translation. Defaults to 0.1. + reversal_prob (float): The probability that reverses the vertical + translation magnitude. Should be in range [0,1]. Defaults to 0.5. + img_border_value (int | float | tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. 
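# TranslateX/TranslateY interpret `mag` as a ratio of the image width/height
# and convert it to an integer pixel offset before calling mmcv.imtranslate,
# as in the _transform_* methods above; toy numbers only.
img_shape = (480, 640)               # (H, W)
mag = 0.1
print(int(img_shape[1] * mag))       # 64 -> horizontal offset (TranslateX)
print(int(img_shape[0] * mag))       # 48 -> vertical offset (TranslateY)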
+ interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 0.1, + reversal_prob: float = 0.5, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + interpolation: str = 'bilinear') -> None: + assert 0. <= min_mag <= 1., \ + f'min_mag ratio for TranslateY should be ' \ + f'in range [0,1], got {min_mag}.' + assert 0. <= max_mag <= 1., \ + f'max_mag ratio for TranslateY should be ' \ + f'in range [0,1], got {max_mag}.' + super().__init__( + prob=prob, + level=level, + min_mag=min_mag, + max_mag=max_mag, + reversal_prob=reversal_prob, + img_border_value=img_border_value, + mask_border_value=mask_border_value, + seg_ignore_label=seg_ignore_label, + interpolation=interpolation) + + def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray: + """Get the homography matrix for TranslateY.""" + mag = int(results['img_shape'][0] * mag) + return np.array([[1, 0, 0], [0, 1, mag], [0, 0, 1]], dtype=np.float32) + + def _transform_img(self, results: dict, mag: float) -> None: + """Translate the image vertically.""" + mag = int(results['img_shape'][0] * mag) + results['img'] = mmcv.imtranslate( + results['img'], + mag, + direction='vertical', + border_value=self.img_border_value, + interpolation=self.interpolation) + + def _transform_masks(self, results: dict, mag: float) -> None: + """Translate masks vertically.""" + mag = int(results['img_shape'][0] * mag) + results['gt_masks'] = results['gt_masks'].translate( + results['img_shape'], + mag, + direction='vertical', + border_value=self.mask_border_value, + interpolation=self.interpolation) + + def _transform_seg(self, results: dict, mag: float) -> None: + """Translate segmentation map vertically.""" + mag = int(results['img_shape'][0] * mag) + results['gt_seg_map'] = mmcv.imtranslate( + results['gt_seg_map'], + mag, + direction='vertical', + border_value=self.seg_ignore_label, + interpolation='nearest') diff --git a/mmdetection/mmdet/datasets/transforms/instaboost.py b/mmdetection/mmdet/datasets/transforms/instaboost.py new file mode 100644 index 0000000..30dc160 --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/instaboost.py @@ -0,0 +1,150 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import numpy as np +from mmcv.transforms import BaseTransform + +from mmdet.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class InstaBoost(BaseTransform): + r"""Data augmentation method in `InstaBoost: Boosting Instance + Segmentation Via Probability Map Guided Copy-Pasting + `_. + + Refer to https://github.com/GothicAi/Instaboost for implementation details. + + + Required Keys: + + - img (np.uint8) + - instances + + Modified Keys: + + - img (np.uint8) + - instances + + Args: + action_candidate (tuple): Action candidates. "normal", "horizontal", \ + "vertical", "skip" are supported. Defaults to ('normal', \ + 'horizontal', 'skip'). + action_prob (tuple): Corresponding action probabilities. Should be \ + the same length as action_candidate. Defaults to (1, 0, 0). + scale (tuple): (min scale, max scale). Defaults to (0.8, 1.2). + dx (int): The maximum x-axis shift will be (instance width) / dx. + Defaults to 15. 
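# Hypothetical pipeline snippet showing where InstaBoost would sit, following
# the dict(type=...) config convention used by mmdetection; the neighbouring
# transforms and their arguments here are illustrative assumptions, not part
# of this patch.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='InstaBoost', aug_ratio=0.5),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PackDetInputs'),
]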
+ dy (int): The maximum y-axis shift will be (instance height) / dy. + Defaults to 15. + theta (tuple): (min rotation degree, max rotation degree). \ + Defaults to (-1, 1). + color_prob (float): Probability of images for color augmentation. + Defaults to 0.5. + hflag (bool): Whether to use heatmap guided. Defaults to False. + aug_ratio (float): Probability of applying this transformation. \ + Defaults to 0.5. + """ + + def __init__(self, + action_candidate: tuple = ('normal', 'horizontal', 'skip'), + action_prob: tuple = (1, 0, 0), + scale: tuple = (0.8, 1.2), + dx: int = 15, + dy: int = 15, + theta: tuple = (-1, 1), + color_prob: float = 0.5, + hflag: bool = False, + aug_ratio: float = 0.5) -> None: + + import matplotlib + import matplotlib.pyplot as plt + default_backend = plt.get_backend() + + try: + import instaboostfast as instaboost + except ImportError: + raise ImportError( + 'Please run "pip install instaboostfast" ' + 'to install instaboostfast first for instaboost augmentation.') + + # instaboost will modify the default backend + # and cause visualization to fail. + matplotlib.use(default_backend) + + self.cfg = instaboost.InstaBoostConfig(action_candidate, action_prob, + scale, dx, dy, theta, + color_prob, hflag) + self.aug_ratio = aug_ratio + + def _load_anns(self, results: dict) -> Tuple[list, list]: + """Convert raw anns to instaboost expected input format.""" + anns = [] + ignore_anns = [] + for instance in results['instances']: + label = instance['bbox_label'] + bbox = instance['bbox'] + mask = instance['mask'] + x1, y1, x2, y2 = bbox + # assert (x2 - x1) >= 1 and (y2 - y1) >= 1 + bbox = [x1, y1, x2 - x1, y2 - y1] + + if instance['ignore_flag'] == 0: + anns.append({ + 'category_id': label, + 'segmentation': mask, + 'bbox': bbox + }) + else: + # Ignore instances without data augmentation + ignore_anns.append(instance) + return anns, ignore_anns + + def _parse_anns(self, results: dict, anns: list, ignore_anns: list, + img: np.ndarray) -> dict: + """Restore the result of instaboost processing to the original anns + format.""" + instances = [] + for ann in anns: + x1, y1, w, h = ann['bbox'] + # TODO: more essential bug need to be fixed in instaboost + if w <= 0 or h <= 0: + continue + bbox = [x1, y1, x1 + w, y1 + h] + instances.append( + dict( + bbox=bbox, + bbox_label=ann['category_id'], + mask=ann['segmentation'], + ignore_flag=0)) + + instances.extend(ignore_anns) + results['img'] = img + results['instances'] = instances + return results + + def transform(self, results) -> dict: + """The transform function.""" + img = results['img'] + ori_type = img.dtype + if 'instances' not in results or len(results['instances']) == 0: + return results + + anns, ignore_anns = self._load_anns(results) + if np.random.choice([0, 1], p=[1 - self.aug_ratio, self.aug_ratio]): + try: + import instaboostfast as instaboost + except ImportError: + raise ImportError('Please run "pip install instaboostfast" ' + 'to install instaboostfast first.') + anns, img = instaboost.get_new_data( + anns, img.astype(np.uint8), self.cfg, background=None) + + results = self._parse_anns(results, anns, ignore_anns, + img.astype(ori_type)) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(aug_ratio={self.aug_ratio})' + return repr_str diff --git a/mmdetection/mmdet/datasets/transforms/loading.py b/mmdetection/mmdet/datasets/transforms/loading.py new file mode 100644 index 0000000..722d4b0 --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/loading.py @@ -0,0 
+1,1074 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import mmcv +import numpy as np +import pycocotools.mask as maskUtils +import torch +from mmcv.transforms import BaseTransform +from mmcv.transforms import LoadAnnotations as MMCV_LoadAnnotations +from mmcv.transforms import LoadImageFromFile +from mmengine.fileio import get +from mmengine.structures import BaseDataElement + +from mmdet.registry import TRANSFORMS +from mmdet.structures.bbox import get_box_type +from mmdet.structures.bbox.box_type import autocast_box_type +from mmdet.structures.mask import BitmapMasks, PolygonMasks + + +@TRANSFORMS.register_module() +class LoadImageFromNDArray(LoadImageFromFile): + """Load an image from ``results['img']``. + + Similar with :obj:`LoadImageFromFile`, but the image has been loaded as + :obj:`np.ndarray` in ``results['img']``. Can be used when loading image + from webcam. + + Required Keys: + + - img + + Modified Keys: + + - img + - img_path + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + """ + + def transform(self, results: dict) -> dict: + """Transform function to add image meta information. + + Args: + results (dict): Result dict with Webcam read image in + ``results['img']``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + img = results['img'] + if self.to_float32: + img = img.astype(np.float32) + + results['img_path'] = None + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + +@TRANSFORMS.register_module() +class LoadMultiChannelImageFromFiles(BaseTransform): + """Load multi-channel images from a list of separate channel files. + + Required Keys: + + - img_path + + Modified Keys: + + - img + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:``mmcv.imfrombytes``. + Defaults to 'unchanged'. + imdecode_backend (str): The image decoding backend type. The backend + argument for :func:``mmcv.imfrombytes``. + See :func:``mmcv.imfrombytes`` for details. + Defaults to 'cv2'. + file_client_args (dict): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet >= 3.0.0rc7. Defaults to None. + """ + + def __init__( + self, + to_float32: bool = False, + color_type: str = 'unchanged', + imdecode_backend: str = 'cv2', + file_client_args: dict = None, + backend_args: dict = None, + ) -> None: + self.to_float32 = to_float32 + self.color_type = color_type + self.imdecode_backend = imdecode_backend + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + + def transform(self, results: dict) -> dict: + """Transform functions to load multiple images and get images meta + information. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded images and meta information. 
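# Minimal usage sketch for LoadImageFromNDArray above, assuming it is
# importable from mmdet.datasets.transforms; the zero array stands in for a
# webcam frame.
import numpy as np
from mmdet.datasets.transforms import LoadImageFromNDArray

frame = np.zeros((240, 320, 3), dtype=np.uint8)
results = LoadImageFromNDArray()(dict(img=frame))
print(results['img_shape'], results['ori_shape'])   # (240, 320) (240, 320)
print(results['img_path'])                           # None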
+ """ + + assert isinstance(results['img_path'], list) + img = [] + for name in results['img_path']: + img_bytes = get(name, backend_args=self.backend_args) + img.append( + mmcv.imfrombytes( + img_bytes, + flag=self.color_type, + backend=self.imdecode_backend)) + img = np.stack(img, axis=-1) + if self.to_float32: + img = img.astype(np.float32) + + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f"imdecode_backend='{self.imdecode_backend}', " + f'backend_args={self.backend_args})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadAnnotations(MMCV_LoadAnnotations): + """Load and process the ``instances`` and ``seg_map`` annotation provided + by dataset. + + The annotation format is as the following: + + .. code-block:: python + + { + 'instances': + [ + { + # List of 4 numbers representing the bounding box of the + # instance, in (x1, y1, x2, y2) order. + 'bbox': [x1, y1, x2, y2], + + # Label of image classification. + 'bbox_label': 1, + + # Used in instance/panoptic segmentation. The segmentation mask + # of the instance or the information of segments. + # 1. If list[list[float]], it represents a list of polygons, + # one for each connected component of the object. Each + # list[float] is one simple polygon in the format of + # [x1, y1, ..., xn, yn] (n >= 3). The Xs and Ys are absolute + # coordinates in unit of pixels. + # 2. If dict, it represents the per-pixel segmentation mask in + # COCO's compressed RLE format. The dict should have keys + # “size” and “counts”. Can be loaded by pycocotools + 'mask': list[list[float]] or dict, + + } + ] + # Filename of semantic or panoptic segmentation ground truth file. + 'seg_map_path': 'a/b/c' + } + + After this module, the annotation has been changed to the format below: + + .. code-block:: python + + { + # In (x1, y1, x2, y2) order, float type. N is the number of bboxes + # in an image + 'gt_bboxes': BaseBoxes(N, 4) + # In int type. + 'gt_bboxes_labels': np.ndarray(N, ) + # In built-in class + 'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W) + # In uint8 type. + 'gt_seg_map': np.ndarray (H, W) + # in (x, y, v) order, float type. + } + + Required Keys: + + - height + - width + - instances + + - bbox (optional) + - bbox_label + - mask (optional) + - ignore_flag + + - seg_map_path (optional) + + Added Keys: + + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (bool) + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Defaults to True. + with_label (bool): Whether to parse and load the label annotation. + Defaults to True. + with_mask (bool): Whether to parse and load the mask annotation. + Default: False. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Defaults to False. + poly2mask (bool): Whether to convert mask to bitmap. Default: True. + box_type (str): The box type used to wrap the bboxes. If ``box_type`` + is None, gt_bboxes will keep being np.ndarray. Defaults to 'hbox'. + reduce_zero_label (bool): Whether reduce all label value + by 1. Usually used for datasets where 0 is background label. + Defaults to False. + ignore_index (int): The label index to be ignored. + Valid only if reduce_zero_label is true. Defaults is 255. 
+ imdecode_backend (str): The image decoding backend type. The backend + argument for :func:``mmcv.imfrombytes``. + See :fun:``mmcv.imfrombytes`` for details. + Defaults to 'cv2'. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + """ + + def __init__( + self, + with_mask: bool = False, + poly2mask: bool = True, + box_type: str = 'hbox', + # use for semseg + reduce_zero_label: bool = False, + ignore_index: int = 255, + **kwargs) -> None: + super(LoadAnnotations, self).__init__(**kwargs) + self.with_mask = with_mask + self.poly2mask = poly2mask + self.box_type = box_type + self.reduce_zero_label = reduce_zero_label + self.ignore_index = ignore_index + + def _load_bboxes(self, results: dict) -> None: + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + Returns: + dict: The dict contains loaded bounding box annotations. + """ + gt_bboxes = [] + gt_ignore_flags = [] + for instance in results.get('instances', []): + gt_bboxes.append(instance['bbox']) + gt_ignore_flags.append(instance['ignore_flag']) + if self.box_type is None: + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape((-1, 4)) + else: + _, box_type_cls = get_box_type(self.box_type) + results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + def _load_labels(self, results: dict) -> None: + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded label annotations. + """ + gt_bboxes_labels = [] + for instance in results.get('instances', []): + gt_bboxes_labels.append(instance['bbox_label']) + # TODO: Inconsistent with mmcv, consider how to deal with it later. + results['gt_bboxes_labels'] = np.array( + gt_bboxes_labels, dtype=np.int64) + + def _poly2mask(self, mask_ann: Union[list, dict], img_h: int, + img_w: int) -> np.ndarray: + """Private function to convert masks represented with polygon to + bitmaps. + + Args: + mask_ann (list | dict): Polygon mask annotation input. + img_h (int): The height of output mask. + img_w (int): The width of output mask. + + Returns: + np.ndarray: The decode bitmap mask of shape (img_h, img_w). + """ + + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def _process_masks(self, results: dict) -> list: + """Process gt_masks and filter invalid polygons. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + list: Processed gt_masks. + """ + gt_masks = [] + gt_ignore_flags = [] + for instance in results.get('instances', []): + gt_mask = instance['mask'] + # If the annotation of segmentation mask is invalid, + # ignore the whole instance. 
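# Sketch of the polygon -> RLE -> bitmap path taken by _poly2mask above,
# assuming pycocotools is installed; the square polygon is a toy example.
import pycocotools.mask as maskUtils

img_h, img_w = 20, 20
polygons = [[2.0, 2.0, 10.0, 2.0, 10.0, 10.0, 2.0, 10.0]]  # one simple polygon
rles = maskUtils.frPyObjects(polygons, img_h, img_w)
rle = maskUtils.merge(rles)
mask = maskUtils.decode(rle)                                # (20, 20) uint8
print(mask.shape, int(mask.sum()))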
+ if isinstance(gt_mask, list): + gt_mask = [ + np.array(polygon) for polygon in gt_mask + if len(polygon) % 2 == 0 and len(polygon) >= 6 + ] + if len(gt_mask) == 0: + # ignore this instance and set gt_mask to a fake mask + instance['ignore_flag'] = 1 + gt_mask = [np.zeros(6)] + elif not self.poly2mask: + # `PolygonMasks` requires a ploygon of format List[np.array], + # other formats are invalid. + instance['ignore_flag'] = 1 + gt_mask = [np.zeros(6)] + elif isinstance(gt_mask, dict) and \ + not (gt_mask.get('counts') is not None and + gt_mask.get('size') is not None and + isinstance(gt_mask['counts'], (list, str))): + # if gt_mask is a dict, it should include `counts` and `size`, + # so that `BitmapMasks` can uncompressed RLE + instance['ignore_flag'] = 1 + gt_mask = [np.zeros(6)] + gt_masks.append(gt_mask) + # re-process gt_ignore_flags + gt_ignore_flags.append(instance['ignore_flag']) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + return gt_masks + + def _load_masks(self, results: dict) -> None: + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + """ + h, w = results['ori_shape'] + gt_masks = self._process_masks(results) + if self.poly2mask: + gt_masks = BitmapMasks( + [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) + else: + # fake polygon masks will be ignored in `PackDetInputs` + gt_masks = PolygonMasks([mask for mask in gt_masks], h, w) + results['gt_masks'] = gt_masks + + def _load_seg_map(self, results: dict) -> None: + """Private function to load semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. + """ + if results.get('seg_map_path', None) is None: + return + + img_bytes = get( + results['seg_map_path'], backend_args=self.backend_args) + gt_semantic_seg = mmcv.imfrombytes( + img_bytes, flag='unchanged', + backend=self.imdecode_backend).squeeze() + + if self.reduce_zero_label: + # avoid using underflow conversion + gt_semantic_seg[gt_semantic_seg == 0] = self.ignore_index + gt_semantic_seg = gt_semantic_seg - 1 + gt_semantic_seg[gt_semantic_seg == self.ignore_index - + 1] = self.ignore_index + + # modify if custom classes + if results.get('label_map', None) is not None: + # Add deep copy to solve bug of repeatedly + # replace `gt_semantic_seg`, which is reported in + # https://github.com/open-mmlab/mmsegmentation/pull/1445/ + gt_semantic_seg_copy = gt_semantic_seg.copy() + for old_id, new_id in results['label_map'].items(): + gt_semantic_seg[gt_semantic_seg_copy == old_id] = new_id + results['gt_seg_map'] = gt_semantic_seg + results['ignore_index'] = self.ignore_index + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label and + semantic segmentation. 
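# Numeric check of the reduce_zero_label remapping in _load_seg_map above:
# label 0 (background) becomes ignore_index and every other label shifts down
# by one; toy segmentation map.
import numpy as np

ignore_index = 255
seg = np.array([[0, 1, 2], [3, 0, 255]], dtype=np.uint8)
seg[seg == 0] = ignore_index
seg = seg - 1
seg[seg == ignore_index - 1] = ignore_index
print(seg)
# [[255   0   1]
#  [  2 255 255]]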
+ """ + + if self.with_bbox: + self._load_bboxes(results) + if self.with_label: + self._load_labels(results) + if self.with_mask: + self._load_masks(results) + if self.with_seg: + self._load_seg_map(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'backend_args={self.backend_args})' + return repr_str + + +@TRANSFORMS.register_module() +class LoadPanopticAnnotations(LoadAnnotations): + """Load multiple types of panoptic annotations. + + The annotation format is as the following: + + .. code-block:: python + + { + 'instances': + [ + { + # List of 4 numbers representing the bounding box of the + # instance, in (x1, y1, x2, y2) order. + 'bbox': [x1, y1, x2, y2], + + # Label of image classification. + 'bbox_label': 1, + }, + ... + ] + 'segments_info': + [ + { + # id = cls_id + instance_id * INSTANCE_OFFSET + 'id': int, + + # Contiguous category id defined in dataset. + 'category': int + + # Thing flag. + 'is_thing': bool + }, + ... + ] + + # Filename of semantic or panoptic segmentation ground truth file. + 'seg_map_path': 'a/b/c' + } + + After this module, the annotation has been changed to the format below: + + .. code-block:: python + + { + # In (x1, y1, x2, y2) order, float type. N is the number of bboxes + # in an image + 'gt_bboxes': BaseBoxes(N, 4) + # In int type. + 'gt_bboxes_labels': np.ndarray(N, ) + # In built-in class + 'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W) + # In uint8 type. + 'gt_seg_map': np.ndarray (H, W) + # in (x, y, v) order, float type. + } + + Required Keys: + + - height + - width + - instances + - bbox + - bbox_label + - ignore_flag + - segments_info + - id + - category + - is_thing + - seg_map_path + + Added Keys: + + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (bool) + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Defaults to True. + with_label (bool): Whether to parse and load the label annotation. + Defaults to True. + with_mask (bool): Whether to parse and load the mask annotation. + Defaults to True. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Defaults to False. + box_type (str): The box mode used to wrap the bboxes. + imdecode_backend (str): The image decoding backend type. The backend + argument for :func:``mmcv.imfrombytes``. + See :fun:``mmcv.imfrombytes`` for details. + Defaults to 'cv2'. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet >= 3.0.0rc7. Defaults to None. 
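+
+    Examples:
+        An illustrative pipeline entry (this transform additionally requires
+        ``panopticapi`` and a panoptic ``seg_map_path`` provided by the
+        dataset):
+
+        .. code-block:: python
+
+            train_pipeline = [
+                dict(type='LoadImageFromFile'),
+                dict(
+                    type='LoadPanopticAnnotations',
+                    with_bbox=True,
+                    with_mask=True,
+                    with_seg=True),
+            ]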
+ """ + + def __init__(self, + with_bbox: bool = True, + with_label: bool = True, + with_mask: bool = True, + with_seg: bool = True, + box_type: str = 'hbox', + imdecode_backend: str = 'cv2', + backend_args: dict = None) -> None: + try: + from panopticapi import utils + except ImportError: + raise ImportError( + 'panopticapi is not installed, please install it by: ' + 'pip install git+https://github.com/cocodataset/' + 'panopticapi.git.') + self.rgb2id = utils.rgb2id + + super(LoadPanopticAnnotations, self).__init__( + with_bbox=with_bbox, + with_label=with_label, + with_mask=with_mask, + with_seg=with_seg, + with_keypoints=False, + box_type=box_type, + imdecode_backend=imdecode_backend, + backend_args=backend_args) + + def _load_masks_and_semantic_segs(self, results: dict) -> None: + """Private function to load mask and semantic segmentation annotations. + + In gt_semantic_seg, the foreground label is from ``0`` to + ``num_things - 1``, the background label is from ``num_things`` to + ``num_things + num_stuff - 1``, 255 means the ignored label (``VOID``). + + Args: + results (dict): Result dict from :obj:``mmdet.CustomDataset``. + """ + # seg_map_path is None, when inference on the dataset without gts. + if results.get('seg_map_path', None) is None: + return + + img_bytes = get( + results['seg_map_path'], backend_args=self.backend_args) + pan_png = mmcv.imfrombytes( + img_bytes, flag='color', channel_order='rgb').squeeze() + pan_png = self.rgb2id(pan_png) + + gt_masks = [] + gt_seg = np.zeros_like(pan_png) + 255 # 255 as ignore + + for segment_info in results['segments_info']: + mask = (pan_png == segment_info['id']) + gt_seg = np.where(mask, segment_info['category'], gt_seg) + + # The legal thing masks + if segment_info.get('is_thing'): + gt_masks.append(mask.astype(np.uint8)) + + if self.with_mask: + h, w = results['ori_shape'] + gt_masks = BitmapMasks(gt_masks, h, w) + results['gt_masks'] = gt_masks + + if self.with_seg: + results['gt_seg_map'] = gt_seg + + def transform(self, results: dict) -> dict: + """Function to load multiple types panoptic annotations. + + Args: + results (dict): Result dict from :obj:``mmdet.CustomDataset``. + + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + self._load_bboxes(results) + if self.with_label: + self._load_labels(results) + if self.with_mask or self.with_seg: + # The tasks completed by '_load_masks' and '_load_semantic_segs' + # in LoadAnnotations are merged to one function. + self._load_masks_and_semantic_segs(results) + + return results + + +@TRANSFORMS.register_module() +class LoadProposals(BaseTransform): + """Load proposal pipeline. + + Required Keys: + + - proposals + + Modified Keys: + + - proposals + + Args: + num_max_proposals (int, optional): Maximum number of proposals to load. + If not specified, all proposals will be loaded. + """ + + def __init__(self, num_max_proposals: Optional[int] = None) -> None: + self.num_max_proposals = num_max_proposals + + def transform(self, results: dict) -> dict: + """Transform function to load proposals from file. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded proposal annotations. 
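+
+        Examples:
+            A minimal, illustrative call (the import path assumes the
+            upstream ``mmdet`` package layout; proposals are assumed to be
+            sorted by score when they were dumped):
+
+            .. code-block:: python
+
+                import numpy as np
+                from mmdet.datasets.transforms import LoadProposals
+
+                results = dict(
+                    proposals=dict(
+                        bboxes=np.array([[0., 0., 10., 10.],
+                                         [5., 5., 20., 20.]]),
+                        scores=np.array([0.9, 0.3])))
+                results = LoadProposals(num_max_proposals=1)(results)
+                # results['proposals'].shape == (1, 4)
+                # results['proposals_scores'].shape == (1,)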
+ """ + + proposals = results['proposals'] + # the type of proposals should be `dict` or `InstanceData` + assert isinstance(proposals, dict) \ + or isinstance(proposals, BaseDataElement) + bboxes = proposals['bboxes'].astype(np.float32) + assert bboxes.shape[1] == 4, \ + f'Proposals should have shapes (n, 4), but found {bboxes.shape}' + + if 'scores' in proposals: + scores = proposals['scores'].astype(np.float32) + assert bboxes.shape[0] == scores.shape[0] + else: + scores = np.zeros(bboxes.shape[0], dtype=np.float32) + + if self.num_max_proposals is not None: + # proposals should sort by scores during dumping the proposals + bboxes = bboxes[:self.num_max_proposals] + scores = scores[:self.num_max_proposals] + + if len(bboxes) == 0: + bboxes = np.zeros((0, 4), dtype=np.float32) + scores = np.zeros(0, dtype=np.float32) + + results['proposals'] = bboxes + results['proposals_scores'] = scores + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(num_max_proposals={self.num_max_proposals})' + + +@TRANSFORMS.register_module() +class FilterAnnotations(BaseTransform): + """Filter invalid annotations. + + Required Keys: + + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_masks (optional) + - gt_ignore_flags (optional) + + Args: + min_gt_bbox_wh (tuple[float]): Minimum width and height of ground truth + boxes. Default: (1., 1.) + min_gt_mask_area (int): Minimum foreground area of ground truth masks. + Default: 1 + by_box (bool): Filter instances with bounding boxes not meeting the + min_gt_bbox_wh threshold. Default: True + by_mask (bool): Filter instances with masks not meeting + min_gt_mask_area threshold. Default: False + keep_empty (bool): Whether to return None when it + becomes an empty bbox after filtering. Defaults to True. + """ + + def __init__(self, + min_gt_bbox_wh: Tuple[int, int] = (1, 1), + min_gt_mask_area: int = 1, + by_box: bool = True, + by_mask: bool = False, + keep_empty: bool = True) -> None: + # TODO: add more filter options + assert by_box or by_mask + self.min_gt_bbox_wh = min_gt_bbox_wh + self.min_gt_mask_area = min_gt_mask_area + self.by_box = by_box + self.by_mask = by_mask + self.keep_empty = keep_empty + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to filter annotations. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. 
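+
+        Examples:
+            An illustrative call (the import paths assume the upstream
+            ``mmdet`` package layout):
+
+            .. code-block:: python
+
+                import numpy as np
+                from mmdet.datasets.transforms import FilterAnnotations
+                from mmdet.structures.bbox import HorizontalBoxes
+
+                results = dict(
+                    gt_bboxes=HorizontalBoxes(
+                        np.array([[0., 0., 10., 10.],
+                                  [0., 0., 0.5, 0.5]], dtype=np.float32)),
+                    gt_bboxes_labels=np.array([0, 1], dtype=np.int64),
+                    gt_ignore_flags=np.array([False, False]))
+                results = FilterAnnotations(min_gt_bbox_wh=(1, 1))(results)
+                # only the first box passes the size test:
+                # len(results['gt_bboxes']) == 1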
+ """ + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + if gt_bboxes.shape[0] == 0: + return results + + tests = [] + if self.by_box: + tests.append( + ((gt_bboxes.widths > self.min_gt_bbox_wh[0]) & + (gt_bboxes.heights > self.min_gt_bbox_wh[1])).numpy()) + if self.by_mask: + assert 'gt_masks' in results + gt_masks = results['gt_masks'] + tests.append(gt_masks.areas >= self.min_gt_mask_area) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + if not keep.any(): + if self.keep_empty: + return None + + keys = ('gt_bboxes', 'gt_bboxes_labels', 'gt_masks', 'gt_ignore_flags') + for key in keys: + if key in results: + results[key] = results[key][keep] + + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(min_gt_bbox_wh={self.min_gt_bbox_wh}, ' \ + f'keep_empty={self.keep_empty})' + + +@TRANSFORMS.register_module() +class LoadEmptyAnnotations(BaseTransform): + """Load Empty Annotations for unlabeled images. + + Added Keys: + - gt_bboxes (np.float32) + - gt_bboxes_labels (np.int64) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (bool) + + Args: + with_bbox (bool): Whether to load the pseudo bbox annotation. + Defaults to True. + with_label (bool): Whether to load the pseudo label annotation. + Defaults to True. + with_mask (bool): Whether to load the pseudo mask annotation. + Default: False. + with_seg (bool): Whether to load the pseudo semantic segmentation + annotation. Defaults to False. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + """ + + def __init__(self, + with_bbox: bool = True, + with_label: bool = True, + with_mask: bool = False, + with_seg: bool = False, + seg_ignore_label: int = 255) -> None: + self.with_bbox = with_bbox + self.with_label = with_label + self.with_mask = with_mask + self.with_seg = with_seg + self.seg_ignore_label = seg_ignore_label + + def transform(self, results: dict) -> dict: + """Transform function to load empty annotations. + + Args: + results (dict): Result dict. + Returns: + dict: Updated result dict. + """ + if self.with_bbox: + results['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32) + results['gt_ignore_flags'] = np.zeros((0, ), dtype=bool) + if self.with_label: + results['gt_bboxes_labels'] = np.zeros((0, ), dtype=np.int64) + if self.with_mask: + # TODO: support PolygonMasks + h, w = results['img_shape'] + gt_masks = np.zeros((0, h, w), dtype=np.uint8) + results['gt_masks'] = BitmapMasks(gt_masks, h, w) + if self.with_seg: + h, w = results['img_shape'] + results['gt_seg_map'] = self.seg_ignore_label * np.ones( + (h, w), dtype=np.uint8) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label})' + return repr_str + + +@TRANSFORMS.register_module() +class InferencerLoader(BaseTransform): + """Load an image from ``results['img']``. + + Similar with :obj:`LoadImageFromFile`, but the image has been loaded as + :obj:`np.ndarray` in ``results['img']``. Can be used when loading image + from webcam. 
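+    Depending on the input, ``transform`` dispatches either to
+    ``LoadImageFromFile`` (when only an image path is given) or to
+    ``LoadImageFromNDArray`` (when an in-memory array is given).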
+ + Required Keys: + + - img + + Modified Keys: + + - img + - img_path + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + """ + + def __init__(self, **kwargs) -> None: + super().__init__() + self.from_file = TRANSFORMS.build( + dict(type='LoadImageFromFile', **kwargs)) + self.from_ndarray = TRANSFORMS.build( + dict(type='mmdet.LoadImageFromNDArray', **kwargs)) + + def transform(self, results: Union[str, np.ndarray, dict]) -> dict: + """Transform function to add image meta information. + + Args: + results (str, np.ndarray or dict): The result. + + Returns: + dict: The dict contains loaded image and meta information. + """ + if isinstance(results, str): + inputs = dict(img_path=results) + elif isinstance(results, np.ndarray): + inputs = dict(img=results) + elif isinstance(results, dict): + inputs = results + else: + raise NotImplementedError + + if 'img' in inputs: + return self.from_ndarray(inputs) + return self.from_file(inputs) + + +@TRANSFORMS.register_module() +class LoadTrackAnnotations(LoadAnnotations): + """Load and process the ``instances`` and ``seg_map`` annotation provided + by dataset. It must load ``instances_ids`` which is only used in the + tracking tasks. The annotation format is as the following: + + .. code-block:: python + { + 'instances': + [ + { + # List of 4 numbers representing the bounding box of the + # instance, in (x1, y1, x2, y2) order. + 'bbox': [x1, y1, x2, y2], + # Label of image classification. + 'bbox_label': 1, + # Used in tracking. + # Id of instances. + 'instance_id': 100, + # Used in instance/panoptic segmentation. The segmentation mask + # of the instance or the information of segments. + # 1. If list[list[float]], it represents a list of polygons, + # one for each connected component of the object. Each + # list[float] is one simple polygon in the format of + # [x1, y1, ..., xn, yn] (n >= 3). The Xs and Ys are absolute + # coordinates in unit of pixels. + # 2. If dict, it represents the per-pixel segmentation mask in + # COCO's compressed RLE format. The dict should have keys + # “size” and “counts”. Can be loaded by pycocotools + 'mask': list[list[float]] or dict, + } + ] + # Filename of semantic or panoptic segmentation ground truth file. + 'seg_map_path': 'a/b/c' + } + + After this module, the annotation has been changed to the format below: + .. code-block:: python + { + # In (x1, y1, x2, y2) order, float type. N is the number of bboxes + # in an image + 'gt_bboxes': np.ndarray(N, 4) + # In int type. + 'gt_bboxes_labels': np.ndarray(N, ) + # In built-in class + 'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W) + # In uint8 type. + 'gt_seg_map': np.ndarray (H, W) + # in (x, y, v) order, float type. + } + + Required Keys: + + - height (optional) + - width (optional) + - instances + - bbox (optional) + - bbox_label + - instance_id (optional) + - mask (optional) + - ignore_flag (optional) + - seg_map_path (optional) + + Added Keys: + + - gt_bboxes (np.float32) + - gt_bboxes_labels (np.int32) + - gt_instances_ids (np.int32) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (np.bool) + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def _load_bboxes(self, results: dict) -> None: + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. 
+ + Returns: + dict: The dict contains loaded bounding box annotations. + """ + gt_bboxes = [] + gt_ignore_flags = [] + # TODO: use bbox_type + for instance in results['instances']: + # The datasets which are only format in evaluation don't have + # groundtruth boxes. + if 'bbox' in instance: + gt_bboxes.append(instance['bbox']) + if 'ignore_flag' in instance: + gt_ignore_flags.append(instance['ignore_flag']) + + # TODO: check this case + if len(gt_bboxes) != len(gt_ignore_flags): + # There may be no ``gt_ignore_flags`` in some cases, we treat them + # as all False in order to keep the length of ``gt_bboxes`` and + # ``gt_ignore_flags`` the same + gt_ignore_flags = [False] * len(gt_bboxes) + + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape(-1, 4) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + def _load_instances_ids(self, results: dict) -> None: + """Private function to load instances id annotations. + + Args: + results (dict): Result dict from :obj :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict containing instances id annotations. + """ + gt_instances_ids = [] + for instance in results['instances']: + gt_instances_ids.append(instance['instance_id']) + results['gt_instances_ids'] = np.array( + gt_instances_ids, dtype=np.int32) + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label, instances id + and semantic segmentation and keypoints annotations. + """ + results = super().transform(results) + self._load_instances_ids(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'file_client_args={self.file_client_args})' + return repr_str diff --git a/mmdetection/mmdet/datasets/transforms/transformers_glip.py b/mmdetection/mmdet/datasets/transforms/transformers_glip.py new file mode 100644 index 0000000..60c4f87 --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/transformers_glip.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform + +from mmdet.registry import TRANSFORMS +from mmdet.structures.bbox import HorizontalBoxes, autocast_box_type +from .transforms import RandomFlip + + +@TRANSFORMS.register_module() +class GTBoxSubOne_GLIP(BaseTransform): + """Subtract 1 from the x2 and y2 coordinates of the gt_bboxes.""" + + def transform(self, results: dict) -> dict: + if 'gt_bboxes' in results: + gt_bboxes = results['gt_bboxes'] + if isinstance(gt_bboxes, np.ndarray): + gt_bboxes[:, 2:] -= 1 + results['gt_bboxes'] = gt_bboxes + elif isinstance(gt_bboxes, HorizontalBoxes): + gt_bboxes = results['gt_bboxes'].tensor + gt_bboxes[:, 2:] -= 1 + results['gt_bboxes'] = HorizontalBoxes(gt_bboxes) + else: + raise NotImplementedError + return results + + +@TRANSFORMS.register_module() +class RandomFlip_GLIP(RandomFlip): + """Flip the image & bboxes & masks & segs horizontally or vertically. + + When using horizontal flipping, the corresponding bbox x-coordinate needs + to be additionally subtracted by one. 
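+
+    For an image of width ``w``, the base ``flip_`` call maps an
+    x-coordinate to ``w - x``; the extra ``translate_([-1, 0])`` applied
+    below turns this into ``(w - 1) - x``, which is consistent with the
+    boxes produced by ``GTBoxSubOne_GLIP``.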
+ """ + + @autocast_box_type() + def _flip(self, results: dict) -> None: + """Flip images, bounding boxes, and semantic segmentation map.""" + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + img_shape = results['img'].shape[:2] + + # flip bboxes + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'].flip_(img_shape, results['flip_direction']) + # Only change this line + if results['flip_direction'] == 'horizontal': + results['gt_bboxes'].translate_([-1, 0]) + + # TODO: check it + # flip masks + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'].flip( + results['flip_direction']) + + # flip segs + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = mmcv.imflip( + results['gt_seg_map'], direction=results['flip_direction']) + + # record homography matrix for flip + self._record_homography_matrix(results) diff --git a/mmdetection/mmdet/datasets/transforms/transforms.py b/mmdetection/mmdet/datasets/transforms/transforms.py new file mode 100644 index 0000000..4ac2bf7 --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/transforms.py @@ -0,0 +1,3854 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import inspect +import math +import warnings +from typing import List, Optional, Sequence, Tuple, Union + +import cv2 +import mmcv +import numpy as np +from mmcv.image import imresize +from mmcv.image.geometric import _scale_size +from mmcv.transforms import BaseTransform +from mmcv.transforms import Pad as MMCV_Pad +from mmcv.transforms import RandomFlip as MMCV_RandomFlip +from mmcv.transforms import Resize as MMCV_Resize +from mmcv.transforms.utils import avoid_cache_randomness, cache_randomness +from mmengine.dataset import BaseDataset +from mmengine.utils import is_str +from numpy import random + +from mmdet.registry import TRANSFORMS +from mmdet.structures.bbox import HorizontalBoxes, autocast_box_type +from mmdet.structures.mask import BitmapMasks, PolygonMasks +from mmdet.utils import log_img_scale + +try: + from imagecorruptions import corrupt +except ImportError: + corrupt = None + +try: + import albumentations + from albumentations import Compose +except ImportError: + albumentations = None + Compose = None + +Number = Union[int, float] + + +def _fixed_scale_size( + size: Tuple[int, int], + scale: Union[float, int, tuple], +) -> Tuple[int, int]: + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float | tuple(float)): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + if isinstance(scale, (float, int)): + scale = (scale, scale) + w, h = size + # don't need o.5 offset + return int(w * float(scale[0])), int(h * float(scale[1])) + + +def rescale_size(old_size: tuple, + scale: Union[float, int, tuple], + return_scale: bool = False) -> tuple: + """Calculate the new size to be rescaled to. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. 
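+
+    Examples:
+        Illustrative calls (behaviour of the implementation above):
+
+        .. code-block:: python
+
+            rescale_size((640, 480), 0.5)          # -> (320, 240)
+            rescale_size((640, 480), (1000, 600))  # -> (800, 600)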
+ """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + # only change this + new_size = _fixed_scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imrescale( + img: np.ndarray, + scale: Union[float, Tuple[int, int]], + return_scale: bool = False, + interpolation: str = 'bilinear', + backend: Optional[str] = None +) -> Union[np.ndarray, Tuple[np.ndarray, float]]: + """Resize image while keeping the aspect ratio. + + Args: + img (ndarray): The input image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + ndarray: The rescaled image. + """ + h, w = img.shape[:2] + new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) + rescaled_img = imresize( + img, new_size, interpolation=interpolation, backend=backend) + if return_scale: + return rescaled_img, scale_factor + else: + return rescaled_img + + +@TRANSFORMS.register_module() +class Resize(MMCV_Resize): + """Resize images & bbox & seg. + + This transform resizes the input image according to ``scale`` or + ``scale_factor``. Bboxes, masks, and seg map are then resized + with the same scale factor. + if ``scale`` and ``scale_factor`` are both set, it will use ``scale`` to + resize. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes + - gt_masks + - gt_seg_map + + + Added Keys: + + - scale + - scale_factor + - keep_ratio + - homography_matrix + + Args: + scale (int or tuple): Images scales for resizing. Defaults to None + scale_factor (float or tuple[float]): Scale factors for resizing. + Defaults to None. + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Defaults to False. + clip_object_border (bool): Whether to clip the objects + outside the border of the image. In some dataset like MOT17, the gt + bboxes are allowed to cross the border of images. Therefore, we + don't need to clip the gt bboxes in these cases. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. 
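+
+    Examples:
+        An illustrative direct call (in a config this transform is normally
+        added as a ``dict(type='Resize', ...)`` pipeline entry; the import
+        path assumes the upstream ``mmdet`` package layout):
+
+        .. code-block:: python
+
+            import numpy as np
+            from mmdet.datasets.transforms import Resize
+
+            results = dict(img=np.zeros((300, 400, 3), dtype=np.uint8))
+            results = Resize(scale=(1333, 800), keep_ratio=True)(results)
+            # results['img_shape'], results['scale_factor'] and
+            # results['homography_matrix'] are updated accordingly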
+ """ + + def _resize_masks(self, results: dict) -> None: + """Resize masks with ``results['scale']``""" + if results.get('gt_masks', None) is not None: + if self.keep_ratio: + results['gt_masks'] = results['gt_masks'].rescale( + results['scale']) + else: + results['gt_masks'] = results['gt_masks'].resize( + results['img_shape']) + + def _resize_bboxes(self, results: dict) -> None: + """Resize bounding boxes with ``results['scale_factor']``.""" + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'].rescale_(results['scale_factor']) + if self.clip_object_border: + results['gt_bboxes'].clip_(results['img_shape']) + + def _record_homography_matrix(self, results: dict) -> None: + """Record the homography matrix for the Resize.""" + w_scale, h_scale = results['scale_factor'] + homography_matrix = np.array( + [[w_scale, 0, 0], [0, h_scale, 0], [0, 0, 1]], dtype=np.float32) + if results.get('homography_matrix', None) is None: + results['homography_matrix'] = homography_matrix + else: + results['homography_matrix'] = homography_matrix @ results[ + 'homography_matrix'] + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to resize images, bounding boxes and semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map', + 'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys + are updated in result dict. + """ + if self.scale: + results['scale'] = self.scale + else: + img_shape = results['img'].shape[:2] + results['scale'] = _scale_size(img_shape[::-1], self.scale_factor) + self._resize_img(results) + self._resize_bboxes(results) + self._resize_masks(results) + self._resize_seg(results) + self._record_homography_matrix(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(scale={self.scale}, ' + repr_str += f'scale_factor={self.scale_factor}, ' + repr_str += f'keep_ratio={self.keep_ratio}, ' + repr_str += f'clip_object_border={self.clip_object_border}), ' + repr_str += f'backend={self.backend}), ' + repr_str += f'interpolation={self.interpolation})' + return repr_str + + +@TRANSFORMS.register_module() +class FixScaleResize(Resize): + """Compared to Resize, FixScaleResize fixes the scaling issue when + `keep_ratio=true`.""" + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + if results.get('img', None) is not None: + if self.keep_ratio: + img, scale_factor = imrescale( + results['img'], + results['scale'], + interpolation=self.interpolation, + return_scale=True, + backend=self.backend) + new_h, new_w = img.shape[:2] + h, w = results['img'].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results['img'], + results['scale'], + interpolation=self.interpolation, + return_scale=True, + backend=self.backend) + results['img'] = img + results['img_shape'] = img.shape[:2] + results['scale_factor'] = (w_scale, h_scale) + results['keep_ratio'] = self.keep_ratio + + +@TRANSFORMS.register_module() +class ResizeShortestEdge(BaseTransform): + """Resize the image and mask while keeping the aspect ratio unchanged. + + Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130 # noqa:E501 + + This transform attempts to scale the shorter edge to the given + `scale`, as long as the longer edge does not exceed `max_size`. 
+ If `max_size` is reached, then downscale so that the longer + edge does not exceed `max_size`. + + Required Keys: + - img + - gt_seg_map (optional) + Modified Keys: + - img + - img_shape + - gt_seg_map (optional)) + Added Keys: + - scale + - scale_factor + - keep_ratio + + Args: + scale (Union[int, Tuple[int, int]]): The target short edge length. + If it's tuple, will select the min value as the short edge length. + max_size (int): The maximum allowed longest edge length. + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + max_size: Optional[int] = None, + resize_type: str = 'Resize', + **resize_kwargs) -> None: + super().__init__() + self.scale = scale + self.max_size = max_size + + self.resize_cfg = dict(type=resize_type, **resize_kwargs) + self.resize = TRANSFORMS.build({'scale': 0, **self.resize_cfg}) + + def _get_output_shape( + self, img: np.ndarray, + short_edge_length: Union[int, Tuple[int, int]]) -> Tuple[int, int]: + """Compute the target image shape with the given `short_edge_length`. + + Args: + img (np.ndarray): The input image. + short_edge_length (Union[int, Tuple[int, int]]): The target short + edge length. If it's tuple, will select the min value as the + short edge length. + """ + h, w = img.shape[:2] + if isinstance(short_edge_length, int): + size = short_edge_length * 1.0 + elif isinstance(short_edge_length, tuple): + size = min(short_edge_length) * 1.0 + scale = size / min(h, w) + if h < w: + new_h, new_w = size, scale * w + else: + new_h, new_w = scale * h, size + + if self.max_size and max(new_h, new_w) > self.max_size: + scale = self.max_size * 1.0 / max(new_h, new_w) + new_h *= scale + new_w *= scale + + new_h = int(new_h + 0.5) + new_w = int(new_w + 0.5) + return new_w, new_h + + def transform(self, results: dict) -> dict: + self.resize.scale = self._get_output_shape(results['img'], self.scale) + return self.resize(results) + + +@TRANSFORMS.register_module() +class FixShapeResize(Resize): + """Resize images & bbox & seg to the specified size. + + This transform resizes the input image according to ``width`` and + ``height``. Bboxes, masks, and seg map are then resized + with the same parameters. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes + - gt_masks + - gt_seg_map + + + Added Keys: + + - scale + - scale_factor + - keep_ratio + - homography_matrix + + Args: + width (int): width for resizing. + height (int): height for resizing. + Defaults to None. + pad_val (Number | dict[str, Number], optional): Padding value for if + the pad_mode is "constant". If it is a single number, the value + to pad the image is the number and to pad the semantic + segmentation map is 255. If it is a dict, it should have the + following keys: + + - img: The value to pad the image. + - seg: The value to pad the semantic segmentation map. + Defaults to dict(img=0, seg=255). + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Defaults to False. + clip_object_border (bool): Whether to clip the objects + outside the border of the image. In some dataset like MOT17, the gt + bboxes are allowed to cross the border of images. Therefore, we + don't need to clip the gt bboxes in these cases. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. 
+ interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. + """ + + def __init__(self, + width: int, + height: int, + pad_val: Union[Number, dict] = dict(img=0, seg=255), + keep_ratio: bool = False, + clip_object_border: bool = True, + backend: str = 'cv2', + interpolation: str = 'bilinear') -> None: + assert width is not None and height is not None, ( + '`width` and' + '`height` can not be `None`') + + self.width = width + self.height = height + self.scale = (width, height) + + self.backend = backend + self.interpolation = interpolation + self.keep_ratio = keep_ratio + self.clip_object_border = clip_object_border + + if keep_ratio is True: + # padding to the fixed size when keep_ratio=True + self.pad_transform = Pad(size=self.scale, pad_val=pad_val) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to resize images, bounding boxes and semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map', + 'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys + are updated in result dict. + """ + img = results['img'] + h, w = img.shape[:2] + if self.keep_ratio: + scale_factor = min(self.width / w, self.height / h) + results['scale_factor'] = (scale_factor, scale_factor) + real_w, real_h = int(w * float(scale_factor) + + 0.5), int(h * float(scale_factor) + 0.5) + img, scale_factor = mmcv.imrescale( + results['img'], (real_w, real_h), + interpolation=self.interpolation, + return_scale=True, + backend=self.backend) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + results['img'] = img + results['img_shape'] = img.shape[:2] + results['keep_ratio'] = self.keep_ratio + results['scale'] = (real_w, real_h) + else: + results['scale'] = (self.width, self.height) + results['scale_factor'] = (self.width / w, self.height / h) + super()._resize_img(results) + + self._resize_bboxes(results) + self._resize_masks(results) + self._resize_seg(results) + self._record_homography_matrix(results) + if self.keep_ratio: + self.pad_transform(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(width={self.width}, height={self.height}, ' + repr_str += f'keep_ratio={self.keep_ratio}, ' + repr_str += f'clip_object_border={self.clip_object_border}), ' + repr_str += f'backend={self.backend}), ' + repr_str += f'interpolation={self.interpolation})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomFlip(MMCV_RandomFlip): + """Flip the image & bbox & mask & segmentation map. Added or Updated keys: + flip, flip_direction, img, gt_bboxes, and gt_seg_map. There are 3 flip + modes: + + - ``prob`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``prob`` . + E.g., ``prob=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + - ``prob`` is float, ``direction`` is list of string: the image will + be ``direction[i]``ly flipped with probability of + ``prob/len(direction)``. + E.g., ``prob=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. 
+ - ``prob`` is list of float, ``direction`` is list of string: + given ``len(prob) == len(direction)``, the image will + be ``direction[i]``ly flipped with probability of ``prob[i]``. + E.g., ``prob=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with + probability of 0.3, vertically with probability of 0.5. + + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - flip + - flip_direction + - homography_matrix + + + Args: + prob (float | list[float], optional): The flipping probability. + Defaults to None. + direction(str | list[str]): The flipping direction. Options + If input is a list, the length must equal ``prob``. Each + element in ``prob`` indicates the flip probability of + corresponding direction. Defaults to 'horizontal'. + """ + + def _record_homography_matrix(self, results: dict) -> None: + """Record the homography matrix for the RandomFlip.""" + cur_dir = results['flip_direction'] + h, w = results['img'].shape[:2] + + if cur_dir == 'horizontal': + homography_matrix = np.array([[-1, 0, w], [0, 1, 0], [0, 0, 1]], + dtype=np.float32) + elif cur_dir == 'vertical': + homography_matrix = np.array([[1, 0, 0], [0, -1, h], [0, 0, 1]], + dtype=np.float32) + elif cur_dir == 'diagonal': + homography_matrix = np.array([[-1, 0, w], [0, -1, h], [0, 0, 1]], + dtype=np.float32) + else: + homography_matrix = np.eye(3, dtype=np.float32) + + if results.get('homography_matrix', None) is None: + results['homography_matrix'] = homography_matrix + else: + results['homography_matrix'] = homography_matrix @ results[ + 'homography_matrix'] + + @autocast_box_type() + def _flip(self, results: dict) -> None: + """Flip images, bounding boxes, and semantic segmentation map.""" + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + img_shape = results['img'].shape[:2] + + # flip bboxes + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'].flip_(img_shape, results['flip_direction']) + + # flip masks + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'].flip( + results['flip_direction']) + + # flip segs + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = mmcv.imflip( + results['gt_seg_map'], direction=results['flip_direction']) + + # record homography matrix for flip + self._record_homography_matrix(results) + + +@TRANSFORMS.register_module() +class RandomShift(BaseTransform): + """Shift the image and box given shift pixels and probability. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_bboxes_labels + - gt_ignore_flags (bool) (optional) + + Args: + prob (float): Probability of shifts. Defaults to 0.5. + max_shift_px (int): The max pixels for shifting. Defaults to 32. + filter_thr_px (int): The width and height threshold for filtering. + The bbox and the rest of the targets below the width and + height threshold will be filtered. Defaults to 1. 
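+
+    Examples:
+        An illustrative call (the import path assumes the upstream
+        ``mmdet`` package layout):
+
+        .. code-block:: python
+
+            import numpy as np
+            from mmdet.datasets.transforms import RandomShift
+
+            results = dict(
+                img=np.zeros((100, 100, 3), dtype=np.uint8),
+                gt_bboxes=np.array([[10., 10., 50., 50.]],
+                                   dtype=np.float32),
+                gt_bboxes_labels=np.array([0], dtype=np.int64))
+            results = RandomShift(prob=1.0, max_shift_px=16)(results)
+            # image and boxes are shifted by the same random offset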
+ """ + + def __init__(self, + prob: float = 0.5, + max_shift_px: int = 32, + filter_thr_px: int = 1) -> None: + assert 0 <= prob <= 1 + assert max_shift_px >= 0 + self.prob = prob + self.max_shift_px = max_shift_px + self.filter_thr_px = int(filter_thr_px) + + @cache_randomness + def _random_prob(self) -> float: + return random.uniform(0, 1) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to random shift images, bounding boxes. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Shift results. + """ + if self._random_prob() < self.prob: + img_shape = results['img'].shape[:2] + + random_shift_x = random.randint(-self.max_shift_px, + self.max_shift_px) + random_shift_y = random.randint(-self.max_shift_px, + self.max_shift_px) + new_x = max(0, random_shift_x) + ori_x = max(0, -random_shift_x) + new_y = max(0, random_shift_y) + ori_y = max(0, -random_shift_y) + + # TODO: support mask and semantic segmentation maps. + bboxes = results['gt_bboxes'].clone() + bboxes.translate_([random_shift_x, random_shift_y]) + + # clip border + bboxes.clip_(img_shape) + + # remove invalid bboxes + valid_inds = (bboxes.widths > self.filter_thr_px).numpy() & ( + bboxes.heights > self.filter_thr_px).numpy() + # If the shift does not contain any gt-bbox area, skip this + # image. + if not valid_inds.any(): + return results + bboxes = bboxes[valid_inds] + results['gt_bboxes'] = bboxes + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_inds] + + if results.get('gt_ignore_flags', None) is not None: + results['gt_ignore_flags'] = \ + results['gt_ignore_flags'][valid_inds] + + # shift img + img = results['img'] + new_img = np.zeros_like(img) + img_h, img_w = img.shape[:2] + new_h = img_h - np.abs(random_shift_y) + new_w = img_w - np.abs(random_shift_x) + new_img[new_y:new_y + new_h, new_x:new_x + new_w] \ + = img[ori_y:ori_y + new_h, ori_x:ori_x + new_w] + results['img'] = new_img + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'max_shift_px={self.max_shift_px}, ' + repr_str += f'filter_thr_px={self.filter_thr_px})' + return repr_str + + +@TRANSFORMS.register_module() +class Pad(MMCV_Pad): + """Pad the image & segmentation map. + + There are three padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. and (3)pad to square. Also, + pad to square and pad to the minimum size can be used as the same time. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_masks + - gt_seg_map + + Added Keys: + + - pad_shape + - pad_fixed_size + - pad_size_divisor + + Args: + size (tuple, optional): Fixed padding size. + Expected padding shape (width, height). Defaults to None. + size_divisor (int, optional): The divisor of padded size. Defaults to + None. + pad_to_square (bool): Whether to pad the image into a square. + Currently only used for YOLOX. Defaults to False. + pad_val (Number | dict[str, Number], optional) - Padding value for if + the pad_mode is "constant". If it is a single number, the value + to pad the image is the number and to pad the semantic + segmentation map is 255. If it is a dict, it should have the + following keys: + + - img: The value to pad the image. + - seg: The value to pad the semantic segmentation map. 
+ Defaults to dict(img=0, seg=255). + padding_mode (str): Type of padding. Should be: constant, edge, + reflect or symmetric. Defaults to 'constant'. + + - constant: pads with a constant value, this value is specified + with pad_val. + - edge: pads with the last value at the edge of the image. + - reflect: pads with reflection of image without repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with 2 + elements on both sides in reflect mode will result in + [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last value + on the edge. For example, padding [1, 2, 3, 4] with 2 elements on + both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] + """ + + def _pad_masks(self, results: dict) -> None: + """Pad masks according to ``results['pad_shape']``.""" + if results.get('gt_masks', None) is not None: + pad_val = self.pad_val.get('masks', 0) + pad_shape = results['pad_shape'][:2] + results['gt_masks'] = results['gt_masks'].pad( + pad_shape, pad_val=pad_val) + + def transform(self, results: dict) -> dict: + """Call function to pad images, masks, semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Updated result dict. + """ + self._pad_img(results) + self._pad_seg(results) + self._pad_masks(results) + return results + + +@TRANSFORMS.register_module() +class RandomCrop(BaseTransform): + """Random crop the image & bboxes & masks. + + The absolute ``crop_size`` is sampled based on ``crop_type`` and + ``image_size``, then the cropped results are generated. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_ignore_flags (bool) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_masks (optional) + - gt_ignore_flags (optional) + - gt_seg_map (optional) + - gt_instances_ids (options, only used in MOT/VIS) + + Added Keys: + + - homography_matrix + + Args: + crop_size (tuple): The relative ratio or absolute pixels of + (width, height). + crop_type (str, optional): One of "relative_range", "relative", + "absolute", "absolute_range". "relative" randomly crops + (h * crop_size[0], w * crop_size[1]) part from an input of size + (h, w). "relative_range" uniformly samples relative crop size from + range [crop_size[0], 1] and [crop_size[1], 1] for height and width + respectively. "absolute" crops from an input with absolute size + (crop_size[0], crop_size[1]). "absolute_range" uniformly samples + crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w + in range [crop_size[0], min(w, crop_size[1])]. + Defaults to "absolute". + allow_negative_crop (bool, optional): Whether to allow a crop that does + not contain any bbox area. Defaults to False. + recompute_bbox (bool, optional): Whether to re-compute the boxes based + on cropped instance masks. Defaults to False. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + + Note: + - If the image is smaller than the absolute crop size, return the + original image. + - The keys for bboxes, labels and masks must be aligned. That is, + ``gt_bboxes`` corresponds to ``gt_labels`` and ``gt_masks``, and + ``gt_bboxes_ignore`` corresponds to ``gt_labels_ignore`` and + ``gt_masks_ignore``. 
+ - If the crop does not contain any gt-bbox region and + ``allow_negative_crop`` is set to False, skip this image. + """ + + def __init__(self, + crop_size: tuple, + crop_type: str = 'absolute', + allow_negative_crop: bool = False, + recompute_bbox: bool = False, + bbox_clip_border: bool = True) -> None: + if crop_type not in [ + 'relative_range', 'relative', 'absolute', 'absolute_range' + ]: + raise ValueError(f'Invalid crop_type {crop_type}.') + if crop_type in ['absolute', 'absolute_range']: + assert crop_size[0] > 0 and crop_size[1] > 0 + assert isinstance(crop_size[0], int) and isinstance( + crop_size[1], int) + if crop_type == 'absolute_range': + assert crop_size[0] <= crop_size[1] + else: + assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1 + self.crop_size = crop_size + self.crop_type = crop_type + self.allow_negative_crop = allow_negative_crop + self.bbox_clip_border = bbox_clip_border + self.recompute_bbox = recompute_bbox + + def _crop_data(self, results: dict, crop_size: Tuple[int, int], + allow_negative_crop: bool) -> Union[dict, None]: + """Function to randomly crop images, bounding boxes, masks, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + crop_size (Tuple[int, int]): Expected absolute size after + cropping, (h, w). + allow_negative_crop (bool): Whether to allow a crop that does not + contain any bbox area. + + Returns: + results (Union[dict, None]): Randomly cropped results, 'img_shape' + key in result dict is updated according to crop size. None will + be returned when there is no valid bbox after cropping. + """ + assert crop_size[0] > 0 and crop_size[1] > 0 + img = results['img'] + margin_h = max(img.shape[0] - crop_size[0], 0) + margin_w = max(img.shape[1] - crop_size[1], 0) + offset_h, offset_w = self._rand_offset((margin_h, margin_w)) + crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] + crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] + + # Record the homography matrix for the RandomCrop + homography_matrix = np.array( + [[1, 0, -offset_w], [0, 1, -offset_h], [0, 0, 1]], + dtype=np.float32) + if results.get('homography_matrix', None) is None: + results['homography_matrix'] = homography_matrix + else: + results['homography_matrix'] = homography_matrix @ results[ + 'homography_matrix'] + + # crop the image + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] + img_shape = img.shape + results['img'] = img + results['img_shape'] = img_shape[:2] + + # crop bboxes accordingly and clip to the image boundary + if results.get('gt_bboxes', None) is not None: + bboxes = results['gt_bboxes'] + bboxes.translate_([-offset_w, -offset_h]) + if self.bbox_clip_border: + bboxes.clip_(img_shape[:2]) + valid_inds = bboxes.is_inside(img_shape[:2]).numpy() + # If the crop does not contain any gt-bbox area and + # allow_negative_crop is False, skip this image. 
+ if (not valid_inds.any() and not allow_negative_crop): + return None + + results['gt_bboxes'] = bboxes[valid_inds] + + if results.get('gt_ignore_flags', None) is not None: + results['gt_ignore_flags'] = \ + results['gt_ignore_flags'][valid_inds] + + if results.get('gt_bboxes_labels', None) is not None: + results['gt_bboxes_labels'] = \ + results['gt_bboxes_labels'][valid_inds] + + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + if self.recompute_bbox: + results['gt_bboxes'] = results['gt_masks'].get_bboxes( + type(results['gt_bboxes'])) + + # We should remove the instance ids corresponding to invalid boxes. + if results.get('gt_instances_ids', None) is not None: + results['gt_instances_ids'] = \ + results['gt_instances_ids'][valid_inds] + + # crop semantic seg + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2, + crop_x1:crop_x2] + + return results + + @cache_randomness + def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generate crop offset. + + Args: + margin (Tuple[int, int]): The upper bound for the offset generated + randomly. + + Returns: + Tuple[int, int]: The random offset for the crop. + """ + margin_h, margin_w = margin + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + + return offset_h, offset_w + + @cache_randomness + def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generates the absolute crop size based on `crop_type` and + `image_size`. + + Args: + image_size (Tuple[int, int]): (h, w). + + Returns: + crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels. + """ + h, w = image_size + if self.crop_type == 'absolute': + return min(self.crop_size[1], h), min(self.crop_size[0], w) + elif self.crop_type == 'absolute_range': + crop_h = np.random.randint( + min(h, self.crop_size[0]), + min(h, self.crop_size[1]) + 1) + crop_w = np.random.randint( + min(w, self.crop_size[0]), + min(w, self.crop_size[1]) + 1) + return crop_h, crop_w + elif self.crop_type == 'relative': + crop_w, crop_h = self.crop_size + return int(h * crop_h + 0.5), int(w * crop_w + 0.5) + else: + # 'relative_range' + crop_size = np.asarray(self.crop_size, dtype=np.float32) + crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size) + return int(h * crop_h + 0.5), int(w * crop_w + 0.5) + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to randomly crop images, bounding boxes, masks, + semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + results (Union[dict, None]): Randomly cropped results, 'img_shape' + key in result dict is updated according to crop size. None will + be returned when there is no valid bbox after cropping. 
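+
+        Examples:
+            An illustrative call (the import path assumes the upstream
+            ``mmdet`` package layout):
+
+            .. code-block:: python
+
+                import numpy as np
+                from mmdet.datasets.transforms import RandomCrop
+
+                results = dict(
+                    img=np.zeros((300, 400, 3), dtype=np.uint8),
+                    gt_bboxes=np.array([[50., 50., 250., 250.]],
+                                       dtype=np.float32),
+                    gt_bboxes_labels=np.array([0], dtype=np.int64),
+                    gt_ignore_flags=np.array([False]))
+                results = RandomCrop(crop_size=(256, 256))(results)
+                # ``None`` is returned instead when the sampled crop keeps
+                # no valid box and ``allow_negative_crop`` is False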
+ """ + image_size = results['img'].shape[:2] + crop_size = self._get_crop_size(image_size) + results = self._crop_data(results, crop_size, self.allow_negative_crop) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(crop_size={self.crop_size}, ' + repr_str += f'crop_type={self.crop_type}, ' + repr_str += f'allow_negative_crop={self.allow_negative_crop}, ' + repr_str += f'recompute_bbox={self.recompute_bbox}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class SegRescale(BaseTransform): + """Rescale semantic segmentation maps. + + This transform rescale the ``gt_seg_map`` according to ``scale_factor``. + + Required Keys: + + - gt_seg_map + + Modified Keys: + + - gt_seg_map + + Args: + scale_factor (float): The scale factor of the final output. Defaults + to 1. + backend (str): Image rescale backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + """ + + def __init__(self, scale_factor: float = 1, backend: str = 'cv2') -> None: + self.scale_factor = scale_factor + self.backend = backend + + def transform(self, results: dict) -> dict: + """Transform function to scale the semantic segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with semantic segmentation map scaled. + """ + if self.scale_factor != 1: + results['gt_seg_map'] = mmcv.imrescale( + results['gt_seg_map'], + self.scale_factor, + interpolation='nearest', + backend=self.backend) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(scale_factor={self.scale_factor}, ' + repr_str += f'backend={self.backend})' + return repr_str + + +@TRANSFORMS.register_module() +class PhotoMetricDistortion(BaseTransform): + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + + Required Keys: + + - img (np.uint8) + + Modified Keys: + + - img (np.float32) + + Args: + brightness_delta (int): delta of brightness. + contrast_range (sequence): range of contrast. + saturation_range (sequence): range of saturation. + hue_delta (int): delta of hue. 
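+
+    Examples:
+        An illustrative call on a random BGR image (the import path assumes
+        the upstream ``mmdet`` package layout):
+
+        .. code-block:: python
+
+            import numpy as np
+            from mmdet.datasets.transforms import PhotoMetricDistortion
+
+            results = dict(
+                img=np.random.randint(
+                    0, 256, (100, 100, 3), dtype=np.uint8))
+            results = PhotoMetricDistortion()(results)
+            # results['img'] is now float32 and randomly distorted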
+ """ + + def __init__(self, + brightness_delta: int = 32, + contrast_range: Sequence[Number] = (0.5, 1.5), + saturation_range: Sequence[Number] = (0.5, 1.5), + hue_delta: int = 18) -> None: + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + @cache_randomness + def _random_flags(self) -> Sequence[Number]: + mode = random.randint(2) + brightness_flag = random.randint(2) + contrast_flag = random.randint(2) + saturation_flag = random.randint(2) + hue_flag = random.randint(2) + swap_flag = random.randint(2) + delta_value = random.uniform(-self.brightness_delta, + self.brightness_delta) + alpha_value = random.uniform(self.contrast_lower, self.contrast_upper) + saturation_value = random.uniform(self.saturation_lower, + self.saturation_upper) + hue_value = random.uniform(-self.hue_delta, self.hue_delta) + swap_value = random.permutation(3) + + return (mode, brightness_flag, contrast_flag, saturation_flag, + hue_flag, swap_flag, delta_value, alpha_value, + saturation_value, hue_value, swap_value) + + def transform(self, results: dict) -> dict: + """Transform function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. + """ + assert 'img' in results, '`img` is not found in results' + img = results['img'] + img = img.astype(np.float32) + + (mode, brightness_flag, contrast_flag, saturation_flag, hue_flag, + swap_flag, delta_value, alpha_value, saturation_value, hue_value, + swap_value) = self._random_flags() + + # random brightness + if brightness_flag: + img += delta_value + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + if mode == 1: + if contrast_flag: + img *= alpha_value + + # convert color from BGR to HSV + img = mmcv.bgr2hsv(img) + + # random saturation + if saturation_flag: + img[..., 1] *= saturation_value + # For image(type=float32), after convert bgr to hsv by opencv, + # valid saturation value range is [0, 1] + if saturation_value > 1: + img[..., 1] = img[..., 1].clip(0, 1) + + # random hue + if hue_flag: + img[..., 0] += hue_value + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + if contrast_flag: + img *= alpha_value + + # randomly swap channels + if swap_flag: + img = img[..., swap_value] + + results['img'] = img + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(brightness_delta={self.brightness_delta}, ' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)}, ' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)}, ' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class Expand(BaseTransform): + """Random expand the image & bboxes & masks & segmentation map. + + Randomly place the original image on a canvas of ``ratio`` x original image + size filled with mean values. The ratio is in the range of ratio_range. 
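+    For example, with the default ``ratio_range=(1, 4)`` an ``(h, w)`` image
+    may end up at a random position inside a canvas of up to roughly
+    ``(4 * h, 4 * w)`` pixels filled with ``mean``.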
+ + Required Keys: + + - img + - img_shape + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes + - gt_masks + - gt_seg_map + + + Args: + mean (sequence): mean value of dataset. + to_rgb (bool): if need to convert the order of mean to align with RGB. + ratio_range (sequence)): range of expand ratio. + seg_ignore_label (int): label of ignore segmentation map. + prob (float): probability of applying this transformation + """ + + def __init__(self, + mean: Sequence[Number] = (0, 0, 0), + to_rgb: bool = True, + ratio_range: Sequence[Number] = (1, 4), + seg_ignore_label: int = None, + prob: float = 0.5) -> None: + self.to_rgb = to_rgb + self.ratio_range = ratio_range + if to_rgb: + self.mean = mean[::-1] + else: + self.mean = mean + self.min_ratio, self.max_ratio = ratio_range + self.seg_ignore_label = seg_ignore_label + self.prob = prob + + @cache_randomness + def _random_prob(self) -> float: + return random.uniform(0, 1) + + @cache_randomness + def _random_ratio(self) -> float: + return random.uniform(self.min_ratio, self.max_ratio) + + @cache_randomness + def _random_left_top(self, ratio: float, h: int, + w: int) -> Tuple[int, int]: + left = int(random.uniform(0, w * ratio - w)) + top = int(random.uniform(0, h * ratio - h)) + return left, top + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to expand images, bounding boxes, masks, + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images, bounding boxes, masks, segmentation + map expanded. + """ + if self._random_prob() > self.prob: + return results + assert 'img' in results, '`img` is not found in results' + img = results['img'] + h, w, c = img.shape + ratio = self._random_ratio() + # speedup expand when meets large image + if np.all(self.mean == self.mean[0]): + expand_img = np.empty((int(h * ratio), int(w * ratio), c), + img.dtype) + expand_img.fill(self.mean[0]) + else: + expand_img = np.full((int(h * ratio), int(w * ratio), c), + self.mean, + dtype=img.dtype) + left, top = self._random_left_top(ratio, h, w) + expand_img[top:top + h, left:left + w] = img + results['img'] = expand_img + results['img_shape'] = expand_img.shape[:2] + + # expand bboxes + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'].translate_([left, top]) + + # expand masks + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'].expand( + int(h * ratio), int(w * ratio), top, left) + + # expand segmentation map + if results.get('gt_seg_map', None) is not None: + gt_seg = results['gt_seg_map'] + expand_gt_seg = np.full((int(h * ratio), int(w * ratio)), + self.seg_ignore_label, + dtype=gt_seg.dtype) + expand_gt_seg[top:top + h, left:left + w] = gt_seg + results['gt_seg_map'] = expand_gt_seg + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class MinIoURandomCrop(BaseTransform): + """Random crop the image & bboxes & masks & segmentation map, the cropped + patches have minimum IoU requirement with original image & bboxes & masks. 
+ + & segmentation map, the IoU threshold is randomly selected from min_ious. + + + Required Keys: + + - img + - img_shape + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_ignore_flags (bool) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes + - gt_bboxes_labels + - gt_masks + - gt_ignore_flags + - gt_seg_map + + + Args: + min_ious (Sequence[float]): minimum IoU threshold for all intersections + with bounding boxes. + min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, + where a >= min_crop_size). + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + """ + + def __init__(self, + min_ious: Sequence[float] = (0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size: float = 0.3, + bbox_clip_border: bool = True) -> None: + + self.min_ious = min_ious + self.sample_mode = (1, *min_ious, 0) + self.min_crop_size = min_crop_size + self.bbox_clip_border = bbox_clip_border + + @cache_randomness + def _random_mode(self) -> Number: + return random.choice(self.sample_mode) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to crop images and bounding boxes with minimum + IoU constraint. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images and bounding boxes cropped, \ + 'img_shape' key is updated. + """ + assert 'img' in results, '`img` is not found in results' + assert 'gt_bboxes' in results, '`gt_bboxes` is not found in results' + img = results['img'] + boxes = results['gt_bboxes'] + h, w, c = img.shape + while True: + mode = self._random_mode() + self.mode = mode + if mode == 1: + return results + + min_iou = self.mode + for i in range(50): + new_w = random.uniform(self.min_crop_size * w, w) + new_h = random.uniform(self.min_crop_size * h, h) + + # h / w in [0.5, 2] + if new_h / new_w < 0.5 or new_h / new_w > 2: + continue + + left = random.uniform(w - new_w) + top = random.uniform(h - new_h) + + patch = np.array( + (int(left), int(top), int(left + new_w), int(top + new_h))) + # Line or point crop is not allowed + if patch[2] == patch[0] or patch[3] == patch[1]: + continue + overlaps = boxes.overlaps( + HorizontalBoxes(patch.reshape(-1, 4).astype(np.float32)), + boxes).numpy().reshape(-1) + if len(overlaps) > 0 and overlaps.min() < min_iou: + continue + + # center of boxes should inside the crop img + # only adjust boxes and instance masks when the gt is not empty + if len(overlaps) > 0: + # adjust boxes + def is_center_of_bboxes_in_patch(boxes, patch): + centers = boxes.centers.numpy() + mask = ((centers[:, 0] > patch[0]) * + (centers[:, 1] > patch[1]) * + (centers[:, 0] < patch[2]) * + (centers[:, 1] < patch[3])) + return mask + + mask = is_center_of_bboxes_in_patch(boxes, patch) + if not mask.any(): + continue + if results.get('gt_bboxes', None) is not None: + boxes = results['gt_bboxes'] + mask = is_center_of_bboxes_in_patch(boxes, patch) + boxes = boxes[mask] + boxes.translate_([-patch[0], -patch[1]]) + if self.bbox_clip_border: + boxes.clip_( + [patch[3] - patch[1], patch[2] - patch[0]]) + results['gt_bboxes'] = boxes + + # ignore_flags + if results.get('gt_ignore_flags', None) is not None: + results['gt_ignore_flags'] = \ + results['gt_ignore_flags'][mask] + + # labels + if results.get('gt_bboxes_labels', None) is not None: + results['gt_bboxes_labels'] = results[ + 
'gt_bboxes_labels'][mask] + + # mask fields + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][ + mask.nonzero()[0]].crop(patch) + # adjust the img no matter whether the gt is empty before crop + img = img[patch[1]:patch[3], patch[0]:patch[2]] + results['img'] = img + results['img_shape'] = img.shape[:2] + + # seg fields + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = results['gt_seg_map'][ + patch[1]:patch[3], patch[0]:patch[2]] + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(min_ious={self.min_ious}, ' + repr_str += f'min_crop_size={self.min_crop_size}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class Corrupt(BaseTransform): + """Corruption augmentation. + + Corruption transforms implemented based on + `imagecorruptions `_. + + Required Keys: + + - img (np.uint8) + + + Modified Keys: + + - img (np.uint8) + + + Args: + corruption (str): Corruption name. + severity (int): The severity of corruption. Defaults to 1. + """ + + def __init__(self, corruption: str, severity: int = 1) -> None: + self.corruption = corruption + self.severity = severity + + def transform(self, results: dict) -> dict: + """Call function to corrupt image. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images corrupted. + """ + + if corrupt is None: + raise RuntimeError('imagecorruptions is not installed') + results['img'] = corrupt( + results['img'].astype(np.uint8), + corruption_name=self.corruption, + severity=self.severity) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(corruption={self.corruption}, ' + repr_str += f'severity={self.severity})' + return repr_str + + +@TRANSFORMS.register_module() +@avoid_cache_randomness +class Albu(BaseTransform): + """Albumentation augmentation. + + Adds custom transformations from Albumentations library. + Please, visit `https://albumentations.readthedocs.io` + to get more information. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (HorizontalBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + + Modified Keys: + + - img (np.uint8) + - gt_bboxes (HorizontalBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - img_shape (tuple) + + An example of ``transforms`` is as followed: + + .. code-block:: + + [ + dict( + type='ShiftScaleRotate', + shift_limit=0.0625, + scale_limit=0.0, + rotate_limit=0, + interpolation=1, + p=0.5), + dict( + type='RandomBrightnessContrast', + brightness_limit=[0.1, 0.3], + contrast_limit=[0.1, 0.3], + p=0.2), + dict(type='ChannelShuffle', p=0.1), + dict( + type='OneOf', + transforms=[ + dict(type='Blur', blur_limit=3, p=1.0), + dict(type='MedianBlur', blur_limit=3, p=1.0) + ], + p=0.1), + ] + + Args: + transforms (list[dict]): A list of albu transformations + bbox_params (dict, optional): Bbox_params for albumentation `Compose` + keymap (dict, optional): Contains + {'input key':'albumentation-style key'} + skip_img_without_anno (bool): Whether to skip the image if no ann left + after aug. Defaults to False. 
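+
+    Examples:
+        A hedged configuration sketch; ``albu_train_transforms`` and the
+        pipeline context are placeholders, and the key names simply follow
+        this class's default key mapping:
+
+        .. code-block:: python
+
+            albu_train_transforms = [
+                dict(type='RandomBrightnessContrast', p=0.2),
+                dict(type='ChannelShuffle', p=0.1),
+            ]
+            train_pipeline = [
+                ...,
+                dict(
+                    type='Albu',
+                    transforms=albu_train_transforms,
+                    bbox_params=dict(
+                        type='BboxParams',
+                        format='pascal_voc',
+                        label_fields=['gt_bboxes_labels', 'gt_ignore_flags'],
+                        min_visibility=0.0,
+                        filter_lost_elements=True),
+                    keymap={'img': 'image', 'gt_bboxes': 'bboxes'},
+                    skip_img_without_anno=True),
+                ...,
+            ]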
+ """ + + def __init__(self, + transforms: List[dict], + bbox_params: Optional[dict] = None, + keymap: Optional[dict] = None, + skip_img_without_anno: bool = False) -> None: + if Compose is None: + raise RuntimeError('albumentations is not installed') + + # Args will be modified later, copying it will be safer + transforms = copy.deepcopy(transforms) + if bbox_params is not None: + bbox_params = copy.deepcopy(bbox_params) + if keymap is not None: + keymap = copy.deepcopy(keymap) + self.transforms = transforms + self.filter_lost_elements = False + self.skip_img_without_anno = skip_img_without_anno + + # A simple workaround to remove masks without boxes + if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params + and 'filter_lost_elements' in bbox_params): + self.filter_lost_elements = True + self.origin_label_fields = bbox_params['label_fields'] + bbox_params['label_fields'] = ['idx_mapper'] + del bbox_params['filter_lost_elements'] + + self.bbox_params = ( + self.albu_builder(bbox_params) if bbox_params else None) + self.aug = Compose([self.albu_builder(t) for t in self.transforms], + bbox_params=self.bbox_params) + + if not keymap: + self.keymap_to_albu = { + 'img': 'image', + 'gt_masks': 'masks', + 'gt_bboxes': 'bboxes' + } + else: + self.keymap_to_albu = keymap + self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} + + def albu_builder(self, cfg: dict) -> albumentations: + """Import a module from albumentations. + + It inherits some of :func:`build_from_cfg` logic. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + + Returns: + obj: The constructed object. + """ + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + obj_type = args.pop('type') + if is_str(obj_type): + if albumentations is None: + raise RuntimeError('albumentations is not installed') + obj_cls = getattr(albumentations, obj_type) + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + if 'transforms' in args: + args['transforms'] = [ + self.albu_builder(transform) + for transform in args['transforms'] + ] + + return obj_cls(**args) + + @staticmethod + def mapper(d: dict, keymap: dict) -> dict: + """Dictionary mapper. Renames keys according to keymap provided. + + Args: + d (dict): old dict + keymap (dict): {'old_key':'new_key'} + Returns: + dict: new dict. 
+ """ + updated_dict = {} + for k, v in zip(d.keys(), d.values()): + new_k = keymap.get(k, k) + updated_dict[new_k] = d[k] + return updated_dict + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function of Albu.""" + # TODO: gt_seg_map is not currently supported + # dict to albumentations format + results = self.mapper(results, self.keymap_to_albu) + results, ori_masks = self._preprocess_results(results) + results = self.aug(**results) + results = self._postprocess_results(results, ori_masks) + if results is None: + return None + # back to the original format + results = self.mapper(results, self.keymap_back) + results['img_shape'] = results['img'].shape[:2] + return results + + def _preprocess_results(self, results: dict) -> tuple: + """Pre-processing results to facilitate the use of Albu.""" + if 'bboxes' in results: + # to list of boxes + if not isinstance(results['bboxes'], HorizontalBoxes): + raise NotImplementedError( + 'Albu only supports horizontal boxes now') + bboxes = results['bboxes'].numpy() + results['bboxes'] = [x for x in bboxes] + # add pseudo-field for filtration + if self.filter_lost_elements: + results['idx_mapper'] = np.arange(len(results['bboxes'])) + + # TODO: Support mask structure in albu + ori_masks = None + if 'masks' in results: + if isinstance(results['masks'], PolygonMasks): + raise NotImplementedError( + 'Albu only supports BitMap masks now') + ori_masks = results['masks'] + if albumentations.__version__ < '0.5': + results['masks'] = results['masks'].masks + else: + results['masks'] = [mask for mask in results['masks'].masks] + + return results, ori_masks + + def _postprocess_results( + self, + results: dict, + ori_masks: Optional[Union[BitmapMasks, + PolygonMasks]] = None) -> dict: + """Post-processing Albu output.""" + # albumentations may return np.array or list on different versions + if 'gt_bboxes_labels' in results and isinstance( + results['gt_bboxes_labels'], list): + results['gt_bboxes_labels'] = np.array( + results['gt_bboxes_labels'], dtype=np.int64) + if 'gt_ignore_flags' in results and isinstance( + results['gt_ignore_flags'], list): + results['gt_ignore_flags'] = np.array( + results['gt_ignore_flags'], dtype=bool) + + if 'bboxes' in results: + if isinstance(results['bboxes'], list): + results['bboxes'] = np.array( + results['bboxes'], dtype=np.float32) + results['bboxes'] = results['bboxes'].reshape(-1, 4) + results['bboxes'] = HorizontalBoxes(results['bboxes']) + + # filter label_fields + if self.filter_lost_elements: + + for label in self.origin_label_fields: + results[label] = np.array( + [results[label][i] for i in results['idx_mapper']]) + if 'masks' in results: + assert ori_masks is not None + results['masks'] = np.array( + [results['masks'][i] for i in results['idx_mapper']]) + results['masks'] = ori_masks.__class__( + results['masks'], ori_masks.height, ori_masks.width) + + if (not len(results['idx_mapper']) + and self.skip_img_without_anno): + return None + elif 'masks' in results: + results['masks'] = ori_masks.__class__(results['masks'], + ori_masks.height, + ori_masks.width) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + f'(transforms={self.transforms})' + return repr_str + + +@TRANSFORMS.register_module() +@avoid_cache_randomness +class RandomCenterCropPad(BaseTransform): + """Random center crop and random around padding for CornerNet. + + This operation generates randomly cropped image from the original image and + pads it simultaneously. 
Different from :class:`RandomCrop`, the output + shape may not equal to ``crop_size`` strictly. We choose a random value + from ``ratios`` and the output shape could be larger or smaller than + ``crop_size``. The padding operation is also different from :class:`Pad`, + here we use around padding instead of right-bottom padding. + + The relation between output image (padding image) and original image: + + .. code:: text + + output image + + +----------------------------+ + | padded area | + +------|----------------------------|----------+ + | | cropped area | | + | | +---------------+ | | + | | | . center | | | original image + | | | range | | | + | | +---------------+ | | + +------|----------------------------|----------+ + | padded area | + +----------------------------+ + + There are 5 main areas in the figure: + + - output image: output image of this operation, also called padding + image in following instruction. + - original image: input image of this operation. + - padded area: non-intersect area of output image and original image. + - cropped area: the overlap of output image and original image. + - center range: a smaller area where random center chosen from. + center range is computed by ``border`` and original image's shape + to avoid our random center is too close to original image's border. + + Also this operation act differently in train and test mode, the summary + pipeline is listed below. + + Train pipeline: + + 1. Choose a ``random_ratio`` from ``ratios``, the shape of padding image + will be ``random_ratio * crop_size``. + 2. Choose a ``random_center`` in center range. + 3. Generate padding image with center matches the ``random_center``. + 4. Initialize the padding image with pixel value equals to ``mean``. + 5. Copy the cropped area to padding image. + 6. Refine annotations. + + Test pipeline: + + 1. Compute output shape according to ``test_pad_mode``. + 2. Generate padding image with center matches the original image + center. + 3. Initialize the padding image with pixel value equals to ``mean``. + 4. Copy the ``cropped area`` to padding image. + + Required Keys: + + - img (np.float32) + - img_shape (tuple) + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img (np.float32) + - img_shape (tuple) + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Args: + crop_size (tuple, optional): expected size after crop, final size will + computed according to ratio. Requires (width, height) + in train mode, and None in test mode. + ratios (tuple, optional): random select a ratio from tuple and crop + image to (crop_size[0] * ratio) * (crop_size[1] * ratio). + Only available in train mode. Defaults to (0.9, 1.0, 1.1). + border (int, optional): max distance from center select area to image + border. Only available in train mode. Defaults to 128. + mean (sequence, optional): Mean values of 3 channels. + std (sequence, optional): Std values of 3 channels. + to_rgb (bool, optional): Whether to convert the image from BGR to RGB. + test_mode (bool): whether involve random variables in transform. + In train mode, crop_size is fixed, center coords and ratio is + random selected from predefined lists. In test mode, crop_size + is image's original shape, center coords and ratio is fixed. + Defaults to False. 
+ test_pad_mode (tuple, optional): padding method and padding shape + value, only available in test mode. Default is using + 'logical_or' with 127 as padding shape value. + + - 'logical_or': final_shape = input_shape | padding_shape_value + - 'size_divisor': final_shape = int( + ceil(input_shape / padding_shape_value) * padding_shape_value) + + Defaults to ('logical_or', 127). + test_pad_add_pix (int): Extra padding pixel in test mode. + Defaults to 0. + bbox_clip_border (bool): Whether clip the objects outside + the border of the image. Defaults to True. + """ + + def __init__(self, + crop_size: Optional[tuple] = None, + ratios: Optional[tuple] = (0.9, 1.0, 1.1), + border: Optional[int] = 128, + mean: Optional[Sequence] = None, + std: Optional[Sequence] = None, + to_rgb: Optional[bool] = None, + test_mode: bool = False, + test_pad_mode: Optional[tuple] = ('logical_or', 127), + test_pad_add_pix: int = 0, + bbox_clip_border: bool = True) -> None: + if test_mode: + assert crop_size is None, 'crop_size must be None in test mode' + assert ratios is None, 'ratios must be None in test mode' + assert border is None, 'border must be None in test mode' + assert isinstance(test_pad_mode, (list, tuple)) + assert test_pad_mode[0] in ['logical_or', 'size_divisor'] + else: + assert isinstance(crop_size, (list, tuple)) + assert crop_size[0] > 0 and crop_size[1] > 0, ( + 'crop_size must > 0 in train mode') + assert isinstance(ratios, (list, tuple)) + assert test_pad_mode is None, ( + 'test_pad_mode must be None in train mode') + + self.crop_size = crop_size + self.ratios = ratios + self.border = border + # We do not set default value to mean, std and to_rgb because these + # hyper-parameters are easy to forget but could affect the performance. + # Please use the same setting as Normalize for performance assurance. + assert mean is not None and std is not None and to_rgb is not None + self.to_rgb = to_rgb + self.input_mean = mean + self.input_std = std + if to_rgb: + self.mean = mean[::-1] + self.std = std[::-1] + else: + self.mean = mean + self.std = std + self.test_mode = test_mode + self.test_pad_mode = test_pad_mode + self.test_pad_add_pix = test_pad_add_pix + self.bbox_clip_border = bbox_clip_border + + def _get_border(self, border, size): + """Get final border for the target size. + + This function generates a ``final_border`` according to image's shape. + The area between ``final_border`` and ``size - final_border`` is the + ``center range``. We randomly choose center from the ``center range`` + to avoid our random center is too close to original image's border. + Also ``center range`` should be larger than 0. + + Args: + border (int): The initial border, default is 128. + size (int): The width or height of original image. + Returns: + int: The final border. + """ + k = 2 * border / size + i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k))) + return border // i + + def _filter_boxes(self, patch, boxes): + """Check whether the center of each box is in the patch. + + Args: + patch (list[int]): The cropped area, [left, top, right, bottom]. + boxes (numpy array, (N x 4)): Ground truth boxes. + + Returns: + mask (numpy array, (N,)): Each box is inside or outside the patch. 
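+
+        Examples:
+            A rough illustration with hypothetical numbers: for
+            ``patch = [0, 0, 100, 100]``, a box whose center lies at
+            (50, 50) is marked ``True`` in the returned mask, while a box
+            centered at (150, 50) is marked ``False``.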
+ """ + center = boxes.centers.numpy() + mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * ( + center[:, 0] < patch[2]) * ( + center[:, 1] < patch[3]) + return mask + + def _crop_image_and_paste(self, image, center, size): + """Crop image with a given center and size, then paste the cropped + image to a blank image with two centers align. + + This function is equivalent to generating a blank image with ``size`` + as its shape. Then cover it on the original image with two centers ( + the center of blank image and the random center of original image) + aligned. The overlap area is paste from the original image and the + outside area is filled with ``mean pixel``. + + Args: + image (np array, H x W x C): Original image. + center (list[int]): Target crop center coord. + size (list[int]): Target crop size. [target_h, target_w] + + Returns: + cropped_img (np array, target_h x target_w x C): Cropped image. + border (np array, 4): The distance of four border of + ``cropped_img`` to the original image area, [top, bottom, + left, right] + patch (list[int]): The cropped area, [left, top, right, bottom]. + """ + center_y, center_x = center + target_h, target_w = size + img_h, img_w, img_c = image.shape + + x0 = max(0, center_x - target_w // 2) + x1 = min(center_x + target_w // 2, img_w) + y0 = max(0, center_y - target_h // 2) + y1 = min(center_y + target_h // 2, img_h) + patch = np.array((int(x0), int(y0), int(x1), int(y1))) + + left, right = center_x - x0, x1 - center_x + top, bottom = center_y - y0, y1 - center_y + + cropped_center_y, cropped_center_x = target_h // 2, target_w // 2 + cropped_img = np.zeros((target_h, target_w, img_c), dtype=image.dtype) + for i in range(img_c): + cropped_img[:, :, i] += self.mean[i] + y_slice = slice(cropped_center_y - top, cropped_center_y + bottom) + x_slice = slice(cropped_center_x - left, cropped_center_x + right) + cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :] + + border = np.array([ + cropped_center_y - top, cropped_center_y + bottom, + cropped_center_x - left, cropped_center_x + right + ], + dtype=np.float32) + + return cropped_img, border, patch + + def _train_aug(self, results): + """Random crop and around padding the original image. + + Args: + results (dict): Image infomations in the augment pipeline. + + Returns: + results (dict): The updated dict. + """ + img = results['img'] + h, w, c = img.shape + gt_bboxes = results['gt_bboxes'] + while True: + scale = random.choice(self.ratios) + new_h = int(self.crop_size[1] * scale) + new_w = int(self.crop_size[0] * scale) + h_border = self._get_border(self.border, h) + w_border = self._get_border(self.border, w) + + for i in range(50): + center_x = random.randint(low=w_border, high=w - w_border) + center_y = random.randint(low=h_border, high=h - h_border) + + cropped_img, border, patch = self._crop_image_and_paste( + img, [center_y, center_x], [new_h, new_w]) + + if len(gt_bboxes) == 0: + results['img'] = cropped_img + results['img_shape'] = cropped_img.shape[:2] + return results + + # if image do not have valid bbox, any crop patch is valid. 
+ mask = self._filter_boxes(patch, gt_bboxes) + if not mask.any(): + continue + + results['img'] = cropped_img + results['img_shape'] = cropped_img.shape[:2] + + x0, y0, x1, y1 = patch + + left_w, top_h = center_x - x0, center_y - y0 + cropped_center_x, cropped_center_y = new_w // 2, new_h // 2 + + # crop bboxes accordingly and clip to the image boundary + gt_bboxes = gt_bboxes[mask] + gt_bboxes.translate_([ + cropped_center_x - left_w - x0, + cropped_center_y - top_h - y0 + ]) + if self.bbox_clip_border: + gt_bboxes.clip_([new_h, new_w]) + keep = gt_bboxes.is_inside([new_h, new_w]).numpy() + gt_bboxes = gt_bboxes[keep] + + results['gt_bboxes'] = gt_bboxes + + # ignore_flags + if results.get('gt_ignore_flags', None) is not None: + gt_ignore_flags = results['gt_ignore_flags'][mask] + results['gt_ignore_flags'] = \ + gt_ignore_flags[keep] + + # labels + if results.get('gt_bboxes_labels', None) is not None: + gt_labels = results['gt_bboxes_labels'][mask] + results['gt_bboxes_labels'] = gt_labels[keep] + + if 'gt_masks' in results or 'gt_seg_map' in results: + raise NotImplementedError( + 'RandomCenterCropPad only supports bbox.') + + return results + + def _test_aug(self, results): + """Around padding the original image without cropping. + + The padding mode and value are from ``test_pad_mode``. + + Args: + results (dict): Image infomations in the augment pipeline. + + Returns: + results (dict): The updated dict. + """ + img = results['img'] + h, w, c = img.shape + if self.test_pad_mode[0] in ['logical_or']: + # self.test_pad_add_pix is only used for centernet + target_h = (h | self.test_pad_mode[1]) + self.test_pad_add_pix + target_w = (w | self.test_pad_mode[1]) + self.test_pad_add_pix + elif self.test_pad_mode[0] in ['size_divisor']: + divisor = self.test_pad_mode[1] + target_h = int(np.ceil(h / divisor)) * divisor + target_w = int(np.ceil(w / divisor)) * divisor + else: + raise NotImplementedError( + 'RandomCenterCropPad only support two testing pad mode:' + 'logical-or and size_divisor.') + + cropped_img, border, _ = self._crop_image_and_paste( + img, [h // 2, w // 2], [target_h, target_w]) + results['img'] = cropped_img + results['img_shape'] = cropped_img.shape[:2] + results['border'] = border + return results + + @autocast_box_type() + def transform(self, results: dict) -> dict: + img = results['img'] + assert img.dtype == np.float32, ( + 'RandomCenterCropPad needs the input image of dtype np.float32,' + ' please set "to_float32=True" in "LoadImageFromFile" pipeline') + h, w, c = img.shape + assert c == len(self.mean) + if self.test_mode: + return self._test_aug(results) + else: + return self._train_aug(results) + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(crop_size={self.crop_size}, ' + repr_str += f'ratios={self.ratios}, ' + repr_str += f'border={self.border}, ' + repr_str += f'mean={self.input_mean}, ' + repr_str += f'std={self.input_std}, ' + repr_str += f'to_rgb={self.to_rgb}, ' + repr_str += f'test_mode={self.test_mode}, ' + repr_str += f'test_pad_mode={self.test_pad_mode}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class CutOut(BaseTransform): + """CutOut operation. + + Randomly drop some regions of image used in + `Cutout `_. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + n_holes (int or tuple[int, int]): Number of regions to be dropped. 
+ If it is given as a list, number of holes will be randomly + selected from the closed interval [``n_holes[0]``, ``n_holes[1]``]. + cutout_shape (tuple[int, int] or list[tuple[int, int]], optional): + The candidate shape of dropped regions. It can be + ``tuple[int, int]`` to use a fixed cutout shape, or + ``list[tuple[int, int]]`` to randomly choose shape + from the list. Defaults to None. + cutout_ratio (tuple[float, float] or list[tuple[float, float]], + optional): The candidate ratio of dropped regions. It can be + ``tuple[float, float]`` to use a fixed ratio or + ``list[tuple[float, float]]`` to randomly choose ratio + from the list. Please note that ``cutout_shape`` and + ``cutout_ratio`` cannot be both given at the same time. + Defaults to None. + fill_in (tuple[float, float, float] or tuple[int, int, int]): The value + of pixel to fill in the dropped regions. Defaults to (0, 0, 0). + """ + + def __init__( + self, + n_holes: Union[int, Tuple[int, int]], + cutout_shape: Optional[Union[Tuple[int, int], + List[Tuple[int, int]]]] = None, + cutout_ratio: Optional[Union[Tuple[float, float], + List[Tuple[float, float]]]] = None, + fill_in: Union[Tuple[float, float, float], Tuple[int, int, + int]] = (0, 0, 0) + ) -> None: + + assert (cutout_shape is None) ^ (cutout_ratio is None), \ + 'Either cutout_shape or cutout_ratio should be specified.' + assert (isinstance(cutout_shape, (list, tuple)) + or isinstance(cutout_ratio, (list, tuple))) + if isinstance(n_holes, tuple): + assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1] + else: + n_holes = (n_holes, n_holes) + self.n_holes = n_holes + self.fill_in = fill_in + self.with_ratio = cutout_ratio is not None + self.candidates = cutout_ratio if self.with_ratio else cutout_shape + if not isinstance(self.candidates, list): + self.candidates = [self.candidates] + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Call function to drop some regions of image.""" + h, w, c = results['img'].shape + n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1) + for _ in range(n_holes): + x1 = np.random.randint(0, w) + y1 = np.random.randint(0, h) + index = np.random.randint(0, len(self.candidates)) + if not self.with_ratio: + cutout_w, cutout_h = self.candidates[index] + else: + cutout_w = int(self.candidates[index][0] * w) + cutout_h = int(self.candidates[index][1] * h) + + x2 = np.clip(x1 + cutout_w, 0, w) + y2 = np.clip(y1 + cutout_h, 0, h) + results['img'][y1:y2, x1:x2, :] = self.fill_in + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(n_holes={self.n_holes}, ' + repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio + else f'cutout_shape={self.candidates}, ') + repr_str += f'fill_in={self.fill_in})' + return repr_str + + +@TRANSFORMS.register_module() +class Mosaic(BaseTransform): + """Mosaic augmentation. + + Given 4 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | pad | + | +-----------+ | + | | | | + | | image1 |--------+ | + | | | | | + | | | image2 | | + center_y |----+-------------+-----------| + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersections of 4 images + 2. 
Get the left top image according to the index, and randomly + sample another 3 images from the custom dataset. + 3. Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size before mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + prob: float = 1.0) -> None: + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + + log_img_scale(img_scale, skip_square=True, shape_order='wh') + self.img_scale = img_scale + self.center_ratio_range = center_ratio_range + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + self.prob = prob + + @cache_randomness + def get_indexes(self, dataset: BaseDataset) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`MultiImageMixDataset`): The dataset. + + Returns: + list: indexes. + """ + + indexes = [random.randint(0, len(dataset)) for _ in range(3)] + return indexes + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Mosaic transform function. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. 
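+
+        Note:
+            ``results['mix_results']`` must already contain the three extra
+            images, which is normally arranged by a multi-image dataset
+            wrapper that calls :meth:`get_indexes` for every sample. A hedged
+            sketch of such a setup (``base_dataset`` is a placeholder for a
+            dataset config defined elsewhere):
+
+            .. code-block:: python
+
+                train_dataset = dict(
+                    type='MultiImageMixDataset',
+                    dataset=base_dataset,
+                    pipeline=[
+                        dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0),
+                    ])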
+ """ + if random.uniform(0, 1) > self.prob: + return results + + assert 'mix_results' in results + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full( + (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int( + random.uniform(*self.center_ratio_range) * self.img_scale[0]) + center_y = int( + random.uniform(*self.center_ratio_range) * self.img_scale[1]) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = copy.deepcopy(results) + else: + results_patch = copy.deepcopy(results['mix_results'][i - 1]) + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(self.img_scale[1] / h_i, + self.img_scale[0] / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]]) + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape[:2] + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + return results + + def _mosaic_combine( + self, loc: str, center_position_xy: Sequence[float], + img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]: + """Calculate global coordinate of mosaic image and local coordinate of + cropped sub-image. + + Args: + loc (str): Index for the sub-image, loc in ('top_left', + 'top_right', 'bottom_left', 'bottom_right'). + center_position_xy (Sequence[float]): Mixing center for 4 images, + (x, y). + img_shape_wh (Sequence[int]): Width and height of sub-image + + Returns: + tuple[tuple[float]]: Corresponding coordinate of pasting and + cropping + - paste_coord (tuple): paste corner coordinate in mosaic image. + - crop_coord (tuple): crop corner coordinate in mosaic image. 
+ """ + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + if loc == 'top_left': + # index0 to top left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + center_position_xy[0], \ + center_position_xy[1] + crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - ( + y2 - y1), img_shape_wh[0], img_shape_wh[1] + + elif loc == 'top_right': + # index1 to top right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + center_position_xy[1] + crop_coord = 0, img_shape_wh[1] - (y2 - y1), min( + img_shape_wh[0], x2 - x1), img_shape_wh[1] + + elif loc == 'bottom_left': + # index2 to bottom left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + center_position_xy[1], \ + center_position_xy[0], \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min( + y2 - y1, img_shape_wh[1]) + + else: + # index3 to bottom right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + center_position_xy[1], \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = 0, 0, min(img_shape_wh[0], + x2 - x1), min(y2 - y1, img_shape_wh[1]) + + paste_coord = x1, y1, x2, y2 + return paste_coord, crop_coord + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class MixUp(BaseTransform): + """MixUp data augmentation. + + .. code:: text + + mixup transform + +------------------------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + |---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | |-----------------+ | + | pad | + +------------------------------+ + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset and embedded in + the top left patch(after padding and resizing) + 2. The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + max_iters (int): The maximum number of iterations. If the number of + iterations is greater than `max_iters`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. 
Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + max_iters: int = 15, + bbox_clip_border: bool = True) -> None: + assert isinstance(img_scale, tuple) + log_img_scale(img_scale, skip_square=True, shape_order='wh') + self.dynamic_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.max_iters = max_iters + self.bbox_clip_border = bbox_clip_border + + @cache_randomness + def get_indexes(self, dataset: BaseDataset) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`MultiImageMixDataset`): The dataset. + + Returns: + list: indexes. + """ + + for i in range(self.max_iters): + index = random.randint(0, len(dataset)) + gt_bboxes_i = dataset[index]['gt_bboxes'] + if len(gt_bboxes_i) != 0: + break + + return index + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """MixUp transform function. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. + """ + + assert 'mix_results' in results + assert len( + results['mix_results']) == 1, 'MixUp only support 2 images now !' + + if results['mix_results'][0]['gt_bboxes'].shape[0] == 0: + # empty bbox + return results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + + jit_factor = random.uniform(*self.ratio_range) + is_flip = random.uniform(0, 1) > self.flip_ratio + + if len(retrieve_img.shape) == 3: + out_img = np.ones( + (self.dynamic_scale[1], self.dynamic_scale[0], 3), + dtype=retrieve_img.dtype) * self.pad_val + else: + out_img = np.ones( + self.dynamic_scale[::-1], + dtype=retrieve_img.dtype) * self.pad_val + + # 1. keep_ratio resize + scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0], + self.dynamic_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_flip: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_flip: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + + # 7. 
filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. mix up + ori_img = ori_img.astype(np.float32) + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32) + + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat( + (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + + # remove outside bbox + inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy() + mixup_gt_bboxes = mixup_gt_bboxes[inside_inds] + mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] + mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape[:2] + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(dynamic_scale={self.dynamic_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'max_iters={self.max_iters}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomAffine(BaseTransform): + """Random affine transform data augmentation. + + This operation randomly generates affine transform matrix which including + rotation, translation, shear and scaling transforms. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + max_rotate_degree (float): Maximum degrees of rotation transform. + Defaults to 10. + max_translate_ratio (float): Maximum ratio of translation. + Defaults to 0.1. + scaling_ratio_range (tuple[float]): Min and max ratio of + scaling transform. Defaults to (0.5, 1.5). + max_shear_degree (float): Maximum degrees of shear + transform. Defaults to 2. + border (tuple[int]): Distance from width and height sides of input + image to adjust output shape. Only used in mosaic dataset. + Defaults to (0, 0). + border_val (tuple[int]): Border padding values of 3 channels. + Defaults to (114, 114, 114). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. 
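+
+    Examples:
+        A hedged configuration sketch as it might appear after a mosaic-style
+        transform; the negative border halves below assume a 640x640 mosaic
+        output and are not defaults of this class:
+
+        .. code-block:: python
+
+            dict(
+                type='RandomAffine',
+                max_rotate_degree=10.0,
+                scaling_ratio_range=(0.1, 2),
+                border=(-320, -320),
+                border_val=(114, 114, 114))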
+ """ + + def __init__(self, + max_rotate_degree: float = 10.0, + max_translate_ratio: float = 0.1, + scaling_ratio_range: Tuple[float, float] = (0.5, 1.5), + max_shear_degree: float = 2.0, + border: Tuple[int, int] = (0, 0), + border_val: Tuple[int, int, int] = (114, 114, 114), + bbox_clip_border: bool = True) -> None: + assert 0 <= max_translate_ratio <= 1 + assert scaling_ratio_range[0] <= scaling_ratio_range[1] + assert scaling_ratio_range[0] > 0 + self.max_rotate_degree = max_rotate_degree + self.max_translate_ratio = max_translate_ratio + self.scaling_ratio_range = scaling_ratio_range + self.max_shear_degree = max_shear_degree + self.border = border + self.border_val = border_val + self.bbox_clip_border = bbox_clip_border + + @cache_randomness + def _get_random_homography_matrix(self, height, width): + # Rotation + rotation_degree = random.uniform(-self.max_rotate_degree, + self.max_rotate_degree) + rotation_matrix = self._get_rotation_matrix(rotation_degree) + + # Scaling + scaling_ratio = random.uniform(self.scaling_ratio_range[0], + self.scaling_ratio_range[1]) + scaling_matrix = self._get_scaling_matrix(scaling_ratio) + + # Shear + x_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + y_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + shear_matrix = self._get_shear_matrix(x_degree, y_degree) + + # Translation + trans_x = random.uniform(-self.max_translate_ratio, + self.max_translate_ratio) * width + trans_y = random.uniform(-self.max_translate_ratio, + self.max_translate_ratio) * height + translate_matrix = self._get_translation_matrix(trans_x, trans_y) + + warp_matrix = ( + translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix) + return warp_matrix + + @autocast_box_type() + def transform(self, results: dict) -> dict: + img = results['img'] + height = img.shape[0] + self.border[1] * 2 + width = img.shape[1] + self.border[0] * 2 + + warp_matrix = self._get_random_homography_matrix(height, width) + + img = cv2.warpPerspective( + img, + warp_matrix, + dsize=(width, height), + borderValue=self.border_val) + results['img'] = img + results['img_shape'] = img.shape[:2] + + bboxes = results['gt_bboxes'] + num_bboxes = len(bboxes) + if num_bboxes: + bboxes.project_(warp_matrix) + if self.bbox_clip_border: + bboxes.clip_([height, width]) + # remove outside bbox + valid_index = bboxes.is_inside([height, width]).numpy() + results['gt_bboxes'] = bboxes[valid_index] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_index] + results['gt_ignore_flags'] = results['gt_ignore_flags'][ + valid_index] + + if 'gt_masks' in results: + raise NotImplementedError('RandomAffine only supports bbox.') + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(max_rotate_degree={self.max_rotate_degree}, ' + repr_str += f'max_translate_ratio={self.max_translate_ratio}, ' + repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, ' + repr_str += f'max_shear_degree={self.max_shear_degree}, ' + repr_str += f'border={self.border}, ' + repr_str += f'border_val={self.border_val}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + @staticmethod + def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray: + radian = math.radians(rotate_degrees) + rotation_matrix = np.array( + [[np.cos(radian), -np.sin(radian), 0.], + [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]], + dtype=np.float32) + return rotation_matrix + + @staticmethod + def 
_get_scaling_matrix(scale_ratio: float) -> np.ndarray: + scaling_matrix = np.array( + [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]], + dtype=np.float32) + return scaling_matrix + + @staticmethod + def _get_shear_matrix(x_shear_degrees: float, + y_shear_degrees: float) -> np.ndarray: + x_radian = math.radians(x_shear_degrees) + y_radian = math.radians(y_shear_degrees) + shear_matrix = np.array([[1, np.tan(x_radian), 0.], + [np.tan(y_radian), 1, 0.], [0., 0., 1.]], + dtype=np.float32) + return shear_matrix + + @staticmethod + def _get_translation_matrix(x: float, y: float) -> np.ndarray: + translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]], + dtype=np.float32) + return translation_matrix + + +@TRANSFORMS.register_module() +class YOLOXHSVRandomAug(BaseTransform): + """Apply HSV augmentation to image sequentially. It is referenced from + https://github.com/Megvii- + BaseDetection/YOLOX/blob/main/yolox/data/data_augment.py#L21. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + hue_delta (int): delta of hue. Defaults to 5. + saturation_delta (int): delta of saturation. Defaults to 30. + value_delta (int): delat of value. Defaults to 30. + """ + + def __init__(self, + hue_delta: int = 5, + saturation_delta: int = 30, + value_delta: int = 30) -> None: + self.hue_delta = hue_delta + self.saturation_delta = saturation_delta + self.value_delta = value_delta + + @cache_randomness + def _get_hsv_gains(self): + hsv_gains = np.random.uniform(-1, 1, 3) * [ + self.hue_delta, self.saturation_delta, self.value_delta + ] + # random selection of h, s, v + hsv_gains *= np.random.randint(0, 2, 3) + # prevent overflow + hsv_gains = hsv_gains.astype(np.int16) + return hsv_gains + + def transform(self, results: dict) -> dict: + img = results['img'] + hsv_gains = self._get_hsv_gains() + img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16) + + img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180 + img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255) + img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255) + cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img) + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class CopyPaste(BaseTransform): + """Simple Copy-Paste is a Strong Data Augmentation Method for Instance + Segmentation The simple copy-paste transform steps are as follows: + + 1. The destination image is already resized with aspect ratio kept, + cropped and padded. + 2. Randomly select a source image, which is also already resized + with aspect ratio kept, cropped and padded in a similar way + as the destination image. + 3. Randomly select some objects from the source image. + 4. Paste these source objects to the destination image directly, + due to the source and destination image have the same size. + 5. Update object masks of the destination image, for some origin objects + may be occluded. + 6. Generate bboxes from the updated destination masks and + filter some objects which are totally occluded, and adjust bboxes + which are partly occluded. + 7. Append selected source bboxes, masks, and labels. 
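+
+    Like mosaic-style transforms, this transform expects the source image to
+    be provided through ``results['mix_results']``, typically by a
+    multi-image dataset wrapper. A hedged sketch (``base_dataset`` is a
+    placeholder for a dataset config whose own pipeline already performs the
+    resize/crop/pad described in steps 1-2):
+
+    .. code-block:: python
+
+        train_dataset = dict(
+            type='MultiImageMixDataset',
+            dataset=base_dataset,
+            pipeline=[
+                dict(type='CopyPaste', max_num_pasted=100),
+            ])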
+ + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (BitmapMasks) (optional) + + Modified Keys: + + - img + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + - gt_masks (optional) + + Args: + max_num_pasted (int): The maximum number of pasted objects. + Defaults to 100. + bbox_occluded_thr (int): The threshold of occluded bbox. + Defaults to 10. + mask_occluded_thr (int): The threshold of occluded mask. + Defaults to 300. + selected (bool): Whether select objects or not. If select is False, + all objects of the source image will be pasted to the + destination image. + Defaults to True. + paste_by_box (bool): Whether use boxes as masks when masks are not + available. + Defaults to False. + """ + + def __init__( + self, + max_num_pasted: int = 100, + bbox_occluded_thr: int = 10, + mask_occluded_thr: int = 300, + selected: bool = True, + paste_by_box: bool = False, + ) -> None: + self.max_num_pasted = max_num_pasted + self.bbox_occluded_thr = bbox_occluded_thr + self.mask_occluded_thr = mask_occluded_thr + self.selected = selected + self.paste_by_box = paste_by_box + + @cache_randomness + def get_indexes(self, dataset: BaseDataset) -> int: + """Call function to collect indexes.s. + + Args: + dataset (:obj:`MultiImageMixDataset`): The dataset. + Returns: + list: Indexes. + """ + return random.randint(0, len(dataset)) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to make a copy-paste of image. + + Args: + results (dict): Result dict. + Returns: + dict: Result dict with copy-paste transformed. + """ + + assert 'mix_results' in results + num_images = len(results['mix_results']) + assert num_images == 1, \ + f'CopyPaste only supports processing 2 images, got {num_images}' + if self.selected: + selected_results = self._select_object(results['mix_results'][0]) + else: + selected_results = results['mix_results'][0] + return self._copy_paste(results, selected_results) + + @cache_randomness + def _get_selected_inds(self, num_bboxes: int) -> np.ndarray: + max_num_pasted = min(num_bboxes + 1, self.max_num_pasted) + num_pasted = np.random.randint(0, max_num_pasted) + return np.random.choice(num_bboxes, size=num_pasted, replace=False) + + def get_gt_masks(self, results: dict) -> BitmapMasks: + """Get gt_masks originally or generated based on bboxes. + + If gt_masks is not contained in results, + it will be generated based on gt_bboxes. + Args: + results (dict): Result dict. + Returns: + BitmapMasks: gt_masks, originally or generated based on bboxes. 
+ """ + if results.get('gt_masks', None) is not None: + if self.paste_by_box: + warnings.warn('gt_masks is already contained in results, ' + 'so paste_by_box is disabled.') + return results['gt_masks'] + else: + if not self.paste_by_box: + raise RuntimeError('results does not contain masks.') + return results['gt_bboxes'].create_masks(results['img'].shape[:2]) + + def _select_object(self, results: dict) -> dict: + """Select some objects from the source results.""" + bboxes = results['gt_bboxes'] + labels = results['gt_bboxes_labels'] + masks = self.get_gt_masks(results) + ignore_flags = results['gt_ignore_flags'] + + selected_inds = self._get_selected_inds(bboxes.shape[0]) + + selected_bboxes = bboxes[selected_inds] + selected_labels = labels[selected_inds] + selected_masks = masks[selected_inds] + selected_ignore_flags = ignore_flags[selected_inds] + + results['gt_bboxes'] = selected_bboxes + results['gt_bboxes_labels'] = selected_labels + results['gt_masks'] = selected_masks + results['gt_ignore_flags'] = selected_ignore_flags + return results + + def _copy_paste(self, dst_results: dict, src_results: dict) -> dict: + """CopyPaste transform function. + + Args: + dst_results (dict): Result dict of the destination image. + src_results (dict): Result dict of the source image. + Returns: + dict: Updated result dict. + """ + dst_img = dst_results['img'] + dst_bboxes = dst_results['gt_bboxes'] + dst_labels = dst_results['gt_bboxes_labels'] + dst_masks = self.get_gt_masks(dst_results) + dst_ignore_flags = dst_results['gt_ignore_flags'] + + src_img = src_results['img'] + src_bboxes = src_results['gt_bboxes'] + src_labels = src_results['gt_bboxes_labels'] + src_masks = src_results['gt_masks'] + src_ignore_flags = src_results['gt_ignore_flags'] + + if len(src_bboxes) == 0: + return dst_results + + # update masks and generate bboxes from updated masks + composed_mask = np.where(np.any(src_masks.masks, axis=0), 1, 0) + updated_dst_masks = self._get_updated_masks(dst_masks, composed_mask) + updated_dst_bboxes = updated_dst_masks.get_bboxes(type(dst_bboxes)) + assert len(updated_dst_bboxes) == len(updated_dst_masks) + + # filter totally occluded objects + l1_distance = (updated_dst_bboxes.tensor - dst_bboxes.tensor).abs() + bboxes_inds = (l1_distance <= self.bbox_occluded_thr).all( + dim=-1).numpy() + masks_inds = updated_dst_masks.masks.sum( + axis=(1, 2)) > self.mask_occluded_thr + valid_inds = bboxes_inds | masks_inds + + # Paste source objects to destination image directly + img = dst_img * (1 - composed_mask[..., np.newaxis] + ) + src_img * composed_mask[..., np.newaxis] + bboxes = src_bboxes.cat([updated_dst_bboxes[valid_inds], src_bboxes]) + labels = np.concatenate([dst_labels[valid_inds], src_labels]) + masks = np.concatenate( + [updated_dst_masks.masks[valid_inds], src_masks.masks]) + ignore_flags = np.concatenate( + [dst_ignore_flags[valid_inds], src_ignore_flags]) + + dst_results['img'] = img + dst_results['gt_bboxes'] = bboxes + dst_results['gt_bboxes_labels'] = labels + dst_results['gt_masks'] = BitmapMasks(masks, masks.shape[1], + masks.shape[2]) + dst_results['gt_ignore_flags'] = ignore_flags + + return dst_results + + def _get_updated_masks(self, masks: BitmapMasks, + composed_mask: np.ndarray) -> BitmapMasks: + """Update masks with composed mask.""" + assert masks.masks.shape[-2:] == composed_mask.shape[-2:], \ + 'Cannot compare two arrays of different size' + masks.masks = np.where(composed_mask, 0, masks.masks) + return masks + + def __repr__(self): + repr_str = 
self.__class__.__name__ + repr_str += f'(max_num_pasted={self.max_num_pasted}, ' + repr_str += f'bbox_occluded_thr={self.bbox_occluded_thr}, ' + repr_str += f'mask_occluded_thr={self.mask_occluded_thr}, ' + repr_str += f'selected={self.selected}), ' + repr_str += f'paste_by_box={self.paste_by_box})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomErasing(BaseTransform): + """RandomErasing operation. + + Random Erasing randomly selects a rectangle region + in an image and erases its pixels with random values. + `RandomErasing `_. + + Required Keys: + + - img + - gt_bboxes (HorizontalBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (BitmapMasks) (optional) + + Modified Keys: + - img + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + - gt_masks (optional) + + Args: + n_patches (int or tuple[int, int]): Number of regions to be dropped. + If it is given as a tuple, number of patches will be randomly + selected from the closed interval [``n_patches[0]``, + ``n_patches[1]``]. + ratio (float or tuple[float, float]): The ratio of erased regions. + It can be ``float`` to use a fixed ratio or ``tuple[float, float]`` + to randomly choose ratio from the interval. + squared (bool): Whether to erase square region. Defaults to True. + bbox_erased_thr (float): The threshold for the maximum area proportion + of the bbox to be erased. When the proportion of the area where the + bbox is erased is greater than the threshold, the bbox will be + removed. Defaults to 0.9. + img_border_value (int or float or tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. 
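For orientation only, a hedged sketch of where this transform might sit in a training pipeline config; the surrounding transforms and the chosen values are illustrative, not taken from this repository:

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', prob=0.5),
    dict(type='RandomErasing', n_patches=(1, 5), ratio=(0, 0.2)),
    dict(type='PackDetInputs'),
]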
+ """ + + def __init__( + self, + n_patches: Union[int, Tuple[int, int]], + ratio: Union[float, Tuple[float, float]], + squared: bool = True, + bbox_erased_thr: float = 0.9, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + ) -> None: + if isinstance(n_patches, tuple): + assert len(n_patches) == 2 and 0 <= n_patches[0] < n_patches[1] + else: + n_patches = (n_patches, n_patches) + if isinstance(ratio, tuple): + assert len(ratio) == 2 and 0 <= ratio[0] < ratio[1] <= 1 + else: + ratio = (ratio, ratio) + + self.n_patches = n_patches + self.ratio = ratio + self.squared = squared + self.bbox_erased_thr = bbox_erased_thr + self.img_border_value = img_border_value + self.mask_border_value = mask_border_value + self.seg_ignore_label = seg_ignore_label + + @cache_randomness + def _get_patches(self, img_shape: Tuple[int, int]) -> List[list]: + """Get patches for random erasing.""" + patches = [] + n_patches = np.random.randint(self.n_patches[0], self.n_patches[1] + 1) + for _ in range(n_patches): + if self.squared: + ratio = np.random.random() * (self.ratio[1] - + self.ratio[0]) + self.ratio[0] + ratio = (ratio, ratio) + else: + ratio = (np.random.random() * (self.ratio[1] - self.ratio[0]) + + self.ratio[0], np.random.random() * + (self.ratio[1] - self.ratio[0]) + self.ratio[0]) + ph, pw = int(img_shape[0] * ratio[0]), int(img_shape[1] * ratio[1]) + px1, py1 = np.random.randint(0, + img_shape[1] - pw), np.random.randint( + 0, img_shape[0] - ph) + px2, py2 = px1 + pw, py1 + ph + patches.append([px1, py1, px2, py2]) + return np.array(patches) + + def _transform_img(self, results: dict, patches: List[list]) -> None: + """Random erasing the image.""" + for patch in patches: + px1, py1, px2, py2 = patch + results['img'][py1:py2, px1:px2, :] = self.img_border_value + + def _transform_bboxes(self, results: dict, patches: List[list]) -> None: + """Random erasing the bboxes.""" + bboxes = results['gt_bboxes'] + # TODO: unify the logic by using operators in BaseBoxes. 
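# The block below intersects every gt bbox with every erased patch, sums
# the overlapped area per bbox, and drops boxes whose erased fraction
# exceeds ``bbox_erased_thr``; labels, ignore flags and (if present)
# masks are filtered with the same validity mask.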
+ assert isinstance(bboxes, HorizontalBoxes) + bboxes = bboxes.numpy() + left_top = np.maximum(bboxes[:, None, :2], patches[:, :2]) + right_bottom = np.minimum(bboxes[:, None, 2:], patches[:, 2:]) + wh = np.maximum(right_bottom - left_top, 0) + inter_areas = wh[:, :, 0] * wh[:, :, 1] + bbox_areas = (bboxes[:, 2] - bboxes[:, 0]) * ( + bboxes[:, 3] - bboxes[:, 1]) + bboxes_erased_ratio = inter_areas.sum(-1) / (bbox_areas + 1e-7) + valid_inds = bboxes_erased_ratio < self.bbox_erased_thr + results['gt_bboxes'] = HorizontalBoxes(bboxes[valid_inds]) + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][valid_inds] + results['gt_ignore_flags'] = results['gt_ignore_flags'][valid_inds] + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][valid_inds] + + def _transform_masks(self, results: dict, patches: List[list]) -> None: + """Random erasing the masks.""" + for patch in patches: + px1, py1, px2, py2 = patch + results['gt_masks'].masks[:, py1:py2, + px1:px2] = self.mask_border_value + + def _transform_seg(self, results: dict, patches: List[list]) -> None: + """Random erasing the segmentation map.""" + for patch in patches: + px1, py1, px2, py2 = patch + results['gt_seg_map'][py1:py2, px1:px2] = self.seg_ignore_label + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to erase some regions of image.""" + patches = self._get_patches(results['img_shape']) + self._transform_img(results, patches) + if results.get('gt_bboxes', None) is not None: + self._transform_bboxes(results, patches) + if results.get('gt_masks', None) is not None: + self._transform_masks(results, patches) + if results.get('gt_seg_map', None) is not None: + self._transform_seg(results, patches) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(n_patches={self.n_patches}, ' + repr_str += f'ratio={self.ratio}, ' + repr_str += f'squared={self.squared}, ' + repr_str += f'bbox_erased_thr={self.bbox_erased_thr}, ' + repr_str += f'img_border_value={self.img_border_value}, ' + repr_str += f'mask_border_value={self.mask_border_value}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label})' + return repr_str + + +@TRANSFORMS.register_module() +class CachedMosaic(Mosaic): + """Cached mosaic augmentation. + + Cached mosaic transform will random select images from the cache + and combine them into one output image. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | pad | + | +-----------+ | + | | | | + | | image1 |--------+ | + | | | | | + | | | image2 | | + center_y |----+-------------+-----------| + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The cached mosaic transform steps are as follows: + + 1. Append the results from the last transform into the cache. + 2. Choose the mosaic center as the intersections of 4 images + 3. Get the left top image according to the index, and randomly + sample another 3 images from the result cache. + 4. Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (np.float32) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size before mosaic pipeline of single + image. 
The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + """ + + def __init__(self, + *args, + max_cached_images: int = 40, + random_pop: bool = True, + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.results_cache = [] + self.random_pop = random_pop + assert max_cached_images >= 4, 'The length of cache must >= 4, ' \ + f'but got {max_cached_images}.' + self.max_cached_images = max_cached_images + + @cache_randomness + def get_indexes(self, cache: list) -> list: + """Call function to collect indexes. + + Args: + cache (list): The results cache. + + Returns: + list: indexes. + """ + + indexes = [random.randint(0, len(cache) - 1) for _ in range(3)] + return indexes + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Mosaic transform function. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. + """ + # cache and pop images + self.results_cache.append(copy.deepcopy(results)) + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 4: + return results + + if random.uniform(0, 1) > self.prob: + return results + indices = self.get_indexes(self.results_cache) + mix_results = [copy.deepcopy(self.results_cache[i]) for i in indices] + + # TODO: refactor mosaic to reuse these code. 
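# The block below follows the same recipe as ``Mosaic.transform``: build a
# canvas of twice ``img_scale`` filled with ``pad_val``, sample the mosaic
# centre from ``center_ratio_range``, then keep-ratio resize the current
# result and the three cached results and paste them into the four
# quadrants around that centre, translating their bboxes, labels, ignore
# flags and (if present) masks by the corresponding paste offsets.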
+ mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + mosaic_masks = [] + with_mask = True if 'gt_masks' in results else False + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full( + (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int( + random.uniform(*self.center_ratio_range) * self.img_scale[0]) + center_y = int( + random.uniform(*self.center_ratio_range) * self.img_scale[1]) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = copy.deepcopy(results) + else: + results_patch = copy.deepcopy(mix_results[i - 1]) + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(self.img_scale[1] / h_i, + self.img_scale[0] / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + if with_mask and results_patch.get('gt_masks', None) is not None: + gt_masks_i = results_patch['gt_masks'] + gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i)) + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padw, + direction='horizontal') + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padh, + direction='vertical') + mosaic_masks.append(gt_masks_i) + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]]) + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape[:2] + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks) + results['gt_masks'] = mosaic_masks[inside_inds] + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += 
f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob}, ' + repr_str += f'max_cached_images={self.max_cached_images}, ' + repr_str += f'random_pop={self.random_pop})' + return repr_str + + +@TRANSFORMS.register_module() +class CachedMixUp(BaseTransform): + """Cached mixup data augmentation. + + .. code:: text + + mixup transform + +------------------------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + |---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | |-----------------+ | + | pad | + +------------------------------+ + + The cached mixup transform steps are as follows: + + 1. Append the results from the last transform into the cache. + 2. Another random image is picked from the cache and embedded in + the top left patch(after padding and resizing) + 3. The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (np.float32) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + max_iters (int): The maximum number of iterations. If the number of + iterations is greater than `max_iters`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + max_iters: int = 15, + bbox_clip_border: bool = True, + max_cached_images: int = 20, + random_pop: bool = True, + prob: float = 1.0) -> None: + assert isinstance(img_scale, tuple) + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + self.dynamic_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.max_iters = max_iters + self.bbox_clip_border = bbox_clip_border + self.results_cache = [] + + self.max_cached_images = max_cached_images + self.random_pop = random_pop + self.prob = prob + + @cache_randomness + def get_indexes(self, cache: list) -> int: + """Call function to collect indexes. + + Args: + cache (list): The result cache. + + Returns: + int: index. 
+ """ + + for i in range(self.max_iters): + index = random.randint(0, len(cache) - 1) + gt_bboxes_i = cache[index]['gt_bboxes'] + if len(gt_bboxes_i) != 0: + break + return index + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """MixUp transform function. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. + """ + # cache and pop images + self.results_cache.append(copy.deepcopy(results)) + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 1: + return results + + if random.uniform(0, 1) > self.prob: + return results + + index = self.get_indexes(self.results_cache) + retrieve_results = copy.deepcopy(self.results_cache[index]) + + # TODO: refactor mixup to reuse these code. + if retrieve_results['gt_bboxes'].shape[0] == 0: + # empty bbox + return results + + retrieve_img = retrieve_results['img'] + with_mask = True if 'gt_masks' in results else False + + jit_factor = random.uniform(*self.ratio_range) + is_flip = random.uniform(0, 1) > self.flip_ratio + + if len(retrieve_img.shape) == 3: + out_img = np.ones( + (self.dynamic_scale[1], self.dynamic_scale[0], 3), + dtype=retrieve_img.dtype) * self.pad_val + else: + out_img = np.ones( + self.dynamic_scale[::-1], + dtype=retrieve_img.dtype) * self.pad_val + + # 1. keep_ratio resize + scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0], + self.dynamic_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_flip: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if with_mask: + retrieve_gt_masks = retrieve_results['gt_masks'].rescale( + scale_ratio) + + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_flip: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + if with_mask: + retrieve_gt_masks = retrieve_gt_masks.flip() + + # 7. 
filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if with_mask: + retrieve_gt_masks = retrieve_gt_masks.translate( + out_shape=(target_h, target_w), + offset=-x_offset, + direction='horizontal') + retrieve_gt_masks = retrieve_gt_masks.translate( + out_shape=(target_h, target_w), + offset=-y_offset, + direction='vertical') + + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. mix up + ori_img = ori_img.astype(np.float32) + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32) + + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat( + (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + if with_mask: + mixup_gt_masks = retrieve_gt_masks.cat( + [results['gt_masks'], retrieve_gt_masks]) + + # remove outside bbox + inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy() + mixup_gt_bboxes = mixup_gt_bboxes[inside_inds] + mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] + mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + if with_mask: + mixup_gt_masks = mixup_gt_masks[inside_inds] + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape[:2] + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + if with_mask: + results['gt_masks'] = mixup_gt_masks + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(dynamic_scale={self.dynamic_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'max_iters={self.max_iters}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border}, ' + repr_str += f'max_cached_images={self.max_cached_images}, ' + repr_str += f'random_pop={self.random_pop}, ' + repr_str += f'prob={self.prob})' + return repr_str diff --git a/mmdetection/mmdet/datasets/transforms/wrappers.py b/mmdetection/mmdet/datasets/transforms/wrappers.py new file mode 100644 index 0000000..3a17711 --- /dev/null +++ b/mmdetection/mmdet/datasets/transforms/wrappers.py @@ -0,0 +1,277 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +from mmcv.transforms import BaseTransform, Compose +from mmcv.transforms.utils import cache_random_params, cache_randomness + +from mmdet.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class MultiBranch(BaseTransform): + r"""Multiple branch pipeline wrapper. + + Generate multiple data-augmented versions of the same image. + `MultiBranch` needs to specify the branch names of all + pipelines of the dataset, perform corresponding data augmentation + for the current branch, and return None for other branches, + which ensures the consistency of return format across + different samples. + + Args: + branch_field (list): List of branch names. + branch_pipelines (dict): Dict of different pipeline configs + to be composed. 
+ + Examples: + >>> branch_field = ['sup', 'unsup_teacher', 'unsup_student'] + >>> sup_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadAnnotations', with_bbox=True), + >>> dict(type='Resize', scale=(1333, 800), keep_ratio=True), + >>> dict(type='RandomFlip', prob=0.5), + >>> dict( + >>> type='MultiBranch', + >>> branch_field=branch_field, + >>> sup=dict(type='PackDetInputs')) + >>> ] + >>> weak_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadAnnotations', with_bbox=True), + >>> dict(type='Resize', scale=(1333, 800), keep_ratio=True), + >>> dict(type='RandomFlip', prob=0.0), + >>> dict( + >>> type='MultiBranch', + >>> branch_field=branch_field, + >>> sup=dict(type='PackDetInputs')) + >>> ] + >>> strong_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadAnnotations', with_bbox=True), + >>> dict(type='Resize', scale=(1333, 800), keep_ratio=True), + >>> dict(type='RandomFlip', prob=1.0), + >>> dict( + >>> type='MultiBranch', + >>> branch_field=branch_field, + >>> sup=dict(type='PackDetInputs')) + >>> ] + >>> unsup_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadEmptyAnnotations'), + >>> dict( + >>> type='MultiBranch', + >>> branch_field=branch_field, + >>> unsup_teacher=weak_pipeline, + >>> unsup_student=strong_pipeline) + >>> ] + >>> from mmcv.transforms import Compose + >>> sup_branch = Compose(sup_pipeline) + >>> unsup_branch = Compose(unsup_pipeline) + >>> print(sup_branch) + >>> Compose( + >>> LoadImageFromFile(ignore_empty=False, to_float32=False, color_type='color', imdecode_backend='cv2') # noqa + >>> LoadAnnotations(with_bbox=True, with_label=True, with_mask=False, with_seg=False, poly2mask=True, imdecode_backend='cv2') # noqa + >>> Resize(scale=(1333, 800), scale_factor=None, keep_ratio=True, clip_object_border=True), backend=cv2), interpolation=bilinear) # noqa + >>> RandomFlip(prob=0.5, direction=horizontal) + >>> MultiBranch(branch_pipelines=['sup']) + >>> ) + >>> print(unsup_branch) + >>> Compose( + >>> LoadImageFromFile(ignore_empty=False, to_float32=False, color_type='color', imdecode_backend='cv2') # noqa + >>> LoadEmptyAnnotations(with_bbox=True, with_label=True, with_mask=False, with_seg=False, seg_ignore_label=255) # noqa + >>> MultiBranch(branch_pipelines=['unsup_teacher', 'unsup_student']) + >>> ) + """ + + def __init__(self, branch_field: List[str], + **branch_pipelines: dict) -> None: + self.branch_field = branch_field + self.branch_pipelines = { + branch: Compose(pipeline) + for branch, pipeline in branch_pipelines.items() + } + + def transform(self, results: dict) -> dict: + """Transform function to apply transforms sequentially. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: + + - 'inputs' (Dict[str, obj:`torch.Tensor`]): The forward data of + models from different branches. + - 'data_sample' (Dict[str,obj:`DetDataSample`]): The annotation + info of the sample from different branches. + """ + + multi_results = {} + for branch in self.branch_field: + multi_results[branch] = {'inputs': None, 'data_samples': None} + for branch, pipeline in self.branch_pipelines.items(): + branch_results = pipeline(copy.deepcopy(results)) + # If one branch pipeline returns None, + # it will sample another data from dataset. 
+ if branch_results is None: + return None + multi_results[branch] = branch_results + + format_results = {} + for branch, results in multi_results.items(): + for key in results.keys(): + if format_results.get(key, None) is None: + format_results[key] = {branch: results[key]} + else: + format_results[key][branch] = results[key] + return format_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(branch_pipelines={list(self.branch_pipelines.keys())})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomOrder(Compose): + """Shuffle the transform Sequence.""" + + @cache_randomness + def _random_permutation(self): + return np.random.permutation(len(self.transforms)) + + def transform(self, results: Dict) -> Optional[Dict]: + """Transform function to apply transforms in random order. + + Args: + results (dict): A result dict contains the results to transform. + + Returns: + dict or None: Transformed results. + """ + inds = self._random_permutation() + for idx in inds: + t = self.transforms[idx] + results = t(results) + if results is None: + return None + return results + + def __repr__(self): + """Compute the string representation.""" + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += f'{t.__class__.__name__}, ' + format_string += ')' + return format_string + + +@TRANSFORMS.register_module() +class ProposalBroadcaster(BaseTransform): + """A transform wrapper to apply the wrapped transforms to process both + `gt_bboxes` and `proposals` without adding any codes. It will do the + following steps: + + 1. Scatter the broadcasting targets to a list of inputs of the wrapped + transforms. The type of the list should be list[dict, dict], which + the first is the original inputs, the second is the processing + results that `gt_bboxes` being rewritten by the `proposals`. + 2. Apply ``self.transforms``, with same random parameters, which is + sharing with a context manager. The type of the outputs is a + list[dict, dict]. + 3. Gather the outputs, update the `proposals` in the first item of + the outputs with the `gt_bboxes` in the second . + + Args: + transforms (list, optional): Sequence of transform + object or config dict to be wrapped. Defaults to []. + + Note: The `TransformBroadcaster` in MMCV can achieve the same operation as + `ProposalBroadcaster`, but need to set more complex parameters. + + Examples: + >>> pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadProposals', num_max_proposals=2000), + >>> dict(type='LoadAnnotations', with_bbox=True), + >>> dict( + >>> type='ProposalBroadcaster', + >>> transforms=[ + >>> dict(type='Resize', scale=(1333, 800), + >>> keep_ratio=True), + >>> dict(type='RandomFlip', prob=0.5), + >>> ]), + >>> dict(type='PackDetInputs')] + """ + + def __init__(self, transforms: List[Union[dict, Callable]] = []) -> None: + self.transforms = Compose(transforms) + + def transform(self, results: dict) -> dict: + """Apply wrapped transform functions to process both `gt_bboxes` and + `proposals`. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Updated result dict. + """ + assert results.get('proposals', None) is not None, \ + '`proposals` should be in the results, please delete ' \ + '`ProposalBroadcaster` in your configs, or check whether ' \ + 'you have load proposals successfully.' 
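# Pipeline of the three helpers called below: ``_process_input`` duplicates
# the results with ``proposals`` written into ``gt_bboxes``,
# ``_apply_transforms`` runs the wrapped transforms on both copies under
# ``cache_random_params`` so they share identical random parameters, and
# ``_process_output`` copies the transformed duplicate's ``gt_bboxes`` back
# into ``proposals``.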
+ + inputs = self._process_input(results) + outputs = self._apply_transforms(inputs) + outputs = self._process_output(outputs) + return outputs + + def _process_input(self, data: dict) -> list: + """Scatter the broadcasting targets to a list of inputs of the wrapped + transforms. + + Args: + data (dict): The original input data. + + Returns: + list[dict]: A list of input data. + """ + cp_data = copy.deepcopy(data) + cp_data['gt_bboxes'] = cp_data['proposals'] + scatters = [data, cp_data] + return scatters + + def _apply_transforms(self, inputs: list) -> list: + """Apply ``self.transforms``. + + Args: + inputs (list[dict, dict]): list of input data. + + Returns: + list[dict]: The output of the wrapped pipeline. + """ + assert len(inputs) == 2 + ctx = cache_random_params + with ctx(self.transforms): + output_scatters = [self.transforms(_input) for _input in inputs] + return output_scatters + + def _process_output(self, output_scatters: list) -> dict: + """Gathering and renaming data items. + + Args: + output_scatters (list[dict, dict]): The output of the wrapped + pipeline. + + Returns: + dict: Updated result dict. + """ + assert isinstance(output_scatters, list) and \ + isinstance(output_scatters[0], dict) and \ + len(output_scatters) == 2 + outputs = output_scatters[0] + outputs['proposals'] = output_scatters[1]['gt_bboxes'] + return outputs diff --git a/mmdetection/mmdet/datasets/utils.py b/mmdetection/mmdet/datasets/utils.py new file mode 100644 index 0000000..d794eb4 --- /dev/null +++ b/mmdetection/mmdet/datasets/utils.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmcv.transforms import LoadImageFromFile + +from mmdet.datasets.transforms import LoadAnnotations, LoadPanopticAnnotations +from mmdet.registry import TRANSFORMS + + +def get_loading_pipeline(pipeline): + """Only keep loading image and annotations related configuration. + + Args: + pipeline (list[dict]): Data pipeline configs. + + Returns: + list[dict]: The new pipeline list with only keep + loading image and annotations related configuration. + + Examples: + >>> pipelines = [ + ... dict(type='LoadImageFromFile'), + ... dict(type='LoadAnnotations', with_bbox=True), + ... dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + ... dict(type='RandomFlip', flip_ratio=0.5), + ... dict(type='Normalize', **img_norm_cfg), + ... dict(type='Pad', size_divisor=32), + ... dict(type='DefaultFormatBundle'), + ... dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) + ... ] + >>> expected_pipelines = [ + ... dict(type='LoadImageFromFile'), + ... dict(type='LoadAnnotations', with_bbox=True) + ... ] + >>> assert expected_pipelines ==\ + ... get_loading_pipeline(pipelines) + """ + loading_pipeline_cfg = [] + for cfg in pipeline: + obj_cls = TRANSFORMS.get(cfg['type']) + # TODO:use more elegant way to distinguish loading modules + if obj_cls is not None and obj_cls in (LoadImageFromFile, + LoadAnnotations, + LoadPanopticAnnotations): + loading_pipeline_cfg.append(cfg) + assert len(loading_pipeline_cfg) == 2, \ + 'The data pipeline in your config file must include ' \ + 'loading image and annotations related pipeline.' + return loading_pipeline_cfg diff --git a/mmdetection/mmdet/datasets/v3det.py b/mmdetection/mmdet/datasets/v3det.py new file mode 100644 index 0000000..25bfe3b --- /dev/null +++ b/mmdetection/mmdet/datasets/v3det.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path +from typing import Optional + +import mmengine + +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class V3DetDataset(CocoDataset): + """Dataset for V3Det.""" + + METAINFO = { + 'classes': None, + 'palette': None, + } + + def __init__( + self, + *args, + metainfo: Optional[dict] = None, + data_root: str = '', + label_file='annotations/category_name_13204_v3det_2023_v1.txt', # noqa + **kwargs) -> None: + class_names = tuple( + mmengine.list_from_file(os.path.join(data_root, label_file))) + if metainfo is None: + metainfo = {'classes': class_names} + super().__init__( + *args, data_root=data_root, metainfo=metainfo, **kwargs) diff --git a/mmdetection/mmdet/datasets/voc.py b/mmdetection/mmdet/datasets/voc.py new file mode 100644 index 0000000..65e73f2 --- /dev/null +++ b/mmdetection/mmdet/datasets/voc.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import DATASETS +from .xml_style import XMLDataset + + +@DATASETS.register_module() +class VOCDataset(XMLDataset): + """Dataset for PASCAL VOC.""" + + METAINFO = { + 'classes': + ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'), + # palette is a list of color tuples, which is used for visualization. + 'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192), + (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255), + (153, 69, 1), (120, 166, 157), (0, 182, 199), + (0, 226, 252), (182, 182, 255), (0, 0, 230), (220, 20, 60), + (163, 255, 0), (0, 82, 0), (3, 95, 161), (0, 80, 100), + (183, 130, 88)] + } + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if 'VOC2007' in self.sub_data_root: + self._metainfo['dataset_type'] = 'VOC2007' + elif 'VOC2012' in self.sub_data_root: + self._metainfo['dataset_type'] = 'VOC2012' + else: + self._metainfo['dataset_type'] = None diff --git a/mmdetection/mmdet/datasets/wider_face.py b/mmdetection/mmdet/datasets/wider_face.py new file mode 100644 index 0000000..62c7fff --- /dev/null +++ b/mmdetection/mmdet/datasets/wider_face.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import xml.etree.ElementTree as ET + +from mmengine.dist import is_main_process +from mmengine.fileio import get_local_path, list_from_file +from mmengine.utils import ProgressBar + +from mmdet.registry import DATASETS +from mmdet.utils.typing_utils import List, Union +from .xml_style import XMLDataset + + +@DATASETS.register_module() +class WIDERFaceDataset(XMLDataset): + """Reader for the WIDER Face dataset in PASCAL VOC format. + + Conversion scripts can be found in + https://github.com/sovrasov/wider-face-pascal-voc-annotations + """ + METAINFO = {'classes': ('face', ), 'palette': [(0, 255, 0)]} + + def load_data_list(self) -> List[dict]: + """Load annotation from XML style ann_file. + + Returns: + list[dict]: Annotation info from XML file. + """ + assert self._metainfo.get('classes', None) is not None, \ + 'classes in `XMLDataset` can not be None.' 
+ self.cat2label = { + cat: i + for i, cat in enumerate(self._metainfo['classes']) + } + + data_list = [] + img_ids = list_from_file(self.ann_file, backend_args=self.backend_args) + + # loading process takes around 10 mins + if is_main_process(): + prog_bar = ProgressBar(len(img_ids)) + + for img_id in img_ids: + raw_img_info = {} + raw_img_info['img_id'] = img_id + raw_img_info['file_name'] = f'{img_id}.jpg' + parsed_data_info = self.parse_data_info(raw_img_info) + data_list.append(parsed_data_info) + + if is_main_process(): + prog_bar.update() + return data_list + + def parse_data_info(self, img_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + img_info (dict): Raw image information, usually it includes + `img_id`, `file_name`, and `xml_path`. + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + data_info = {} + img_id = img_info['img_id'] + xml_path = osp.join(self.data_prefix['img'], 'Annotations', + f'{img_id}.xml') + data_info['img_id'] = img_id + data_info['xml_path'] = xml_path + + # deal with xml file + with get_local_path( + xml_path, backend_args=self.backend_args) as local_path: + raw_ann_info = ET.parse(local_path) + root = raw_ann_info.getroot() + size = root.find('size') + width = int(size.find('width').text) + height = int(size.find('height').text) + folder = root.find('folder').text + img_path = osp.join(self.data_prefix['img'], folder, + img_info['file_name']) + data_info['img_path'] = img_path + + data_info['height'] = height + data_info['width'] = width + + # Coordinates are in range [0, width - 1 or height - 1] + data_info['instances'] = self._parse_instance_info( + raw_ann_info, minus_one=False) + return data_info diff --git a/mmdetection/mmdet/datasets/xml_style.py b/mmdetection/mmdet/datasets/xml_style.py new file mode 100644 index 0000000..06045ea --- /dev/null +++ b/mmdetection/mmdet/datasets/xml_style.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import xml.etree.ElementTree as ET +from typing import List, Optional, Union + +import mmcv +from mmengine.fileio import get, get_local_path, list_from_file + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class XMLDataset(BaseDetDataset): + """XML dataset for detection. + + Args: + img_subdir (str): Subdir where images are stored. Default: JPEGImages. + ann_subdir (str): Subdir where annotations are. Default: Annotations. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + """ + + def __init__(self, + img_subdir: str = 'JPEGImages', + ann_subdir: str = 'Annotations', + **kwargs) -> None: + self.img_subdir = img_subdir + self.ann_subdir = ann_subdir + super().__init__(**kwargs) + + @property + def sub_data_root(self) -> str: + """Return the sub data root.""" + return self.data_prefix.get('sub_data_root', '') + + def load_data_list(self) -> List[dict]: + """Load annotation from XML style ann_file. + + Returns: + list[dict]: Annotation info from XML file. + """ + assert self._metainfo.get('classes', None) is not None, \ + '`classes` in `XMLDataset` can not be None.' 
+ self.cat2label = { + cat: i + for i, cat in enumerate(self._metainfo['classes']) + } + + data_list = [] + img_ids = list_from_file(self.ann_file, backend_args=self.backend_args) + for img_id in img_ids: + file_name = osp.join(self.img_subdir, f'{img_id}.jpg') + xml_path = osp.join(self.sub_data_root, self.ann_subdir, + f'{img_id}.xml') + + raw_img_info = {} + raw_img_info['img_id'] = img_id + raw_img_info['file_name'] = file_name + raw_img_info['xml_path'] = xml_path + + parsed_data_info = self.parse_data_info(raw_img_info) + data_list.append(parsed_data_info) + return data_list + + @property + def bbox_min_size(self) -> Optional[int]: + """Return the minimum size of bounding boxes in the images.""" + if self.filter_cfg is not None: + return self.filter_cfg.get('bbox_min_size', None) + else: + return None + + def parse_data_info(self, img_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + img_info (dict): Raw image information, usually it includes + `img_id`, `file_name`, and `xml_path`. + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + data_info = {} + img_path = osp.join(self.sub_data_root, img_info['file_name']) + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['xml_path'] = img_info['xml_path'] + + # deal with xml file + with get_local_path( + img_info['xml_path'], + backend_args=self.backend_args) as local_path: + raw_ann_info = ET.parse(local_path) + root = raw_ann_info.getroot() + size = root.find('size') + if size is not None: + width = int(size.find('width').text) + height = int(size.find('height').text) + else: + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, backend='cv2') + height, width = img.shape[:2] + del img, img_bytes + + data_info['height'] = height + data_info['width'] = width + + data_info['instances'] = self._parse_instance_info( + raw_ann_info, minus_one=True) + + return data_info + + def _parse_instance_info(self, + raw_ann_info: ET, + minus_one: bool = True) -> List[dict]: + """parse instance information. + + Args: + raw_ann_info (ElementTree): ElementTree object. + minus_one (bool): Whether to subtract 1 from the coordinates. + Defaults to True. + + Returns: + List[dict]: List of instances. + """ + instances = [] + for obj in raw_ann_info.findall('object'): + instance = {} + name = obj.find('name').text + if name not in self._metainfo['classes']: + continue + difficult = obj.find('difficult') + difficult = 0 if difficult is None else int(difficult.text) + bnd_box = obj.find('bndbox') + bbox = [ + int(float(bnd_box.find('xmin').text)), + int(float(bnd_box.find('ymin').text)), + int(float(bnd_box.find('xmax').text)), + int(float(bnd_box.find('ymax').text)) + ] + + # VOC needs to subtract 1 from the coordinates + if minus_one: + bbox = [x - 1 for x in bbox] + + ignore = False + if self.bbox_min_size is not None: + assert not self.test_mode + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + if w < self.bbox_min_size or h < self.bbox_min_size: + ignore = True + if difficult or ignore: + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[name] + instances.append(instance) + return instances + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. 
+ """ + if self.test_mode: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) \ + if self.filter_cfg is not None else False + min_size = self.filter_cfg.get('min_size', 0) \ + if self.filter_cfg is not None else 0 + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and len(data_info['instances']) == 0: + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos diff --git a/mmdetection/mmdet/datasets/youtube_vis_dataset.py b/mmdetection/mmdet/datasets/youtube_vis_dataset.py new file mode 100644 index 0000000..38c3d39 --- /dev/null +++ b/mmdetection/mmdet/datasets/youtube_vis_dataset.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import DATASETS +from .base_video_dataset import BaseVideoDataset + + +@DATASETS.register_module() +class YouTubeVISDataset(BaseVideoDataset): + """YouTube VIS dataset for video instance segmentation. + + Args: + dataset_version (str): Select dataset year version. + """ + + def __init__(self, dataset_version: str, *args, **kwargs): + self.set_dataset_classes(dataset_version) + super().__init__(*args, **kwargs) + + @classmethod + def set_dataset_classes(cls, dataset_version: str) -> None: + """Pass the category of the corresponding year to metainfo. + + Args: + dataset_version (str): Select dataset year version. + """ + classes_2019_version = ('person', 'giant_panda', 'lizard', 'parrot', + 'skateboard', 'sedan', 'ape', 'dog', 'snake', + 'monkey', 'hand', 'rabbit', 'duck', 'cat', + 'cow', 'fish', 'train', 'horse', 'turtle', + 'bear', 'motorbike', 'giraffe', 'leopard', + 'fox', 'deer', 'owl', 'surfboard', 'airplane', + 'truck', 'zebra', 'tiger', 'elephant', + 'snowboard', 'boat', 'shark', 'mouse', 'frog', + 'eagle', 'earless_seal', 'tennis_racket') + + classes_2021_version = ('airplane', 'bear', 'bird', 'boat', 'car', + 'cat', 'cow', 'deer', 'dog', 'duck', + 'earless_seal', 'elephant', 'fish', + 'flying_disc', 'fox', 'frog', 'giant_panda', + 'giraffe', 'horse', 'leopard', 'lizard', + 'monkey', 'motorbike', 'mouse', 'parrot', + 'person', 'rabbit', 'shark', 'skateboard', + 'snake', 'snowboard', 'squirrel', 'surfboard', + 'tennis_racket', 'tiger', 'train', 'truck', + 'turtle', 'whale', 'zebra') + + if dataset_version == '2019': + cls.METAINFO = dict(classes=classes_2019_version) + elif dataset_version == '2021': + cls.METAINFO = dict(classes=classes_2021_version) + else: + raise NotImplementedError('Not supported YouTubeVIS dataset' + f'version: {dataset_version}') diff --git a/mmdetection/mmdet/engine/__init__.py b/mmdetection/mmdet/engine/__init__.py new file mode 100644 index 0000000..c91ace6 --- /dev/null +++ b/mmdetection/mmdet/engine/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401, F403 +from .optimizers import * # noqa: F401, F403 +from .runner import * # noqa: F401, F403 +from .schedulers import * # noqa: F401, F403 diff --git a/mmdetection/mmdet/engine/hooks/__init__.py b/mmdetection/mmdet/engine/hooks/__init__.py new file mode 100644 index 0000000..35f7688 --- /dev/null +++ b/mmdetection/mmdet/engine/hooks/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .checkloss_hook import CheckInvalidLossHook +from .mean_teacher_hook import MeanTeacherHook +from .memory_profiler_hook import MemoryProfilerHook +from .num_class_check_hook import NumClassCheckHook +from .pipeline_switch_hook import PipelineSwitchHook +from .set_epoch_info_hook import SetEpochInfoHook +from .sync_norm_hook import SyncNormHook +from .utils import trigger_visualization_hook +from .visualization_hook import DetVisualizationHook, TrackVisualizationHook +from .yolox_mode_switch_hook import YOLOXModeSwitchHook +from .submission_hook import SubmissionHook + +__all__ = [ + 'YOLOXModeSwitchHook', 'SyncNormHook', 'CheckInvalidLossHook', + 'SetEpochInfoHook', 'MemoryProfilerHook', 'DetVisualizationHook', + 'NumClassCheckHook', 'MeanTeacherHook', 'trigger_visualization_hook', + 'PipelineSwitchHook', 'TrackVisualizationHook', + 'SubmissionHook' +] diff --git a/mmdetection/mmdet/engine/hooks/checkloss_hook.py b/mmdetection/mmdet/engine/hooks/checkloss_hook.py new file mode 100644 index 0000000..3ebfcd5 --- /dev/null +++ b/mmdetection/mmdet/engine/hooks/checkloss_hook.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class CheckInvalidLossHook(Hook): + """Check invalid loss hook. + + This hook will regularly check whether the loss is valid + during training. + + Args: + interval (int): Checking interval (every k iterations). + Default: 50. + """ + + def __init__(self, interval: int = 50) -> None: + self.interval = interval + + def after_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[dict] = None) -> None: + """Regularly check whether the loss is valid every n iterations. + + Args: + runner (:obj:`Runner`): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict, Optional): Data from dataloader. + Defaults to None. + outputs (dict, Optional): Outputs from model. Defaults to None. + """ + if self.every_n_train_iters(runner, self.interval): + assert torch.isfinite(outputs['loss']), \ + runner.logger.info('loss become infinite or NaN!') diff --git a/mmdetection/mmdet/engine/hooks/mean_teacher_hook.py b/mmdetection/mmdet/engine/hooks/mean_teacher_hook.py new file mode 100644 index 0000000..b924c0a --- /dev/null +++ b/mmdetection/mmdet/engine/hooks/mean_teacher_hook.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper +from mmengine.runner import Runner + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class MeanTeacherHook(Hook): + """Mean Teacher Hook. + + Mean Teacher is an efficient semi-supervised learning method in + `Mean Teacher `_. + This method requires two models with exactly the same structure, + as the student model and the teacher model, respectively. + The student model updates the parameters through gradient descent, + and the teacher model updates the parameters through + exponential moving average of the student model. + Compared with the student model, the teacher model + is smoother and accumulates more knowledge. + + Args: + momentum (float): The momentum used for updating teacher's parameter. 
+ Teacher's parameter are updated with the formula: + `teacher = (1-momentum) * teacher + momentum * student`. + Defaults to 0.001. + interval (int): Update teacher's parameter every interval iteration. + Defaults to 1. + skip_buffers (bool): Whether to skip the model buffers, such as + batchnorm running stats (running_mean, running_var), it does not + perform the ema operation. Default to True. + """ + + def __init__(self, + momentum: float = 0.001, + interval: int = 1, + skip_buffer=True) -> None: + assert 0 < momentum < 1 + self.momentum = momentum + self.interval = interval + self.skip_buffers = skip_buffer + + def before_train(self, runner: Runner) -> None: + """To check that teacher model and student model exist.""" + model = runner.model + if is_model_wrapper(model): + model = model.module + assert hasattr(model, 'teacher') + assert hasattr(model, 'student') + # only do it at initial stage + if runner.iter == 0: + self.momentum_update(model, 1) + + def after_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[dict] = None) -> None: + """Update teacher's parameter every self.interval iterations.""" + if (runner.iter + 1) % self.interval != 0: + return + model = runner.model + if is_model_wrapper(model): + model = model.module + self.momentum_update(model, self.momentum) + + def momentum_update(self, model: nn.Module, momentum: float) -> None: + """Compute the moving average of the parameters using exponential + moving average.""" + if self.skip_buffers: + for (src_name, src_parm), (dst_name, dst_parm) in zip( + model.student.named_parameters(), + model.teacher.named_parameters()): + dst_parm.data.mul_(1 - momentum).add_( + src_parm.data, alpha=momentum) + else: + for (src_parm, + dst_parm) in zip(model.student.state_dict().values(), + model.teacher.state_dict().values()): + # exclude num_tracking + if dst_parm.dtype.is_floating_point: + dst_parm.data.mul_(1 - momentum).add_( + src_parm.data, alpha=momentum) diff --git a/mmdetection/mmdet/engine/hooks/memory_profiler_hook.py b/mmdetection/mmdet/engine/hooks/memory_profiler_hook.py new file mode 100644 index 0000000..3dcdcae --- /dev/null +++ b/mmdetection/mmdet/engine/hooks/memory_profiler_hook.py @@ -0,0 +1,121 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence + +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmdet.registry import HOOKS +from mmdet.structures import DetDataSample + + +@HOOKS.register_module() +class MemoryProfilerHook(Hook): + """Memory profiler hook recording memory information including virtual + memory, swap memory, and the memory of the current process. + + Args: + interval (int): Checking interval (every k iterations). + Default: 50. + """ + + def __init__(self, interval: int = 50) -> None: + try: + from psutil import swap_memory, virtual_memory + self._swap_memory = swap_memory + self._virtual_memory = virtual_memory + except ImportError: + raise ImportError('psutil is not installed, please install it by: ' + 'pip install psutil') + + try: + from memory_profiler import memory_usage + self._memory_usage = memory_usage + except ImportError: + raise ImportError( + 'memory_profiler is not installed, please install it by: ' + 'pip install memory_profiler') + + self.interval = interval + + def _record_memory_information(self, runner: Runner) -> None: + """Regularly record memory information. + + Args: + runner (:obj:`Runner`): The runner of the training or evaluation + process. 
+ """ + # in Byte + virtual_memory = self._virtual_memory() + swap_memory = self._swap_memory() + # in MB + process_memory = self._memory_usage()[0] + factor = 1024 * 1024 + runner.logger.info( + 'Memory information ' + 'available_memory: ' + f'{round(virtual_memory.available / factor)} MB, ' + 'used_memory: ' + f'{round(virtual_memory.used / factor)} MB, ' + f'memory_utilization: {virtual_memory.percent} %, ' + 'available_swap_memory: ' + f'{round((swap_memory.total - swap_memory.used) / factor)}' + ' MB, ' + f'used_swap_memory: {round(swap_memory.used / factor)} MB, ' + f'swap_memory_utilization: {swap_memory.percent} %, ' + 'current_process_memory: ' + f'{round(process_memory)} MB') + + def after_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[dict] = None) -> None: + """Regularly record memory information. + + Args: + runner (:obj:`Runner`): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict, optional): Data from dataloader. + Defaults to None. + outputs (dict, optional): Outputs from model. Defaults to None. + """ + if self.every_n_inner_iters(batch_idx, self.interval): + self._record_memory_information(runner) + + def after_val_iter( + self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[Sequence[DetDataSample]] = None) -> None: + """Regularly record memory information. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict, optional): Data from dataloader. + Defaults to None. + outputs (Sequence[:obj:`DetDataSample`], optional): + Outputs from model. Defaults to None. + """ + if self.every_n_inner_iters(batch_idx, self.interval): + self._record_memory_information(runner) + + def after_test_iter( + self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[Sequence[DetDataSample]] = None) -> None: + """Regularly record memory information. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict, optional): Data from dataloader. + Defaults to None. + outputs (Sequence[:obj:`DetDataSample`], optional): + Outputs from model. Defaults to None. + """ + if self.every_n_inner_iters(batch_idx, self.interval): + self._record_memory_information(runner) diff --git a/mmdetection/mmdet/engine/hooks/num_class_check_hook.py b/mmdetection/mmdet/engine/hooks/num_class_check_hook.py new file mode 100644 index 0000000..6588473 --- /dev/null +++ b/mmdetection/mmdet/engine/hooks/num_class_check_hook.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import VGG +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class NumClassCheckHook(Hook): + """Check whether the `num_classes` in head matches the length of `classes` + in `dataset.metainfo`.""" + + def _check_head(self, runner: Runner, mode: str) -> None: + """Check whether the `num_classes` in head matches the length of + `classes` in `dataset.metainfo`. + + Args: + runner (:obj:`Runner`): The runner of the training or evaluation + process. 
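# A small standalone sketch of the psutil / memory_profiler calls used by
# MemoryProfilerHook above (assumes both packages are installed):
import psutil
from memory_profiler import memory_usage

factor = 1024 * 1024  # bytes per MB
vm = psutil.virtual_memory()
sm = psutil.swap_memory()
print(f'available: {round(vm.available / factor)} MB, '
      f'used: {round(vm.used / factor)} MB ({vm.percent} %)')
print(f'swap used: {round(sm.used / factor)} MB ({sm.percent} %)')
print(f'current process: {round(memory_usage()[0])} MB')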
+ """ + assert mode in ['train', 'val'] + model = runner.model + dataset = runner.train_dataloader.dataset if mode == 'train' else \ + runner.val_dataloader.dataset + if dataset.metainfo.get('classes', None) is None: + runner.logger.warning( + f'Please set `classes` ' + f'in the {dataset.__class__.__name__} `metainfo` and' + f'check if it is consistent with the `num_classes` ' + f'of head') + else: + classes = dataset.metainfo['classes'] + assert type(classes) is not str, \ + (f'`classes` in {dataset.__class__.__name__}' + f'should be a tuple of str.' + f'Add comma if number of classes is 1 as ' + f'classes = ({classes},)') + from mmdet.models.roi_heads.mask_heads import FusedSemanticHead + for name, module in model.named_modules(): + if hasattr(module, 'num_classes') and not name.endswith( + 'rpn_head') and not isinstance( + module, (VGG, FusedSemanticHead)): + assert module.num_classes == len(classes), \ + (f'The `num_classes` ({module.num_classes}) in ' + f'{module.__class__.__name__} of ' + f'{model.__class__.__name__} does not matches ' + f'the length of `classes` ' + f'{len(classes)}) in ' + f'{dataset.__class__.__name__}') + + def before_train_epoch(self, runner: Runner) -> None: + """Check whether the training dataset is compatible with head. + + Args: + runner (:obj:`Runner`): The runner of the training or evaluation + process. + """ + self._check_head(runner, 'train') + + def before_val_epoch(self, runner: Runner) -> None: + """Check whether the dataset in val epoch is compatible with head. + + Args: + runner (:obj:`Runner`): The runner of the training or evaluation + process. + """ + self._check_head(runner, 'val') diff --git a/mmdetection/mmdet/engine/hooks/pipeline_switch_hook.py b/mmdetection/mmdet/engine/hooks/pipeline_switch_hook.py new file mode 100644 index 0000000..a5abd89 --- /dev/null +++ b/mmdetection/mmdet/engine/hooks/pipeline_switch_hook.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms import Compose +from mmengine.hooks import Hook + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class PipelineSwitchHook(Hook): + """Switch data pipeline at switch_epoch. + + Args: + switch_epoch (int): switch pipeline at this epoch. + switch_pipeline (list[dict]): the pipeline to switch to. + """ + + def __init__(self, switch_epoch, switch_pipeline): + self.switch_epoch = switch_epoch + self.switch_pipeline = switch_pipeline + self._restart_dataloader = False + self._has_switched = False + + def before_train_epoch(self, runner): + """switch pipeline.""" + epoch = runner.epoch + train_loader = runner.train_dataloader + if epoch >= self.switch_epoch and not self._has_switched: + runner.logger.info('Switch pipeline now!') + # The dataset pipeline cannot be updated when persistent_workers + # is True, so we need to force the dataloader's multi-process + # restart. This is a very hacky approach. + train_loader.dataset.pipeline = Compose(self.switch_pipeline) + if hasattr(train_loader, 'persistent_workers' + ) and train_loader.persistent_workers is True: + train_loader._DataLoader__initialized = False + train_loader._iterator = None + self._restart_dataloader = True + self._has_switched = True + else: + # Once the restart is complete, we need to restore + # the initialization flag. 
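# A hedged configuration sketch for PipelineSwitchHook above; the epoch
# number and the switched pipeline are illustrative, not taken from this
# repository:
custom_hooks = [
    dict(
        type='PipelineSwitchHook',
        switch_epoch=10,
        switch_pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True),
            dict(type='RandomFlip', prob=0.5),
            dict(type='PackDetInputs'),
        ])
]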
+            if self._restart_dataloader:
+                train_loader._DataLoader__initialized = True
diff --git a/mmdetection/mmdet/engine/hooks/set_epoch_info_hook.py b/mmdetection/mmdet/engine/hooks/set_epoch_info_hook.py
new file mode 100644
index 0000000..183f316
--- /dev/null
+++ b/mmdetection/mmdet/engine/hooks/set_epoch_info_hook.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.hooks import Hook
+from mmengine.model.wrappers import is_model_wrapper
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class SetEpochInfoHook(Hook):
+    """Set runner's epoch information to the model."""
+
+    def before_train_epoch(self, runner):
+        epoch = runner.epoch
+        model = runner.model
+        if is_model_wrapper(model):
+            model = model.module
+        model.set_epoch(epoch)
diff --git a/mmdetection/mmdet/engine/hooks/submission_hook.py b/mmdetection/mmdet/engine/hooks/submission_hook.py
new file mode 100644
index 0000000..4a4a87b
--- /dev/null
+++ b/mmdetection/mmdet/engine/hooks/submission_hook.py
@@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import warnings
+from typing import Optional, Sequence
+
+import mmcv
+from mmengine.fileio import get
+from mmengine.hooks import Hook
+from mmengine.runner import Runner
+from mmengine.utils import mkdir_or_exist
+from mmengine.visualization import Visualizer
+
+from mmdet.datasets.samplers import TrackImgSampler
+from mmdet.registry import HOOKS
+from mmdet.structures import DetDataSample, TrackDataSample
+import pandas as pd
+
+@HOOKS.register_module()
+class SubmissionHook(Hook):
+    """
+    Hook for submitting results. Saves the prediction results produced
+    during the test process as a submission file.
+
+    In the testing phase:
+
+    1. Receives labels, scores, and bboxes from outputs and stores them in prediction_strings.
+    2. Gets the img_path of each output and stores it in file_names.
+
+    Args:
+        prediction_strings (list): list to which the per-image strings
+            'label score x_min y_min x_max y_max ...' are appended
+        file_names (list): list to which the img_path of each output is appended
+        test_out_dir (str): directory where the submission file is saved
+    """
+
+    def __init__(self, test_out_dir: Optional[str] = None):
+        self.prediction_strings = []
+        self.file_names = []
+        self.test_out_dir = test_out_dir
+
+    def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
+                        outputs: Sequence[DetDataSample]) -> None:
+        """Run after every testing iteration.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the testing process.
+            batch_idx (int): The index of the current batch in the test loop.
+            data_batch (dict): Data from dataloader.
+            outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples
+                that contain annotations and predictions.
+        """
+        assert len(outputs) == 1, \
+            'only batch_size=1 is supported while testing.'
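# The loop that follows builds one space-separated PredictionString per image:
#   "<label> <score> <x_min> <y_min> <x_max> <y_max> <label> <score> ..."
# A minimal sketch of the resulting CSV, with made-up values (the file name
# and numbers are illustrative only):
import pandas as pd

example = pd.DataFrame({
    'PredictionString': ['0 0.97 10.0 20.0 110.0 220.0 3 0.55 5.0 5.0 50.0 60.0'],
    'image_id': ['test/0000.jpg'],
})
example.to_csv('submission_example.csv', index=False)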
+
+        # `inputs` are CHW tensors, so shape[1] is the height (y) and
+        # shape[2] is the width (x) of the network input
+        pred_x_size = data_batch['inputs'][0].shape[2]
+        pred_y_size = data_batch['inputs'][0].shape[1]
+
+        for output in outputs:
+            prediction_string = ''
+            for label, score, bbox in zip(output.pred_instances.labels, output.pred_instances.scores, output.pred_instances.bboxes):
+                bbox = bbox.cpu().numpy()
+                # rescale the box from the network input size to 1024 x 1024
+                bbox[0], bbox[2] = bbox[0] / pred_x_size * 1024, bbox[2] / pred_x_size * 1024
+                bbox[1], bbox[3] = bbox[1] / pred_y_size * 1024, bbox[3] / pred_y_size * 1024
+                # trailing space keeps consecutive predictions separated
+                prediction_string += str(int(label.cpu())) + ' ' + str(float(score.cpu())) + ' ' + str(bbox[0]) + ' ' + str(bbox[1]) + ' ' + str(bbox[2]) + ' ' + str(bbox[3]) + ' '
+            self.prediction_strings.append(prediction_string)
+            # keep only the trailing part of the image path (drop the data-root prefix)
+            self.file_names.append(output.img_path[13:])
+
+    def after_test(self, runner: Runner):
+        """
+        Run after testing
+
+        Args:
+            runner (:obj:`Runner`): The runner of the testing process.
+        """
+        if self.test_out_dir is not None:
+            self.test_out_dir = osp.join(runner.work_dir, runner.timestamp,
+                                         self.test_out_dir)
+            mkdir_or_exist(self.test_out_dir)
+
+        submission = pd.DataFrame()
+        submission['PredictionString'] = self.prediction_strings
+        submission['image_id'] = self.file_names
+        submission.to_csv(osp.join(self.test_out_dir, 'submission.csv'), index=False)
+        print('submission saved to {}'.format(osp.join(self.test_out_dir, 'submission.csv')))
\ No newline at end of file
diff --git a/mmdetection/mmdet/engine/hooks/sync_norm_hook.py b/mmdetection/mmdet/engine/hooks/sync_norm_hook.py
new file mode 100644
index 0000000..a173438
--- /dev/null
+++ b/mmdetection/mmdet/engine/hooks/sync_norm_hook.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+
+from mmengine.dist import get_dist_info
+from mmengine.hooks import Hook
+from torch import nn
+
+from mmdet.registry import HOOKS
+from mmdet.utils import all_reduce_dict
+
+
+def get_norm_states(module: nn.Module) -> OrderedDict:
+    """Get the state_dict of batch norms in the module."""
+    async_norm_states = OrderedDict()
+    for name, child in module.named_modules():
+        if isinstance(child, nn.modules.batchnorm._NormBase):
+            for k, v in child.state_dict().items():
+                async_norm_states['.'.join([name, k])] = v
+    return async_norm_states
+
+
+@HOOKS.register_module()
+class SyncNormHook(Hook):
+    """Synchronize Norm states before validation, currently used in YOLOX."""
+
+    def before_val_epoch(self, runner):
+        """Synchronizing norm."""
+        module = runner.model
+        _, world_size = get_dist_info()
+        if world_size == 1:
+            return
+        norm_states = get_norm_states(module)
+        if len(norm_states) == 0:
+            return
+        # TODO: use `all_reduce_dict` in mmengine
+        norm_states = all_reduce_dict(norm_states, op='mean')
+        module.load_state_dict(norm_states, strict=False)
diff --git a/mmdetection/mmdet/engine/hooks/utils.py b/mmdetection/mmdet/engine/hooks/utils.py
new file mode 100644
index 0000000..d267cfe
--- /dev/null
+++ b/mmdetection/mmdet/engine/hooks/utils.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+def trigger_visualization_hook(cfg, args):
+    default_hooks = cfg.default_hooks
+    if 'visualization' in default_hooks:
+        visualization_hook = default_hooks['visualization']
+        # Turn on visualization
+        visualization_hook['draw'] = True
+        if args.show:
+            visualization_hook['show'] = True
+            visualization_hook['wait_time'] = args.wait_time
+        if args.show_dir:
+            visualization_hook['test_out_dir'] = args.show_dir
+    else:
+        raise RuntimeError(
+            'VisualizationHook must be included in default_hooks.'
+ 'refer to usage ' + '"visualization=dict(type=\'VisualizationHook\')"') + + return cfg diff --git a/mmdetection/mmdet/engine/hooks/visualization_hook.py b/mmdetection/mmdet/engine/hooks/visualization_hook.py new file mode 100644 index 0000000..fad0f90 --- /dev/null +++ b/mmdetection/mmdet/engine/hooks/visualization_hook.py @@ -0,0 +1,312 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import Optional, Sequence + +import mmcv +from mmengine.fileio import get +from mmengine.hooks import Hook +from mmengine.runner import Runner +from mmengine.utils import mkdir_or_exist +from mmengine.visualization import Visualizer + +from mmdet.datasets.samplers import TrackImgSampler +from mmdet.registry import HOOKS +from mmdet.structures import DetDataSample, TrackDataSample + + +@HOOKS.register_module() +class DetVisualizationHook(Hook): + """Detection Visualization Hook. Used to visualize validation and testing + process prediction results. + + In the testing phase: + + 1. If ``show`` is True, it means that only the prediction results are + visualized without storing data, so ``vis_backends`` needs to + be excluded. + 2. If ``test_out_dir`` is specified, it means that the prediction results + need to be saved to ``test_out_dir``. In order to avoid vis_backends + also storing data, so ``vis_backends`` needs to be excluded. + 3. ``vis_backends`` takes effect if the user does not specify ``show`` + and `test_out_dir``. You can set ``vis_backends`` to WandbVisBackend or + TensorboardVisBackend to store the prediction result in Wandb or + Tensorboard. + + Args: + draw (bool): whether to draw prediction results. If it is False, + it means that no drawing will be done. Defaults to False. + interval (int): The interval of visualization. Defaults to 50. + score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + test_out_dir (str, optional): directory where painted images + will be saved in testing process. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + """ + + def __init__(self, + draw: bool = False, + interval: int = 50, + score_thr: float = 0.3, + show: bool = False, + wait_time: float = 0., + test_out_dir: Optional[str] = None, + backend_args: dict = None): + self._visualizer: Visualizer = Visualizer.get_current_instance() + self.interval = interval + self.score_thr = score_thr + self.show = show + if self.show: + # No need to think about vis backends. + self._visualizer._vis_backends = {} + warnings.warn('The show is True, it means that only ' + 'the prediction results are visualized ' + 'without storing data, so vis_backends ' + 'needs to be excluded.') + + self.wait_time = wait_time + self.backend_args = backend_args + self.draw = draw + self.test_out_dir = test_out_dir + self._test_index = 0 + + def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[DetDataSample]) -> None: + """Run after every ``self.interval`` validation iterations. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`DetDataSample`]]): A batch of data samples + that contain annotations and predictions. 
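# A hedged sketch of how DetVisualizationHook is typically wired into a
# config; `trigger_visualization_hook` (hooks/utils.py above) then turns on
# `draw` / `show` / `test_out_dir` at test time. Values mirror the defaults
# shown in __init__ and are illustrative:
default_hooks = dict(
    visualization=dict(
        type='DetVisualizationHook', draw=False, interval=50, score_thr=0.3))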
+ """ + if self.draw is False: + return + + # There is no guarantee that the same batch of images + # is visualized for each evaluation. + total_curr_iter = runner.iter + batch_idx + + # Visualize only the first data + img_path = outputs[0].img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + if total_curr_iter % self.interval == 0: + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'val_img', + img, + data_sample=outputs[0], + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + step=total_curr_iter) + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[DetDataSample]) -> None: + """Run after every testing iterations. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples + that contain annotations and predictions. + """ + if self.draw is False: + return + + if self.test_out_dir is not None: + self.test_out_dir = osp.join(runner.work_dir, runner.timestamp, + self.test_out_dir) + mkdir_or_exist(self.test_out_dir) + + for data_sample in outputs: + self._test_index += 1 + + img_path = data_sample.img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + out_file = None + if self.test_out_dir is not None: + out_file = osp.basename(img_path) + out_file = osp.join(self.test_out_dir, out_file) + + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'test_img', + img, + data_sample=data_sample, + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + out_file=out_file, + step=self._test_index) + + +@HOOKS.register_module() +class TrackVisualizationHook(Hook): + """Tracking Visualization Hook. Used to visualize validation and testing + process prediction results. + + In the testing phase: + + 1. If ``show`` is True, it means that only the prediction results are + visualized without storing data, so ``vis_backends`` needs to + be excluded. + 2. If ``test_out_dir`` is specified, it means that the prediction results + need to be saved to ``test_out_dir``. In order to avoid vis_backends + also storing data, so ``vis_backends`` needs to be excluded. + 3. ``vis_backends`` takes effect if the user does not specify ``show`` + and `test_out_dir``. You can set ``vis_backends`` to WandbVisBackend or + TensorboardVisBackend to store the prediction result in Wandb or + Tensorboard. + + Args: + draw (bool): whether to draw prediction results. If it is False, + it means that no drawing will be done. Defaults to False. + frame_interval (int): The interval of visualization. Defaults to 30. + score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + test_out_dir (str, optional): directory where painted images + will be saved in testing process. + backend_args (dict): Arguments to instantiate a file client. + Defaults to ``None``. 
+ """ + + def __init__(self, + draw: bool = False, + frame_interval: int = 30, + score_thr: float = 0.3, + show: bool = False, + wait_time: float = 0., + test_out_dir: Optional[str] = None, + backend_args: dict = None) -> None: + self._visualizer: Visualizer = Visualizer.get_current_instance() + self.frame_interval = frame_interval + self.score_thr = score_thr + self.show = show + if self.show: + # No need to think about vis backends. + self._visualizer._vis_backends = {} + warnings.warn('The show is True, it means that only ' + 'the prediction results are visualized ' + 'without storing data, so vis_backends ' + 'needs to be excluded.') + + self.wait_time = wait_time + self.backend_args = backend_args + self.draw = draw + self.test_out_dir = test_out_dir + self.image_idx = 0 + + def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[TrackDataSample]) -> None: + """Run after every ``self.interval`` validation iteration. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`TrackDataSample`]): Outputs from model. + """ + if self.draw is False: + return + + assert len(outputs) == 1,\ + 'only batch_size=1 is supported while validating.' + + sampler = runner.val_dataloader.sampler + if isinstance(sampler, TrackImgSampler): + if self.every_n_inner_iters(batch_idx, self.frame_interval): + total_curr_iter = runner.iter + batch_idx + track_data_sample = outputs[0] + self.visualize_single_image(track_data_sample[0], + total_curr_iter) + else: + # video visualization DefaultSampler + if self.every_n_inner_iters(batch_idx, 1): + track_data_sample = outputs[0] + video_length = len(track_data_sample) + + for frame_id in range(video_length): + if frame_id % self.frame_interval == 0: + total_curr_iter = runner.iter + self.image_idx + \ + frame_id + img_data_sample = track_data_sample[frame_id] + self.visualize_single_image(img_data_sample, + total_curr_iter) + self.image_idx = self.image_idx + video_length + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[TrackDataSample]) -> None: + """Run after every testing iteration. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`TrackDataSample`]): Outputs from model. + """ + if self.draw is False: + return + + assert len(outputs) == 1, \ + 'only batch_size=1 is supported while testing.' 
+ + if self.test_out_dir is not None: + self.test_out_dir = osp.join(runner.work_dir, runner.timestamp, + self.test_out_dir) + mkdir_or_exist(self.test_out_dir) + + sampler = runner.test_dataloader.sampler + if isinstance(sampler, TrackImgSampler): + if self.every_n_inner_iters(batch_idx, self.frame_interval): + track_data_sample = outputs[0] + self.visualize_single_image(track_data_sample[0], batch_idx) + else: + # video visualization DefaultSampler + if self.every_n_inner_iters(batch_idx, 1): + track_data_sample = outputs[0] + video_length = len(track_data_sample) + + for frame_id in range(video_length): + if frame_id % self.frame_interval == 0: + img_data_sample = track_data_sample[frame_id] + self.visualize_single_image(img_data_sample, + self.image_idx + frame_id) + self.image_idx = self.image_idx + video_length + + def visualize_single_image(self, img_data_sample: DetDataSample, + step: int) -> None: + """ + Args: + img_data_sample (DetDataSample): single image output. + step (int): The index of the current image. + """ + img_path = img_data_sample.img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + out_file = None + if self.test_out_dir is not None: + video_name = img_path.split('/')[-3] + mkdir_or_exist(osp.join(self.test_out_dir, video_name)) + out_file = osp.join(self.test_out_dir, video_name, + osp.basename(img_path)) + + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'test_img', + img, + data_sample=img_data_sample, + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + out_file=out_file, + step=step) diff --git a/mmdetection/mmdet/engine/hooks/yolox_mode_switch_hook.py b/mmdetection/mmdet/engine/hooks/yolox_mode_switch_hook.py new file mode 100644 index 0000000..05a2c69 --- /dev/null +++ b/mmdetection/mmdet/engine/hooks/yolox_mode_switch_hook.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class YOLOXModeSwitchHook(Hook): + """Switch the mode of YOLOX during training. + + This hook turns off the mosaic and mixup data augmentation and switches + to use L1 loss in bbox_head. + + Args: + num_last_epochs (int): The number of latter epochs in the end of the + training to close the data augmentation and switch to L1 loss. + Defaults to 15. + skip_type_keys (Sequence[str], optional): Sequence of type string to be + skip pipeline. Defaults to ('Mosaic', 'RandomAffine', 'MixUp'). 
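# A hedged sketch, loosely following YOLOX-style configs: close Mosaic/MixUp
# and add the L1 loss for the last epochs of training. The value of
# num_last_epochs is illustrative, not taken from this repository:
custom_hooks = [
    dict(type='YOLOXModeSwitchHook', num_last_epochs=15),
    dict(type='SyncNormHook'),
]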
+ """ + + def __init__( + self, + num_last_epochs: int = 15, + skip_type_keys: Sequence[str] = ('Mosaic', 'RandomAffine', 'MixUp') + ) -> None: + self.num_last_epochs = num_last_epochs + self.skip_type_keys = skip_type_keys + self._restart_dataloader = False + self._has_switched = False + + def before_train_epoch(self, runner) -> None: + """Close mosaic and mixup augmentation and switches to use L1 loss.""" + epoch = runner.epoch + train_loader = runner.train_dataloader + model = runner.model + # TODO: refactor after mmengine using model wrapper + if is_model_wrapper(model): + model = model.module + epoch_to_be_switched = ((epoch + 1) >= + runner.max_epochs - self.num_last_epochs) + if epoch_to_be_switched and not self._has_switched: + runner.logger.info('No mosaic and mixup aug now!') + # The dataset pipeline cannot be updated when persistent_workers + # is True, so we need to force the dataloader's multi-process + # restart. This is a very hacky approach. + train_loader.dataset.update_skip_type_keys(self.skip_type_keys) + if hasattr(train_loader, 'persistent_workers' + ) and train_loader.persistent_workers is True: + train_loader._DataLoader__initialized = False + train_loader._iterator = None + self._restart_dataloader = True + runner.logger.info('Add additional L1 loss now!') + if hasattr(model, 'detector'): + model.detector.bbox_head.use_l1 = True + else: + model.bbox_head.use_l1 = True + self._has_switched = True + else: + # Once the restart is complete, we need to restore + # the initialization flag. + if self._restart_dataloader: + train_loader._DataLoader__initialized = True diff --git a/mmdetection/mmdet/engine/optimizers/__init__.py b/mmdetection/mmdet/engine/optimizers/__init__.py new file mode 100644 index 0000000..83db069 --- /dev/null +++ b/mmdetection/mmdet/engine/optimizers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .layer_decay_optimizer_constructor import \ + LearningRateDecayOptimizerConstructor + +__all__ = ['LearningRateDecayOptimizerConstructor'] diff --git a/mmdetection/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py b/mmdetection/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py new file mode 100644 index 0000000..73028a0 --- /dev/null +++ b/mmdetection/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +from typing import List + +import torch.nn as nn +from mmengine.dist import get_dist_info +from mmengine.logging import MMLogger +from mmengine.optim import DefaultOptimWrapperConstructor + +from mmdet.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +def get_layer_id_for_convnext(var_name, max_layer_id): + """Get the layer id to set the different learning rates in ``layer_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_layer_id (int): Maximum layer id. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. 
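# A hedged sketch modelled on ConvNeXt-style configs (values illustrative):
# the constructor assigns each backbone parameter a layer id and scales its
# learning rate by decay_rate ** (num_layers - layer_id - 1), where
# num_layers is paramwise_cfg['num_layers'] + 2 (see add_params below).
optim_wrapper = dict(
    type='OptimWrapper',
    constructor='LearningRateDecayOptimizerConstructor',
    paramwise_cfg=dict(decay_rate=0.7, decay_type='layer_wise', num_layers=6),
    optimizer=dict(
        type='AdamW', lr=1e-4, betas=(0.9, 0.999), weight_decay=0.05))

# e.g. with decay_rate=0.7 and num_layers=6+2=8, a parameter with layer_id=1
# gets lr_scale = 0.7 ** (8 - 1 - 1) ≈ 0.118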
+ """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.downsample_layers'): + stage_id = int(var_name.split('.')[2]) + if stage_id == 0: + layer_id = 0 + elif stage_id == 1: + layer_id = 2 + elif stage_id == 2: + layer_id = 3 + elif stage_id == 3: + layer_id = max_layer_id + return layer_id + elif var_name.startswith('backbone.stages'): + stage_id = int(var_name.split('.')[2]) + block_id = int(var_name.split('.')[3]) + if stage_id == 0: + layer_id = 1 + elif stage_id == 1: + layer_id = 2 + elif stage_id == 2: + layer_id = 3 + block_id // 3 + elif stage_id == 3: + layer_id = max_layer_id + return layer_id + else: + return max_layer_id + 1 + + +def get_stage_id_for_convnext(var_name, max_stage_id): + """Get the stage id to set the different learning rates in ``stage_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_stage_id (int): Maximum stage id. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.downsample_layers'): + return 0 + elif var_name.startswith('backbone.stages'): + stage_id = int(var_name.split('.')[2]) + return stage_id + 1 + else: + return max_stage_id - 1 + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class LearningRateDecayOptimizerConstructor(DefaultOptimWrapperConstructor): + # Different learning rates are set for different layers of backbone. + # Note: Currently, this optimizer constructor is built for ConvNeXt. + + def add_params(self, params: List[dict], module: nn.Module, + **kwargs) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + """ + logger = MMLogger.get_current_instance() + + parameter_groups = {} + logger.info(f'self.paramwise_cfg is {self.paramwise_cfg}') + num_layers = self.paramwise_cfg.get('num_layers') + 2 + decay_rate = self.paramwise_cfg.get('decay_rate') + decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise') + logger.info('Build LearningRateDecayOptimizerConstructor ' + f'{decay_type} {decay_rate} - {num_layers}') + weight_decay = self.base_wd + for name, param in module.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith('.bias') or name in ( + 'pos_embed', 'cls_token'): + group_name = 'no_decay' + this_weight_decay = 0. 
+ else: + group_name = 'decay' + this_weight_decay = weight_decay + if 'layer_wise' in decay_type: + if 'ConvNeXt' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_convnext( + name, self.paramwise_cfg.get('num_layers')) + logger.info(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + elif decay_type == 'stage_wise': + if 'ConvNeXt' in module.backbone.__class__.__name__: + layer_id = get_stage_id_for_convnext(name, num_layers) + logger.info(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + group_name = f'layer_{layer_id}_{group_name}' + + if group_name not in parameter_groups: + scale = decay_rate**(num_layers - layer_id - 1) + + parameter_groups[group_name] = { + 'weight_decay': this_weight_decay, + 'params': [], + 'param_names': [], + 'lr_scale': scale, + 'group_name': group_name, + 'lr': scale * self.base_lr, + } + + parameter_groups[group_name]['params'].append(param) + parameter_groups[group_name]['param_names'].append(name) + rank, _ = get_dist_info() + if rank == 0: + to_display = {} + for key in parameter_groups: + to_display[key] = { + 'param_names': parameter_groups[key]['param_names'], + 'lr_scale': parameter_groups[key]['lr_scale'], + 'lr': parameter_groups[key]['lr'], + 'weight_decay': parameter_groups[key]['weight_decay'], + } + logger.info(f'Param groups = {json.dumps(to_display, indent=2)}') + params.extend(parameter_groups.values()) diff --git a/mmdetection/mmdet/engine/runner/__init__.py b/mmdetection/mmdet/engine/runner/__init__.py new file mode 100644 index 0000000..e8bcce4 --- /dev/null +++ b/mmdetection/mmdet/engine/runner/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .loops import TeacherStudentValLoop + +__all__ = ['TeacherStudentValLoop'] diff --git a/mmdetection/mmdet/engine/runner/loops.py b/mmdetection/mmdet/engine/runner/loops.py new file mode 100644 index 0000000..afe53af --- /dev/null +++ b/mmdetection/mmdet/engine/runner/loops.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.model import is_model_wrapper +from mmengine.runner import ValLoop + +from mmdet.registry import LOOPS + + +@LOOPS.register_module() +class TeacherStudentValLoop(ValLoop): + """Loop for validation of model teacher and student.""" + + def run(self): + """Launch validation for model teacher and student.""" + self.runner.call_hook('before_val') + self.runner.call_hook('before_val_epoch') + self.runner.model.eval() + + model = self.runner.model + if is_model_wrapper(model): + model = model.module + assert hasattr(model, 'teacher') + assert hasattr(model, 'student') + + predict_on = model.semi_test_cfg.get('predict_on', None) + multi_metrics = dict() + for _predict_on in ['teacher', 'student']: + model.semi_test_cfg['predict_on'] = _predict_on + for idx, data_batch in enumerate(self.dataloader): + self.run_iter(idx, data_batch) + # compute metrics + metrics = self.evaluator.evaluate(len(self.dataloader.dataset)) + multi_metrics.update( + {'/'.join((_predict_on, k)): v + for k, v in metrics.items()}) + model.semi_test_cfg['predict_on'] = predict_on + + self.runner.call_hook('after_val_epoch', metrics=multi_metrics) + self.runner.call_hook('after_val') diff --git a/mmdetection/mmdet/engine/schedulers/__init__.py b/mmdetection/mmdet/engine/schedulers/__init__.py new file mode 100644 index 0000000..0126164 --- /dev/null +++ b/mmdetection/mmdet/engine/schedulers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
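# A hedged usage sketch for TeacherStudentValLoop above: in semi-supervised
# configs the loop is selected through the runner's validation config, and
# every metric is then reported twice, prefixed with 'teacher/' and 'student/':
val_cfg = dict(type='TeacherStudentValLoop')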
+from .quadratic_warmup import (QuadraticWarmupLR, QuadraticWarmupMomentum, + QuadraticWarmupParamScheduler) + +__all__ = [ + 'QuadraticWarmupParamScheduler', 'QuadraticWarmupMomentum', + 'QuadraticWarmupLR' +] diff --git a/mmdetection/mmdet/engine/schedulers/quadratic_warmup.py b/mmdetection/mmdet/engine/schedulers/quadratic_warmup.py new file mode 100644 index 0000000..639b478 --- /dev/null +++ b/mmdetection/mmdet/engine/schedulers/quadratic_warmup.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.optim.scheduler.lr_scheduler import LRSchedulerMixin +from mmengine.optim.scheduler.momentum_scheduler import MomentumSchedulerMixin +from mmengine.optim.scheduler.param_scheduler import INF, _ParamScheduler +from torch.optim import Optimizer + +from mmdet.registry import PARAM_SCHEDULERS + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupParamScheduler(_ParamScheduler): + r"""Warm up the parameter value of each parameter group by quadratic + formula: + + .. math:: + + X_{t} = X_{t-1} + \frac{2t+1}{{(end-begin)}^{2}} \times X_{base} + + Args: + optimizer (Optimizer): Wrapped optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + def __init__(self, + optimizer: Optimizer, + param_name: str, + begin: int = 0, + end: int = INF, + last_step: int = -1, + by_epoch: bool = True, + verbose: bool = False): + if end >= INF: + raise ValueError('``end`` must be less than infinity,' + 'Please set ``end`` parameter of ' + '``QuadraticWarmupScheduler`` as the ' + 'number of warmup end.') + self.total_iters = end - begin + super().__init__( + optimizer=optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + @classmethod + def build_iter_from_epoch(cls, + *args, + begin=0, + end=INF, + by_epoch=True, + epoch_length=None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config.""" + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ + 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ + f'`epoch_length` must be a positive integer, ' \ + f'but got {epoch_length}.' + by_epoch = False + begin = begin * epoch_length + if end != INF: + end = end * epoch_length + return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + if self.last_step == 0: + return [ + base_value * (2 * self.last_step + 1) / self.total_iters**2 + for base_value in self.base_values + ] + + return [ + group[self.param_name] + base_value * + (2 * self.last_step + 1) / self.total_iters**2 + for base_value, group in zip(self.base_values, + self.optimizer.param_groups) + ] + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupLR(LRSchedulerMixin, QuadraticWarmupParamScheduler): + """Warm up the learning rate of each parameter group by quadratic formula. + + Args: + optimizer (Optimizer): Wrapped optimizer. 
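# A minimal check of the quadratic warmup formula above: the per-step
# increments (2t + 1) / total_iters**2 sum to exactly 1, so the scheduled
# parameter reaches its base value at the end of the warmup window
# (total_iters chosen arbitrarily for illustration):
total_iters = 500
increments = [(2 * t + 1) / total_iters**2 for t in range(total_iters)]
assert abs(sum(increments) - 1.0) < 1e-9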
+ begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupMomentum(MomentumSchedulerMixin, + QuadraticWarmupParamScheduler): + """Warm up the momentum value of each parameter group by quadratic formula. + + Args: + optimizer (Optimizer): Wrapped optimizer. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ diff --git a/mmdetection/mmdet/evaluation/__init__.py b/mmdetection/mmdet/evaluation/__init__.py new file mode 100644 index 0000000..f70dc22 --- /dev/null +++ b/mmdetection/mmdet/evaluation/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .functional import * # noqa: F401,F403 +from .metrics import * # noqa: F401,F403 diff --git a/mmdetection/mmdet/evaluation/functional/__init__.py b/mmdetection/mmdet/evaluation/functional/__init__.py new file mode 100644 index 0000000..96d58eb --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_overlaps import bbox_overlaps +from .cityscapes_utils import evaluateImgLists +from .class_names import (cityscapes_classes, coco_classes, + coco_panoptic_classes, dataset_aliases, get_classes, + imagenet_det_classes, imagenet_vid_classes, + objects365v1_classes, objects365v2_classes, + oid_challenge_classes, oid_v6_classes, voc_classes) +from .mean_ap import average_precision, eval_map, print_map_summary +from .panoptic_utils import (INSTANCE_OFFSET, pq_compute_multi_core, + pq_compute_single_core) +from .recall import (eval_recalls, plot_iou_recall, plot_num_recall, + print_recall_summary) +from .ytvis import YTVIS +from .ytviseval import YTVISeval + +__all__ = [ + 'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes', + 'coco_classes', 'cityscapes_classes', 'dataset_aliases', 'get_classes', + 'average_precision', 'eval_map', 'print_map_summary', 'eval_recalls', + 'print_recall_summary', 'plot_num_recall', 'plot_iou_recall', + 'oid_v6_classes', 'oid_challenge_classes', 'INSTANCE_OFFSET', + 'pq_compute_single_core', 'pq_compute_multi_core', 'bbox_overlaps', + 'objects365v1_classes', 'objects365v2_classes', 'coco_panoptic_classes', + 'evaluateImgLists', 'YTVIS', 'YTVISeval' +] diff --git a/mmdetection/mmdet/evaluation/functional/bbox_overlaps.py b/mmdetection/mmdet/evaluation/functional/bbox_overlaps.py new file mode 100644 index 0000000..5d6eb82 --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/bbox_overlaps.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +def bbox_overlaps(bboxes1, + bboxes2, + mode='iou', + eps=1e-6, + use_legacy_coordinate=False): + """Calculate the ious between each bbox of bboxes1 and bboxes2. 
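# A minimal worked example for bbox_overlaps (boxes assumed to be float
# arrays of shape (n, 4) in (x1, y1, x2, y2) order):
import numpy as np
from mmdet.evaluation.functional import bbox_overlaps

b1 = np.array([[0., 0., 10., 10.]])
b2 = np.array([[5., 5., 15., 15.]])
# intersection = 5 * 5 = 25, union = 100 + 100 - 25 = 175
print(bbox_overlaps(b1, b2))  # approx. [[0.1429]]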
+ + Args: + bboxes1 (ndarray): Shape (n, 4) + bboxes2 (ndarray): Shape (k, 4) + mode (str): IOU (intersection over union) or IOF (intersection + over foreground) + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Note when function is used in `VOCDataset`, it should be + True to align with the official implementation + `http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar` + Default: False. + + Returns: + ious (ndarray): Shape (n, k) + """ + + assert mode in ['iou', 'iof'] + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1. + bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + extra_length) * ( + bboxes1[:, 3] - bboxes1[:, 1] + extra_length) + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + extra_length) * ( + bboxes2[:, 3] - bboxes2[:, 1] + extra_length) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start + extra_length, 0) * np.maximum( + y_end - y_start + extra_length, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + union = np.maximum(union, eps) + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious diff --git a/mmdetection/mmdet/evaluation/functional/cityscapes_utils.py b/mmdetection/mmdet/evaluation/functional/cityscapes_utils.py new file mode 100644 index 0000000..5ced368 --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/cityscapes_utils.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) https://github.com/mcordts/cityscapesScripts +# A wrapper of `cityscapesscripts` which supports loading groundtruth +# image from `backend_args`. +import json +import os +import sys +from pathlib import Path +from typing import Optional, Union + +import mmcv +import numpy as np +from mmengine.fileio import get + +try: + import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval # noqa: E501 + from cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling import \ + CArgs # noqa: E501 + from cityscapesscripts.evaluation.instance import Instance + from cityscapesscripts.helpers.csHelpers import (id2label, labels, + writeDict2JSON) + HAS_CITYSCAPESAPI = True +except ImportError: + CArgs = object + HAS_CITYSCAPESAPI = False + + +def evaluateImgLists(prediction_list: list, + groundtruth_list: list, + args: CArgs, + backend_args: Optional[dict] = None, + dump_matches: bool = False) -> dict: + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.evaluateImgLists``. Support loading + groundtruth image from file backend. + Args: + prediction_list (list): A list of prediction txt file. + groundtruth_list (list): A list of groundtruth image file. + args (CArgs): A global object setting in + obj:``cityscapesscripts.evaluation. 
+ evalInstanceLevelSemanticLabeling`` + backend_args (dict, optional): Arguments to instantiate the + preifx of uri corresponding backend. Defaults to None. + dump_matches (bool): whether dump matches.json. Defaults to False. + Returns: + dict: The computed metric. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + # determine labels of interest + CSEval.setInstanceLabels(args) + # get dictionary of all ground truth instances + gt_instances = getGtInstances( + groundtruth_list, args, backend_args=backend_args) + # match predictions and ground truth + matches = matchGtWithPreds(prediction_list, groundtruth_list, gt_instances, + args, backend_args) + if dump_matches: + CSEval.writeDict2JSON(matches, 'matches.json') + # evaluate matches + apScores = CSEval.evaluateMatches(matches, args) + # averages + avgDict = CSEval.computeAverages(apScores, args) + # result dict + resDict = CSEval.prepareJSONDataForResults(avgDict, apScores, args) + if args.JSONOutput: + # create output folder if necessary + path = os.path.dirname(args.exportFile) + CSEval.ensurePath(path) + # Write APs to JSON + CSEval.writeDict2JSON(resDict, args.exportFile) + + CSEval.printResults(avgDict, args) + + return resDict + + +def matchGtWithPreds(prediction_list: list, + groundtruth_list: list, + gt_instances: dict, + args: CArgs, + backend_args=None): + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.matchGtWithPreds``. Support loading + groundtruth image from file backend. + Args: + prediction_list (list): A list of prediction txt file. + groundtruth_list (list): A list of groundtruth image file. + gt_instances (dict): Groundtruth dict. + args (CArgs): A global object setting in + obj:``cityscapesscripts.evaluation. + evalInstanceLevelSemanticLabeling`` + backend_args (dict, optional): Arguments to instantiate the + preifx of uri corresponding backend. Defaults to None. + Returns: + dict: The processed prediction and groundtruth result. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + matches: dict = dict() + if not args.quiet: + print(f'Matching {len(prediction_list)} pairs of images...') + + count = 0 + for (pred, gt) in zip(prediction_list, groundtruth_list): + # Read input files + gt_image = readGTImage(gt, backend_args) + pred_info = readPredInfo(pred) + # Get and filter ground truth instances + unfiltered_instances = gt_instances[gt] + cur_gt_instances_orig = CSEval.filterGtInstances( + unfiltered_instances, args) + + # Try to assign all predictions + (cur_gt_instances, + cur_pred_instances) = CSEval.assignGt2Preds(cur_gt_instances_orig, + gt_image, pred_info, args) + + # append to global dict + matches[gt] = {} + matches[gt]['groundTruth'] = cur_gt_instances + matches[gt]['prediction'] = cur_pred_instances + + count += 1 + if not args.quiet: + print(f'\rImages Processed: {count}', end=' ') + sys.stdout.flush() + + if not args.quiet: + print('') + + return matches + + +def readGTImage(image_file: Union[str, Path], + backend_args: Optional[dict] = None) -> np.ndarray: + """Read an image from path. + + Same as obj:``cityscapesscripts.evaluation. + evalInstanceLevelSemanticLabeling.readGTImage``, but support loading + groundtruth image from file backend. 
+ Args: + image_file (str or Path): Either a str or pathlib.Path. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + Returns: + np.ndarray: The groundtruth image. + """ + img_bytes = get(image_file, backend_args=backend_args) + img = mmcv.imfrombytes(img_bytes, flag='unchanged', backend='pillow') + return img + + +def readPredInfo(prediction_file: str) -> dict: + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.readPredInfo``. + Args: + prediction_file (str): The prediction txt file. + Returns: + dict: The processed prediction results. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + printError = CSEval.printError + + predInfo = {} + if (not os.path.isfile(prediction_file)): + printError(f"Infofile '{prediction_file}' " + 'for the predictions not found.') + with open(prediction_file) as f: + for line in f: + splittedLine = line.split(' ') + if len(splittedLine) != 3: + printError('Invalid prediction file. Expected content: ' + 'relPathPrediction1 labelIDPrediction1 ' + 'confidencePrediction1') + if os.path.isabs(splittedLine[0]): + printError('Invalid prediction file. First entry in each ' + 'line must be a relative path.') + + filename = os.path.join( + os.path.dirname(prediction_file), splittedLine[0]) + + imageInfo = {} + imageInfo['labelID'] = int(float(splittedLine[1])) + imageInfo['conf'] = float(splittedLine[2]) # type: ignore + predInfo[filename] = imageInfo + + return predInfo + + +def getGtInstances(groundtruth_list: list, + args: CArgs, + backend_args: Optional[dict] = None) -> dict: + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.getGtInstances``. Support loading + groundtruth image from file backend. + Args: + groundtruth_list (list): A list of groundtruth image file. + args (CArgs): A global object setting in + obj:``cityscapesscripts.evaluation. + evalInstanceLevelSemanticLabeling`` + backend_args (dict, optional): Arguments to instantiate the + preifx of uri corresponding backend. Defaults to None. + Returns: + dict: The computed metric. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + # if there is a global statistics json, then load it + if (os.path.isfile(args.gtInstancesFile)): + if not args.quiet: + print('Loading ground truth instances from JSON.') + with open(args.gtInstancesFile) as json_file: + gt_instances = json.load(json_file) + # otherwise create it + else: + if (not args.quiet): + print('Creating ground truth instances from png files.') + gt_instances = instances2dict( + groundtruth_list, args, backend_args=backend_args) + writeDict2JSON(gt_instances, args.gtInstancesFile) + + return gt_instances + + +def instances2dict(image_list: list, + args: CArgs, + backend_args: Optional[dict] = None) -> dict: + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.instances2dict``. Support loading + groundtruth image from file backend. 
+ Args: + image_list (list): A list of image file. + args (CArgs): A global object setting in + obj:``cityscapesscripts.evaluation. + evalInstanceLevelSemanticLabeling`` + backend_args (dict, optional): Arguments to instantiate the + preifx of uri corresponding backend. Defaults to None. + Returns: + dict: The processed groundtruth results. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + imgCount = 0 + instanceDict = {} + + if not isinstance(image_list, list): + image_list = [image_list] + + if not args.quiet: + print(f'Processing {len(image_list)} images...') + + for image_name in image_list: + # Load image + img_bytes = get(image_name, backend_args=backend_args) + imgNp = mmcv.imfrombytes(img_bytes, flag='unchanged', backend='pillow') + + # Initialize label categories + instances: dict = {} + for label in labels: + instances[label.name] = [] + + # Loop through all instance ids in instance image + for instanceId in np.unique(imgNp): + instanceObj = Instance(imgNp, instanceId) + + instances[id2label[instanceObj.labelID].name].append( + instanceObj.toDict()) + + instanceDict[image_name] = instances + imgCount += 1 + + if not args.quiet: + print(f'\rImages Processed: {imgCount}', end=' ') + sys.stdout.flush() + + return instanceDict diff --git a/mmdetection/mmdet/evaluation/functional/class_names.py b/mmdetection/mmdet/evaluation/functional/class_names.py new file mode 100644 index 0000000..d0ea709 --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/class_names.py @@ -0,0 +1,517 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.utils import is_str + + +def wider_face_classes() -> list: + """Class names of WIDERFace.""" + return ['face'] + + +def voc_classes() -> list: + """Class names of PASCAL VOC.""" + return [ + 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' + ] + + +def imagenet_det_classes() -> list: + """Class names of ImageNet Det.""" + return [ + 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo', + 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam', + 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap', + 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder', + 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito', + 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle', + 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker', + 'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew', + 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper', + 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly', + 'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig', + 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog', + 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart', + 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger', + 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim', + 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse', + 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle', + 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard', + 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can', + 
'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace', + 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume', + 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza', + 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine', + 'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse', + 'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator', + 'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler', + 'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver', + 'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile', + 'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula', + 'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer', + 'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine', + 'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie', + 'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet', + 'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin', + 'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft', + 'whale', 'wine_bottle', 'zebra' + ] + + +def imagenet_vid_classes() -> list: + """Class names of ImageNet VID.""" + return [ + 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', + 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda', + 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit', + 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle', + 'watercraft', 'whale', 'zebra' + ] + + +def coco_classes() -> list: + """Class names of COCO.""" + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', + 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', + 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush' + ] + + +def coco_panoptic_classes() -> list: + """Class names of COCO panoptic.""" + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner', + 'blanket', 'bridge', 'cardboard', 'counter', 'curtain', 'door-stuff', + 
'floor-wood', 'flower', 'fruit', 'gravel', 'house', 'light', + 'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield', + 'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow', + 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile', + 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', + 'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged', + 'paper-merged', 'food-other-merged', 'building-other-merged', + 'rock-merged', 'wall-other-merged', 'rug-merged' + ] + + +def cityscapes_classes() -> list: + """Class names of Cityscapes.""" + return [ + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + + +def oid_challenge_classes() -> list: + """Class names of Open Images Challenge.""" + return [ + 'Footwear', 'Jeans', 'House', 'Tree', 'Woman', 'Man', 'Land vehicle', + 'Person', 'Wheel', 'Bus', 'Human face', 'Bird', 'Dress', 'Girl', + 'Vehicle', 'Building', 'Cat', 'Car', 'Belt', 'Elephant', 'Dessert', + 'Butterfly', 'Train', 'Guitar', 'Poster', 'Book', 'Boy', 'Bee', + 'Flower', 'Window', 'Hat', 'Human head', 'Dog', 'Human arm', 'Drink', + 'Human mouth', 'Human hair', 'Human nose', 'Human hand', 'Table', + 'Marine invertebrates', 'Fish', 'Sculpture', 'Rose', 'Street light', + 'Glasses', 'Fountain', 'Skyscraper', 'Swimwear', 'Brassiere', 'Drum', + 'Duck', 'Countertop', 'Furniture', 'Ball', 'Human leg', 'Boat', + 'Balloon', 'Bicycle helmet', 'Goggles', 'Door', 'Human eye', 'Shirt', + 'Toy', 'Teddy bear', 'Pasta', 'Tomato', 'Human ear', + 'Vehicle registration plate', 'Microphone', 'Musical keyboard', + 'Tower', 'Houseplant', 'Flowerpot', 'Fruit', 'Vegetable', + 'Musical instrument', 'Suit', 'Motorcycle', 'Bagel', 'French fries', + 'Hamburger', 'Chair', 'Salt and pepper shakers', 'Snail', 'Airplane', + 'Horse', 'Laptop', 'Computer keyboard', 'Football helmet', 'Cocktail', + 'Juice', 'Tie', 'Computer monitor', 'Human beard', 'Bottle', + 'Saxophone', 'Lemon', 'Mouse', 'Sock', 'Cowboy hat', 'Sun hat', + 'Football', 'Porch', 'Sunglasses', 'Lobster', 'Crab', 'Picture frame', + 'Van', 'Crocodile', 'Surfboard', 'Shorts', 'Helicopter', 'Helmet', + 'Sports uniform', 'Taxi', 'Swan', 'Goose', 'Coat', 'Jacket', 'Handbag', + 'Flag', 'Skateboard', 'Television', 'Tire', 'Spoon', 'Palm tree', + 'Stairs', 'Salad', 'Castle', 'Oven', 'Microwave oven', 'Wine', + 'Ceiling fan', 'Mechanical fan', 'Cattle', 'Truck', 'Box', 'Ambulance', + 'Desk', 'Wine glass', 'Reptile', 'Tank', 'Traffic light', 'Billboard', + 'Tent', 'Insect', 'Spider', 'Treadmill', 'Cupboard', 'Shelf', + 'Seat belt', 'Human foot', 'Bicycle', 'Bicycle wheel', 'Couch', + 'Bookcase', 'Fedora', 'Backpack', 'Bench', 'Oyster', + 'Moths and butterflies', 'Lavender', 'Waffle', 'Fork', 'Animal', + 'Accordion', 'Mobile phone', 'Plate', 'Coffee cup', 'Saucer', + 'Platter', 'Dagger', 'Knife', 'Bull', 'Tortoise', 'Sea turtle', 'Deer', + 'Weapon', 'Apple', 'Ski', 'Taco', 'Traffic sign', 'Beer', 'Necklace', + 'Sunflower', 'Piano', 'Organ', 'Harpsichord', 'Bed', 'Cabinetry', + 'Nightstand', 'Curtain', 'Chest of drawers', 'Drawer', 'Parrot', + 'Sandal', 'High heels', 'Tableware', 'Cart', 'Mushroom', 'Kite', + 'Missile', 'Seafood', 'Camera', 'Paper towel', 'Toilet paper', + 'Sombrero', 'Radish', 'Lighthouse', 'Segway', 'Pig', 'Watercraft', + 'Golf cart', 'studio couch', 'Dolphin', 'Whale', 'Earrings', 'Otter', + 'Sea lion', 'Whiteboard', 'Monkey', 'Gondola', 
'Zebra', + 'Baseball glove', 'Scarf', 'Adhesive tape', 'Trousers', 'Scoreboard', + 'Lily', 'Carnivore', 'Power plugs and sockets', 'Office building', + 'Sandwich', 'Swimming pool', 'Headphones', 'Tin can', 'Crown', 'Doll', + 'Cake', 'Frog', 'Beetle', 'Ant', 'Gas stove', 'Canoe', 'Falcon', + 'Blue jay', 'Egg', 'Fire hydrant', 'Raccoon', 'Muffin', 'Wall clock', + 'Coffee', 'Mug', 'Tea', 'Bear', 'Waste container', 'Home appliance', + 'Candle', 'Lion', 'Mirror', 'Starfish', 'Marine mammal', 'Wheelchair', + 'Umbrella', 'Alpaca', 'Violin', 'Cello', 'Brown bear', 'Canary', 'Bat', + 'Ruler', 'Plastic bag', 'Penguin', 'Watermelon', 'Harbor seal', 'Pen', + 'Pumpkin', 'Harp', 'Kitchen appliance', 'Roller skates', 'Bust', + 'Coffee table', 'Tennis ball', 'Tennis racket', 'Ladder', 'Boot', + 'Bowl', 'Stop sign', 'Volleyball', 'Eagle', 'Paddle', 'Chicken', + 'Skull', 'Lamp', 'Beehive', 'Maple', 'Sink', 'Goldfish', 'Tripod', + 'Coconut', 'Bidet', 'Tap', 'Bathroom cabinet', 'Toilet', + 'Filing cabinet', 'Pretzel', 'Table tennis racket', 'Bronze sculpture', + 'Rocket', 'Mouse', 'Hamster', 'Lizard', 'Lifejacket', 'Goat', + 'Washing machine', 'Trumpet', 'Horn', 'Trombone', 'Sheep', + 'Tablet computer', 'Pillow', 'Kitchen & dining room table', + 'Parachute', 'Raven', 'Glove', 'Loveseat', 'Christmas tree', + 'Shellfish', 'Rifle', 'Shotgun', 'Sushi', 'Sparrow', 'Bread', + 'Toaster', 'Watch', 'Asparagus', 'Artichoke', 'Suitcase', 'Antelope', + 'Broccoli', 'Ice cream', 'Racket', 'Banana', 'Cookie', 'Cucumber', + 'Dragonfly', 'Lynx', 'Caterpillar', 'Light bulb', 'Office supplies', + 'Miniskirt', 'Skirt', 'Fireplace', 'Potato', 'Light switch', + 'Croissant', 'Cabbage', 'Ladybug', 'Handgun', 'Luggage and bags', + 'Window blind', 'Snowboard', 'Baseball bat', 'Digital clock', + 'Serving tray', 'Infant bed', 'Sofa bed', 'Guacamole', 'Fox', 'Pizza', + 'Snowplow', 'Jet ski', 'Refrigerator', 'Lantern', 'Convenience store', + 'Sword', 'Rugby ball', 'Owl', 'Ostrich', 'Pancake', 'Strawberry', + 'Carrot', 'Tart', 'Dice', 'Turkey', 'Rabbit', 'Invertebrate', 'Vase', + 'Stool', 'Swim cap', 'Shower', 'Clock', 'Jellyfish', 'Aircraft', + 'Chopsticks', 'Orange', 'Snake', 'Sewing machine', 'Kangaroo', 'Mixer', + 'Food processor', 'Shrimp', 'Towel', 'Porcupine', 'Jaguar', 'Cannon', + 'Limousine', 'Mule', 'Squirrel', 'Kitchen knife', 'Tiara', 'Tiger', + 'Bow and arrow', 'Candy', 'Rhinoceros', 'Shark', 'Cricket ball', + 'Doughnut', 'Plumbing fixture', 'Camel', 'Polar bear', 'Coin', + 'Printer', 'Blender', 'Giraffe', 'Billiard table', 'Kettle', + 'Dinosaur', 'Pineapple', 'Zucchini', 'Jug', 'Barge', 'Teapot', + 'Golf ball', 'Binoculars', 'Scissors', 'Hot dog', 'Door handle', + 'Seahorse', 'Bathtub', 'Leopard', 'Centipede', 'Grapefruit', 'Snowman', + 'Cheetah', 'Alarm clock', 'Grape', 'Wrench', 'Wok', 'Bell pepper', + 'Cake stand', 'Barrel', 'Woodpecker', 'Flute', 'Corded phone', + 'Willow', 'Punching bag', 'Pomegranate', 'Telephone', 'Pear', + 'Common fig', 'Bench', 'Wood-burning stove', 'Burrito', 'Nail', + 'Turtle', 'Submarine sandwich', 'Drinking straw', 'Peach', 'Popcorn', + 'Frying pan', 'Picnic basket', 'Honeycomb', 'Envelope', 'Mango', + 'Cutting board', 'Pitcher', 'Stationary bicycle', 'Dumbbell', + 'Personal care', 'Dog bed', 'Snowmobile', 'Oboe', 'Briefcase', + 'Squash', 'Tick', 'Slow cooker', 'Coffeemaker', 'Measuring cup', + 'Crutch', 'Stretcher', 'Screwdriver', 'Flashlight', 'Spatula', + 'Pressure cooker', 'Ring binder', 'Beaker', 'Torch', 'Winter melon' + ] + + +def oid_v6_classes() -> list: + """Class names of Open Images 
V6.""" + return [ + 'Tortoise', 'Container', 'Magpie', 'Sea turtle', 'Football', + 'Ambulance', 'Ladder', 'Toothbrush', 'Syringe', 'Sink', 'Toy', + 'Organ (Musical Instrument)', 'Cassette deck', 'Apple', 'Human eye', + 'Cosmetics', 'Paddle', 'Snowman', 'Beer', 'Chopsticks', 'Human beard', + 'Bird', 'Parking meter', 'Traffic light', 'Croissant', 'Cucumber', + 'Radish', 'Towel', 'Doll', 'Skull', 'Washing machine', 'Glove', 'Tick', + 'Belt', 'Sunglasses', 'Banjo', 'Cart', 'Ball', 'Backpack', 'Bicycle', + 'Home appliance', 'Centipede', 'Boat', 'Surfboard', 'Boot', + 'Headphones', 'Hot dog', 'Shorts', 'Fast food', 'Bus', 'Boy', + 'Screwdriver', 'Bicycle wheel', 'Barge', 'Laptop', 'Miniskirt', + 'Drill (Tool)', 'Dress', 'Bear', 'Waffle', 'Pancake', 'Brown bear', + 'Woodpecker', 'Blue jay', 'Pretzel', 'Bagel', 'Tower', 'Teapot', + 'Person', 'Bow and arrow', 'Swimwear', 'Beehive', 'Brassiere', 'Bee', + 'Bat (Animal)', 'Starfish', 'Popcorn', 'Burrito', 'Chainsaw', + 'Balloon', 'Wrench', 'Tent', 'Vehicle registration plate', 'Lantern', + 'Toaster', 'Flashlight', 'Billboard', 'Tiara', 'Limousine', 'Necklace', + 'Carnivore', 'Scissors', 'Stairs', 'Computer keyboard', 'Printer', + 'Traffic sign', 'Chair', 'Shirt', 'Poster', 'Cheese', 'Sock', + 'Fire hydrant', 'Land vehicle', 'Earrings', 'Tie', 'Watercraft', + 'Cabinetry', 'Suitcase', 'Muffin', 'Bidet', 'Snack', 'Snowmobile', + 'Clock', 'Medical equipment', 'Cattle', 'Cello', 'Jet ski', 'Camel', + 'Coat', 'Suit', 'Desk', 'Cat', 'Bronze sculpture', 'Juice', 'Gondola', + 'Beetle', 'Cannon', 'Computer mouse', 'Cookie', 'Office building', + 'Fountain', 'Coin', 'Calculator', 'Cocktail', 'Computer monitor', + 'Box', 'Stapler', 'Christmas tree', 'Cowboy hat', 'Hiking equipment', + 'Studio couch', 'Drum', 'Dessert', 'Wine rack', 'Drink', 'Zucchini', + 'Ladle', 'Human mouth', 'Dairy Product', 'Dice', 'Oven', 'Dinosaur', + 'Ratchet (Device)', 'Couch', 'Cricket ball', 'Winter melon', 'Spatula', + 'Whiteboard', 'Pencil sharpener', 'Door', 'Hat', 'Shower', 'Eraser', + 'Fedora', 'Guacamole', 'Dagger', 'Scarf', 'Dolphin', 'Sombrero', + 'Tin can', 'Mug', 'Tap', 'Harbor seal', 'Stretcher', 'Can opener', + 'Goggles', 'Human body', 'Roller skates', 'Coffee cup', + 'Cutting board', 'Blender', 'Plumbing fixture', 'Stop sign', + 'Office supplies', 'Volleyball (Ball)', 'Vase', 'Slow cooker', + 'Wardrobe', 'Coffee', 'Whisk', 'Paper towel', 'Personal care', 'Food', + 'Sun hat', 'Tree house', 'Flying disc', 'Skirt', 'Gas stove', + 'Salt and pepper shakers', 'Mechanical fan', 'Face powder', 'Fax', + 'Fruit', 'French fries', 'Nightstand', 'Barrel', 'Kite', 'Tart', + 'Treadmill', 'Fox', 'Flag', 'French horn', 'Window blind', + 'Human foot', 'Golf cart', 'Jacket', 'Egg (Food)', 'Street light', + 'Guitar', 'Pillow', 'Human leg', 'Isopod', 'Grape', 'Human ear', + 'Power plugs and sockets', 'Panda', 'Giraffe', 'Woman', 'Door handle', + 'Rhinoceros', 'Bathtub', 'Goldfish', 'Houseplant', 'Goat', + 'Baseball bat', 'Baseball glove', 'Mixing bowl', + 'Marine invertebrates', 'Kitchen utensil', 'Light switch', 'House', + 'Horse', 'Stationary bicycle', 'Hammer', 'Ceiling fan', 'Sofa bed', + 'Adhesive tape', 'Harp', 'Sandal', 'Bicycle helmet', 'Saucer', + 'Harpsichord', 'Human hair', 'Heater', 'Harmonica', 'Hamster', + 'Curtain', 'Bed', 'Kettle', 'Fireplace', 'Scale', 'Drinking straw', + 'Insect', 'Hair dryer', 'Kitchenware', 'Indoor rower', 'Invertebrate', + 'Food processor', 'Bookcase', 'Refrigerator', 'Wood-burning stove', + 'Punching bag', 'Common fig', 'Cocktail shaker', 'Jaguar (Animal)', 
+ 'Golf ball', 'Fashion accessory', 'Alarm clock', 'Filing cabinet', + 'Artichoke', 'Table', 'Tableware', 'Kangaroo', 'Koala', 'Knife', + 'Bottle', 'Bottle opener', 'Lynx', 'Lavender (Plant)', 'Lighthouse', + 'Dumbbell', 'Human head', 'Bowl', 'Humidifier', 'Porch', 'Lizard', + 'Billiard table', 'Mammal', 'Mouse', 'Motorcycle', + 'Musical instrument', 'Swim cap', 'Frying pan', 'Snowplow', + 'Bathroom cabinet', 'Missile', 'Bust', 'Man', 'Waffle iron', 'Milk', + 'Ring binder', 'Plate', 'Mobile phone', 'Baked goods', 'Mushroom', + 'Crutch', 'Pitcher (Container)', 'Mirror', 'Personal flotation device', + 'Table tennis racket', 'Pencil case', 'Musical keyboard', 'Scoreboard', + 'Briefcase', 'Kitchen knife', 'Nail (Construction)', 'Tennis ball', + 'Plastic bag', 'Oboe', 'Chest of drawers', 'Ostrich', 'Piano', 'Girl', + 'Plant', 'Potato', 'Hair spray', 'Sports equipment', 'Pasta', + 'Penguin', 'Pumpkin', 'Pear', 'Infant bed', 'Polar bear', 'Mixer', + 'Cupboard', 'Jacuzzi', 'Pizza', 'Digital clock', 'Pig', 'Reptile', + 'Rifle', 'Lipstick', 'Skateboard', 'Raven', 'High heels', 'Red panda', + 'Rose', 'Rabbit', 'Sculpture', 'Saxophone', 'Shotgun', 'Seafood', + 'Submarine sandwich', 'Snowboard', 'Sword', 'Picture frame', 'Sushi', + 'Loveseat', 'Ski', 'Squirrel', 'Tripod', 'Stethoscope', 'Submarine', + 'Scorpion', 'Segway', 'Training bench', 'Snake', 'Coffee table', + 'Skyscraper', 'Sheep', 'Television', 'Trombone', 'Tea', 'Tank', 'Taco', + 'Telephone', 'Torch', 'Tiger', 'Strawberry', 'Trumpet', 'Tree', + 'Tomato', 'Train', 'Tool', 'Picnic basket', 'Cooking spray', + 'Trousers', 'Bowling equipment', 'Football helmet', 'Truck', + 'Measuring cup', 'Coffeemaker', 'Violin', 'Vehicle', 'Handbag', + 'Paper cutter', 'Wine', 'Weapon', 'Wheel', 'Worm', 'Wok', 'Whale', + 'Zebra', 'Auto part', 'Jug', 'Pizza cutter', 'Cream', 'Monkey', 'Lion', + 'Bread', 'Platter', 'Chicken', 'Eagle', 'Helicopter', 'Owl', 'Duck', + 'Turtle', 'Hippopotamus', 'Crocodile', 'Toilet', 'Toilet paper', + 'Squid', 'Clothing', 'Footwear', 'Lemon', 'Spider', 'Deer', 'Frog', + 'Banana', 'Rocket', 'Wine glass', 'Countertop', 'Tablet computer', + 'Waste container', 'Swimming pool', 'Dog', 'Book', 'Elephant', 'Shark', + 'Candle', 'Leopard', 'Axe', 'Hand dryer', 'Soap dispenser', + 'Porcupine', 'Flower', 'Canary', 'Cheetah', 'Palm tree', 'Hamburger', + 'Maple', 'Building', 'Fish', 'Lobster', 'Garden Asparagus', + 'Furniture', 'Hedgehog', 'Airplane', 'Spoon', 'Otter', 'Bull', + 'Oyster', 'Horizontal bar', 'Convenience store', 'Bomb', 'Bench', + 'Ice cream', 'Caterpillar', 'Butterfly', 'Parachute', 'Orange', + 'Antelope', 'Beaker', 'Moths and butterflies', 'Window', 'Closet', + 'Castle', 'Jellyfish', 'Goose', 'Mule', 'Swan', 'Peach', 'Coconut', + 'Seat belt', 'Raccoon', 'Chisel', 'Fork', 'Lamp', 'Camera', + 'Squash (Plant)', 'Racket', 'Human face', 'Human arm', 'Vegetable', + 'Diaper', 'Unicycle', 'Falcon', 'Chime', 'Snail', 'Shellfish', + 'Cabbage', 'Carrot', 'Mango', 'Jeans', 'Flowerpot', 'Pineapple', + 'Drawer', 'Stool', 'Envelope', 'Cake', 'Dragonfly', 'Common sunflower', + 'Microwave oven', 'Honeycomb', 'Marine mammal', 'Sea lion', 'Ladybug', + 'Shelf', 'Watch', 'Candy', 'Salad', 'Parrot', 'Handgun', 'Sparrow', + 'Van', 'Grinder', 'Spice rack', 'Light bulb', 'Corded phone', + 'Sports uniform', 'Tennis racket', 'Wall clock', 'Serving tray', + 'Kitchen & dining room table', 'Dog bed', 'Cake stand', + 'Cat furniture', 'Bathroom accessory', 'Facial tissue holder', + 'Pressure cooker', 'Kitchen appliance', 'Tire', 'Ruler', + 'Luggage and bags', 
'Microphone', 'Broccoli', 'Umbrella', 'Pastry', + 'Grapefruit', 'Band-aid', 'Animal', 'Bell pepper', 'Turkey', 'Lily', + 'Pomegranate', 'Doughnut', 'Glasses', 'Human nose', 'Pen', 'Ant', + 'Car', 'Aircraft', 'Human hand', 'Skunk', 'Teddy bear', 'Watermelon', + 'Cantaloupe', 'Dishwasher', 'Flute', 'Balance beam', 'Sandwich', + 'Shrimp', 'Sewing machine', 'Binoculars', 'Rays and skates', 'Ipod', + 'Accordion', 'Willow', 'Crab', 'Crown', 'Seahorse', 'Perfume', + 'Alpaca', 'Taxi', 'Canoe', 'Remote control', 'Wheelchair', + 'Rugby ball', 'Armadillo', 'Maracas', 'Helmet' + ] + + +def objects365v1_classes() -> list: + """Class names of Objects365 V1.""" + return [ + 'person', 'sneakers', 'chair', 'hat', 'lamp', 'bottle', + 'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk', + 'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes', + 'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'tv', + 'storage box', 'vase', 'bench', 'wine glass', 'boots', 'bowl', + 'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can', + 'stool', 'backpack', 'couch', 'belt', 'carpet', 'basket', + 'towel/napkin', 'slippers', 'barrel/bucket', 'coffee table', 'suv', + 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone', + 'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle', + 'bread', 'high heels', 'ring', 'van', 'watch', 'sink', 'horse', 'fish', + 'apple', 'camera', 'candle', 'teddy bear', 'cake', 'motorcycle', + 'wild bird', 'laptop', 'knife', 'traffic sign', 'cell phone', 'paddle', + 'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus', + 'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone', + 'tea pot', 'keyboard', 'tripod', 'hockey', 'fan', 'dog', 'spoon', + 'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal', + 'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane', + 'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote', + 'baseball glove', 'paper towel', 'refrigerator', 'train', 'tomato', + 'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone', + 'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine', + 'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove', + 'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat', + 'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun', + 'life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck', + 'sports car', 'giraffe', 'pumpkin', 'piano', 'stop sign', 'radiator', + 'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies', + 'cutting/chopping board', 'tennis racket', 'candy', + 'skating and skiing shoes', 'scissors', 'folder', 'baseball', + 'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine', + 'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear', + 'american football', 'basketball', 'potato', 'paint brush', 'printer', + 'billiards', 'fire hydrant', 'goose', 'projector', 'sausage', + 'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball', + 'chopsticks', 'electronic stove and gas stove', 'pie', 'frisbee', + 'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender', + 'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango', + 'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion', + 'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale', + 'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple', + 'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle', + 'toaster', 'helicopter', 'green 
beans', 'brush', 'carriage', 'cigar', + 'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD', + 'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado', + 'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear', + 'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn', + 'key', 'screwdriver', 'globe', 'broom', 'pliers', 'volleyball', + 'hammer', 'eggplant', 'trophy', 'dates', 'board eraser', 'rice', + 'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel', + 'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope', + 'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut', + 'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly', + 'egg tart', 'cheese', 'pig', 'pomelo', 'race car', 'rice cooker', + 'tuba', 'crosswalk sign', 'papaya', 'hair drier', 'green onion', + 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill', + 'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup', + 'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish', + 'noodles', 'yak', 'mop', 'crab', 'microscope', 'barbell', 'bread/bun', + 'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'seal', + 'mangosteen', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case', + 'saw', 'table tennis paddle', 'okra', 'starfish', 'eagle', 'monkey', + 'durian', 'game board', 'rabbit', 'french horn', 'ambulance', + 'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon', + 'chainsaw', 'lobster', 'iron', 'flashlight' + ] + + +def objects365v2_classes() -> list: + """Class names of Objects365 V2.""" + return [ + 'Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp', + 'Glasses', 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf', + 'Handbag/Satchel', 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet', + 'Book', 'Gloves', 'Storage box', 'Boat', 'Leather Shoes', 'Flower', + 'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', 'Pillow', 'Boots', + 'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', 'Belt', + 'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker', + 'Watch', 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool', + 'Barrel/bucket', 'Van', 'Couch', 'Sandals', 'Bakset', 'Drum', + 'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', 'Motorcycle', 'Guitar', + 'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned', 'Truck', + 'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel', 'Stuffed Toy', + 'Candle', 'Sailboat', 'Laptop', 'Awning', 'Bed', 'Faucet', 'Tent', + 'Horse', 'Mirror', 'Power outlet', 'Sink', 'Apple', 'Air Conditioner', + 'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck', 'Fork', + 'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon', 'Clock', 'Pot', + 'Cow', 'Cake', 'Dinning Table', 'Sheep', 'Hanger', + 'Blackboard/Whiteboard', 'Napkin', 'Other Fish', 'Orange/Tangerine', + 'Toiletry', 'Keyboard', 'Tomato', 'Lantern', 'Machinery Vehicle', + 'Fan', 'Green Vegetables', 'Banana', 'Baseball Glove', 'Airplane', + 'Mouse', 'Train', 'Pumpkin', 'Soccer', 'Skiboard', 'Luggage', + 'Nightstand', 'Tea pot', 'Telephone', 'Trolley', 'Head Phone', + 'Sports Car', 'Stop Sign', 'Dessert', 'Scooter', 'Stroller', 'Crane', + 'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck', 'Baseball Bat', + 'Surveillance Camera', 'Cat', 'Jug', 'Broccoli', 'Piano', 'Pizza', + 'Elephant', 'Skateboard', 'Surfboard', 'Gun', + 'Skating and Skiing shoes', 'Gas stove', 'Donut', 'Bow Tie', 'Carrot', + 'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', 'Pepper', + 'Computer Box', 'Toilet Paper', 'Cleaning Products', 
'Chopsticks', + 'Microwave', 'Pigeon', 'Baseball', 'Cutting/chopping Board', + 'Coffee Table', 'Side Table', 'Scissors', 'Marker', 'Pie', 'Ladder', + 'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball', + 'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle', 'Violin', + 'Egg', 'Fire Extinguisher', 'Candy', 'Fire Truck', 'Billards', + 'Converter', 'Bathtub', 'Wheelchair', 'Golf Club', 'Briefcase', + 'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', 'Pear', 'Heavy Truck', + 'Hamburger', 'Extractor', 'Extention Cord', 'Tong', 'Tennis Racket', + 'Folder', 'American Football', 'earphone', 'Mask', 'Kettle', 'Tennis', + 'Ship', 'Swing', 'Coffee Machine', 'Slide', 'Carriage', 'Onion', + 'Green beans', 'Projector', 'Frisbee', + 'Washing Machine/Drying Machine', 'Chicken', 'Printer', 'Watermelon', + 'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', 'Hotair ballon', + 'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog', + 'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer', + 'Goose', 'Tape', 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple', + 'Golf Ball', 'Ambulance', 'Parking meter', 'Mango', 'Key', 'Hurdle', + 'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', 'Megaphone', + 'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion', + 'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom', + 'Trombone', 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit', + 'Router/modem', 'Poker Card', 'Toaster', 'Shrimp', 'Sushi', 'Cheese', + 'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', 'Cue', + 'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap', + 'Recorder', 'Bear', 'Eggplant', 'Board Eraser', 'Coconut', + 'Tape Measur/ Ruler', 'Pig', 'Showerhead', 'Globe', 'Chips', 'Steak', + 'Crosswalk Sign', 'Stapler', 'Campel', 'Formula 1 ', 'Pomegranate', + 'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', 'Rice Cooker', 'Tuba', + 'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal', 'Buttefly', + 'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin', 'Electric Drill', + 'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill', 'Lighter', + 'Grapefruit', 'Game board', 'Mop', 'Radish', 'Baozi', 'Target', + 'French', 'Spring Rolls', 'Monkey', 'Rabbit', 'Pencil Case', 'Yak', + 'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', 'Scallop', + 'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle', + 'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 'Eraser', 'Lobster', + 'Durian', 'Okra', 'Lipstick', 'Cosmetics Mirror', 'Curling', + 'Table Tennis ' + ] + + +dataset_aliases = { + 'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'], + 'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'], + 'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'], + 'coco': ['coco', 'mscoco', 'ms_coco'], + 'coco_panoptic': ['coco_panoptic', 'panoptic'], + 'wider_face': ['WIDERFaceDataset', 'wider_face', 'WIDERFace'], + 'cityscapes': ['cityscapes'], + 'oid_challenge': ['oid_challenge', 'openimages_challenge'], + 'oid_v6': ['oid_v6', 'openimages_v6'], + 'objects365v1': ['objects365v1', 'obj365v1'], + 'objects365v2': ['objects365v2', 'obj365v2'] +} + + +def get_classes(dataset) -> list: + """Get class names of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_classes()') + else: + raise ValueError(f'Unrecognized dataset: {dataset}') + else: + raise TypeError(f'dataset must a str, but got {type(dataset)}') + 
return labels diff --git a/mmdetection/mmdet/evaluation/functional/mean_ap.py b/mmdetection/mmdet/evaluation/functional/mean_ap.py new file mode 100644 index 0000000..989972a --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/mean_ap.py @@ -0,0 +1,792 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from multiprocessing import Pool + +import numpy as np +from mmengine.logging import print_log +from mmengine.utils import is_str +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps +from .class_names import get_classes + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + + +def tpfp_imagenet(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + default_iou_thr=0.5, + area_ranges=None, + use_legacy_coordinate=False, + **kwargs): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Defaults to None + default_iou_thr (float): IoU threshold to be considered as matched for + medium and large bboxes (small ones have special rules). + Defaults to 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. Defaults to None. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Defaults to False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1. 
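+    # For illustration: a box (x1, y1, x2, y2) = (0., 0., 9., 9.) has width
+    # x2 - x1 + extra_length in the area and IoU computations below, i.e.
+    # 9. under the default coordinate system and 10. with
+    # use_legacy_coordinate=True (the mmdet v1.x convention).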
+ + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], + dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp + # of a certain scale. + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] = 1 + else: + det_areas = ( + det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps( + det_bboxes, gt_bboxes - 1, use_legacy_coordinate=use_legacy_coordinate) + gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length + gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length + iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)), + default_iou_thr) + # sort all detections by scores in descending order + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = gt_w * gt_h + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + max_iou = -1 + matched_gt = -1 + # find best overlapped available gt + for j in range(num_gts): + # different from PASCAL VOC: allow finding other gts if the + # best overlapped ones are already matched by other det bboxes + if gt_covered[j]: + continue + elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou: + max_iou = ious[i, j] + matched_gt = j + # there are 4 cases for a det bbox: + # 1. it matches a gt, tp = 1, fp = 0 + # 2. it matches an ignored gt, tp = 0, fp = 0 + # 3. it matches no gt and within area range, tp = 0, fp = 1 + # 4. it matches no gt but is beyond area range, tp = 0, fp = 0 + if matched_gt >= 0: + gt_covered[matched_gt] = 1 + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + tp[k, i] = 1 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + extra_length) * ( + bbox[3] - bbox[1] + extra_length) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_default(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + iou_thr=0.5, + area_ranges=None, + use_legacy_coordinate=False, + **kwargs): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Defaults to None + iou_thr (float): IoU threshold to be considered as matched. + Defaults to 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be + evaluated, in the format [(min1, max1), (min2, max2), ...]. + Defaults to None. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. 
which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Defaults to False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1. + + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], + dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of + # a certain scale + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] = 1 + else: + det_areas = ( + det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + + ious = bbox_overlaps( + det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate) + # for each det, the max iou with all gts + ious_max = ious.max(axis=1) + # for each det, which gt overlaps most with it + ious_argmax = ious.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + extra_length) * ( + bbox[3] - bbox[1] + extra_length) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_openimages(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + iou_thr=0.5, + area_ranges=None, + use_legacy_coordinate=False, + gt_bboxes_group_of=None, + use_group_of=True, + ioa_thr=0.5, + **kwargs): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Defaults to None + iou_thr (float): IoU threshold to be considered as matched. + Defaults to 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be + evaluated, in the format [(min1, max1), (min2, max2), ...]. + Defaults to None. 
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x, which means width and height should be
+            calculated as 'x2 - x1 + 1' and 'y2 - y1 + 1' respectively.
+            Defaults to False.
+        gt_bboxes_group_of (ndarray): GT group_of of this image, of shape
+            (k, 1). Defaults to None.
+        use_group_of (bool): Whether to use group-of boxes when calculating
+            TP and FP, which is only used in OpenImages evaluation.
+            Defaults to True.
+        ioa_thr (float | None): IoA threshold to be considered as matched,
+            which is only used in OpenImages evaluation. Defaults to 0.5.
+
+    Returns:
+        tuple[np.ndarray]: A tuple (tp, fp, det_bboxes). The elements of
+        tp and fp are 0 and 1, and each array has shape (num_scales, m).
+        det_bboxes holds the detections kept after filtering out those
+        matched to group-of gts during Open Images evaluation; its shape
+        is (num_scales, m).
+    """
+
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+
+    # an indicator of ignored gts
+    gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0],
+                  dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+    # stack gt_bboxes and gt_bboxes_ignore for convenience
+    gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp of
+    # a certain scale
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+
+    # if there are no gt bboxes in this image, then all det bboxes
+    # within the area range are false positives
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp, det_bboxes
+
+    if gt_bboxes_group_of is not None and use_group_of:
+        # When handling group-of boxes, divide the gt boxes into two parts:
+        # non-group-of and group-of. Then compute IoUs against the
+        # non-group-of gts and IoAs against the group-of gts respectively.
+        # This is only used in OpenImages evaluation.
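+        # For illustration: gt_bboxes_group_of = np.array([False, True, False])
+        # sends gts 0 and 2 through the usual IoU matching below, while
+        # detections are matched against gt 1 (a group-of box) by IoA
+        # ('iof' mode) instead.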
+ assert gt_bboxes_group_of.shape[0] == gt_bboxes.shape[0] + non_group_gt_bboxes = gt_bboxes[~gt_bboxes_group_of] + group_gt_bboxes = gt_bboxes[gt_bboxes_group_of] + num_gts_group = group_gt_bboxes.shape[0] + ious = bbox_overlaps(det_bboxes, non_group_gt_bboxes) + ioas = bbox_overlaps(det_bboxes, group_gt_bboxes, mode='iof') + else: + # if not consider group-of boxes, only calculate ious through gt boxes + ious = bbox_overlaps( + det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate) + ioas = None + + if ious.shape[1] > 0: + # for each det, the max iou with all gts + ious_max = ious.max(axis=1) + # for each det, which gt overlaps most with it + ious_argmax = ious.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = ( + gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + extra_length) * ( + bbox[3] - bbox[1] + extra_length) + if area >= min_area and area < max_area: + fp[k, i] = 1 + else: + # if there is no no-group-of gt bboxes in this image, + # then all det bboxes within area range are false positives. + # Only used in OpenImages evaluation. + if area_ranges == [(None, None)]: + fp[...] = 1 + else: + det_areas = ( + det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + + if ioas is None or ioas.shape[1] <= 0: + return tp, fp, det_bboxes + else: + # The evaluation of group-of TP and FP are done in two stages: + # 1. All detections are first matched to non group-of boxes; true + # positives are determined. + # 2. Detections that are determined as false positives are matched + # against group-of boxes and calculated group-of TP and FP. + # Only used in OpenImages evaluation. 
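+        # In stage 2 below, a detection not already counted as TP in stage 1
+        # is matched to a group-of gt by IoA, and at most one detection (the
+        # highest-scoring match) is kept per group-of gt box.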
+ det_bboxes_group = np.zeros( + (num_scales, ioas.shape[1], det_bboxes.shape[1]), dtype=float) + match_group_of = np.zeros((num_scales, num_dets), dtype=bool) + tp_group = np.zeros((num_scales, num_gts_group), dtype=np.float32) + ioas_max = ioas.max(axis=1) + # for each det, which gt overlaps most with it + ioas_argmax = ioas.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + box_is_covered = tp[k] + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + matched_gt = ioas_argmax[i] + if not box_is_covered[i]: + if ioas_max[i] >= ioa_thr: + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not tp_group[k, matched_gt]: + tp_group[k, matched_gt] = 1 + match_group_of[k, i] = True + else: + match_group_of[k, i] = True + + if det_bboxes_group[k, matched_gt, -1] < \ + det_bboxes[i, -1]: + det_bboxes_group[k, matched_gt] = \ + det_bboxes[i] + + fp_group = (tp_group <= 0).astype(float) + tps = [] + fps = [] + # concatenate tp, fp, and det-boxes which not matched group of + # gt boxes and tp_group, fp_group, and det_bboxes_group which + # matched group of boxes respectively. + for i in range(num_scales): + tps.append( + np.concatenate((tp[i][~match_group_of[i]], tp_group[i]))) + fps.append( + np.concatenate((fp[i][~match_group_of[i]], fp_group[i]))) + det_bboxes = np.concatenate( + (det_bboxes[~match_group_of[i]], det_bboxes_group[i])) + + tp = np.vstack(tps) + fp = np.vstack(fps) + return tp, fp, det_bboxes + + +def get_cls_results(det_results, annotations, class_id): + """Get det results and gt information of a certain class. + + Args: + det_results (list[list]): Same as `eval_map()`. + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. + + Returns: + tuple[list[np.ndarray]]: detected bboxes, gt bboxes, ignored gt bboxes + """ + cls_dets = [img_res[class_id] for img_res in det_results] + cls_gts = [] + cls_gts_ignore = [] + for ann in annotations: + gt_inds = ann['labels'] == class_id + cls_gts.append(ann['bboxes'][gt_inds, :]) + + if ann.get('labels_ignore', None) is not None: + ignore_inds = ann['labels_ignore'] == class_id + cls_gts_ignore.append(ann['bboxes_ignore'][ignore_inds, :]) + else: + cls_gts_ignore.append(np.empty((0, 4), dtype=np.float32)) + + return cls_dets, cls_gts, cls_gts_ignore + + +def get_cls_group_ofs(annotations, class_id): + """Get `gt_group_of` of a certain class, which is used in Open Images. + + Args: + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. + + Returns: + list[np.ndarray]: `gt_group_of` of a certain class. + """ + gt_group_ofs = [] + for ann in annotations: + gt_inds = ann['labels'] == class_id + if ann.get('gt_is_group_ofs', None) is not None: + gt_group_ofs.append(ann['gt_is_group_ofs'][gt_inds]) + else: + gt_group_ofs.append(np.empty((0, 1), dtype=bool)) + + return gt_group_ofs + + +def eval_map(det_results, + annotations, + scale_ranges=None, + iou_thr=0.5, + ioa_thr=None, + dataset=None, + logger=None, + tpfp_fn=None, + nproc=4, + use_legacy_coordinate=False, + use_group_of=False, + eval_mode='area'): + """Evaluate mAP of a dataset. 
+ + Args: + det_results (list[list]): [[cls1_det, cls2_det, ...], ...]. + The outer list indicates images, and the inner list indicates + per-class detected bboxes. + annotations (list[dict]): Ground truth annotations where each item of + the list indicates an image. Keys of annotations are: + + - `bboxes`: numpy array of shape (n, 4) + - `labels`: numpy array of shape (n, ) + - `bboxes_ignore` (optional): numpy array of shape (k, 4) + - `labels_ignore` (optional): numpy array of shape (k, ) + scale_ranges (list[tuple] | None): Range of scales to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. A range of + (32, 64) means the area range between (32**2, 64**2). + Defaults to None. + iou_thr (float): IoU threshold to be considered as matched. + Defaults to 0.5. + ioa_thr (float | None): IoA threshold to be considered as matched, + which only used in OpenImages evaluation. Defaults to None. + dataset (list[str] | str | None): Dataset name or dataset classes, + there are minor differences in metrics for different datasets, e.g. + "voc", "imagenet_det", etc. Defaults to None. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmengine.logging.print_log()` for details. + Defaults to None. + tpfp_fn (callable | None): The function used to determine true/ + false positives. If None, :func:`tpfp_default` is used as default + unless dataset is 'det' or 'vid' (:func:`tpfp_imagenet` in this + case). If it is given as a function, then this function is used + to evaluate tp & fp. Default None. + nproc (int): Processes used for computing TP and FP. + Defaults to 4. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Defaults to False. + use_group_of (bool): Whether to use group of when calculate TP and FP, + which only used in OpenImages evaluation. Defaults to False. + eval_mode (str): 'area' or '11points', 'area' means calculating the + area under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1], + PASCAL VOC2007 uses `11points` as default evaluate mode, while + others are 'area'. Defaults to 'area'. + + Returns: + tuple: (mAP, [dict, dict, ...]) + """ + assert len(det_results) == len(annotations) + assert eval_mode in ['area', '11points'], \ + f'Unrecognized {eval_mode} mode, only "area" and "11points" ' \ + 'are supported' + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1. + + num_imgs = len(det_results) + num_scales = len(scale_ranges) if scale_ranges is not None else 1 + num_classes = len(det_results[0]) # positive class num + area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] + if scale_ranges is not None else None) + + # There is no need to use multi processes to process + # when num_imgs = 1 . + if num_imgs > 1: + assert nproc > 0, 'nproc must be at least one.' 
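+        # Note that nproc is further capped by the number of images, so e.g.
+        # requesting nproc=4 for two images starts a 2-worker Pool.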
+ nproc = min(nproc, num_imgs) + pool = Pool(nproc) + + eval_results = [] + for i in range(num_classes): + # get gt and det bboxes of this class + cls_dets, cls_gts, cls_gts_ignore = get_cls_results( + det_results, annotations, i) + # choose proper function according to datasets to compute tp and fp + if tpfp_fn is None: + if dataset in ['det', 'vid']: + tpfp_fn = tpfp_imagenet + elif dataset in ['oid_challenge', 'oid_v6'] \ + or use_group_of is True: + tpfp_fn = tpfp_openimages + else: + tpfp_fn = tpfp_default + if not callable(tpfp_fn): + raise ValueError( + f'tpfp_fn has to be a function or None, but got {tpfp_fn}') + + if num_imgs > 1: + # compute tp and fp for each image with multiple processes + args = [] + if use_group_of: + # used in Open Images Dataset evaluation + gt_group_ofs = get_cls_group_ofs(annotations, i) + args.append(gt_group_ofs) + args.append([use_group_of for _ in range(num_imgs)]) + if ioa_thr is not None: + args.append([ioa_thr for _ in range(num_imgs)]) + + tpfp = pool.starmap( + tpfp_fn, + zip(cls_dets, cls_gts, cls_gts_ignore, + [iou_thr for _ in range(num_imgs)], + [area_ranges for _ in range(num_imgs)], + [use_legacy_coordinate for _ in range(num_imgs)], *args)) + else: + tpfp = tpfp_fn( + cls_dets[0], + cls_gts[0], + cls_gts_ignore[0], + iou_thr, + area_ranges, + use_legacy_coordinate, + gt_bboxes_group_of=(get_cls_group_ofs(annotations, i)[0] + if use_group_of else None), + use_group_of=use_group_of, + ioa_thr=ioa_thr) + tpfp = [tpfp] + + if use_group_of: + tp, fp, cls_dets = tuple(zip(*tpfp)) + else: + tp, fp = tuple(zip(*tpfp)) + # calculate gt number of each scale + # ignored gts or gts beyond the specific scale are not counted + num_gts = np.zeros(num_scales, dtype=int) + for j, bbox in enumerate(cls_gts): + if area_ranges is None: + num_gts[0] += bbox.shape[0] + else: + gt_areas = (bbox[:, 2] - bbox[:, 0] + extra_length) * ( + bbox[:, 3] - bbox[:, 1] + extra_length) + for k, (min_area, max_area) in enumerate(area_ranges): + num_gts[k] += np.sum((gt_areas >= min_area) + & (gt_areas < max_area)) + # sort all det bboxes by score, also sort tp and fp + cls_dets = np.vstack(cls_dets) + num_dets = cls_dets.shape[0] + sort_inds = np.argsort(-cls_dets[:, -1]) + tp = np.hstack(tp)[:, sort_inds] + fp = np.hstack(fp)[:, sort_inds] + # calculate recall and precision with tp and fp + tp = np.cumsum(tp, axis=1) + fp = np.cumsum(fp, axis=1) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) + precisions = tp / np.maximum((tp + fp), eps) + # calculate AP + if scale_ranges is None: + recalls = recalls[0, :] + precisions = precisions[0, :] + num_gts = num_gts.item() + ap = average_precision(recalls, precisions, eval_mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + + if num_imgs > 1: + pool.close() + + if scale_ranges is not None: + # shape (num_classes, num_scales) + all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) + all_num_gts = np.vstack( + [cls_result['num_gts'] for cls_result in eval_results]) + mean_ap = [] + for i in range(num_scales): + if np.any(all_num_gts[:, i] > 0): + mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean()) + else: + mean_ap.append(0.0) + else: + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if aps else 0.0 + + print_map_summary( + mean_ap, eval_results, dataset, area_ranges, logger=logger) + + 
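+    # `mean_ap` is a single float, or a list of per-scale means when
+    # `scale_ranges` is given; `eval_results` holds one dict per class with
+    # keys 'num_gts', 'num_dets', 'recall', 'precision' and 'ap'.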
return mean_ap, eval_results + + +def print_map_summary(mean_ap, + results, + dataset=None, + scale_ranges=None, + logger=None): + """Print mAP and results of each class. + + A table will be printed to show the gts/dets/recall/AP of each class and + the mAP. + + Args: + mean_ap (float): Calculated from `eval_map()`. + results (list[dict]): Calculated from `eval_map()`. + dataset (list[str] | str | None): Dataset name or dataset classes. + scale_ranges (list[tuple] | None): Range of scales to be evaluated. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmengine.logging.print_log()` for details. + Defaults to None. + """ + + if logger == 'silent': + return + + if isinstance(results[0]['ap'], np.ndarray): + num_scales = len(results[0]['ap']) + else: + num_scales = 1 + + if scale_ranges is not None: + assert len(scale_ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + if dataset is None: + label_names = [str(i) for i in range(num_classes)] + elif is_str(dataset): + label_names = get_classes(dataset) + else: + label_names = dataset + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + + header = ['class', 'gts', 'dets', 'recall', 'ap'] + for i in range(num_scales): + if scale_ranges is not None: + print_log(f'Scale range {scale_ranges[i]}', logger=logger) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) diff --git a/mmdetection/mmdet/evaluation/functional/panoptic_utils.py b/mmdetection/mmdet/evaluation/functional/panoptic_utils.py new file mode 100644 index 0000000..6faa8ed --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/panoptic_utils.py @@ -0,0 +1,228 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Copyright (c) 2018, Alexander Kirillov +# This file supports `backend_args` for `panopticapi`, +# the source code is copied from `panopticapi`, +# only the way to load the gt images is modified. +import multiprocessing +import os + +import mmcv +import numpy as np +from mmengine.fileio import get + +# A custom value to distinguish instance ID and category ID; need to +# be greater than the number of categories. +# For a pixel in the panoptic result map: +# pan_id = ins_id * INSTANCE_OFFSET + cat_id +INSTANCE_OFFSET = 1000 + +try: + from panopticapi.evaluation import OFFSET, VOID, PQStat + from panopticapi.utils import rgb2id +except ImportError: + PQStat = None + rgb2id = None + VOID = 0 + OFFSET = 256 * 256 * 256 + + +def pq_compute_single_core(proc_id, + annotation_set, + gt_folder, + pred_folder, + categories, + backend_args=None, + print_log=False): + """The single core function to evaluate the metric of Panoptic + Segmentation. + + Same as the function with the same name in `panopticapi`. Only the function + to load the images is changed to use the file client. 
+ + Args: + proc_id (int): The id of the mini process. + gt_folder (str): The path of the ground truth images. + pred_folder (str): The path of the prediction images. + categories (str): The categories of the dataset. + backend_args (object): The Backend of the dataset. If None, + the backend will be set to `local`. + print_log (bool): Whether to print the log. Defaults to False. + """ + if PQStat is None: + raise RuntimeError( + 'panopticapi is not installed, please install it by: ' + 'pip install git+https://github.com/cocodataset/' + 'panopticapi.git.') + + pq_stat = PQStat() + + idx = 0 + for gt_ann, pred_ann in annotation_set: + if print_log and idx % 100 == 0: + print('Core: {}, {} from {} images processed'.format( + proc_id, idx, len(annotation_set))) + idx += 1 + # The gt images can be on the local disk or `ceph`, so we use + # backend here. + img_bytes = get( + os.path.join(gt_folder, gt_ann['file_name']), + backend_args=backend_args) + pan_gt = mmcv.imfrombytes(img_bytes, flag='color', channel_order='rgb') + pan_gt = rgb2id(pan_gt) + + # The predictions can only be on the local dist now. + pan_pred = mmcv.imread( + os.path.join(pred_folder, pred_ann['file_name']), + flag='color', + channel_order='rgb') + pan_pred = rgb2id(pan_pred) + + gt_segms = {el['id']: el for el in gt_ann['segments_info']} + pred_segms = {el['id']: el for el in pred_ann['segments_info']} + + # predicted segments area calculation + prediction sanity checks + pred_labels_set = set(el['id'] for el in pred_ann['segments_info']) + labels, labels_cnt = np.unique(pan_pred, return_counts=True) + for label, label_cnt in zip(labels, labels_cnt): + if label not in pred_segms: + if label == VOID: + continue + raise KeyError( + 'In the image with ID {} segment with ID {} is ' + 'presented in PNG and not presented in JSON.'.format( + gt_ann['image_id'], label)) + pred_segms[label]['area'] = label_cnt + pred_labels_set.remove(label) + if pred_segms[label]['category_id'] not in categories: + raise KeyError( + 'In the image with ID {} segment with ID {} has ' + 'unknown category_id {}.'.format( + gt_ann['image_id'], label, + pred_segms[label]['category_id'])) + if len(pred_labels_set) != 0: + raise KeyError( + 'In the image with ID {} the following segment IDs {} ' + 'are presented in JSON and not presented in PNG.'.format( + gt_ann['image_id'], list(pred_labels_set))) + + # confusion matrix calculation + pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype( + np.uint64) + gt_pred_map = {} + labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True) + for label, intersection in zip(labels, labels_cnt): + gt_id = label // OFFSET + pred_id = label % OFFSET + gt_pred_map[(gt_id, pred_id)] = intersection + + # count all matched pairs + gt_matched = set() + pred_matched = set() + for label_tuple, intersection in gt_pred_map.items(): + gt_label, pred_label = label_tuple + if gt_label not in gt_segms: + continue + if pred_label not in pred_segms: + continue + if gt_segms[gt_label]['iscrowd'] == 1: + continue + if gt_segms[gt_label]['category_id'] != pred_segms[pred_label][ + 'category_id']: + continue + + union = pred_segms[pred_label]['area'] + gt_segms[gt_label][ + 'area'] - intersection - gt_pred_map.get((VOID, pred_label), 0) + iou = intersection / union + if iou > 0.5: + pq_stat[gt_segms[gt_label]['category_id']].tp += 1 + pq_stat[gt_segms[gt_label]['category_id']].iou += iou + gt_matched.add(gt_label) + pred_matched.add(pred_label) + + # count false positives + crowd_labels_dict = {} + for gt_label, gt_info 
in gt_segms.items(): + if gt_label in gt_matched: + continue + # crowd segments are ignored + if gt_info['iscrowd'] == 1: + crowd_labels_dict[gt_info['category_id']] = gt_label + continue + pq_stat[gt_info['category_id']].fn += 1 + + # count false positives + for pred_label, pred_info in pred_segms.items(): + if pred_label in pred_matched: + continue + # intersection of the segment with VOID + intersection = gt_pred_map.get((VOID, pred_label), 0) + # plus intersection with corresponding CROWD region if it exists + if pred_info['category_id'] in crowd_labels_dict: + intersection += gt_pred_map.get( + (crowd_labels_dict[pred_info['category_id']], pred_label), + 0) + # predicted segment is ignored if more than half of + # the segment correspond to VOID and CROWD regions + if intersection / pred_info['area'] > 0.5: + continue + pq_stat[pred_info['category_id']].fp += 1 + + if print_log: + print('Core: {}, all {} images processed'.format( + proc_id, len(annotation_set))) + return pq_stat + + +def pq_compute_multi_core(matched_annotations_list, + gt_folder, + pred_folder, + categories, + backend_args=None, + nproc=32): + """Evaluate the metrics of Panoptic Segmentation with multithreading. + + Same as the function with the same name in `panopticapi`. + + Args: + matched_annotations_list (list): The matched annotation list. Each + element is a tuple of annotations of the same image with the + format (gt_anns, pred_anns). + gt_folder (str): The path of the ground truth images. + pred_folder (str): The path of the prediction images. + categories (str): The categories of the dataset. + backend_args (object): The file client of the dataset. If None, + the backend will be set to `local`. + nproc (int): Number of processes for panoptic quality computing. + Defaults to 32. When `nproc` exceeds the number of cpu cores, + the number of cpu cores is used. + """ + if PQStat is None: + raise RuntimeError( + 'panopticapi is not installed, please install it by: ' + 'pip install git+https://github.com/cocodataset/' + 'panopticapi.git.') + + cpu_num = min(nproc, multiprocessing.cpu_count()) + + annotations_split = np.array_split(matched_annotations_list, cpu_num) + print('Number of cores: {}, images per core: {}'.format( + cpu_num, len(annotations_split[0]))) + workers = multiprocessing.Pool(processes=cpu_num) + processes = [] + for proc_id, annotation_set in enumerate(annotations_split): + p = workers.apply_async(pq_compute_single_core, + (proc_id, annotation_set, gt_folder, + pred_folder, categories, backend_args)) + processes.append(p) + + # Close the process pool, otherwise it will lead to memory + # leaking problems. + workers.close() + workers.join() + + pq_stat = PQStat() + for p in processes: + pq_stat += p.get() + + return pq_stat diff --git a/mmdetection/mmdet/evaluation/functional/recall.py b/mmdetection/mmdet/evaluation/functional/recall.py new file mode 100644 index 0000000..4bce2bf --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/recall.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
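+#
+# Editor's note: the commented block below is an illustrative usage sketch
+# added alongside this patch; it is not part of the upstream OpenMMLab
+# module. It assumes the ``mmdet`` package vendored in this repository is
+# importable, that ground truths are ``(n, 4)`` arrays and proposals are
+# ``(k, 5)`` arrays with a score in the last column, as documented on
+# ``eval_recalls`` further down in this file.
+#
+#     import numpy as np
+#     from mmdet.evaluation.functional.recall import eval_recalls
+#
+#     gts = [np.array([[10., 10., 50., 60.]], dtype=np.float32)]
+#     proposals = [np.array([[12., 11., 48., 58., 0.9],
+#                            [100., 100., 150., 160., 0.3]],
+#                           dtype=np.float32)]
+#     # ``recalls`` has shape (len(proposal_nums), len(iou_thrs)) and a
+#     # summary table is printed via ``print_recall_summary``.
+#     recalls = eval_recalls(gts, proposals,
+#                            proposal_nums=[1, 2], iou_thrs=[0.5, 0.75])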
+from collections.abc import Sequence + +import numpy as np +from mmengine.logging import print_log +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps + + +def _recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros((ious.shape[0])) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + _ious[k, :] = tmp_ious + + _ious = np.fliplr(np.sort(_ious, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + """Check proposal_nums and iou_thrs and set correct format.""" + if isinstance(proposal_nums, Sequence): + _proposal_nums = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + _proposal_nums = np.array([proposal_nums]) + else: + _proposal_nums = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, Sequence): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return _proposal_nums, _iou_thrs + + +def eval_recalls(gts, + proposals, + proposal_nums=None, + iou_thrs=0.5, + logger=None, + use_legacy_coordinate=False): + """Calculate recalls. + + Args: + gts (list[ndarray]): a list of arrays of shape (n, 4) + proposals (list[ndarray]): a list of arrays of shape (k, 4) or (k, 5) + proposal_nums (int | Sequence[int]): Top N proposals to be evaluated. + iou_thrs (float | Sequence[float]): IoU thresholds. Default: 0.5. + logger (logging.Logger | str | None): The way to print the recall + summary. See `mmengine.logging.print_log()` for details. + Default: None. + use_legacy_coordinate (bool): Whether use coordinate system + in mmdet v1.x. "1" was added to both height and width + which means w, h should be + computed as 'x2 - x1 + 1` and 'y2 - y1 + 1'. Default: False. 
+ + + Returns: + ndarray: recalls of different ious and proposal nums + """ + + img_num = len(gts) + assert img_num == len(proposals) + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps( + gts[i], + img_proposal[:prop_num, :4], + use_legacy_coordinate=use_legacy_coordinate) + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + + print_recall_summary(recalls, proposal_nums, iou_thrs, logger=logger) + return recalls + + +def print_recall_summary(recalls, + proposal_nums, + iou_thrs, + row_idxs=None, + col_idxs=None, + logger=None): + """Print recalls in a table. + + Args: + recalls (ndarray): calculated from `bbox_recalls` + proposal_nums (ndarray or list): top N proposals + iou_thrs (ndarray or list): iou thresholds + row_idxs (ndarray): which rows(proposal nums) to print + col_idxs (ndarray): which cols(iou thresholds) to print + logger (logging.Logger | str | None): The way to print the recall + summary. See `mmengine.logging.print_log()` for details. + Default: None. + """ + proposal_nums = np.array(proposal_nums, dtype=np.int32) + iou_thrs = np.array(iou_thrs) + if row_idxs is None: + row_idxs = np.arange(proposal_nums.size) + if col_idxs is None: + col_idxs = np.arange(iou_thrs.size) + row_header = [''] + iou_thrs[col_idxs].tolist() + table_data = [row_header] + for i, num in enumerate(proposal_nums[row_idxs]): + row = [f'{val:.3f}' for val in recalls[row_idxs[i], col_idxs].tolist()] + row.insert(0, num) + table_data.append(row) + table = AsciiTable(table_data) + print_log('\n' + table.table, logger=logger) + + +def plot_num_recall(recalls, proposal_nums): + """Plot Proposal_num-Recalls curve. + + Args: + recalls(ndarray or list): shape (k,) + proposal_nums(ndarray or list): same shape as `recalls` + """ + if isinstance(proposal_nums, np.ndarray): + _proposal_nums = proposal_nums.tolist() + else: + _proposal_nums = proposal_nums + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot([0] + _proposal_nums, [0] + _recalls) + plt.xlabel('Proposal num') + plt.ylabel('Recall') + plt.axis([0, proposal_nums.max(), 0, 1]) + f.show() + + +def plot_iou_recall(recalls, iou_thrs): + """Plot IoU-Recalls curve. 
+ + Args: + recalls(ndarray or list): shape (k,) + iou_thrs(ndarray or list): same shape as `recalls` + """ + if isinstance(iou_thrs, np.ndarray): + _iou_thrs = iou_thrs.tolist() + else: + _iou_thrs = iou_thrs + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot(_iou_thrs + [1.0], _recalls + [0.]) + plt.xlabel('IoU') + plt.ylabel('Recall') + plt.axis([iou_thrs.min(), 1, 0, 1]) + f.show() diff --git a/mmdetection/mmdet/evaluation/functional/ytvis.py b/mmdetection/mmdet/evaluation/functional/ytvis.py new file mode 100644 index 0000000..c65a7e9 --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/ytvis.py @@ -0,0 +1,305 @@ +# Copyright (c) Github URL +# Copied from +# https://github.com/youtubevos/cocoapi/blob/master/PythonAPI/pycocotools/ytvos.py +__author__ = 'ychfan' +# Interface for accessing the YouTubeVIS dataset. + +# The following API functions are defined: +# YTVIS - YTVIS api class that loads YouTubeVIS annotation file +# and prepare data structures. +# decodeMask - Decode binary mask M encoded via run-length encoding. +# encodeMask - Encode binary mask M using run-length encoding. +# getAnnIds - Get ann ids that satisfy given filter conditions. +# getCatIds - Get cat ids that satisfy given filter conditions. +# getImgIds - Get img ids that satisfy given filter conditions. +# loadAnns - Load anns with the specified ids. +# loadCats - Load cats with the specified ids. +# loadImgs - Load imgs with the specified ids. +# annToMask - Convert segmentation in an annotation to binary mask. +# loadRes - Load algorithm results and create API for accessing them. + +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. +# Licensed under the Simplified BSD License [see bsd.txt] + +import copy +import itertools +import json +import sys +import time +from collections import defaultdict + +import numpy as np +from pycocotools import mask as maskUtils + +PYTHON_VERSION = sys.version_info[0] + + +def _isArrayLike(obj): + return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + + +class YTVIS: + + def __init__(self, annotation_file=None): + """Constructor of Microsoft COCO helper class for reading and + visualizing annotations. + + :param annotation_file (str | dict): location of annotation file or + dict results. + :param image_folder (str): location to the folder that hosts images. 
+ :return: + """ + # load dataset + self.dataset, self.anns, self.cats, self.vids = dict(), dict(), dict( + ), dict() + self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list) + if annotation_file is not None: + print('loading annotations into memory...') + tic = time.time() + if type(annotation_file) == str: + dataset = json.load(open(annotation_file, 'r')) + else: + dataset = annotation_file + assert type( + dataset + ) == dict, 'annotation file format {} not supported'.format( + type(dataset)) + print('Done (t={:0.2f}s)'.format(time.time() - tic)) + self.dataset = dataset + self.createIndex() + + def createIndex(self): + # create index + print('creating index...') + anns, cats, vids = {}, {}, {} + vidToAnns, catToVids = defaultdict(list), defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + vidToAnns[ann['video_id']].append(ann) + anns[ann['id']] = ann + + if 'videos' in self.dataset: + for vid in self.dataset['videos']: + vids[vid['id']] = vid + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + catToVids[ann['category_id']].append(ann['video_id']) + + print('index created!') + + # create class members + self.anns = anns + self.vidToAnns = vidToAnns + self.catToVids = catToVids + self.vids = vids + self.cats = cats + + def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None): + """Get ann ids that satisfy given filter conditions. default skips that + filter. + + :param vidIds (int array) : get anns for given vids + catIds (int array) : get anns for given cats + areaRng (float array) : get anns for given area range + iscrowd (boolean) : get anns for given crowd label + :return: ids (int array) : integer array of ann ids + """ + vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(vidIds) == len(catIds) == len(areaRng) == 0: + anns = self.dataset['annotations'] + else: + if not len(vidIds) == 0: + lists = [ + self.vidToAnns[vidId] for vidId in vidIds + if vidId in self.vidToAnns + ] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.dataset['annotations'] + anns = anns if len(catIds) == 0 else [ + ann for ann in anns if ann['category_id'] in catIds + ] + anns = anns if len(areaRng) == 0 else [ + ann for ann in anns if ann['avg_area'] > areaRng[0] + and ann['avg_area'] < areaRng[1] + ] + if iscrowd is not None: + ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + else: + ids = [ann['id'] for ann in anns] + return ids + + def getCatIds(self, catNms=[], supNms=[], catIds=[]): + """filtering parameters. default skips that filter. 
+ + :param catNms (str array) : get cats for given cat names + :param supNms (str array) : get cats for given supercategory names + :param catIds (int array) : get cats for given cat ids + :return: ids (int array) : integer array of cat ids + """ + catNms = catNms if _isArrayLike(catNms) else [catNms] + supNms = supNms if _isArrayLike(supNms) else [supNms] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(catNms) == len(supNms) == len(catIds) == 0: + cats = self.dataset['categories'] + else: + cats = self.dataset['categories'] + cats = cats if len(catNms) == 0 else [ + cat for cat in cats if cat['name'] in catNms + ] + cats = cats if len(supNms) == 0 else [ + cat for cat in cats if cat['supercategory'] in supNms + ] + cats = cats if len(catIds) == 0 else [ + cat for cat in cats if cat['id'] in catIds + ] + ids = [cat['id'] for cat in cats] + return ids + + def getVidIds(self, vidIds=[], catIds=[]): + """Get vid ids that satisfy given filter conditions. + + :param vidIds (int array) : get vids for given ids + :param catIds (int array) : get vids with all given cats + :return: ids (int array) : integer array of vid ids + """ + vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(vidIds) == len(catIds) == 0: + ids = self.vids.keys() + else: + ids = set(vidIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(self.catToVids[catId]) + else: + ids &= set(self.catToVids[catId]) + return list(ids) + + def loadAnns(self, ids=[]): + """Load anns with the specified ids. + + :param ids (int array) : integer ids specifying anns + :return: anns (object array) : loaded ann objects + """ + if _isArrayLike(ids): + return [self.anns[id] for id in ids] + elif type(ids) == int: + return [self.anns[ids]] + + def loadCats(self, ids=[]): + """Load cats with the specified ids. + + :param ids (int array) : integer ids specifying cats + :return: cats (object array) : loaded cat objects + """ + if _isArrayLike(ids): + return [self.cats[id] for id in ids] + elif type(ids) == int: + return [self.cats[ids]] + + def loadVids(self, ids=[]): + """Load anns with the specified ids. + + :param ids (int array) : integer ids specifying vid + :return: vids (object array) : loaded vid objects + """ + if _isArrayLike(ids): + return [self.vids[id] for id in ids] + elif type(ids) == int: + return [self.vids[ids]] + + def loadRes(self, resFile): + """Load result file and return a result api object. 
+ + :param resFile (str) : file name of result file + :return: res (obj) : result api object + """ + res = YTVIS() + res.dataset['videos'] = [img for img in self.dataset['videos']] + + print('Loading and preparing results...') + tic = time.time() + if type(resFile) == str or (PYTHON_VERSION == 2 + and type(resFile) == str): + anns = json.load(open(resFile)) + elif type(resFile) == np.ndarray: + anns = self.loadNumpyAnnotations(resFile) + else: + anns = resFile + assert type(anns) == list, 'results in not an array of objects' + annsVidIds = [ann['video_id'] for ann in anns] + assert set(annsVidIds) == (set(annsVidIds) & set(self.getVidIds())), \ + 'Results do not correspond to current coco set' + if 'segmentations' in anns[0]: + res.dataset['categories'] = copy.deepcopy( + self.dataset['categories']) + for id, ann in enumerate(anns): + ann['areas'] = [] + if 'bboxes' not in ann: + ann['bboxes'] = [] + for seg in ann['segmentations']: + # now only support compressed RLE format + # as segmentation results + if seg: + ann['areas'].append(maskUtils.area(seg)) + if len(ann['bboxes']) < len(ann['areas']): + ann['bboxes'].append(maskUtils.toBbox(seg)) + else: + ann['areas'].append(None) + if len(ann['bboxes']) < len(ann['areas']): + ann['bboxes'].append(None) + ann['id'] = id + 1 + l_ori = [a for a in ann['areas'] if a] + if len(l_ori) == 0: + ann['avg_area'] = 0 + else: + ann['avg_area'] = np.array(l_ori).mean() + ann['iscrowd'] = 0 + print('DONE (t={:0.2f}s)'.format(time.time() - tic)) + + res.dataset['annotations'] = anns + res.createIndex() + return res + + def annToRLE(self, ann, frameId): + """Convert annotation which can be polygons, uncompressed RLE to RLE. + + :return: binary mask (numpy 2D array) + """ + t = self.vids[ann['video_id']] + h, w = t['height'], t['width'] + segm = ann['segmentations'][frameId] + if type(segm) == list: + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(segm, h, w) + rle = maskUtils.merge(rles) + elif type(segm['counts']) == list: + # uncompressed RLE + rle = maskUtils.frPyObjects(segm, h, w) + else: + # rle + rle = segm + return rle + + def annToMask(self, ann, frameId): + """Convert annotation which can be polygons, uncompressed RLE, or RLE + to binary mask. + + :return: binary mask (numpy 2D array) + """ + rle = self.annToRLE(ann, frameId) + m = maskUtils.decode(rle) + return m diff --git a/mmdetection/mmdet/evaluation/functional/ytviseval.py b/mmdetection/mmdet/evaluation/functional/ytviseval.py new file mode 100644 index 0000000..fdaf110 --- /dev/null +++ b/mmdetection/mmdet/evaluation/functional/ytviseval.py @@ -0,0 +1,623 @@ +# Copyright (c) Github URL +# Copied from +# https://github.com/youtubevos/cocoapi/blob/master/PythonAPI/pycocotools/ytvoseval.py +__author__ = 'ychfan' + +import copy +import datetime +import time +from collections import defaultdict + +import numpy as np +from pycocotools import mask as maskUtils + + +class YTVISeval: + # Interface for evaluating video instance segmentation on + # the YouTubeVIS dataset. + # + # The usage for YTVISeval is as follows: + # cocoGt=..., cocoDt=... # load dataset and results + # E = YTVISeval(cocoGt,cocoDt); # initialize YTVISeval object + # E.params.recThrs = ...; # set parameters as desired + # E.evaluate(); # run per image evaluation + # E.accumulate(); # accumulate per image results + # E.summarize(); # display summary metrics of results + # For example usage see evalDemo.m and http://mscoco.org/. 
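+    #
+    # Editor's note: a minimal end-to-end sketch (illustrative only, not
+    # part of the upstream file), assuming ``gt.json`` / ``results.json``
+    # follow the YouTubeVIS annotation and result formats and that the
+    # ``YTVIS`` loader from ytvis.py in this directory is used:
+    #
+    #   ytvisGt = YTVIS('gt.json')                 # ground truth
+    #   ytvisDt = ytvisGt.loadRes('results.json')  # tracked detections
+    #   E = YTVISeval(ytvisGt, ytvisDt, iouType='segm')
+    #   E.evaluate()     # per-video, per-category matching
+    #   E.accumulate()   # build precision/recall tensors
+    #   E.summarize()    # print AP/AR summary; values stored in E.stats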
+ # + # The evaluation parameters are as follows (defaults in brackets): + # imgIds - [all] N img ids to use for evaluation + # catIds - [all] K cat ids to use for evaluation + # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation + # recThrs - [0:.01:1] R=101 recall thresholds for evaluation + # areaRng - [...] A=4 object area ranges for evaluation + # maxDets - [1 10 100] M=3 thresholds on max detections per image + # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' + # iouType replaced the now DEPRECATED useSegm parameter. + # useCats - [1] if true use category labels for evaluation + # Note: if useCats=0 category labels are ignored as in proposal scoring. + # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. + # + # evaluate(): evaluates detections on every image and every category and + # concats the results into the "evalImgs" with fields: + # dtIds - [1xD] id for each of the D detections (dt) + # gtIds - [1xG] id for each of the G ground truths (gt) + # dtMatches - [TxD] matching gt id at each IoU or 0 + # gtMatches - [TxG] matching dt id at each IoU or 0 + # dtScores - [1xD] confidence of each dt + # gtIgnore - [1xG] ignore flag for each gt + # dtIgnore - [TxD] ignore flag for each dt at each IoU + # + # accumulate(): accumulates the per-image, per-category evaluation + # results in "evalImgs" into the dictionary "eval" with fields: + # params - parameters used for evaluation + # date - date evaluation was performed + # counts - [T,R,K,A,M] parameter dimensions (see above) + # precision - [TxRxKxAxM] precision for every evaluation setting + # recall - [TxKxAxM] max recall for every evaluation setting + # Note: precision and recall==-1 for settings with no gt objects. + # + # See also coco, mask, pycocoDemo, pycocoEvalDemo + # + # Microsoft COCO Toolbox. version 2.0 + # Data, paper, and tutorials available at: http://mscoco.org/ + # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. + # Licensed under the Simplified BSD License [see coco/license.txt] + def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): + """Initialize CocoEval using coco APIs for gt and dt. + + :param cocoGt: coco object with ground truth annotations + :param cocoDt: coco object with detection results + :return: None + """ + if not iouType: + print('iouType not specified. 
use default iouType segm') + self.cocoGt = cocoGt # ground truth COCO API + self.cocoDt = cocoDt # detections COCO API + self.params = {} # evaluation parameters + self.evalVids = defaultdict( + list) # per-image per-category evaluation results [KxAxI] elements + self.eval = {} # accumulated evaluation results + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self.params = Params(iouType=iouType) # parameters + self._paramsEval = {} # parameters for evaluation + self.stats = [] # result summarization + self.ious = {} # ious between all gts and dts + if cocoGt is not None: + self.params.vidIds = sorted(cocoGt.getVidIds()) + self.params.catIds = sorted(cocoGt.getCatIds()) + + def _prepare(self): + ''' + Prepare ._gts and ._dts for evaluation based on params + :return: None + ''' + + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + for i, a in enumerate(ann['segmentations']): + if a: + rle = coco.annToRLE(ann, i) + ann['segmentations'][i] = rle + l_ori = [a for a in ann['areas'] if a] + if len(l_ori) == 0: + ann['avg_area'] = 0 + else: + ann['avg_area'] = np.array(l_ori).mean() + + p = self.params + if p.useCats: + gts = self.cocoGt.loadAnns( + self.cocoGt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds)) + dts = self.cocoDt.loadAnns( + self.cocoDt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds)) + else: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds)) + + # convert ground truth to mask if iouType == 'segm' + if p.iouType == 'segm': + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + # set ignore flag + for gt in gts: + gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 + gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] + if p.iouType == 'keypoints': + gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + for gt in gts: + self._gts[gt['video_id'], gt['category_id']].append(gt) + for dt in dts: + self._dts[dt['video_id'], dt['category_id']].append(dt) + self.evalVids = defaultdict( + list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + ''' + Run per image evaluation on given images and store + results (a list of dict) in self.evalVids + :return: None + ''' + tic = time.time() + print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'. 
+ format(p.iouType)) + print('Evaluate annotation type *{}*'.format(p.iouType)) + p.vidIds = list(np.unique(p.vidIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = {(vidId, catId): computeIoU(vidId, catId) + for vidId in p.vidIds for catId in catIds} + + evaluateVid = self.evaluateVid + maxDet = p.maxDets[-1] + + self.evalImgs = [ + evaluateVid(vidId, catId, areaRng, maxDet) for catId in catIds + for areaRng in p.areaRng for vidId in p.vidIds + ] + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc - tic)) + + def computeIoU(self, vidId, catId): + p = self.params + if p.useCats: + gt = self._gts[vidId, catId] + dt = self._dts[vidId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[vidId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[vidId, cId]] + if len(gt) == 0 and len(dt) == 0: + return [] + inds = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0:p.maxDets[-1]] + + if p.iouType == 'segm': + g = [g['segmentations'] for g in gt] + d = [d['segmentations'] for d in dt] + elif p.iouType == 'bbox': + g = [g['bboxes'] for g in gt] + d = [d['bboxes'] for d in dt] + else: + raise Exception('unknown iouType for iou computation') + + # compute iou between each dt and gt region + + def iou_seq(d_seq, g_seq): + i = .0 + u = .0 + for d, g in zip(d_seq, g_seq): + if d and g: + i += maskUtils.area(maskUtils.merge([d, g], True)) + u += maskUtils.area(maskUtils.merge([d, g], False)) + elif not d and g: + u += maskUtils.area(g) + elif d and not g: + u += maskUtils.area(d) + if not u > .0: + print('Mask sizes in video {} and category {} may not match!'. 
+ format(vidId, catId)) + iou = i / u if u > .0 else .0 + return iou + + ious = np.zeros([len(d), len(g)]) + for i, j in np.ndindex(ious.shape): + ious[i, j] = iou_seq(d[i], g[j]) + + return ious + + def computeOks(self, imgId, catId): + p = self.params + + gts = self._gts[imgId, catId] + dts = self._dts[imgId, catId] + inds = np.argsort([-d['score'] for d in dts], kind='mergesort') + dts = [dts[i] for i in inds] + if len(dts) > p.maxDets[-1]: + dts = dts[0:p.maxDets[-1]] + # if len(gts) == 0 and len(dts) == 0: + if len(gts) == 0 or len(dts) == 0: + return [] + ious = np.zeros((len(dts), len(gts))) + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + k = len(sigmas) + # compute oks between each detection and ground truth object + for j, gt in enumerate(gts): + # create bounds for ignore regions(double the gt bbox) + g = np.array(gt['keypoints']) + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + k1 = np.count_nonzero(vg > 0) + bb = gt['bbox'] + x0 = bb[0] - bb[2] + x1 = bb[0] + bb[2] * 2 + y0 = bb[1] - bb[3] + y1 = bb[1] + bb[3] * 2 + for i, dt in enumerate(dts): + d = np.array(dt['keypoints']) + xd = d[0::3] + yd = d[1::3] + if k1 > 0: + # measure the per-keypoint distance if keypoints visible + dx = xd - xg + dy = yd - yg + else: + # measure minimum distance to keypoints + z = np.zeros((k)) + dx = np.max((z, x0 - xd), axis=0) + np.max( + (z, xd - x1), axis=0) + dy = np.max((z, y0 - yd), axis=0) + np.max( + (z, yd - y1), axis=0) + e = (dx**2 + dy**2) / vars / (gt['avg_area'] + + np.spacing(1)) / 2 + if k1 > 0: + e = e[vg > 0] + ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] + return ious + + def evaluateVid(self, vidId, catId, aRng, maxDet): + ''' + perform evaluation for single category and image + :return: dict (single image results) + ''' + p = self.params + if p.useCats: + gt = self._gts[vidId, catId] + dt = self._dts[vidId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[vidId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[vidId, cId]] + if len(gt) == 0 and len(dt) == 0: + return None + + for g in gt: + if g['ignore'] or (g['avg_area'] < aRng[0] + or g['avg_area'] > aRng[1]): + g['_ignore'] = 1 + else: + g['_ignore'] = 0 + + # sort dt highest score first, sort gt ignore last + gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in dtind[0:maxDet]] + iscrowd = [int(o['iscrowd']) for o in gt] + # load computed ious + ious = self.ious[vidId, catId][:, gtind] if len( + self.ious[vidId, catId]) > 0 else self.ious[vidId, catId] + + T = len(p.iouThrs) + G = len(gt) + D = len(dt) + gtm = np.zeros((T, G)) + dtm = np.zeros((T, D)) + gtIg = np.array([g['_ignore'] for g in gt]) + dtIg = np.zeros((T, D)) + if not len(ious) == 0: + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t, 1 - 1e-10]) + m = -1 + for gind, g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1: + break + # continue to next gt unless better match made + if ious[dind, gind] < iou: + continue + # if match successful and best so far, + # store appropriately + iou = ious[dind, gind] + m = gind + # if match made 
store id of match for both dt and gt + if m == -1: + continue + dtIg[tind, dind] = gtIg[m] + dtm[tind, dind] = gt[m]['id'] + gtm[tind, m] = d['id'] + # set unmatched detections outside of area range to ignore + a = np.array([ + d['avg_area'] < aRng[0] or d['avg_area'] > aRng[1] for d in dt + ]).reshape((1, len(dt))) + dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, + 0))) + # store results for given image and category + return { + 'video_id': vidId, + 'category_id': catId, + 'aRng': aRng, + 'maxDet': maxDet, + 'dtIds': [d['id'] for d in dt], + 'gtIds': [g['id'] for g in gt], + 'dtMatches': dtm, + 'gtMatches': gtm, + 'dtScores': [d['score'] for d in dt], + 'gtIgnore': gtIg, + 'dtIgnore': dtIg, + } + + def accumulate(self, p=None): + """Accumulate per image evaluation results and store the result in + self.eval. + + :param p: input params for evaluation + :return: None + """ + print('Accumulating evaluation results...') + tic = time.time() + if not self.evalImgs: + print('Please run evaluate() first') + # allows input customized parameters + if p is None: + p = self.params + p.catIds = p.catIds if p.useCats == 1 else [-1] + T = len(p.iouThrs) + R = len(p.recThrs) + K = len(p.catIds) if p.useCats else 1 + A = len(p.areaRng) + M = len(p.maxDets) + precision = -np.ones( + (T, R, K, A, M)) # -1 for the precision of absent categories + recall = -np.ones((T, K, A, M)) + scores = -np.ones((T, R, K, A, M)) + + # create dictionary for future indexing + _pe = self._paramsEval + catIds = _pe.catIds if _pe.useCats else [-1] + setK = set(catIds) + setA = set(map(tuple, _pe.areaRng)) + setM = set(_pe.maxDets) + setI = set(_pe.vidIds) + # get inds to evaluate + k_list = [n for n, k in enumerate(p.catIds) if k in setK] + m_list = [m for n, m in enumerate(p.maxDets) if m in setM] + a_list = [ + n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) + if a in setA + ] + i_list = [n for n, i in enumerate(p.vidIds) if i in setI] + I0 = len(_pe.vidIds) + A0 = len(_pe.areaRng) + # retrieve E at each category, area range, and max number of detections + for k, k0 in enumerate(k_list): + Nk = k0 * A0 * I0 + for a, a0 in enumerate(a_list): + Na = a0 * I0 + for m, maxDet in enumerate(m_list): + E = [self.evalImgs[Nk + Na + i] for i in i_list] + E = [e for e in E if e is not None] + if len(E) == 0: + continue + dtScores = np.concatenate( + [e['dtScores'][0:maxDet] for e in E]) + + inds = np.argsort(-dtScores, kind='mergesort') + dtScoresSorted = dtScores[inds] + + dtm = np.concatenate( + [e['dtMatches'][:, 0:maxDet] for e in E], axis=1)[:, + inds] + dtIg = np.concatenate( + [e['dtIgnore'][:, 0:maxDet] for e in E], axis=1)[:, + inds] + gtIg = np.concatenate([e['gtIgnore'] for e in E]) + npig = np.count_nonzero(gtIg == 0) + if npig == 0: + continue + tps = np.logical_and(dtm, np.logical_not(dtIg)) + fps = np.logical_and( + np.logical_not(dtm), np.logical_not(dtIg)) + + tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) + for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): + tp = np.array(tp) + fp = np.array(fp) + nd_ori = len(tp) + rc = tp / npig + pr = tp / (fp + tp + np.spacing(1)) + q = np.zeros((R, )) + ss = np.zeros((R, )) + + if nd_ori: + recall[t, k, a, m] = rc[-1] + else: + recall[t, k, a, m] = 0 + + # use python array gets significant speed improvement + pr = pr.tolist() + q = q.tolist() + + for i in range(nd_ori - 1, 0, -1): + if pr[i] > pr[i - 1]: + pr[i - 1] = pr[i] + + inds = np.searchsorted(rc, p.recThrs, side='left') + 
try: + for ri, pi in enumerate(inds): + q[ri] = pr[pi] + ss[ri] = dtScoresSorted[pi] + except Exception: + pass + precision[t, :, k, a, m] = np.array(q) + scores[t, :, k, a, m] = np.array(ss) + self.eval = { + 'params': p, + 'counts': [T, R, K, A, M], + 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'precision': precision, + 'recall': recall, + 'scores': scores, + } + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc - tic)) + + def summarize(self): + """Compute and display summary metrics for evaluation results. + + Note this function can *only* be applied on the default parameter + setting + """ + + def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100): + p = self.params + iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | ' \ + 'maxDets={:>3d} ] = {:0.3f}' + titleStr = 'Average Precision' if ap == 1 else 'Average Recall' + typeStr = '(AP)' if ap == 1 else '(AR)' + iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ + if iouThr is None else '{:0.2f}'.format(iouThr) + + aind = [ + i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng + ] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval['precision'] + # IoU + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, :, aind, mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval['recall'] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + if len(s[s > -1]) == 0: + mean_s = -1 + else: + mean_s = np.mean(s[s > -1]) + print( + iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, + mean_s)) + return mean_s + + def _summarizeDets(): + stats = np.zeros((12, )) + stats[0] = _summarize(1) + stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) + stats[2] = _summarize( + 1, iouThr=.75, maxDets=self.params.maxDets[2]) + stats[3] = _summarize( + 1, areaRng='small', maxDets=self.params.maxDets[2]) + stats[4] = _summarize( + 1, areaRng='medium', maxDets=self.params.maxDets[2]) + stats[5] = _summarize( + 1, areaRng='large', maxDets=self.params.maxDets[2]) + stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize( + 0, areaRng='small', maxDets=self.params.maxDets[2]) + stats[10] = _summarize( + 0, areaRng='medium', maxDets=self.params.maxDets[2]) + stats[11] = _summarize( + 0, areaRng='large', maxDets=self.params.maxDets[2]) + return stats + + def _summarizeKps(): + stats = np.zeros((10, )) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=.5) + stats[2] = _summarize(1, maxDets=20, iouThr=.75) + stats[3] = _summarize(1, maxDets=20, areaRng='medium') + stats[4] = _summarize(1, maxDets=20, areaRng='large') + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=.5) + stats[7] = _summarize(0, maxDets=20, iouThr=.75) + stats[8] = _summarize(0, maxDets=20, areaRng='medium') + stats[9] = _summarize(0, maxDets=20, areaRng='large') + return stats + + if not self.eval: + raise Exception('Please run accumulate() first') + iouType = self.params.iouType + if iouType == 'segm' or iouType == 'bbox': + summarize = _summarizeDets + elif iouType == 'keypoints': + summarize = _summarizeKps + self.stats = summarize() + + def __str__(self): + self.summarize() + + +class Params: + """Params for coco evaluation api.""" + + def 
setDetParams(self): + self.vidIds = [] + self.catIds = [] + # np.arange causes trouble. the data point on arange + # is slightly larger than the true value + self.iouThrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.recThrs = np.linspace( + .0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) + self.maxDets = [1, 10, 100] + self.areaRng = [[0**2, 1e5**2], [0**2, 128**2], [128**2, 256**2], + [256**2, 1e5**2]] + self.areaRngLbl = ['all', 'small', 'medium', 'large'] + self.useCats = 1 + + def setKpParams(self): + self.vidIds = [] + self.catIds = [] + # np.arange causes trouble. the data point on arange + # is slightly larger than the true value + self.iouThrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.recThrs = np.linspace( + .0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) + self.maxDets = [20] + self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]] + self.areaRngLbl = ['all', 'medium', 'large'] + self.useCats = 1 + + def __init__(self, iouType='segm'): + if iouType == 'segm' or iouType == 'bbox': + self.setDetParams() + elif iouType == 'keypoints': + self.setKpParams() + else: + raise Exception('iouType not supported') + self.iouType = iouType + # useSegm is deprecated + self.useSegm = None diff --git a/mmdetection/mmdet/evaluation/metrics/__init__.py b/mmdetection/mmdet/evaluation/metrics/__init__.py new file mode 100644 index 0000000..e1ec0e4 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_video_metric import BaseVideoMetric +from .cityscapes_metric import CityScapesMetric +from .coco_caption_metric import COCOCaptionMetric +from .coco_metric import CocoMetric +from .coco_occluded_metric import CocoOccludedSeparatedMetric +from .coco_panoptic_metric import CocoPanopticMetric +from .coco_video_metric import CocoVideoMetric +from .crowdhuman_metric import CrowdHumanMetric +from .dump_det_results import DumpDetResults +from .dump_proposals_metric import DumpProposals +from .lvis_metric import LVISMetric +from .mot_challenge_metric import MOTChallengeMetric +from .openimages_metric import OpenImagesMetric +from .refseg_metric import RefSegMetric +from .reid_metric import ReIDMetrics +from .semseg_metric import SemSegMetric +from .voc_metric import VOCMetric +from .youtube_vis_metric import YouTubeVISMetric + +__all__ = [ + 'CityScapesMetric', 'CocoMetric', 'CocoPanopticMetric', 'OpenImagesMetric', + 'VOCMetric', 'LVISMetric', 'CrowdHumanMetric', 'DumpProposals', + 'CocoOccludedSeparatedMetric', 'DumpDetResults', 'BaseVideoMetric', + 'MOTChallengeMetric', 'CocoVideoMetric', 'ReIDMetrics', 'YouTubeVISMetric', + 'COCOCaptionMetric', 'SemSegMetric', 'RefSegMetric' +] diff --git a/mmdetection/mmdet/evaluation/metrics/base_video_metric.py b/mmdetection/mmdet/evaluation/metrics/base_video_metric.py new file mode 100644 index 0000000..90c7cdc --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/base_video_metric.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
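+#
+# Editor's note: the commented block below is an illustrative sketch added
+# alongside this patch; it is not part of the upstream module. A concrete
+# video metric (e.g. the YouTubeVIS metric registered in this package) is
+# expected to subclass ``BaseVideoMetric`` and implement ``process_video``
+# and ``compute_metrics``, roughly like this hypothetical ``ToyVideoMetric``:
+#
+#     class ToyVideoMetric(BaseVideoMetric):
+#         default_prefix = 'toy'
+#
+#         def process_video(self, data_samples):
+#             # append one picklable dict per video; these are gathered
+#             # across ranks by ``collect_tracking_results`` below
+#             self.results.append({'num_frames': len(data_samples)})
+#
+#         def compute_metrics(self, results):
+#             n = sum(r['num_frames'] for r in results)
+#             return {'mean_num_frames': n / len(results)}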
+import os.path as osp +import pickle +import shutil +import tempfile +import warnings +from typing import Optional, Sequence + +import torch +from mmengine.dist import (barrier, broadcast, broadcast_object_list, + get_dist_info, is_main_process) +from mmengine.evaluator import BaseMetric +from mmengine.utils import mkdir_or_exist + + +class BaseVideoMetric(BaseMetric): + """Base class for a metric in video task. + + The metric first processes each batch of data_samples and predictions, + and appends the processed results to the results list. Then it + collects all results together from all ranks if distributed training + is used. Finally, it computes the metrics of the entire dataset. + + A subclass of class:`BaseVideoMetric` should assign a meaningful value + to the class attribute `default_prefix`. See the argument `prefix` for + details. + """ + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for track_data_sample in data_samples: + video_data_samples = track_data_sample['video_data_samples'] + ori_video_len = video_data_samples[0].ori_video_length + if ori_video_len == len(video_data_samples): + # video process + self.process_video(video_data_samples) + else: + # image process + self.process_image(video_data_samples, ori_video_len) + + def evaluate(self, size: int = 1) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + warnings.warn( + f'{self.__class__.__name__} got empty `self.results`. Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.') + + results = collect_tracking_results(self.results, self.collect_device) + + if is_main_process(): + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] + + +def collect_tracking_results(results: list, + device: str = 'cpu', + tmpdir: Optional[str] = None) -> Optional[list]: + """Collected results in distributed environments. different from the + function mmengine.dist.collect_results, tracking compute metrics don't use + paramenter size, which means length of the entire validation dataset. + because it's equal to video num, but compute metrics need image num. + + Args: + results (list): Result list containing result parts to be + collected. Each item of ``result_part`` should be a picklable + object. + device (str): Device name. Optional values are 'cpu' and 'gpu'. + tmpdir (str | None): Temporal directory for collected results to + store. If set to None, it will create a temporal directory for it. + ``tmpdir`` should be None when device is 'gpu'. Defaults to None. 
+ + Returns: + list or None: The collected results. + """ + if device not in ['gpu', 'cpu']: + raise NotImplementedError( + f"device must be 'cpu' or 'gpu', but got {device}") + + if device == 'gpu': + assert tmpdir is None, 'tmpdir should be None when device is "gpu"' + raise NotImplementedError('GPU collecting has not been supported yet') + else: + return collect_tracking_results_cpu(results, tmpdir) + + +def collect_tracking_results_cpu(result_part: list, + tmpdir: Optional[str] = None + ) -> Optional[list]: + """Collect results on cpu mode. + + Saves the results on different gpus to 'tmpdir' and collects them by the + rank 0 worker. + + Args: + result_part (list): The part of prediction results. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. If is None, use `tempfile.mkdtemp()` + to make a temporary path. Defaults to None. + + Returns: + list or None: The collected results. + """ + rank, world_size = get_dist_info() + if world_size == 1: + return result_part + + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8) + if rank == 0: + mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8) + dir_tensor[:len(tmpdir)] = tmpdir + broadcast(dir_tensor, 0) + tmpdir = dir_tensor.numpy().tobytes().decode().rstrip() + else: + mkdir_or_exist(tmpdir) + + # dump the part result to the dir + with open(osp.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f: # type: ignore + pickle.dump(result_part, f, protocol=2) + + barrier() + + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + path = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore + with open(path, 'rb') as f: + part_list.extend(pickle.load(f)) + shutil.rmtree(tmpdir) + return part_list diff --git a/mmdetection/mmdet/evaluation/metrics/cityscapes_metric.py b/mmdetection/mmdet/evaluation/metrics/cityscapes_metric.py new file mode 100644 index 0000000..e5cdc17 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/cityscapes_metric.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +import tempfile +from collections import OrderedDict +from typing import Dict, Optional, Sequence + +import mmcv +import numpy as np +from mmengine.dist import is_main_process +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS + +try: + import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval # noqa: E501 + import cityscapesscripts.helpers.labels as CSLabels + + from mmdet.evaluation.functional import evaluateImgLists + HAS_CITYSCAPESAPI = True +except ImportError: + HAS_CITYSCAPESAPI = False + + +@METRICS.register_module() +class CityScapesMetric(BaseMetric): + """CityScapes metric for instance segmentation. + + Args: + outfile_prefix (str): The prefix of txt and png files. The txt and + png file will be save in a directory whose path is + "outfile_prefix.results/". + seg_prefix (str, optional): Path to the directory which contains the + cityscapes instance segmentation masks. It's necessary when + training and validation. It could be None when infer on test + dataset. Defaults to None. + format_only (bool): Format the output results without perform + evaluation. 
It is useful when you want to format the result + to a specific format and submit it to the test server. + Defaults to False. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + dump_matches (bool): Whether dump matches.json file during evaluating. + Defaults to False. + file_client_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + """ + default_prefix: Optional[str] = 'cityscapes' + + def __init__(self, + outfile_prefix: str, + seg_prefix: Optional[str] = None, + format_only: bool = False, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + dump_matches: bool = False, + file_client_args: dict = None, + backend_args: dict = None) -> None: + + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + super().__init__(collect_device=collect_device, prefix=prefix) + + self.tmp_dir = None + self.format_only = format_only + if self.format_only: + assert outfile_prefix is not None, 'outfile_prefix must be not' + 'None when format_only is True, otherwise the result files will' + 'be saved to a temp directory which will be cleaned up at the end.' + else: + assert seg_prefix is not None, '`seg_prefix` is necessary when ' + 'computing the CityScapes metrics' + + if outfile_prefix is None: + self.tmp_dir = tempfile.TemporaryDirectory() + self.outfile_prefix = osp.join(self.tmp_dir.name, 'results') + else: + # the directory to save predicted panoptic segmentation mask + self.outfile_prefix = osp.join(outfile_prefix, 'results') # type: ignore # yapf: disable # noqa: E501 + + dir_name = osp.expanduser(self.outfile_prefix) + + if osp.exists(dir_name) and is_main_process(): + logger: MMLogger = MMLogger.get_current_instance() + logger.info('remove previous results.') + shutil.rmtree(dir_name) + os.makedirs(dir_name, exist_ok=True) + + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + + self.seg_prefix = seg_prefix + self.dump_matches = dump_matches + + def __del__(self) -> None: + """Clean up the results if necessary.""" + if self.tmp_dir is not None: + self.tmp_dir.cleanup() + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. 
+ """ + for data_sample in data_samples: + # parse pred + result = dict() + pred = data_sample['pred_instances'] + filename = data_sample['img_path'] + basename = osp.splitext(osp.basename(filename))[0] + pred_txt = osp.join(self.outfile_prefix, basename + '_pred.txt') + result['pred_txt'] = pred_txt + labels = pred['labels'].cpu().numpy() + masks = pred['masks'].cpu().numpy().astype(np.uint8) + if 'mask_scores' in pred: + # some detectors use different scores for bbox and mask + mask_scores = pred['mask_scores'].cpu().numpy() + else: + mask_scores = pred['scores'].cpu().numpy() + + with open(pred_txt, 'w') as f: + for i, (label, mask, mask_score) in enumerate( + zip(labels, masks, mask_scores)): + class_name = self.dataset_meta['classes'][label] + class_id = CSLabels.name2label[class_name].id + png_filename = osp.join( + self.outfile_prefix, + basename + f'_{i}_{class_name}.png') + mmcv.imwrite(mask, png_filename) + f.write(f'{osp.basename(png_filename)} ' + f'{class_id} {mask_score}\n') + + # parse gt + gt = dict() + img_path = filename.replace('leftImg8bit.png', + 'gtFine_instanceIds.png') + gt['file_name'] = img_path.replace('leftImg8bit', 'gtFine') + + self.results.append((gt, result)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + if self.format_only: + logger.info( + f'results are saved to {osp.dirname(self.outfile_prefix)}') + return OrderedDict() + logger.info('starts to compute metric') + + gts, preds = zip(*results) + # set global states in cityscapes evaluation API + gt_instances_file = osp.join(self.outfile_prefix, 'gtInstances.json') # type: ignore # yapf: disable # noqa: E501 + # split gt and prediction list + gts, preds = zip(*results) + CSEval.args.JSONOutput = False + CSEval.args.colorized = False + CSEval.args.gtInstancesFile = gt_instances_file + + groundTruthImgList = [gt['file_name'] for gt in gts] + predictionImgList = [pred['pred_txt'] for pred in preds] + CSEval_results = evaluateImgLists( + predictionImgList, + groundTruthImgList, + CSEval.args, + self.backend_args, + dump_matches=self.dump_matches)['averages'] + + eval_results = OrderedDict() + eval_results['mAP'] = CSEval_results['allAp'] + eval_results['AP@50'] = CSEval_results['allAp50%'] + + return eval_results diff --git a/mmdetection/mmdet/evaluation/metrics/coco_caption_metric.py b/mmdetection/mmdet/evaluation/metrics/coco_caption_metric.py new file mode 100644 index 0000000..d8c7350 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/coco_caption_metric.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os +import tempfile +from typing import List, Optional + +from mmengine.evaluator import BaseMetric +from mmengine.utils import track_iter_progress +from pycocotools.coco import COCO + +from mmdet.registry import METRICS + +try: + from pycocoevalcap.eval import COCOEvalCap +except ImportError: + COCOEvalCap = None + + +@METRICS.register_module() +class COCOCaptionMetric(BaseMetric): + """Coco Caption evaluation wrapper. + + Save the generated captions and transform into coco format. + Calling COCO API for caption metrics. + + Args: + ann_file (str): the path for the COCO format caption ground truth + json file, load for evaluations. 
+ collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Should be modified according to the + `retrieval_type` for unambiguous results. Defaults to TR. + """ + + def __init__(self, + ann_file: str, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + if COCOEvalCap is None: + raise RuntimeError( + 'COCOEvalCap is not installed, please install it by: ' + 'pip install pycocoevalcap') + + super().__init__(collect_device=collect_device, prefix=prefix) + self.ann_file = ann_file + + def process(self, data_batch, data_samples): + """Process one batch of data samples. + + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. + + Args: + data_batch: A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + + for data_sample in data_samples: + result = dict() + + result['caption'] = data_sample['pred_caption'] + result['image_id'] = int(data_sample['img_id']) + + # Save the result to `self.results`. + self.results.append(result) + + def compute_metrics(self, results: List): + """Compute the metrics from processed results. + + Args: + results (dict): The processed results of each batch. + + Returns: + Dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + # NOTICE: don't access `self.results` from the method. + + with tempfile.TemporaryDirectory() as temp_dir: + + eval_result_file = save_result( + result=results, + result_dir=temp_dir, + filename='caption_pred', + remove_duplicate='image_id', + ) + + coco_val = coco_caption_eval(eval_result_file, self.ann_file) + + return coco_val + + +def save_result(result, result_dir, filename, remove_duplicate=''): + """Saving predictions as json file for evaluation.""" + # combine results from all processes + if remove_duplicate: + result_new = [] + id_list = [] + for res in track_iter_progress(result): + if res[remove_duplicate] not in id_list: + id_list.append(res[remove_duplicate]) + result_new.append(res) + result = result_new + + final_result_file_url = os.path.join(result_dir, '%s.json' % filename) + print(f'result file saved to {final_result_file_url}') + json.dump(result, open(final_result_file_url, 'w')) + + return final_result_file_url + + +def coco_caption_eval(results_file, ann_file): + """Evaluation between gt json and prediction json files.""" + # create coco object and coco_result object + coco = COCO(ann_file) + coco_result = coco.loadRes(results_file) + + # create coco_eval object by taking coco and coco_result + coco_eval = COCOEvalCap(coco, coco_result) + + # make sure the image ids are the same + coco_eval.params['image_id'] = coco_result.getImgIds() + + # This will take some times at the first run + coco_eval.evaluate() + + # print output evaluation scores + for metric, score in coco_eval.eval.items(): + print(f'{metric}: {score:.3f}') + + return coco_eval.eval diff --git a/mmdetection/mmdet/evaluation/metrics/coco_metric.py b/mmdetection/mmdet/evaluation/metrics/coco_metric.py new file mode 100644 index 0000000..cfdc66e --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/coco_metric.py @@ 
-0,0 +1,597 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import itertools +import os.path as osp +import tempfile +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +import torch +from mmengine.evaluator import BaseMetric +from mmengine.fileio import dump, get_local_path, load +from mmengine.logging import MMLogger +from terminaltables import AsciiTable + +from mmdet.datasets.api_wrappers import COCO, COCOeval, COCOevalMP +from mmdet.registry import METRICS +from mmdet.structures.mask import encode_mask_results +from ..functional import eval_recalls + + +@METRICS.register_module() +class CocoMetric(BaseMetric): + """COCO evaluation metric. + + Evaluate AR, AP, and mAP for detection tasks including proposal/box + detection and instance segmentation. Please refer to + https://cocodataset.org/#detection-eval for more details. + + Args: + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'bbox', 'segm', 'proposal', and 'proposal_fast'. + Defaults to 'bbox'. + classwise (bool): Whether to evaluate the metric class-wise. + Defaults to False. + proposal_nums (Sequence[int]): Numbers of proposals to be evaluated. + Defaults to (100, 300, 1000). + iou_thrs (float | List[float], optional): IoU threshold to compute AP + and AR. If not specified, IoUs from 0.5 to 0.95 will be used. + Defaults to None. + metric_items (List[str], optional): Metric result names to be + recorded in the evaluation result. Defaults to None. + format_only (bool): Format the output results without perform + evaluation. It is useful when you want to format the result + to a specific format and submit it to the test server. + Defaults to False. + outfile_prefix (str, optional): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Defaults to None. + file_client_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + sort_categories (bool): Whether sort categories in annotations. Only + used for `Objects365V1Dataset`. Defaults to False. 
+ use_mp_eval (bool): Whether to use mul-processing evaluation + """ + default_prefix: Optional[str] = 'coco' + + def __init__(self, + ann_file: Optional[str] = None, + metric: Union[str, List[str]] = 'bbox', + classwise: bool = False, + proposal_nums: Sequence[int] = (100, 300, 1000), + iou_thrs: Optional[Union[float, Sequence[float]]] = None, + metric_items: Optional[Sequence[str]] = None, + format_only: bool = False, + outfile_prefix: Optional[str] = None, + file_client_args: dict = None, + backend_args: dict = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + sort_categories: bool = False, + use_mp_eval: bool = False) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + # coco evaluation metrics + self.metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast'] + for metric in self.metrics: + if metric not in allowed_metrics: + raise KeyError( + "metric should be one of 'bbox', 'segm', 'proposal', " + f"'proposal_fast', but got {metric}.") + + # do class wise evaluation, default False + self.classwise = classwise + # whether to use multi processing evaluation, default False + self.use_mp_eval = use_mp_eval + + # proposal_nums used to compute recall or precision. + self.proposal_nums = list(proposal_nums) + + # iou_thrs used to compute recall or precision. + if iou_thrs is None: + iou_thrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.iou_thrs = iou_thrs + self.metric_items = metric_items + self.format_only = format_only + if self.format_only: + assert outfile_prefix is not None, 'outfile_prefix must be not' + 'None when format_only is True, otherwise the result files will' + 'be saved to a temp directory which will be cleaned up at the end.' + + self.outfile_prefix = outfile_prefix + + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + + # if ann_file is not specified, + # initialize coco api with the converted dataset + if ann_file is not None: + with get_local_path( + ann_file, backend_args=self.backend_args) as local_path: + self._coco_api = COCO(local_path) + if sort_categories: + # 'categories' list in objects365_train.json and + # objects365_val.json is inconsistent, need sort + # list(or dict) before get cat_ids. + cats = self._coco_api.cats + sorted_cats = {i: cats[i] for i in sorted(cats)} + self._coco_api.cats = sorted_cats + categories = self._coco_api.dataset['categories'] + sorted_categories = sorted( + categories, key=lambda i: i['id']) + self._coco_api.dataset['categories'] = sorted_categories + else: + self._coco_api = None + + # handle dataset lazy init + self.cat_ids = None + self.img_ids = None + + def fast_eval_recall(self, + results: List[dict], + proposal_nums: Sequence[int], + iou_thrs: Sequence[float], + logger: Optional[MMLogger] = None) -> np.ndarray: + """Evaluate proposal recall with COCO's fast_eval_recall. + + Args: + results (List[dict]): Results of the dataset. + proposal_nums (Sequence[int]): Proposal numbers used for + evaluation. + iou_thrs (Sequence[float]): IoU thresholds used for evaluation. + logger (MMLogger, optional): Logger used for logging the recall + summary. + Returns: + np.ndarray: Averaged recall results. 
+ """ + gt_bboxes = [] + pred_bboxes = [result['bboxes'] for result in results] + for i in range(len(self.img_ids)): + ann_ids = self._coco_api.get_ann_ids(img_ids=self.img_ids[i]) + ann_info = self._coco_api.load_anns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + if ann.get('ignore', False) or ann['iscrowd']: + continue + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w, y1 + h]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, pred_bboxes, proposal_nums, iou_thrs, logger=logger) + ar = recalls.mean(axis=1) + return ar + + def xyxy2xywh(self, bbox: np.ndarray) -> list: + """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO + evaluation. + + Args: + bbox (numpy.ndarray): The bounding boxes, shape (4, ), in + ``xyxy`` order. + + Returns: + list[float]: The converted bounding boxes, in ``xywh`` order. + """ + + _bbox: List = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0], + _bbox[3] - _bbox[1], + ] + + def results2json(self, results: Sequence[dict], + outfile_prefix: str) -> dict: + """Dump the detection results to a COCO style json file. + + There are 3 types of results: proposals, bbox predictions, mask + predictions, and they have different data types. This method will + automatically recognize the type, and dump them to json files. + + Args: + results (Sequence[dict]): Testing results of the + dataset. + outfile_prefix (str): The filename prefix of the json files. If the + prefix is "somepath/xxx", the json files will be named + "somepath/xxx.bbox.json", "somepath/xxx.segm.json", + "somepath/xxx.proposal.json". + + Returns: + dict: Possible keys are "bbox", "segm", "proposal", and + values are corresponding filenames. + """ + bbox_json_results = [] + segm_json_results = [] if 'masks' in results[0] else None + for idx, result in enumerate(results): + image_id = result.get('img_id', idx) + labels = result['labels'] + bboxes = result['bboxes'] + scores = result['scores'] + # bbox results + for i, label in enumerate(labels): + data = dict() + data['image_id'] = image_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(scores[i]) + data['category_id'] = self.cat_ids[label] + bbox_json_results.append(data) + + if segm_json_results is None: + continue + + # segm results + masks = result['masks'] + mask_scores = result.get('mask_scores', scores) + for i, label in enumerate(labels): + data = dict() + data['image_id'] = image_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(mask_scores[i]) + data['category_id'] = self.cat_ids[label] + if isinstance(masks[i]['counts'], bytes): + masks[i]['counts'] = masks[i]['counts'].decode() + data['segmentation'] = masks[i] + segm_json_results.append(data) + + result_files = dict() + result_files['bbox'] = f'{outfile_prefix}.bbox.json' + result_files['proposal'] = f'{outfile_prefix}.bbox.json' + dump(bbox_json_results, result_files['bbox']) + + if segm_json_results is not None: + result_files['segm'] = f'{outfile_prefix}.segm.json' + dump(segm_json_results, result_files['segm']) + + return result_files + + def gt_to_coco_json(self, gt_dicts: Sequence[dict], + outfile_prefix: str) -> str: + """Convert ground truth to coco format json file. + + Args: + gt_dicts (Sequence[dict]): Ground truth of the dataset. + outfile_prefix (str): The filename prefix of the json files. 
If the + prefix is "somepath/xxx", the json file will be named + "somepath/xxx.gt.json". + Returns: + str: The filename of the json file. + """ + categories = [ + dict(id=id, name=name) + for id, name in enumerate(self.dataset_meta['classes']) + ] + image_infos = [] + annotations = [] + + for idx, gt_dict in enumerate(gt_dicts): + img_id = gt_dict.get('img_id', idx) + image_info = dict( + id=img_id, + width=gt_dict['width'], + height=gt_dict['height'], + file_name='') + image_infos.append(image_info) + for ann in gt_dict['anns']: + label = ann['bbox_label'] + bbox = ann['bbox'] + coco_bbox = [ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ] + + annotation = dict( + id=len(annotations) + + 1, # coco api requires id starts with 1 + image_id=img_id, + bbox=coco_bbox, + iscrowd=ann.get('ignore_flag', 0), + category_id=int(label), + area=coco_bbox[2] * coco_bbox[3]) + if ann.get('mask', None): + mask = ann['mask'] + # area = mask_util.area(mask) + if isinstance(mask, dict) and isinstance( + mask['counts'], bytes): + mask['counts'] = mask['counts'].decode() + annotation['segmentation'] = mask + # annotation['area'] = float(area) + annotations.append(annotation) + + info = dict( + date_created=str(datetime.datetime.now()), + description='Coco json file converted by mmdet CocoMetric.') + coco_json = dict( + info=info, + images=image_infos, + categories=categories, + licenses=None, + ) + if len(annotations) > 0: + coco_json['annotations'] = annotations + converted_json_path = f'{outfile_prefix}.gt.json' + dump(coco_json, converted_json_path) + return converted_json_path + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + # encode mask to RLE + if 'masks' in pred: + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) if isinstance( + pred['masks'], torch.Tensor) else pred['masks'] + # some detectors use different scores for bbox and mask + if 'mask_scores' in pred: + result['mask_scores'] = pred['mask_scores'].cpu().numpy() + + # parse gt + gt = dict() + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['img_id'] = data_sample['img_id'] + if self._coco_api is None: + # TODO: Need to refactor to support LoadAnnotations + assert 'instances' in data_sample, \ + 'ground truth is required for evaluation when ' \ + '`ann_file` is not provided' + gt['anns'] = data_sample['instances'] + # add converted result to the results list + self.results.append((gt, result)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. 
The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + # split gt and prediction list + gts, preds = zip(*results) + + tmp_dir = None + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + + if self._coco_api is None: + # use converted gt json file to initialize coco api + logger.info('Converting ground truth to coco format...') + coco_json_path = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=outfile_prefix) + self._coco_api = COCO(coco_json_path) + + # handle lazy init + if self.cat_ids is None: + self.cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['classes']) + if self.img_ids is None: + self.img_ids = self._coco_api.get_img_ids() + + # convert predictions to coco format and dump to json file + result_files = self.results2json(preds, outfile_prefix) + + eval_results = OrderedDict() + if self.format_only: + logger.info('results are saved in ' + f'{osp.dirname(outfile_prefix)}') + return eval_results + + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + + # TODO: May refactor fast_eval_recall to an independent metric? + # fast eval recall + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + preds, self.proposal_nums, self.iou_thrs, logger=logger) + log_msg = [] + for i, num in enumerate(self.proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') + log_msg = ''.join(log_msg) + logger.info(log_msg) + continue + + # evaluate proposal, bbox and segm + iou_type = 'bbox' if metric == 'proposal' else metric + if metric not in result_files: + raise KeyError(f'{metric} is not in results') + try: + predictions = load(result_files[metric]) + if iou_type == 'segm': + # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa + # When evaluating mask AP, if the results contain bbox, + # cocoapi will use the box area instead of the mask area + # for calculating the instance area. Though the overall AP + # is not affected, this leads to different + # small/medium/large mask AP results. 
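+ # Dropping the 'bbox' field below makes pycocotools fall back to the
+ # mask area when computing instance areas for the segm evaluation.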
+ for x in predictions: + x.pop('bbox') + coco_dt = self._coco_api.loadRes(predictions) + + except IndexError: + logger.error( + 'The testing results of the whole dataset is empty.') + break + + if self.use_mp_eval: + coco_eval = COCOevalMP(self._coco_api, coco_dt, iou_type) + else: + coco_eval = COCOeval(self._coco_api, coco_dt, iou_type) + + coco_eval.params.catIds = self.cat_ids + coco_eval.params.imgIds = self.img_ids + coco_eval.params.maxDets = list(self.proposal_nums) + coco_eval.params.iouThrs = self.iou_thrs + + # mapping of cocoEval.stats + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@100': 6, + 'AR@300': 7, + 'AR@1000': 8, + 'AR_s@1000': 9, + 'AR_m@1000': 10, + 'AR_l@1000': 11 + } + metric_items = self.metric_items + if metric_items is not None: + for metric_item in metric_items: + if metric_item not in coco_metric_names: + raise KeyError( + f'metric item "{metric_item}" is not supported') + + if metric == 'proposal': + coco_eval.params.useCats = 0 + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if metric_items is None: + metric_items = [ + 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000', + 'AR_m@1000', 'AR_l@1000' + ] + + for item in metric_items: + val = float( + f'{coco_eval.stats[coco_metric_names[item]]:.3f}') + eval_results[item] = val + else: + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if self.classwise: # Compute per-category AP + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/ + precisions = coco_eval.eval['precision'] + # precision: (iou, recall, cls, area range, max dets) + assert len(self.cat_ids) == precisions.shape[2] + + results_per_category = [] + for idx, cat_id in enumerate(self.cat_ids): + t = [] + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = self._coco_api.loadCats(cat_id)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + t.append(f'{nm["name"]}') + t.append(f'{round(ap, 3)}') + eval_results[f'{nm["name"]}_precision'] = round(ap, 3) + + # indexes of IoU @50 and @75 + for iou in [0, 5]: + precision = precisions[iou, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + t.append(f'{round(ap, 3)}') + + # indexes of area of small, median and large + for area in [1, 2, 3]: + precision = precisions[:, :, idx, area, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + t.append(f'{round(ap, 3)}') + results_per_category.append(tuple(t)) + + num_columns = len(results_per_category[0]) + results_flatten = list( + itertools.chain(*results_per_category)) + headers = [ + 'category', 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', + 'mAP_m', 'mAP_l' + ] + results_2d = itertools.zip_longest(*[ + results_flatten[i::num_columns] + for i in range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('\n' + table.table) + + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = coco_eval.stats[coco_metric_names[metric_item]] + eval_results[key] = float(f'{round(val, 3)}') + + ap = coco_eval.stats[:6] + 
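+ # coco_eval.stats[:6] holds mAP, mAP_50, mAP_75, mAP_s, mAP_m and mAP_l
+ # (indices 0-5 of coco_metric_names above); they are logged below as a
+ # single copy-paste friendly line.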
logger.info(f'{metric}_mAP_copypaste: {ap[0]:.3f} ' + f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + f'{ap[4]:.3f} {ap[5]:.3f}') + + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results diff --git a/mmdetection/mmdet/evaluation/metrics/coco_occluded_metric.py b/mmdetection/mmdet/evaluation/metrics/coco_occluded_metric.py new file mode 100644 index 0000000..81235a0 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/coco_occluded_metric.py @@ -0,0 +1,204 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Union + +import mmengine +import numpy as np +from mmengine.fileio import load +from mmengine.logging import print_log +from pycocotools import mask as coco_mask +from terminaltables import AsciiTable + +from mmdet.registry import METRICS +from .coco_metric import CocoMetric + + +@METRICS.register_module() +class CocoOccludedSeparatedMetric(CocoMetric): + """Metric of separated and occluded masks which presented in paper `A Tri- + Layer Plugin to Improve Occluded Detection. + + `_. + + Separated COCO and Occluded COCO are automatically generated subsets of + COCO val dataset, collecting separated objects and partially occluded + objects for a large variety of categories. In this way, we define + occlusion into two major categories: separated and partially occluded. + + - Separation: target object segmentation mask is separated into distinct + regions by the occluder. + - Partial Occlusion: target object is partially occluded but the + segmentation mask is connected. + + These two new scalable real-image datasets are to benchmark a model's + capability to detect occluded objects of 80 common categories. + + Please cite the paper if you use this dataset: + + @article{zhan2022triocc, + title={A Tri-Layer Plugin to Improve Occluded Detection}, + author={Zhan, Guanqi and Xie, Weidi and Zisserman, Andrew}, + journal={British Machine Vision Conference}, + year={2022} + } + + Args: + occluded_ann (str): Path to the occluded coco annotation file. + separated_ann (str): Path to the separated coco annotation file. + score_thr (float): Score threshold of the detection masks. + Defaults to 0.3. + iou_thr (float): IoU threshold for the recall calculation. + Defaults to 0.75. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'bbox', 'segm', 'proposal', and 'proposal_fast'. + Defaults to 'bbox'. + """ + default_prefix: Optional[str] = 'coco' + + def __init__( + self, + *args, + occluded_ann: + str = 'https://www.robots.ox.ac.uk/~vgg/research/tpod/datasets/occluded_coco.pkl', # noqa + separated_ann: + str = 'https://www.robots.ox.ac.uk/~vgg/research/tpod/datasets/separated_coco.pkl', # noqa + score_thr: float = 0.3, + iou_thr: float = 0.75, + metric: Union[str, List[str]] = ['bbox', 'segm'], + **kwargs) -> None: + super().__init__(*args, metric=metric, **kwargs) + self.occluded_ann = load(occluded_ann) + self.separated_ann = load(separated_ann) + self.score_thr = score_thr + self.iou_thr = iou_thr + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + coco_metric_res = super().compute_metrics(results) + eval_res = self.evaluate_occluded_separated(results) + coco_metric_res.update(eval_res) + return coco_metric_res + + def evaluate_occluded_separated(self, results: List[tuple]) -> dict: + """Compute the recall of occluded and separated masks. + + Args: + results (list[tuple]): Testing results of the dataset. + + Returns: + dict[str, float]: The recall of occluded and separated masks. + """ + dict_det = {} + print_log('processing detection results...') + prog_bar = mmengine.ProgressBar(len(results)) + for i in range(len(results)): + gt, dt = results[i] + img_id = dt['img_id'] + cur_img_name = self._coco_api.imgs[img_id]['file_name'] + if cur_img_name not in dict_det.keys(): + dict_det[cur_img_name] = [] + + for bbox, score, label, mask in zip(dt['bboxes'], dt['scores'], + dt['labels'], dt['masks']): + cur_binary_mask = coco_mask.decode(mask) + dict_det[cur_img_name].append([ + score, self.dataset_meta['classes'][label], + cur_binary_mask, bbox + ]) + dict_det[cur_img_name].sort( + key=lambda x: (-x[0], x[3][0], x[3][1]) + ) # rank by confidence from high to low, avoid same confidence + prog_bar.update() + print_log('\ncomputing occluded mask recall...', logger='current') + occluded_correct_num, occluded_recall = self.compute_recall( + dict_det, gt_ann=self.occluded_ann, is_occ=True) + print_log( + f'\nCOCO occluded mask recall: {occluded_recall:.2f}%', + logger='current') + print_log( + f'COCO occluded mask success num: {occluded_correct_num}', + logger='current') + print_log('computing separated mask recall...', logger='current') + separated_correct_num, separated_recall = self.compute_recall( + dict_det, gt_ann=self.separated_ann, is_occ=False) + print_log( + f'\nCOCO separated mask recall: {separated_recall:.2f}%', + logger='current') + print_log( + f'COCO separated mask success num: {separated_correct_num}', + logger='current') + table_data = [ + ['mask type', 'recall', 'num correct'], + ['occluded', f'{occluded_recall:.2f}%', occluded_correct_num], + ['separated', f'{separated_recall:.2f}%', separated_correct_num] + ] + table = AsciiTable(table_data) + print_log('\n' + table.table, logger='current') + return dict( + occluded_recall=occluded_recall, separated_recall=separated_recall) + + def compute_recall(self, + result_dict: dict, + gt_ann: list, + is_occ: bool = True) -> tuple: + """Compute the recall of occluded or separated masks. + + Args: + result_dict (dict): Processed mask results. + gt_ann (list): Occluded or separated coco annotations. + is_occ (bool): Whether the annotation is occluded mask. + Defaults to True. + Returns: + tuple: number of correct masks and the recall. 
+ """ + correct = 0 + prog_bar = mmengine.ProgressBar(len(gt_ann)) + for iter_i in range(len(gt_ann)): + cur_item = gt_ann[iter_i] + cur_img_name = cur_item[0] + cur_gt_bbox = cur_item[3] + if is_occ: + cur_gt_bbox = [ + cur_gt_bbox[0], cur_gt_bbox[1], + cur_gt_bbox[0] + cur_gt_bbox[2], + cur_gt_bbox[1] + cur_gt_bbox[3] + ] + cur_gt_class = cur_item[1] + cur_gt_mask = coco_mask.decode(cur_item[4]) + + assert cur_img_name in result_dict.keys() + cur_detections = result_dict[cur_img_name] + + correct_flag = False + for i in range(len(cur_detections)): + cur_det_confidence = cur_detections[i][0] + if cur_det_confidence < self.score_thr: + break + cur_det_class = cur_detections[i][1] + if cur_det_class != cur_gt_class: + continue + cur_det_mask = cur_detections[i][2] + cur_iou = self.mask_iou(cur_det_mask, cur_gt_mask) + if cur_iou >= self.iou_thr: + correct_flag = True + break + if correct_flag: + correct += 1 + prog_bar.update() + recall = correct / len(gt_ann) * 100 + return correct, recall + + def mask_iou(self, mask1: np.ndarray, mask2: np.ndarray) -> np.ndarray: + """Compute IoU between two masks.""" + mask1_area = np.count_nonzero(mask1 == 1) + mask2_area = np.count_nonzero(mask2 == 1) + intersection = np.count_nonzero(np.logical_and(mask1 == 1, mask2 == 1)) + iou = intersection / (mask1_area + mask2_area - intersection) + return iou diff --git a/mmdetection/mmdet/evaluation/metrics/coco_panoptic_metric.py b/mmdetection/mmdet/evaluation/metrics/coco_panoptic_metric.py new file mode 100644 index 0000000..1554c09 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/coco_panoptic_metric.py @@ -0,0 +1,618 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import itertools +import os.path as osp +import tempfile +from typing import Dict, Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.fileio import dump, get_local_path, load +from mmengine.logging import MMLogger, print_log +from terminaltables import AsciiTable + +from mmdet.datasets.api_wrappers import COCOPanoptic +from mmdet.registry import METRICS +from ..functional import (INSTANCE_OFFSET, pq_compute_multi_core, + pq_compute_single_core) + +try: + import panopticapi + from panopticapi.evaluation import VOID, PQStat + from panopticapi.utils import id2rgb, rgb2id +except ImportError: + panopticapi = None + id2rgb = None + rgb2id = None + VOID = None + PQStat = None + + +@METRICS.register_module() +class CocoPanopticMetric(BaseMetric): + """COCO panoptic segmentation evaluation metric. + + Evaluate PQ, SQ RQ for panoptic segmentation tasks. Please refer to + https://cocodataset.org/#panoptic-eval for more details. + + Args: + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None. + seg_prefix (str, optional): Path to the directory which contains the + coco panoptic segmentation mask. It should be specified when + evaluate. Defaults to None. + classwise (bool): Whether to evaluate the metric class-wise. + Defaults to False. + outfile_prefix (str, optional): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. + It should be specified when format_only is True. Defaults to None. + format_only (bool): Format the output results without perform + evaluation. 
It is useful when you want to format the result + to a specific format and submit it to the test server. + Defaults to False. + nproc (int): Number of processes for panoptic quality computing. + Defaults to 32. When ``nproc`` exceeds the number of cpu cores, + the number of cpu cores is used. + file_client_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + default_prefix: Optional[str] = 'coco_panoptic' + + def __init__(self, + ann_file: Optional[str] = None, + seg_prefix: Optional[str] = None, + classwise: bool = False, + format_only: bool = False, + outfile_prefix: Optional[str] = None, + nproc: int = 32, + file_client_args: dict = None, + backend_args: dict = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + if panopticapi is None: + raise RuntimeError( + 'panopticapi is not installed, please install it by: ' + 'pip install git+https://github.com/cocodataset/' + 'panopticapi.git.') + + super().__init__(collect_device=collect_device, prefix=prefix) + self.classwise = classwise + self.format_only = format_only + if self.format_only: + assert outfile_prefix is not None, 'outfile_prefix must be not' + 'None when format_only is True, otherwise the result files will' + 'be saved to a temp directory which will be cleaned up at the end.' + + self.tmp_dir = None + # outfile_prefix should be a prefix of a path which points to a shared + # storage when train or test with multi nodes. + self.outfile_prefix = outfile_prefix + if outfile_prefix is None: + self.tmp_dir = tempfile.TemporaryDirectory() + self.outfile_prefix = osp.join(self.tmp_dir.name, 'results') + # the directory to save predicted panoptic segmentation mask + self.seg_out_dir = f'{self.outfile_prefix}.panoptic' + self.nproc = nproc + self.seg_prefix = seg_prefix + + self.cat_ids = None + self.cat2label = None + + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + + if ann_file: + with get_local_path( + ann_file, backend_args=self.backend_args) as local_path: + self._coco_api = COCOPanoptic(local_path) + self.categories = self._coco_api.cats + else: + self._coco_api = None + self.categories = None + + def __del__(self) -> None: + """Clean up.""" + if self.tmp_dir is not None: + self.tmp_dir.cleanup() + + def gt_to_coco_json(self, gt_dicts: Sequence[dict], + outfile_prefix: str) -> Tuple[str, str]: + """Convert ground truth to coco panoptic segmentation format json file. + + Args: + gt_dicts (Sequence[dict]): Ground truth of the dataset. + outfile_prefix (str): The filename prefix of the json file. If the + prefix is "somepath/xxx", the json file will be named + "somepath/xxx.gt.json". 
+ + Returns: + Tuple[str, str]: The filename of the json file and the name of the\ + directory which contains panoptic segmentation masks. + """ + assert len(gt_dicts) > 0, 'gt_dicts is empty.' + gt_folder = osp.dirname(gt_dicts[0]['seg_map_path']) + converted_json_path = f'{outfile_prefix}.gt.json' + + categories = [] + for id, name in enumerate(self.dataset_meta['classes']): + isthing = 1 if name in self.dataset_meta['thing_classes'] else 0 + categories.append({'id': id, 'name': name, 'isthing': isthing}) + + image_infos = [] + annotations = [] + for gt_dict in gt_dicts: + img_id = gt_dict['image_id'] + image_info = { + 'id': img_id, + 'width': gt_dict['width'], + 'height': gt_dict['height'], + 'file_name': osp.split(gt_dict['seg_map_path'])[-1] + } + image_infos.append(image_info) + + pan_png = mmcv.imread(gt_dict['seg_map_path']).squeeze() + pan_png = pan_png[:, :, ::-1] + pan_png = rgb2id(pan_png) + segments_info = [] + for segment_info in gt_dict['segments_info']: + id = segment_info['id'] + label = segment_info['category'] + mask = pan_png == id + isthing = categories[label]['isthing'] + if isthing: + iscrowd = 1 if not segment_info['is_thing'] else 0 + else: + iscrowd = 0 + + new_segment_info = { + 'id': id, + 'category_id': label, + 'isthing': isthing, + 'iscrowd': iscrowd, + 'area': mask.sum() + } + segments_info.append(new_segment_info) + + segm_file = image_info['file_name'].replace('jpg', 'png') + annotation = dict( + image_id=img_id, + segments_info=segments_info, + file_name=segm_file) + annotations.append(annotation) + pan_png = id2rgb(pan_png) + + info = dict( + date_created=str(datetime.datetime.now()), + description='Coco json file converted by mmdet CocoPanopticMetric.' + ) + coco_json = dict( + info=info, + images=image_infos, + categories=categories, + licenses=None, + ) + if len(annotations) > 0: + coco_json['annotations'] = annotations + dump(coco_json, converted_json_path) + return converted_json_path, gt_folder + + def result2json(self, results: Sequence[dict], + outfile_prefix: str) -> Tuple[str, str]: + """Dump the panoptic results to a COCO style json file and a directory. + + Args: + results (Sequence[dict]): Testing results of the dataset. + outfile_prefix (str): The filename prefix of the json files and the + directory. + + Returns: + Tuple[str, str]: The json file and the directory which contains \ + panoptic segmentation masks. The filename of the json is + "somepath/xxx.panoptic.json" and name of the directory is + "somepath/xxx.panoptic". + """ + label2cat = dict((v, k) for (k, v) in self.cat2label.items()) + pred_annotations = [] + for idx in range(len(results)): + result = results[idx] + for segment_info in result['segments_info']: + sem_label = segment_info['category_id'] + # convert sem_label to json label + cat_id = label2cat[sem_label] + segment_info['category_id'] = label2cat[sem_label] + is_thing = self.categories[cat_id]['isthing'] + segment_info['isthing'] = is_thing + pred_annotations.append(result) + pan_json_results = dict(annotations=pred_annotations) + json_filename = f'{outfile_prefix}.panoptic.json' + dump(pan_json_results, json_filename) + return json_filename, ( + self.seg_out_dir + if self.tmp_dir is None else tempfile.gettempdir()) + + def _parse_predictions(self, + pred: dict, + img_id: int, + segm_file: str, + label2cat=None) -> dict: + """Parse panoptic segmentation predictions. + + Args: + pred (dict): Panoptic segmentation predictions. + img_id (int): Image id. + segm_file (str): Segmentation file name. 
+ label2cat (dict): Mapping from label to category id. + Defaults to None. + + Returns: + dict: Parsed predictions. + """ + result = dict() + result['img_id'] = img_id + # shape (1, H, W) -> (H, W) + pan = pred['pred_panoptic_seg']['sem_seg'].cpu().numpy()[0] + ignore_index = pred['pred_panoptic_seg'].get( + 'ignore_index', len(self.dataset_meta['classes'])) + pan_labels = np.unique(pan) + segments_info = [] + for pan_label in pan_labels: + sem_label = pan_label % INSTANCE_OFFSET + # We reserve the length of dataset_meta['classes'] + # and ignore_index for VOID label + if sem_label == len( + self.dataset_meta['classes']) or sem_label == ignore_index: + continue + mask = pan == pan_label + area = mask.sum() + segments_info.append({ + 'id': + int(pan_label), + # when ann_file provided, sem_label should be cat_id, otherwise + # sem_label should be a continuous id, not the cat_id + # defined in dataset + 'category_id': + label2cat[sem_label] if label2cat else sem_label, + 'area': + int(area) + }) + # evaluation script uses 0 for VOID label. + pan[pan % INSTANCE_OFFSET == len(self.dataset_meta['classes'])] = VOID + pan[pan % INSTANCE_OFFSET == ignore_index] = VOID + + pan = id2rgb(pan).astype(np.uint8) + mmcv.imwrite(pan[:, :, ::-1], osp.join(self.seg_out_dir, segm_file)) + result = { + 'image_id': img_id, + 'segments_info': segments_info, + 'file_name': segm_file + } + + return result + + def _compute_batch_pq_stats(self, data_samples: Sequence[dict]): + """Process gts and predictions when ``outfile_prefix`` is not set, gts + are from dataset or a json file which is defined by ``ann_file``. + + Intermediate results, ``pq_stats``, are computed here and put into + ``self.results``. + """ + if self._coco_api is None: + categories = dict() + for id, name in enumerate(self.dataset_meta['classes']): + isthing = 1 if name in self.dataset_meta['thing_classes']\ + else 0 + categories[id] = {'id': id, 'name': name, 'isthing': isthing} + label2cat = None + else: + categories = self.categories + cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['classes']) + label2cat = {i: cat_id for i, cat_id in enumerate(cat_ids)} + + for data_sample in data_samples: + # parse pred + img_id = data_sample['img_id'] + segm_file = osp.basename(data_sample['img_path']).replace( + 'jpg', 'png') + result = self._parse_predictions( + pred=data_sample, + img_id=img_id, + segm_file=segm_file, + label2cat=label2cat) + + # parse gt + gt = dict() + gt['image_id'] = img_id + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['file_name'] = segm_file + + if self._coco_api is None: + # get segments_info from data_sample + seg_map_path = osp.join(self.seg_prefix, segm_file) + pan_png = mmcv.imread(seg_map_path).squeeze() + pan_png = pan_png[:, :, ::-1] + pan_png = rgb2id(pan_png) + segments_info = [] + + for segment_info in data_sample['segments_info']: + id = segment_info['id'] + label = segment_info['category'] + mask = pan_png == id + isthing = categories[label]['isthing'] + if isthing: + iscrowd = 1 if not segment_info['is_thing'] else 0 + else: + iscrowd = 0 + + new_segment_info = { + 'id': id, + 'category_id': label, + 'isthing': isthing, + 'iscrowd': iscrowd, + 'area': mask.sum() + } + segments_info.append(new_segment_info) + else: + # get segments_info from annotation file + segments_info = self._coco_api.imgToAnns[img_id] + + gt['segments_info'] = segments_info + + pq_stats = pq_compute_single_core( + proc_id=0, + annotation_set=[(gt, result)], + 
gt_folder=self.seg_prefix, + pred_folder=self.seg_out_dir, + categories=categories, + backend_args=self.backend_args) + + self.results.append(pq_stats) + + def _process_gt_and_predictions(self, data_samples: Sequence[dict]): + """Process gts and predictions when ``outfile_prefix`` is set. + + The predictions will be saved to directory specified by + ``outfile_predfix``. The matched pair (gt, result) will be put into + ``self.results``. + """ + for data_sample in data_samples: + # parse pred + img_id = data_sample['img_id'] + segm_file = osp.basename(data_sample['img_path']).replace( + 'jpg', 'png') + result = self._parse_predictions( + pred=data_sample, img_id=img_id, segm_file=segm_file) + + # parse gt + gt = dict() + gt['image_id'] = img_id + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + + if self._coco_api is None: + # get segments_info from dataset + gt['segments_info'] = data_sample['segments_info'] + gt['seg_map_path'] = data_sample['seg_map_path'] + + self.results.append((gt, result)) + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + # If ``self.tmp_dir`` is none, it will save gt and predictions to + # self.results, otherwise, it will compute pq_stats here. + if self.tmp_dir is None: + self._process_gt_and_predictions(data_samples) + else: + self._compute_batch_pq_stats(data_samples) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. There + are two cases: + + - When ``outfile_prefix`` is not provided, the elements in + results are pq_stats which can be summed directly to get PQ. + - When ``outfile_prefix`` is provided, the elements in + results are tuples like (gt, pred). + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + if self.tmp_dir is None: + # do evaluation after collect all the results + + # split gt and prediction list + gts, preds = zip(*results) + + if self._coco_api is None: + # use converted gt json file to initialize coco api + logger.info('Converting ground truth to coco format...') + coco_json_path, gt_folder = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=self.outfile_prefix) + self._coco_api = COCOPanoptic(coco_json_path) + else: + gt_folder = self.seg_prefix + + self.cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['classes']) + self.cat2label = { + cat_id: i + for i, cat_id in enumerate(self.cat_ids) + } + self.img_ids = self._coco_api.get_img_ids() + self.categories = self._coco_api.cats + + # convert predictions to coco format and dump to json file + json_filename, pred_folder = self.result2json( + results=preds, outfile_prefix=self.outfile_prefix) + + if self.format_only: + logger.info('results are saved in ' + f'{osp.dirname(self.outfile_prefix)}') + return dict() + + imgs = self._coco_api.imgs + gt_json = self._coco_api.img_ann_map + gt_json = [{ + 'image_id': k, + 'segments_info': v, + 'file_name': imgs[k]['segm_file'] + } for k, v in gt_json.items()] + pred_json = load(json_filename) + pred_json = dict( + (el['image_id'], el) for el in pred_json['annotations']) + + # match the gt_anns and pred_anns in the same image + matched_annotations_list = [] + for gt_ann in gt_json: + img_id = gt_ann['image_id'] + if img_id not in pred_json.keys(): + raise Exception('no prediction for the image' + ' with id: {}'.format(img_id)) + matched_annotations_list.append((gt_ann, pred_json[img_id])) + + pq_stat = pq_compute_multi_core( + matched_annotations_list, + gt_folder, + pred_folder, + self.categories, + backend_args=self.backend_args, + nproc=self.nproc) + + else: + # aggregate the results generated in process + if self._coco_api is None: + categories = dict() + for id, name in enumerate(self.dataset_meta['classes']): + isthing = 1 if name in self.dataset_meta[ + 'thing_classes'] else 0 + categories[id] = { + 'id': id, + 'name': name, + 'isthing': isthing + } + self.categories = categories + + pq_stat = PQStat() + for result in results: + pq_stat += result + + metrics = [('All', None), ('Things', True), ('Stuff', False)] + pq_results = {} + + for name, isthing in metrics: + pq_results[name], classwise_results = pq_stat.pq_average( + self.categories, isthing=isthing) + if name == 'All': + pq_results['classwise'] = classwise_results + + classwise_results = None + if self.classwise: + classwise_results = { + k: v + for k, v in zip(self.dataset_meta['classes'], + pq_results['classwise'].values()) + } + + print_panoptic_table(pq_results, classwise_results, logger=logger) + results = parse_pq_results(pq_results) + + return results + + +def parse_pq_results(pq_results: dict) -> dict: + """Parse the Panoptic Quality results. + + Args: + pq_results (dict): Panoptic Quality results. + + Returns: + dict: Panoptic Quality results parsed. 
+ """ + result = dict() + result['PQ'] = 100 * pq_results['All']['pq'] + result['SQ'] = 100 * pq_results['All']['sq'] + result['RQ'] = 100 * pq_results['All']['rq'] + result['PQ_th'] = 100 * pq_results['Things']['pq'] + result['SQ_th'] = 100 * pq_results['Things']['sq'] + result['RQ_th'] = 100 * pq_results['Things']['rq'] + result['PQ_st'] = 100 * pq_results['Stuff']['pq'] + result['SQ_st'] = 100 * pq_results['Stuff']['sq'] + result['RQ_st'] = 100 * pq_results['Stuff']['rq'] + return result + + +def print_panoptic_table( + pq_results: dict, + classwise_results: Optional[dict] = None, + logger: Optional[Union['MMLogger', str]] = None) -> None: + """Print the panoptic evaluation results table. + + Args: + pq_results(dict): The Panoptic Quality results. + classwise_results(dict, optional): The classwise Panoptic Quality. + results. The keys are class names and the values are metrics. + Defaults to None. + logger (:obj:`MMLogger` | str, optional): Logger used for printing + related information during evaluation. Default: None. + """ + + headers = ['', 'PQ', 'SQ', 'RQ', 'categories'] + data = [headers] + for name in ['All', 'Things', 'Stuff']: + numbers = [ + f'{(pq_results[name][k] * 100):0.3f}' for k in ['pq', 'sq', 'rq'] + ] + row = [name] + numbers + [pq_results[name]['n']] + data.append(row) + table = AsciiTable(data) + print_log('Panoptic Evaluation Results:\n' + table.table, logger=logger) + + if classwise_results is not None: + class_metrics = [(name, ) + tuple(f'{(metrics[k] * 100):0.3f}' + for k in ['pq', 'sq', 'rq']) + for name, metrics in classwise_results.items()] + num_columns = min(8, len(class_metrics) * 4) + results_flatten = list(itertools.chain(*class_metrics)) + headers = ['category', 'PQ', 'SQ', 'RQ'] * (num_columns // 4) + results_2d = itertools.zip_longest( + *[results_flatten[i::num_columns] for i in range(num_columns)]) + data = [headers] + data += [result for result in results_2d] + table = AsciiTable(data) + print_log( + 'Classwise Panoptic Evaluation Results:\n' + table.table, + logger=logger) diff --git a/mmdetection/mmdet/evaluation/metrics/coco_video_metric.py b/mmdetection/mmdet/evaluation/metrics/coco_video_metric.py new file mode 100644 index 0000000..b5c75d0 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/coco_video_metric.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Sequence + +from mmengine.dist import broadcast_object_list, is_main_process + +from mmdet.registry import METRICS +from .base_video_metric import collect_tracking_results +from .coco_metric import CocoMetric + + +@METRICS.register_module() +class CocoVideoMetric(CocoMetric): + """COCO evaluation metric. + + Evaluate AR, AP, and mAP for detection tasks including proposal/box + detection and instance segmentation. Please refer to + https://cocodataset.org/#detection-eval for more details. + """ + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. 
+ """ + for track_data_sample in data_samples: + video_data_samples = track_data_sample['video_data_samples'] + ori_video_len = video_data_samples[0].ori_video_length + video_len = len(video_data_samples) + if ori_video_len == video_len: + # video process + for frame_id in range(video_len): + img_data_sample = video_data_samples[frame_id].to_dict() + super().process(None, [img_data_sample]) + else: + # image process + img_data_sample = video_data_samples[0].to_dict() + super().process(None, [img_data_sample]) + + def evaluate(self, size: int = 1) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + warnings.warn( + f'{self.__class__.__name__} got empty `self.results`. Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.') + + results = collect_tracking_results(self.results, self.collect_device) + + if is_main_process(): + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] diff --git a/mmdetection/mmdet/evaluation/metrics/crowdhuman_metric.py b/mmdetection/mmdet/evaluation/metrics/crowdhuman_metric.py new file mode 100644 index 0000000..50ac210 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/crowdhuman_metric.py @@ -0,0 +1,824 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import json +import os.path as osp +import tempfile +from collections import OrderedDict +from multiprocessing import Process, Queue +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.fileio import dump, get_text, load +from mmengine.logging import MMLogger +from scipy.sparse import csr_matrix +from scipy.sparse.csgraph import maximum_bipartite_matching + +from mmdet.evaluation.functional.bbox_overlaps import bbox_overlaps +from mmdet.registry import METRICS + +PERSON_CLASSES = ['background', 'person'] + + +@METRICS.register_module() +class CrowdHumanMetric(BaseMetric): + """CrowdHuman evaluation metric. + + Evaluate Average Precision (AP), Miss Rate (MR) and Jaccard Index (JI) + for detection tasks. + + Args: + ann_file (str): Path to the annotation file. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'AP', 'MR' and 'JI'. Defaults to 'AP'. + format_only (bool): Format the output results without perform + evaluation. It is useful when you want to format the result + to a specific format and submit it to the test server. + Defaults to False. + outfile_prefix (str, optional): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Defaults to None. + file_client_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. 
+ collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + eval_mode (int): Select the mode of evaluate. Valid mode include + 0(just body box), 1(just head box) and 2(both of them). + Defaults to 0. + iou_thres (float): IoU threshold. Defaults to 0.5. + compare_matching_method (str, optional): Matching method to compare + the detection results with the ground_truth when compute 'AP' + and 'MR'.Valid method include VOC and None(CALTECH). Default to + None. + mr_ref (str): Different parameter selection to calculate MR. Valid + ref include CALTECH_-2 and CALTECH_-4. Defaults to CALTECH_-2. + num_ji_process (int): The number of processes to evaluation JI. + Defaults to 10. + """ + default_prefix: Optional[str] = 'crowd_human' + + def __init__(self, + ann_file: str, + metric: Union[str, List[str]] = ['AP', 'MR', 'JI'], + format_only: bool = False, + outfile_prefix: Optional[str] = None, + file_client_args: dict = None, + backend_args: dict = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + eval_mode: int = 0, + iou_thres: float = 0.5, + compare_matching_method: Optional[str] = None, + mr_ref: str = 'CALTECH_-2', + num_ji_process: int = 10) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + + self.ann_file = ann_file + # crowdhuman evaluation metrics + self.metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['MR', 'AP', 'JI'] + for metric in self.metrics: + if metric not in allowed_metrics: + raise KeyError(f"metric should be one of 'MR', 'AP', 'JI'," + f'but got {metric}.') + + self.format_only = format_only + if self.format_only: + assert outfile_prefix is not None, 'outfile_prefix must be not' + 'None when format_only is True, otherwise the result files will' + 'be saved to a temp directory which will be cleaned up at the end.' + self.outfile_prefix = outfile_prefix + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + + assert eval_mode in [0, 1, 2], \ + "Unknown eval mode. mr_ref should be one of '0', '1', '2'." + assert compare_matching_method is None or \ + compare_matching_method == 'VOC', \ + 'The alternative compare_matching_method is VOC.' \ + 'This parameter defaults to CALTECH(None)' + assert mr_ref == 'CALTECH_-2' or mr_ref == 'CALTECH_-4', \ + "mr_ref should be one of 'CALTECH_-2', 'CALTECH_-4'." 
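+ # eval_mode: 0 = body boxes only, 1 = head boxes only, 2 = both;
+ # iou_thres is the IoU threshold used when matching detections to
+ # ground truth for AP/MR/JI.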
+ self.eval_mode = eval_mode + self.iou_thres = iou_thres + self.compare_matching_method = compare_matching_method + self.mr_ref = mr_ref + self.num_ji_process = num_ji_process + + @staticmethod + def results2json(results: Sequence[dict], outfile_prefix: str) -> str: + """Dump the detection results to a json file.""" + result_file_path = f'{outfile_prefix}.json' + bbox_json_results = [] + for i, result in enumerate(results): + ann, pred = result + dump_dict = dict() + dump_dict['ID'] = ann['ID'] + dump_dict['width'] = ann['width'] + dump_dict['height'] = ann['height'] + dtboxes = [] + bboxes = pred.tolist() + for _, single_bbox in enumerate(bboxes): + temp_dict = dict() + x1, y1, x2, y2, score = single_bbox + temp_dict['box'] = [x1, y1, x2 - x1, y2 - y1] + temp_dict['score'] = score + temp_dict['tag'] = 1 + dtboxes.append(temp_dict) + dump_dict['dtboxes'] = dtboxes + bbox_json_results.append(dump_dict) + dump(bbox_json_results, result_file_path) + return result_file_path + + def process(self, data_batch: Sequence[dict], + data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + ann = dict() + ann['ID'] = data_sample['img_id'] + ann['width'] = data_sample['ori_shape'][1] + ann['height'] = data_sample['ori_shape'][0] + pred_bboxes = data_sample['pred_instances']['bboxes'].cpu().numpy() + pred_scores = data_sample['pred_instances']['scores'].cpu().numpy() + + pred_bbox_scores = np.hstack( + [pred_bboxes, pred_scores.reshape((-1, 1))]) + + self.results.append((ann, pred_bbox_scores)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + eval_results(Dict[str, float]): The computed metrics. + The keys are the names of the metrics, and the values + are corresponding results. 
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + tmp_dir = None + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'result') + else: + outfile_prefix = self.outfile_prefix + + # convert predictions to coco format and dump to json file + result_file = self.results2json(results, outfile_prefix) + eval_results = OrderedDict() + if self.format_only: + logger.info(f'results are saved in {osp.dirname(outfile_prefix)}') + return eval_results + + # load evaluation samples + eval_samples = self.load_eval_samples(result_file) + + if 'AP' in self.metrics or 'MR' in self.metrics: + score_list = self.compare(eval_samples) + gt_num = sum([eval_samples[i].gt_num for i in eval_samples]) + ign_num = sum([eval_samples[i].ign_num for i in eval_samples]) + gt_num = gt_num - ign_num + img_num = len(eval_samples) + + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + if metric == 'AP': + AP = self.eval_ap(score_list, gt_num, img_num) + eval_results['mAP'] = float(f'{round(AP, 4)}') + if metric == 'MR': + MR = self.eval_mr(score_list, gt_num, img_num) + eval_results['mMR'] = float(f'{round(MR, 4)}') + if metric == 'JI': + JI = self.eval_ji(eval_samples) + eval_results['JI'] = float(f'{round(JI, 4)}') + if tmp_dir is not None: + tmp_dir.cleanup() + + return eval_results + + def load_eval_samples(self, result_file): + """Load data from annotations file and detection results. + + Args: + result_file (str): The file path of the saved detection results. + + Returns: + Dict[Image]: The detection result packaged by Image + """ + gt_str = get_text( + self.ann_file, backend_args=self.backend_args).strip().split('\n') + gt_records = [json.loads(line) for line in gt_str] + + pred_records = load(result_file, backend_args=self.backend_args) + eval_samples = dict() + for gt_record, pred_record in zip(gt_records, pred_records): + assert gt_record['ID'] == pred_record['ID'], \ + 'please set val_dataloader.sampler.shuffle=False and try again' + eval_samples[pred_record['ID']] = Image(self.eval_mode) + eval_samples[pred_record['ID']].load(gt_record, 'box', None, + PERSON_CLASSES, True) + eval_samples[pred_record['ID']].load(pred_record, 'box', None, + PERSON_CLASSES, False) + eval_samples[pred_record['ID']].clip_all_boader() + return eval_samples + + def compare(self, samples): + """Match the detection results with the ground_truth. + + Args: + samples (dict[Image]): The detection result packaged by Image. + + Returns: + score_list(list[tuple[ndarray, int, str]]): Matching result. + a list of tuples (dtbox, label, imgID) in the descending + sort of dtbox.score. + """ + score_list = list() + for id in samples: + if self.compare_matching_method == 'VOC': + result = samples[id].compare_voc(self.iou_thres) + else: + result = samples[id].compare_caltech(self.iou_thres) + score_list.extend(result) + # In the descending sort of dtbox score. + score_list.sort(key=lambda x: x[0][-1], reverse=True) + return score_list + + @staticmethod + def eval_ap(score_list, gt_num, img_num): + """Evaluate by average precision. + + Args: + score_list(list[tuple[ndarray, int, str]]): Matching result. + a list of tuples (dtbox, label, imgID) in the descending + sort of dtbox.score. + gt_num(int): The number of gt boxes in the entire dataset. + img_num(int): The number of images in the entire dataset. + + Returns: + ap(float): result of average precision. 
+ """ + + # calculate general ap score + def _calculate_map(_recall, _precision): + assert len(_recall) == len(_precision) + area = 0 + for k in range(1, len(_recall)): + delta_h = (_precision[k - 1] + _precision[k]) / 2 + delta_w = _recall[k] - _recall[k - 1] + area += delta_w * delta_h + return area + + tp, fp = 0.0, 0.0 + rpX, rpY = list(), list() + + fpn = [] + recalln = [] + thr = [] + fppi = [] + for i, item in enumerate(score_list): + if item[1] == 1: + tp += 1.0 + elif item[1] == 0: + fp += 1.0 + fn = gt_num - tp + recall = tp / (tp + fn) + precision = tp / (tp + fp) + rpX.append(recall) + rpY.append(precision) + fpn.append(fp) + recalln.append(tp) + thr.append(item[0][-1]) + fppi.append(fp / img_num) + + ap = _calculate_map(rpX, rpY) + return ap + + def eval_mr(self, score_list, gt_num, img_num): + """Evaluate by Caltech-style log-average miss rate. + + Args: + score_list(list[tuple[ndarray, int, str]]): Matching result. + a list of tuples (dtbox, label, imgID) in the descending + sort of dtbox.score. + gt_num(int): The number of gt boxes in the entire dataset. + img_num(int): The number of image in the entire dataset. + + Returns: + mr(float): result of miss rate. + """ + + # find greater_than + def _find_gt(lst, target): + for idx, _item in enumerate(lst): + if _item >= target: + return idx + return len(lst) - 1 + + if self.mr_ref == 'CALTECH_-2': + # CALTECH_MRREF_2: anchor points (from 10^-2 to 1) as in + # P.Dollar's paper + ref = [ + 0.0100, 0.0178, 0.03160, 0.0562, 0.1000, 0.1778, 0.3162, + 0.5623, 1.000 + ] + else: + # CALTECH_MRREF_4: anchor points (from 10^-4 to 1) as in + # S.Zhang's paper + ref = [ + 0.0001, 0.0003, 0.00100, 0.0032, 0.0100, 0.0316, 0.1000, + 0.3162, 1.000 + ] + + tp, fp = 0.0, 0.0 + fppiX, fppiY = list(), list() + for i, item in enumerate(score_list): + if item[1] == 1: + tp += 1.0 + elif item[1] == 0: + fp += 1.0 + + fn = gt_num - tp + recall = tp / (tp + fn) + missrate = 1.0 - recall + fppi = fp / img_num + fppiX.append(fppi) + fppiY.append(missrate) + + score = list() + for pos in ref: + argmin = _find_gt(fppiX, pos) + if argmin >= 0: + score.append(fppiY[argmin]) + score = np.array(score) + mr = np.exp(np.log(score).mean()) + return mr + + def eval_ji(self, samples): + """Evaluate by JI using multi_process. + + Args: + samples(Dict[str, Image]): The detection result packaged by Image. + + Returns: + ji(float): result of jaccard index. + """ + import math + res_line = [] + res_ji = [] + for i in range(10): + score_thr = 1e-1 * i + total = len(samples) + stride = math.ceil(total / self.num_ji_process) + result_queue = Queue(10000) + results, procs = [], [] + records = list(samples.items()) + for i in range(self.num_ji_process): + start = i * stride + end = np.min([start + stride, total]) + sample_data = dict(records[start:end]) + p = Process( + target=self.compute_ji_with_ignore, + args=(result_queue, sample_data, score_thr)) + p.start() + procs.append(p) + for i in range(total): + t = result_queue.get() + results.append(t) + for p in procs: + p.join() + line, mean_ratio = self.gather(results) + line = 'score_thr:{:.1f}, {}'.format(score_thr, line) + res_line.append(line) + res_ji.append(mean_ratio) + return max(res_ji) + + def compute_ji_with_ignore(self, result_queue, dt_result, score_thr): + """Compute JI with ignore. + + Args: + result_queue(Queue): The Queue for save compute result when + multi_process. + dt_result(dict[Image]): Detection result packaged by Image. + score_thr(float): The threshold of detection score. 
+ Returns: + dict: compute result. + """ + for ID, record in dt_result.items(): + gt_boxes = record.gt_boxes + dt_boxes = record.dt_boxes + keep = dt_boxes[:, -1] > score_thr + dt_boxes = dt_boxes[keep][:, :-1] + + gt_tag = np.array(gt_boxes[:, -1] != -1) + matches = self.compute_ji_matching(dt_boxes, gt_boxes[gt_tag, :4]) + # get the unmatched_indices + matched_indices = np.array([j for (j, _) in matches]) + unmatched_indices = list( + set(np.arange(dt_boxes.shape[0])) - set(matched_indices)) + num_ignore_dt = self.get_ignores(dt_boxes[unmatched_indices], + gt_boxes[~gt_tag, :4]) + matched_indices = np.array([j for (_, j) in matches]) + unmatched_indices = list( + set(np.arange(gt_boxes[gt_tag].shape[0])) - + set(matched_indices)) + num_ignore_gt = self.get_ignores( + gt_boxes[gt_tag][unmatched_indices], gt_boxes[~gt_tag, :4]) + # compute results + eps = 1e-6 + k = len(matches) + m = gt_tag.sum() - num_ignore_gt + n = dt_boxes.shape[0] - num_ignore_dt + ratio = k / (m + n - k + eps) + recall = k / (m + eps) + cover = k / (n + eps) + noise = 1 - cover + result_dict = dict( + ratio=ratio, + recall=recall, + cover=cover, + noise=noise, + k=k, + m=m, + n=n) + result_queue.put_nowait(result_dict) + + @staticmethod + def gather(results): + """Integrate test results.""" + assert len(results) + img_num = 0 + for result in results: + if result['n'] != 0 or result['m'] != 0: + img_num += 1 + mean_ratio = np.sum([rb['ratio'] for rb in results]) / img_num + valids = np.sum([rb['k'] for rb in results]) + total = np.sum([rb['n'] for rb in results]) + gtn = np.sum([rb['m'] for rb in results]) + line = 'mean_ratio:{:.4f}, valids:{}, total:{}, gtn:{}'\ + .format(mean_ratio, valids, total, gtn) + return line, mean_ratio + + def compute_ji_matching(self, dt_boxes, gt_boxes): + """Match the annotation box for each detection box. + + Args: + dt_boxes(ndarray): Detection boxes. + gt_boxes(ndarray): Ground_truth boxes. + + Returns: + matches_(list[tuple[int, int]]): Match result. + """ + assert dt_boxes.shape[-1] > 3 and gt_boxes.shape[-1] > 3 + if dt_boxes.shape[0] < 1 or gt_boxes.shape[0] < 1: + return list() + + ious = bbox_overlaps(dt_boxes, gt_boxes, mode='iou') + input_ = copy.deepcopy(ious) + input_[input_ < self.iou_thres] = 0 + match_scipy = maximum_bipartite_matching( + csr_matrix(input_), perm_type='column') + matches_ = [] + for i in range(len(match_scipy)): + if match_scipy[i] != -1: + matches_.append((i, int(match_scipy[i]))) + return matches_ + + def get_ignores(self, dt_boxes, gt_boxes): + """Get the number of ignore bboxes.""" + if gt_boxes.size: + ioas = bbox_overlaps(dt_boxes, gt_boxes, mode='iof') + ioas = np.max(ioas, axis=1) + rows = np.where(ioas > self.iou_thres)[0] + return len(rows) + else: + return 0 + + +class Image(object): + """Data structure for evaluation of CrowdHuman. + + Note: + This implementation is modified from https://github.com/Purkialo/ + CrowdDet/blob/master/lib/evaluate/APMRToolkits/image.py + + Args: + mode (int): Select the mode of evaluate. Valid mode include + 0(just body box), 1(just head box) and 2(both of them). + Defaults to 0. + """ + + def __init__(self, mode): + self.ID = None + self.width = None + self.height = None + self.dt_boxes = None + self.gt_boxes = None + self.eval_mode = mode + + self.ign_num = None + self.gt_num = None + self.dt_num = None + + def load(self, record, body_key, head_key, class_names, gt_flag): + """Loading information for evaluation. + + Args: + record (dict): Label information or test results. 
+ The format might look something like this: + { + 'ID': '273271,c9db000d5146c15', + 'gtboxes': [ + {'fbox': [72, 202, 163, 503], 'tag': 'person', ...}, + {'fbox': [199, 180, 144, 499], 'tag': 'person', ...}, + ... + ] + } + or: + { + 'ID': '273271,c9db000d5146c15', + 'width': 800, + 'height': 1067, + 'dtboxes': [ + { + 'box': [306.22, 205.95, 164.05, 394.04], + 'score': 0.99, + 'tag': 1 + }, + { + 'box': [403.60, 178.66, 157.15, 421.33], + 'score': 0.99, + 'tag': 1 + }, + ... + ] + } + body_key (str, None): key of detection body box. + Valid when loading detection results and self.eval_mode!=1. + head_key (str, None): key of detection head box. + Valid when loading detection results and self.eval_mode!=0. + class_names (list[str]):class names of data set. + Defaults to ['background', 'person']. + gt_flag (bool): Indicate whether record is ground truth + or predicting the outcome. + """ + if 'ID' in record and self.ID is None: + self.ID = record['ID'] + if 'width' in record and self.width is None: + self.width = record['width'] + if 'height' in record and self.height is None: + self.height = record['height'] + if gt_flag: + self.gt_num = len(record['gtboxes']) + body_bbox, head_bbox = self.load_gt_boxes(record, 'gtboxes', + class_names) + if self.eval_mode == 0: + self.gt_boxes = body_bbox + self.ign_num = (body_bbox[:, -1] == -1).sum() + elif self.eval_mode == 1: + self.gt_boxes = head_bbox + self.ign_num = (head_bbox[:, -1] == -1).sum() + else: + gt_tag = np.array([ + body_bbox[i, -1] != -1 and head_bbox[i, -1] != -1 + for i in range(len(body_bbox)) + ]) + self.ign_num = (gt_tag == 0).sum() + self.gt_boxes = np.hstack( + (body_bbox[:, :-1], head_bbox[:, :-1], + gt_tag.reshape(-1, 1))) + + if not gt_flag: + self.dt_num = len(record['dtboxes']) + if self.eval_mode == 0: + self.dt_boxes = self.load_det_boxes(record, 'dtboxes', + body_key, 'score') + elif self.eval_mode == 1: + self.dt_boxes = self.load_det_boxes(record, 'dtboxes', + head_key, 'score') + else: + body_dtboxes = self.load_det_boxes(record, 'dtboxes', body_key, + 'score') + head_dtboxes = self.load_det_boxes(record, 'dtboxes', head_key, + 'score') + self.dt_boxes = np.hstack((body_dtboxes, head_dtboxes)) + + @staticmethod + def load_gt_boxes(dict_input, key_name, class_names): + """load ground_truth and transform [x, y, w, h] to [x1, y1, x2, y2]""" + assert key_name in dict_input + if len(dict_input[key_name]) < 1: + return np.empty([0, 5]) + head_bbox = [] + body_bbox = [] + for rb in dict_input[key_name]: + if rb['tag'] in class_names: + body_tag = class_names.index(rb['tag']) + head_tag = copy.deepcopy(body_tag) + else: + body_tag = -1 + head_tag = -1 + if 'extra' in rb: + if 'ignore' in rb['extra']: + if rb['extra']['ignore'] != 0: + body_tag = -1 + head_tag = -1 + if 'head_attr' in rb: + if 'ignore' in rb['head_attr']: + if rb['head_attr']['ignore'] != 0: + head_tag = -1 + head_bbox.append(np.hstack((rb['hbox'], head_tag))) + body_bbox.append(np.hstack((rb['fbox'], body_tag))) + head_bbox = np.array(head_bbox) + head_bbox[:, 2:4] += head_bbox[:, :2] + body_bbox = np.array(body_bbox) + body_bbox[:, 2:4] += body_bbox[:, :2] + return body_bbox, head_bbox + + @staticmethod + def load_det_boxes(dict_input, key_name, key_box, key_score, key_tag=None): + """load detection boxes.""" + assert key_name in dict_input + if len(dict_input[key_name]) < 1: + return np.empty([0, 5]) + else: + assert key_box in dict_input[key_name][0] + if key_score: + assert key_score in dict_input[key_name][0] + if key_tag: + assert key_tag in 
dict_input[key_name][0] + if key_score: + if key_tag: + bboxes = np.vstack([ + np.hstack((rb[key_box], rb[key_score], rb[key_tag])) + for rb in dict_input[key_name] + ]) + else: + bboxes = np.vstack([ + np.hstack((rb[key_box], rb[key_score])) + for rb in dict_input[key_name] + ]) + else: + if key_tag: + bboxes = np.vstack([ + np.hstack((rb[key_box], rb[key_tag])) + for rb in dict_input[key_name] + ]) + else: + bboxes = np.vstack( + [rb[key_box] for rb in dict_input[key_name]]) + bboxes[:, 2:4] += bboxes[:, :2] + return bboxes + + def clip_all_boader(self): + """Make sure boxes are within the image range.""" + + def _clip_boundary(boxes, height, width): + assert boxes.shape[-1] >= 4 + boxes[:, 0] = np.minimum(np.maximum(boxes[:, 0], 0), width - 1) + boxes[:, 1] = np.minimum(np.maximum(boxes[:, 1], 0), height - 1) + boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], width), 0) + boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], height), 0) + return boxes + + assert self.dt_boxes.shape[-1] >= 4 + assert self.gt_boxes.shape[-1] >= 4 + assert self.width is not None and self.height is not None + if self.eval_mode == 2: + self.dt_boxes[:, :4] = _clip_boundary(self.dt_boxes[:, :4], + self.height, self.width) + self.gt_boxes[:, :4] = _clip_boundary(self.gt_boxes[:, :4], + self.height, self.width) + self.dt_boxes[:, 4:8] = _clip_boundary(self.dt_boxes[:, 4:8], + self.height, self.width) + self.gt_boxes[:, 4:8] = _clip_boundary(self.gt_boxes[:, 4:8], + self.height, self.width) + else: + self.dt_boxes = _clip_boundary(self.dt_boxes, self.height, + self.width) + self.gt_boxes = _clip_boundary(self.gt_boxes, self.height, + self.width) + + def compare_voc(self, thres): + """Match the detection results with the ground_truth by VOC. + + Args: + thres (float): IOU threshold. + + Returns: + score_list(list[tuple[ndarray, int, str]]): Matching result. + a list of tuples (dtbox, label, imgID) in the descending + sort of dtbox.score. + """ + if self.dt_boxes is None: + return list() + dtboxes = self.dt_boxes + gtboxes = self.gt_boxes if self.gt_boxes is not None else list() + dtboxes.sort(key=lambda x: x.score, reverse=True) + gtboxes.sort(key=lambda x: x.ign) + + score_list = list() + for i, dt in enumerate(dtboxes): + maxpos = -1 + maxiou = thres + + for j, gt in enumerate(gtboxes): + overlap = dt.iou(gt) + if overlap > maxiou: + maxiou = overlap + maxpos = j + + if maxpos >= 0: + if gtboxes[maxpos].ign == 0: + gtboxes[maxpos].matched = 1 + dtboxes[i].matched = 1 + score_list.append((dt, self.ID)) + else: + dtboxes[i].matched = -1 + else: + dtboxes[i].matched = 0 + score_list.append((dt, self.ID)) + return score_list + + def compare_caltech(self, thres): + """Match the detection results with the ground_truth by Caltech + matching strategy. + + Args: + thres (float): IOU threshold. + + Returns: + score_list(list[tuple[ndarray, int, str]]): Matching result. + a list of tuples (dtbox, label, imgID) in the descending + sort of dtbox.score. 
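+
+ As a purely illustrative entry, one element of the returned list
+ could look like
+ ``(array([x1, y1, x2, y2, score]), 1, '273271,c9db000d5146c15')``,
+ where the second item is 1 for a detection matched to a
+ non-ignored ground truth and 0 otherwise (the image ID shown is
+ only an example).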
+ """ + if self.dt_boxes is None or self.gt_boxes is None: + return list() + + dtboxes = self.dt_boxes if self.dt_boxes is not None else list() + gtboxes = self.gt_boxes if self.gt_boxes is not None else list() + dt_matched = np.zeros(dtboxes.shape[0]) + gt_matched = np.zeros(gtboxes.shape[0]) + + dtboxes = np.array(sorted(dtboxes, key=lambda x: x[-1], reverse=True)) + gtboxes = np.array(sorted(gtboxes, key=lambda x: x[-1], reverse=True)) + if len(dtboxes): + overlap_iou = bbox_overlaps(dtboxes, gtboxes, mode='iou') + overlap_ioa = bbox_overlaps(dtboxes, gtboxes, mode='iof') + else: + return list() + + score_list = list() + for i, dt in enumerate(dtboxes): + maxpos = -1 + maxiou = thres + for j, gt in enumerate(gtboxes): + if gt_matched[j] == 1: + continue + if gt[-1] > 0: + overlap = overlap_iou[i][j] + if overlap > maxiou: + maxiou = overlap + maxpos = j + else: + if maxpos >= 0: + break + else: + overlap = overlap_ioa[i][j] + if overlap > thres: + maxiou = overlap + maxpos = j + if maxpos >= 0: + if gtboxes[maxpos, -1] > 0: + gt_matched[maxpos] = 1 + dt_matched[i] = 1 + score_list.append((dt, 1, self.ID)) + else: + dt_matched[i] = -1 + else: + dt_matched[i] = 0 + score_list.append((dt, 0, self.ID)) + return score_list diff --git a/mmdetection/mmdet/evaluation/metrics/dump_det_results.py b/mmdetection/mmdet/evaluation/metrics/dump_det_results.py new file mode 100644 index 0000000..f3071d1 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/dump_det_results.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Sequence + +from mmengine.evaluator import DumpResults +from mmengine.evaluator.metric import _to_cpu + +from mmdet.registry import METRICS +from mmdet.structures.mask import encode_mask_results + + +@METRICS.register_module() +class DumpDetResults(DumpResults): + """Dump model predictions to a pickle file for offline evaluation. + + Different from `DumpResults` in MMEngine, it compresses instance + segmentation masks into RLE format. + + Args: + out_file_path (str): Path of the dumped file. Must end with '.pkl' + or '.pickle'. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + """ + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """transfer tensors in predictions to CPU.""" + data_samples = _to_cpu(data_samples) + for data_sample in data_samples: + # remove gt + data_sample.pop('gt_instances', None) + data_sample.pop('ignored_instances', None) + data_sample.pop('gt_panoptic_seg', None) + + if 'pred_instances' in data_sample: + pred = data_sample['pred_instances'] + # encode mask to RLE + if 'masks' in pred: + pred['masks'] = encode_mask_results(pred['masks'].numpy()) + if 'pred_panoptic_seg' in data_sample: + warnings.warn( + 'Panoptic segmentation map will not be compressed. ' + 'The dumped file will be extremely large! ' + 'Suggest using `CocoPanopticMetric` to save the coco ' + 'format json and segmentation png files directly.') + self.results.extend(data_samples) diff --git a/mmdetection/mmdet/evaluation/metrics/dump_proposals_metric.py b/mmdetection/mmdet/evaluation/metrics/dump_proposals_metric.py new file mode 100644 index 0000000..9e9c536 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/dump_proposals_metric.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +from typing import Optional, Sequence + +from mmengine.dist import is_main_process +from mmengine.evaluator import BaseMetric +from mmengine.fileio import dump +from mmengine.logging import MMLogger +from mmengine.structures import InstanceData + +from mmdet.registry import METRICS + + +@METRICS.register_module() +class DumpProposals(BaseMetric): + """Dump proposals pseudo metric. + + Args: + output_dir (str): The root directory for ``proposals_file``. + Defaults to ''. + proposals_file (str): Proposals file path. Defaults to 'proposals.pkl'. + num_max_proposals (int, optional): Maximum number of proposals to dump. + If not specified, all proposals will be dumped. + file_client_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + + default_prefix: Optional[str] = 'dump_proposals' + + def __init__(self, + output_dir: str = '', + proposals_file: str = 'proposals.pkl', + num_max_proposals: Optional[int] = None, + file_client_args: dict = None, + backend_args: dict = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + self.num_max_proposals = num_max_proposals + # TODO: update after mmengine finish refactor fileio. + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + self.output_dir = output_dir + assert proposals_file.endswith(('.pkl', '.pickle')), \ + 'The output file must be a pkl file.' + + self.proposals_file = os.path.join(self.output_dir, proposals_file) + if is_main_process(): + os.makedirs(self.output_dir, exist_ok=True) + + def process(self, data_batch: Sequence[dict], + data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + pred = data_sample['pred_instances'] + # `bboxes` is sorted by `scores` + ranked_scores, rank_inds = pred['scores'].sort(descending=True) + ranked_bboxes = pred['bboxes'][rank_inds, :] + + ranked_bboxes = ranked_bboxes.cpu().numpy() + ranked_scores = ranked_scores.cpu().numpy() + + pred_instance = InstanceData() + pred_instance.bboxes = ranked_bboxes + pred_instance.scores = ranked_scores + if self.num_max_proposals is not None: + pred_instance = pred_instance[:self.num_max_proposals] + + img_path = data_sample['img_path'] + # `file_name` is the key to obtain the proposals from the + # `proposals_list`. 
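+ # It is built as '<parent dir>/<basename>' of the image path, e.g.
+ # an image at '.../JPEGImages/000001.jpg' (illustrative path) would
+ # be stored under the key 'JPEGImages/000001.jpg'.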
+ file_name = osp.join( + osp.split(osp.split(img_path)[0])[-1], + osp.split(img_path)[-1]) + result = {file_name: pred_instance} + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + """Dump the processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: An empty dict. + """ + logger: MMLogger = MMLogger.get_current_instance() + dump_results = {} + for result in results: + dump_results.update(result) + dump( + dump_results, + file=self.proposals_file, + backend_args=self.backend_args) + logger.info(f'Results are saved at {self.proposals_file}') + return {} diff --git a/mmdetection/mmdet/evaluation/metrics/lvis_metric.py b/mmdetection/mmdet/evaluation/metrics/lvis_metric.py new file mode 100644 index 0000000..e4dd614 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/lvis_metric.py @@ -0,0 +1,364 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import itertools +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +from mmengine.fileio import get_local_path +from mmengine.logging import MMLogger +from terminaltables import AsciiTable + +from mmdet.registry import METRICS +from mmdet.structures.mask import encode_mask_results +from ..functional import eval_recalls +from .coco_metric import CocoMetric + +try: + import lvis + if getattr(lvis, '__version__', '0') >= '10.5.3': + warnings.warn( + 'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"', # noqa: E501 + UserWarning) + from lvis import LVIS, LVISEval, LVISResults +except ImportError: + lvis = None + LVISEval = None + LVISResults = None + + +@METRICS.register_module() +class LVISMetric(CocoMetric): + """LVIS evaluation metric. + + Args: + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'bbox', 'segm', 'proposal', and 'proposal_fast'. + Defaults to 'bbox'. + classwise (bool): Whether to evaluate the metric class-wise. + Defaults to False. + proposal_nums (Sequence[int]): Numbers of proposals to be evaluated. + Defaults to (100, 300, 1000). + iou_thrs (float | List[float], optional): IoU threshold to compute AP + and AR. If not specified, IoUs from 0.5 to 0.95 will be used. + Defaults to None. + metric_items (List[str], optional): Metric result names to be + recorded in the evaluation result. Defaults to None. + format_only (bool): Format the output results without perform + evaluation. It is useful when you want to format the result + to a specific format and submit it to the test server. + Defaults to False. + outfile_prefix (str, optional): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. 
+ file_client_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + """ + + default_prefix: Optional[str] = 'lvis' + + def __init__(self, + ann_file: Optional[str] = None, + metric: Union[str, List[str]] = 'bbox', + classwise: bool = False, + proposal_nums: Sequence[int] = (100, 300, 1000), + iou_thrs: Optional[Union[float, Sequence[float]]] = None, + metric_items: Optional[Sequence[str]] = None, + format_only: bool = False, + outfile_prefix: Optional[str] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + file_client_args: dict = None, + backend_args: dict = None) -> None: + if lvis is None: + raise RuntimeError( + 'Package lvis is not installed. Please run "pip install ' + 'git+https://github.com/lvis-dataset/lvis-api.git".') + super().__init__(collect_device=collect_device, prefix=prefix) + # coco evaluation metrics + self.metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast'] + for metric in self.metrics: + if metric not in allowed_metrics: + raise KeyError( + "metric should be one of 'bbox', 'segm', 'proposal', " + f"'proposal_fast', but got {metric}.") + + # do class wise evaluation, default False + self.classwise = classwise + + # proposal_nums used to compute recall or precision. + self.proposal_nums = list(proposal_nums) + + # iou_thrs used to compute recall or precision. + if iou_thrs is None: + iou_thrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.iou_thrs = iou_thrs + self.metric_items = metric_items + self.format_only = format_only + if self.format_only: + assert outfile_prefix is not None, 'outfile_prefix must be not' + 'None when format_only is True, otherwise the result files will' + 'be saved to a temp directory which will be cleaned up at the end.' + + self.outfile_prefix = outfile_prefix + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + + # if ann_file is not specified, + # initialize lvis api with the converted dataset + if ann_file is not None: + with get_local_path( + ann_file, backend_args=self.backend_args) as local_path: + self._lvis_api = LVIS(local_path) + else: + self._lvis_api = None + + # handle dataset lazy init + self.cat_ids = None + self.img_ids = None + + def fast_eval_recall(self, + results: List[dict], + proposal_nums: Sequence[int], + iou_thrs: Sequence[float], + logger: Optional[MMLogger] = None) -> np.ndarray: + """Evaluate proposal recall with LVIS's fast_eval_recall. + + Args: + results (List[dict]): Results of the dataset. + proposal_nums (Sequence[int]): Proposal numbers used for + evaluation. + iou_thrs (Sequence[float]): IoU thresholds used for evaluation. + logger (MMLogger, optional): Logger used for logging the recall + summary. + Returns: + np.ndarray: Averaged recall results. 
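+ The array contains one value per entry of ``proposal_nums``,
+ i.e. the recall averaged over ``iou_thrs`` for that proposal
+ budget (with the default settings, ``ar[0]`` would correspond
+ to AR@100).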
+ """ + gt_bboxes = [] + pred_bboxes = [result['bboxes'] for result in results] + for i in range(len(self.img_ids)): + ann_ids = self._lvis_api.get_ann_ids(img_ids=[self.img_ids[i]]) + ann_info = self._lvis_api.load_anns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w, y1 + h]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, pred_bboxes, proposal_nums, iou_thrs, logger=logger) + ar = recalls.mean(axis=1) + return ar + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + # encode mask to RLE + if 'masks' in pred: + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) + # some detectors use different scores for bbox and mask + if 'mask_scores' in pred: + result['mask_scores'] = pred['mask_scores'].cpu().numpy() + + # parse gt + gt = dict() + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['img_id'] = data_sample['img_id'] + if self._lvis_api is None: + # TODO: Need to refactor to support LoadAnnotations + assert 'instances' in data_sample, \ + 'ground truth is required for evaluation when ' \ + '`ann_file` is not provided' + gt['anns'] = data_sample['instances'] + # add converted result to the results list + self.results.append((gt, result)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + # split gt and prediction list + gts, preds = zip(*results) + + tmp_dir = None + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + + if self._lvis_api is None: + # use converted gt json file to initialize coco api + logger.info('Converting ground truth to coco format...') + coco_json_path = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=outfile_prefix) + self._lvis_api = LVIS(coco_json_path) + + # handle lazy init + if self.cat_ids is None: + self.cat_ids = self._lvis_api.get_cat_ids() + if self.img_ids is None: + self.img_ids = self._lvis_api.get_img_ids() + + # convert predictions to coco format and dump to json file + result_files = self.results2json(preds, outfile_prefix) + + eval_results = OrderedDict() + if self.format_only: + logger.info('results are saved in ' + f'{osp.dirname(outfile_prefix)}') + return eval_results + + lvis_gt = self._lvis_api + + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + + # TODO: May refactor fast_eval_recall to an independent metric? + # fast eval recall + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + preds, self.proposal_nums, self.iou_thrs, logger=logger) + log_msg = [] + for i, num in enumerate(self.proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') + log_msg = ''.join(log_msg) + logger.info(log_msg) + continue + + try: + lvis_dt = LVISResults(lvis_gt, result_files[metric]) + except IndexError: + logger.info( + 'The testing results of the whole dataset is empty.') + break + + iou_type = 'bbox' if metric == 'proposal' else metric + lvis_eval = LVISEval(lvis_gt, lvis_dt, iou_type) + lvis_eval.params.imgIds = self.img_ids + metric_items = self.metric_items + if metric == 'proposal': + lvis_eval.params.useCats = 0 + lvis_eval.params.maxDets = list(self.proposal_nums) + lvis_eval.evaluate() + lvis_eval.accumulate() + lvis_eval.summarize() + if metric_items is None: + metric_items = ['AR@300', 'ARs@300', 'ARm@300', 'ARl@300'] + for k, v in lvis_eval.get_results().items(): + if k in metric_items: + val = float('{:.3f}'.format(float(v))) + eval_results[k] = val + + else: + lvis_eval.evaluate() + lvis_eval.accumulate() + lvis_eval.summarize() + lvis_results = lvis_eval.get_results() + if self.classwise: # Compute per-category AP + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/ + precisions = lvis_eval.eval['precision'] + # precision: (iou, recall, cls, area range, max dets) + assert len(self.cat_ids) == precisions.shape[2] + + results_per_category = [] + for idx, catId in enumerate(self.cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + # the dimensions of precisions are + # [num_thrs, num_recalls, num_cats, num_area_rngs] + nm = self._lvis_api.load_cats([catId])[0] + precision = precisions[:, :, idx, 0] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (f'{nm["name"]}', f'{float(ap):0.3f}')) + eval_results[f'{nm["name"]}_precision'] = round(ap, 3) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list( + itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest(*[ + results_flatten[i::num_columns] + for i in 
range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('\n' + table.table) + + if metric_items is None: + metric_items = [ + 'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'APr', + 'APc', 'APf' + ] + + for k, v in lvis_results.items(): + if k in metric_items: + key = '{}_{}'.format(metric, k) + val = float('{:.3f}'.format(float(v))) + eval_results[key] = val + + lvis_eval.print_results() + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results diff --git a/mmdetection/mmdet/evaluation/metrics/mot_challenge_metric.py b/mmdetection/mmdet/evaluation/metrics/mot_challenge_metric.py new file mode 100644 index 0000000..a5513c4 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/mot_challenge_metric.py @@ -0,0 +1,443 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +import tempfile +from collections import defaultdict +from typing import List, Optional, Union + +import numpy as np +import torch + +try: + import trackeval +except ImportError: + trackeval = None +from mmengine.dist import (all_gather_object, barrier, broadcast, + broadcast_object_list, get_dist_info, + is_main_process) +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS, TASK_UTILS +from .base_video_metric import BaseVideoMetric + + +def get_tmpdir() -> str: + """return the same tmpdir for all processes.""" + rank, world_size = get_dist_info() + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8) + if rank == 0: + tmpdir = tempfile.mkdtemp() + tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8) + dir_tensor[:len(tmpdir)] = tmpdir + broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + return tmpdir + + +@METRICS.register_module() +class MOTChallengeMetric(BaseVideoMetric): + """Evaluation metrics for MOT Challenge. + + Args: + metric (str | list[str]): Metrics to be evaluated. Options are + 'HOTA', 'CLEAR', 'Identity'. + Defaults to ['HOTA', 'CLEAR', 'Identity']. + outfile_prefix (str, optional): Path to save the formatted results. + Defaults to None. + track_iou_thr (float): IoU threshold for tracking evaluation. + Defaults to 0.5. + benchmark (str): Benchmark to be evaluated. Defaults to 'MOT17'. + format_only (bool): If True, only formatting the results to the + official format and not performing evaluation. Defaults to False. + postprocess_tracklet_cfg (List[dict], optional): configs for tracklets + postprocessing methods. `InterpolateTracklets` is supported. + Defaults to [] + - InterpolateTracklets: + - min_num_frames (int, optional): The minimum length of a + track that will be interpolated. Defaults to 5. + - max_num_frames (int, optional): The maximum disconnected + length in a track. Defaults to 20. + - use_gsi (bool, optional): Whether to use the GSI (Gaussian- + smoothed interpolation) method. Defaults to False. + - smooth_tau (int, optional): smoothing parameter in GSI. + Defaults to 10. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. 
Default: None + Returns: + """ + TRACKER = 'default-tracker' + allowed_metrics = ['HOTA', 'CLEAR', 'Identity'] + allowed_benchmarks = ['MOT15', 'MOT16', 'MOT17', 'MOT20', 'DanceTrack'] + default_prefix: Optional[str] = 'motchallenge-metric' + + def __init__(self, + metric: Union[str, List[str]] = ['HOTA', 'CLEAR', 'Identity'], + outfile_prefix: Optional[str] = None, + track_iou_thr: float = 0.5, + benchmark: str = 'MOT17', + format_only: bool = False, + use_postprocess: bool = False, + postprocess_tracklet_cfg: Optional[List[dict]] = [], + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + if trackeval is None: + raise RuntimeError( + 'trackeval is not installed,' + 'please install it by: pip install' + 'git+https://github.com/JonathonLuiten/TrackEval.git' + 'trackeval need low version numpy, please install it' + 'by: pip install -U numpy==1.23.5') + if isinstance(metric, list): + metrics = metric + elif isinstance(metric, str): + metrics = [metric] + else: + raise TypeError('metric must be a list or a str.') + for metric in metrics: + if metric not in self.allowed_metrics: + raise KeyError(f'metric {metric} is not supported.') + self.metrics = metrics + self.format_only = format_only + if self.format_only: + assert outfile_prefix is not None, 'outfile_prefix must be not' + 'None when format_only is True, otherwise the result files will' + 'be saved to a temp directory which will be cleaned up at the end.' + self.use_postprocess = use_postprocess + self.postprocess_tracklet_cfg = postprocess_tracklet_cfg.copy() + self.postprocess_tracklet_methods = [ + TASK_UTILS.build(cfg) for cfg in self.postprocess_tracklet_cfg + ] + assert benchmark in self.allowed_benchmarks + self.benchmark = benchmark + self.track_iou_thr = track_iou_thr + self.tmp_dir = tempfile.TemporaryDirectory() + self.tmp_dir.name = get_tmpdir() + self.seq_info = defaultdict( + lambda: dict(seq_length=-1, gt_tracks=[], pred_tracks=[])) + self.gt_dir = self._get_gt_dir() + self.pred_dir = self._get_pred_dir(outfile_prefix) + self.seqmap = osp.join(self.pred_dir, 'videoseq.txt') + with open(self.seqmap, 'w') as f: + f.write('name\n') + + def __del__(self): + # To avoid tmpdir being cleaned up too early, because in multiple + # consecutive ValLoops, the value of `self.tmp_dir.name` is unchanged, + # and calling `tmp_dir.cleanup()` in compute_metrics will cause errors. 
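+ # Deferring cleanup to object destruction therefore avoids removing
+ # the shared temporary directory while it may still be needed.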
+ self.tmp_dir.cleanup() + + def _get_pred_dir(self, outfile_prefix): + """Get directory to save the prediction results.""" + logger: MMLogger = MMLogger.get_current_instance() + + if outfile_prefix is None: + outfile_prefix = self.tmp_dir.name + else: + if osp.exists(outfile_prefix) and is_main_process(): + logger.info('remove previous results.') + shutil.rmtree(outfile_prefix) + pred_dir = osp.join(outfile_prefix, self.TRACKER) + os.makedirs(pred_dir, exist_ok=True) + return pred_dir + + def _get_gt_dir(self): + """Get directory to save the gt files.""" + output_dir = osp.join(self.tmp_dir.name, 'gt') + os.makedirs(output_dir, exist_ok=True) + return output_dir + + def transform_gt_and_pred(self, img_data_sample, video, frame_id): + + video = img_data_sample['img_path'].split(os.sep)[-3] + # load gts + if 'instances' in img_data_sample: + gt_instances = img_data_sample['instances'] + gt_tracks = [ + np.array([ + frame_id + 1, gt_instances[i]['instance_id'], + gt_instances[i]['bbox'][0], gt_instances[i]['bbox'][1], + gt_instances[i]['bbox'][2] - gt_instances[i]['bbox'][0], + gt_instances[i]['bbox'][3] - gt_instances[i]['bbox'][1], + gt_instances[i]['mot_conf'], + gt_instances[i]['category_id'], + gt_instances[i]['visibility'] + ]) for i in range(len(gt_instances)) + ] + self.seq_info[video]['gt_tracks'].extend(gt_tracks) + + # load predictions + assert 'pred_track_instances' in img_data_sample + if self.use_postprocess: + pred_instances = img_data_sample['pred_track_instances'] + pred_tracks = [ + pred_instances['bboxes'][i] + for i in range(len(pred_instances['bboxes'])) + ] + else: + pred_instances = img_data_sample['pred_track_instances'] + pred_tracks = [ + np.array([ + frame_id + 1, pred_instances['instances_id'][i].cpu(), + pred_instances['bboxes'][i][0].cpu(), + pred_instances['bboxes'][i][1].cpu(), + (pred_instances['bboxes'][i][2] - + pred_instances['bboxes'][i][0]).cpu(), + (pred_instances['bboxes'][i][3] - + pred_instances['bboxes'][i][1]).cpu(), + pred_instances['scores'][i].cpu() + ]) for i in range(len(pred_instances['instances_id'])) + ] + self.seq_info[video]['pred_tracks'].extend(pred_tracks) + + def process_image(self, data_samples, video_len): + + img_data_sample = data_samples[0].to_dict() + video = img_data_sample['img_path'].split(os.sep)[-3] + frame_id = img_data_sample['frame_id'] + if self.seq_info[video]['seq_length'] == -1: + self.seq_info[video]['seq_length'] = video_len + self.transform_gt_and_pred(img_data_sample, video, frame_id) + + if frame_id == video_len - 1: + # postprocessing + if self.postprocess_tracklet_cfg: + info = self.seq_info[video] + pred_tracks = np.array(info['pred_tracks']) + for postprocess_tracklet_methods in \ + self.postprocess_tracklet_methods: + pred_tracks = postprocess_tracklet_methods\ + .forward(pred_tracks) + info['pred_tracks'] = pred_tracks + self._save_one_video_gts_preds(video) + + def process_video(self, data_samples): + + video_len = len(data_samples) + for frame_id in range(video_len): + img_data_sample = data_samples[frame_id].to_dict() + # load basic info + video = img_data_sample['img_path'].split(os.sep)[-3] + if self.seq_info[video]['seq_length'] == -1: + self.seq_info[video]['seq_length'] = video_len + self.transform_gt_and_pred(img_data_sample, video, frame_id) + + if self.postprocess_tracklet_cfg: + info = self.seq_info[video] + pred_tracks = np.array(info['pred_tracks']) + for postprocess_tracklet_methods in \ + self.postprocess_tracklet_methods: + pred_tracks = postprocess_tracklet_methods \ + 
.forward(pred_tracks) + info['pred_tracks'] = pred_tracks + self._save_one_video_gts_preds(video) + + def _save_one_video_gts_preds(self, seq: str) -> None: + """Save the gt and prediction results.""" + info = self.seq_info[seq] + # save predictions + pred_file = osp.join(self.pred_dir, seq + '.txt') + + pred_tracks = np.array(info['pred_tracks']) + + with open(pred_file, 'wt') as f: + for tracks in pred_tracks: + line = '%d,%d,%.3f,%.3f,%.3f,%.3f,%.3f,-1,-1,-1\n' % ( + tracks[0], tracks[1], tracks[2], tracks[3], tracks[4], + tracks[5], tracks[6]) + f.writelines(line) + + info['pred_tracks'] = [] + # save gts + if info['gt_tracks']: + gt_file = osp.join(self.gt_dir, seq + '.txt') + with open(gt_file, 'wt') as f: + for tracks in info['gt_tracks']: + line = '%d,%d,%d,%d,%d,%d,%d,%d,%.5f\n' % ( + tracks[0], tracks[1], tracks[2], tracks[3], tracks[4], + tracks[5], tracks[6], tracks[7], tracks[8]) + f.writelines(line) + info['gt_tracks'].clear() + # save seq info + with open(self.seqmap, 'a') as f: + f.write(seq + '\n') + f.close() + + def compute_metrics(self, results: list = None) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + Defaults to None. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + # NOTICE: don't access `self.results` from the method. + eval_results = dict() + + if self.format_only: + return eval_results + + eval_config = trackeval.Evaluator.get_default_eval_config() + + # need to split out the tracker name + # caused by the implementation of TrackEval + pred_dir_tmp = self.pred_dir.rsplit(osp.sep, 1)[0] + dataset_config = self.get_dataset_cfg(self.gt_dir, pred_dir_tmp) + + evaluator = trackeval.Evaluator(eval_config) + dataset = [trackeval.datasets.MotChallenge2DBox(dataset_config)] + metrics = [ + getattr(trackeval.metrics, + metric)(dict(METRICS=[metric], THRESHOLD=0.5)) + for metric in self.metrics + ] + output_res, _ = evaluator.evaluate(dataset, metrics) + output_res = output_res['MotChallenge2DBox'][ + self.TRACKER]['COMBINED_SEQ']['pedestrian'] + + if 'HOTA' in self.metrics: + logger.info('Evaluating HOTA Metrics...') + eval_results['HOTA'] = np.average(output_res['HOTA']['HOTA']) + eval_results['AssA'] = np.average(output_res['HOTA']['AssA']) + eval_results['DetA'] = np.average(output_res['HOTA']['DetA']) + + if 'CLEAR' in self.metrics: + logger.info('Evaluating CLEAR Metrics...') + eval_results['MOTA'] = np.average(output_res['CLEAR']['MOTA']) + eval_results['MOTP'] = np.average(output_res['CLEAR']['MOTP']) + eval_results['IDSW'] = np.average(output_res['CLEAR']['IDSW']) + eval_results['TP'] = np.average(output_res['CLEAR']['CLR_TP']) + eval_results['FP'] = np.average(output_res['CLEAR']['CLR_FP']) + eval_results['FN'] = np.average(output_res['CLEAR']['CLR_FN']) + eval_results['Frag'] = np.average(output_res['CLEAR']['Frag']) + eval_results['MT'] = np.average(output_res['CLEAR']['MT']) + eval_results['ML'] = np.average(output_res['CLEAR']['ML']) + + if 'Identity' in self.metrics: + logger.info('Evaluating Identity Metrics...') + eval_results['IDF1'] = np.average(output_res['Identity']['IDF1']) + eval_results['IDTP'] = np.average(output_res['Identity']['IDTP']) + eval_results['IDFN'] = np.average(output_res['Identity']['IDFN']) + eval_results['IDFP'] = np.average(output_res['Identity']['IDFP']) + eval_results['IDP'] = 
np.average(output_res['Identity']['IDP']) + eval_results['IDR'] = np.average(output_res['Identity']['IDR']) + + return eval_results + + def evaluate(self, size: int = 1) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + Defaults to None. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + # wait for all processes to complete prediction. + barrier() + + # gather seq_info and convert the list of dict to a dict. + # convert self.seq_info to dict first to make it picklable. + gathered_seq_info = all_gather_object(dict(self.seq_info)) + all_seq_info = dict() + for _seq_info in gathered_seq_info: + all_seq_info.update(_seq_info) + self.seq_info = all_seq_info + + if is_main_process(): + _metrics = self.compute_metrics() # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] + + def get_dataset_cfg(self, gt_folder: str, tracker_folder: str): + """Get default configs for trackeval.datasets.MotChallenge2DBox. + + Args: + gt_folder (str): the name of the GT folder + tracker_folder (str): the name of the tracker folder + + Returns: + Dataset Configs for MotChallenge2DBox. + """ + dataset_config = dict( + # Location of GT data + GT_FOLDER=gt_folder, + # Trackers location + TRACKERS_FOLDER=tracker_folder, + # Where to save eval results + # (if None, same as TRACKERS_FOLDER) + OUTPUT_FOLDER=None, + # Use self.TRACKER as the default tracker + TRACKERS_TO_EVAL=[self.TRACKER], + # Option values: ['pedestrian'] + CLASSES_TO_EVAL=['pedestrian'], + # Option Values: 'MOT15', 'MOT16', 'MOT17', 'MOT20', 'DanceTrack' + BENCHMARK=self.benchmark, + # Option Values: 'train', 'test' + SPLIT_TO_EVAL='val' if self.benchmark == 'DanceTrack' else 'train', + # Whether tracker input files are zipped + INPUT_AS_ZIP=False, + # Whether to print current config + PRINT_CONFIG=True, + # Whether to perform preprocessing + # (never done for MOT15) + DO_PREPROC=False if self.benchmark == 'MOT15' else True, + # Tracker files are in + # TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER + TRACKER_SUB_FOLDER='', + # Output files are saved in + # OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER + OUTPUT_SUB_FOLDER='', + # Names of trackers to display + # (if None: TRACKERS_TO_EVAL) + TRACKER_DISPLAY_NAMES=None, + # Where seqmaps are found + # (if None: GT_FOLDER/seqmaps) + SEQMAP_FOLDER=None, + # Directly specify seqmap file + # (if none use seqmap_folder/benchmark-split_to_eval) + SEQMAP_FILE=self.seqmap, + # If not None, specify sequences to eval + # and their number of timesteps + SEQ_INFO={ + seq: info['seq_length'] + for seq, info in self.seq_info.items() + }, + # '{gt_folder}/{seq}.txt' + GT_LOC_FORMAT='{gt_folder}/{seq}.txt', + # If False, data is in GT_FOLDER/BENCHMARK-SPLIT_TO_EVAL/ and in + # TRACKERS_FOLDER/BENCHMARK-SPLIT_TO_EVAL/tracker/ + # If True, the middle 'benchmark-split' folder is skipped for both. 
+ SKIP_SPLIT_FOL=True, + ) + + return dataset_config diff --git a/mmdetection/mmdet/evaluation/metrics/openimages_metric.py b/mmdetection/mmdet/evaluation/metrics/openimages_metric.py new file mode 100644 index 0000000..d75c59e --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/openimages_metric.py @@ -0,0 +1,237 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from collections import OrderedDict +from typing import List, Optional, Sequence, Union + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger, print_log + +from mmdet.registry import METRICS +from ..functional import eval_map + + +@METRICS.register_module() +class OpenImagesMetric(BaseMetric): + """OpenImages evaluation metric. + + Evaluate detection mAP for OpenImages. Please refer to + https://storage.googleapis.com/openimages/web/evaluation.html for more + details. + + Args: + iou_thrs (float or List[float]): IoU threshold. Defaults to 0.5. + ioa_thrs (float or List[float]): IoA threshold. Defaults to 0.5. + scale_ranges (List[tuple], optional): Scale ranges for evaluating + mAP. If not specified, all bounding boxes would be included in + evaluation. Defaults to None + use_group_of (bool): Whether consider group of groud truth bboxes + during evaluating. Defaults to True. + get_supercategory (bool): Whether to get parent class of the + current class. Default: True. + filter_labels (bool): Whether filter unannotated classes. + Default: True. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + default_prefix: Optional[str] = 'openimages' + + def __init__(self, + iou_thrs: Union[float, List[float]] = 0.5, + ioa_thrs: Union[float, List[float]] = 0.5, + scale_ranges: Optional[List[tuple]] = None, + use_group_of: bool = True, + get_supercategory: bool = True, + filter_labels: bool = True, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + self.iou_thrs = [iou_thrs] if isinstance(iou_thrs, float) else iou_thrs + self.ioa_thrs = [ioa_thrs] if (isinstance(ioa_thrs, float) + or ioa_thrs is None) else ioa_thrs + assert isinstance(self.iou_thrs, list) and isinstance( + self.ioa_thrs, list) + assert len(self.iou_thrs) == len(self.ioa_thrs) + + self.scale_ranges = scale_ranges + self.use_group_of = use_group_of + self.get_supercategory = get_supercategory + self.filter_labels = filter_labels + + def _get_supercategory_ann(self, instances: List[dict]) -> List[dict]: + """Get parent classes's annotation of the corresponding class. + + Args: + instances (List[dict]): A list of annotations of the instances. + + Returns: + List[dict]: Annotations extended with super-category. 
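+
+ As a purely hypothetical example, if ``RELATION_MATRIX`` marks
+ the label ``'Jaguar'`` as a child of ``'Carnivore'`` and
+ ``'Animal'``, an instance annotated as ``'Jaguar'`` yields two
+ extra copies of that instance labelled with the two parent
+ classes.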
+ """ + supercat_instances = [] + relation_matrix = self.dataset_meta['RELATION_MATRIX'] + for instance in instances: + labels = np.where(relation_matrix[instance['bbox_label']])[0] + for label in labels: + if label == instance['bbox_label']: + continue + new_instance = copy.deepcopy(instance) + new_instance['bbox_label'] = label + supercat_instances.append(new_instance) + return supercat_instances + + def _process_predictions(self, pred_bboxes: np.ndarray, + pred_scores: np.ndarray, pred_labels: np.ndarray, + gt_instances: list, + image_level_labels: np.ndarray) -> tuple: + """Process results of the corresponding class of the detection bboxes. + + Note: It will choose to do the following two processing according to + the parameters: + + 1. Whether to add parent classes of the corresponding class of the + detection bboxes. + + 2. Whether to ignore the classes that unannotated on that image. + + Args: + pred_bboxes (np.ndarray): bboxes predicted by the model + pred_scores (np.ndarray): scores predicted by the model + pred_labels (np.ndarray): labels predicted by the model + gt_instances (list): ground truth annotations + image_level_labels (np.ndarray): human-verified image level labels + + Returns: + tuple: Processed bboxes, scores, and labels. + """ + processed_bboxes = copy.deepcopy(pred_bboxes) + processed_scores = copy.deepcopy(pred_scores) + processed_labels = copy.deepcopy(pred_labels) + gt_labels = np.array([ins['bbox_label'] for ins in gt_instances], + dtype=np.int64) + if image_level_labels is not None: + allowed_classes = np.unique( + np.append(gt_labels, image_level_labels)) + else: + allowed_classes = np.unique(gt_labels) + relation_matrix = self.dataset_meta['RELATION_MATRIX'] + pred_classes = np.unique(pred_labels) + for pred_class in pred_classes: + classes = np.where(relation_matrix[pred_class])[0] + for cls in classes: + if (cls in allowed_classes and cls != pred_class + and self.get_supercategory): + # add super-supercategory preds + index = np.where(pred_labels == pred_class)[0] + processed_scores = np.concatenate( + [processed_scores, pred_scores[index]]) + processed_bboxes = np.concatenate( + [processed_bboxes, pred_bboxes[index]]) + extend_labels = np.full(index.shape, cls, dtype=np.int64) + processed_labels = np.concatenate( + [processed_labels, extend_labels]) + elif cls not in allowed_classes and self.filter_labels: + # remove unannotated preds + index = np.where(processed_labels != cls)[0] + processed_scores = processed_scores[index] + processed_bboxes = processed_bboxes[index] + processed_labels = processed_labels[index] + return processed_bboxes, processed_scores, processed_labels + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. 
+ """ + for data_sample in data_samples: + gt = copy.deepcopy(data_sample) + # add super-category instances + # TODO: Need to refactor to support LoadAnnotations + instances = gt['instances'] + if self.get_supercategory: + supercat_instances = self._get_supercategory_ann(instances) + instances.extend(supercat_instances) + gt_labels = [] + gt_bboxes = [] + is_group_ofs = [] + for ins in instances: + gt_labels.append(ins['bbox_label']) + gt_bboxes.append(ins['bbox']) + is_group_ofs.append(ins['is_group_of']) + ann = dict( + labels=np.array(gt_labels, dtype=np.int64), + bboxes=np.array(gt_bboxes, dtype=np.float32).reshape((-1, 4)), + gt_is_group_ofs=np.array(is_group_ofs, dtype=bool)) + + image_level_labels = gt.get('image_level_labels', None) + pred = data_sample['pred_instances'] + pred_bboxes = pred['bboxes'].cpu().numpy() + pred_scores = pred['scores'].cpu().numpy() + pred_labels = pred['labels'].cpu().numpy() + + pred_bboxes, pred_scores, pred_labels = self._process_predictions( + pred_bboxes, pred_scores, pred_labels, instances, + image_level_labels) + + dets = [] + for label in range(len(self.dataset_meta['classes'])): + index = np.where(pred_labels == label)[0] + pred_bbox_scores = np.hstack( + [pred_bboxes[index], pred_scores[index].reshape((-1, 1))]) + dets.append(pred_bbox_scores) + self.results.append((ann, dets)) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + logger = MMLogger.get_current_instance() + gts, preds = zip(*results) + eval_results = OrderedDict() + # get dataset type + dataset_type = self.dataset_meta.get('dataset_type') + if dataset_type not in ['oid_challenge', 'oid_v6']: + dataset_type = 'oid_v6' + print_log( + 'Cannot infer dataset type from the length of the' + ' classes. Set `oid_v6` as dataset type.', + logger='current') + mean_aps = [] + for i, (iou_thr, + ioa_thr) in enumerate(zip(self.iou_thrs, self.ioa_thrs)): + if self.use_group_of: + assert ioa_thr is not None, 'ioa_thr must have value when' \ + ' using group_of in evaluation.' + print_log(f'\n{"-" * 15}iou_thr, ioa_thr: {iou_thr}, {ioa_thr}' + f'{"-" * 15}') + mean_ap, _ = eval_map( + preds, + gts, + scale_ranges=self.scale_ranges, + iou_thr=iou_thr, + ioa_thr=ioa_thr, + dataset=dataset_type, + logger=logger, + use_group_of=self.use_group_of) + + mean_aps.append(mean_ap) + eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3) + eval_results['mAP'] = sum(mean_aps) / len(mean_aps) + return eval_results diff --git a/mmdetection/mmdet/evaluation/metrics/refseg_metric.py b/mmdetection/mmdet/evaluation/metrics/refseg_metric.py new file mode 100644 index 0000000..0faee07 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/refseg_metric.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Sequence + +import torch +from mmengine.evaluator import BaseMetric + +from mmdet.registry import METRICS + + +@METRICS.register_module() +class RefSegMetric(BaseMetric): + """Referring Expression Segmentation Metric.""" + + def __init__(self, metric: Sequence = ('cIoU', 'mIoU'), **kwargs): + super().__init__(**kwargs) + assert set(metric).issubset(['cIoU', 'mIoU']), \ + f'Only support cIoU and mIoU, but got {metric}' + assert len(metric) > 0, 'metrics should not be empty' + self.metrics = metric + + def compute_iou(self, pred_seg: torch.Tensor, + gt_seg: torch.Tensor) -> tuple: + overlap = pred_seg & gt_seg + union = pred_seg | gt_seg + return overlap, union + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data and data_samples. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + for data_sample in data_samples: + pred_label = data_sample['pred_instances']['masks'].bool() + label = data_sample['gt_masks'].to_tensor( + pred_label.dtype, pred_label.device).bool() + # calculate iou + overlap, union = self.compute_iou(pred_label, label) + + bs = len(pred_label) + iou = overlap.reshape(bs, -1).sum(-1) * 1.0 / union.reshape( + bs, -1).sum(-1) + iou = torch.nan_to_num_(iou, nan=0.0) + self.results.append((overlap.sum(), union.sum(), iou.sum(), bs)) + + def compute_metrics(self, results: list) -> dict: + results = tuple(zip(*results)) + assert len(results) == 4 + cum_i = sum(results[0]) + cum_u = sum(results[1]) + iou = sum(results[2]) + seg_total = sum(results[3]) + + metrics = {} + if 'cIoU' in self.metrics: + metrics['cIoU'] = cum_i * 100 / cum_u + if 'mIoU' in self.metrics: + metrics['mIoU'] = iou * 100 / seg_total + return metrics diff --git a/mmdetection/mmdet/evaluation/metrics/reid_metric.py b/mmdetection/mmdet/evaluation/metrics/reid_metric.py new file mode 100644 index 0000000..d74df14 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/reid_metric.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from mmengine.evaluator import BaseMetric + +from mmdet.registry import METRICS + + +@METRICS.register_module() +class ReIDMetrics(BaseMetric): + """mAP and CMC evaluation metrics for the ReID task. + + Args: + metric (str | list[str]): Metrics to be evaluated. + Default value is `mAP`. + metric_options: (dict, optional): Options for calculating metrics. + Allowed keys are 'rank_list' and 'max_rank'. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. 
Default: None + """ + allowed_metrics = ['mAP', 'CMC'] + default_prefix: Optional[str] = 'reid-metric' + + def __init__(self, + metric: Union[str, Sequence[str]] = 'mAP', + metric_options: Optional[dict] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + if isinstance(metric, list): + metrics = metric + elif isinstance(metric, str): + metrics = [metric] + else: + raise TypeError('metric must be a list or a str.') + for metric in metrics: + if metric not in self.allowed_metrics: + raise KeyError(f'metric {metric} is not supported.') + self.metrics = metrics + + self.metric_options = metric_options or dict( + rank_list=[1, 5, 10, 20], max_rank=20) + for rank in self.metric_options['rank_list']: + assert 1 <= rank <= self.metric_options['max_rank'] + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + pred_feature = data_sample['pred_feature'] + assert isinstance(pred_feature, torch.Tensor) + gt_label = data_sample.get('gt_label', data_sample['gt_label']) + assert isinstance(gt_label['label'], torch.Tensor) + result = dict( + pred_feature=pred_feature.data.cpu(), + gt_label=gt_label['label'].cpu()) + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + # NOTICE: don't access `self.results` from the method. + metrics = {} + + pids = torch.cat([result['gt_label'] for result in results]).numpy() + features = torch.stack([result['pred_feature'] for result in results]) + + n, c = features.size() + mat = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n) + distmat = mat + mat.t() + distmat.addmm_(features, features.t(), beta=1, alpha=-2) + distmat = distmat.numpy() + + indices = np.argsort(distmat, axis=1) + matches = (pids[indices] == pids[:, np.newaxis]).astype(np.int32) + + all_cmc = [] + all_AP = [] + num_valid_q = 0. + for q_idx in range(n): + # remove self + raw_cmc = matches[q_idx][1:] + if not np.any(raw_cmc): + # this condition is true when query identity + # does not appear in gallery + continue + + cmc = raw_cmc.cumsum() + cmc[cmc > 1] = 1 + + all_cmc.append(cmc[:self.metric_options['max_rank']]) + num_valid_q += 1. + + # compute average precision + num_rel = raw_cmc.sum() + tmp_cmc = raw_cmc.cumsum() + tmp_cmc = [x / (i + 1.) 
for i, x in enumerate(tmp_cmc)] + tmp_cmc = np.asarray(tmp_cmc) * raw_cmc + AP = tmp_cmc.sum() / num_rel + all_AP.append(AP) + + assert num_valid_q > 0, \ + 'Error: all query identities do not appear in gallery' + + all_cmc = np.asarray(all_cmc) + all_cmc = all_cmc.sum(0) / num_valid_q + mAP = np.mean(all_AP) + + if 'mAP' in self.metrics: + metrics['mAP'] = np.around(mAP, decimals=3) + if 'CMC' in self.metrics: + for rank in self.metric_options['rank_list']: + metrics[f'R{rank}'] = np.around(all_cmc[rank - 1], decimals=3) + + return metrics diff --git a/mmdetection/mmdet/evaluation/metrics/semseg_metric.py b/mmdetection/mmdet/evaluation/metrics/semseg_metric.py new file mode 100644 index 0000000..3215f67 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/semseg_metric.py @@ -0,0 +1,279 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections import OrderedDict +from typing import Dict, Optional, Sequence, Union + +import numpy as np +import torch +from mmcv import imwrite +from mmengine.dist import is_main_process +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger, print_log +from mmengine.utils import mkdir_or_exist +from PIL import Image + +try: + from prettytable import PrettyTable +except ImportError: + PrettyTable = None + +from mmdet.registry import METRICS + + +@METRICS.register_module() +class SemSegMetric(BaseMetric): + """mIoU evaluation metric. + + Args: + iou_metrics (list[str] | str): Metrics to be calculated, the options + includes 'mIoU', 'mDice' and 'mFscore'. + beta (int): Determines the weight of recall in the combined score. + Default: 1. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + output_dir (str): The directory for output prediction. Defaults to + None. + format_only (bool): Only format result for results commit without + perform evaluation. It is useful when you want to save the result + to a specific format and submit it to the test server. + Defaults to False. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + + def __init__(self, + iou_metrics: Sequence[str] = ['mIoU'], + beta: int = 1, + collect_device: str = 'cpu', + output_dir: Optional[str] = None, + format_only: bool = False, + backend_args: dict = None, + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + + if isinstance(iou_metrics, str): + iou_metrics = [iou_metrics] + if not set(iou_metrics).issubset(set(['mIoU', 'mDice', 'mFscore'])): + raise KeyError(f'metrics {iou_metrics} is not supported. ' + f'Only supports mIoU/mDice/mFscore.') + self.metrics = iou_metrics + self.beta = beta + self.output_dir = output_dir + if self.output_dir and is_main_process(): + mkdir_or_exist(self.output_dir) + self.format_only = format_only + self.backend_args = backend_args + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data and data_samples. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. 
+ + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + num_classes = len(self.dataset_meta['classes']) + for data_sample in data_samples: + pred_label = data_sample['pred_sem_seg']['sem_seg'].squeeze() + # format_only always for test dataset without ground truth + if not self.format_only: + label = data_sample['gt_sem_seg']['sem_seg'].squeeze().to( + pred_label) + ignore_index = data_sample['pred_sem_seg'].get( + 'ignore_index', 255) + self.results.append( + self._compute_pred_stats(pred_label, label, num_classes, + ignore_index)) + + # format_result + if self.output_dir is not None: + basename = osp.splitext(osp.basename( + data_sample['img_path']))[0] + png_filename = osp.abspath( + osp.join(self.output_dir, f'{basename}.png')) + output_mask = pred_label.cpu().numpy() + output = Image.fromarray(output_mask.astype(np.uint8)) + imwrite(output, png_filename, backend_args=self.backend_args) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. The key + mainly includes aAcc, mIoU, mAcc, mDice, mFscore, mPrecision, + mRecall. + """ + logger: MMLogger = MMLogger.get_current_instance() + if self.format_only: + logger.info(f'results are saved to {osp.dirname(self.output_dir)}') + return OrderedDict() + + ret_metrics = self.get_return_metrics(results) + + # summary table + ret_metrics_summary = OrderedDict({ + ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2) + for ret_metric, ret_metric_value in ret_metrics.items() + }) + metrics = dict() + for key, val in ret_metrics_summary.items(): + if key == 'aAcc': + metrics[key] = val + else: + metrics['m' + key] = val + + print_semantic_table(ret_metrics, self.dataset_meta['classes'], logger) + + return metrics + + def _compute_pred_stats(self, pred_label: torch.tensor, + label: torch.tensor, num_classes: int, + ignore_index: int): + """Parse semantic segmentation predictions. + + Args: + pred_label (torch.tensor): Prediction segmentation map + or predict result filename. The shape is (H, W). + label (torch.tensor): Ground truth segmentation map + or label filename. The shape is (H, W). + num_classes (int): Number of categories. + + Returns: + torch.Tensor: The intersection of prediction and ground truth + histogram on all classes. + torch.Tensor: The union of prediction and ground truth histogram on + all classes. + torch.Tensor: The prediction histogram on all classes. + torch.Tensor: The ground truth histogram on all classes. + """ + assert pred_label.shape == label.shape + mask = label != ignore_index + label, pred_label = label[mask], pred_label[mask] + + intersect = pred_label[pred_label == label] + area_intersect = torch.histc( + intersect.float(), bins=num_classes, min=0, max=num_classes - 1) + area_pred_label = torch.histc( + pred_label.float(), bins=num_classes, min=0, max=num_classes - 1) + area_label = torch.histc( + label.float(), bins=num_classes, min=0, max=num_classes - 1) + area_union = area_pred_label + area_label - area_intersect + result = dict( + area_intersect=area_intersect, + area_union=area_union, + area_pred_label=area_pred_label, + area_label=area_label) + return result + + def get_return_metrics(self, results: list) -> dict: + """Calculate evaluation metrics. 
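+
+        The per-class areas aggregated here come from ``_compute_pred_stats``,
+        which bins labels with ``torch.histc``. A tiny illustrative check with
+        two classes (made-up tensors)::
+
+            >>> import torch
+            >>> pred = torch.tensor([0, 0, 1, 1])
+            >>> gt = torch.tensor([0, 1, 1, 1])
+            >>> inter = pred[pred == gt]
+            >>> torch.histc(inter.float(), bins=2, min=0, max=1)
+            tensor([1., 2.])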
+ + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, np.ndarray]: per category evaluation metrics, + shape (num_classes, ). + """ + + def f_score(precision, recall, beta=1): + """calculate the f-score value. + + Args: + precision (float | torch.Tensor): The precision value. + recall (float | torch.Tensor): The recall value. + beta (int): Determines the weight of recall in the combined + score. Default: 1. + + Returns: + [torch.tensor]: The f-score value. + """ + score = (1 + beta**2) * (precision * recall) / ( + (beta**2 * precision) + recall) + return score + + total_area_intersect = sum([r['area_intersect'] for r in results]) + total_area_union = sum([r['area_union'] for r in results]) + total_area_pred_label = sum([r['area_pred_label'] for r in results]) + total_area_label = sum([r['area_label'] for r in results]) + + all_acc = total_area_intersect / total_area_label + ret_metrics = OrderedDict({'aAcc': all_acc}) + for metric in self.metrics: + if metric == 'mIoU': + iou = total_area_intersect / total_area_union + acc = total_area_intersect / total_area_label + ret_metrics['IoU'] = iou + ret_metrics['Acc'] = acc + elif metric == 'mDice': + dice = 2 * total_area_intersect / ( + total_area_pred_label + total_area_label) + acc = total_area_intersect / total_area_label + ret_metrics['Dice'] = dice + ret_metrics['Acc'] = acc + elif metric == 'mFscore': + precision = total_area_intersect / total_area_pred_label + recall = total_area_intersect / total_area_label + f_value = torch.tensor([ + f_score(x[0], x[1], self.beta) + for x in zip(precision, recall) + ]) + ret_metrics['Fscore'] = f_value + ret_metrics['Precision'] = precision + ret_metrics['Recall'] = recall + + ret_metrics = { + metric: value.cpu().numpy() + for metric, value in ret_metrics.items() + } + + return ret_metrics + + +def print_semantic_table( + results: dict, + class_names: list, + logger: Optional[Union['MMLogger', str]] = None) -> None: + """Print semantic segmentation evaluation results table. + + Args: + results (dict): The evaluation results. + class_names (list): Class names. + logger (MMLogger | str, optional): Logger used for printing. + Default: None. + """ + # each class table + results.pop('aAcc', None) + ret_metrics_class = OrderedDict({ + ret_metric: np.round(ret_metric_value * 100, 2) + for ret_metric, ret_metric_value in results.items() + }) + + print_log('per class results:', logger) + if PrettyTable: + class_table_data = PrettyTable() + ret_metrics_class.update({'Class': class_names}) + ret_metrics_class.move_to_end('Class', last=False) + for key, val in ret_metrics_class.items(): + class_table_data.add_column(key, val) + print_log('\n' + class_table_data.get_string(), logger=logger) + else: + logger.warning( + '`prettytable` is not installed, for better table format, ' + 'please consider installing it with "pip install prettytable"') + print_result = {} + for class_name, iou, acc in zip(class_names, ret_metrics_class['IoU'], + ret_metrics_class['Acc']): + print_result[class_name] = {'IoU': iou, 'Acc': acc} + print_log(print_result, logger) diff --git a/mmdetection/mmdet/evaluation/metrics/voc_metric.py b/mmdetection/mmdet/evaluation/metrics/voc_metric.py new file mode 100644 index 0000000..32d8c07 --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/voc_metric.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
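+# Typical usage is through an evaluator entry in a config file, e.g. (an
+# illustrative sketch, not taken from any config in this repository):
+#
+#     val_evaluator = dict(type='VOCMetric', metric='mAP', eval_mode='11points')
+#
+# where '11points' matches the VOC2007 protocol and 'area' the VOC2012 one.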
+import copy +import warnings +from collections import OrderedDict +from typing import List, Optional, Sequence, Union + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS +from ..functional import eval_map, eval_recalls + + +@METRICS.register_module() +class VOCMetric(BaseMetric): + """Pascal VOC evaluation metric. + + Args: + iou_thrs (float or List[float]): IoU threshold. Defaults to 0.5. + scale_ranges (List[tuple], optional): Scale ranges for evaluating + mAP. If not specified, all bounding boxes would be included in + evaluation. Defaults to None. + metric (str | list[str]): Metrics to be evaluated. Options are + 'mAP', 'recall'. If is list, the first setting in the list will + be used to evaluate metric. + proposal_nums (Sequence[int]): Proposal number used for evaluating + recalls, such as recall@100, recall@1000. + Default: (100, 300, 1000). + eval_mode (str): 'area' or '11points', 'area' means calculating the + area under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1]. + The PASCAL VOC2007 defaults to use '11points', while PASCAL + VOC2012 defaults to use 'area'. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + + default_prefix: Optional[str] = 'pascal_voc' + + def __init__(self, + iou_thrs: Union[float, List[float]] = 0.5, + scale_ranges: Optional[List[tuple]] = None, + metric: Union[str, List[str]] = 'mAP', + proposal_nums: Sequence[int] = (100, 300, 1000), + eval_mode: str = '11points', + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + self.iou_thrs = [iou_thrs] if isinstance(iou_thrs, float) \ + else iou_thrs + self.scale_ranges = scale_ranges + # voc evaluation metrics + if not isinstance(metric, str): + assert len(metric) == 1 + metric = metric[0] + allowed_metrics = ['recall', 'mAP'] + if metric not in allowed_metrics: + raise KeyError( + f"metric should be one of 'recall', 'mAP', but got {metric}.") + self.metric = metric + self.proposal_nums = proposal_nums + assert eval_mode in ['area', '11points'], \ + 'Unrecognized mode, only "area" and "11points" are supported' + self.eval_mode = eval_mode + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. 
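+
+        After this step ``self.results`` holds one ``(ann, dets)`` pair per
+        image, where ``dets[label]`` is an ``(N, 5)`` array whose rows are
+        ``[x1, y1, x2, y2, score]`` for that class.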
+ """ + for data_sample in data_samples: + gt = copy.deepcopy(data_sample) + # TODO: Need to refactor to support LoadAnnotations + gt_instances = gt['gt_instances'] + gt_ignore_instances = gt['ignored_instances'] + ann = dict( + labels=gt_instances['labels'].cpu().numpy(), + bboxes=gt_instances['bboxes'].cpu().numpy(), + bboxes_ignore=gt_ignore_instances['bboxes'].cpu().numpy(), + labels_ignore=gt_ignore_instances['labels'].cpu().numpy()) + + pred = data_sample['pred_instances'] + pred_bboxes = pred['bboxes'].cpu().numpy() + pred_scores = pred['scores'].cpu().numpy() + pred_labels = pred['labels'].cpu().numpy() + + dets = [] + for label in range(len(self.dataset_meta['classes'])): + index = np.where(pred_labels == label)[0] + pred_bbox_scores = np.hstack( + [pred_bboxes[index], pred_scores[index].reshape((-1, 1))]) + dets.append(pred_bbox_scores) + + self.results.append((ann, dets)) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + gts, preds = zip(*results) + eval_results = OrderedDict() + if self.metric == 'mAP': + assert isinstance(self.iou_thrs, list) + dataset_type = self.dataset_meta.get('dataset_type') + if dataset_type in ['VOC2007', 'VOC2012']: + dataset_name = 'voc' + if dataset_type == 'VOC2007' and self.eval_mode != '11points': + warnings.warn('Pascal VOC2007 uses `11points` as default ' + 'evaluate mode, but you are using ' + f'{self.eval_mode}.') + elif dataset_type == 'VOC2012' and self.eval_mode != 'area': + warnings.warn('Pascal VOC2012 uses `area` as default ' + 'evaluate mode, but you are using ' + f'{self.eval_mode}.') + else: + dataset_name = self.dataset_meta['classes'] + + mean_aps = [] + for iou_thr in self.iou_thrs: + logger.info(f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}') + # Follow the official implementation, + # http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar + # we should use the legacy coordinate system in mmdet 1.x, + # which means w, h should be computed as 'x2 - x1 + 1` and + # `y2 - y1 + 1` + mean_ap, _ = eval_map( + preds, + gts, + scale_ranges=self.scale_ranges, + iou_thr=iou_thr, + dataset=dataset_name, + logger=logger, + eval_mode=self.eval_mode, + use_legacy_coordinate=True) + mean_aps.append(mean_ap) + eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3) + eval_results['mAP'] = sum(mean_aps) / len(mean_aps) + eval_results.move_to_end('mAP', last=False) + elif self.metric == 'recall': + gt_bboxes = [gt['bboxes'] for gt in gts] + pr_bboxes = [pred[0] for pred in preds] + recalls = eval_recalls( + gt_bboxes, + pr_bboxes, + self.proposal_nums, + self.iou_thrs, + logger=logger, + use_legacy_coordinate=True) + for i, num in enumerate(self.proposal_nums): + for j, iou_thr in enumerate(self.iou_thrs): + eval_results[f'recall@{num}@{iou_thr}'] = recalls[i, j] + if recalls.shape[1] > 1: + ar = recalls.mean(axis=1) + for i, num in enumerate(self.proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + return eval_results diff --git a/mmdetection/mmdet/evaluation/metrics/youtube_vis_metric.py b/mmdetection/mmdet/evaluation/metrics/youtube_vis_metric.py new file mode 100644 index 0000000..5abc77a --- /dev/null +++ b/mmdetection/mmdet/evaluation/metrics/youtube_vis_metric.py @@ -0,0 +1,426 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import os.path as osp +import tempfile +import warnings +import zipfile +from collections import OrderedDict, defaultdict +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import mmengine +import numpy as np +from mmengine.dist import (all_gather_object, barrier, broadcast_object_list, + is_main_process) +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS +from mmdet.structures.mask import encode_mask_results +from ..functional import YTVIS, YTVISeval +from .base_video_metric import BaseVideoMetric, collect_tracking_results + + +@METRICS.register_module() +class YouTubeVISMetric(BaseVideoMetric): + """mAP evaluation metrics for the VIS task. + + Args: + metric (str | list[str]): Metrics to be evaluated. + Default value is `youtube_vis_ap`. + metric_items (List[str], optional): Metric result names to be + recorded in the evaluation result. Defaults to None. + outfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonyms metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Default: None + format_only (bool): If True, only formatting the results to the + official format and not performing evaluation. Defaults to False. + """ + + default_prefix: Optional[str] = 'youtube_vis' + + def __init__(self, + metric: Union[str, List[str]] = 'youtube_vis_ap', + metric_items: Optional[Sequence[str]] = None, + outfile_prefix: Optional[str] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + format_only: bool = False) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + # vis evaluation metrics + self.metrics = metric if isinstance(metric, list) else [metric] + self.format_only = format_only + allowed_metrics = ['youtube_vis_ap'] + for metric in self.metrics: + if metric not in allowed_metrics: + raise KeyError( + f"metric should be 'youtube_vis_ap', but got {metric}.") + + self.metric_items = metric_items + self.outfile_prefix = outfile_prefix + self.per_video_res = [] + self.categories = [] + self._vis_meta_info = defaultdict(list) # record video and image infos + + def process_video(self, data_samples): + + video_length = len(data_samples) + for frame_id in range(video_length): + result = dict() + img_data_sample = data_samples[frame_id].to_dict() + pred = img_data_sample['pred_track_instances'] + video_id = img_data_sample['video_id'] + + result['img_id'] = img_data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + result['instances_id'] = pred['instances_id'].cpu().numpy() + # encode mask to RLE + assert 'masks' in pred, \ + 'masks must exist in YouTube-VIS metric' + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) + + # parse gt + gt = dict() + gt['width'] = img_data_sample['ori_shape'][1] + gt['height'] = img_data_sample['ori_shape'][0] + gt['img_id'] = img_data_sample['img_id'] + gt['frame_id'] = frame_id + gt['video_id'] = video_id + gt['video_length'] = video_length + + if 'instances' 
in img_data_sample: + gt['anns'] = img_data_sample['instances'] + else: + gt['anns'] = dict() + self.per_video_res.append((result, gt)) + + preds, gts = zip(*self.per_video_res) + # format the results + # we must format gts first to update self._vis_meta_info + gt_results = self._format_one_video_gts(gts) + pred_results = self._format_one_video_preds(preds) + self.per_video_res.clear() + # add converted result to the results list + self.results.append((pred_results, gt_results)) + + def compute_metrics(self, results: List) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (List): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + # split gt and prediction list + tmp_pred_results, tmp_gt_results = zip(*results) + gt_results = self.format_gts(tmp_gt_results) + pred_results = self.format_preds(tmp_pred_results) + + if self.format_only: + self.save_pred_results(pred_results) + return dict() + + ytvis = YTVIS(gt_results) + + ytvis_dets = ytvis.loadRes(pred_results) + vid_ids = ytvis.getVidIds() + + iou_type = metric = 'segm' + eval_results = OrderedDict() + ytvisEval = YTVISeval(ytvis, ytvis_dets, iou_type) + ytvisEval.params.vidIds = vid_ids + ytvisEval.evaluate() + ytvisEval.accumulate() + ytvisEval.summarize() + + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@1': 6, + 'AR@10': 7, + 'AR@100': 8, + 'AR_s@100': 9, + 'AR_m@100': 10, + 'AR_l@100': 11 + } + metric_items = self.metric_items + if metric_items is not None: + for metric_item in metric_items: + if metric_item not in coco_metric_names: + raise KeyError( + f'metric item "{metric_item}" is not supported') + + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = float( + f'{ytvisEval.stats[coco_metric_names[metric_item]]:.3f}') + eval_results[key] = val + + return eval_results + + def format_gts(self, gts: Tuple[List]) -> dict: + """Gather all ground-truth from self.results.""" + self.categories = [ + dict(id=id + 1, name=name) + for id, name in enumerate(self.dataset_meta['classes']) + ] + gt_results = dict( + categories=self.categories, + videos=self._vis_meta_info['videos'], + annotations=[]) + for gt_result in gts: + gt_results['annotations'].extend(gt_result) + return gt_results + + def format_preds(self, preds: Tuple[List]) -> List: + """Gather all predictions from self.results.""" + pred_results = [] + for pred_result in preds: + pred_results.extend(pred_result) + return pred_results + + def _format_one_video_preds(self, pred_dicts: Tuple[dict]) -> List: + """Convert the annotation to the format of YouTube-VIS. + + This operation is to make it easier to use the official eval API. + + Args: + pred_dicts (Tuple[dict]): Prediction of the dataset. + + Returns: + List: The formatted predictions. + """ + # Collate preds scatters (tuple of dict to dict of list) + preds = defaultdict(list) + for pred in pred_dicts: + for key in pred.keys(): + preds[key].append(pred[key]) + + img_infos = self._vis_meta_info['images'] + vid_infos = self._vis_meta_info['videos'] + inds = [i for i, _ in enumerate(img_infos) if _['frame_id'] == 0] + inds.append(len(img_infos)) + json_results = [] + video_id = vid_infos[-1]['id'] + # collect data for each instances in a video. 
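+        # Each entry of ``collect_data`` aggregates one tracked instance over
+        # the whole video: its per-frame scores (averaged below), per-frame
+        # labels (reduced by majority vote) and per-frame segmentations keyed
+        # by frame id.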
+ collect_data = dict() + for frame_id, (masks, scores, labels, ids) in enumerate( + zip(preds['masks'], preds['scores'], preds['labels'], + preds['instances_id'])): + + assert len(masks) == len(labels) + for j, id in enumerate(ids): + if id not in collect_data: + collect_data[id] = dict( + category_ids=[], scores=[], segmentations=dict()) + collect_data[id]['category_ids'].append(labels[j]) + collect_data[id]['scores'].append(scores[j]) + if isinstance(masks[j]['counts'], bytes): + masks[j]['counts'] = masks[j]['counts'].decode() + collect_data[id]['segmentations'][frame_id] = masks[j] + + # transform the collected data into official format + for id, id_data in collect_data.items(): + output = dict() + output['video_id'] = video_id + output['score'] = np.array(id_data['scores']).mean().item() + # majority voting for sequence category + output['category_id'] = np.bincount( + np.array(id_data['category_ids'])).argmax().item() + 1 + output['segmentations'] = [] + for frame_id in range(inds[-1] - inds[-2]): + if frame_id in id_data['segmentations']: + output['segmentations'].append( + id_data['segmentations'][frame_id]) + else: + output['segmentations'].append(None) + json_results.append(output) + + return json_results + + def _format_one_video_gts(self, gt_dicts: Tuple[dict]) -> List: + """Convert the annotation to the format of YouTube-VIS. + + This operation is to make it easier to use the official eval API. + + Args: + gt_dicts (Tuple[dict]): Ground truth of the dataset. + + Returns: + list: The formatted gts. + """ + video_infos = [] + image_infos = [] + instance_infos = defaultdict(list) + len_videos = dict() # mapping from instance_id to video_length + vis_anns = [] + + # get video infos + for gt_dict in gt_dicts: + frame_id = gt_dict['frame_id'] + video_id = gt_dict['video_id'] + img_id = gt_dict['img_id'] + image_info = dict( + id=img_id, + width=gt_dict['width'], + height=gt_dict['height'], + frame_id=frame_id, + file_name='') + image_infos.append(image_info) + if frame_id == 0: + video_info = dict( + id=video_id, + width=gt_dict['width'], + height=gt_dict['height'], + file_name='') + video_infos.append(video_info) + + for ann in gt_dict['anns']: + label = ann['bbox_label'] + bbox = ann['bbox'] + instance_id = ann['instance_id'] + # update video length + len_videos[instance_id] = gt_dict['video_length'] + coco_bbox = [ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ] + + annotation = dict( + video_id=video_id, + frame_id=frame_id, + bbox=coco_bbox, + instance_id=instance_id, + iscrowd=ann.get('ignore_flag', 0), + category_id=int(label) + 1, + area=coco_bbox[2] * coco_bbox[3]) + if ann.get('mask', None): + mask = ann['mask'] + # area = mask_util.area(mask) + if isinstance(mask, dict) and isinstance( + mask['counts'], bytes): + mask['counts'] = mask['counts'].decode() + annotation['segmentation'] = mask + + instance_infos[instance_id].append(annotation) + + # update vis meta info + self._vis_meta_info['images'].extend(image_infos) + self._vis_meta_info['videos'].extend(video_infos) + + for instance_id, ann_infos in instance_infos.items(): + cur_video_len = len_videos[instance_id] + segm = [None] * cur_video_len + bbox = [None] * cur_video_len + area = [None] * cur_video_len + # In the official format, no instances are represented by + # 'None', however, only images with instances are recorded + # in the current annotations, so we need to use 'None' to + # initialize these lists. 
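+            # The loop below scatters each collected annotation back into its
+            # frame slot, so every instance ends up with per-frame lists of
+            # segmentations, bboxes and areas indexed by frame id, which is
+            # the layout the official evaluation API expects.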
+ for ann_info in ann_infos: + frame_id = ann_info['frame_id'] + segm[frame_id] = ann_info['segmentation'] + bbox[frame_id] = ann_info['bbox'] + area[frame_id] = ann_info['area'] + instance = dict( + category_id=ann_infos[0]['category_id'], + segmentations=segm, + bboxes=bbox, + video_id=ann_infos[0]['video_id'], + areas=area, + id=instance_id, + iscrowd=ann_infos[0]['iscrowd']) + vis_anns.append(instance) + return vis_anns + + def save_pred_results(self, pred_results: List) -> None: + """Save the results to a zip file (standard format for YouTube-VIS + Challenge). + + Args: + pred_results (list): Testing results of the + dataset. + """ + logger: MMLogger = MMLogger.get_current_instance() + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + mmengine.dump(pred_results, f'{outfile_prefix}.json') + # zip the json file in order to submit to the test server. + zip_file_name = f'{outfile_prefix}.submission_file.zip' + zf = zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) + logger.info(f"zip the 'results.json' into '{zip_file_name}', " + 'please submmit the zip file to the test server') + zf.write(f'{outfile_prefix}.json', 'results.json') + zf.close() + + def evaluate(self, size: int) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + # wait for all processes to complete prediction. + barrier() + + if len(self.results) == 0: + warnings.warn( + f'{self.__class__.__name__} got empty `self.results`. Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.') + + results = collect_tracking_results(self.results, self.collect_device) + + # gather seq_info + gathered_seq_info = all_gather_object(self._vis_meta_info['videos']) + all_seq_info = [] + for _seq_info in gathered_seq_info: + all_seq_info.extend(_seq_info) + # update self._vis_meta_info + self._vis_meta_info = dict(videos=all_seq_info) + + if is_main_process(): + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + # reset the vis_meta_info + self._vis_meta_info.clear() + return metrics[0] diff --git a/mmdetection/mmdet/models/__init__.py b/mmdetection/mmdet/models/__init__.py new file mode 100644 index 0000000..c0a0d5e --- /dev/null +++ b/mmdetection/mmdet/models/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
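+# The wildcard imports below populate mmdet's MODELS registry: every class
+# registered with ``@MODELS.register_module()`` in these sub-packages becomes
+# usable from a config by its type name, e.g. (illustrative):
+#     backbone=dict(type='CSPNeXt', arch='P5', widen_factor=0.5)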
+from .backbones import * # noqa: F401,F403 +from .data_preprocessors import * # noqa: F401,F403 +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .language_models import * # noqa: F401,F403 +from .layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .mot import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .reid import * # noqa: F401,F403 +from .roi_heads import * # noqa: F401,F403 +from .seg_heads import * # noqa: F401,F403 +from .task_modules import * # noqa: F401,F403 +from .test_time_augs import * # noqa: F401,F403 +from .trackers import * # noqa: F401,F403 +from .tracking_heads import * # noqa: F401,F403 +from .vis import * # noqa: F401,F403 diff --git a/mmdetection/mmdet/models/backbones/__init__.py b/mmdetection/mmdet/models/backbones/__init__.py new file mode 100644 index 0000000..e16ff85 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .csp_darknet import CSPDarknet +from .cspnext import CSPNeXt +from .darknet import Darknet +from .detectors_resnet import DetectoRS_ResNet +from .detectors_resnext import DetectoRS_ResNeXt +from .efficientnet import EfficientNet +from .hourglass import HourglassNet +from .hrnet import HRNet +from .mobilenet_v2 import MobileNetV2 +from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2 +from .regnet import RegNet +from .res2net import Res2Net +from .resnest import ResNeSt +from .resnet import ResNet, ResNetV1d +from .resnext import ResNeXt +from .ssd_vgg import SSDVGG +from .swin import SwinTransformer +from .trident_resnet import TridentResNet + +__all__ = [ + 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', + 'MobileNetV2', 'Res2Net', 'HourglassNet', 'DetectoRS_ResNet', + 'DetectoRS_ResNeXt', 'Darknet', 'ResNeSt', 'TridentResNet', 'CSPDarknet', + 'SwinTransformer', 'PyramidVisionTransformer', + 'PyramidVisionTransformerV2', 'EfficientNet', 'CSPNeXt' +] diff --git a/mmdetection/mmdet/models/backbones/csp_darknet.py b/mmdetection/mmdet/models/backbones/csp_darknet.py new file mode 100644 index 0000000..a890b48 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/csp_darknet.py @@ -0,0 +1,286 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from ..layers import CSPLayer + + +class Focus(nn.Module): + """Focus width and height information into channel space. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_size (int): The kernel size of the convolution. Default: 1 + stride (int): The stride of the convolution. Default: 1 + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish'). 
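+
+    Example (illustrative)::
+
+        >>> import torch
+        >>> focus = Focus(3, 64, kernel_size=3)
+        >>> focus(torch.rand(1, 3, 416, 416)).shape
+        torch.Size([1, 64, 208, 208])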
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')): + super().__init__() + self.conv = ConvModule( + in_channels * 4, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + +class SPPBottleneck(BaseModule): + """Spatial pyramid pooling layer used in YOLOv3-SPP. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling + layers. Default: (5, 9, 13). + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=None): + super().__init__(init_cfg) + mid_channels = in_channels // 2 + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = mid_channels * (len(kernel_sizes) + 1) + self.conv2 = ConvModule( + conv2_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + x = self.conv1(x) + with torch.cuda.amp.autocast(enabled=False): + x = torch.cat( + [x] + [pooling(x) for pooling in self.poolings], dim=1) + x = self.conv2(x) + return x + + +@MODELS.register_module() +class CSPDarknet(BaseModule): + """CSP-Darknet backbone used in YOLOv5 and YOLOX. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Default: P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Default: 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Default: -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Default: False. + arch_ovewrite(list): Overwrite default arch settings. Default: None. + spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Default: (5, 9, 13). + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). 
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    Example:
+        >>> from mmdet.models import CSPDarknet
+        >>> import torch
+        >>> self = CSPDarknet(arch='P5')
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)
+    """
+    # From left to right:
+    # in_channels, out_channels, num_blocks, add_identity, use_spp
+    arch_settings = {
+        'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+               [256, 512, 9, True, False], [512, 1024, 3, False, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+               [256, 512, 9, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, False, True]]
+    }
+
+    def __init__(self,
+                 arch='P5',
+                 deepen_factor=1.0,
+                 widen_factor=1.0,
+                 out_indices=(2, 3, 4),
+                 frozen_stages=-1,
+                 use_depthwise=False,
+                 arch_ovewrite=None,
+                 spp_kernal_sizes=(5, 9, 13),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 norm_eval=False,
+                 init_cfg=dict(
+                     type='Kaiming',
+                     layer='Conv2d',
+                     a=math.sqrt(5),
+                     distribution='uniform',
+                     mode='fan_in',
+                     nonlinearity='leaky_relu')):
+        super().__init__(init_cfg)
+        arch_setting = self.arch_settings[arch]
+        if arch_ovewrite:
+            arch_setting = arch_ovewrite
+        assert set(out_indices).issubset(
+            i for i in range(len(arch_setting) + 1))
+        if frozen_stages not in range(-1, len(arch_setting) + 1):
+            raise ValueError('frozen_stages must be in range(-1, '
+                             'len(arch_setting) + 1). 
But received ' + f'{frozen_stages}') + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.use_depthwise = use_depthwise + self.norm_eval = norm_eval + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + self.stem = Focus( + 3, + int(arch_setting[0][0] * widen_factor), + kernel_size=3, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.layers = ['stem'] + + for i, (in_channels, out_channels, num_blocks, add_identity, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * widen_factor) + out_channels = int(out_channels * widen_factor) + num_blocks = max(round(num_blocks * deepen_factor), 1) + stage = [] + conv_layer = conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPBottleneck( + out_channels, + out_channels, + kernel_sizes=spp_kernal_sizes, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(csp_layer) + self.add_module(f'stage{i + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{i + 1}') + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(CSPDarknet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmdetection/mmdet/models/backbones/cspnext.py b/mmdetection/mmdet/models/backbones/cspnext.py new file mode 100644 index 0000000..269725a --- /dev/null +++ b/mmdetection/mmdet/models/backbones/cspnext.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch import Tensor +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from ..layers import CSPLayer +from .csp_darknet import SPPBottleneck + + +@MODELS.register_module() +class CSPNeXt(BaseModule): + """CSPNeXt backbone used in RTMDet. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. 
+ arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + spp_kernel_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Defaults to (5, 9, 13). + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__( + self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + arch_ovewrite: dict = None, + spp_kernel_sizes: Sequence[int] = (5, 9, 13), + channel_attention: bool = True, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + norm_eval: bool = False, + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + super().__init__(init_cfg=init_cfg) + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('frozen_stages must be in range(-1, ' + 'len(arch_setting) + 1). 
But received ' + f'{frozen_stages}') + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.use_depthwise = use_depthwise + self.norm_eval = norm_eval + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.stem = nn.Sequential( + ConvModule( + 3, + int(arch_setting[0][0] * widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + int(arch_setting[0][0] * widen_factor // 2), + int(arch_setting[0][0] * widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + int(arch_setting[0][0] * widen_factor // 2), + int(arch_setting[0][0] * widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.layers = ['stem'] + + for i, (in_channels, out_channels, num_blocks, add_identity, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * widen_factor) + out_channels = int(out_channels * widen_factor) + num_blocks = max(round(num_blocks * deepen_factor), 1) + stage = [] + conv_layer = conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPBottleneck( + out_channels, + out_channels, + kernel_sizes=spp_kernel_sizes, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=use_depthwise, + use_cspnext_block=True, + expand_ratio=expand_ratio, + channel_attention=channel_attention, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(csp_layer) + self.add_module(f'stage{i + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{i + 1}') + + def _freeze_stages(self) -> None: + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True) -> None: + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]: + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmdetection/mmdet/models/backbones/darknet.py b/mmdetection/mmdet/models/backbones/darknet.py new file mode 100644 index 0000000..1d44da1 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/darknet.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) 2019 Western Digital Corporation or its affiliates. + +import warnings + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS + + +class ResBlock(BaseModule): + """The basic residual block used in Darknet. Each ResBlock consists of two + ConvModules and the input is added to the final output. Each ConvModule is + composed of Conv, BN, and LeakyReLU. In YoloV3 paper, the first convLayer + has half of the number of the filters as much as the second convLayer. The + first convLayer has filter size of 1x1 and the second one has the filter + size of 3x3. + + Args: + in_channels (int): The input channels. Must be even. 
+ conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + in_channels, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + init_cfg=None): + super(ResBlock, self).__init__(init_cfg) + assert in_channels % 2 == 0 # ensure the in_channels is even + half_in_channels = in_channels // 2 + + # shortcut + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + self.conv1 = ConvModule(in_channels, half_in_channels, 1, **cfg) + self.conv2 = ConvModule( + half_in_channels, in_channels, 3, padding=1, **cfg) + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.conv2(out) + out = out + residual + + return out + + +@MODELS.register_module() +class Darknet(BaseModule): + """Darknet backbone. + + Args: + depth (int): Depth of Darknet. Currently only support 53. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Example: + >>> from mmdet.models import Darknet + >>> import torch + >>> self = Darknet(depth=53) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + + # Dict(depth: (layers, channels)) + arch_settings = { + 53: ((1, 2, 8, 8, 4), ((32, 64), (64, 128), (128, 256), (256, 512), + (512, 1024))) + } + + def __init__(self, + depth=53, + out_indices=(3, 4, 5), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + norm_eval=True, + pretrained=None, + init_cfg=None): + super(Darknet, self).__init__(init_cfg) + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for darknet') + + self.depth = depth + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.layers, self.channels = self.arch_settings[depth] + + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + self.conv1 = ConvModule(3, 32, 3, padding=1, **cfg) + + self.cr_blocks = ['conv1'] + for i, n_layers in enumerate(self.layers): + layer_name = f'conv_res_block{i + 1}' + in_c, out_c = self.channels[i] + self.add_module( + layer_name, + self.make_conv_res_block(in_c, out_c, n_layers, **cfg)) + self.cr_blocks.append(layer_name) + + self.norm_eval = norm_eval + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.cr_blocks): + cr_block = getattr(self, layer_name) + x = cr_block(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for i in range(self.frozen_stages): + m = getattr(self, self.cr_blocks[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(Darknet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + @staticmethod + def make_conv_res_block(in_channels, + out_channels, + res_repeat, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', + negative_slope=0.1)): + """In Darknet backbone, ConvLayer is usually followed by ResBlock. This + function will make that. The Conv layers always have 3x3 filters with + stride=2. The number of the filters in Conv layer is the same as the + out channels of the ResBlock. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + res_repeat (int): The number of ResBlocks. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). 
+ """ + + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + model = nn.Sequential() + model.add_module( + 'conv', + ConvModule( + in_channels, out_channels, 3, stride=2, padding=1, **cfg)) + for idx in range(res_repeat): + model.add_module('res{}'.format(idx), + ResBlock(out_channels, **cfg)) + return model diff --git a/mmdetection/mmdet/models/backbones/detectors_resnet.py b/mmdetection/mmdet/models/backbones/detectors_resnet.py new file mode 100644 index 0000000..f33424f --- /dev/null +++ b/mmdetection/mmdet/models/backbones/detectors_resnet.py @@ -0,0 +1,353 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.logging import MMLogger +from mmengine.model import Sequential, constant_init, kaiming_init +from mmengine.runner.checkpoint import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from .resnet import BasicBlock +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottleneck(_Bottleneck): + r"""Bottleneck for the ResNet backbone in `DetectoRS + `_. + + This bottleneck allows the users to specify whether to use + SAC (Switchable Atrous Convolution) and RFP (Recursive Feature Pyramid). + + Args: + inplanes (int): The number of input channels. + planes (int): The number of output channels before expansion. + rfp_inplanes (int, optional): The number of channels from RFP. + Default: None. If specified, an additional conv layer will be + added for ``rfp_feat``. Otherwise, the structure is the same as + base class. + sac (dict, optional): Dictionary to construct SAC. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. 
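[Editor's sketch, not part of the patch] A usage example for the Darknet-53 backbone added above, mirroring its own docstring example; it assumes mmdet and torch are importable and runs with randomly initialized weights:

import torch
from mmdet.models import Darknet

model = Darknet(depth=53, out_indices=(3, 4, 5)).eval()
with torch.no_grad():
    feats = model(torch.rand(1, 3, 416, 416))
print([tuple(f.shape) for f in feats])
# Per the docstring: (1, 256, 52, 52), (1, 512, 26, 26), (1, 1024, 13, 13)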
+ Default: None + """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + rfp_inplanes=None, + sac=None, + init_cfg=None, + **kwargs): + super(Bottleneck, self).__init__( + inplanes, planes, init_cfg=init_cfg, **kwargs) + + assert sac is None or isinstance(sac, dict) + self.sac = sac + self.with_sac = sac is not None + if self.with_sac: + self.conv2 = build_conv_layer( + self.sac, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False) + + self.rfp_inplanes = rfp_inplanes + if self.rfp_inplanes: + self.rfp_conv = build_conv_layer( + None, + self.rfp_inplanes, + planes * self.expansion, + 1, + stride=1, + bias=True) + if init_cfg is None: + self.init_cfg = dict( + type='Constant', val=0, override=dict(name='rfp_conv')) + + def rfp_forward(self, x, rfp_feat): + """The forward function that also takes the RFP features as input.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + if self.rfp_inplanes: + rfp_feat = self.rfp_conv(rfp_feat) + out = out + rfp_feat + + out = self.relu(out) + + return out + + +class ResLayer(Sequential): + """ResLayer to build ResNet style backbone for RPF in detectoRS. + + The difference between this module and base class is that we pass + ``rfp_inplanes`` to the first block. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + rfp_inplanes (int, optional): The number of channels from RFP. + Default: None. If specified, an additional conv layer will be + added for ``rfp_feat``. Otherwise, the structure is the same as + base class. 
+ """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + rfp_inplanes=None, + **kwargs): + self.block = block + assert downsample_first, f'downsample_first={downsample_first} is ' \ + 'not supported in DetectoRS' + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down and stride != 1: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + rfp_inplanes=rfp_inplanes, + **kwargs)) + inplanes = planes * block.expansion + for _ in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + super(ResLayer, self).__init__(*layers) + + +@MODELS.register_module() +class DetectoRS_ResNet(ResNet): + """ResNet backbone for DetectoRS. + + Args: + sac (dict, optional): Dictionary to construct SAC (Switchable Atrous + Convolution). Default: None. + stage_with_sac (list): Which stage to use sac. Default: (False, False, + False, False). + rfp_inplanes (int, optional): The number of channels from RFP. + Default: None. If specified, an additional conv layer will be + added for ``rfp_feat``. Otherwise, the structure is the same as + base class. + output_img (bool): If ``True``, the input image will be inserted into + the starting position of output. Default: False. 
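[Editor's sketch, not part of the patch] A sketch of how sac/stage_with_sac are typically wired for DetectoRS_ResNet; the exact SAC keys (type='SAC', use_deform) follow the usual DetectoRS configs and should be treated as an assumption here, not something this patch introduces:

import torch
from mmdet.models.backbones import DetectoRS_ResNet

backbone = DetectoRS_ResNet(
    depth=50,
    num_stages=4,
    out_indices=(0, 1, 2, 3),
    sac=dict(type='SAC', use_deform=False),    # assumed keys, see note above
    stage_with_sac=(False, True, True, True),  # SAC in stages 2-4 only
    norm_cfg=dict(type='BN', requires_grad=True))
with torch.no_grad():
    feats = backbone(torch.rand(1, 3, 224, 224))
print([tuple(f.shape) for f in feats])
# Strides 4/8/16/32: (1, 256, 56, 56), (1, 512, 28, 28), (1, 1024, 14, 14), (1, 2048, 7, 7)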
+ """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + sac=None, + stage_with_sac=(False, False, False, False), + rfp_inplanes=None, + output_img=False, + pretrained=None, + init_cfg=None, + **kwargs): + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + self.pretrained = pretrained + if init_cfg is not None: + assert isinstance(init_cfg, dict), \ + f'init_cfg must be a dict, but got {type(init_cfg)}' + if 'type' in init_cfg: + assert init_cfg.get('type') == 'Pretrained', \ + 'Only can initialize module by loading a pretrained model' + else: + raise KeyError('`init_cfg` must contain the key "type"') + self.pretrained = init_cfg.get('checkpoint') + self.sac = sac + self.stage_with_sac = stage_with_sac + self.rfp_inplanes = rfp_inplanes + self.output_img = output_img + super(DetectoRS_ResNet, self).__init__(**kwargs) + + self.inplanes = self.stem_channels + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + sac = self.sac if self.stage_with_sac[i] else None + if self.plugins is not None: + stage_plugins = self.make_stage_plugins(self.plugins, i) + else: + stage_plugins = None + planes = self.base_channels * 2**i + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + sac=sac, + rfp_inplanes=rfp_inplanes if i > 0 else None, + plugins=stage_plugins) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + # In order to be properly initialized by RFP + def init_weights(self): + # Calling this method will cause parameter initialization exception + # super(DetectoRS_ResNet, self).init_weights() + + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + elif self.pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.dcn is not None: + for m in self.modules(): + if isinstance(m, Bottleneck) and hasattr( + m.conv2, 'conv_offset'): + constant_init(m.conv2.conv_offset, 0) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer`` for DetectoRS.""" + return ResLayer(**kwargs) + + def forward(self, x): + """Forward function.""" + outs = list(super(DetectoRS_ResNet, self).forward(x)) + if self.output_img: + outs.insert(0, x) + return tuple(outs) + + def rfp_forward(self, x, rfp_feats): + """Forward function for RFP.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + rfp_feat = 
rfp_feats[i] if i > 0 else None + for layer in res_layer: + x = layer.rfp_forward(x, rfp_feat) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmdetection/mmdet/models/backbones/detectors_resnext.py b/mmdetection/mmdet/models/backbones/detectors_resnext.py new file mode 100644 index 0000000..4bbd631 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/detectors_resnext.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +from mmcv.cnn import build_conv_layer, build_norm_layer + +from mmdet.registry import MODELS +from .detectors_resnet import Bottleneck as _Bottleneck +from .detectors_resnet import DetectoRS_ResNet + + +class Bottleneck(_Bottleneck): + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + **kwargs): + """Bottleneck block for ResNeXt. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_sac: + self.conv2 = build_conv_layer( + self.sac, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + elif not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@MODELS.register_module() +class DetectoRS_ResNeXt(DetectoRS_ResNet): + """ResNeXt backbone for DetectoRS. + + Args: + groups (int): The number of groups in ResNeXt. + base_width (int): The base width of ResNeXt. 
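[Editor's note, not part of the patch] A worked example of the grouped-bottleneck width computed in the Bottleneck above, using the common ResNeXt-50 32x4d numbers (illustrative only):

import math

planes, groups, base_width, base_channels = 64, 32, 4, 64
width = math.floor(planes * (base_width / base_channels)) * groups
print(width)  # 128 -> conv2 becomes a 3x3 conv with 128 channels split into 32 groups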
+ """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, groups=1, base_width=4, **kwargs): + self.groups = groups + self.base_width = base_width + super(DetectoRS_ResNeXt, self).__init__(**kwargs) + + def make_res_layer(self, **kwargs): + return super().make_res_layer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + **kwargs) diff --git a/mmdetection/mmdet/models/backbones/efficientnet.py b/mmdetection/mmdet/models/backbones/efficientnet.py new file mode 100644 index 0000000..8484afe --- /dev/null +++ b/mmdetection/mmdet/models/backbones/efficientnet.py @@ -0,0 +1,418 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn.bricks import ConvModule, DropPath +from mmengine.model import BaseModule, Sequential + +from mmdet.registry import MODELS +from ..layers import InvertedResidual, SELayer +from ..utils import make_divisible + + +class EdgeResidual(BaseModule): + """Edge Residual Block. + + Args: + in_channels (int): The input channels of this module. + out_channels (int): The output channels of this module. + mid_channels (int): The input channels of the second convolution. + kernel_size (int): The kernel size of the first convolution. + Defaults to 3. + stride (int): The stride of the first convolution. Defaults to 1. + se_cfg (dict, optional): Config dict for se layer. Defaults to None, + which means no se layer. + with_residual (bool): Use residual connection. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='BN')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + drop_path_rate (float): stochastic depth rate. Defaults to 0. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict | list[dict], optional): Initialization config dict. 
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_residual=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + drop_path_rate=0., + with_cp=False, + init_cfg=None, + **kwargs): + super(EdgeResidual, self).__init__(init_cfg=init_cfg) + assert stride in [1, 2] + self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.with_se = se_cfg is not None + self.with_residual = ( + stride == 1 and in_channels == out_channels and with_residual) + + if self.with_se: + assert isinstance(se_cfg, dict) + + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.conv2 = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + out = self.conv1(out) + + if self.with_se: + out = self.se(out) + + out = self.conv2(out) + + if self.with_residual: + return x + self.drop_path(out) + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +def model_scaling(layer_setting, arch_setting): + """Scaling operation to the layer's parameters according to the + arch_setting.""" + # scale width + new_layer_setting = copy.deepcopy(layer_setting) + for layer_cfg in new_layer_setting: + for block_cfg in layer_cfg: + block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8) + + # scale depth + split_layer_setting = [new_layer_setting[0]] + for layer_cfg in new_layer_setting[1:-1]: + tmp_index = [0] + for i in range(len(layer_cfg) - 1): + if layer_cfg[i + 1][1] != layer_cfg[i][1]: + tmp_index.append(i + 1) + tmp_index.append(len(layer_cfg)) + for i in range(len(tmp_index) - 1): + split_layer_setting.append(layer_cfg[tmp_index[i]:tmp_index[i + + 1]]) + split_layer_setting.append(new_layer_setting[-1]) + + num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]] + new_layers = [ + int(math.ceil(arch_setting[1] * num)) for num in num_of_layers + ] + + merge_layer_setting = [split_layer_setting[0]] + for i, layer_cfg in enumerate(split_layer_setting[1:-1]): + if new_layers[i] <= num_of_layers[i]: + tmp_layer_cfg = layer_cfg[:new_layers[i]] + else: + tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * ( + new_layers[i] - num_of_layers[i]) + if tmp_layer_cfg[0][3] == 1 and i != 0: + merge_layer_setting[-1] += tmp_layer_cfg.copy() + else: + merge_layer_setting.append(tmp_layer_cfg.copy()) + merge_layer_setting.append(split_layer_setting[-1]) + + return merge_layer_setting + + +@MODELS.register_module() +class EfficientNet(BaseModule): + """EfficientNet backbone. + + Args: + arch (str): Architecture of efficientnet. Defaults to b0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (6, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Defaults to 0, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). 
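[Editor's sketch, not part of the patch] The scaling performed by model_scaling above boils down to make_divisible(channels * width_factor, 8) for width and ceil(depth_factor * num_blocks) for depth. A small worked example; make_divisible is re-stated here for illustration (mmdet ships its own helper in mmdet.models.utils):

import math

def make_divisible(value, divisor=8, min_value=None, min_ratio=0.9):
    # Round to the nearest multiple of `divisor`, never dropping below
    # `min_ratio` of the original value.
    if min_value is None:
        min_value = divisor
    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
    if new_value < min_ratio * value:
        new_value += divisor
    return new_value

# The 'b3' arch_setting is (1.2, 1.4, 300): width x1.2, depth x1.4.
width_factor, depth_factor = 1.2, 1.4
print(make_divisible(32 * width_factor, 8))  # stem: 32 -> 40 channels
print(make_divisible(40 * width_factor, 8))  # a 40-channel stage -> 48
print(math.ceil(depth_factor * 2))           # a 2-block stage -> 3 blocks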
+ act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + """ + + # Parameters to build layers. + # 'b' represents the architecture of normal EfficientNet family includes + # 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8'. + # 'e' represents the architecture of EfficientNet-EdgeTPU including 'es', + # 'em', 'el'. + # 6 parameters are needed to construct a layer, From left to right: + # - kernel_size: The kernel size of the block + # - out_channel: The number of out_channels of the block + # - se_ratio: The sequeeze ratio of SELayer. + # - stride: The stride of the block + # - expand_ratio: The expand_ratio of the mid_channels + # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual + layer_settings = { + 'b': [[[3, 32, 0, 2, 0, -1]], + [[3, 16, 4, 1, 1, 0]], + [[3, 24, 4, 2, 6, 0], + [3, 24, 4, 1, 6, 0]], + [[5, 40, 4, 2, 6, 0], + [5, 40, 4, 1, 6, 0]], + [[3, 80, 4, 2, 6, 0], + [3, 80, 4, 1, 6, 0], + [3, 80, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0]], + [[5, 192, 4, 2, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [3, 320, 4, 1, 6, 0]], + [[1, 1280, 0, 1, 0, -1]] + ], + 'e': [[[3, 32, 0, 2, 0, -1]], + [[3, 24, 0, 1, 3, 1]], + [[3, 32, 0, 2, 8, 1], + [3, 32, 0, 1, 8, 1]], + [[3, 48, 0, 2, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1]], + [[5, 96, 0, 2, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0]], + [[5, 192, 0, 2, 8, 0], + [5, 192, 0, 1, 8, 0]], + [[1, 1280, 0, 1, 0, -1]] + ] + } # yapf: disable + + # Parameters to build different kinds of architecture. + # From left to right: scaling factor for width, scaling factor for depth, + # resolution. + arch_settings = { + 'b0': (1.0, 1.0, 224), + 'b1': (1.0, 1.1, 240), + 'b2': (1.1, 1.2, 260), + 'b3': (1.2, 1.4, 300), + 'b4': (1.4, 1.8, 380), + 'b5': (1.6, 2.2, 456), + 'b6': (1.8, 2.6, 528), + 'b7': (2.0, 3.1, 600), + 'b8': (2.2, 3.6, 672), + 'es': (1.0, 1.0, 224), + 'em': (1.0, 1.1, 240), + 'el': (1.2, 1.4, 300) + } + + def __init__(self, + arch='b0', + drop_path_rate=0., + out_indices=(6, ), + frozen_stages=0, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=dict(type='BN', eps=1e-3), + act_cfg=dict(type='Swish'), + norm_eval=False, + with_cp=False, + init_cfg=[ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + layer=['_BatchNorm', 'GroupNorm'], + val=1) + ]): + super(EfficientNet, self).__init__(init_cfg) + assert arch in self.arch_settings, \ + f'"{arch}" is not one of the arch_settings ' \ + f'({", ".join(self.arch_settings.keys())})' + self.arch_setting = self.arch_settings[arch] + self.layer_setting = self.layer_settings[arch[:1]] + for index in out_indices: + if index not in range(0, len(self.layer_setting)): + raise ValueError('the item in out_indices must in ' + f'range(0, {len(self.layer_setting)}). ' + f'But received {index}') + + if frozen_stages not in range(len(self.layer_setting) + 1): + raise ValueError('frozen_stages must be in range(0, ' + f'{len(self.layer_setting) + 1}). 
' + f'But received {frozen_stages}') + self.drop_path_rate = drop_path_rate + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.layer_setting = model_scaling(self.layer_setting, + self.arch_setting) + block_cfg_0 = self.layer_setting[0][0] + block_cfg_last = self.layer_setting[-1][0] + self.in_channels = make_divisible(block_cfg_0[1], 8) + self.out_channels = block_cfg_last[1] + self.layers = nn.ModuleList() + self.layers.append( + ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=block_cfg_0[0], + stride=block_cfg_0[3], + padding=block_cfg_0[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.make_layer() + # Avoid building unused layers in mmdetection. + if len(self.layers) < max(self.out_indices) + 1: + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=block_cfg_last[0], + stride=block_cfg_last[3], + padding=block_cfg_last[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def make_layer(self): + # Without the first and the final conv block. + layer_setting = self.layer_setting[1:-1] + + total_num_blocks = sum([len(x) for x in layer_setting]) + block_idx = 0 + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, total_num_blocks) + ] # stochastic depth decay rule + + for i, layer_cfg in enumerate(layer_setting): + # Avoid building unused layers in mmdetection. + if i > max(self.out_indices) - 1: + break + layer = [] + for i, block_cfg in enumerate(layer_cfg): + (kernel_size, out_channels, se_ratio, stride, expand_ratio, + block_type) = block_cfg + + mid_channels = int(self.in_channels * expand_ratio) + out_channels = make_divisible(out_channels, 8) + if se_ratio <= 0: + se_cfg = None + else: + # In mmdetection, the `divisor` is deleted to align + # the logic of SELayer with mmpretrain. + se_cfg = dict( + channels=mid_channels, + ratio=expand_ratio * se_ratio, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + if block_type == 1: # edge tpu + if i > 0 and expand_ratio == 3: + with_residual = False + expand_ratio = 4 + else: + with_residual = True + mid_channels = int(self.in_channels * expand_ratio) + if se_cfg is not None: + # In mmdetection, the `divisor` is deleted to align + # the logic of SELayer with mmpretrain. + se_cfg = dict( + channels=mid_channels, + ratio=se_ratio * expand_ratio, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + block = partial(EdgeResidual, with_residual=with_residual) + else: + block = InvertedResidual + layer.append( + block( + in_channels=self.in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + drop_path_rate=dpr[block_idx], + with_cp=self.with_cp, + # In mmdetection, `with_expand_conv` is set to align + # the logic of InvertedResidual with mmpretrain. 
+ with_expand_conv=(mid_channels != self.in_channels))) + self.in_channels = out_channels + block_idx += 1 + self.layers.append(Sequential(*layer)) + + def forward(self, x): + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(EfficientNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/mmdetection/mmdet/models/backbones/hourglass.py b/mmdetection/mmdet/models/backbones/hourglass.py new file mode 100644 index 0000000..bb58799 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/hourglass.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from ..layers import ResLayer +from .resnet import BasicBlock + + +class HourglassModule(BaseModule): + """Hourglass Module for HourglassNet backbone. + + Generate module recursively and use BasicBlock as the base unit. + + Args: + depth (int): Depth of current HourglassModule. + stage_channels (list[int]): Feature channels of sub-modules in current + and follow-up HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in current and + follow-up HourglassModule. + norm_cfg (ConfigType): Dictionary to construct and config norm layer. + Defaults to `dict(type='BN', requires_grad=True)` + upsample_cfg (ConfigType): Config dict for interpolate layer. + Defaults to `dict(mode='nearest')` + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. + """ + + def __init__(self, + depth: int, + stage_channels: List[int], + stage_blocks: List[int], + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + upsample_cfg: ConfigType = dict(mode='nearest'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg) + + self.depth = depth + + cur_block = stage_blocks[0] + next_block = stage_blocks[1] + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ResLayer( + BasicBlock, cur_channel, cur_channel, cur_block, norm_cfg=norm_cfg) + + self.low1 = ResLayer( + BasicBlock, + cur_channel, + next_channel, + cur_block, + stride=2, + norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassModule(depth - 1, stage_channels[1:], + stage_blocks[1:]) + else: + self.low2 = ResLayer( + BasicBlock, + next_channel, + next_channel, + next_block, + norm_cfg=norm_cfg) + + self.low3 = ResLayer( + BasicBlock, + next_channel, + cur_channel, + cur_block, + norm_cfg=norm_cfg, + downsample_first=False) + + self.up2 = F.interpolate + self.upsample_cfg = upsample_cfg + + def forward(self, x: torch.Tensor) -> nn.Module: + """Forward function.""" + up1 = self.up1(x) + low1 = self.low1(x) + low2 = self.low2(low1) + low3 = self.low3(low2) + # Fixing `scale factor` (e.g. 2) is common for upsampling, but + # in some cases the spatial size is mismatched and error will arise. 
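# Editorial comment (not part of the upstream hunk): with odd feature sizes the
# stride-2 path can produce e.g. 17 -> 9, and a fixed scale_factor=2 upsample of
# `low3` would then give 18, no longer matching `up1` (17). The size-based branch
# below keeps the residual sum `up1 + up2` shape-safe for any input resolution.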
+ if 'scale_factor' in self.upsample_cfg: + up2 = self.up2(low3, **self.upsample_cfg) + else: + shape = up1.shape[2:] + up2 = self.up2(low3, size=shape, **self.upsample_cfg) + return up1 + up2 + + +@MODELS.register_module() +class HourglassNet(BaseModule): + """HourglassNet backbone. + + Stacked Hourglass Networks for Human Pose Estimation. + More details can be found in the `paper + `_ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (Sequence[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (Sequence[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channel (int): Feature channel of conv after a HourglassModule. + norm_cfg (norm_cfg): Dictionary to construct and config norm layer. + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. + + Example: + >>> from mmdet.models import HourglassNet + >>> import torch + >>> self = HourglassNet() + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... print(tuple(level_output.shape)) + (1, 256, 128, 128) + (1, 256, 128, 128) + """ + + def __init__(self, + downsample_times: int = 5, + num_stacks: int = 2, + stage_channels: Sequence = (256, 256, 384, 384, 384, 512), + stage_blocks: Sequence = (2, 2, 2, 2, 2, 4), + feat_channel: int = 256, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + init_cfg: OptMultiConfig = None) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg) + + self.num_stacks = num_stacks + assert self.num_stacks >= 1 + assert len(stage_channels) == len(stage_blocks) + assert len(stage_channels) > downsample_times + + cur_channel = stage_channels[0] + + self.stem = nn.Sequential( + ConvModule( + 3, cur_channel // 2, 7, padding=3, stride=2, + norm_cfg=norm_cfg), + ResLayer( + BasicBlock, + cur_channel // 2, + cur_channel, + 1, + stride=2, + norm_cfg=norm_cfg)) + + self.hourglass_modules = nn.ModuleList([ + HourglassModule(downsample_times, stage_channels, stage_blocks) + for _ in range(num_stacks) + ]) + + self.inters = ResLayer( + BasicBlock, + cur_channel, + cur_channel, + num_stacks - 1, + norm_cfg=norm_cfg) + + self.conv1x1s = nn.ModuleList([ + ConvModule( + cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.out_convs = nn.ModuleList([ + ConvModule( + cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg) + for _ in range(num_stacks) + ]) + + self.remap_convs = nn.ModuleList([ + ConvModule( + feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.relu = nn.ReLU(inplace=True) + + def init_weights(self) -> None: + """Init module weights.""" + # Training Centripetal Model needs to reset parameters for Conv2d + super().init_weights() + for m in self.modules(): + if isinstance(m, nn.Conv2d): + m.reset_parameters() + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + """Forward function.""" + inter_feat = self.stem(x) + out_feats = [] + + for ind in range(self.num_stacks): + single_hourglass = self.hourglass_modules[ind] + out_conv = self.out_convs[ind] + + hourglass_feat = single_hourglass(inter_feat) + out_feat = out_conv(hourglass_feat) + out_feats.append(out_feat) + + if 
ind < self.num_stacks - 1: + inter_feat = self.conv1x1s[ind]( + inter_feat) + self.remap_convs[ind]( + out_feat) + inter_feat = self.inters[ind](self.relu(inter_feat)) + + return out_feats diff --git a/mmdetection/mmdet/models/backbones/hrnet.py b/mmdetection/mmdet/models/backbones/hrnet.py new file mode 100644 index 0000000..77bd3cc --- /dev/null +++ b/mmdetection/mmdet/models/backbones/hrnet.py @@ -0,0 +1,589 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule, ModuleList, Sequential +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from .resnet import BasicBlock, Bottleneck + + +class HRModule(BaseModule): + """High-Resolution Module for HRNet. + + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=True, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + block_init_cfg=None, + init_cfg=None): + super(HRModule, self).__init__(init_cfg) + self.block_init_cfg = block_init_cfg + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=False) + + def _check_branches(self, num_branches, num_blocks, in_channels, + num_channels): + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_BLOCKS({len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_CHANNELS({len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, num_channels[branch_index] * + block.expansion)[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=self.block_init_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=self.block_init_cfg)) + + return Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, 
block, num_blocks, num_channels)) + + return ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=False))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@MODELS.register_module() +class HRNet(BaseModule): + """HRNet backbone. + + `High-Resolution Representations for Labeling Pixels and Regions + arXiv: `_. + + Args: + extra (dict): Detailed configuration for each stage of HRNet. + There must be 4 stages, the configuration for each stage must have + 5 keys: + + - num_modules(int): The number of HRModule in this stage. + - num_branches(int): The number of branches in the HRModule. + - block(str): The type of convolution block. + - num_blocks(tuple): The number of blocks in each branch. + The length must be equal to num_branches. + - num_channels(tuple): The number of channels in each branch. + The length must be equal to num_branches. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): Dictionary to construct and config conv layer. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: True. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: False. + multiscale_output (bool): Whether to output multi-level features + produced by multiple branches. If False, only the first level + feature will be output. Default: True. + pretrained (str, optional): Model pretrained path. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ + Example: + >>> from mmdet.models import HRNet + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(4, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='BASIC', + >>> num_blocks=(4, 4), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=3, + >>> num_branches=4, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4, 4), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + (1, 64, 4, 4) + (1, 128, 2, 2) + (1, 256, 1, 1) + """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=True, + with_cp=False, + zero_init_residual=False, + multiscale_output=True, + pretrained=None, + init_cfg=None): + super(HRNet, self).__init__(init_cfg) + + self.pretrained = pretrained + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + # Assert configurations of 4 stages are in extra + assert 'stage1' in extra and 'stage2' in extra \ + and 'stage3' in extra and 'stage4' in extra + # Assert whether the length of `num_blocks` and `num_channels` are + # equal to `num_branches` + for i in range(4): + cfg = extra[f'stage{i + 1}'] + assert len(cfg['num_blocks']) == cfg['num_branches'] and \ + len(cfg['num_channels']) == cfg['num_branches'] + + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + 64, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + # stage 1 + self.stage1_cfg = self.extra['stage1'] + num_channels = self.stage1_cfg['num_channels'][0] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * block.expansion + self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) + + # stage 2 + self.stage2_cfg = self.extra['stage2'] + num_channels = self.stage2_cfg['num_channels'] + block_type = self.stage2_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = 
[channel * block.expansion for channel in num_channels] + self.transition1 = self._make_transition_layer([stage1_out_channels], + num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + # stage 3 + self.stage3_cfg = self.extra['stage3'] + num_channels = self.stage3_cfg['num_channels'] + block_type = self.stage3_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition2 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + # stage 4 + self.stage4_cfg = self.extra['stage4'] + num_channels = self.stage4_cfg['num_channels'] + block_type = self.stage4_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition3 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multiscale_output=multiscale_output) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) + + layers = [] + block_init_cfg = None + if self.pretrained is None and not hasattr( + self, 'init_cfg') and self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + layers.append( + block( + inplanes, + planes, + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=block_init_cfg, + )) + inplanes = planes * 
block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=block_init_cfg)) + + return Sequential(*layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + block_init_cfg = None + if self.pretrained is None and not hasattr( + self, 'init_cfg') and self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + block_init_cfg=block_init_cfg)) + + return Sequential(*hr_modules), in_channels + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super(HRNet, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmdetection/mmdet/models/backbones/mobilenet_v2.py b/mmdetection/mmdet/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000..a4fd051 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/mobilenet_v2.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from ..layers import InvertedResidual +from ..utils import make_divisible + + +@MODELS.register_module() +class MobileNetV2(BaseModule): + """MobileNetV2 backbone. + + Args: + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int], optional): Output from which stages. + Default: (1, 2, 4, 7). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. 
+ conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + # Parameters to build layers. 4 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks, stride. + arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], + [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], + [6, 320, 1, 1]] + + def __init__(self, + widen_factor=1., + out_indices=(1, 2, 4, 7), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + norm_eval=False, + with_cp=False, + pretrained=None, + init_cfg=None): + super(MobileNetV2, self).__init__(init_cfg) + + self.pretrained = pretrained + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + self.widen_factor = widen_factor + self.out_indices = out_indices + if not set(out_indices).issubset(set(range(0, 8))): + raise ValueError('out_indices must be a subset of range' + f'(0, 8). But received {out_indices}') + + if frozen_stages not in range(-1, 8): + raise ValueError('frozen_stages must be in range(-1, 8). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks, stride = layer_cfg + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + if widen_factor > 1.0: + self.out_channel = int(1280 * widen_factor) + else: + self.out_channel = 1280 + + layer = ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channel, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.add_module('conv2', layer) + self.layers.append('conv2') + + def make_layer(self, out_channels, num_blocks, stride, expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. Default: 6. + """ + layers = [] + for i in range(num_blocks): + if i >= 1: + stride = 1 + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + mid_channels=int(round(self.in_channels * expand_ratio)), + stride=stride, + with_expand_conv=expand_ratio != 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + frozen.""" + super(MobileNetV2, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmdetection/mmdet/models/backbones/pvt.py b/mmdetection/mmdet/models/backbones/pvt.py new file mode 100644 index 0000000..8b250f6 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/pvt.py @@ -0,0 +1,665 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
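[Editor's sketch, not part of the patch] A usage example for the MobileNetV2 backbone added above, assuming mmdet is importable; the default out_indices=(1, 2, 4, 7) yields features at strides 4, 8, 16 and 32 (the last coming from the final 1x1 conv):

import torch
from mmdet.models.backbones import MobileNetV2

model = MobileNetV2(widen_factor=1.0).eval()
with torch.no_grad():
    feats = model(torch.rand(1, 3, 224, 224))
print([tuple(f.shape) for f in feats])
# Expected: (1, 24, 56, 56), (1, 32, 28, 28), (1, 96, 14, 14), (1, 1280, 7, 7)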
+import math +import warnings +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.transformer import MultiheadAttention +from mmengine.logging import MMLogger +from mmengine.model import (BaseModule, ModuleList, Sequential, constant_init, + normal_init, trunc_normal_init) +from mmengine.model.weight_init import trunc_normal_ +from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict +from torch.nn.modules.utils import _pair as to_2tuple + +from mmdet.registry import MODELS +from ..layers import PatchEmbed, nchw_to_nlc, nlc_to_nchw + + +class MixFFN(BaseModule): + """An implementation of MixFFN of PVT. + + The differences between MixFFN & FFN: + 1. Use 1X1 Conv to replace Linear layer. + 2. Introduce 3X3 Depth-wise Conv to encode positional information. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. + feedforward_channels (int): The hidden dimension of FFNs. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='GELU'). + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + Default: None. + use_conv (bool): If True, add 3x3 DWConv between two Linear layers. + Defaults: False. + init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + feedforward_channels, + act_cfg=dict(type='GELU'), + ffn_drop=0., + dropout_layer=None, + use_conv=False, + init_cfg=None): + super(MixFFN, self).__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.act_cfg = act_cfg + activate = build_activation_layer(act_cfg) + + in_channels = embed_dims + fc1 = Conv2d( + in_channels=in_channels, + out_channels=feedforward_channels, + kernel_size=1, + stride=1, + bias=True) + if use_conv: + # 3x3 depth wise conv to provide positional encode information + dw_conv = Conv2d( + in_channels=feedforward_channels, + out_channels=feedforward_channels, + kernel_size=3, + stride=1, + padding=(3 - 1) // 2, + bias=True, + groups=feedforward_channels) + fc2 = Conv2d( + in_channels=feedforward_channels, + out_channels=in_channels, + kernel_size=1, + stride=1, + bias=True) + drop = nn.Dropout(ffn_drop) + layers = [fc1, activate, drop, fc2, drop] + if use_conv: + layers.insert(1, dw_conv) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + + def forward(self, x, hw_shape, identity=None): + out = nlc_to_nchw(x, hw_shape) + out = self.layers(out) + out = nchw_to_nlc(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +class SpatialReductionAttention(MultiheadAttention): + """An implementation of Spatial Reduction Attention of PVT. + + This module is modified from MultiheadAttention which is a module from + mmcv.cnn.bricks.transformer. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. 
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. Default: None. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default: False. + qkv_bias (bool): enable bias for qkv if True. Default: True. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention of PVT. Default: 1. + init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=None, + batch_first=True, + qkv_bias=True, + norm_cfg=dict(type='LN'), + sr_ratio=1, + init_cfg=None): + super().__init__( + embed_dims, + num_heads, + attn_drop, + proj_drop, + batch_first=batch_first, + dropout_layer=dropout_layer, + bias=qkv_bias, + init_cfg=init_cfg) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = Conv2d( + in_channels=embed_dims, + out_channels=embed_dims, + kernel_size=sr_ratio, + stride=sr_ratio) + # The ret[0] of build_norm_layer is norm name. + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + + # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa + from mmdet import digit_version, mmcv_version + if mmcv_version < digit_version('1.3.17'): + warnings.warn('The legacy version of forward function in' + 'SpatialReductionAttention is deprecated in' + 'mmcv>=1.3.17 and will no longer support in the' + 'future. Please upgrade your mmcv.') + self.forward = self.legacy_forward + + def forward(self, x, hw_shape, identity=None): + + x_q = x + if self.sr_ratio > 1: + x_kv = nlc_to_nchw(x, hw_shape) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) + x_kv = self.norm(x_kv) + else: + x_kv = x + + if identity is None: + identity = x_q + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_queries, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_queries, embed_dims) to num_queries_first + # (num_queries ,batch, embed_dims), and recover ``attn_output`` + # from num_queries_first to batch_first. + if self.batch_first: + x_q = x_q.transpose(0, 1) + x_kv = x_kv.transpose(0, 1) + + out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + def legacy_forward(self, x, hw_shape, identity=None): + """multi head attention forward in mmcv version < 1.3.17.""" + x_q = x + if self.sr_ratio > 1: + x_kv = nlc_to_nchw(x, hw_shape) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) + x_kv = self.norm(x_kv) + else: + x_kv = x + + if identity is None: + identity = x_q + + out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] + + return identity + self.dropout_layer(self.proj_drop(out)) + + +class PVTEncoderLayer(BaseModule): + """Implements one encoder layer in PVT. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed. + after the feed forward layer. Default: 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0. + drop_path_rate (float): stochastic depth rate. Default: 0.0. + qkv_bias (bool): enable bias for qkv if True. + Default: True. + act_cfg (dict): The activation config for FFNs. 
+ Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention of PVT. Default: 1. + use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. + Default: False. + init_cfg (dict, optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + qkv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + sr_ratio=1, + use_conv_ffn=False, + init_cfg=None): + super(PVTEncoderLayer, self).__init__(init_cfg=init_cfg) + + # The ret[0] of build_norm_layer is norm name. + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + + self.attn = SpatialReductionAttention( + embed_dims=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + sr_ratio=sr_ratio) + + # The ret[0] of build_norm_layer is norm name. + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + + self.ffn = MixFFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + use_conv=use_conv_ffn, + act_cfg=act_cfg) + + def forward(self, x, hw_shape): + x = self.attn(self.norm1(x), hw_shape, identity=x) + x = self.ffn(self.norm2(x), hw_shape, identity=x) + + return x + + +class AbsolutePositionEmbedding(BaseModule): + """An implementation of the absolute position embedding in PVT. + + Args: + pos_shape (int): The shape of the absolute position embedding. + pos_dim (int): The dimension of the absolute position embedding. + drop_rate (float): Probability of an element to be zeroed. + Default: 0.0. + """ + + def __init__(self, pos_shape, pos_dim, drop_rate=0., init_cfg=None): + super().__init__(init_cfg=init_cfg) + + if isinstance(pos_shape, int): + pos_shape = to_2tuple(pos_shape) + elif isinstance(pos_shape, tuple): + if len(pos_shape) == 1: + pos_shape = to_2tuple(pos_shape[0]) + assert len(pos_shape) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pos_shape)}' + self.pos_shape = pos_shape + self.pos_dim = pos_dim + + self.pos_embed = nn.Parameter( + torch.zeros(1, pos_shape[0] * pos_shape[1], pos_dim)) + self.drop = nn.Dropout(p=drop_rate) + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + + def resize_pos_embed(self, pos_embed, input_shape, mode='bilinear'): + """Resize pos_embed weights. + + Resize pos_embed using bilinear interpolate method. + + Args: + pos_embed (torch.Tensor): Position embedding weights. + input_shape (tuple): Tuple for (downsampled input image height, + downsampled input image width). + mode (str): Algorithm used for upsampling: + ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | + ``'trilinear'``. Default: ``'bilinear'``. + + Return: + torch.Tensor: The resized pos_embed of shape [B, L_new, C]. 
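+
+        Example (an illustrative sketch; the numbers assume the example
+        arguments ``pos_shape=56`` and ``pos_dim=64``):
+            >>> ape = AbsolutePositionEmbedding(pos_shape=56, pos_dim=64)
+            >>> # the learnable table holds 56 * 56 = 3136 tokens of dim 64
+            >>> tuple(ape.pos_embed.shape)
+            (1, 3136, 64)
+            >>> # bilinear resize to a 28 x 28 feature map -> 784 tokens
+            >>> tuple(ape.resize_pos_embed(ape.pos_embed, (28, 28)).shape)
+            (1, 784, 64)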
+ """ + assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' + pos_h, pos_w = self.pos_shape + pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] + pos_embed_weight = pos_embed_weight.reshape( + 1, pos_h, pos_w, self.pos_dim).permute(0, 3, 1, 2).contiguous() + pos_embed_weight = F.interpolate( + pos_embed_weight, size=input_shape, mode=mode) + pos_embed_weight = torch.flatten(pos_embed_weight, + 2).transpose(1, 2).contiguous() + pos_embed = pos_embed_weight + + return pos_embed + + def forward(self, x, hw_shape, mode='bilinear'): + pos_embed = self.resize_pos_embed(self.pos_embed, hw_shape, mode) + return self.drop(x + pos_embed) + + +@MODELS.register_module() +class PyramidVisionTransformer(BaseModule): + """Pyramid Vision Transformer (PVT) + + Implementation of `Pyramid Vision Transformer: A Versatile Backbone for + Dense Prediction without Convolutions + `_. + + Args: + pretrain_img_size (int | tuple[int]): The size of input image when + pretrain. Defaults: 224. + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): Embedding dimension. Default: 64. + num_stags (int): The num of stages. Default: 4. + num_layers (Sequence[int]): The layer number of each transformer encode + layer. Default: [3, 4, 6, 3]. + num_heads (Sequence[int]): The attention heads of each transformer + encode layer. Default: [1, 2, 5, 8]. + patch_sizes (Sequence[int]): The patch_size of each patch embedding. + Default: [4, 2, 2, 2]. + strides (Sequence[int]): The stride of each patch embedding. + Default: [4, 2, 2, 2]. + paddings (Sequence[int]): The padding of each patch embedding. + Default: [0, 0, 0, 0]. + sr_ratios (Sequence[int]): The spatial reduction rate of each + transformer encode layer. Default: [8, 4, 2, 1]. + out_indices (Sequence[int] | int): Output from which stages. + Default: (0, 1, 2, 3). + mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the + embedding dim of each transformer encode layer. + Default: [8, 8, 4, 4]. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0. + drop_path_rate (float): stochastic depth rate. Default 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults: True. + use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. + Default: False. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + pretrained (str, optional): model pretrained path. Default: None. + convert_weights (bool): The flag indicates whether the + pre-trained model is from the original repo. We may need + to convert some keys to make it compatible. + Default: True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
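+
+    Example (an illustrative sketch; the shapes assume the default
+    arguments above and a 224x224 input):
+        >>> import torch
+        >>> from mmdet.models.backbones.pvt import PyramidVisionTransformer
+        >>> self = PyramidVisionTransformer()
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 64, 56, 56)
+        (1, 128, 28, 28)
+        (1, 320, 14, 14)
+        (1, 512, 7, 7)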
+ """ + + def __init__(self, + pretrain_img_size=224, + in_channels=3, + embed_dims=64, + num_stages=4, + num_layers=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + patch_sizes=[4, 2, 2, 2], + strides=[4, 2, 2, 2], + paddings=[0, 0, 0, 0], + sr_ratios=[8, 4, 2, 1], + out_indices=(0, 1, 2, 3), + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + use_abs_pos_embed=True, + norm_after_stage=False, + use_conv_ffn=False, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN', eps=1e-6), + pretrained=None, + convert_weights=True, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.convert_weights = convert_weights + if isinstance(pretrain_img_size, int): + pretrain_img_size = to_2tuple(pretrain_img_size) + elif isinstance(pretrain_img_size, tuple): + if len(pretrain_img_size) == 1: + pretrain_img_size = to_2tuple(pretrain_img_size[0]) + assert len(pretrain_img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pretrain_img_size)}' + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + self.init_cfg = init_cfg + else: + raise TypeError('pretrained must be a str or None') + + self.embed_dims = embed_dims + + self.num_stages = num_stages + self.num_layers = num_layers + self.num_heads = num_heads + self.patch_sizes = patch_sizes + self.strides = strides + self.sr_ratios = sr_ratios + assert num_stages == len(num_layers) == len(num_heads) \ + == len(patch_sizes) == len(strides) == len(sr_ratios) + + self.out_indices = out_indices + assert max(out_indices) < self.num_stages + self.pretrained = pretrained + + # transformer encoder + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(num_layers)) + ] # stochastic num_layer decay rule + + cur = 0 + self.layers = ModuleList() + for i, num_layer in enumerate(num_layers): + embed_dims_i = embed_dims * num_heads[i] + patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims_i, + kernel_size=patch_sizes[i], + stride=strides[i], + padding=paddings[i], + bias=True, + norm_cfg=norm_cfg) + + layers = ModuleList() + if use_abs_pos_embed: + pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1]) + pos_embed = AbsolutePositionEmbedding( + pos_shape=pos_shape, + pos_dim=embed_dims_i, + drop_rate=drop_rate) + layers.append(pos_embed) + layers.extend([ + PVTEncoderLayer( + embed_dims=embed_dims_i, + num_heads=num_heads[i], + feedforward_channels=mlp_ratios[i] * embed_dims_i, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[cur + idx], + qkv_bias=qkv_bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + sr_ratio=sr_ratios[i], + use_conv_ffn=use_conv_ffn) for idx in range(num_layer) + ]) + in_channels = embed_dims_i + # The ret[0] of build_norm_layer is norm name. 
+ if norm_after_stage: + norm = build_norm_layer(norm_cfg, embed_dims_i)[1] + else: + norm = nn.Identity() + self.layers.append(ModuleList([patch_embed, layers, norm])) + cur += num_layer + + def init_weights(self): + logger = MMLogger.get_current_instance() + if self.init_cfg is None: + logger.warn(f'No pre-trained weights for ' + f'{self.__class__.__name__}, ' + f'training start from scratch') + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init(m, 0, math.sqrt(2.0 / fan_out)) + elif isinstance(m, AbsolutePositionEmbedding): + m.init_weights() + else: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + checkpoint = CheckpointLoader.load_checkpoint( + self.init_cfg.checkpoint, logger=logger, map_location='cpu') + logger.warn(f'Load pre-trained model for ' + f'{self.__class__.__name__} from original repo') + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + if self.convert_weights: + # Because pvt backbones are not supported by mmpretrain, + # so we need to convert pre-trained weights to match this + # implementation. + state_dict = pvt_convert(state_dict) + load_state_dict(self, state_dict, strict=False, logger=logger) + + def forward(self, x): + outs = [] + + for i, layer in enumerate(self.layers): + x, hw_shape = layer[0](x) + + for block in layer[1]: + x = block(x, hw_shape) + x = layer[2](x) + x = nlc_to_nchw(x, hw_shape) + if i in self.out_indices: + outs.append(x) + + return outs + + +@MODELS.register_module() +class PyramidVisionTransformerV2(PyramidVisionTransformer): + """Implementation of `PVTv2: Improved Baselines with Pyramid Vision + Transformer `_.""" + + def __init__(self, **kwargs): + super(PyramidVisionTransformerV2, self).__init__( + patch_sizes=[7, 3, 3, 3], + paddings=[3, 1, 1, 1], + use_abs_pos_embed=False, + norm_after_stage=True, + use_conv_ffn=True, + **kwargs) + + +def pvt_convert(ckpt): + new_ckpt = OrderedDict() + # Process the concat between q linear weights and kv linear weights + use_abs_pos_embed = False + use_conv_ffn = False + for k in ckpt.keys(): + if k.startswith('pos_embed'): + use_abs_pos_embed = True + if k.find('dwconv') >= 0: + use_conv_ffn = True + for k, v in ckpt.items(): + if k.startswith('head'): + continue + if k.startswith('norm.'): + continue + if k.startswith('cls_token'): + continue + if k.startswith('pos_embed'): + stage_i = int(k.replace('pos_embed', '')) + new_k = k.replace(f'pos_embed{stage_i}', + f'layers.{stage_i - 1}.1.0.pos_embed') + if stage_i == 4 and v.size(1) == 50: # 1 (cls token) + 7 * 7 + new_v = v[:, 1:, :] # remove cls token + else: + new_v = v + elif k.startswith('patch_embed'): + stage_i = int(k.split('.')[0].replace('patch_embed', '')) + new_k = k.replace(f'patch_embed{stage_i}', + f'layers.{stage_i - 1}.0') + new_v = v + if 'proj.' in new_k: + new_k = new_k.replace('proj.', 'projection.') + elif k.startswith('block'): + stage_i = int(k.split('.')[0].replace('block', '')) + layer_i = int(k.split('.')[1]) + new_layer_i = layer_i + use_abs_pos_embed + new_k = k.replace(f'block{stage_i}.{layer_i}', + f'layers.{stage_i - 1}.1.{new_layer_i}') + new_v = v + if 'attn.q.' 
in new_k: + sub_item_k = k.replace('q.', 'kv.') + new_k = new_k.replace('q.', 'attn.in_proj_') + new_v = torch.cat([v, ckpt[sub_item_k]], dim=0) + elif 'attn.kv.' in new_k: + continue + elif 'attn.proj.' in new_k: + new_k = new_k.replace('proj.', 'attn.out_proj.') + elif 'attn.sr.' in new_k: + new_k = new_k.replace('sr.', 'sr.') + elif 'mlp.' in new_k: + string = f'{new_k}-' + new_k = new_k.replace('mlp.', 'ffn.layers.') + if 'fc1.weight' in new_k or 'fc2.weight' in new_k: + new_v = v.reshape((*v.shape, 1, 1)) + new_k = new_k.replace('fc1.', '0.') + new_k = new_k.replace('dwconv.dwconv.', '1.') + if use_conv_ffn: + new_k = new_k.replace('fc2.', '4.') + else: + new_k = new_k.replace('fc2.', '3.') + string += f'{new_k} {v.shape}-{new_v.shape}' + elif k.startswith('norm'): + stage_i = int(k[4]) + new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i - 1}.2') + new_v = v + else: + new_k = k + new_v = v + new_ckpt[new_k] = new_v + + return new_ckpt diff --git a/mmdetection/mmdet/models/backbones/regnet.py b/mmdetection/mmdet/models/backbones/regnet.py new file mode 100644 index 0000000..55d3ce0 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/regnet.py @@ -0,0 +1,356 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer + +from mmdet.registry import MODELS +from .resnet import ResNet +from .resnext import Bottleneck + + +@MODELS.register_module() +class RegNet(ResNet): + """RegNet backbone. + + More details can be found in `paper `_ . + + Args: + arch (dict): The parameter of RegNets. + + - w0 (int): initial width + - wa (float): slope of width + - wm (float): quantization parameter to quantize the width + - depth (int): depth of the backbone + - group_w (int): width of group + - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck. + strides (Sequence[int]): Strides of the first block of each stage. + base_channels (int): Base channels after stem layer. + in_channels (int): Number of input image channels. Default: 3. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Example: + >>> from mmdet.models import RegNet + >>> import torch + >>> self = RegNet( + arch=dict( + w0=88, + wa=26.31, + wm=2.25, + group_w=48, + depth=25, + bot_mul=1.0)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 96, 8, 8) + (1, 192, 4, 4) + (1, 432, 2, 2) + (1, 1008, 1, 1) + """ + arch_settings = { + 'regnetx_400mf': + dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), + 'regnetx_800mf': + dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), + 'regnetx_1.6gf': + dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), + 'regnetx_3.2gf': + dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), + 'regnetx_4.0gf': + dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), + 'regnetx_6.4gf': + dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), + 'regnetx_8.0gf': + dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), + 'regnetx_12gf': + dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), + } + + def __init__(self, + arch, + in_channels=3, + stem_channels=32, + base_channels=32, + strides=(2, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + with_cp=False, + zero_init_residual=True, + pretrained=None, + init_cfg=None): + super(ResNet, self).__init__(init_cfg) + + # Generate RegNet parameters first + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'"arch": "{arch}" is not one of the' \ + ' arch_settings' + arch = self.arch_settings[arch] + elif not isinstance(arch, dict): + raise ValueError('Expect "arch" to be either a string ' + f'or a dict, got {type(arch)}') + + widths, num_stages = self.generate_regnet( + arch['w0'], + arch['wa'], + arch['wm'], + arch['depth'], + ) + # Convert to per stage format + stage_widths, stage_blocks = self.get_stages_from_blocks(widths) + # Generate group widths and bot muls + group_widths = [arch['group_w'] for _ in range(num_stages)] + self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] + # Adjust the compatibility of stage_widths and group_widths + stage_widths, group_widths = self.adjust_width_group( + stage_widths, self.bottleneck_ratio, group_widths) + + # Group params by stage + self.stage_widths = stage_widths + self.group_widths = group_widths + self.depth = sum(stage_blocks) + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.zero_init_residual = zero_init_residual + self.block = Bottleneck + expansion_bak = self.block.expansion + self.block.expansion = 1 + self.stage_blocks = stage_blocks[:num_stages] + + self._make_stem_layer(in_channels, stem_channels) + + block_init_cfg = None + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use 
"init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + if self.zero_init_residual: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + else: + raise TypeError('pretrained must be a str or None') + + self.inplanes = stem_channels + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + group_width = self.group_widths[i] + width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i])) + stage_groups = width // group_width + + dcn = self.dcn if self.stage_with_dcn[i] else None + if self.plugins is not None: + stage_plugins = self.make_stage_plugins(self.plugins, i) + else: + stage_plugins = None + + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=self.stage_widths[i], + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + plugins=stage_plugins, + groups=stage_groups, + base_width=group_width, + base_channels=self.stage_widths[i], + init_cfg=block_init_cfg) + self.inplanes = self.stage_widths[i] + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = stage_widths[-1] + self.block.expansion = expansion_bak + + def _make_stem_layer(self, in_channels, base_channels): + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + base_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, base_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + + def generate_regnet(self, + initial_width, + width_slope, + width_parameter, + depth, + divisor=8): + """Generates per block width from RegNet parameters. + + Args: + initial_width ([int]): Initial width of the backbone + width_slope ([float]): Slope of the quantized linear function + width_parameter ([int]): Parameter used to quantize the width. + depth ([int]): Depth of the backbone. + divisor (int, optional): The divisor of channels. Defaults to 8. + + Returns: + list, int: return a list of widths of each stage and the number \ + of stages + """ + assert width_slope >= 0 + assert initial_width > 0 + assert width_parameter > 1 + assert initial_width % divisor == 0 + widths_cont = np.arange(depth) * width_slope + initial_width + ks = np.round( + np.log(widths_cont / initial_width) / np.log(width_parameter)) + widths = initial_width * np.power(width_parameter, ks) + widths = np.round(np.divide(widths, divisor)) * divisor + num_stages = len(np.unique(widths)) + widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() + return widths, num_stages + + @staticmethod + def quantize_float(number, divisor): + """Converts a float to closest non-zero int divisible by divisor. + + Args: + number (int): Original number to be quantized. + divisor (int): Divisor used to quantize the number. + + Returns: + int: quantized number that is divisible by devisor. 
+ """ + return int(round(number / divisor) * divisor) + + def adjust_width_group(self, widths, bottleneck_ratio, groups): + """Adjusts the compatibility of widths and groups. + + Args: + widths (list[int]): Width of each stage. + bottleneck_ratio (float): Bottleneck ratio. + groups (int): number of groups in each stage + + Returns: + tuple(list): The adjusted widths and groups of each stage. + """ + bottleneck_width = [ + int(w * b) for w, b in zip(widths, bottleneck_ratio) + ] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] + bottleneck_width = [ + self.quantize_float(w_bot, g) + for w_bot, g in zip(bottleneck_width, groups) + ] + widths = [ + int(w_bot / b) + for w_bot, b in zip(bottleneck_width, bottleneck_ratio) + ] + return widths, groups + + def get_stages_from_blocks(self, widths): + """Gets widths/stage_blocks of network at each stage. + + Args: + widths (list[int]): Width in each stage. + + Returns: + tuple(list): width and depth of each stage + """ + width_diff = [ + width != width_prev + for width, width_prev in zip(widths + [0], [0] + widths) + ] + stage_widths = [ + width for width, diff in zip(widths, width_diff[:-1]) if diff + ] + stage_blocks = np.diff([ + depth for depth, diff in zip(range(len(width_diff)), width_diff) + if diff + ]).tolist() + return stage_widths, stage_blocks + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmdetection/mmdet/models/backbones/res2net.py b/mmdetection/mmdet/models/backbones/res2net.py new file mode 100644 index 0000000..958fc88 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/res2net.py @@ -0,0 +1,327 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import Sequential + +from mmdet.registry import MODELS +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottle2neck(_Bottleneck): + expansion = 4 + + def __init__(self, + inplanes, + planes, + scales=4, + base_width=26, + base_channels=64, + stage_type='normal', + **kwargs): + """Bottle2neck block for Res2Net. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottle2neck, self).__init__(inplanes, planes, **kwargs) + assert scales > 1, 'Res2Net degenerates to ResNet when scales = 1.' 
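+        # Note on the channel layout computed below: with the defaults
+        # base_width=26 and base_channels=64, a block with planes=64 uses
+        # width = floor(64 * 26 / 64) = 26, so conv1 expands the input to
+        # width * scales = 104 channels and forward() splits them into
+        # `scales` chunks of `width` channels for the hierarchical 3x3 convs.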
+ width = int(math.floor(self.planes * (base_width / base_channels))) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width * scales, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width * scales, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + + if stage_type == 'stage' and self.conv2_stride != 1: + self.pool = nn.AvgPool2d( + kernel_size=3, stride=self.conv2_stride, padding=1) + convs = [] + bns = [] + + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + for i in range(scales - 1): + convs.append( + build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False)) + bns.append( + build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1]) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + for i in range(scales - 1): + convs.append( + build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False)) + bns.append( + build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1]) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + + self.conv3 = build_conv_layer( + self.conv_cfg, + width * scales, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.stage_type = stage_type + self.scales = scales + self.width = width + delattr(self, 'conv2') + delattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + spx = torch.split(out, self.width, 1) + sp = self.convs[0](spx[0].contiguous()) + sp = self.relu(self.bns[0](sp)) + out = sp + for i in range(1, self.scales - 1): + if self.stage_type == 'stage': + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp.contiguous()) + sp = self.relu(self.bns[i](sp)) + out = torch.cat((out, sp), 1) + + if self.stage_type == 'normal' or self.conv2_stride == 1: + out = torch.cat((out, spx[self.scales - 1]), 1) + elif self.stage_type == 'stage': + out = torch.cat((out, self.pool(spx[self.scales - 1])), 1) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Res2Layer(Sequential): + """Res2Layer to build Res2Net style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. 
Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottle2neck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + scales (int): Scales used in Res2Net. Default: 4 + base_width (int): Basic width of each scale. Default: 26 + """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + avg_down=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + scales=4, + base_width=26, + **kwargs): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False), + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=1, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1], + ) + + layers = [] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + scales=scales, + base_width=base_width, + stage_type='stage', + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + scales=scales, + base_width=base_width, + **kwargs)) + super(Res2Layer, self).__init__(*layers) + + +@MODELS.register_module() +class Res2Net(ResNet): + """Res2Net backbone. + + Args: + scales (int): Scales used in Res2Net. Default: 4 + base_width (int): Basic width of each scale. Default: 26 + depth (int): Depth of res2net, from {50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Res2net stages. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottle2neck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - position (str, required): Position inside block to insert + plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + + Example: + >>> from mmdet.models import Res2Net + >>> import torch + >>> self = Res2Net(depth=50, scales=4, base_width=26) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 8, 8) + (1, 512, 4, 4) + (1, 1024, 2, 2) + (1, 2048, 1, 1) + """ + + arch_settings = { + 50: (Bottle2neck, (3, 4, 6, 3)), + 101: (Bottle2neck, (3, 4, 23, 3)), + 152: (Bottle2neck, (3, 8, 36, 3)) + } + + def __init__(self, + scales=4, + base_width=26, + style='pytorch', + deep_stem=True, + avg_down=True, + pretrained=None, + init_cfg=None, + **kwargs): + self.scales = scales + self.base_width = base_width + super(Res2Net, self).__init__( + style='pytorch', + deep_stem=True, + avg_down=True, + pretrained=pretrained, + init_cfg=init_cfg, + **kwargs) + + def make_res_layer(self, **kwargs): + return Res2Layer( + scales=self.scales, + base_width=self.base_width, + base_channels=self.base_channels, + **kwargs) diff --git a/mmdetection/mmdet/models/backbones/resnest.py b/mmdetection/mmdet/models/backbones/resnest.py new file mode 100644 index 0000000..d4466c4 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/resnest.py @@ -0,0 +1,322 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from ..layers import ResLayer +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNetV1d + + +class RSoftmax(nn.Module): + """Radix Softmax module in ``SplitAttentionConv2d``. + + Args: + radix (int): Radix of input. + groups (int): Groups of input. + """ + + def __init__(self, radix, groups): + super().__init__() + self.radix = radix + self.groups = groups + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x + + +class SplitAttentionConv2d(BaseModule): + """Split-Attention Conv2d in ResNeSt. + + Args: + in_channels (int): Number of channels in the input feature map. + channels (int): Number of intermediate channels. + kernel_size (int | tuple[int]): Size of the convolution kernel. + stride (int | tuple[int]): Stride of the convolution. + padding (int | tuple[int]): Zero-padding added to both sides of + dilation (int | tuple[int]): Spacing between kernel elements. + groups (int): Number of blocked connections from input channels to + output channels. + groups (int): Same as nn.Conv2d. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels. Default: 4. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + dcn (dict): Config dict for DCN. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + init_cfg=None): + super(SplitAttentionConv2d, self).__init__(init_cfg) + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.with_dcn = dcn is not None + self.dcn = dcn + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_dcn and not fallback_on_stride: + assert conv_cfg is None, 'conv_cfg must be None for DCN' + conv_cfg = dcn + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + # To be consistent with original implementation, starting from 0 + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = RSoftmax(radix, groups) + + @property + def norm0(self): + """nn.Module: the normalization layer named "norm0" """ + return getattr(self, self.norm0_name) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + batch = x.size(0) + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + inplane (int): Input planes of this block. + planes (int): Middle planes of this block. + groups (int): Groups of conv2. + base_width (int): Base of width in terms of base channels. Default: 4. + base_channels (int): Base of channels for calculating width. + Default: 64. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Key word arguments for base class. 
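+
+    Note (an illustrative summary of the width rule in ``__init__``): with
+        the default ``groups=1`` the split-attention conv2 runs at
+        ``width == planes``; for grouped variants the width becomes
+        ``floor(planes * base_width / base_channels) * groups`` (the ResNeXt
+        convention), e.g. the example values ``planes=64, groups=2,
+        base_width=40`` give ``floor(64 * 40 / 64) * 2 = 80`` channels.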
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + """Bottleneck block for ResNeSt.""" + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.with_modulated_dcn = False + self.conv2 = SplitAttentionConv2d( + width, + width, + kernel_size=3, + stride=1 if self.avg_down_stride else self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + radix=radix, + reduction_factor=reduction_factor, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=self.dcn) + delattr(self, self.norm2_name) + + if self.avg_down_stride: + self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) + + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + + if self.avg_down_stride: + out = self.avd_layer(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@MODELS.register_module() +class ResNeSt(ResNetV1d): + """ResNeSt backbone. + + Args: + groups (int): Number of groups of Bottleneck. Default: 1 + base_width (int): Base width of Bottleneck. Default: 4 + radix (int): Radix of SplitAttentionConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Keyword arguments for ResNet. 
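+
+    Example (an illustrative sketch; the shapes below assume ``depth=50``
+    with the default settings, mirroring the other backbones' examples):
+        >>> from mmdet.models import ResNeSt
+        >>> import torch
+        >>> self = ResNeSt(depth=50, radix=2, reduction_factor=4)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 8, 8)
+        (1, 512, 4, 4)
+        (1, 1024, 2, 2)
+        (1, 2048, 1, 1)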
+ """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)), + 200: (Bottleneck, (3, 24, 36, 3)) + } + + def __init__(self, + groups=1, + base_width=4, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + self.groups = groups + self.base_width = base_width + self.radix = radix + self.reduction_factor = reduction_factor + self.avg_down_stride = avg_down_stride + super(ResNeSt, self).__init__(**kwargs) + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + radix=self.radix, + reduction_factor=self.reduction_factor, + avg_down_stride=self.avg_down_stride, + **kwargs) diff --git a/mmdetection/mmdet/models/backbones/resnet.py b/mmdetection/mmdet/models/backbones/resnet.py new file mode 100644 index 0000000..1d6f48f --- /dev/null +++ b/mmdetection/mmdet/models/backbones/resnet.py @@ -0,0 +1,672 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from ..layers import ResLayer + + +class BasicBlock(BaseModule): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + super(BasicBlock, self).__init__(init_cfg) + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Bottleneck(BaseModule): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + """Bottleneck block for ResNet. 
+ + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottleneck, self).__init__(init_cfg) + assert style in ['pytorch', 'caffe'] + assert dcn is None or isinstance(dcn, dict) + assert plugins is None or isinstance(plugins, list) + if plugins is not None: + allowed_position = ['after_conv1', 'after_conv2', 'after_conv3'] + assert all(p['position'] in allowed_position for p in plugins) + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.dcn = dcn + self.with_dcn = dcn is not None + self.plugins = plugins + self.with_plugins = plugins is not None + + if self.with_plugins: + # collect plugins for conv1/conv2/conv3 + self.after_conv1_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv1' + ] + self.after_conv2_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv2' + ] + self.after_conv3_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv3' + ] + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + dcn, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + planes, + planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + if self.with_plugins: + self.after_conv1_plugin_names = self.make_block_plugins( + planes, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + planes, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + planes * self.expansion, self.after_conv3_plugins) + + def make_block_plugins(self, in_channels, plugins): + """make plugins for block. + + Args: + in_channels (int): Input channels of plugin. + plugins (list[dict]): List of plugins cfg to build. + + Returns: + list[str]: List of the names of plugin. 
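+
+        Example (an illustrative sketch; ``ContextBlock`` is used only as an
+        example of a plugin type registered in mmcv, and the returned name
+        is indicative):
+            >>> plugins = [dict(type='ContextBlock', ratio=1. / 16)]
+            >>> names = self.make_block_plugins(64, plugins)
+            >>> # each cfg is built with in_channels=64, registered as a
+            >>> # submodule and referenced by name, e.g. ['context_block']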
+ """ + assert isinstance(plugins, list) + plugin_names = [] + for plugin in plugins: + plugin = plugin.copy() + name, layer = build_plugin_layer( + plugin, + in_channels=in_channels, + postfix=plugin.pop('postfix', '')) + assert not hasattr(self, name), f'duplicate plugin {name}' + self.add_module(name, layer) + plugin_names.append(name) + return plugin_names + + def forward_plugin(self, x, plugin_names): + out = x + for name in plugin_names: + out = getattr(self, name)(out) + return out + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: normalization layer after the third convolution layer""" + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@MODELS.register_module() +class ResNet(BaseModule): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + stem_channels (int | None): Number of stem channels. If not specified, + it will be the same as `base_channels`. Default: None. + base_channels (int): Number of base channels of res layer. Default: 64. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Resnet stages. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - position (str, required): Position inside block to insert + plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Example: + >>> from mmdet.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=None, + base_channels=64, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + with_cp=False, + zero_init_residual=True, + pretrained=None, + init_cfg=None): + super(ResNet, self).__init__(init_cfg) + self.zero_init_residual = zero_init_residual + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + + block_init_cfg = None + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + block = self.arch_settings[depth][0] + if self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm3')) + else: + raise TypeError('pretrained must be a str or None') + + self.depth = depth + if stem_channels is None: + stem_channels = base_channels + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = stem_channels + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + if plugins is not None: + 
stage_plugins = self.make_stage_plugins(plugins, i) + else: + stage_plugins = None + planes = base_channels * 2**i + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=stage_plugins, + init_cfg=block_init_cfg) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = self.block.expansion * base_channels * 2**( + len(self.stage_blocks) - 1) + + def make_stage_plugins(self, plugins, stage_idx): + """Make plugins for ResNet ``stage_idx`` th stage. + + Currently we support to insert ``context_block``, + ``empirical_attention_block``, ``nonlocal_block`` into the backbone + like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of + Bottleneck. + + An example of plugins format could be: + + Examples: + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True), + ... position='after_conv2'), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='1'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='2'), + ... stages=(True, True, True, True), + ... position='after_conv3') + ... ] + >>> self = ResNet(depth=18) + >>> stage_plugins = self.make_stage_plugins(plugins, 0) + >>> assert len(stage_plugins) == 3 + + Suppose ``stage_idx=0``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->conv3->yyy->zzz1->zzz2 + + Suppose 'stage_idx=1', the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 + + If stages is missing, the plugin would be applied to all stages. + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. 
+ stage_idx (int): Index of stage to build + + Returns: + list[dict]: Plugins for current stage + """ + stage_plugins = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + # whether to insert plugin into current stage + if stages is None or stages[stage_idx]: + stage_plugins.append(plugin) + + return stage_plugins + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + if self.deep_stem: + self.stem = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels)[1], + nn.ReLU(inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(ResNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@MODELS.register_module() +class ResNetV1d(ResNet): + r"""ResNetV1d variant described in `Bag of Tricks + `_. + + Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in + the input stem with three 3x3 convs. And in the downsampling block, a 2x2 + avg_pool with stride 2 is added before conv, whose stride is changed to 1. 
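
# --- A usage sketch of the freezing behaviour implemented by _freeze_stages()
# --- and train() above, assuming the mmdet package added by this patch is
# --- installed. frozen_stages=1 freezes the stem and layer1; norm_eval=True
# --- keeps every BatchNorm in eval mode during training so running stats stay
# --- fixed.
import torch
from mmdet.models import ResNet

model = ResNet(depth=50, frozen_stages=1, norm_eval=True)
model.train()

frozen = [n for n, p in model.named_parameters() if not p.requires_grad]
assert any(n.startswith('layer1') for n in frozen)      # layer1 is frozen
assert all(not n.startswith('layer2') for n in frozen)  # layer2 still trains

with torch.no_grad():
    feats = model(torch.rand(1, 3, 64, 64))
print([tuple(f.shape) for f in feats])  # feature maps at strides 4, 8, 16, 32
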
+ """ + + def __init__(self, **kwargs): + super(ResNetV1d, self).__init__( + deep_stem=True, avg_down=True, **kwargs) diff --git a/mmdetection/mmdet/models/backbones/resnext.py b/mmdetection/mmdet/models/backbones/resnext.py new file mode 100644 index 0000000..df3d79e --- /dev/null +++ b/mmdetection/mmdet/models/backbones/resnext.py @@ -0,0 +1,154 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +from mmcv.cnn import build_conv_layer, build_norm_layer + +from mmdet.registry import MODELS +from ..layers import ResLayer +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottleneck(_Bottleneck): + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + **kwargs): + """Bottleneck block for ResNeXt. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + if self.with_plugins: + self._del_block_plugins(self.after_conv1_plugin_names + + self.after_conv2_plugin_names + + self.after_conv3_plugin_names) + self.after_conv1_plugin_names = self.make_block_plugins( + width, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + width, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + self.planes * self.expansion, self.after_conv3_plugins) + + def _del_block_plugins(self, plugin_names): + """delete plugins for block if exist. + + Args: + plugin_names (list[str]): List of plugins name to delete. + """ + assert isinstance(plugin_names, list) + for plugin_name in plugin_names: + del self._modules[plugin_name] + + +@MODELS.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Resnet stages. Default: 4. + groups (int): Group of resnext. + base_width (int): Base width of resnext. 
+ strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, groups=1, base_width=4, **kwargs): + self.groups = groups + self.base_width = base_width + super(ResNeXt, self).__init__(**kwargs) + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``""" + return ResLayer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + **kwargs) diff --git a/mmdetection/mmdet/models/backbones/ssd_vgg.py b/mmdetection/mmdet/models/backbones/ssd_vgg.py new file mode 100644 index 0000000..843e82e --- /dev/null +++ b/mmdetection/mmdet/models/backbones/ssd_vgg.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +from mmcv.cnn import VGG +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from ..necks import ssd_neck + + +@MODELS.register_module() +class SSDVGG(VGG, BaseModule): + """VGG Backbone network for single-shot-detection. + + Args: + depth (int): Depth of vgg, from {11, 13, 16, 19}. + with_last_pool (bool): Whether to add a pooling layer at the last + of the model + ceil_mode (bool): When True, will use `ceil` instead of `floor` + to compute the output shape. + out_indices (Sequence[int]): Output from which stages. + out_feature_indices (Sequence[int]): Output from which feature map. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + input_size (int, optional): Deprecated argumment. + Width and height of input, from {300, 512}. + l2_norm_scale (float, optional) : Deprecated argumment. + L2 normalization layer init scale. + + Example: + >>> self = SSDVGG(input_size=300, depth=11) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 300, 300) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 1024, 19, 19) + (1, 512, 10, 10) + (1, 256, 5, 5) + (1, 256, 3, 3) + (1, 256, 1, 1) + """ + extra_setting = { + 300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256), + 512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128), + } + + def __init__(self, + depth, + with_last_pool=False, + ceil_mode=True, + out_indices=(3, 4), + out_feature_indices=(22, 34), + pretrained=None, + init_cfg=None, + input_size=None, + l2_norm_scale=None): + # TODO: in_channels for mmcv.VGG + super(SSDVGG, self).__init__( + depth, + with_last_pool=with_last_pool, + ceil_mode=ceil_mode, + out_indices=out_indices) + + self.features.add_module( + str(len(self.features)), + nn.MaxPool2d(kernel_size=3, stride=1, padding=1)) + self.features.add_module( + str(len(self.features)), + nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)) + self.features.add_module( + str(len(self.features)), nn.ReLU(inplace=True)) + self.features.add_module( + str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1)) + self.features.add_module( + str(len(self.features)), nn.ReLU(inplace=True)) + self.out_feature_indices = out_feature_indices + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + + if init_cfg is not None: + self.init_cfg = init_cfg + elif isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict(type='Constant', val=1, layer='BatchNorm2d'), + dict(type='Normal', std=0.01, layer='Linear'), + ] + else: + raise TypeError('pretrained must be a str or None') + + if input_size is not None: + warnings.warn('DeprecationWarning: input_size is deprecated') + if l2_norm_scale is not None: + warnings.warn('DeprecationWarning: l2_norm_scale in VGG is ' + 'deprecated, it has been moved to SSDNeck.') + + def init_weights(self, pretrained=None): + super(VGG, self).init_weights() + + def forward(self, x): + """Forward function.""" + outs = [] + for i, layer in enumerate(self.features): + x = layer(x) + if i in self.out_feature_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + +class L2Norm(ssd_neck.L2Norm): + + def __init__(self, **kwargs): + super(L2Norm, self).__init__(**kwargs) + warnings.warn('DeprecationWarning: L2Norm in ssd_vgg.py ' + 'is deprecated, please use L2Norm in ' + 'mmdet/models/necks/ssd_neck.py instead') diff --git a/mmdetection/mmdet/models/backbones/swin.py b/mmdetection/mmdet/models/backbones/swin.py new file mode 100644 index 0000000..062190f --- /dev/null +++ b/mmdetection/mmdet/models/backbones/swin.py @@ -0,0 +1,819 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, build_dropout +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, trunc_normal_, + trunc_normal_init) +from mmengine.runner.checkpoint import CheckpointLoader +from mmengine.utils import to_2tuple + +from mmdet.registry import MODELS +from ..layers import PatchEmbed, PatchMerging + + +class WindowMSA(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + init_cfg=None): + + super().__init__() + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + self.init_cfg = init_cfg + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # About 2x faster than original impl + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def init_weights(self): + trunc_normal_(self.relative_position_bias_table, std=0.02) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (num_windows*B, N, C) + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. 
+ """ + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + # make torchscript happy (cannot use tensor as tuple) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +class ShiftWindowMSA(BaseModule): + """Shifted Window Multihead Self-Attention Module. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): The height and width of the window. + shift_size (int, optional): The shift step of each window towards + right-bottom. If zero, act as regular window-msa. Defaults to 0. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Defaults: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Defaults: 0. + proj_drop_rate (float, optional): Dropout ratio of output. + Defaults: 0. + dropout_layer (dict, optional): The dropout_layer used before output. + Defaults: dict(type='DropPath', drop_prob=0.). + init_cfg (dict, optional): The extra config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + shift_size=0, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0, + proj_drop_rate=0, + dropout_layer=dict(type='DropPath', drop_prob=0.), + init_cfg=None): + super().__init__(init_cfg) + + self.window_size = window_size + self.shift_size = shift_size + assert 0 <= self.shift_size < self.window_size + + self.w_msa = WindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=to_2tuple(window_size), + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate, + init_cfg=None) + + self.drop = build_dropout(dropout_layer) + + def forward(self, query, hw_shape): + B, L, C = query.shape + H, W = hw_shape + assert L == H * W, 'input feature has wrong size' + query = query.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) + H_pad, W_pad = query.shape[1], query.shape[2] + + # cyclic shift + if self.shift_size > 0: + shifted_query = torch.roll( + query, + shifts=(-self.shift_size, -self.shift_size), + dims=(1, 2)) + + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device) + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + # nW, window_size, window_size, 1 + mask_windows = self.window_partition(img_mask) + mask_windows = mask_windows.view( + -1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + else: + shifted_query = query + attn_mask = None + + # nW*B, window_size, window_size, C + query_windows = self.window_partition(shifted_query) + # nW*B, window_size*window_size, C + query_windows = query_windows.view(-1, self.window_size**2, C) + + # W-MSA/SW-MSA (nW*B, window_size*window_size, C) + attn_windows = self.w_msa(query_windows, mask=attn_mask) + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + + # B H' W' C + shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + x = self.drop(x) + return x + + def window_reverse(self, windows, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + window_size = self.window_size + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + def window_partition(self, x): + """ + Args: + x: (B, H, W, C) + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + window_size = self.window_size + x = 
x.view(B, H // window_size, window_size, W // window_size, + window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() + windows = windows.view(-1, window_size, window_size, C) + return windows + + +class SwinBlock(BaseModule): + """" + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + window_size (int, optional): The local window scale. Default: 7. + shift (bool, optional): whether to shift window or not. Default False. + qkv_bias (bool, optional): enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. Default: 0. + drop_path_rate (float, optional): Stochastic depth rate. Default: 0. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + init_cfg (dict | list | None, optional): The init config. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + window_size=7, + shift=False, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + init_cfg=None): + + super(SwinBlock, self).__init__() + + self.init_cfg = init_cfg + self.with_cp = with_cp + + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + self.attn = ShiftWindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=window_size, + shift_size=window_size // 2 if shift else 0, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + init_cfg=None) + + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=2, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg, + add_identity=True, + init_cfg=None) + + def forward(self, x, hw_shape): + + def _inner_forward(x): + identity = x + x = self.norm1(x) + x = self.attn(x, hw_shape) + + x = x + identity + + identity = x + x = self.norm2(x) + x = self.ffn(x, identity=identity) + + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + + return x + + +class SwinBlockSequence(BaseModule): + """Implements one stage in Swin Transformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + depth (int): The number of blocks in this stage. + window_size (int, optional): The local window scale. Default: 7. + qkv_bias (bool, optional): enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. Default: 0. + drop_path_rate (float | list[float], optional): Stochastic depth + rate. Default: 0. 
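
# --- A standalone round-trip check of the window_partition / window_reverse
# --- logic above (free functions here that mirror the methods, not from the
# --- diff); partition and reverse are exact inverses once H and W are padded
# --- to multiples of the window size.
import torch

def window_partition(x, window_size):
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)

def window_reverse(windows, window_size, H, W):
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)

x = torch.rand(2, 14, 14, 96)             # already a multiple of window_size=7
windows = window_partition(x, 7)
assert windows.shape == (8, 7, 7, 96)     # 2 images x 2 x 2 windows each
assert torch.equal(window_reverse(windows, 7, 14, 14), x)  # exact round trip
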
+ downsample (BaseModule | None, optional): The downsample operation + module. Default: None. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + init_cfg (dict | list | None, optional): The init config. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + depth, + window_size=7, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + downsample=None, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + if isinstance(drop_path_rate, list): + drop_path_rates = drop_path_rate + assert len(drop_path_rates) == depth + else: + drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)] + + self.blocks = ModuleList() + for i in range(depth): + block = SwinBlock( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=feedforward_channels, + window_size=window_size, + shift=False if i % 2 == 0 else True, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rates[i], + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp, + init_cfg=None) + self.blocks.append(block) + + self.downsample = downsample + + def forward(self, x, hw_shape): + for block in self.blocks: + x = block(x, hw_shape) + + if self.downsample: + x_down, down_hw_shape = self.downsample(x, hw_shape) + return x_down, down_hw_shape, x, hw_shape + else: + return x, hw_shape, x, hw_shape + + +@MODELS.register_module() +class SwinTransformer(BaseModule): + """ Swin Transformer + A PyTorch implement of : `Swin Transformer: + Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/abs/2103.14030 + + Inspiration from + https://github.com/microsoft/Swin-Transformer + + Args: + pretrain_img_size (int | tuple[int]): The size of input image when + pretrain. Defaults: 224. + in_channels (int): The num of input channels. + Defaults: 3. + embed_dims (int): The feature dimension. Default: 96. + patch_size (int | tuple[int]): Patch size. Default: 4. + window_size (int): Window size. Default: 7. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. + Default: 4. + depths (tuple[int]): Depths of each Swin Transformer stage. + Default: (2, 2, 6, 2). + num_heads (tuple[int]): Parallel attention heads of each Swin + Transformer stage. Default: (3, 6, 12, 24). + strides (tuple[int]): The patch merging or patch embedding stride of + each Swin Transformer stage. (In swin, we set kernel size equal to + stride.) Default: (4, 2, 2, 2). + out_indices (tuple[int]): Output from which stages. + Default: (0, 1, 2, 3). + qkv_bias (bool, optional): If True, add a learnable bias to query, key, + value. Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + patch_norm (bool): If add a norm layer for patch embed and patch + merging. Default: True. + drop_rate (float): Dropout rate. Defaults: 0. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults: False. 
+ act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer at + output of backone. Defaults: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + pretrained (str, optional): model pretrained path. Default: None. + convert_weights (bool): The flag indicates whether the + pre-trained model is from the original repo. We may need + to convert some keys to make it compatible. + Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + Default: -1 (-1 means not freezing any parameters). + init_cfg (dict, optional): The Config for initialization. + Defaults to None. + """ + + def __init__(self, + pretrain_img_size=224, + in_channels=3, + embed_dims=96, + patch_size=4, + window_size=7, + mlp_ratio=4, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + strides=(4, 2, 2, 2), + out_indices=(0, 1, 2, 3), + qkv_bias=True, + qk_scale=None, + patch_norm=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + use_abs_pos_embed=False, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + pretrained=None, + convert_weights=False, + frozen_stages=-1, + init_cfg=None): + self.convert_weights = convert_weights + self.frozen_stages = frozen_stages + if isinstance(pretrain_img_size, int): + pretrain_img_size = to_2tuple(pretrain_img_size) + elif isinstance(pretrain_img_size, tuple): + if len(pretrain_img_size) == 1: + pretrain_img_size = to_2tuple(pretrain_img_size[0]) + assert len(pretrain_img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pretrain_img_size)}' + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + self.init_cfg = init_cfg + else: + raise TypeError('pretrained must be a str or None') + + super(SwinTransformer, self).__init__(init_cfg=init_cfg) + + num_layers = len(depths) + self.out_indices = out_indices + self.use_abs_pos_embed = use_abs_pos_embed + + assert strides[0] == patch_size, 'Use non-overlapping patch embed.' 
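
# --- A usage sketch for the SwinTransformer backbone defined in this file,
# --- assuming the mmdet package added by this patch is installed. With the
# --- default Swin-T settings (embed_dims=96, depths=(2, 2, 6, 2)) the four
# --- outputs have 96/192/384/768 channels at strides 4/8/16/32.
import torch
from mmdet.models.backbones.swin import SwinTransformer

model = SwinTransformer()  # Swin-T defaults
model.eval()
with torch.no_grad():
    outs = model(torch.rand(1, 3, 224, 224))
print([tuple(o.shape) for o in outs])
# expected: (1, 96, 56, 56), (1, 192, 28, 28), (1, 384, 14, 14), (1, 768, 7, 7)
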
+ + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=strides[0], + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None) + + if self.use_abs_pos_embed: + patch_row = pretrain_img_size[0] // patch_size + patch_col = pretrain_img_size[1] // patch_size + num_patches = patch_row * patch_col + self.absolute_pos_embed = nn.Parameter( + torch.zeros((1, num_patches, embed_dims))) + + self.drop_after_pos = nn.Dropout(p=drop_rate) + + # set stochastic depth decay rule + total_depth = sum(depths) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, total_depth) + ] + + self.stages = ModuleList() + in_channels = embed_dims + for i in range(num_layers): + if i < num_layers - 1: + downsample = PatchMerging( + in_channels=in_channels, + out_channels=2 * in_channels, + stride=strides[i + 1], + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None) + else: + downsample = None + + stage = SwinBlockSequence( + embed_dims=in_channels, + num_heads=num_heads[i], + feedforward_channels=mlp_ratio * in_channels, + depth=depths[i], + window_size=window_size, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], + downsample=downsample, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp, + init_cfg=None) + self.stages.append(stage) + if downsample: + in_channels = downsample.out_channels + + self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] + # Add a norm layer for each output + for i in out_indices: + layer = build_norm_layer(norm_cfg, self.num_features[i])[1] + layer_name = f'norm{i}' + self.add_module(layer_name, layer) + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + if self.use_abs_pos_embed: + self.absolute_pos_embed.requires_grad = False + self.drop_after_pos.eval() + + for i in range(1, self.frozen_stages + 1): + + if (i - 1) in self.out_indices: + norm_layer = getattr(self, f'norm{i-1}') + norm_layer.eval() + for param in norm_layer.parameters(): + param.requires_grad = False + + m = self.stages[i - 1] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self): + logger = MMLogger.get_current_instance() + if self.init_cfg is None: + logger.warn(f'No pre-trained weights for ' + f'{self.__class__.__name__}, ' + f'training start from scratch') + if self.use_abs_pos_embed: + trunc_normal_(self.absolute_pos_embed, std=0.02) + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) 
+ elif isinstance(m, nn.LayerNorm): + constant_init(m, 1.0) + else: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + ckpt = CheckpointLoader.load_checkpoint( + self.init_cfg.checkpoint, logger=logger, map_location='cpu') + if 'state_dict' in ckpt: + _state_dict = ckpt['state_dict'] + elif 'model' in ckpt: + _state_dict = ckpt['model'] + else: + _state_dict = ckpt + if self.convert_weights: + # supported loading weight from original repo, + _state_dict = swin_converter(_state_dict) + + state_dict = OrderedDict() + for k, v in _state_dict.items(): + if k.startswith('backbone.'): + state_dict[k[9:]] = v + + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith('module.'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # reshape absolute position embedding + if state_dict.get('absolute_pos_embed') is not None: + absolute_pos_embed = state_dict['absolute_pos_embed'] + N1, L, C1 = absolute_pos_embed.size() + N2, C2, H, W = self.absolute_pos_embed.size() + if N1 != N2 or C1 != C2 or L != H * W: + logger.warning('Error in loading absolute_pos_embed, pass') + else: + state_dict['absolute_pos_embed'] = absolute_pos_embed.view( + N2, H, W, C2).permute(0, 3, 1, 2).contiguous() + + # interpolate position bias table if needed + relative_position_bias_table_keys = [ + k for k in state_dict.keys() + if 'relative_position_bias_table' in k + ] + for table_key in relative_position_bias_table_keys: + table_pretrained = state_dict[table_key] + table_current = self.state_dict()[table_key] + L1, nH1 = table_pretrained.size() + L2, nH2 = table_current.size() + if nH1 != nH2: + logger.warning(f'Error in loading {table_key}, pass') + elif L1 != L2: + S1 = int(L1**0.5) + S2 = int(L2**0.5) + table_pretrained_resized = F.interpolate( + table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + state_dict[table_key] = table_pretrained_resized.view( + nH2, L2).permute(1, 0).contiguous() + + # load state_dict + self.load_state_dict(state_dict, False) + + def forward(self, x): + x, hw_shape = self.patch_embed(x) + + if self.use_abs_pos_embed: + x = x + self.absolute_pos_embed + x = self.drop_after_pos(x) + + outs = [] + for i, stage in enumerate(self.stages): + x, hw_shape, out, out_hw_shape = stage(x, hw_shape) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(out) + out = out.view(-1, *out_hw_shape, + self.num_features[i]).permute(0, 3, 1, + 2).contiguous() + outs.append(out) + + return outs + + +def swin_converter(ckpt): + + new_ckpt = OrderedDict() + + def correct_unfold_reduction_order(x): + out_channel, in_channel = x.shape + x = x.reshape(out_channel, 4, in_channel // 4) + x = x[:, [0, 2, 1, 3], :].transpose(1, + 2).reshape(out_channel, in_channel) + return x + + def correct_unfold_norm_order(x): + in_channel = x.shape[0] + x = x.reshape(4, in_channel // 4) + x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel) + return x + + for k, v in ckpt.items(): + if k.startswith('head'): + continue + elif k.startswith('layers'): + new_v = v + if 'attn.' in k: + new_k = k.replace('attn.', 'attn.w_msa.') + elif 'mlp.' in k: + if 'mlp.fc1.' in k: + new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.') + elif 'mlp.fc2.' in k: + new_k = k.replace('mlp.fc2.', 'ffn.layers.1.') + else: + new_k = k.replace('mlp.', 'ffn.') + elif 'downsample' in k: + new_k = k + if 'reduction.' 
in k: + new_v = correct_unfold_reduction_order(v) + elif 'norm.' in k: + new_v = correct_unfold_norm_order(v) + else: + new_k = k + new_k = new_k.replace('layers', 'stages', 1) + elif k.startswith('patch_embed'): + new_v = v + if 'proj' in k: + new_k = k.replace('proj', 'projection') + else: + new_k = k + else: + new_v = v + new_k = k + + new_ckpt['backbone.' + new_k] = new_v + + return new_ckpt diff --git a/mmdetection/mmdet/models/backbones/trident_resnet.py b/mmdetection/mmdet/models/backbones/trident_resnet.py new file mode 100644 index 0000000..22c7635 --- /dev/null +++ b/mmdetection/mmdet/models/backbones/trident_resnet.py @@ -0,0 +1,298 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule +from torch.nn.modules.utils import _pair + +from mmdet.models.backbones.resnet import Bottleneck, ResNet +from mmdet.registry import MODELS + + +class TridentConv(BaseModule): + """Trident Convolution Module. + + Args: + in_channels (int): Number of channels in input. + out_channels (int): Number of channels in output. + kernel_size (int): Size of convolution kernel. + stride (int, optional): Convolution stride. Default: 1. + trident_dilations (tuple[int, int, int], optional): Dilations of + different trident branch. Default: (1, 2, 3). + test_branch_idx (int, optional): In inference, all 3 branches will + be used if `test_branch_idx==-1`, otherwise only branch with + index `test_branch_idx` will be used. Default: 1. + bias (bool, optional): Whether to use bias in convolution or not. + Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + trident_dilations=(1, 2, 3), + test_branch_idx=1, + bias=False, + init_cfg=None): + super(TridentConv, self).__init__(init_cfg) + self.num_branch = len(trident_dilations) + self.with_bias = bias + self.test_branch_idx = test_branch_idx + self.stride = _pair(stride) + self.kernel_size = _pair(kernel_size) + self.paddings = _pair(trident_dilations) + self.dilations = trident_dilations + self.in_channels = in_channels + self.out_channels = out_channels + self.bias = bias + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels, *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.bias = None + + def extra_repr(self): + tmpstr = f'in_channels={self.in_channels}' + tmpstr += f', out_channels={self.out_channels}' + tmpstr += f', kernel_size={self.kernel_size}' + tmpstr += f', num_branch={self.num_branch}' + tmpstr += f', test_branch_idx={self.test_branch_idx}' + tmpstr += f', stride={self.stride}' + tmpstr += f', paddings={self.paddings}' + tmpstr += f', dilations={self.dilations}' + tmpstr += f', bias={self.bias}' + return tmpstr + + def forward(self, inputs): + if self.training or self.test_branch_idx == -1: + outputs = [ + F.conv2d(input, self.weight, self.bias, self.stride, padding, + dilation) for input, dilation, padding in zip( + inputs, self.dilations, self.paddings) + ] + else: + assert len(inputs) == 1 + outputs = [ + F.conv2d(inputs[0], self.weight, self.bias, self.stride, + self.paddings[self.test_branch_idx], + self.dilations[self.test_branch_idx]) + ] + + return outputs + + +# Since TridentNet is defined over ResNet50 and ResNet101, here we +# only support TridentBottleneckBlock. +class TridentBottleneck(Bottleneck): + """BottleBlock for TridentResNet. + + Args: + trident_dilations (tuple[int, int, int]): Dilations of different + trident branch. + test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + concat_output (bool): Whether to concat the output list to a Tensor. + `True` only in the last Block. 
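
# --- A shape sketch for TridentConv above, assuming
# --- mmdet.models.backbones.trident_resnet from this patch is importable. In
# --- training (or with test_branch_idx=-1) every dilation branch runs on its
# --- own input; at test time only the branch selected by test_branch_idx runs.
import torch
from mmdet.models.backbones.trident_resnet import TridentConv

conv = TridentConv(16, 32, kernel_size=3, trident_dilations=(1, 2, 3),
                   test_branch_idx=1)
inputs = [torch.rand(1, 16, 32, 32) for _ in range(3)]

conv.train()
assert len(conv(inputs)) == 3        # one output per trident branch

conv.eval()
assert len(conv([inputs[0]])) == 1   # only the test_branch_idx branch is used
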
+ """ + + def __init__(self, trident_dilations, test_branch_idx, concat_output, + **kwargs): + + super(TridentBottleneck, self).__init__(**kwargs) + self.trident_dilations = trident_dilations + self.num_branch = len(trident_dilations) + self.concat_output = concat_output + self.test_branch_idx = test_branch_idx + self.conv2 = TridentConv( + self.planes, + self.planes, + kernel_size=3, + stride=self.conv2_stride, + bias=False, + trident_dilations=self.trident_dilations, + test_branch_idx=test_branch_idx, + init_cfg=dict( + type='Kaiming', + distribution='uniform', + mode='fan_in', + override=dict(name='conv2'))) + + def forward(self, x): + + def _inner_forward(x): + num_branch = ( + self.num_branch + if self.training or self.test_branch_idx == -1 else 1) + identity = x + if not isinstance(x, list): + x = (x, ) * num_branch + identity = x + if self.downsample is not None: + identity = [self.downsample(b) for b in x] + + out = [self.conv1(b) for b in x] + out = [self.norm1(b) for b in out] + out = [self.relu(b) for b in out] + + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv1_plugin_names) + + out = self.conv2(out) + out = [self.norm2(b) for b in out] + out = [self.relu(b) for b in out] + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv2_plugin_names) + + out = [self.conv3(b) for b in out] + out = [self.norm3(b) for b in out] + + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv3_plugin_names) + + out = [ + out_b + identity_b for out_b, identity_b in zip(out, identity) + ] + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = [self.relu(b) for b in out] + if self.concat_output: + out = torch.cat(out, dim=0) + return out + + +def make_trident_res_layer(block, + inplanes, + planes, + num_blocks, + stride=1, + trident_dilations=(1, 2, 3), + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + test_branch_idx=-1): + """Build Trident Res Layers.""" + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + for i in range(num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride if i == 0 else 1, + trident_dilations=trident_dilations, + downsample=downsample if i == 0 else None, + style=style, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=plugins, + test_branch_idx=test_branch_idx, + concat_output=True if i == num_blocks - 1 else False)) + inplanes = planes * block.expansion + return nn.Sequential(*layers) + + +@MODELS.register_module() +class TridentResNet(ResNet): + """The stem layer, stage 1 and stage 2 in Trident ResNet are identical to + ResNet, while in stage 3, Trident BottleBlock is utilized to replace the + normal BottleBlock to yield trident output. Different branch shares the + convolution weight but uses different dilations to achieve multi-scale + output. 
+ + / stage3(b0) \ + x - stem - stage1 - stage2 - stage3(b1) - output + \ stage3(b2) / + + Args: + depth (int): Depth of resnet, from {50, 101, 152}. + num_branch (int): Number of branches in TridentNet. + test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + trident_dilations (tuple[int]): Dilations of different trident branch. + len(trident_dilations) should be equal to num_branch. + """ # noqa + + def __init__(self, depth, num_branch, test_branch_idx, trident_dilations, + **kwargs): + + assert num_branch == len(trident_dilations) + assert depth in (50, 101, 152) + super(TridentResNet, self).__init__(depth, **kwargs) + assert self.num_stages == 3 + self.test_branch_idx = test_branch_idx + self.num_branch = num_branch + + last_stage_idx = self.num_stages - 1 + stride = self.strides[last_stage_idx] + dilation = trident_dilations + dcn = self.dcn if self.stage_with_dcn[last_stage_idx] else None + if self.plugins is not None: + stage_plugins = self.make_stage_plugins(self.plugins, + last_stage_idx) + else: + stage_plugins = None + planes = self.base_channels * 2**last_stage_idx + res_layer = make_trident_res_layer( + TridentBottleneck, + inplanes=(self.block.expansion * self.base_channels * + 2**(last_stage_idx - 1)), + planes=planes, + num_blocks=self.stage_blocks[last_stage_idx], + stride=stride, + trident_dilations=dilation, + style=self.style, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + plugins=stage_plugins, + test_branch_idx=self.test_branch_idx) + + layer_name = f'layer{last_stage_idx + 1}' + + self.__setattr__(layer_name, res_layer) + self.res_layers.pop(last_stage_idx) + self.res_layers.insert(last_stage_idx, layer_name) + + self._freeze_stages() diff --git a/mmdetection/mmdet/models/data_preprocessors/__init__.py b/mmdetection/mmdet/models/data_preprocessors/__init__.py new file mode 100644 index 0000000..201a1da --- /dev/null +++ b/mmdetection/mmdet/models/data_preprocessors/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_preprocessor import (BatchFixedSizePad, BatchResize, + BatchSyncRandomResize, BoxInstDataPreprocessor, + DetDataPreprocessor, + MultiBranchDataPreprocessor) +from .reid_data_preprocessor import ReIDDataPreprocessor +from .track_data_preprocessor import TrackDataPreprocessor + +__all__ = [ + 'DetDataPreprocessor', 'BatchSyncRandomResize', 'BatchFixedSizePad', + 'MultiBranchDataPreprocessor', 'BatchResize', 'BoxInstDataPreprocessor', + 'TrackDataPreprocessor', 'ReIDDataPreprocessor' +] diff --git a/mmdetection/mmdet/models/data_preprocessors/data_preprocessor.py b/mmdetection/mmdet/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000..55b5c35 --- /dev/null +++ b/mmdetection/mmdet/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,793 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
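
# --- A construction sketch for the TridentResNet defined in trident_resnet.py
# --- above, assuming the mmdet package added by this patch is installed.
# --- TridentResNet keeps the stem and stages 1-2 of a plain ResNet and swaps
# --- stage 3 for trident blocks, so num_stages must be 3 and strides /
# --- dilations / out_indices have to be given for exactly three stages.
import torch
from mmdet.models.backbones.trident_resnet import TridentResNet

model = TridentResNet(
    depth=50,
    num_branch=3,
    test_branch_idx=1,
    trident_dilations=(1, 2, 3),
    num_stages=3,
    strides=(1, 2, 2),
    dilations=(1, 1, 1),
    out_indices=(2, ))
model.eval()
with torch.no_grad():
    out = model(torch.rand(1, 3, 64, 64))[0]
print(tuple(out.shape))  # single-branch output at test time, stride 16
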
+import random +from numbers import Number +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.dist import barrier, broadcast, get_dist_info +from mmengine.logging import MessageHub +from mmengine.model import BaseDataPreprocessor, ImgDataPreprocessor +from mmengine.structures import PixelData +from mmengine.utils import is_seq_of +from torch import Tensor + +from mmdet.models.utils import unfold_wo_center +from mmdet.models.utils.misc import samplelist_boxtype2tensor +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.mask import BitmapMasks +from mmdet.utils import ConfigType + +try: + import skimage +except ImportError: + skimage = None + + +@MODELS.register_module() +class DetDataPreprocessor(ImgDataPreprocessor): + """Image pre-processor for detection tasks. + + Comparing with the :class:`mmengine.ImgDataPreprocessor`, + + 1. It supports batch augmentations. + 2. It will additionally append batch_input_shape and pad_shape + to data_samples considering the object detection task. + + It provides the data pre-processing as follows + + - Collate and move data to the target device. + - Pad inputs to the maximum size of current batch with defined + ``pad_value``. The padding size can be divisible by a defined + ``pad_size_divisor`` + - Stack inputs to batch_inputs. + - Convert inputs from bgr to rgb if the shape of input is (3, H, W). + - Normalize image with defined std and mean. + - Do batch augmentations during training. + + Args: + mean (Sequence[Number], optional): The pixel mean of R, G, B channels. + Defaults to None. + std (Sequence[Number], optional): The pixel standard deviation of + R, G, B channels. Defaults to None. + pad_size_divisor (int): The size of padded image should be + divisible by ``pad_size_divisor``. Defaults to 1. + pad_value (Number): The padded pixel value. Defaults to 0. + pad_mask (bool): Whether to pad instance masks. Defaults to False. + mask_pad_value (int): The padded pixel value for instance masks. + Defaults to 0. + pad_seg (bool): Whether to pad semantic segmentation maps. + Defaults to False. + seg_pad_value (int): The padded pixel value for semantic + segmentation maps. Defaults to 255. + bgr_to_rgb (bool): whether to convert image from BGR to RGB. + Defaults to False. + rgb_to_bgr (bool): whether to convert image from RGB to RGB. + Defaults to False. + boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of + bboxes data to ``Tensor`` type. Defaults to True. + non_blocking (bool): Whether block current process + when transferring data to device. Defaults to False. 
+ batch_augments (list[dict], optional): Batch-level augmentations + """ + + def __init__(self, + mean: Sequence[Number] = None, + std: Sequence[Number] = None, + pad_size_divisor: int = 1, + pad_value: Union[float, int] = 0, + pad_mask: bool = False, + mask_pad_value: int = 0, + pad_seg: bool = False, + seg_pad_value: int = 255, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False, + boxtype2tensor: bool = True, + non_blocking: Optional[bool] = False, + batch_augments: Optional[List[dict]] = None): + super().__init__( + mean=mean, + std=std, + pad_size_divisor=pad_size_divisor, + pad_value=pad_value, + bgr_to_rgb=bgr_to_rgb, + rgb_to_bgr=rgb_to_bgr, + non_blocking=non_blocking) + if batch_augments is not None: + self.batch_augments = nn.ModuleList( + [MODELS.build(aug) for aug in batch_augments]) + else: + self.batch_augments = None + self.pad_mask = pad_mask + self.mask_pad_value = mask_pad_value + self.pad_seg = pad_seg + self.seg_pad_value = seg_pad_value + self.boxtype2tensor = boxtype2tensor + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization,padding and bgr2rgb conversion based on + ``BaseDataPreprocessor``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + batch_pad_shape = self._get_pad_shape(data) + data = super().forward(data=data, training=training) + inputs, data_samples = data['inputs'], data['data_samples'] + + if data_samples is not None: + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + batch_input_shape = tuple(inputs[0].size()[-2:]) + for data_sample, pad_shape in zip(data_samples, batch_pad_shape): + data_sample.set_metainfo({ + 'batch_input_shape': batch_input_shape, + 'pad_shape': pad_shape + }) + + if self.boxtype2tensor: + samplelist_boxtype2tensor(data_samples) + + if self.pad_mask and training: + self.pad_gt_masks(data_samples) + + if self.pad_seg and training: + self.pad_gt_sem_seg(data_samples) + + if training and self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + return {'inputs': inputs, 'data_samples': data_samples} + + def _get_pad_shape(self, data: dict) -> List[tuple]: + """Get the pad_shape of each image based on data and + pad_size_divisor.""" + _batch_inputs = data['inputs'] + # Process data with `pseudo_collate`. + if is_seq_of(_batch_inputs, torch.Tensor): + batch_pad_shape = [] + for ori_input in _batch_inputs: + pad_h = int( + np.ceil(ori_input.shape[1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(ori_input.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape.append((pad_h, pad_w)) + # Process data with `default_collate`. 
+ elif isinstance(_batch_inputs, torch.Tensor): + assert _batch_inputs.dim() == 4, ( + 'The input of `ImgDataPreprocessor` should be a NCHW tensor ' + 'or a list of tensor, but got a tensor with shape: ' + f'{_batch_inputs.shape}') + pad_h = int( + np.ceil(_batch_inputs.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(_batch_inputs.shape[3] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0] + else: + raise TypeError('Output of `cast_data` should be a dict ' + 'or a tuple with inputs and data_samples, but got' + f'{type(data)}: {data}') + return batch_pad_shape + + def pad_gt_masks(self, + batch_data_samples: Sequence[DetDataSample]) -> None: + """Pad gt_masks to shape of batch_input_shape.""" + if 'masks' in batch_data_samples[0].gt_instances: + for data_samples in batch_data_samples: + masks = data_samples.gt_instances.masks + data_samples.gt_instances.masks = masks.pad( + data_samples.batch_input_shape, + pad_val=self.mask_pad_value) + + def pad_gt_sem_seg(self, + batch_data_samples: Sequence[DetDataSample]) -> None: + """Pad gt_sem_seg to shape of batch_input_shape.""" + if 'gt_sem_seg' in batch_data_samples[0]: + for data_samples in batch_data_samples: + gt_sem_seg = data_samples.gt_sem_seg.sem_seg + h, w = gt_sem_seg.shape[-2:] + pad_h, pad_w = data_samples.batch_input_shape + gt_sem_seg = F.pad( + gt_sem_seg, + pad=(0, max(pad_w - w, 0), 0, max(pad_h - h, 0)), + mode='constant', + value=self.seg_pad_value) + data_samples.gt_sem_seg = PixelData(sem_seg=gt_sem_seg) + + +@MODELS.register_module() +class BatchSyncRandomResize(nn.Module): + """Batch random resize which synchronizes the random size across ranks. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. 
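As a quick sanity check of the rounding performed by ``_get_pad_shape`` above, the following standalone sketch reproduces the ceil-to-multiple arithmetic with plain NumPy; the image sizes and the divisor are made-up values used only for illustration.

import numpy as np

def pad_shape(h: int, w: int, pad_size_divisor: int) -> tuple:
    # Round (h, w) up to the nearest multiple of pad_size_divisor,
    # mirroring the arithmetic in DetDataPreprocessor._get_pad_shape.
    pad_h = int(np.ceil(h / pad_size_divisor)) * pad_size_divisor
    pad_w = int(np.ceil(w / pad_size_divisor)) * pad_size_divisor
    return pad_h, pad_w

print(pad_shape(800, 1333, 32))  # (800, 1344)
print(pad_shape(427, 640, 32))   # (448, 640)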
+ """ + + def __init__(self, + random_size_range: Tuple[int, int], + interval: int = 10, + size_divisor: int = 32) -> None: + super().__init__() + self.rank, self.world_size = get_dist_info() + self._input_size = None + self._random_size_range = (round(random_size_range[0] / size_divisor), + round(random_size_range[1] / size_divisor)) + self._interval = interval + self._size_divisor = size_divisor + + def forward( + self, inputs: Tensor, data_samples: List[DetDataSample] + ) -> Tuple[Tensor, List[DetDataSample]]: + """resize a batch of images and bboxes to shape ``self._input_size``""" + h, w = inputs.shape[-2:] + if self._input_size is None: + self._input_size = (h, w) + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1 or scale_y != 1: + inputs = F.interpolate( + inputs, + size=self._input_size, + mode='bilinear', + align_corners=False) + for data_sample in data_samples: + img_shape = (int(data_sample.img_shape[0] * scale_y), + int(data_sample.img_shape[1] * scale_x)) + pad_shape = (int(data_sample.pad_shape[0] * scale_y), + int(data_sample.pad_shape[1] * scale_x)) + data_sample.set_metainfo({ + 'img_shape': img_shape, + 'pad_shape': pad_shape, + 'batch_input_shape': self._input_size + }) + data_sample.gt_instances.bboxes[ + ..., + 0::2] = data_sample.gt_instances.bboxes[..., + 0::2] * scale_x + data_sample.gt_instances.bboxes[ + ..., + 1::2] = data_sample.gt_instances.bboxes[..., + 1::2] * scale_y + if 'ignored_instances' in data_sample: + data_sample.ignored_instances.bboxes[ + ..., 0::2] = data_sample.ignored_instances.bboxes[ + ..., 0::2] * scale_x + data_sample.ignored_instances.bboxes[ + ..., 1::2] = data_sample.ignored_instances.bboxes[ + ..., 1::2] * scale_y + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + self._input_size = self._get_random_size( + aspect_ratio=float(w / h), device=inputs.device) + return inputs, data_samples + + def _get_random_size(self, aspect_ratio: float, + device: torch.device) -> Tuple[int, int]: + """Randomly generate a shape in ``_random_size_range`` and broadcast to + all ranks.""" + tensor = torch.LongTensor(2).to(device) + if self.rank == 0: + size = random.randint(*self._random_size_range) + size = (self._size_divisor * size, + self._size_divisor * int(aspect_ratio * size)) + tensor[0] = size[0] + tensor[1] = size[1] + barrier() + broadcast(tensor, 0) + input_size = (tensor[0].item(), tensor[1].item()) + return input_size + + +@MODELS.register_module() +class BatchFixedSizePad(nn.Module): + """Fixed size padding for batch images. + + Args: + size (Tuple[int, int]): Fixed padding size. Expected padding + shape (h, w). Defaults to None. + img_pad_value (int): The padded pixel value for images. + Defaults to 0. + pad_mask (bool): Whether to pad instance masks. Defaults to False. + mask_pad_value (int): The padded pixel value for instance masks. + Defaults to 0. + pad_seg (bool): Whether to pad semantic segmentation maps. + Defaults to False. + seg_pad_value (int): The padded pixel value for semantic + segmentation maps. Defaults to 255. 
+ """ + + def __init__(self, + size: Tuple[int, int], + img_pad_value: int = 0, + pad_mask: bool = False, + mask_pad_value: int = 0, + pad_seg: bool = False, + seg_pad_value: int = 255) -> None: + super().__init__() + self.size = size + self.pad_mask = pad_mask + self.pad_seg = pad_seg + self.img_pad_value = img_pad_value + self.mask_pad_value = mask_pad_value + self.seg_pad_value = seg_pad_value + + def forward( + self, + inputs: Tensor, + data_samples: Optional[List[dict]] = None + ) -> Tuple[Tensor, Optional[List[dict]]]: + """Pad image, instance masks, segmantic segmentation maps.""" + src_h, src_w = inputs.shape[-2:] + dst_h, dst_w = self.size + + if src_h >= dst_h and src_w >= dst_w: + return inputs, data_samples + + inputs = F.pad( + inputs, + pad=(0, max(0, dst_w - src_w), 0, max(0, dst_h - src_h)), + mode='constant', + value=self.img_pad_value) + + if data_samples is not None: + # update batch_input_shape + for data_sample in data_samples: + data_sample.set_metainfo({ + 'batch_input_shape': (dst_h, dst_w), + 'pad_shape': (dst_h, dst_w) + }) + + if self.pad_mask: + for data_sample in data_samples: + masks = data_sample.gt_instances.masks + data_sample.gt_instances.masks = masks.pad( + (dst_h, dst_w), pad_val=self.mask_pad_value) + + if self.pad_seg: + for data_sample in data_samples: + gt_sem_seg = data_sample.gt_sem_seg.sem_seg + h, w = gt_sem_seg.shape[-2:] + gt_sem_seg = F.pad( + gt_sem_seg, + pad=(0, max(0, dst_w - w), 0, max(0, dst_h - h)), + mode='constant', + value=self.seg_pad_value) + data_sample.gt_sem_seg = PixelData(sem_seg=gt_sem_seg) + + return inputs, data_samples + + +@MODELS.register_module() +class MultiBranchDataPreprocessor(BaseDataPreprocessor): + """DataPreprocessor wrapper for multi-branch data. + + Take semi-supervised object detection as an example, assume that + the ratio of labeled data and unlabeled data in a batch is 1:2, + `sup` indicates the branch where the labeled data is augmented, + `unsup_teacher` and `unsup_student` indicate the branches where + the unlabeled data is augmented by different pipeline. + + The input format of multi-branch data is shown as below : + + .. code-block:: none + { + 'inputs': + { + 'sup': [Tensor, None, None], + 'unsup_teacher': [None, Tensor, Tensor], + 'unsup_student': [None, Tensor, Tensor], + }, + 'data_sample': + { + 'sup': [DetDataSample, None, None], + 'unsup_teacher': [None, DetDataSample, DetDataSample], + 'unsup_student': [NOne, DetDataSample, DetDataSample], + } + } + + The format of multi-branch data + after filtering None is shown as below : + + .. code-block:: none + { + 'inputs': + { + 'sup': [Tensor], + 'unsup_teacher': [Tensor, Tensor], + 'unsup_student': [Tensor, Tensor], + }, + 'data_sample': + { + 'sup': [DetDataSample], + 'unsup_teacher': [DetDataSample, DetDataSample], + 'unsup_student': [DetDataSample, DetDataSample], + } + } + + In order to reuse `DetDataPreprocessor` for the data + from different branches, the format of multi-branch data + grouped by branch is as below : + + .. code-block:: none + { + 'sup': + { + 'inputs': [Tensor] + 'data_sample': [DetDataSample, DetDataSample] + }, + 'unsup_teacher': + { + 'inputs': [Tensor, Tensor] + 'data_sample': [DetDataSample, DetDataSample] + }, + 'unsup_student': + { + 'inputs': [Tensor, Tensor] + 'data_sample': [DetDataSample, DetDataSample] + }, + } + + After preprocessing data from different branches, + the multi-branch data needs to be reformatted as: + + .. 
code-block:: none + { + 'inputs': + { + 'sup': [Tensor], + 'unsup_teacher': [Tensor, Tensor], + 'unsup_student': [Tensor, Tensor], + }, + 'data_sample': + { + 'sup': [DetDataSample], + 'unsup_teacher': [DetDataSample, DetDataSample], + 'unsup_student': [DetDataSample, DetDataSample], + } + } + + Args: + data_preprocessor (:obj:`ConfigDict` or dict): Config of + :class:`DetDataPreprocessor` to process the input data. + """ + + def __init__(self, data_preprocessor: ConfigType) -> None: + super().__init__() + self.data_preprocessor = MODELS.build(data_preprocessor) + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization,padding and bgr2rgb conversion based on + ``BaseDataPreprocessor`` for multi-branch data. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: + + - 'inputs' (Dict[str, obj:`torch.Tensor`]): The forward data of + models from different branches. + - 'data_sample' (Dict[str, obj:`DetDataSample`]): The annotation + info of the sample from different branches. + """ + + if training is False: + return self.data_preprocessor(data, training) + + # Filter out branches with a value of None + for key in data.keys(): + for branch in data[key].keys(): + data[key][branch] = list( + filter(lambda x: x is not None, data[key][branch])) + + # Group data by branch + multi_branch_data = {} + for key in data.keys(): + for branch in data[key].keys(): + if multi_branch_data.get(branch, None) is None: + multi_branch_data[branch] = {key: data[key][branch]} + elif multi_branch_data[branch].get(key, None) is None: + multi_branch_data[branch][key] = data[key][branch] + else: + multi_branch_data[branch][key].append(data[key][branch]) + + # Preprocess data from different branches + for branch, _data in multi_branch_data.items(): + multi_branch_data[branch] = self.data_preprocessor(_data, training) + + # Format data by inputs and data_samples + format_data = {} + for branch in multi_branch_data.keys(): + for key in multi_branch_data[branch].keys(): + if format_data.get(key, None) is None: + format_data[key] = {branch: multi_branch_data[branch][key]} + elif format_data[key].get(branch, None) is None: + format_data[key][branch] = multi_branch_data[branch][key] + else: + format_data[key][branch].append( + multi_branch_data[branch][key]) + + return format_data + + @property + def device(self): + return self.data_preprocessor.device + + def to(self, device: Optional[Union[int, torch.device]], *args, + **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Args: + device (int or torch.device, optional): The desired device of the + parameters and buffers in this module. + + Returns: + nn.Module: The model itself. + """ + + return self.data_preprocessor.to(device, *args, **kwargs) + + def cuda(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + + return self.data_preprocessor.cuda(*args, **kwargs) + + def cpu(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + + return self.data_preprocessor.cpu(*args, **kwargs) + + +@MODELS.register_module() +class BatchResize(nn.Module): + """Batch resize during training. This implementation is modified from + https://github.com/Purkialo/CrowdDet/blob/master/lib/data/CrowdHuman.py. 
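The None filtering and per-branch regrouping done in ``MultiBranchDataPreprocessor.forward`` above can be illustrated on toy data, with strings standing in for tensors and data samples; this sketch only shows the dictionary reshaping, not the actual preprocessing.

# Toy illustration of the branch regrouping in MultiBranchDataPreprocessor.
data = {
    'inputs': {
        'sup': ['img0', None, None],
        'unsup_teacher': [None, 'img1_t', 'img2_t'],
        'unsup_student': [None, 'img1_s', 'img2_s'],
    },
    'data_samples': {
        'sup': ['ds0', None, None],
        'unsup_teacher': [None, 'ds1', 'ds2'],
        'unsup_student': [None, 'ds1', 'ds2'],
    },
}

# 1. Drop the None placeholders in every branch.
for key in data:
    for branch in data[key]:
        data[key][branch] = [x for x in data[key][branch] if x is not None]

# 2. Regroup so each branch owns its own {'inputs': ..., 'data_samples': ...}.
multi_branch = {}
for key in data:
    for branch, value in data[key].items():
        multi_branch.setdefault(branch, {})[key] = value

print(multi_branch['sup'])
# {'inputs': ['img0'], 'data_samples': ['ds0']}
print(multi_branch['unsup_teacher'])
# {'inputs': ['img1_t', 'img2_t'], 'data_samples': ['ds1', 'ds2']}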
+ + It provides the data pre-processing as follows: + - A batch of all images will pad to a uniform size and stack them into + a torch.Tensor by `DetDataPreprocessor`. + - `BatchFixShapeResize` resize all images to the target size. + - Padding images to make sure the size of image can be divisible by + ``pad_size_divisor``. + + Args: + scale (tuple): Images scales for resizing. + pad_size_divisor (int): Image size divisible factor. + Defaults to 1. + pad_value (Number): The padded pixel value. Defaults to 0. + """ + + def __init__( + self, + scale: tuple, + pad_size_divisor: int = 1, + pad_value: Union[float, int] = 0, + ) -> None: + super().__init__() + self.min_size = min(scale) + self.max_size = max(scale) + self.pad_size_divisor = pad_size_divisor + self.pad_value = pad_value + + def forward( + self, inputs: Tensor, data_samples: List[DetDataSample] + ) -> Tuple[Tensor, List[DetDataSample]]: + """resize a batch of images and bboxes.""" + + batch_height, batch_width = inputs.shape[-2:] + target_height, target_width, scale = self.get_target_size( + batch_height, batch_width) + + inputs = F.interpolate( + inputs, + size=(target_height, target_width), + mode='bilinear', + align_corners=False) + + inputs = self.get_padded_tensor(inputs, self.pad_value) + + if data_samples is not None: + batch_input_shape = tuple(inputs.size()[-2:]) + for data_sample in data_samples: + img_shape = [ + int(scale * _) for _ in list(data_sample.img_shape) + ] + data_sample.set_metainfo({ + 'img_shape': tuple(img_shape), + 'batch_input_shape': batch_input_shape, + 'pad_shape': batch_input_shape, + 'scale_factor': (scale, scale) + }) + + data_sample.gt_instances.bboxes *= scale + data_sample.ignored_instances.bboxes *= scale + + return inputs, data_samples + + def get_target_size(self, height: int, + width: int) -> Tuple[int, int, float]: + """Get the target size of a batch of images based on data and scale.""" + im_size_min = np.min([height, width]) + im_size_max = np.max([height, width]) + scale = self.min_size / im_size_min + if scale * im_size_max > self.max_size: + scale = self.max_size / im_size_max + target_height, target_width = int(round(height * scale)), int( + round(width * scale)) + return target_height, target_width, scale + + def get_padded_tensor(self, tensor: Tensor, pad_value: int) -> Tensor: + """Pad images according to pad_size_divisor.""" + assert tensor.ndim == 4 + target_height, target_width = tensor.shape[-2], tensor.shape[-1] + divisor = self.pad_size_divisor + padded_height = (target_height + divisor - 1) // divisor * divisor + padded_width = (target_width + divisor - 1) // divisor * divisor + padded_tensor = torch.ones([ + tensor.shape[0], tensor.shape[1], padded_height, padded_width + ]) * pad_value + padded_tensor = padded_tensor.type_as(tensor) + padded_tensor[:, :, :target_height, :target_width] = tensor + return padded_tensor + + +@MODELS.register_module() +class BoxInstDataPreprocessor(DetDataPreprocessor): + """Pseudo mask pre-processor for BoxInst. + + Comparing with the :class:`mmdet.DetDataPreprocessor`, + + 1. It generates masks using box annotations. + 2. It computes the images color similarity in LAB color space. + + Args: + mask_stride (int): The mask output stride in boxinst. Defaults to 4. + pairwise_size (int): The size of neighborhood for each pixel. + Defaults to 3. + pairwise_dilation (int): The dilation of neighborhood for each pixel. + Defaults to 2. + pairwise_color_thresh (float): The thresh of image color similarity. + Defaults to 0.3. 
+ bottom_pixels_removed (int): The length of removed pixels in bottom. + It is caused by the annotation error in coco dataset. + Defaults to 10. + """ + + def __init__(self, + *arg, + mask_stride: int = 4, + pairwise_size: int = 3, + pairwise_dilation: int = 2, + pairwise_color_thresh: float = 0.3, + bottom_pixels_removed: int = 10, + **kwargs) -> None: + super().__init__(*arg, **kwargs) + self.mask_stride = mask_stride + self.pairwise_size = pairwise_size + self.pairwise_dilation = pairwise_dilation + self.pairwise_color_thresh = pairwise_color_thresh + self.bottom_pixels_removed = bottom_pixels_removed + + if skimage is None: + raise RuntimeError('skimage is not installed,\ + please install it by: pip install scikit-image') + + def get_images_color_similarity(self, inputs: Tensor, + image_masks: Tensor) -> Tensor: + """Compute the image color similarity in LAB color space.""" + assert inputs.dim() == 4 + assert inputs.size(0) == 1 + + unfolded_images = unfold_wo_center( + inputs, + kernel_size=self.pairwise_size, + dilation=self.pairwise_dilation) + diff = inputs[:, :, None] - unfolded_images + similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) + + unfolded_weights = unfold_wo_center( + image_masks[None, None], + kernel_size=self.pairwise_size, + dilation=self.pairwise_dilation) + unfolded_weights = torch.max(unfolded_weights, dim=1)[0] + + return similarity * unfolded_weights + + def forward(self, data: dict, training: bool = False) -> dict: + """Get pseudo mask labels using color similarity.""" + det_data = super().forward(data, training) + inputs, data_samples = det_data['inputs'], det_data['data_samples'] + + if training: + # get image masks and remove bottom pixels + b_img_h, b_img_w = data_samples[0].batch_input_shape + img_masks = [] + for i in range(inputs.shape[0]): + img_h, img_w = data_samples[i].img_shape + img_mask = inputs.new_ones((img_h, img_w)) + pixels_removed = int(self.bottom_pixels_removed * + float(img_h) / float(b_img_h)) + if pixels_removed > 0: + img_mask[-pixels_removed:, :] = 0 + pad_w = b_img_w - img_w + pad_h = b_img_h - img_h + img_mask = F.pad(img_mask, (0, pad_w, 0, pad_h), 'constant', + 0.) + img_masks.append(img_mask) + img_masks = torch.stack(img_masks, dim=0) + start = int(self.mask_stride // 2) + img_masks = img_masks[:, start::self.mask_stride, + start::self.mask_stride] + + # Get origin rgb image for color similarity + ori_imgs = inputs * self.std + self.mean + downsampled_imgs = F.avg_pool2d( + ori_imgs.float(), + kernel_size=self.mask_stride, + stride=self.mask_stride, + padding=0) + + # Compute color similarity for pseudo mask generation + for im_i, data_sample in enumerate(data_samples): + # TODO: Support rgb2lab in mmengine? 
+ images_lab = skimage.color.rgb2lab( + downsampled_imgs[im_i].byte().permute(1, 2, + 0).cpu().numpy()) + images_lab = torch.as_tensor( + images_lab, device=ori_imgs.device, dtype=torch.float32) + images_lab = images_lab.permute(2, 0, 1)[None] + images_color_similarity = self.get_images_color_similarity( + images_lab, img_masks[im_i]) + pairwise_mask = (images_color_similarity >= + self.pairwise_color_thresh).float() + + per_im_bboxes = data_sample.gt_instances.bboxes + if per_im_bboxes.shape[0] > 0: + per_im_masks = [] + for per_box in per_im_bboxes: + mask_full = torch.zeros((b_img_h, b_img_w), + device=self.device).float() + mask_full[int(per_box[1]):int(per_box[3] + 1), + int(per_box[0]):int(per_box[2] + 1)] = 1.0 + per_im_masks.append(mask_full) + per_im_masks = torch.stack(per_im_masks, dim=0) + pairwise_masks = torch.cat( + [pairwise_mask for _ in range(per_im_bboxes.shape[0])], + dim=0) + else: + per_im_masks = torch.zeros((0, b_img_h, b_img_w)) + pairwise_masks = torch.zeros( + (0, self.pairwise_size**2 - 1, b_img_h, b_img_w)) + + # TODO: Support BitmapMasks with tensor? + data_sample.gt_instances.masks = BitmapMasks( + per_im_masks.cpu().numpy(), b_img_h, b_img_w) + data_sample.gt_instances.pairwise_masks = pairwise_masks + return {'inputs': inputs, 'data_samples': data_samples} diff --git a/mmdetection/mmdet/models/data_preprocessors/reid_data_preprocessor.py b/mmdetection/mmdet/models/data_preprocessors/reid_data_preprocessor.py new file mode 100644 index 0000000..3d0a1d4 --- /dev/null +++ b/mmdetection/mmdet/models/data_preprocessors/reid_data_preprocessor.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from numbers import Number +from typing import Optional, Sequence + +import torch +import torch.nn.functional as F +from mmengine.model import BaseDataPreprocessor, stack_batch + +from mmdet.registry import MODELS + +try: + import mmpretrain + from mmpretrain.models.utils.batch_augments import RandomBatchAugment + from mmpretrain.structures import (batch_label_to_onehot, cat_batch_labels, + tensor_split) +except ImportError: + mmpretrain = None + + +def stack_batch_scores(elements, device=None): + """Stack the ``score`` of a batch of :obj:`LabelData` to a tensor. + + Args: + elements (List[LabelData]): A batch of :obj`LabelData`. + device (torch.device, optional): The output device of the batch label. + Defaults to None. + Returns: + torch.Tensor: The stacked score tensor. + """ + item = elements[0] + if 'score' not in item._data_fields: + return None + + batch_score = torch.stack([element.score for element in elements]) + if device is not None: + batch_score = batch_score.to(device) + return batch_score + + +@MODELS.register_module() +class ReIDDataPreprocessor(BaseDataPreprocessor): + """Image pre-processor for classification tasks. + + Comparing with the :class:`mmengine.model.ImgDataPreprocessor`, + + 1. It won't do normalization if ``mean`` is not specified. + 2. It does normalization and color space conversion after stacking batch. + 3. It supports batch augmentations like mixup and cutmix. + + It provides the data pre-processing as follows + + - Collate and move data to the target device. + - Pad inputs to the maximum size of current batch with defined + ``pad_value``. The padding size can be divisible by a defined + ``pad_size_divisor`` + - Stack inputs to batch_inputs. + - Convert inputs from bgr to rgb if the shape of input is (3, H, W). + - Normalize image with defined std and mean. 
+ - Do batch augmentations like Mixup and Cutmix during training. + + Args: + mean (Sequence[Number], optional): The pixel mean of R, G, B channels. + Defaults to None. + std (Sequence[Number], optional): The pixel standard deviation of + R, G, B channels. Defaults to None. + pad_size_divisor (int): The size of padded image should be + divisible by ``pad_size_divisor``. Defaults to 1. + pad_value (Number): The padded pixel value. Defaults to 0. + to_rgb (bool): whether to convert image from BGR to RGB. + Defaults to False. + to_onehot (bool): Whether to generate one-hot format gt-labels and set + to data samples. Defaults to False. + num_classes (int, optional): The number of classes. Defaults to None. + batch_augments (dict, optional): The batch augmentations settings, + including "augments" and "probs". For more details, see + :class:`mmpretrain.models.RandomBatchAugment`. + """ + + def __init__(self, + mean: Sequence[Number] = None, + std: Sequence[Number] = None, + pad_size_divisor: int = 1, + pad_value: Number = 0, + to_rgb: bool = False, + to_onehot: bool = False, + num_classes: Optional[int] = None, + batch_augments: Optional[dict] = None): + if mmpretrain is None: + raise RuntimeError('Please run "pip install openmim" and ' + 'run "mim install mmpretrain" to ' + 'install mmpretrain first.') + super().__init__() + self.pad_size_divisor = pad_size_divisor + self.pad_value = pad_value + self.to_rgb = to_rgb + self.to_onehot = to_onehot + self.num_classes = num_classes + + if mean is not None: + assert std is not None, 'To enable the normalization in ' \ + 'preprocessing, please specify both `mean` and `std`.' + # Enable the normalization in preprocessing. + self._enable_normalize = True + self.register_buffer('mean', + torch.tensor(mean).view(-1, 1, 1), False) + self.register_buffer('std', + torch.tensor(std).view(-1, 1, 1), False) + else: + self._enable_normalize = False + + if batch_augments is not None: + self.batch_augments = RandomBatchAugment(**batch_augments) + if not self.to_onehot: + from mmengine.logging import MMLogger + MMLogger.get_current_instance().info( + 'Because batch augmentations are enabled, the data ' + 'preprocessor automatically enables the `to_onehot` ' + 'option to generate one-hot format labels.') + self.to_onehot = True + else: + self.batch_augments = None + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding, bgr2rgb conversion and batch + augmentation based on ``BaseDataPreprocessor``. + + Args: + data (dict): data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + inputs = self.cast_data(data['inputs']) + + if isinstance(inputs, torch.Tensor): + # The branch if use `default_collate` as the collate_fn in the + # dataloader. + + # ------ To RGB ------ + if self.to_rgb and inputs.size(1) == 3: + inputs = inputs.flip(1) + + # -- Normalization --- + inputs = inputs.float() + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + # ------ Padding ----- + if self.pad_size_divisor > 1: + h, w = inputs.shape[-2:] + + target_h = math.ceil( + h / self.pad_size_divisor) * self.pad_size_divisor + target_w = math.ceil( + w / self.pad_size_divisor) * self.pad_size_divisor + pad_h = target_h - h + pad_w = target_w - w + inputs = F.pad(inputs, (0, pad_w, 0, pad_h), 'constant', + self.pad_value) + else: + # The branch if use `pseudo_collate` as the collate_fn in the + # dataloader. 
+ + processed_inputs = [] + for input_ in inputs: + # ------ To RGB ------ + if self.to_rgb and input_.size(0) == 3: + input_ = input_.flip(0) + + # -- Normalization --- + input_ = input_.float() + if self._enable_normalize: + input_ = (input_ - self.mean) / self.std + + processed_inputs.append(input_) + # Combine padding and stack + inputs = stack_batch(processed_inputs, self.pad_size_divisor, + self.pad_value) + + data_samples = data.get('data_samples', None) + sample_item = data_samples[0] if data_samples is not None else None + if 'gt_label' in sample_item: + gt_labels = [sample.gt_label for sample in data_samples] + gt_labels_tensor = [gt_label.label for gt_label in gt_labels] + batch_label, label_indices = cat_batch_labels(gt_labels_tensor) + batch_label = batch_label.to(self.device) + + batch_score = stack_batch_scores(gt_labels, device=self.device) + if batch_score is None and self.to_onehot: + assert batch_label is not None, \ + 'Cannot generate onehot format labels because no labels.' + num_classes = self.num_classes or data_samples[0].get( + 'num_classes') + assert num_classes is not None, \ + 'Cannot generate one-hot format labels because not set ' \ + '`num_classes` in `data_preprocessor`.' + batch_score = batch_label_to_onehot(batch_label, label_indices, + num_classes) + + # ----- Batch Augmentations ---- + if training and self.batch_augments is not None: + inputs, batch_score = self.batch_augments(inputs, batch_score) + + # ----- scatter labels and scores to data samples --- + if batch_label is not None: + for sample, label in zip( + data_samples, tensor_split(batch_label, + label_indices)): + sample.set_gt_label(label) + if batch_score is not None: + for sample, score in zip(data_samples, batch_score): + sample.set_gt_score(score) + + return {'inputs': inputs, 'data_samples': data_samples} diff --git a/mmdetection/mmdet/models/data_preprocessors/track_data_preprocessor.py b/mmdetection/mmdet/models/data_preprocessors/track_data_preprocessor.py new file mode 100644 index 0000000..40a65b8 --- /dev/null +++ b/mmdetection/mmdet/models/data_preprocessors/track_data_preprocessor.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.model.utils import stack_batch + +from mmdet.models.utils.misc import samplelist_boxtype2tensor +from mmdet.registry import MODELS +from mmdet.structures import TrackDataSample +from mmdet.structures.mask import BitmapMasks +from .data_preprocessor import DetDataPreprocessor + + +@MODELS.register_module() +class TrackDataPreprocessor(DetDataPreprocessor): + """Image pre-processor for tracking tasks. + + Accepts the data sampled by the dataloader, and preprocesses + it into the format of the model input. ``TrackDataPreprocessor`` + provides the tracking data pre-processing as follows: + + - Collate and move data to the target device. + - Pad inputs to the maximum size of current batch with defined + ``pad_value``. The padding size can be divisible by a defined + ``pad_size_divisor`` + - Stack inputs to inputs. + - Convert inputs from bgr to rgb if the shape of input is (1, 3, H, W). + - Normalize image with defined std and mean. + - Do batch augmentations during training. + - Record the information of ``batch_input_shape`` and ``pad_shape``. + + Args: + mean (Sequence[Number], optional): The pixel mean of R, G, B + channels. Defaults to None. 
+ std (Sequence[Number], optional): The pixel standard deviation of + R, G, B channels. Defaults to None. + pad_size_divisor (int): The size of padded image should be + divisible by ``pad_size_divisor``. Defaults to 1. + pad_value (Number): The padded pixel value. Defaults to 0. + pad_mask (bool): Whether to pad instance masks. Defaults to False. + mask_pad_value (int): The padded pixel value for instance masks. + Defaults to 0. + bgr_to_rgb (bool): whether to convert image from BGR to RGB. + Defaults to False. + rgb_to_bgr (bool): whether to convert image from RGB to RGB. + Defaults to False. + use_det_processor: (bool): whether to use DetDataPreprocessor + in training phrase. This is mainly for some tracking models + fed into one image rather than a group of image in training. + Defaults to False. + . boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of + bboxes data to ``Tensor`` type. Defaults to True. + batch_augments (list[dict], optional): Batch-level augmentations + """ + + def __init__(self, + mean: Optional[Sequence[Union[float, int]]] = None, + std: Optional[Sequence[Union[float, int]]] = None, + use_det_processor: bool = False, + **kwargs): + super().__init__(mean=mean, std=std, **kwargs) + self.use_det_processor = use_det_processor + if mean is not None and not self.use_det_processor: + # overwrite the ``register_bufffer`` in ``ImgDataPreprocessor`` + # since the shape of ``mean`` and ``std`` in tracking tasks must be + # (T, C, H, W), which T is the temporal length of the video. + self.register_buffer('mean', + torch.tensor(mean).view(1, -1, 1, 1), False) + self.register_buffer('std', + torch.tensor(std).view(1, -1, 1, 1), False) + + def forward(self, data: dict, training: bool = False) -> Dict: + """Perform normalization,padding and bgr2rgb conversion based on + ``TrackDataPreprocessor``. + + Args: + data (dict): data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + Tuple[Dict[str, List[torch.Tensor]], OptSampleList]: Data in the + same format as the model input. + """ + if self.use_det_processor and training: + batch_pad_shape = self._get_pad_shape(data) + else: + batch_pad_shape = self._get_track_pad_shape(data) + + data = self.cast_data(data) + imgs, data_samples = data['inputs'], data['data_samples'] + + if self.use_det_processor and training: + assert imgs[0].dim() == 3, \ + 'Only support the 3 dims when use detpreprocessor in training' + if self._channel_conversion: + imgs = [_img[[2, 1, 0], ...] for _img in imgs] + # Convert to `float` + imgs = [_img.float() for _img in imgs] + if self._enable_normalize: + imgs = [(_img - self.mean) / self.std for _img in imgs] + inputs = stack_batch(imgs, self.pad_size_divisor, self.pad_value) + else: + assert imgs[0].dim() == 4, \ + 'Only support the 4 dims when use trackprocessor in training' + # The shape of imgs[0] is (T, C, H, W). + channel = imgs[0].size(1) + if self._channel_conversion and channel == 3: + imgs = [_img[:, [2, 1, 0], ...] for _img in imgs] + # change to `float` + imgs = [_img.float() for _img in imgs] + if self._enable_normalize: + imgs = [(_img - self.mean) / self.std for _img in imgs] + inputs = stack_track_batch(imgs, self.pad_size_divisor, + self.pad_value) + + if data_samples is not None: + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. 
+ batch_input_shape = tuple(inputs.size()[-2:]) + if self.use_det_processor and training: + for data_sample, pad_shape in zip(data_samples, + batch_pad_shape): + data_sample.set_metainfo({ + 'batch_input_shape': batch_input_shape, + 'pad_shape': pad_shape + }) + if self.boxtype2tensor: + samplelist_boxtype2tensor(data_samples) + if self.pad_mask: + self.pad_gt_masks(data_samples) + else: + for track_data_sample, pad_shapes in zip( + data_samples, batch_pad_shape): + for i in range(len(track_data_sample)): + det_data_sample = track_data_sample[i] + det_data_sample.set_metainfo({ + 'batch_input_shape': batch_input_shape, + 'pad_shape': pad_shapes[i] + }) + if self.pad_mask and training: + self.pad_track_gt_masks(data_samples) + + if training and self.batch_augments is not None: + for batch_aug in self.batch_augments: + if self.use_det_processor and training: + inputs, data_samples = batch_aug(inputs, data_samples) + else: + # we only support T==1 when using batch augments. + # Only yolox need batch_aug, and yolox can only process + # (N, C, H, W) shape. + # The shape of `inputs` is (N, T, C, H, W), hence, we use + # inputs[:, 0] to change the shape to (N, C, H, W). + assert inputs.size(1) == 1 and len( + data_samples[0] + ) == 1, 'Only support the number of sequence images equals to 1 when using batch augment.' # noqa: E501 + det_data_samples = [ + track_data_sample[0] + for track_data_sample in data_samples + ] + aug_inputs, aug_det_samples = batch_aug( + inputs[:, 0], det_data_samples) + inputs = aug_inputs.unsqueeze(1) + for track_data_sample, det_sample in zip( + data_samples, aug_det_samples): + track_data_sample.video_data_samples = [det_sample] + + # Note: inputs may contain large number of frames, so we must make + # sure that the mmeory is contiguous for stable forward + inputs = inputs.contiguous() + + return dict(inputs=inputs, data_samples=data_samples) + + def _get_track_pad_shape(self, data: dict) -> Dict[str, List]: + """Get the pad_shape of each image based on data and pad_size_divisor. + + Args: + data (dict): Data sampled from dataloader. + + Returns: + Dict[str, List]: The shape of padding. + """ + batch_pad_shape = dict() + batch_pad_shape = [] + for imgs in data['inputs']: + # The sequence images in one sample among a batch have the same + # original shape + pad_h = int(np.ceil(imgs.shape[-2] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int(np.ceil(imgs.shape[-1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_shapes = [(pad_h, pad_w)] * imgs.size(0) + batch_pad_shape.append(pad_shapes) + return batch_pad_shape + + def pad_track_gt_masks(self, + data_samples: Sequence[TrackDataSample]) -> None: + """Pad gt_masks to shape of batch_input_shape.""" + if 'masks' in data_samples[0][0].get('gt_instances', None): + for track_data_sample in data_samples: + for i in range(len(track_data_sample)): + det_data_sample = track_data_sample[i] + masks = det_data_sample.gt_instances.masks + # TODO: whether to use BitmapMasks + assert isinstance(masks, BitmapMasks) + batch_input_shape = det_data_sample.batch_input_shape + det_data_sample.gt_instances.masks = masks.pad( + batch_input_shape, pad_val=self.mask_pad_value) + + +def stack_track_batch(tensors: List[torch.Tensor], + pad_size_divisor: int = 0, + pad_value: Union[int, float] = 0) -> torch.Tensor: + """Stack multiple tensors to form a batch and pad the images to the max + shape use the right bottom padding mode in these images. 
If + ``pad_size_divisor > 0``, add padding to ensure the common height and width + is divisible by ``pad_size_divisor``. The difference between this function + and ``stack_batch`` in MMEngine is that this function can process batch + sequence images with shape (N, T, C, H, W). + + Args: + tensors (List[Tensor]): The input multiple tensors. each is a + TCHW 4D-tensor. T denotes the number of key/reference frames. + pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding + to ensure the common height and width is divisible by + ``pad_size_divisor``. This depends on the model, and many + models need a divisibility of 32. Defaults to 0 + pad_value (int, float): The padding value. Defaults to 0 + + Returns: + Tensor: The NTCHW 5D-tensor. N denotes the batch size. + """ + assert isinstance(tensors, list), \ + f'Expected input type to be list, but got {type(tensors)}' + assert len(set([tensor.ndim for tensor in tensors])) == 1, \ + f'Expected the dimensions of all tensors must be the same, ' \ + f'but got {[tensor.ndim for tensor in tensors]}' + assert tensors[0].ndim == 4, f'Expected tensor dimension to be 4, ' \ + f'but got {tensors[0].ndim}' + assert len(set([tensor.shape[0] for tensor in tensors])) == 1, \ + f'Expected the channels of all tensors must be the same, ' \ + f'but got {[tensor.shape[0] for tensor in tensors]}' + + tensor_sizes = [(tensor.shape[-2], tensor.shape[-1]) for tensor in tensors] + max_size = np.stack(tensor_sizes).max(0) + + if pad_size_divisor > 1: + # the last two dims are H,W, both subject to divisibility requirement + max_size = ( + max_size + + (pad_size_divisor - 1)) // pad_size_divisor * pad_size_divisor + + padded_samples = [] + for tensor in tensors: + padding_size = [ + 0, max_size[-1] - tensor.shape[-1], 0, + max_size[-2] - tensor.shape[-2] + ] + if sum(padding_size) == 0: + padded_samples.append(tensor) + else: + padded_samples.append(F.pad(tensor, padding_size, value=pad_value)) + + return torch.stack(padded_samples, dim=0) diff --git a/mmdetection/mmdet/models/dense_heads/__init__.py b/mmdetection/mmdet/models/dense_heads/__init__.py new file mode 100644 index 0000000..c9b55ec --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/__init__.py @@ -0,0 +1,72 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
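``stack_track_batch`` above pads a list of (T, C, H, W) clips to a common spatial size and stacks them into a single (N, T, C, H, W) tensor. A usage sketch, assuming the patched mmdet package is importable in the current environment:

import torch
from mmdet.models.data_preprocessors.track_data_preprocessor import \
    stack_track_batch

# Two clips with T=2 frames each but different spatial sizes.
clip_a = torch.rand(2, 3, 96, 120)   # (T, C, H, W)
clip_b = torch.rand(2, 3, 100, 132)

batch = stack_track_batch([clip_a, clip_b], pad_size_divisor=32, pad_value=0)
print(batch.shape)
# torch.Size([2, 2, 3, 128, 160]): H and W are padded (bottom/right) to the
# batch maximum and then rounded up to a multiple of pad_size_divisor.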
+from .anchor_free_head import AnchorFreeHead +from .anchor_head import AnchorHead +from .atss_head import ATSSHead +from .atss_vlfusion_head import ATSSVLFusionHead +from .autoassign_head import AutoAssignHead +from .boxinst_head import BoxInstBboxHead, BoxInstMaskHead +from .cascade_rpn_head import CascadeRPNHead, StageCascadeRPNHead +from .centernet_head import CenterNetHead +from .centernet_update_head import CenterNetUpdateHead +from .centripetal_head import CentripetalHead +from .condinst_head import CondInstBboxHead, CondInstMaskHead +from .conditional_detr_head import ConditionalDETRHead +from .corner_head import CornerHead +from .dab_detr_head import DABDETRHead +from .ddod_head import DDODHead +from .ddq_detr_head import DDQDETRHead +from .deformable_detr_head import DeformableDETRHead +from .detr_head import DETRHead +from .dino_head import DINOHead +from .embedding_rpn_head import EmbeddingRPNHead +from .fcos_head import FCOSHead +from .fovea_head import FoveaHead +from .free_anchor_retina_head import FreeAnchorRetinaHead +from .fsaf_head import FSAFHead +from .ga_retina_head import GARetinaHead +from .ga_rpn_head import GARPNHead +from .gfl_head import GFLHead +from .grounding_dino_head import GroundingDINOHead +from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead +from .lad_head import LADHead +from .ld_head import LDHead +from .mask2former_head import Mask2FormerHead +from .maskformer_head import MaskFormerHead +from .nasfcos_head import NASFCOSHead +from .paa_head import PAAHead +from .pisa_retinanet_head import PISARetinaHead +from .pisa_ssd_head import PISASSDHead +from .reppoints_head import RepPointsHead +from .retina_head import RetinaHead +from .retina_sepbn_head import RetinaSepBNHead +from .rpn_head import RPNHead +from .rtmdet_head import RTMDetHead, RTMDetSepBNHead +from .rtmdet_ins_head import RTMDetInsHead, RTMDetInsSepBNHead +from .sabl_retina_head import SABLRetinaHead +from .solo_head import DecoupledSOLOHead, DecoupledSOLOLightHead, SOLOHead +from .solov2_head import SOLOV2Head +from .ssd_head import SSDHead +from .tood_head import TOODHead +from .vfnet_head import VFNetHead +from .yolact_head import YOLACTHead, YOLACTProtonet +from .yolo_head import YOLOV3Head +from .yolof_head import YOLOFHead +from .yolox_head import YOLOXHead + +__all__ = [ + 'AnchorFreeHead', 'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption', + 'RPNHead', 'GARPNHead', 'RetinaHead', 'RetinaSepBNHead', 'GARetinaHead', + 'SSDHead', 'FCOSHead', 'RepPointsHead', 'FoveaHead', + 'FreeAnchorRetinaHead', 'ATSSHead', 'FSAFHead', 'NASFCOSHead', + 'PISARetinaHead', 'PISASSDHead', 'GFLHead', 'CornerHead', 'YOLACTHead', + 'YOLACTProtonet', 'YOLOV3Head', 'PAAHead', 'SABLRetinaHead', + 'CentripetalHead', 'VFNetHead', 'StageCascadeRPNHead', 'CascadeRPNHead', + 'EmbeddingRPNHead', 'LDHead', 'AutoAssignHead', 'DETRHead', 'YOLOFHead', + 'DeformableDETRHead', 'CenterNetHead', 'YOLOXHead', 'SOLOHead', + 'DecoupledSOLOHead', 'DecoupledSOLOLightHead', 'SOLOV2Head', 'LADHead', + 'TOODHead', 'MaskFormerHead', 'Mask2FormerHead', 'DDODHead', + 'CenterNetUpdateHead', 'RTMDetHead', 'RTMDetSepBNHead', 'CondInstBboxHead', + 'CondInstMaskHead', 'RTMDetInsHead', 'RTMDetInsSepBNHead', + 'BoxInstBboxHead', 'BoxInstMaskHead', 'ConditionalDETRHead', 'DINOHead', + 'ATSSVLFusionHead', 'DABDETRHead', 'DDQDETRHead', 'GroundingDINOHead' +] diff --git a/mmdetection/mmdet/models/dense_heads/anchor_free_head.py b/mmdetection/mmdet/models/dense_heads/anchor_free_head.py new file mode 100644 index 0000000..90a9b36 --- 
/dev/null +++ b/mmdetection/mmdet/models/dense_heads/anchor_free_head.py @@ -0,0 +1,317 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from typing import Any, List, Sequence, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from numpy import ndarray +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList) +from ..task_modules.prior_generators import MlvlPointGenerator +from ..utils import multi_apply +from .base_dense_head import BaseDenseHead + +StrideType = Union[Sequence[int], Sequence[Tuple[int, int]]] + + +@MODELS.register_module() +class AnchorFreeHead(BaseDenseHead): + """Anchor-free head (FCOS, Fovea, RepPoints, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + stacked_convs (int): Number of stacking convs of the head. + strides (Sequence[int] or Sequence[Tuple[int, int]]): Downsample + factor of each feature map. + dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Default: "auto". + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. Defaults + 'DistancePointBBoxCoder'. + conv_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for + normalization layer. Defaults to None. + train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of + anchor-free head. + test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of + anchor-free head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. 
+ """ # noqa: W605 + + _version = 1 + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + strides: StrideType = (4, 8, 16, 32, 64), + dcn_on_last_conv: bool = False, + conv_bias: Union[bool, str] = 'auto', + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox: ConfigType = dict(type='IoULoss', loss_weight=1.0), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', name='conv_cls', std=0.01, bias_prob=0.01)) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.bbox_coder = TASK_UTILS.build(bbox_coder) + + self.prior_generator = MlvlPointGenerator(strides) + + # In order to keep a more general interface and be consistent with + # anchor_head. We can think of point like one anchor + self.num_base_priors = self.prior_generator.num_base_priors[0] + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self._init_cls_convs() + self._init_reg_convs() + self._init_predictor() + + def _init_cls_convs(self) -> None: + """Initialize classification conv layers of the head.""" + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_reg_convs(self) -> None: + """Initialize bbox regression conv layers of the head.""" + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_predictor(self) -> None: + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> 
None: + """Hack some keys of the model state dict so that can load checkpoints + of previous version.""" + version = local_metadata.get('version', None) + if version is None: + # the key is different in early versions + # for example, 'fcos_cls' become 'conv_cls' now + bbox_head_keys = [ + k for k in state_dict.keys() if k.startswith(prefix) + ] + ori_predictor_keys = [] + new_predictor_keys = [] + # e.g. 'fcos_cls' or 'fcos_reg' + for key in bbox_head_keys: + ori_predictor_keys.append(key) + key = key.split('.') + if len(key) < 2: + conv_name = None + elif key[1].endswith('cls'): + conv_name = 'conv_cls' + elif key[1].endswith('reg'): + conv_name = 'conv_reg' + elif key[1].endswith('centerness'): + conv_name = 'conv_centerness' + else: + conv_name = None + if conv_name is not None: + key[1] = conv_name + new_predictor_keys.append('.'.join(key)) + else: + ori_predictor_keys.pop(-1) + for i in range(len(new_predictor_keys)): + state_dict[new_predictor_keys[i]] = state_dict.pop( + ori_predictor_keys[i]) + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually contain classification scores and bbox predictions. + + - cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is \ + num_points * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for each scale \ + level, each is a 4D-tensor, the channel number is num_points * 4. + """ + return multi_apply(self.forward_single, x)[:2] + + def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + + Returns: + tuple: Scores for each class, bbox predictions, features + after classification and regression conv layers, some + models needs these features like FCOS. + """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + return cls_score, bbox_pred, cls_feat, reg_feat + + @abstractmethod + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. 
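The version check in ``_load_from_state_dict`` above renames predictor keys coming from legacy checkpoints (e.g. ``fcos_cls`` to ``conv_cls``). A simplified standalone rendering of that renaming, for illustration only; the real method edits the state dict in place and only when the checkpoint carries no version metadata, and ``'bbox_head.'`` below is a hypothetical prefix.

# Toy sketch of the legacy predictor-key renaming.
def rename_legacy_keys(state_dict, prefix='bbox_head.'):
    renamed = {}
    for key, value in state_dict.items():
        parts = key.split('.')
        if key.startswith(prefix) and len(parts) >= 2:
            if parts[1].endswith('cls'):
                parts[1] = 'conv_cls'
            elif parts[1].endswith('reg'):
                parts[1] = 'conv_reg'
            elif parts[1].endswith('centerness'):
                parts[1] = 'conv_centerness'
        renamed['.'.join(parts)] = value
    return renamed

old = {'bbox_head.fcos_cls.weight': 0, 'bbox_head.fcos_reg.bias': 1}
print(rename_legacy_keys(old))
# {'bbox_head.conv_cls.weight': 0, 'bbox_head.conv_reg.bias': 1}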
+ """ + + raise NotImplementedError + + @abstractmethod + def get_targets(self, points: List[Tensor], + batch_gt_instances: InstanceList) -> Any: + """Compute regression, classification and centerness targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + """ + raise NotImplementedError + + # TODO refactor aug_test + def aug_test(self, + aug_batch_feats: List[Tensor], + aug_batch_img_metas: List[List[Tensor]], + rescale: bool = False) -> List[ndarray]: + """Test function with test time augmentation. + + Args: + aug_batch_feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + aug_batch_img_metas (list[list[dict]]): the outer list indicates + test-time augs (multiscale, flip, etc.) and the inner list + indicates images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[ndarray]: bbox results of each class + """ + return self.aug_test_bboxes( + aug_batch_feats, aug_batch_img_metas, rescale=rescale) diff --git a/mmdetection/mmdet/models/dense_heads/anchor_head.py b/mmdetection/mmdet/models/dense_heads/anchor_head.py new file mode 100644 index 0000000..4578cac --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/anchor_head.py @@ -0,0 +1,530 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import BaseBoxes, cat_boxes, get_box_tensor +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from ..task_modules.prior_generators import (AnchorGenerator, + anchor_inside_flags) +from ..task_modules.samplers import PseudoSampler +from ..utils import images_to_levels, multi_apply, unmap +from .base_dense_head import BaseDenseHead + + +@MODELS.register_module() +class AnchorHead(BaseDenseHead): + """Anchor-based head (RPN, RetinaNet, SSD, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + anchor_generator (dict): Config dict for anchor generator + bbox_coder (dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Default False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + train_cfg (dict): Training config of anchor head. + test_cfg (dict): Testing config of anchor head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ # noqa: W605 + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + anchor_generator: ConfigType = dict( + type='AnchorGenerator', + scales=[8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0)), + reg_decoded_bbox: bool = False, + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = dict( + type='Normal', layer='Conv2d', std=0.01) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + if self.cls_out_channels <= 0: + raise ValueError(f'num_classes={num_classes} is too small') + self.reg_decoded_bbox = reg_decoded_bbox + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self.fp16_enabled = False + + self.prior_generator = TASK_UTILS.build(anchor_generator) + + # Usually the numbers of anchors for each level are the same + # except SSD detectors. So it is an int in the most dense + # heads but a list of int in SSDHead + self.num_base_priors = self.prior_generator.num_base_priors[0] + self._init_layers() + + @property + def num_anchors(self) -> int: + warnings.warn('DeprecationWarning: `num_anchors` is deprecated, ' + 'for consistency or also use ' + '`num_base_priors` instead') + return self.prior_generator.num_base_priors[0] + + @property + def anchor_generator(self) -> AnchorGenerator: + warnings.warn('DeprecationWarning: anchor_generator is deprecated, ' + 'please use "prior_generator" instead') + return self.prior_generator + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.conv_cls = nn.Conv2d(self.in_channels, + self.num_base_priors * self.cls_out_channels, + 1) + reg_dim = self.bbox_coder.encode_size + self.conv_reg = nn.Conv2d(self.in_channels, + self.num_base_priors * reg_dim, 1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_base_priors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_base_priors * 4. + """ + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + return cls_score, bbox_pred + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_scores (list[Tensor]): Classification scores for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_base_priors * 4. + """ + return multi_apply(self.forward_single, x) + + def get_anchors(self, + featmap_sizes: List[tuple], + batch_img_metas: List[dict], + device: Union[torch.device, str] = 'cuda') \ + -> Tuple[List[List[Tensor]], List[List[Tensor]]]: + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + device (torch.device | str): Device for returned tensors. + Defaults to cuda. + + Returns: + tuple: + + - anchor_list (list[list[Tensor]]): Anchors of each image. + - valid_flag_list (list[list[Tensor]]): Valid flags of each + image. + """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.prior_generator.grid_priors( + featmap_sizes, device=device) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = self.prior_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device) + valid_flag_list.append(multi_level_flags) + + return anchor_list, valid_flag_list + + def _get_targets_single(self, + flat_anchors: Union[Tensor, BaseBoxes], + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in a + single image. + + Args: + flat_anchors (Tensor or :obj:`BaseBoxes`): Multi-level anchors + of the image, which are concatenated into a single tensor + or box type of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors, ). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: + + - labels (Tensor): Labels of each level. + - label_weights (Tensor): Label weights of each level. + - bbox_targets (Tensor): BBox targets of each level. + - bbox_weights (Tensor): BBox weights of each level. + - pos_inds (Tensor): positive samples indexes. + - neg_inds (Tensor): negative samples indexes. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. 
Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags] + + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + # No sampling is required except for RPN and + # Guided Anchoring algorithms + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + target_dim = gt_instances.bboxes.size(-1) if self.reg_decoded_bbox \ + else self.bbox_coder.encode_size + bbox_targets = anchors.new_zeros(num_valid_anchors, target_dim) + bbox_weights = anchors.new_zeros(num_valid_anchors, target_dim) + + # TODO: Considering saving memory, is it necessary to be long? + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + # `bbox_coder.encode` accepts tensor or box type inputs and generates + # tensor targets. If regressing decoded boxes, the code will convert + # box type `pos_bbox_targets` to tensor. + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + else: + pos_bbox_targets = sampling_result.pos_gt_bboxes + pos_bbox_targets = get_box_tensor(pos_bbox_targets) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result) + + def get_targets(self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True, + return_sampling_results: bool = False) -> tuple: + """Compute regression and classification targets for anchors in + multiple images. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + return_sampling_results (bool): Whether to return the sampling + results. Defaults to False. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - bbox_weights_list (list[Tensor]): BBox weights of each level. + - avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated after the end + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors to a single tensor + concat_anchor_list = [] + concat_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + concat_anchor_list.append(cat_boxes(anchor_list[i])) + concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + results = multi_apply( + self._get_targets_single, + concat_anchor_list, + concat_valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + pos_inds_list, neg_inds_list, sampling_results_list) = results[:7] + rest_results = list(results[7:]) # user-added return values + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # update `_raw_positive_infos`, which will be used when calling + # `get_positive_infos`. + self._raw_positive_infos.update(sampling_results=sampling_results_list) + # split targets to a list w.r.t. 
multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + res = (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) + if return_sampling_results: + res = res + (sampling_results_list, ) + for i, r in enumerate(rest_results): # user-added return values + rest_results[i] = images_to_levels(r, num_level_anchors) + + return res + tuple(rest_results) + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchors: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, avg_factor: int) -> tuple: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor + weight shape (N, num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (N, num_total_anchors, 4). + avg_factor (int): Average factor that is used to average the loss. + + Returns: + tuple: loss components. + """ + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + # regression loss + target_dim = bbox_targets.size(-1) + bbox_targets = bbox_targets.reshape(-1, target_dim) + bbox_weights = bbox_weights.reshape(-1, target_dim) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, + self.bbox_coder.encode_size) + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + anchors = anchors.reshape(-1, anchors.size(-1)) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + bbox_pred = get_box_tensor(bbox_pred) + loss_bbox = self.loss_bbox( + bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) + return loss_cls, loss_bbox + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor) = cls_reg_targets + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(cat_boxes(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) diff --git a/mmdetection/mmdet/models/dense_heads/atss_head.py b/mmdetection/mmdet/models/dense_heads/atss_head.py new file mode 100644 index 0000000..2ce71b3 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/atss_head.py @@ -0,0 +1,524 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import anchor_inside_flags +from ..utils import images_to_levels, multi_apply, unmap +from .anchor_head import AnchorHead + + +@MODELS.register_module() +class ATSSHead(AnchorHead): + """Detection Head of `ATSS `_. + + ATSS head structure is similar with FCOS, however ATSS use anchor boxes + and assign label by Adaptive Training Sample Selection instead max-iou. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + pred_kernel_size (int): Kernel size of ``nn.Conv2d`` + stacked_convs (int): Number of stacking convs of the head. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='GN', num_groups=32, + requires_grad=True)``. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Defaults to False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + loss_centerness (:obj:`ConfigDict` or dict): Config of centerness loss. + Defaults to ``dict(type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0)``. 
+ init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + pred_kernel_size: int = 3, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + reg_decoded_bbox: bool = True, + loss_centerness: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='atss_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.pred_kernel_size = pred_kernel_size + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + reg_decoded_bbox=reg_decoded_bbox, + init_cfg=init_cfg, + **kwargs) + + self.sampling = False + self.loss_centerness = MODELS.build(loss_centerness) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + pred_pad_size = self.pred_kernel_size // 2 + self.atss_cls = nn.Conv2d( + self.feat_channels, + self.num_anchors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size) + self.atss_reg = nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size) + self.atss_centerness = nn.Conv2d( + self.feat_channels, + self.num_base_priors * 1, + self.pred_kernel_size, + padding=pred_pad_size) + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + return multi_apply(self.forward_single, x, self.scales) + + def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale + level, the channels number is num_anchors * 4. + centerness (Tensor): Centerness for a single scale level, the + channel number is (N, num_anchors * 1, H, W). 
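+
+        Example:
+            A rough shape check (hypothetical sizes, not taken from any
+            config in this repo; ``feat_channels=32`` is chosen only so the
+            default ``GN`` norm with 32 groups stays valid):
+
+            >>> import torch
+            >>> head = ATSSHead(num_classes=4, in_channels=32,
+            ...                 feat_channels=32, stacked_convs=1)
+            >>> x = torch.rand(2, 32, 16, 16)
+            >>> cls_score, bbox_pred, centerness = head.forward_single(
+            ...     x, head.scales[0])
+            >>> assert centerness.shape == (2, head.num_base_priors, 16, 16)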
+ """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.atss_cls(cls_feat) + # we just follow atss, not apply exp in bbox_pred + bbox_pred = scale(self.atss_reg(reg_feat)).float() + centerness = self.atss_centerness(reg_feat) + return cls_score, bbox_pred, centerness + + def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, centerness: Tensor, + labels: Tensor, label_weights: Tensor, + bbox_targets: Tensor, avg_factor: float) -> dict: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + avg_factor (float): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + centerness = centerness.permute(0, 2, 3, 1).reshape(-1) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # classification loss + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_centerness = centerness[pos_inds] + + centerness_targets = self.centerness_target( + pos_anchors, pos_bbox_targets) + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchors, pos_bbox_pred) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_bbox_targets, + weight=centerness_targets, + avg_factor=1.0) + + # centerness loss + loss_centerness = self.loss_centerness( + pos_centerness, centerness_targets, avg_factor=avg_factor) + + else: + loss_bbox = bbox_pred.sum() * 0 + loss_centerness = centerness.sum() * 0 + centerness_targets = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum() + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + centernesses (list[Tensor]): Centerness for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + losses_cls, losses_bbox, loss_centerness, \ + bbox_avg_factor = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + centernesses, + labels_list, + label_weights_list, + bbox_targets_list, + avg_factor=avg_factor) + + bbox_avg_factor = sum(bbox_avg_factor) + bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_centerness=loss_centerness) + + def centerness_target(self, anchors: Tensor, gts: Tensor) -> Tensor: + """Calculate the centerness between anchors and gts. + + Only calculate pos centerness targets, otherwise there may be nan. + + Args: + anchors (Tensor): Anchors with shape (N, 4), "xyxy" format. + gts (Tensor): Ground truth bboxes with shape (N, 4), "xyxy" format. + + Returns: + Tensor: Centerness between anchors and gts. + """ + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + l_ = anchors_cx - gts[:, 0] + t_ = anchors_cy - gts[:, 1] + r_ = gts[:, 2] - anchors_cx + b_ = gts[:, 3] - anchors_cy + + left_right = torch.stack([l_, r_], dim=1) + top_bottom = torch.stack([t_, b_], dim=1) + centerness = torch.sqrt( + (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * + (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])) + assert not torch.isnan(centerness).any() + return centerness + + def get_targets(self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Get targets for ATSS head. + + This method is almost the same as `AnchorHead.get_targets()`. Besides + returning the targets as the parent method does, it also returns the + anchors as the first element of the returned tuple. 
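+
+        Returns:
+            tuple: ``(anchors_list, labels_list, label_weights_list,
+            bbox_targets_list, bbox_weights_list, avg_factor)``, where every
+            ``*_list`` is a list with one tensor per feature level (see the
+            return statement below).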
+ """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + num_level_anchors: List[int], + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors (List[int]): Number of anchors of each scale + level. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + labels (Tensor): Labels of all anchors in the image with shape + (N,). + label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4) + pos_inds (Tensor): Indices of positive anchor with shape + (num_pos,). 
+ neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, + num_level_anchors_inside, + gt_instances, gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if self.reg_decoded_bbox: + pos_bbox_targets = sampling_result.pos_gt_bboxes + else: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds, sampling_result) + + def get_num_level_anchors_inside(self, num_level_anchors, inside_flags): + """Get the number of valid anchors in every level.""" + + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside diff --git a/mmdetection/mmdet/models/dense_heads/atss_vlfusion_head.py b/mmdetection/mmdet/models/dense_heads/atss_vlfusion_head.py new file mode 100644 index 0000000..c5cd28b --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/atss_vlfusion_head.py @@ -0,0 +1,949 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
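+# This file implements ``ATSSVLFusionHead``: an ATSS head whose
+# classification branch scores anchors against language (text token)
+# features instead of a fixed class set. The fusion itself happens in
+# ``VLFusionModule``, a DyHead-style tower (``DyConv`` / ``DyReLU``) with
+# optional early cross-modality fusion via ``VLFuse`` and
+# ``BertEncoderLayer``.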
+import copy +import math +from typing import Callable, List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Scale +from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d +from mmengine.config import ConfigDict +from mmengine.model import BaseModel +from mmengine.structures import InstanceData +from torch import Tensor + +try: + from transformers import BertConfig +except ImportError: + BertConfig = None + +from mmdet.registry import MODELS +from mmdet.structures.bbox import cat_boxes +from mmdet.utils import InstanceList, OptInstanceList, reduce_mean +from ..utils import (BertEncoderLayer, VLFuse, filter_scores_and_topk, + permute_and_flatten, select_single_mlvl, + unpack_gt_instances) +from ..utils.vlfuse_helper import MAX_CLAMP_VALUE +from .atss_head import ATSSHead + + +def convert_grounding_to_cls_scores(logits: Tensor, + positive_maps: List[dict]) -> Tensor: + """Convert logits to class scores.""" + assert len(positive_maps) == logits.shape[0] # batch size + + scores = torch.zeros(logits.shape[0], logits.shape[1], + len(positive_maps[0])).to(logits.device) + if positive_maps is not None: + if all(x == positive_maps[0] for x in positive_maps): + # only need to compute once + positive_map = positive_maps[0] + for label_j in positive_map: + scores[:, :, label_j - + 1] = logits[:, :, + torch.LongTensor(positive_map[label_j] + )].mean(-1) + else: + for i, positive_map in enumerate(positive_maps): + for label_j in positive_map: + scores[i, :, label_j - 1] = logits[ + i, :, torch.LongTensor(positive_map[label_j])].mean(-1) + return scores + + +class Conv3x3Norm(nn.Module): + """Conv3x3 and norm.""" + + def __init__(self, + in_channels: int, + out_channels: int, + stride: int, + groups: int = 1, + use_dcn: bool = False, + norm_type: Optional[Union[Sequence, str]] = None): + super().__init__() + + if use_dcn: + self.conv = ModulatedDeformConv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=groups) + else: + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=groups) + + if isinstance(norm_type, Sequence): + assert len(norm_type) == 2 + assert norm_type[0] == 'gn' + gn_group = norm_type[1] + norm_type = norm_type[0] + + if norm_type == 'bn': + bn_op = nn.BatchNorm2d(out_channels) + elif norm_type == 'gn': + bn_op = nn.GroupNorm( + num_groups=gn_group, num_channels=out_channels) + if norm_type is not None: + self.bn = bn_op + else: + self.bn = None + + def forward(self, x, **kwargs): + x = self.conv(x, **kwargs) + if self.bn: + x = self.bn(x) + return x + + +class DyReLU(nn.Module): + """Dynamic ReLU.""" + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: int = 4): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.expand_ratio = expand_ratio + self.out_channels = out_channels + + self.fc = nn.Sequential( + nn.Linear(in_channels, in_channels // expand_ratio), + nn.ReLU(inplace=True), + nn.Linear(in_channels // expand_ratio, + out_channels * self.expand_ratio), + nn.Hardsigmoid(inplace=True)) + + def forward(self, x) -> Tensor: + x_out = x + b, c, h, w = x.size() + x = self.avg_pool(x).view(b, c) + x = self.fc(x).view(b, -1, 1, 1) + + a1, b1, a2, b2 = torch.split(x, self.out_channels, dim=1) + a1 = (a1 - 0.5) * 2 + 1.0 + a2 = (a2 - 0.5) * 2 + b1 = b1 - 0.5 + b2 = b2 - 0.5 + out = torch.max(x_out * a1 + b1, x_out * a2 + b2) + return out + + +class 
DyConv(nn.Module): + """Dynamic Convolution.""" + + def __init__(self, + conv_func: Callable, + in_channels: int, + out_channels: int, + use_dyfuse: bool = True, + use_dyrelu: bool = False, + use_dcn: bool = False): + super().__init__() + + self.dyconvs = nn.ModuleList() + self.dyconvs.append(conv_func(in_channels, out_channels, 1)) + self.dyconvs.append(conv_func(in_channels, out_channels, 1)) + self.dyconvs.append(conv_func(in_channels, out_channels, 2)) + + if use_dyfuse: + self.attnconv = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(in_channels, 1, kernel_size=1), + nn.ReLU(inplace=True)) + self.h_sigmoid = nn.Hardsigmoid(inplace=True) + else: + self.attnconv = None + + if use_dyrelu: + self.relu = DyReLU(in_channels, out_channels) + else: + self.relu = nn.ReLU() + + if use_dcn: + self.offset = nn.Conv2d( + in_channels, 27, kernel_size=3, stride=1, padding=1) + else: + self.offset = None + + self.init_weights() + + def init_weights(self): + for m in self.dyconvs.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight.data, 0, 0.01) + if m.bias is not None: + m.bias.data.zero_() + if self.attnconv is not None: + for m in self.attnconv.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight.data, 0, 0.01) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, inputs: dict) -> dict: + visual_feats = inputs['visual'] + + out_vis_feats = [] + for level, feature in enumerate(visual_feats): + + offset_conv_args = {} + if self.offset is not None: + offset_mask = self.offset(feature) + offset = offset_mask[:, :18, :, :] + mask = offset_mask[:, 18:, :, :].sigmoid() + offset_conv_args = dict(offset=offset, mask=mask) + + temp_feats = [self.dyconvs[1](feature, **offset_conv_args)] + + if level > 0: + temp_feats.append(self.dyconvs[2](visual_feats[level - 1], + **offset_conv_args)) + if level < len(visual_feats) - 1: + temp_feats.append( + F.upsample_bilinear( + self.dyconvs[0](visual_feats[level + 1], + **offset_conv_args), + size=[feature.size(2), + feature.size(3)])) + mean_feats = torch.mean( + torch.stack(temp_feats), dim=0, keepdim=False) + + if self.attnconv is not None: + attn_feat = [] + res_feat = [] + for feat in temp_feats: + res_feat.append(feat) + attn_feat.append(self.attnconv(feat)) + + res_feat = torch.stack(res_feat) + spa_pyr_attn = self.h_sigmoid(torch.stack(attn_feat)) + + mean_feats = torch.mean( + res_feat * spa_pyr_attn, dim=0, keepdim=False) + + out_vis_feats.append(mean_feats) + + out_vis_feats = [self.relu(item) for item in out_vis_feats] + + features_dict = {'visual': out_vis_feats, 'lang': inputs['lang']} + + return features_dict + + +class VLFusionModule(BaseModel): + """Visual-lang Fusion Module.""" + + def __init__(self, + in_channels: int, + feat_channels: int, + num_base_priors: int, + early_fuse: bool = False, + num_dyhead_blocks: int = 6, + lang_model_name: str = 'bert-base-uncased', + use_dyrelu: bool = True, + use_dyfuse: bool = True, + use_dcn: bool = True, + use_checkpoint: bool = False, + **kwargs) -> None: + super().__init__(**kwargs) + if BertConfig is None: + raise RuntimeError( + 'transformers is not installed, please install it by: ' + 'pip install transformers.') + self.in_channels = in_channels + self.feat_channels = feat_channels + self.num_base_priors = num_base_priors + self.early_fuse = early_fuse + self.num_dyhead_blocks = num_dyhead_blocks + self.use_dyrelu = use_dyrelu + self.use_dyfuse = use_dyfuse + self.use_dcn = use_dcn + self.use_checkpoint = use_checkpoint + + self.lang_cfg = 
BertConfig.from_pretrained(lang_model_name) + self.lang_dim = self.lang_cfg.hidden_size + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the model.""" + bias_value = -math.log((1 - 0.01) / 0.01) + + dyhead_tower = [] + for i in range(self.num_dyhead_blocks): + if self.early_fuse: + # cross-modality fusion + dyhead_tower.append(VLFuse(use_checkpoint=self.use_checkpoint)) + # lang branch + dyhead_tower.append( + BertEncoderLayer( + self.lang_cfg, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True)) + + # vision branch + dyhead_tower.append( + DyConv( + lambda i, o, s: Conv3x3Norm( + i, o, s, use_dcn=self.use_dcn, norm_type=['gn', 16]), + self.in_channels if i == 0 else self.feat_channels, + self.feat_channels, + use_dyrelu=(self.use_dyrelu + and self.in_channels == self.feat_channels) + if i == 0 else self.use_dyrelu, + use_dyfuse=(self.use_dyfuse + and self.in_channels == self.feat_channels) + if i == 0 else self.use_dyfuse, + use_dcn=(self.use_dcn + and self.in_channels == self.feat_channels) + if i == 0 else self.use_dcn, + )) + + self.add_module('dyhead_tower', nn.Sequential(*dyhead_tower)) + + self.bbox_pred = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, kernel_size=1) + self.centerness = nn.Conv2d( + self.feat_channels, self.num_base_priors * 1, kernel_size=1) + self.dot_product_projection_text = nn.Linear( + self.lang_dim, + self.num_base_priors * self.feat_channels, + bias=True) + self.log_scale = nn.Parameter(torch.Tensor([0.0]), requires_grad=True) + self.bias_lang = nn.Parameter( + torch.zeros(self.lang_dim), requires_grad=True) + self.bias0 = nn.Parameter( + torch.Tensor([bias_value]), requires_grad=True) + self.scales = nn.ModuleList([Scale(1.0) for _ in range(5)]) + + def forward(self, visual_feats: Tuple[Tensor], + language_feats: dict) -> Tuple: + feat_inputs = {'visual': visual_feats, 'lang': language_feats} + dyhead_tower = self.dyhead_tower(feat_inputs) + + if self.early_fuse: + embedding = dyhead_tower['lang']['hidden'] + else: + embedding = language_feats['embedded'] + + embedding = F.normalize(embedding, p=2, dim=-1) + dot_product_proj_tokens = self.dot_product_projection_text(embedding / + 2.0) + dot_product_proj_tokens_bias = torch.matmul( + embedding, self.bias_lang) + self.bias0 + + bbox_preds = [] + centerness = [] + cls_logits = [] + + for i, feature in enumerate(visual_feats): + visual = dyhead_tower['visual'][i] + B, C, H, W = visual.shape + + bbox_pred = self.scales[i](self.bbox_pred(visual)) + bbox_preds.append(bbox_pred) + centerness.append(self.centerness(visual)) + + dot_product_proj_queries = permute_and_flatten( + visual, B, self.num_base_priors, C, H, W) + + bias = dot_product_proj_tokens_bias.unsqueeze(1).repeat( + 1, self.num_base_priors, 1) + dot_product_logit = ( + torch.matmul(dot_product_proj_queries, + dot_product_proj_tokens.transpose(-1, -2)) / + self.log_scale.exp()) + bias + dot_product_logit = torch.clamp( + dot_product_logit, max=MAX_CLAMP_VALUE) + dot_product_logit = torch.clamp( + dot_product_logit, min=-MAX_CLAMP_VALUE) + cls_logits.append(dot_product_logit) + + return bbox_preds, centerness, cls_logits + + +@MODELS.register_module() +class ATSSVLFusionHead(ATSSHead): + """ATSS head with visual-language fusion module. + + Args: + early_fuse (bool): Whether to fuse visual and language features + Defaults to False. + use_checkpoint (bool): Whether to use checkpoint. Defaults to False. + num_dyhead_blocks (int): Number of dynamic head blocks. Defaults to 6. 
+ lang_model_name (str): Name of the language model. + Defaults to 'bert-base-uncased'. + """ + + def __init__(self, + *args, + early_fuse: bool = False, + use_checkpoint: bool = False, + num_dyhead_blocks: int = 6, + lang_model_name: str = 'bert-base-uncased', + init_cfg=None, + **kwargs): + super().__init__(*args, **kwargs, init_cfg=init_cfg) + self.head = VLFusionModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + num_base_priors=self.num_base_priors, + early_fuse=early_fuse, + use_checkpoint=use_checkpoint, + num_dyhead_blocks=num_dyhead_blocks, + lang_model_name=lang_model_name) + self.text_masks = None + + def _init_layers(self) -> None: + """No need to initialize the ATSS head layer.""" + pass + + def forward(self, visual_feats: Tuple[Tensor], + language_feats: dict) -> Tuple[Tensor]: + """Forward function.""" + bbox_preds, centerness, cls_logits = self.head(visual_feats, + language_feats) + return cls_logits, bbox_preds, centerness + + def loss(self, visual_feats: Tuple[Tensor], language_feats: dict, + batch_data_samples): + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + outs = self(visual_feats, language_feats) + self.text_masks = language_feats['masks'] + loss_inputs = outs + (batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + centernesses (list[Tensor]): Centerness for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
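+
+        Note:
+            Unlike :meth:`ATSSHead.loss_by_feat`, the classification targets
+            here are token-level maps rather than single class indices, so
+            the per-level predictions are concatenated first and the padded
+            text positions are masked out with ``self.text_masks`` inside
+            ``self._loss_by_feat``.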
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + anchors = torch.cat(anchor_list, dim=1) + labels = torch.cat(labels_list, dim=1) + label_weights = torch.cat(label_weights_list, dim=1) + bbox_targets = torch.cat(bbox_targets_list, dim=1) + cls_scores = torch.cat(cls_scores, dim=1) + + centernesses_ = [] + bbox_preds_ = [] + for bbox_pred, centerness in zip(bbox_preds, centernesses): + centernesses_.append( + centerness.permute(0, 2, 3, + 1).reshape(cls_scores.size(0), -1, 1)) + bbox_preds_.append( + bbox_pred.permute(0, 2, 3, + 1).reshape(cls_scores.size(0), -1, 4)) + bbox_preds = torch.cat(bbox_preds_, dim=1) + centernesses = torch.cat(centernesses_, dim=1) + + losses_cls, losses_bbox, loss_centerness, bbox_avg_factor = \ + self._loss_by_feat( + anchors, + cls_scores, + bbox_preds, + centernesses, + labels, + label_weights, + bbox_targets, + avg_factor=avg_factor) + + bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item() + losses_bbox = losses_bbox / bbox_avg_factor + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_centerness=loss_centerness) + + def _loss_by_feat(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, centerness: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + avg_factor: float) -> dict: + """Calculate the loss of all scale level based on the features + extracted by the detection head. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + anchors = anchors.reshape(-1, 4) + + # ===== this change ===== + pos_inds = (labels.sum(-1) > 0).reshape(-1) + + # Loss is not computed for the padded regions of the text. 
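+        # ``self.text_masks`` has shape (batch, num_tokens); it is expanded
+        # over the anchor dimension so that ``masked_select`` keeps only the
+        # logits / targets belonging to real (non-padded) text tokens.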
+ assert (self.text_masks.dim() == 2) + text_mask = (self.text_masks > 0).unsqueeze(1) + text_mask = text_mask.repeat(1, cls_score.size(1), 1) + cls_score = torch.masked_select(cls_score, text_mask).contiguous() + labels = torch.masked_select(labels, text_mask) + label_weights = label_weights[..., + None].repeat(1, 1, text_mask.size(-1)) + label_weights = torch.masked_select(label_weights, text_mask) + + bbox_pred = bbox_pred.reshape(-1, 4) + centerness = centerness.reshape(-1) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # classification loss + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + + if pos_inds.sum() > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_centerness = centerness[pos_inds] + + centerness_targets = self.centerness_target( + pos_anchors, pos_bbox_targets) + + if torch.isnan(centerness_targets).any(): + print('=====Centerness includes NaN=====') + mask = ~torch.isnan(centerness_targets) + centerness_targets = centerness_targets[mask] + pos_centerness = pos_centerness[mask] + pos_anchors = pos_anchors[mask] + pos_bbox_targets = pos_bbox_targets[mask] + pos_bbox_pred = pos_bbox_pred[mask] + + if pos_bbox_targets.shape[0] == 0: + loss_bbox = bbox_pred.sum() * 0 + loss_centerness = centerness.sum() * 0 + centerness_targets = bbox_targets.new_tensor(0.) + return loss_cls, loss_bbox, loss_centerness, \ + centerness_targets.sum() + + # The decoding process takes the offset into consideration. + pos_anchors[:, 2:] += 1 + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchors, pos_bbox_pred) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_bbox_targets, + weight=centerness_targets, + avg_factor=1.0) + + # centerness loss + loss_centerness = self.loss_centerness( + pos_centerness, centerness_targets, avg_factor=avg_factor) + else: + loss_bbox = bbox_pred.sum() * 0 + loss_centerness = centerness.sum() * 0 + centerness_targets = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum() + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + num_level_anchors: List[int], + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors (List[int]): Number of anchors of each scale + level. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + labels (Tensor): Labels of all anchors in the image with shape + (N,). 
+ label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4) + pos_inds (Tensor): Indices of positive anchor with shape + (num_pos,). + neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + sampling_result (:obj:`SamplingResult`): Sampling results. + """ + anchors = flat_anchors + # Align the official implementation + anchors[:, 2:] -= 1 + + num_level_anchors_inside = num_level_anchors + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, + num_level_anchors_inside, + gt_instances, gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + + # ===== this change ===== + labels = anchors.new_full((num_valid_anchors, self.feat_channels), + 0, + dtype=torch.float32) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if self.reg_decoded_bbox: + pos_bbox_targets = sampling_result.pos_gt_bboxes + else: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + # ===== this change ===== + labels[pos_inds] = gt_instances.positive_maps[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds, sampling_result) + + def centerness_target(self, anchors: Tensor, gts: Tensor) -> Tensor: + """Calculate the centerness between anchors and gts. + + Only calculate pos centerness targets, otherwise there may be nan. + + Args: + anchors (Tensor): Anchors with shape (N, 4), "xyxy" format. + gts (Tensor): Ground truth bboxes with shape (N, 4), "xyxy" format. + + Returns: + Tensor: Centerness between anchors and gts. + """ + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + l_ = anchors_cx - gts[:, 0] + t_ = anchors_cy - gts[:, 1] + r_ = gts[:, 2] - anchors_cx + b_ = gts[:, 3] - anchors_cy + + left_right = torch.stack([l_, r_], dim=1) + top_bottom = torch.stack([t_, b_], dim=1) + centerness = torch.sqrt( + (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * + (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])) + # assert not torch.isnan(centerness).any() + return centerness + + def predict(self, + visual_feats: Tuple[Tensor], + language_feats: dict, + batch_data_samples, + rescale: bool = True): + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + + Args: + visual_feats (tuple[Tensor]): Multi-level visual features from the + upstream network, each is a 4D-tensor. + language_feats (dict): Language features from the upstream network. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. 
+ rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + batch_token_positive_maps = [ + data_samples.token_positive_map + for data_samples in batch_data_samples + ] + outs = self(visual_feats, language_feats) + + predictions = self.predict_by_feat( + *outs, + batch_img_metas=batch_img_metas, + batch_token_positive_maps=batch_token_positive_maps, + rescale=rescale) + return predictions + + def predict_by_feat(self, + cls_logits: List[Tensor], + bbox_preds: List[Tensor], + score_factors: List[Tensor], + batch_img_metas: Optional[List[dict]] = None, + batch_token_positive_maps: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_logits (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + batch_token_positive_maps (list[dict], Optional): Batch token + positive map. Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
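+
+        Note:
+            The returned ``labels`` index the entries of each image's
+            ``token_positive_maps`` (category ``j`` in the map becomes label
+            ``j - 1``, see ``convert_grounding_to_cls_scores``), not a fixed
+            dataset class list.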
+ """ + assert len(bbox_preds) == len(score_factors) + num_levels = len(bbox_preds) + + featmap_sizes = [bbox_preds[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + + result_list = [] + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + token_positive_maps = batch_token_positive_maps[img_id] + bbox_pred_list = select_single_mlvl( + bbox_preds, img_id, detach=True) + score_factor_list = select_single_mlvl( + score_factors, img_id, detach=True) + cls_logit_list = select_single_mlvl( + cls_logits, img_id, detach=True) + + results = self._predict_by_feat_single( + bbox_pred_list=bbox_pred_list, + score_factor_list=score_factor_list, + cls_logit_list=cls_logit_list, + mlvl_priors=mlvl_priors, + token_positive_maps=token_positive_maps, + img_meta=img_meta, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + cls_logit_list: List[Tensor], + mlvl_priors: List[Tensor], + token_positive_maps: dict, + img_meta: dict, + cfg: ConfigDict, + rescale: bool = True, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + cls_logit_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + token_positive_maps (dict): Token positive map. + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + score_thr = cfg.get('score_thr', 0) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_labels = [] + + for level_idx, (bbox_pred, score_factor, cls_logit, priors) in \ + enumerate(zip(bbox_pred_list, + score_factor_list, cls_logit_list, mlvl_priors)): + bbox_pred = bbox_pred.permute(1, 2, 0).reshape( + -1, self.bbox_coder.encode_size) + score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid() + + scores = convert_grounding_to_cls_scores( + logits=cls_logit.sigmoid()[None], + positive_maps=[token_positive_maps])[0] + + results = filter_scores_and_topk( + scores, score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + score_factor = score_factor[keep_idxs] + scores = torch.sqrt(scores * score_factor) + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_priors) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + predictions = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + if len(predictions) > 0: + # Note: GLIP adopts a very strange bbox decoder logic, + # and if 1 is not added here, it will not align with + # the official mAP. + predictions.bboxes[:, 2:] = predictions.bboxes[:, 2:] + 1 + return predictions diff --git a/mmdetection/mmdet/models/dense_heads/autoassign_head.py b/mmdetection/mmdet/models/dense_heads/autoassign_head.py new file mode 100644 index 0000000..a2b30ff --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/autoassign_head.py @@ -0,0 +1,524 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Sequence, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Scale +from mmengine.model import bias_init_with_prob, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import InstanceList, OptInstanceList, reduce_mean +from ..task_modules.prior_generators import MlvlPointGenerator +from ..utils import levels_to_images, multi_apply +from .fcos_head import FCOSHead + +EPS = 1e-12 + + +class CenterPrior(nn.Module): + """Center Weighting module to adjust the category-specific prior + distributions. + + Args: + force_topk (bool): When no point falls into gt_bbox, forcibly + select the k points closest to the center to calculate + the center prior. Defaults to False. + topk (int): The number of points used to calculate the + center prior when no point falls in gt_bbox. Only work when + force_topk if True. Defaults to 9. + num_classes (int): The class number of dataset. Defaults to 80. + strides (Sequence[int]): The stride of each input feature map. + Defaults to (8, 16, 32, 64, 128). 
+ """ + + def __init__( + self, + force_topk: bool = False, + topk: int = 9, + num_classes: int = 80, + strides: Sequence[int] = (8, 16, 32, 64, 128) + ) -> None: + super().__init__() + self.mean = nn.Parameter(torch.zeros(num_classes, 2)) + self.sigma = nn.Parameter(torch.ones(num_classes, 2)) + self.strides = strides + self.force_topk = force_topk + self.topk = topk + + def forward(self, anchor_points_list: List[Tensor], + gt_instances: InstanceData, + inside_gt_bbox_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Get the center prior of each point on the feature map for each + instance. + + Args: + anchor_points_list (list[Tensor]): list of coordinate + of points on feature map. Each with shape + (num_points, 2). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + inside_gt_bbox_mask (Tensor): Tensor of bool type, + with shape of (num_points, num_gt), each + value is used to mark whether this point falls + within a certain gt. + + Returns: + tuple[Tensor, Tensor]: + + - center_prior_weights(Tensor): Float tensor with shape of \ + (num_points, num_gt). Each value represents the center \ + weighting coefficient. + - inside_gt_bbox_mask (Tensor): Tensor of bool type, with shape \ + of (num_points, num_gt), each value is used to mark whether this \ + point falls within a certain gt or is the topk nearest points for \ + a specific gt_bbox. + """ + gt_bboxes = gt_instances.bboxes + labels = gt_instances.labels + + inside_gt_bbox_mask = inside_gt_bbox_mask.clone() + num_gts = len(labels) + num_points = sum([len(item) for item in anchor_points_list]) + if num_gts == 0: + return gt_bboxes.new_zeros(num_points, + num_gts), inside_gt_bbox_mask + center_prior_list = [] + for slvl_points, stride in zip(anchor_points_list, self.strides): + # slvl_points: points from single level in FPN, has shape (h*w, 2) + # single_level_points has shape (h*w, num_gt, 2) + single_level_points = slvl_points[:, None, :].expand( + (slvl_points.size(0), len(gt_bboxes), 2)) + gt_center_x = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2) + gt_center_y = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2) + gt_center = torch.stack((gt_center_x, gt_center_y), dim=1) + gt_center = gt_center[None] + # instance_center has shape (1, num_gt, 2) + instance_center = self.mean[labels][None] + # instance_sigma has shape (1, num_gt, 2) + instance_sigma = self.sigma[labels][None] + # distance has shape (num_points, num_gt, 2) + distance = (((single_level_points - gt_center) / float(stride) - + instance_center)**2) + center_prior = torch.exp(-distance / + (2 * instance_sigma**2)).prod(dim=-1) + center_prior_list.append(center_prior) + center_prior_weights = torch.cat(center_prior_list, dim=0) + + if self.force_topk: + gt_inds_no_points_inside = torch.nonzero( + inside_gt_bbox_mask.sum(0) == 0).reshape(-1) + if gt_inds_no_points_inside.numel(): + topk_center_index = \ + center_prior_weights[:, gt_inds_no_points_inside].topk( + self.topk, + dim=0)[1] + temp_mask = inside_gt_bbox_mask[:, gt_inds_no_points_inside] + inside_gt_bbox_mask[:, gt_inds_no_points_inside] = \ + torch.scatter(temp_mask, + dim=0, + index=topk_center_index, + src=torch.ones_like( + topk_center_index, + dtype=torch.bool)) + + center_prior_weights[~inside_gt_bbox_mask] = 0 + return center_prior_weights, inside_gt_bbox_mask + + +@MODELS.register_module() +class AutoAssignHead(FCOSHead): + """AutoAssignHead head used in AutoAssign. + + More details can be found in the `paper + `_ . 
+ + Args: + force_topk (bool): Used in center prior initialization to + handle extremely small gt. Default is False. + topk (int): The number of points used to calculate the + center prior when no point falls in gt_bbox. Only work when + force_topk if True. Defaults to 9. + pos_loss_weight (float): The loss weight of positive loss + and with default value 0.25. + neg_loss_weight (float): The loss weight of negative loss + and with default value 0.75. + center_loss_weight (float): The loss weight of center prior + loss and with default value 0.75. + """ + + def __init__(self, + *args, + force_topk: bool = False, + topk: int = 9, + pos_loss_weight: float = 0.25, + neg_loss_weight: float = 0.75, + center_loss_weight: float = 0.75, + **kwargs) -> None: + super().__init__(*args, conv_bias=True, **kwargs) + self.center_prior = CenterPrior( + force_topk=force_topk, + topk=topk, + num_classes=self.num_classes, + strides=self.strides) + self.pos_loss_weight = pos_loss_weight + self.neg_loss_weight = neg_loss_weight + self.center_loss_weight = center_loss_weight + self.prior_generator = MlvlPointGenerator(self.strides, offset=0) + + def init_weights(self) -> None: + """Initialize weights of the head. + + In particular, we have special initialization for classified conv's and + regression conv's bias + """ + + super(AutoAssignHead, self).init_weights() + bias_cls = bias_init_with_prob(0.02) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + normal_init(self.conv_reg, std=0.01, bias=4.0) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple[Tensor, Tensor, Tensor]: scores for each class, bbox + predictions and centerness predictions of input feature maps. + """ + cls_score, bbox_pred, cls_feat, reg_feat = super( + FCOSHead, self).forward_single(x) + centerness = self.conv_centerness(reg_feat) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + bbox_pred *= stride + return cls_score, bbox_pred, centerness + + def get_pos_loss_single(self, cls_score: Tensor, objectness: Tensor, + reg_loss: Tensor, gt_instances: InstanceData, + center_prior_weights: Tensor) -> Tuple[Tensor]: + """Calculate the positive loss of all points in gt_bboxes. + + Args: + cls_score (Tensor): All category scores for each point on + the feature map. The shape is (num_points, num_class). + objectness (Tensor): Foreground probability of all points, + has shape (num_points, 1). + reg_loss (Tensor): The regression loss of each gt_bbox and each + prediction box, has shape of (num_points, num_gt). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + center_prior_weights (Tensor): Float tensor with shape + of (num_points, num_gt). Each value represents + the center weighting coefficient. 
+ + Returns: + tuple[Tensor]: + + - pos_loss (Tensor): The positive loss of all points in the \ + gt_bboxes. + """ + gt_labels = gt_instances.labels + # p_loc: localization confidence + p_loc = torch.exp(-reg_loss) + # p_cls: classification confidence + p_cls = (cls_score * objectness)[:, gt_labels] + # p_pos: joint confidence indicator + p_pos = p_cls * p_loc + + # 3 is a hyper-parameter to control the contributions of high and + # low confidence locations towards positive losses. + confidence_weight = torch.exp(p_pos * 3) + p_pos_weight = (confidence_weight * center_prior_weights) / ( + (confidence_weight * center_prior_weights).sum( + 0, keepdim=True)).clamp(min=EPS) + reweighted_p_pos = (p_pos * p_pos_weight).sum(0) + pos_loss = F.binary_cross_entropy( + reweighted_p_pos, + torch.ones_like(reweighted_p_pos), + reduction='none') + pos_loss = pos_loss.sum() * self.pos_loss_weight + return pos_loss, + + def get_neg_loss_single(self, cls_score: Tensor, objectness: Tensor, + gt_instances: InstanceData, ious: Tensor, + inside_gt_bbox_mask: Tensor) -> Tuple[Tensor]: + """Calculate the negative loss of all points in feature map. + + Args: + cls_score (Tensor): All category scores for each point on + the feature map. The shape is (num_points, num_class). + objectness (Tensor): Foreground probability of all points + and is shape of (num_points, 1). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + ious (Tensor): Float tensor with shape of (num_points, num_gt). + Each value represent the iou of pred_bbox and gt_bboxes. + inside_gt_bbox_mask (Tensor): Tensor of bool type, + with shape of (num_points, num_gt), each + value is used to mark whether this point falls + within a certain gt. + + Returns: + tuple[Tensor]: + + - neg_loss (Tensor): The negative loss of all points in the \ + feature map. + """ + gt_labels = gt_instances.labels + num_gts = len(gt_labels) + joint_conf = (cls_score * objectness) + p_neg_weight = torch.ones_like(joint_conf) + if num_gts > 0: + # the order of dinmension would affect the value of + # p_neg_weight, we strictly follow the original + # implementation. + inside_gt_bbox_mask = inside_gt_bbox_mask.permute(1, 0) + ious = ious.permute(1, 0) + + foreground_idxs = torch.nonzero(inside_gt_bbox_mask, as_tuple=True) + temp_weight = (1 / (1 - ious[foreground_idxs]).clamp_(EPS)) + + def normalize(x): + return (x - x.min() + EPS) / (x.max() - x.min() + EPS) + + for instance_idx in range(num_gts): + idxs = foreground_idxs[0] == instance_idx + if idxs.any(): + temp_weight[idxs] = normalize(temp_weight[idxs]) + + p_neg_weight[foreground_idxs[1], + gt_labels[foreground_idxs[0]]] = 1 - temp_weight + + logits = (joint_conf * p_neg_weight) + neg_loss = ( + logits**2 * F.binary_cross_entropy( + logits, torch.zeros_like(logits), reduction='none')) + neg_loss = neg_loss.sum() * self.neg_loss_weight + return neg_loss, + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + objectnesses (list[Tensor]): objectness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + assert len(cls_scores) == len(bbox_preds) == len(objectnesses) + all_num_gt = sum([len(item) for item in batch_gt_instances]) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + inside_gt_bbox_mask_list, bbox_targets_list = self.get_targets( + all_level_points, batch_gt_instances) + + center_prior_weight_list = [] + temp_inside_gt_bbox_mask_list = [] + for gt_instances, inside_gt_bbox_mask in zip(batch_gt_instances, + inside_gt_bbox_mask_list): + center_prior_weight, inside_gt_bbox_mask = \ + self.center_prior(all_level_points, gt_instances, + inside_gt_bbox_mask) + center_prior_weight_list.append(center_prior_weight) + temp_inside_gt_bbox_mask_list.append(inside_gt_bbox_mask) + inside_gt_bbox_mask_list = temp_inside_gt_bbox_mask_list + mlvl_points = torch.cat(all_level_points, dim=0) + bbox_preds = levels_to_images(bbox_preds) + cls_scores = levels_to_images(cls_scores) + objectnesses = levels_to_images(objectnesses) + + reg_loss_list = [] + ious_list = [] + num_points = len(mlvl_points) + + for bbox_pred, encoded_targets, inside_gt_bbox_mask in zip( + bbox_preds, bbox_targets_list, inside_gt_bbox_mask_list): + temp_num_gt = encoded_targets.size(1) + expand_mlvl_points = mlvl_points[:, None, :].expand( + num_points, temp_num_gt, 2).reshape(-1, 2) + encoded_targets = encoded_targets.reshape(-1, 4) + expand_bbox_pred = bbox_pred[:, None, :].expand( + num_points, temp_num_gt, 4).reshape(-1, 4) + decoded_bbox_preds = self.bbox_coder.decode( + expand_mlvl_points, expand_bbox_pred) + decoded_target_preds = self.bbox_coder.decode( + expand_mlvl_points, encoded_targets) + with torch.no_grad(): + ious = bbox_overlaps( + decoded_bbox_preds, decoded_target_preds, is_aligned=True) + ious = ious.reshape(num_points, temp_num_gt) + if temp_num_gt: + ious = ious.max( + dim=-1, keepdim=True).values.repeat(1, temp_num_gt) + else: + ious = ious.new_zeros(num_points, temp_num_gt) + ious[~inside_gt_bbox_mask] = 0 + ious_list.append(ious) + loss_bbox = self.loss_bbox( + decoded_bbox_preds, + decoded_target_preds, + weight=None, + reduction_override='none') + reg_loss_list.append(loss_bbox.reshape(num_points, temp_num_gt)) + + cls_scores = [item.sigmoid() for item in cls_scores] + objectnesses = [item.sigmoid() for item in objectnesses] + pos_loss_list, = multi_apply(self.get_pos_loss_single, cls_scores, + objectnesses, reg_loss_list, + batch_gt_instances, + center_prior_weight_list) + pos_avg_factor = reduce_mean( + bbox_pred.new_tensor(all_num_gt)).clamp_(min=1) + pos_loss = sum(pos_loss_list) / pos_avg_factor + + neg_loss_list, = multi_apply(self.get_neg_loss_single, cls_scores, + objectnesses, 
batch_gt_instances, + ious_list, inside_gt_bbox_mask_list) + neg_avg_factor = sum(item.data.sum() + for item in center_prior_weight_list) + neg_avg_factor = reduce_mean(neg_avg_factor).clamp_(min=1) + neg_loss = sum(neg_loss_list) / neg_avg_factor + + center_loss = [] + for i in range(len(batch_img_metas)): + + if inside_gt_bbox_mask_list[i].any(): + center_loss.append( + len(batch_gt_instances[i]) / + center_prior_weight_list[i].sum().clamp_(min=EPS)) + # when width or height of gt_bbox is smaller than stride of p3 + else: + center_loss.append(center_prior_weight_list[i].sum() * 0) + + center_loss = torch.stack(center_loss).mean() * self.center_loss_weight + + # avoid dead lock in DDP + if all_num_gt == 0: + pos_loss = bbox_preds[0].sum() * 0 + dummy_center_prior_loss = self.center_prior.mean.sum( + ) * 0 + self.center_prior.sigma.sum() * 0 + center_loss = objectnesses[0].sum() * 0 + dummy_center_prior_loss + + loss = dict( + loss_pos=pos_loss, loss_neg=neg_loss, loss_center=center_loss) + + return loss + + def get_targets( + self, points: List[Tensor], batch_gt_instances: InstanceList + ) -> Tuple[List[Tensor], List[Tensor]]: + """Compute regression targets and each point inside or outside gt_bbox + in multiple images. + + Args: + points (list[Tensor]): Points of all fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple(list[Tensor], list[Tensor]): + + - inside_gt_bbox_mask_list (list[Tensor]): Each Tensor is with \ + bool type and shape of (num_points, num_gt), each value is used \ + to mark whether this point falls within a certain gt. + - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. Each tensor has shape (num_points, num_gt, 4). + """ + + concat_points = torch.cat(points, dim=0) + # the number of points per img, per lvl + inside_gt_bbox_mask_list, bbox_targets_list = multi_apply( + self._get_targets_single, batch_gt_instances, points=concat_points) + return inside_gt_bbox_mask_list, bbox_targets_list + + def _get_targets_single(self, gt_instances: InstanceData, + points: Tensor) -> Tuple[Tensor, Tensor]: + """Compute regression targets and each point inside or outside gt_bbox + for a single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + points (Tensor): Points of all fpn level, has shape + (num_points, 2). + + Returns: + tuple[Tensor, Tensor]: Containing the following Tensors: + + - inside_gt_bbox_mask (Tensor): Bool tensor with shape \ + (num_points, num_gt), each value is used to mark whether this \ + point falls within a certain gt. + - bbox_targets (Tensor): BBox targets of each points with each \ + gt_bboxes, has shape (num_points, num_gt, 4). 
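+        Example:
+            An editor's illustrative sketch (not from the original code) of
+            the (left, top, right, bottom) target for one point and one gt
+            box; the coordinates are assumed:
+
+            >>> import torch
+            >>> point = torch.tensor([50., 40.])
+            >>> gt_bbox = torch.tensor([20., 10., 80., 90.])  # x1, y1, x2, y2
+            >>> left, top = point[0] - gt_bbox[0], point[1] - gt_bbox[1]
+            >>> right, bottom = gt_bbox[2] - point[0], gt_bbox[3] - point[1]
+            >>> torch.stack((left, top, right, bottom))
+            tensor([30., 30., 30., 50.])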
+ """ + gt_bboxes = gt_instances.bboxes + num_points = points.size(0) + num_gts = gt_bboxes.size(0) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None] + ys = ys[:, None] + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + if num_gts: + inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 + else: + inside_gt_bbox_mask = bbox_targets.new_zeros((num_points, num_gts), + dtype=torch.bool) + + return inside_gt_bbox_mask, bbox_targets diff --git a/mmdetection/mmdet/models/dense_heads/base_dense_head.py b/mmdetection/mmdet/models/dense_heads/base_dense_head.py new file mode 100644 index 0000000..d0a4469 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/base_dense_head.py @@ -0,0 +1,583 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod +from inspect import signature +from typing import List, Optional, Tuple + +import torch +from mmcv.ops import batched_nms +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, constant_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.structures import SampleList +from mmdet.structures.bbox import (cat_boxes, get_box_tensor, get_box_wh, + scale_boxes) +from mmdet.utils import InstanceList, OptMultiConfig +from ..test_time_augs import merge_aug_results +from ..utils import (filter_scores_and_topk, select_single_mlvl, + unpack_gt_instances) + + +class BaseDenseHead(BaseModule, metaclass=ABCMeta): + """Base class for DenseHeads. + + 1. The ``init_weights`` method is used to initialize densehead's + model parameters. After detector initialization, ``init_weights`` + is triggered when ``detector.init_weights()`` is called externally. + + 2. The ``loss`` method is used to calculate the loss of densehead, + which includes two steps: (1) the densehead model performs forward + propagation to obtain the feature maps (2) The ``loss_by_feat`` method + is called based on the feature maps to calculate the loss. + + .. code:: text + + loss(): forward() -> loss_by_feat() + + 3. The ``predict`` method is used to predict detection results, + which includes two steps: (1) the densehead model performs forward + propagation to obtain the feature maps (2) The ``predict_by_feat`` method + is called based on the feature maps to predict detection results including + post-processing. + + .. code:: text + + predict(): forward() -> predict_by_feat() + + 4. The ``loss_and_predict`` method is used to return loss and detection + results at the same time. It will call densehead's ``forward``, + ``loss_by_feat`` and ``predict_by_feat`` methods in order. If one-stage is + used as RPN, the densehead needs to return both losses and predictions. + This predictions is used as the proposal of roihead. + + .. code:: text + + loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat() + """ + + def __init__(self, init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + # `_raw_positive_infos` will be used in `get_positive_infos`, which + # can get positive information. 
+        self._raw_positive_infos = dict()
+
+    def init_weights(self) -> None:
+        """Initialize the weights."""
+        super().init_weights()
+        # avoid init_cfg overwriting the initialization of `conv_offset`
+        for m in self.modules():
+            # DeformConv2dPack, ModulatedDeformConv2dPack
+            if hasattr(m, 'conv_offset'):
+                constant_init(m.conv_offset, 0)
+
+    def get_positive_infos(self) -> InstanceList:
+        """Get positive information from sampling results.
+
+        Returns:
+            list[:obj:`InstanceData`]: Positive information of each image,
+            usually including positive bboxes, positive labels, positive
+            priors, etc.
+        """
+        if len(self._raw_positive_infos) == 0:
+            return None
+
+        sampling_results = self._raw_positive_infos.get(
+            'sampling_results', None)
+        assert sampling_results is not None
+        positive_infos = []
+        for sampling_result in sampling_results:
+            pos_info = InstanceData()
+            pos_info.bboxes = sampling_result.pos_gt_bboxes
+            pos_info.labels = sampling_result.pos_gt_labels
+            pos_info.priors = sampling_result.pos_priors
+            pos_info.pos_assigned_gt_inds = \
+                sampling_result.pos_assigned_gt_inds
+            pos_info.pos_inds = sampling_result.pos_inds
+            positive_infos.append(pos_info)
+        return positive_infos
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        outs = self(x)
+
+        outputs = unpack_gt_instances(batch_data_samples)
+        (batch_gt_instances, batch_gt_instances_ignore,
+         batch_img_metas) = outputs
+
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    @abstractmethod
+    def loss_by_feat(self, **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head."""
+        pass
+
+    def loss_and_predict(
+        self,
+        x: Tuple[Tensor],
+        batch_data_samples: SampleList,
+        proposal_cfg: Optional[ConfigDict] = None
+    ) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+            proposal_cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple: the return value is a tuple containing:
+
+            - losses (dict[str, Tensor]): A dictionary of loss components.
+            - predictions (list[:obj:`InstanceData`]): Detection
+              results of each image after the post process.
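+        Example:
+            An editor's illustrative, non-executable sketch (``rpn_head``,
+            ``roi_head``, ``feats`` and ``proposal_cfg`` are assumed names)
+            of how a dense head used as an RPN returns both losses and
+            proposals for the second stage::
+
+                losses, proposals = rpn_head.loss_and_predict(
+                    feats, batch_data_samples, proposal_cfg=proposal_cfg)
+                roi_losses = roi_head.loss(feats, proposals,
+                                           batch_data_samples)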
+ """ + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + outs = self(x) + + loss_inputs = outs + (batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + losses = self.loss_by_feat(*loss_inputs) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg) + return losses, predictions + + def predict(self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + outs = self(x) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + return predictions + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + score_factors: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + + if score_factors is None: + # e.g. Retina, FreeAnchor, Foveabox, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, AutoAssign, etc. 
+ with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + + result_list = [] + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + cls_score_list = select_single_mlvl( + cls_scores, img_id, detach=True) + bbox_pred_list = select_single_mlvl( + bbox_preds, img_id, detach=True) + if with_score_factors: + score_factor_list = select_single_mlvl( + score_factors, img_id, detach=True) + else: + score_factor_list = [None for _ in range(num_levels)] + + results = self._predict_by_feat_single( + cls_score_list=cls_score_list, + bbox_pred_list=bbox_pred_list, + score_factor_list=score_factor_list, + mlvl_priors=mlvl_priors, + img_meta=img_meta, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. 
+ with_score_factors = True + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_labels = [] + if with_score_factors: + mlvl_score_factors = [] + else: + mlvl_score_factors = None + for level_idx, (cls_score, bbox_pred, score_factor, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + score_factor_list, mlvl_priors)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) + if with_score_factors: + score_factor = score_factor.permute(1, 2, + 0).reshape(-1).sigmoid() + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + + # the `custom_cls_channels` parameter is derived from + # CrossEntropyCustomLoss and FocalCustomLoss, and is currently used + # in v3det. + if getattr(self.loss_cls, 'custom_cls_channels', False): + scores = self.loss_cls.get_activation(cls_score) + elif self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = cls_score.softmax(-1)[:, :-1] + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + score_thr = cfg.get('score_thr', 0) + + results = filter_scores_and_topk( + scores, score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + + if with_score_factors: + score_factor = score_factor[keep_idxs] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + if with_score_factors: + mlvl_score_factors.append(score_factor) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_priors) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + if with_score_factors: + results.score_factors = torch.cat(mlvl_score_factors) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def _bbox_post_process(self, + results: InstanceData, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True, + img_meta: Optional[dict] = None) -> InstanceData: + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). + cfg (ConfigDict): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default to False. + with_nms (bool): If True, do nms before return boxes. + Default to True. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. 
+ + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + if rescale: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + if hasattr(results, 'score_factors'): + # TODO: Add sqrt operation in order to be consistent with + # the paper. + score_factors = results.pop('score_factors') + results.scores = results.scores * score_factors + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg + if with_nms and results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + + return results + + def aug_test(self, + aug_batch_feats, + aug_batch_img_metas, + rescale=False, + with_ori_nms=False, + **kwargs): + """Test function with test time augmentation. + + Args: + aug_batch_feats (list[tuple[Tensor]]): The outer list + indicates test-time augmentations and inner tuple + indicate the multi-level feats from + FPN, each Tensor should have a shape (B, C, H, W), + aug_batch_img_metas (list[list[dict]]): Meta information + of images under the different test-time augs + (multiscale, flip, etc.). The outer list indicate + the + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + with_ori_nms (bool): Whether execute the nms in original head. + Defaults to False. It will be `True` when the head is + adopted as `rpn_head`. + + Returns: + list(obj:`InstanceData`): Detection results of the + input images. Each item usually contains\ + following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance,) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances,). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + # TODO: remove this for detr and deformdetr + sig_of_get_results = signature(self.get_results) + get_results_args = [ + p.name for p in sig_of_get_results.parameters.values() + ] + get_results_single_sig = signature(self._get_results_single) + get_results_single_sig_args = [ + p.name for p in get_results_single_sig.parameters.values() + ] + assert ('with_nms' in get_results_args) and \ + ('with_nms' in get_results_single_sig_args), \ + f'{self.__class__.__name__}' \ + 'does not support test-time augmentation ' + + num_imgs = len(aug_batch_img_metas[0]) + aug_batch_results = [] + for x, img_metas in zip(aug_batch_feats, aug_batch_img_metas): + outs = self.forward(x) + batch_instance_results = self.get_results( + *outs, + img_metas=img_metas, + cfg=self.test_cfg, + rescale=False, + with_nms=with_ori_nms, + **kwargs) + aug_batch_results.append(batch_instance_results) + + # after merging, bboxes will be rescaled to the original image + batch_results = merge_aug_results(aug_batch_results, + aug_batch_img_metas) + + final_results = [] + for img_id in range(num_imgs): + results = batch_results[img_id] + det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores, + results.labels, + self.test_cfg.nms) + results = results[keep_idxs] + # some nms operation may reweight the score such as softnms + results.scores = det_bboxes[:, -1] + results = results[:self.test_cfg.max_per_img] + if rescale: + # all results have been mapped to the original scale + # in `merge_aug_results`, so just pass + pass + else: + # map to the first aug image scale + scale_factor = results.bboxes.new_tensor( + aug_batch_img_metas[0][img_id]['scale_factor']) + results.bboxes = \ + results.bboxes * scale_factor + + final_results.append(results) + + return final_results diff --git a/mmdetection/mmdet/models/dense_heads/base_mask_head.py b/mmdetection/mmdet/models/dense_heads/base_mask_head.py new file mode 100644 index 0000000..7183d78 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/base_mask_head.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Tuple, Union + +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.structures import SampleList +from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig +from ..utils import unpack_gt_instances + + +class BaseMaskHead(BaseModule, metaclass=ABCMeta): + """Base class for mask heads used in One-Stage Instance Segmentation.""" + + def __init__(self, init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + + @abstractmethod + def loss_by_feat(self, *args, **kwargs): + """Calculate the loss based on the features extracted by the mask + head.""" + pass + + @abstractmethod + def predict_by_feat(self, *args, **kwargs): + """Transform a batch of output features extracted from the head into + mask results.""" + pass + + def loss(self, + x: Union[List[Tensor], Tuple[Tensor]], + batch_data_samples: SampleList, + positive_infos: OptInstanceList = None, + **kwargs) -> dict: + """Perform forward propagation and loss calculation of the mask head on + the features of the upstream network. + + Args: + x (list[Tensor] | tuple[Tensor]): Features from FPN. + Each has a shape (B, C, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + positive_infos (list[:obj:`InstanceData`], optional): Information + of positive samples. 
Used when the label assignment is + done outside the MaskHead, e.g., BboxHead in + YOLACT or CondInst, etc. When the label assignment is done in + MaskHead, it would be None, like SOLO or SOLOv2. All values + in it should have shape (num_positive_samples, *). + + + Returns: + dict: A dictionary of loss components. + """ + if positive_infos is None: + outs = self(x) + else: + outs = self(x, positive_infos) + + assert isinstance(outs, tuple), 'Forward results should be a tuple, ' \ + 'even if only one item is returned' + + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + for gt_instances, img_metas in zip(batch_gt_instances, + batch_img_metas): + img_shape = img_metas['batch_input_shape'] + gt_masks = gt_instances.masks.pad(img_shape) + gt_instances.masks = gt_masks + + losses = self.loss_by_feat( + *outs, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + positive_infos=positive_infos, + batch_gt_instances_ignore=batch_gt_instances_ignore, + **kwargs) + return losses + + def predict(self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = False, + results_list: OptInstanceList = None, + **kwargs) -> InstanceList: + """Test function without test-time augmentation. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + results_list (list[obj:`InstanceData`], optional): Detection + results of each image after the post process. Only exist + if there is a `bbox_head`, like `YOLACT`, `CondInst`, etc. + + Returns: + list[obj:`InstanceData`]: Instance segmentation + results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance,) + - labels (Tensor): Has a shape (num_instances,). + - masks (Tensor): Processed mask results, has a + shape (num_instances, h, w). + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + if results_list is None: + outs = self(x) + else: + outs = self(x, results_list) + + results_list = self.predict_by_feat( + *outs, + batch_img_metas=batch_img_metas, + rescale=rescale, + results_list=results_list, + **kwargs) + + return results_list diff --git a/mmdetection/mmdet/models/dense_heads/boxinst_head.py b/mmdetection/mmdet/models/dense_heads/boxinst_head.py new file mode 100644 index 0000000..7d6e8f7 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/boxinst_head.py @@ -0,0 +1,252 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
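+# (Editor's note, illustrative only) The pairwise term computed by
+# `BoxInstMaskHead.get_pairwise_affinity` below evaluates
+# log(p_i * p_j + (1 - p_i) * (1 - p_j)) from log-probabilities with the
+# usual max-shift (log-sum-exp) trick, e.g. for two neighboring pixels:
+#
+#     import torch
+#     import torch.nn.functional as F
+#     logits = torch.tensor([2.0, -1.0])
+#     log_fg, log_bg = F.logsigmoid(logits), F.logsigmoid(-logits)
+#     same_fg = log_fg[0] + log_fg[1]   # log(p_i * p_j)
+#     same_bg = log_bg[0] + log_bg[1]   # log((1 - p_i) * (1 - p_j))
+#     max_ = torch.max(same_fg, same_bg)
+#     log_same = torch.log((same_fg - max_).exp()
+#                          + (same_bg - max_).exp()) + max_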
+from typing import List + +import torch +import torch.nn.functional as F +from mmengine import MessageHub +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import InstanceList +from ..utils.misc import unfold_wo_center +from .condinst_head import CondInstBboxHead, CondInstMaskHead + + +@MODELS.register_module() +class BoxInstBboxHead(CondInstBboxHead): + """BoxInst box head used in https://arxiv.org/abs/2012.02310.""" + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + +@MODELS.register_module() +class BoxInstMaskHead(CondInstMaskHead): + """BoxInst mask head used in https://arxiv.org/abs/2012.02310. + + This head outputs the mask for BoxInst. + + Args: + pairwise_size (dict): The size of neighborhood for each pixel. + Defaults to 3. + pairwise_dilation (int): The dilation of neighborhood for each pixel. + Defaults to 2. + warmup_iters (int): Warmup iterations for pair-wise loss. + Defaults to 10000. + """ + + def __init__(self, + *arg, + pairwise_size: int = 3, + pairwise_dilation: int = 2, + warmup_iters: int = 10000, + **kwargs) -> None: + self.pairwise_size = pairwise_size + self.pairwise_dilation = pairwise_dilation + self.warmup_iters = warmup_iters + super().__init__(*arg, **kwargs) + + def get_pairwise_affinity(self, mask_logits: Tensor) -> Tensor: + """Compute the pairwise affinity for each pixel.""" + log_fg_prob = F.logsigmoid(mask_logits).unsqueeze(1) + log_bg_prob = F.logsigmoid(-mask_logits).unsqueeze(1) + + log_fg_prob_unfold = unfold_wo_center( + log_fg_prob, + kernel_size=self.pairwise_size, + dilation=self.pairwise_dilation) + log_bg_prob_unfold = unfold_wo_center( + log_bg_prob, + kernel_size=self.pairwise_size, + dilation=self.pairwise_dilation) + + # the probability of making the same prediction: + # p_i * p_j + (1 - p_i) * (1 - p_j) + # we compute the the probability in log space + # to avoid numerical instability + log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold + log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold + + # TODO: Figure out the difference between it and directly sum + max_ = torch.max(log_same_fg_prob, log_same_bg_prob) + log_same_prob = torch.log( + torch.exp(log_same_fg_prob - max_) + + torch.exp(log_same_bg_prob - max_)) + max_ + + return -log_same_prob[:, 0] + + def loss_by_feat(self, mask_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], positive_infos: InstanceList, + **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (list[Tensor]): List of predicted masks, each has + shape (num_classes, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + positive_infos (List[:obj:``InstanceData``]): Information of + positive samples of each image that are assigned in detection + head. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert positive_infos is not None, \ + 'positive_infos should not be None in `BoxInstMaskHead`' + losses = dict() + + loss_mask_project = 0. + loss_mask_pairwise = 0. + num_imgs = len(mask_preds) + total_pos = 0. + avg_fatcor = 0. 
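+        # (Editor's note) The loop below accumulates two terms per image: a
+        # projection loss comparing the x/y max-projections of the predicted
+        # masks against the box-derived mask targets, and a pairwise loss
+        # weighted by the color-similarity masks; both are normalized by
+        # `total_pos` / `avg_fatcor` afterwards.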
+ + for idx in range(num_imgs): + (mask_pred, pos_mask_targets, pos_pairwise_masks, num_pos) = \ + self._get_targets_single( + mask_preds[idx], batch_gt_instances[idx], + positive_infos[idx]) + # mask loss + total_pos += num_pos + if num_pos == 0 or pos_mask_targets is None: + loss_project = mask_pred.new_zeros(1).mean() + loss_pairwise = mask_pred.new_zeros(1).mean() + avg_fatcor += 0. + else: + # compute the project term + loss_project_x = self.loss_mask( + mask_pred.max(dim=1, keepdim=True)[0], + pos_mask_targets.max(dim=1, keepdim=True)[0], + reduction_override='none').sum() + loss_project_y = self.loss_mask( + mask_pred.max(dim=2, keepdim=True)[0], + pos_mask_targets.max(dim=2, keepdim=True)[0], + reduction_override='none').sum() + loss_project = loss_project_x + loss_project_y + # compute the pairwise term + pairwise_affinity = self.get_pairwise_affinity(mask_pred) + avg_fatcor += pos_pairwise_masks.sum().clamp(min=1.0) + loss_pairwise = (pairwise_affinity * pos_pairwise_masks).sum() + + loss_mask_project += loss_project + loss_mask_pairwise += loss_pairwise + + if total_pos == 0: + total_pos += 1 # avoid nan + if avg_fatcor == 0: + avg_fatcor += 1 # avoid nan + loss_mask_project = loss_mask_project / total_pos + loss_mask_pairwise = loss_mask_pairwise / avg_fatcor + message_hub = MessageHub.get_current_instance() + iter = message_hub.get_info('iter') + warmup_factor = min(iter / float(self.warmup_iters), 1.0) + loss_mask_pairwise *= warmup_factor + + losses.update( + loss_mask_project=loss_mask_project, + loss_mask_pairwise=loss_mask_pairwise) + return losses + + def _get_targets_single(self, mask_preds: Tensor, + gt_instances: InstanceData, + positive_info: InstanceData): + """Compute targets for predictions of single image. + + Args: + mask_preds (Tensor): Predicted prototypes with shape + (num_classes, H, W). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + positive_info (:obj:`InstanceData`): Information of positive + samples that are assigned in detection head. It usually + contains following keys. + + - pos_assigned_gt_inds (Tensor): Assigner GT indexes of + positive proposals, has shape (num_pos, ) + - pos_inds (Tensor): Positive index of image, has + shape (num_pos, ). + - param_pred (Tensor): Positive param preditions + with shape (num_pos, num_params). + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - mask_preds (Tensor): Positive predicted mask with shape + (num_pos, mask_h, mask_w). + - pos_mask_targets (Tensor): Positive mask targets with shape + (num_pos, mask_h, mask_w). + - pos_pairwise_masks (Tensor): Positive pairwise masks with + shape: (num_pos, num_neighborhood, mask_h, mask_w). + - num_pos (int): Positive numbers. 
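+        Example:
+            An editor's illustrative sketch (not from the original code) of
+            the stride-aligned downsampling applied to the gt masks in this
+            method, assuming ``mask_out_stride=4``:
+
+            >>> import torch
+            >>> gt_masks = torch.ones(1, 8, 8)
+            >>> stride = 4
+            >>> start = int(stride // 2)
+            >>> gt_masks[:, start::stride, start::stride].shape
+            torch.Size([1, 2, 2])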
+ """ + gt_bboxes = gt_instances.bboxes + device = gt_bboxes.device + # Note that gt_masks are generated by full box + # from BoxInstDataPreprocessor + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device).float() + # Note that pairwise_masks are generated by image color similarity + # from BoxInstDataPreprocessor + pairwise_masks = gt_instances.pairwise_masks + pairwise_masks = pairwise_masks.to(device=device) + + # process with mask targets + pos_assigned_gt_inds = positive_info.get('pos_assigned_gt_inds') + scores = positive_info.get('scores') + centernesses = positive_info.get('centernesses') + num_pos = pos_assigned_gt_inds.size(0) + + if gt_masks.size(0) == 0 or num_pos == 0: + return mask_preds, None, None, 0 + # Since we're producing (near) full image masks, + # it'd take too much vram to backprop on every single mask. + # Thus we select only a subset. + if (self.max_masks_to_train != -1) and \ + (num_pos > self.max_masks_to_train): + perm = torch.randperm(num_pos) + select = perm[:self.max_masks_to_train] + mask_preds = mask_preds[select] + pos_assigned_gt_inds = pos_assigned_gt_inds[select] + num_pos = self.max_masks_to_train + elif self.topk_masks_per_img != -1: + unique_gt_inds = pos_assigned_gt_inds.unique() + num_inst_per_gt = max( + int(self.topk_masks_per_img / len(unique_gt_inds)), 1) + + keep_mask_preds = [] + keep_pos_assigned_gt_inds = [] + for gt_ind in unique_gt_inds: + per_inst_pos_inds = (pos_assigned_gt_inds == gt_ind) + mask_preds_per_inst = mask_preds[per_inst_pos_inds] + gt_inds_per_inst = pos_assigned_gt_inds[per_inst_pos_inds] + if sum(per_inst_pos_inds) > num_inst_per_gt: + per_inst_scores = scores[per_inst_pos_inds].sigmoid().max( + dim=1)[0] + per_inst_centerness = centernesses[ + per_inst_pos_inds].sigmoid().reshape(-1, ) + select = (per_inst_scores * per_inst_centerness).topk( + k=num_inst_per_gt, dim=0)[1] + mask_preds_per_inst = mask_preds_per_inst[select] + gt_inds_per_inst = gt_inds_per_inst[select] + keep_mask_preds.append(mask_preds_per_inst) + keep_pos_assigned_gt_inds.append(gt_inds_per_inst) + mask_preds = torch.cat(keep_mask_preds) + pos_assigned_gt_inds = torch.cat(keep_pos_assigned_gt_inds) + num_pos = pos_assigned_gt_inds.size(0) + + # Follow the origin implement + start = int(self.mask_out_stride // 2) + gt_masks = gt_masks[:, start::self.mask_out_stride, + start::self.mask_out_stride] + gt_masks = gt_masks.gt(0.5).float() + pos_mask_targets = gt_masks[pos_assigned_gt_inds] + pos_pairwise_masks = pairwise_masks[pos_assigned_gt_inds] + pos_pairwise_masks = pos_pairwise_masks * pos_mask_targets.unsqueeze(1) + + return (mask_preds, pos_mask_targets, pos_pairwise_masks, num_pos) diff --git a/mmdetection/mmdet/models/dense_heads/cascade_rpn_head.py b/mmdetection/mmdet/models/dense_heads/cascade_rpn_head.py new file mode 100644 index 0000000..a8686cc --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/cascade_rpn_head.py @@ -0,0 +1,1110 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from __future__ import division +import copy +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.ops import DeformConv2d +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, ModuleList +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, + OptInstanceList, OptMultiConfig) +from ..task_modules.assigners import RegionAssigner +from ..task_modules.samplers import PseudoSampler +from ..utils import (images_to_levels, multi_apply, select_single_mlvl, + unpack_gt_instances) +from .base_dense_head import BaseDenseHead +from .rpn_head import RPNHead + + +class AdaptiveConv(BaseModule): + """AdaptiveConv used to adapt the sampling location with the anchors. + + Args: + in_channels (int): Number of channels in the input image. + out_channels (int): Number of channels produced by the convolution. + kernel_size (int or tuple[int]): Size of the conv kernel. + Defaults to 3. + stride (int or tuple[int]): Stride of the convolution. Defaults to 1. + padding (int or tuple[int]): Zero-padding added to both sides of + the input. Defaults to 1. + dilation (int or tuple[int]): Spacing between kernel elements. + Defaults to 3. + groups (int): Number of blocked connections from input channels to + output channels. Defaults to 1. + bias (bool): If set True, adds a learnable bias to the output. + Defaults to False. + adapt_type (str): Type of adaptive conv, can be either ``offset`` + (arbitrary anchors) or 'dilation' (uniform anchor). + Defaults to 'dilation'. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict]): Initialization config dict. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]] = 3, + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 1, + dilation: Union[int, Tuple[int]] = 3, + groups: int = 1, + bias: bool = False, + adapt_type: str = 'dilation', + init_cfg: MultiConfig = dict( + type='Normal', std=0.01, override=dict(name='conv')) + ) -> None: + super().__init__(init_cfg=init_cfg) + assert adapt_type in ['offset', 'dilation'] + self.adapt_type = adapt_type + + assert kernel_size == 3, 'Adaptive conv only supports kernels 3' + if self.adapt_type == 'offset': + assert stride == 1 and padding == 1 and groups == 1, \ + 'Adaptive conv offset mode only supports padding: {1}, ' \ + f'stride: {1}, groups: {1}' + self.conv = DeformConv2d( + in_channels, + out_channels, + kernel_size, + padding=padding, + stride=stride, + groups=groups, + bias=bias) + else: + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + padding=dilation, + dilation=dilation) + + def forward(self, x: Tensor, offset: Tensor) -> Tensor: + """Forward function.""" + if self.adapt_type == 'offset': + N, _, H, W = x.shape + assert offset is not None + assert H * W == offset.shape[1] + # reshape [N, NA, 18] to (N, 18, H, W) + offset = offset.permute(0, 2, 1).reshape(N, -1, H, W) + offset = offset.contiguous() + x = self.conv(x, offset) + else: + assert offset is None + x = self.conv(x) + return x + + +@MODELS.register_module() +class StageCascadeRPNHead(RPNHead): + """Stage of CascadeRPNHead. + + Args: + in_channels (int): Number of channels in the input feature map. + anchor_generator (:obj:`ConfigDict` or dict): anchor generator config. 
+ adapt_cfg (:obj:`ConfigDict` or dict): adaptation config. + bridged_feature (bool): whether update rpn feature. Defaults to False. + with_cls (bool): whether use classification branch. Defaults to True. + init_cfg :obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + anchor_generator: ConfigType = dict( + type='AnchorGenerator', + scales=[8], + ratios=[1.0], + strides=[4, 8, 16, 32, 64]), + adapt_cfg: ConfigType = dict(type='dilation', dilation=3), + bridged_feature: bool = False, + with_cls: bool = True, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + self.with_cls = with_cls + self.anchor_strides = anchor_generator['strides'] + self.anchor_scales = anchor_generator['scales'] + self.bridged_feature = bridged_feature + self.adapt_cfg = adapt_cfg + super().__init__( + in_channels=in_channels, + anchor_generator=anchor_generator, + init_cfg=init_cfg, + **kwargs) + + # override sampling and sampler + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + # use PseudoSampler when sampling is False + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + if init_cfg is None: + self.init_cfg = dict( + type='Normal', std=0.01, override=[dict(name='rpn_reg')]) + if self.with_cls: + self.init_cfg['override'].append(dict(name='rpn_cls')) + + def _init_layers(self) -> None: + """Init layers of a CascadeRPN stage.""" + adapt_cfg = copy.deepcopy(self.adapt_cfg) + adapt_cfg['adapt_type'] = adapt_cfg.pop('type') + self.rpn_conv = AdaptiveConv(self.in_channels, self.feat_channels, + **adapt_cfg) + if self.with_cls: + self.rpn_cls = nn.Conv2d(self.feat_channels, + self.num_anchors * self.cls_out_channels, + 1) + self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1) + self.relu = nn.ReLU(inplace=True) + + def forward_single(self, x: Tensor, offset: Tensor) -> Tuple[Tensor]: + """Forward function of single scale.""" + bridged_x = x + x = self.relu(self.rpn_conv(x, offset)) + if self.bridged_feature: + bridged_x = x # update feature + cls_score = self.rpn_cls(x) if self.with_cls else None + bbox_pred = self.rpn_reg(x) + return bridged_x, cls_score, bbox_pred + + def forward( + self, + feats: List[Tensor], + offset_list: Optional[List[Tensor]] = None) -> Tuple[List[Tensor]]: + """Forward function.""" + if offset_list is None: + offset_list = [None for _ in range(len(feats))] + return multi_apply(self.forward_single, feats, offset_list) + + def _region_targets_single(self, flat_anchors: Tensor, valid_flags: Tensor, + gt_instances: InstanceData, img_meta: dict, + gt_instances_ignore: InstanceData, + featmap_sizes: List[Tuple[int, int]], + num_level_anchors: List[int]) -> tuple: + """Get anchor targets based on region for single level. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors, ). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. 
+ gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + featmap_sizes (list[Tuple[int, int]]): Feature map size each level. + num_level_anchors (list[int]): The number of anchors in each level. + + Returns: + tuple: + + - labels (Tensor): Labels of each level. + - label_weights (Tensor): Label weights of each level. + - bbox_targets (Tensor): BBox targets of each level. + - bbox_weights (Tensor): BBox weights of each level. + - pos_inds (Tensor): positive samples indexes. + - neg_inds (Tensor): negative samples indexes. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + pred_instances = InstanceData() + pred_instances.priors = flat_anchors + pred_instances.valid_flags = valid_flags + + assign_result = self.assigner.assign( + pred_instances, + gt_instances, + img_meta, + featmap_sizes, + num_level_anchors, + self.anchor_scales[0], + self.anchor_strides, + gt_instances_ignore=gt_instances_ignore, + allowed_border=self.train_cfg['allowed_border']) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_anchors = flat_anchors.shape[0] + bbox_targets = torch.zeros_like(flat_anchors) + bbox_weights = torch.zeros_like(flat_anchors) + labels = flat_anchors.new_zeros(num_anchors, dtype=torch.long) + label_weights = flat_anchors.new_zeros(num_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + else: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result) + + def region_targets( + self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + featmap_sizes: List[Tuple[int, int]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + return_sampling_results: bool = False, + ) -> tuple: + """Compute regression and classification targets for anchors when using + RegionAssigner. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. + featmap_sizes (list[Tuple[int, int]]): Feature map size each level. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. 
+ - bbox_weights_list (list[Tensor]): BBox weights of each level. + - avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + ``PseudoSampler``, ``avg_factor`` is usually equal to the + number of positive priors. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors to a single tensor + concat_anchor_list = [] + concat_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + concat_anchor_list.append(torch.cat(anchor_list[i])) + concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply( + self._region_targets_single, + concat_anchor_list, + concat_valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + featmap_sizes=featmap_sizes, + num_level_anchors=num_level_anchors) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + res = (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) + if return_sampling_results: + res = res + (sampling_results_list, ) + return res + + def get_targets( + self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + featmap_sizes: List[Tuple[int, int]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + return_sampling_results: bool = False, + ) -> tuple: + """Compute regression and classification targets for anchors. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. + featmap_sizes (list[Tuple[int, int]]): Feature map size each level. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + return_sampling_results (bool): Whether to return the sampling + results. Defaults to False. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - bbox_weights_list (list[Tensor]): BBox weights of each level. 
+ - avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + ``PseudoSampler``, ``avg_factor`` is usually equal to the + number of positive priors. + """ + if isinstance(self.assigner, RegionAssigner): + cls_reg_targets = self.region_targets( + anchor_list, + valid_flag_list, + featmap_sizes, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + return_sampling_results=return_sampling_results) + else: + cls_reg_targets = super().get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + return_sampling_results=return_sampling_results) + return cls_reg_targets + + def anchor_offset(self, anchor_list: List[List[Tensor]], + anchor_strides: List[int], + featmap_sizes: List[Tuple[int, int]]) -> List[Tensor]: + """ Get offset for deformable conv based on anchor shape + NOTE: currently support deformable kernel_size=3 and dilation=1 + + Args: + anchor_list (list[list[tensor])): [NI, NLVL, NA, 4] list of + multi-level anchors + anchor_strides (list[int]): anchor stride of each level + + Returns: + list[tensor]: offset of DeformConv kernel with shapes of + [NLVL, NA, 2, 18]. + """ + + def _shape_offset(anchors, stride, ks=3, dilation=1): + # currently support kernel_size=3 and dilation=1 + assert ks == 3 and dilation == 1 + pad = (ks - 1) // 2 + idx = torch.arange(-pad, pad + 1, dtype=dtype, device=device) + yy, xx = torch.meshgrid(idx, idx) # return order matters + xx = xx.reshape(-1) + yy = yy.reshape(-1) + w = (anchors[:, 2] - anchors[:, 0]) / stride + h = (anchors[:, 3] - anchors[:, 1]) / stride + w = w / (ks - 1) - dilation + h = h / (ks - 1) - dilation + offset_x = w[:, None] * xx # (NA, ks**2) + offset_y = h[:, None] * yy # (NA, ks**2) + return offset_x, offset_y + + def _ctr_offset(anchors, stride, featmap_size): + feat_h, feat_w = featmap_size + assert len(anchors) == feat_h * feat_w + + x = (anchors[:, 0] + anchors[:, 2]) * 0.5 + y = (anchors[:, 1] + anchors[:, 3]) * 0.5 + # compute centers on feature map + x = x / stride + y = y / stride + # compute predefine centers + xx = torch.arange(0, feat_w, device=anchors.device) + yy = torch.arange(0, feat_h, device=anchors.device) + yy, xx = torch.meshgrid(yy, xx) + xx = xx.reshape(-1).type_as(x) + yy = yy.reshape(-1).type_as(y) + + offset_x = x - xx # (NA, ) + offset_y = y - yy # (NA, ) + return offset_x, offset_y + + num_imgs = len(anchor_list) + num_lvls = len(anchor_list[0]) + dtype = anchor_list[0][0].dtype + device = anchor_list[0][0].device + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + + offset_list = [] + for i in range(num_imgs): + mlvl_offset = [] + for lvl in range(num_lvls): + c_offset_x, c_offset_y = _ctr_offset(anchor_list[i][lvl], + anchor_strides[lvl], + featmap_sizes[lvl]) + s_offset_x, s_offset_y = _shape_offset(anchor_list[i][lvl], + anchor_strides[lvl]) + + # offset = ctr_offset + shape_offset + offset_x = s_offset_x + c_offset_x[:, None] + offset_y = s_offset_y + c_offset_y[:, None] + + # offset order (y0, x0, y1, x2, .., y8, x8, y9, x9) + offset = torch.stack([offset_y, offset_x], dim=-1) + offset = offset.reshape(offset.size(0), -1) # [NA, 2*ks**2] + mlvl_offset.append(offset) + offset_list.append(torch.cat(mlvl_offset)) # [totalNA, 2*ks**2] + offset_list = images_to_levels(offset_list, num_level_anchors) + return offset_list + + def 
loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchors: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, avg_factor: int) -> tuple: + """Loss function on single scale.""" + # classification loss + if self.with_cls: + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + # regression loss + bbox_targets = bbox_targets.reshape(-1, 4) + bbox_weights = bbox_weights.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + anchors = anchors.reshape(-1, 4) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + loss_reg = self.loss_bbox( + bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) + if self.with_cls: + return loss_cls, loss_reg + return None, loss_reg + + def loss_by_feat( + self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Compute losses of the head. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds] + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + featmap_sizes, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + return_sampling_results=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results_list) = cls_reg_targets + if not sampling_results_list[0].avg_factor_with_neg: + # 200 is hard-coded average factor, + # which follows guided anchoring. 
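+                # That is: when the sampler does not count negatives in
+                # its average factor (e.g. ``PseudoSampler``, which only
+                # counts positives), fall back to normalising the losses
+                # by roughly ``num_total_anchors / 200`` instead of by
+                # the number of positive samples alone.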
+ avg_factor = sum([label.numel() for label in labels_list]) / 200.0 + + # change per image, per level anchor_list to per_level, per_image + mlvl_anchor_list = list(zip(*anchor_list)) + # concat mlvl_anchor_list + mlvl_anchor_list = [ + torch.cat(anchors, dim=0) for anchors in mlvl_anchor_list + ] + + losses = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + mlvl_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + if self.with_cls: + return dict(loss_rpn_cls=losses[0], loss_rpn_reg=losses[1]) + return dict(loss_rpn_reg=losses[1]) + + def predict_by_feat(self, + anchor_list: List[List[Tensor]], + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_img_metas: List[dict], + cfg: Optional[ConfigDict] = None, + rescale: bool = False) -> InstanceList: + """Get proposal predict. Overriding to enable input ``anchor_list`` + from outside. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + batch_img_metas (list[dict], Optional): Image meta info. + cfg (:obj:`ConfigDict`, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score_list = select_single_mlvl(cls_scores, img_id) + bbox_pred_list = select_single_mlvl(bbox_preds, img_id) + proposals = self._predict_by_feat_single( + cls_scores=cls_score_list, + bbox_preds=bbox_pred_list, + mlvl_anchors=anchor_list[img_id], + img_meta=batch_img_metas[img_id], + cfg=cfg, + rescale=rescale) + result_list.append(proposals) + return result_list + + def _predict_by_feat_single(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + mlvl_anchors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False) -> InstanceData: + """Transform outputs of a single image into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has + shape (num_anchors * 4, H, W). + mlvl_anchors (list[Tensor]): Box reference from all scale + levels of a single image, each item has shape + (num_total_anchors, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arange as + (w_scale, h_scale, w_scale, h_scale). + cfg (:obj:`ConfigDict`): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. 
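+
+        Note:
+            A rough sketch of the ``nms_pre`` pre-selection used below,
+            which keeps the top-scoring candidates per level via a
+            descending sort (toy values):
+
+            >>> import torch
+            >>> scores = torch.tensor([0.2, 0.9, 0.5])
+            >>> ranked_scores, rank_inds = scores.sort(descending=True)
+            >>> rank_inds[:2]
+            tensor([1, 2])
+            >>> ranked_scores[:2]
+            tensor([0.9000, 0.5000])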
+ + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + # bboxes from different level should be independent during NMS, + # level_ids are used as labels for batched NMS to separate them + level_ids = [] + mlvl_scores = [] + mlvl_bbox_preds = [] + mlvl_valid_anchors = [] + nms_pre = cfg.get('nms_pre', -1) + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + # We set FG labels to [0, num_class-1] and BG label to + # num_class in RPN head since mmdet v2.5, which is unified to + # be consistent with other head since mmdet v2.0. In mmdet v2.0 + # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head. + scores = rpn_cls_score.softmax(dim=1)[:, 0] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4) + anchors = mlvl_anchors[idx] + + if 0 < nms_pre < scores.shape[0]: + # sort is faster than topk + # _, topk_inds = scores.topk(cfg.nms_pre) + ranked_scores, rank_inds = scores.sort(descending=True) + topk_inds = rank_inds[:nms_pre] + scores = ranked_scores[:nms_pre] + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + mlvl_scores.append(scores) + mlvl_bbox_preds.append(rpn_bbox_pred) + mlvl_valid_anchors.append(anchors) + level_ids.append( + scores.new_full((scores.size(0), ), idx, dtype=torch.long)) + + anchors = torch.cat(mlvl_valid_anchors) + rpn_bbox_pred = torch.cat(mlvl_bbox_preds) + bboxes = self.bbox_coder.decode( + anchors, rpn_bbox_pred, max_shape=img_meta['img_shape']) + + proposals = InstanceData() + proposals.bboxes = bboxes + proposals.scores = torch.cat(mlvl_scores) + proposals.level_ids = torch.cat(level_ids) + + return self._bbox_post_process( + results=proposals, cfg=cfg, rescale=rescale, img_meta=img_meta) + + def refine_bboxes(self, anchor_list: List[List[Tensor]], + bbox_preds: List[Tensor], + img_metas: List[dict]) -> List[List[Tensor]]: + """Refine bboxes through stages.""" + num_levels = len(bbox_preds) + new_anchor_list = [] + for img_id in range(len(img_metas)): + mlvl_anchors = [] + for i in range(num_levels): + bbox_pred = bbox_preds[i][img_id].detach() + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + img_shape = img_metas[img_id]['img_shape'] + bboxes = self.bbox_coder.decode(anchor_list[img_id][i], + bbox_pred, img_shape) + mlvl_anchors.append(bboxes) + new_anchor_list.append(mlvl_anchors) + return new_anchor_list + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. 
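+
+        Note:
+            When ``adapt_cfg['type'] == 'offset'``, the anchors are first
+            converted into deformable-conv offsets via
+            :meth:`anchor_offset` (two offsets, y and x, for each of the
+            nine 3x3 kernel samples, i.e. 18 values per anchor location);
+            with the default ``'dilation'`` type, ``offset_list`` is
+            simply ``None``.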
+ + Returns: + dict: A dictionary of loss components. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, _, batch_img_metas = outputs + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + if self.adapt_cfg['type'] == 'offset': + offset_list = self.anchor_offset(anchor_list, self.anchor_strides, + featmap_sizes) + else: + offset_list = None + + x, cls_score, bbox_pred = self(x, offset_list) + rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred, + batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*rpn_loss_inputs) + + return losses + + def loss_and_predict( + self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + proposal_cfg: Optional[ConfigDict] = None, + ) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. + + Args: + x (tuple[Tensor]): Features from FPN. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + proposal_cfg (:obj`ConfigDict`, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + + Returns: + tuple: the return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, _, batch_img_metas = outputs + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + if self.adapt_cfg['type'] == 'offset': + offset_list = self.anchor_offset(anchor_list, self.anchor_strides, + featmap_sizes) + else: + offset_list = None + + x, cls_score, bbox_pred = self(x, offset_list) + rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred, + batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*rpn_loss_inputs) + + predictions = self.predict_by_feat( + anchor_list, + cls_score, + bbox_pred, + batch_img_metas=batch_img_metas, + cfg=proposal_cfg) + return losses, predictions + + def predict(self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. 
+ """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, _ = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + if self.adapt_cfg['type'] == 'offset': + offset_list = self.anchor_offset(anchor_list, self.anchor_strides, + featmap_sizes) + else: + offset_list = None + + x, cls_score, bbox_pred = self(x, offset_list) + predictions = self.stages[-1].predict_by_feat( + anchor_list, + cls_score, + bbox_pred, + batch_img_metas=batch_img_metas, + rescale=rescale) + return predictions + + +@MODELS.register_module() +class CascadeRPNHead(BaseDenseHead): + """The CascadeRPNHead will predict more accurate region proposals, which is + required for two-stage detectors (such as Fast/Faster R-CNN). CascadeRPN + consists of a sequence of RPNStage to progressively improve the accuracy of + the detected proposals. + + More details can be found in ``https://arxiv.org/abs/1909.06720``. + + Args: + num_stages (int): number of CascadeRPN stages. + stages (list[:obj:`ConfigDict` or dict]): list of configs to build + the stages. + train_cfg (list[:obj:`ConfigDict` or dict]): list of configs at + training time each stage. + test_cfg (:obj:`ConfigDict` or dict): config at testing time. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict]): Initialization config dict. + """ + + def __init__(self, + num_classes: int, + num_stages: int, + stages: List[ConfigType], + train_cfg: List[ConfigType], + test_cfg: ConfigType, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + assert num_classes == 1, 'Only support num_classes == 1' + assert num_stages == len(stages) + self.num_stages = num_stages + # Be careful! Pretrained weights cannot be loaded when use + # nn.ModuleList + self.stages = ModuleList() + for i in range(len(stages)): + train_cfg_i = train_cfg[i] if train_cfg is not None else None + stages[i].update(train_cfg=train_cfg_i) + stages[i].update(test_cfg=test_cfg) + self.stages.append(MODELS.build(stages[i])) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def loss_by_feat(self): + """loss_by_feat() is implemented in StageCascadeRPNHead.""" + pass + + def predict_by_feat(self): + """predict_by_feat() is implemented in StageCascadeRPNHead.""" + pass + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. 
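+
+        Note:
+            Losses are keyed per stage via ``'s{}.{}'.format(i, name)``;
+            a typical two-stage head whose second stage owns the
+            classification branch would therefore report keys like
+
+            >>> sorted(losses.keys())   # doctest: +SKIP
+            ['s0.loss_rpn_reg', 's1.loss_rpn_cls', 's1.loss_rpn_reg']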
+ """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, _, batch_img_metas = outputs + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, valid_flag_list = self.stages[0].get_anchors( + featmap_sizes, batch_img_metas, device=device) + + losses = dict() + + for i in range(self.num_stages): + stage = self.stages[i] + + if stage.adapt_cfg['type'] == 'offset': + offset_list = stage.anchor_offset(anchor_list, + stage.anchor_strides, + featmap_sizes) + else: + offset_list = None + x, cls_score, bbox_pred = stage(x, offset_list) + rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, + bbox_pred, batch_gt_instances, batch_img_metas) + stage_loss = stage.loss_by_feat(*rpn_loss_inputs) + for name, value in stage_loss.items(): + losses['s{}.{}'.format(i, name)] = value + + # refine boxes + if i < self.num_stages - 1: + anchor_list = stage.refine_bboxes(anchor_list, bbox_pred, + batch_img_metas) + + return losses + + def loss_and_predict( + self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + proposal_cfg: Optional[ConfigDict] = None, + ) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. + + Args: + x (tuple[Tensor]): Features from FPN. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + proposal_cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + + Returns: + tuple: the return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, _, batch_img_metas = outputs + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, valid_flag_list = self.stages[0].get_anchors( + featmap_sizes, batch_img_metas, device=device) + + losses = dict() + + for i in range(self.num_stages): + stage = self.stages[i] + + if stage.adapt_cfg['type'] == 'offset': + offset_list = stage.anchor_offset(anchor_list, + stage.anchor_strides, + featmap_sizes) + else: + offset_list = None + x, cls_score, bbox_pred = stage(x, offset_list) + rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, + bbox_pred, batch_gt_instances, batch_img_metas) + stage_loss = stage.loss_by_feat(*rpn_loss_inputs) + for name, value in stage_loss.items(): + losses['s{}.{}'.format(i, name)] = value + + # refine boxes + if i < self.num_stages - 1: + anchor_list = stage.refine_bboxes(anchor_list, bbox_pred, + batch_img_metas) + + predictions = self.stages[-1].predict_by_feat( + anchor_list, + cls_score, + bbox_pred, + batch_img_metas=batch_img_metas, + cfg=proposal_cfg) + return losses, predictions + + def predict(self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. 
+ Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, _ = self.stages[0].get_anchors( + featmap_sizes, batch_img_metas, device=device) + + for i in range(self.num_stages): + stage = self.stages[i] + if stage.adapt_cfg['type'] == 'offset': + offset_list = stage.anchor_offset(anchor_list, + stage.anchor_strides, + featmap_sizes) + else: + offset_list = None + x, cls_score, bbox_pred = stage(x, offset_list) + if i < self.num_stages - 1: + anchor_list = stage.refine_bboxes(anchor_list, bbox_pred, + batch_img_metas) + + predictions = self.stages[-1].predict_by_feat( + anchor_list, + cls_score, + bbox_pred, + batch_img_metas=batch_img_metas, + rescale=rescale) + return predictions diff --git a/mmdetection/mmdet/models/dense_heads/centernet_head.py b/mmdetection/mmdet/models/dense_heads/centernet_head.py new file mode 100644 index 0000000..09f3e59 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/centernet_head.py @@ -0,0 +1,447 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv.ops import batched_nms +from mmengine.config import ConfigDict +from mmengine.model import bias_init_with_prob, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from ..utils import (gaussian_radius, gen_gaussian_target, get_local_maximum, + get_topk_from_heatmap, multi_apply, + transpose_and_gather_feat) +from .base_dense_head import BaseDenseHead + + +@MODELS.register_module() +class CenterNetHead(BaseDenseHead): + """Objects as Points Head. CenterHead use center_point to indicate object's + position. Paper link + + Args: + in_channels (int): Number of channel in the input feature map. + feat_channels (int): Number of channel in the intermediate feature map. + num_classes (int): Number of categories excluding the background + category. + loss_center_heatmap (:obj:`ConfigDict` or dict): Config of center + heatmap loss. Defaults to + dict(type='GaussianFocalLoss', loss_weight=1.0) + loss_wh (:obj:`ConfigDict` or dict): Config of wh loss. Defaults to + dict(type='L1Loss', loss_weight=0.1). + loss_offset (:obj:`ConfigDict` or dict): Config of offset loss. + Defaults to dict(type='L1Loss', loss_weight=1.0). + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Useless in CenterNet, but we keep this variable for + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config + of CenterNet. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization + config dict. 
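+
+    Example:
+        A minimal forward sketch (toy channel sizes; assumes the mmdet
+        registries are importable so the loss configs can be built):
+
+        >>> import torch
+        >>> head = CenterNetHead(
+        ...     in_channels=16, feat_channels=16, num_classes=4)
+        >>> feats = [torch.rand(1, 16, 32, 32)]
+        >>> center_hms, whs, offsets = head(feats)
+        >>> center_hms[0].shape, whs[0].shape, offsets[0].shape
+        (torch.Size([1, 4, 32, 32]), torch.Size([1, 2, 32, 32]), torch.Size([1, 2, 32, 32]))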
+ """ + + def __init__(self, + in_channels: int, + feat_channels: int, + num_classes: int, + loss_center_heatmap: ConfigType = dict( + type='GaussianFocalLoss', loss_weight=1.0), + loss_wh: ConfigType = dict(type='L1Loss', loss_weight=0.1), + loss_offset: ConfigType = dict( + type='L1Loss', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.heatmap_head = self._build_head(in_channels, feat_channels, + num_classes) + self.wh_head = self._build_head(in_channels, feat_channels, 2) + self.offset_head = self._build_head(in_channels, feat_channels, 2) + + self.loss_center_heatmap = MODELS.build(loss_center_heatmap) + self.loss_wh = MODELS.build(loss_wh) + self.loss_offset = MODELS.build(loss_offset) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.fp16_enabled = False + + def _build_head(self, in_channels: int, feat_channels: int, + out_channels: int) -> nn.Sequential: + """Build head for each branch.""" + layer = nn.Sequential( + nn.Conv2d(in_channels, feat_channels, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(feat_channels, out_channels, kernel_size=1)) + return layer + + def init_weights(self) -> None: + """Initialize weights of the head.""" + bias_init = bias_init_with_prob(0.1) + self.heatmap_head[-1].bias.data.fill_(bias_init) + for head in [self.wh_head, self.offset_head]: + for m in head.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + + def forward(self, x: Tuple[Tensor, ...]) -> Tuple[List[Tensor]]: + """Forward features. Notice CenterNet head does not use FPN. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + center_heatmap_preds (list[Tensor]): center predict heatmaps for + all levels, the channels number is num_classes. + wh_preds (list[Tensor]): wh predicts for all levels, the channels + number is 2. + offset_preds (list[Tensor]): offset predicts for all levels, the + channels number is 2. + """ + return multi_apply(self.forward_single, x) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]: + """Forward feature of a single level. + + Args: + x (Tensor): Feature of a single level. + + Returns: + center_heatmap_pred (Tensor): center predict heatmaps, the + channels number is num_classes. + wh_pred (Tensor): wh predicts, the channels number is 2. + offset_pred (Tensor): offset predicts, the channels number is 2. + """ + center_heatmap_pred = self.heatmap_head(x).sigmoid() + wh_pred = self.wh_head(x) + offset_pred = self.offset_head(x) + return center_heatmap_pred, wh_pred, offset_pred + + def loss_by_feat( + self, + center_heatmap_preds: List[Tensor], + wh_preds: List[Tensor], + offset_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + center_heatmap_preds (list[Tensor]): center predict heatmaps for + all levels with shape (B, num_classes, H, W). + wh_preds (list[Tensor]): wh predicts for all levels with + shape (B, 2, H, W). + offset_preds (list[Tensor]): offset predicts for all levels + with shape (B, 2, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: which has components below: + - loss_center_heatmap (Tensor): loss of center heatmap. + - loss_wh (Tensor): loss of hw heatmap + - loss_offset (Tensor): loss of offset heatmap. + """ + assert len(center_heatmap_preds) == len(wh_preds) == len( + offset_preds) == 1 + center_heatmap_pred = center_heatmap_preds[0] + wh_pred = wh_preds[0] + offset_pred = offset_preds[0] + + gt_bboxes = [ + gt_instances.bboxes for gt_instances in batch_gt_instances + ] + gt_labels = [ + gt_instances.labels for gt_instances in batch_gt_instances + ] + img_shape = batch_img_metas[0]['batch_input_shape'] + target_result, avg_factor = self.get_targets(gt_bboxes, gt_labels, + center_heatmap_pred.shape, + img_shape) + + center_heatmap_target = target_result['center_heatmap_target'] + wh_target = target_result['wh_target'] + offset_target = target_result['offset_target'] + wh_offset_target_weight = target_result['wh_offset_target_weight'] + + # Since the channel of wh_target and offset_target is 2, the avg_factor + # of loss_center_heatmap is always 1/2 of loss_wh and loss_offset. + loss_center_heatmap = self.loss_center_heatmap( + center_heatmap_pred, center_heatmap_target, avg_factor=avg_factor) + loss_wh = self.loss_wh( + wh_pred, + wh_target, + wh_offset_target_weight, + avg_factor=avg_factor * 2) + loss_offset = self.loss_offset( + offset_pred, + offset_target, + wh_offset_target_weight, + avg_factor=avg_factor * 2) + return dict( + loss_center_heatmap=loss_center_heatmap, + loss_wh=loss_wh, + loss_offset=loss_offset) + + def get_targets(self, gt_bboxes: List[Tensor], gt_labels: List[Tensor], + feat_shape: tuple, img_shape: tuple) -> Tuple[dict, int]: + """Compute regression and classification targets in multiple images. + + Args: + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box. + feat_shape (tuple): feature map shape with value [B, _, H, W] + img_shape (tuple): image shape. + + Returns: + tuple[dict, float]: The float value is mean avg_factor, the dict + has components below: + - center_heatmap_target (Tensor): targets of center heatmap, \ + shape (B, num_classes, H, W). + - wh_target (Tensor): targets of wh predict, shape \ + (B, 2, H, W). + - offset_target (Tensor): targets of offset predict, shape \ + (B, 2, H, W). + - wh_offset_target_weight (Tensor): weights of wh and offset \ + predict, shape (B, 2, H, W). 
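+
+        Example:
+            Sketch of how a gt box centre is mapped onto the output grid
+            (toy numbers; ``width_ratio = feat_w / img_w``):
+
+            >>> img_w, feat_w = 128, 32
+            >>> width_ratio = feat_w / img_w
+            >>> x1, x2 = 16.0, 48.0
+            >>> center_x = (x1 + x2) * width_ratio / 2
+            >>> center_x, int(center_x)
+            (8.0, 8)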
+ """ + img_h, img_w = img_shape[:2] + bs, _, feat_h, feat_w = feat_shape + + width_ratio = float(feat_w / img_w) + height_ratio = float(feat_h / img_h) + + center_heatmap_target = gt_bboxes[-1].new_zeros( + [bs, self.num_classes, feat_h, feat_w]) + wh_target = gt_bboxes[-1].new_zeros([bs, 2, feat_h, feat_w]) + offset_target = gt_bboxes[-1].new_zeros([bs, 2, feat_h, feat_w]) + wh_offset_target_weight = gt_bboxes[-1].new_zeros( + [bs, 2, feat_h, feat_w]) + + for batch_id in range(bs): + gt_bbox = gt_bboxes[batch_id] + gt_label = gt_labels[batch_id] + center_x = (gt_bbox[:, [0]] + gt_bbox[:, [2]]) * width_ratio / 2 + center_y = (gt_bbox[:, [1]] + gt_bbox[:, [3]]) * height_ratio / 2 + gt_centers = torch.cat((center_x, center_y), dim=1) + + for j, ct in enumerate(gt_centers): + ctx_int, cty_int = ct.int() + ctx, cty = ct + scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio + scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio + radius = gaussian_radius([scale_box_h, scale_box_w], + min_overlap=0.3) + radius = max(0, int(radius)) + ind = gt_label[j] + gen_gaussian_target(center_heatmap_target[batch_id, ind], + [ctx_int, cty_int], radius) + + wh_target[batch_id, 0, cty_int, ctx_int] = scale_box_w + wh_target[batch_id, 1, cty_int, ctx_int] = scale_box_h + + offset_target[batch_id, 0, cty_int, ctx_int] = ctx - ctx_int + offset_target[batch_id, 1, cty_int, ctx_int] = cty - cty_int + + wh_offset_target_weight[batch_id, :, cty_int, ctx_int] = 1 + + avg_factor = max(1, center_heatmap_target.eq(1).sum()) + target_result = dict( + center_heatmap_target=center_heatmap_target, + wh_target=wh_target, + offset_target=offset_target, + wh_offset_target_weight=wh_offset_target_weight) + return target_result, avg_factor + + def predict_by_feat(self, + center_heatmap_preds: List[Tensor], + wh_preds: List[Tensor], + offset_preds: List[Tensor], + batch_img_metas: Optional[List[dict]] = None, + rescale: bool = True, + with_nms: bool = False) -> InstanceList: + """Transform network output for a batch into bbox predictions. + + Args: + center_heatmap_preds (list[Tensor]): Center predict heatmaps for + all levels with shape (B, num_classes, H, W). + wh_preds (list[Tensor]): WH predicts for all levels with + shape (B, 2, H, W). + offset_preds (list[Tensor]): Offset predicts for all levels + with shape (B, 2, H, W). + batch_img_metas (list[dict], optional): Batch image meta info. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to True. + with_nms (bool): If True, do nms before return boxes. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Instance segmentation + results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
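+
+        Note:
+            The per-image slicing below uses ``img_id:img_id + 1`` so the
+            batch dimension expected by :meth:`_predict_by_feat_single`
+            is kept (illustrative shapes):
+
+            >>> import torch
+            >>> preds = torch.rand(4, 2, 8, 8)
+            >>> preds[1:2, ...].shape
+            torch.Size([1, 2, 8, 8])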
+ """ + assert len(center_heatmap_preds) == len(wh_preds) == len( + offset_preds) == 1 + result_list = [] + for img_id in range(len(batch_img_metas)): + result_list.append( + self._predict_by_feat_single( + center_heatmap_preds[0][img_id:img_id + 1, ...], + wh_preds[0][img_id:img_id + 1, ...], + offset_preds[0][img_id:img_id + 1, ...], + batch_img_metas[img_id], + rescale=rescale, + with_nms=with_nms)) + return result_list + + def _predict_by_feat_single(self, + center_heatmap_pred: Tensor, + wh_pred: Tensor, + offset_pred: Tensor, + img_meta: dict, + rescale: bool = True, + with_nms: bool = False) -> InstanceData: + """Transform outputs of a single image into bbox results. + + Args: + center_heatmap_pred (Tensor): Center heatmap for current level with + shape (1, num_classes, H, W). + wh_pred (Tensor): WH heatmap for current level with shape + (1, num_classes, H, W). + offset_pred (Tensor): Offset for current level with shape + (1, corner_offset_channels, H, W). + img_meta (dict): Meta information of current image, e.g., + image size, scaling factor, etc. + rescale (bool): If True, return boxes in original image space. + Defaults to True. + with_nms (bool): If True, do nms before return boxes. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + batch_det_bboxes, batch_labels = self._decode_heatmap( + center_heatmap_pred, + wh_pred, + offset_pred, + img_meta['batch_input_shape'], + k=self.test_cfg.topk, + kernel=self.test_cfg.local_maximum_kernel) + + det_bboxes = batch_det_bboxes.view([-1, 5]) + det_labels = batch_labels.view(-1) + + batch_border = det_bboxes.new_tensor(img_meta['border'])[..., + [2, 0, 2, 0]] + det_bboxes[..., :4] -= batch_border + + if rescale and 'scale_factor' in img_meta: + det_bboxes[..., :4] /= det_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + if with_nms: + det_bboxes, det_labels = self._bboxes_nms(det_bboxes, det_labels, + self.test_cfg) + results = InstanceData() + results.bboxes = det_bboxes[..., :4] + results.scores = det_bboxes[..., 4] + results.labels = det_labels + return results + + def _decode_heatmap(self, + center_heatmap_pred: Tensor, + wh_pred: Tensor, + offset_pred: Tensor, + img_shape: tuple, + k: int = 100, + kernel: int = 3) -> Tuple[Tensor, Tensor]: + """Transform outputs into detections raw bbox prediction. + + Args: + center_heatmap_pred (Tensor): center predict heatmap, + shape (B, num_classes, H, W). + wh_pred (Tensor): wh predict, shape (B, 2, H, W). + offset_pred (Tensor): offset predict, shape (B, 2, H, W). + img_shape (tuple): image shape in hw format. + k (int): Get top k center keypoints from heatmap. Defaults to 100. + kernel (int): Max pooling kernel for extract local maximum pixels. + Defaults to 3. 
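+
+        Note:
+            Rough sketch of the box decoding below for a single peak
+            (toy numbers): with a heatmap of width 16 for a 64-pixel
+            input, a centre at x=5 and a predicted width of 4 give
+
+            >>> inp_w, width = 64, 16
+            >>> topk_x, w = 5.0, 4.0
+            >>> (topk_x - w / 2) * (inp_w / width)
+            12.0
+            >>> (topk_x + w / 2) * (inp_w / width)
+            28.0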
+ + Returns: + tuple[Tensor]: Decoded output of CenterNetHead, containing + the following Tensors: + + - batch_bboxes (Tensor): Coords of each box with shape (B, k, 5) + - batch_topk_labels (Tensor): Categories of each box with \ + shape (B, k) + """ + height, width = center_heatmap_pred.shape[2:] + inp_h, inp_w = img_shape + + center_heatmap_pred = get_local_maximum( + center_heatmap_pred, kernel=kernel) + + *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap( + center_heatmap_pred, k=k) + batch_scores, batch_index, batch_topk_labels = batch_dets + + wh = transpose_and_gather_feat(wh_pred, batch_index) + offset = transpose_and_gather_feat(offset_pred, batch_index) + topk_xs = topk_xs + offset[..., 0] + topk_ys = topk_ys + offset[..., 1] + tl_x = (topk_xs - wh[..., 0] / 2) * (inp_w / width) + tl_y = (topk_ys - wh[..., 1] / 2) * (inp_h / height) + br_x = (topk_xs + wh[..., 0] / 2) * (inp_w / width) + br_y = (topk_ys + wh[..., 1] / 2) * (inp_h / height) + + batch_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], dim=2) + batch_bboxes = torch.cat((batch_bboxes, batch_scores[..., None]), + dim=-1) + return batch_bboxes, batch_topk_labels + + def _bboxes_nms(self, bboxes: Tensor, labels: Tensor, + cfg: ConfigDict) -> Tuple[Tensor, Tensor]: + """bboxes nms.""" + if labels.numel() > 0: + max_num = cfg.max_per_img + bboxes, keep = batched_nms(bboxes[:, :4], bboxes[:, + -1].contiguous(), + labels, cfg.nms) + if max_num > 0: + bboxes = bboxes[:max_num] + labels = labels[keep][:max_num] + + return bboxes, labels diff --git a/mmdetection/mmdet/models/dense_heads/centernet_update_head.py b/mmdetection/mmdet/models/dense_heads/centernet_update_head.py new file mode 100644 index 0000000..00cfcb8 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/centernet_update_head.py @@ -0,0 +1,624 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import Scale +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox2distance +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, reduce_mean) +from ..utils import multi_apply +from .anchor_free_head import AnchorFreeHead + +INF = 1000000000 +RangeType = Sequence[Tuple[int, int]] + + +def _transpose(tensor_list: List[Tensor], + num_point_list: list) -> List[Tensor]: + """This function is used to transpose image first tensors to level first + ones.""" + for img_idx in range(len(tensor_list)): + tensor_list[img_idx] = torch.split( + tensor_list[img_idx], num_point_list, dim=0) + + tensors_level_first = [] + for targets_per_level in zip(*tensor_list): + tensors_level_first.append(torch.cat(targets_per_level, dim=0)) + return tensors_level_first + + +@MODELS.register_module() +class CenterNetUpdateHead(AnchorFreeHead): + """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2. + Paper link ``_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channel in the input feature map. + regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple + level points. + hm_min_radius (int): Heatmap target minimum radius of cls branch. + Defaults to 4. + hm_min_overlap (float): Heatmap target minimum overlap of cls branch. + Defaults to 0.8. + more_pos_thresh (float): The filtering threshold when the cls branch + adds more positive samples. Defaults to 0.2. 
+ more_pos_topk (int): The maximum number of additional positive samples + added to each gt. Defaults to 9. + soft_weight_on_reg (bool): Whether to use the soft target of the + cls branch as the soft weight of the bbox branch. + Defaults to False. + loss_cls (:obj:`ConfigDict` or dict): Config of cls loss. Defaults to + dict(type='GaussianFocalLoss', loss_weight=1.0) + loss_bbox (:obj:`ConfigDict` or dict): Config of bbox loss. Defaults to + dict(type='GIoULoss', loss_weight=2.0). + norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct + and config norm layer. Defaults to + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Unused in CenterNet. Reserved for compatibility with + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config + of CenterNet. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + regress_ranges: RangeType = ((0, 80), (64, 160), (128, 320), + (256, 640), (512, INF)), + hm_min_radius: int = 4, + hm_min_overlap: float = 0.8, + more_pos_thresh: float = 0.2, + more_pos_topk: int = 9, + soft_weight_on_reg: bool = False, + loss_cls: ConfigType = dict( + type='GaussianFocalLoss', + pos_weight=0.25, + neg_weight=0.75, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='GIoULoss', loss_weight=2.0), + norm_cfg: OptConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + norm_cfg=norm_cfg, + train_cfg=train_cfg, + test_cfg=test_cfg, + **kwargs) + self.soft_weight_on_reg = soft_weight_on_reg + self.hm_min_radius = hm_min_radius + self.more_pos_thresh = more_pos_thresh + self.more_pos_topk = more_pos_topk + self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap) + self.sigmoid_clamp = 0.0001 + + # GaussianFocalLoss must be sigmoid mode + self.use_sigmoid_cls = True + self.cls_out_channels = num_classes + + self.regress_ranges = regress_ranges + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def _init_predictor(self) -> None: + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d( + self.feat_channels, self.num_classes, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of each level outputs. + + - cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for each \ + scale level, each is a 4D-tensor, the channel number is 4. + """ + return multi_apply(self.forward_single, x, self.scales, self.strides) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps. + + Returns: + tuple: scores for each class, bbox predictions of + input feature maps. 
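+
+        Note:
+            The raw regression output is clamped to be non-negative and,
+            at test time only, multiplied by the level ``stride`` (see the
+            implementation below); a minimal sketch with toy values:
+
+            >>> import torch
+            >>> bbox_pred = torch.tensor([-0.5, 2.0])
+            >>> bbox_pred.clamp(min=0) * 8   # stride 8, eval mode
+            tensor([ 0., 16.])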
+ """ + cls_score, bbox_pred, _, _ = super().forward_single(x) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + return cls_score, bbox_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_imgs = cls_scores[0].size(0) + assert len(cls_scores) == len(bbox_preds) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + + # 1 flatten outputs + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + assert (torch.isfinite(flatten_bbox_preds).all().item()) + + # 2 calc reg and cls branch targets + cls_targets, bbox_targets = self.get_targets(all_level_points, + batch_gt_instances) + + # 3 add more pos index for cls branch + featmap_sizes = flatten_points.new_tensor(featmap_sizes) + pos_inds, cls_labels = self.add_cls_pos_inds(flatten_points, + flatten_bbox_preds, + featmap_sizes, + batch_gt_instances) + + # 4 calc cls loss + if pos_inds is None: + # num_gts=0 + num_pos_cls = bbox_preds[0].new_tensor(0, dtype=torch.float) + else: + num_pos_cls = bbox_preds[0].new_tensor( + len(pos_inds), dtype=torch.float) + num_pos_cls = max(reduce_mean(num_pos_cls), 1.0) + flatten_cls_scores = flatten_cls_scores.sigmoid().clamp( + min=self.sigmoid_clamp, max=1 - self.sigmoid_clamp) + cls_loss = self.loss_cls( + flatten_cls_scores, + cls_targets, + pos_inds=pos_inds, + pos_labels=cls_labels, + avg_factor=num_pos_cls) + + # 5 calc reg loss + pos_bbox_inds = torch.nonzero( + bbox_targets.max(dim=1)[0] >= 0).squeeze(1) + pos_bbox_preds = flatten_bbox_preds[pos_bbox_inds] + pos_bbox_targets = bbox_targets[pos_bbox_inds] + + bbox_weight_map = cls_targets.max(dim=1)[0] + bbox_weight_map = 
bbox_weight_map[pos_bbox_inds] + bbox_weight_map = bbox_weight_map if self.soft_weight_on_reg \ + else torch.ones_like(bbox_weight_map) + num_pos_bbox = max(reduce_mean(bbox_weight_map.sum()), 1.0) + + if len(pos_bbox_inds) > 0: + pos_points = flatten_points[pos_bbox_inds] + pos_decoded_bbox_preds = self.bbox_coder.decode( + pos_points, pos_bbox_preds) + pos_decoded_target_preds = self.bbox_coder.decode( + pos_points, pos_bbox_targets) + bbox_loss = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + weight=bbox_weight_map, + avg_factor=num_pos_bbox) + else: + bbox_loss = flatten_bbox_preds.sum() * 0 + + return dict(loss_cls=cls_loss, loss_bbox=bbox_loss) + + def get_targets( + self, + points: List[Tensor], + batch_gt_instances: InstanceList, + ) -> Tuple[Tensor, Tensor]: + """Compute classification and bbox targets for points in multiple + images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: Targets of each level. + + - concat_lvl_labels (Tensor): Labels of all level and batch. + - concat_lvl_bbox_targets (Tensor): BBox targets of all \ + level and batch. + """ + assert len(points) == len(self.regress_ranges) + + num_levels = len(points) + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + concat_strides = torch.cat([ + concat_points.new_ones(num_points[i]) * self.strides[i] + for i in range(num_levels) + ]) + + # get labels and bbox_targets of each image + cls_targets_list, bbox_targets_list = multi_apply( + self._get_targets_single, + batch_gt_instances, + points=concat_points, + regress_ranges=concat_regress_ranges, + strides=concat_strides) + + bbox_targets_list = _transpose(bbox_targets_list, num_points) + cls_targets_list = _transpose(cls_targets_list, num_points) + concat_lvl_bbox_targets = torch.cat(bbox_targets_list, 0) + concat_lvl_cls_targets = torch.cat(cls_targets_list, dim=0) + return concat_lvl_cls_targets, concat_lvl_bbox_targets + + def _get_targets_single(self, gt_instances: InstanceData, points: Tensor, + regress_ranges: Tensor, + strides: Tensor) -> Tuple[Tensor, Tensor]: + """Compute classification and bbox targets for a single image.""" + num_points = points.size(0) + num_gts = len(gt_instances) + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + + if num_gts == 0: + return gt_labels.new_full((num_points, + self.num_classes), + self.num_classes), \ + gt_bboxes.new_full((num_points, 4), -1) + + # Calculate the regression tblr target corresponding to all points + points = points[:, None].expand(num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + strides = strides[:, None, None].expand(num_points, num_gts, 2) + + bbox_target = bbox2distance(points, gt_bboxes) # M x N x 4 + + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_target.min(dim=2)[0] > 0 # M x N + + # condition2: Calculate the nearest points from + # the upper, lower, left and right ranges from + # the center of the gt bbox + centers = 
((gt_bboxes[..., [0, 1]] + gt_bboxes[..., [2, 3]]) / 2) + centers_discret = ((centers / strides).int() * strides).float() + \ + strides / 2 + + centers_discret_dist = points - centers_discret + dist_x = centers_discret_dist[..., 0].abs() + dist_y = centers_discret_dist[..., 1].abs() + inside_gt_center3x3_mask = (dist_x <= strides[..., 0]) & \ + (dist_y <= strides[..., 0]) + + # condition3: limit the regression range for each location + bbox_target_wh = bbox_target[..., :2] + bbox_target[..., 2:] + crit = (bbox_target_wh**2).sum(dim=2)**0.5 / 2 + inside_fpn_level_mask = (crit >= regress_ranges[:, [0]]) & \ + (crit <= regress_ranges[:, [1]]) + bbox_target_mask = inside_gt_bbox_mask & \ + inside_gt_center3x3_mask & \ + inside_fpn_level_mask + + # Calculate the distance weight map + gt_center_peak_mask = ((centers_discret_dist**2).sum(dim=2) == 0) + weighted_dist = ((points - centers)**2).sum(dim=2) # M x N + weighted_dist[gt_center_peak_mask] = 0 + + areas = (gt_bboxes[..., 2] - gt_bboxes[..., 0]) * ( + gt_bboxes[..., 3] - gt_bboxes[..., 1]) + radius = self.delta**2 * 2 * areas + radius = torch.clamp(radius, min=self.hm_min_radius**2) + weighted_dist = weighted_dist / radius + + # Calculate bbox_target + bbox_weighted_dist = weighted_dist.clone() + bbox_weighted_dist[bbox_target_mask == 0] = INF * 1.0 + min_dist, min_inds = bbox_weighted_dist.min(dim=1) + bbox_target = bbox_target[range(len(bbox_target)), + min_inds] # M x N x 4 --> M x 4 + bbox_target[min_dist == INF] = -INF + + # Convert to feature map scale + bbox_target /= strides[:, 0, :].repeat(1, 2) + + # Calculate cls_target + cls_target = self._create_heatmaps_from_dist(weighted_dist, gt_labels) + + return cls_target, bbox_target + + @torch.no_grad() + def add_cls_pos_inds( + self, flatten_points: Tensor, flatten_bbox_preds: Tensor, + featmap_sizes: Tensor, batch_gt_instances: InstanceList + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + """Provide additional adaptive positive samples to the classification + branch. + + Args: + flatten_points (Tensor): The point after flatten, including + batch image and all levels. The shape is (N, 2). + flatten_bbox_preds (Tensor): The bbox predicts after flatten, + including batch image and all levels. The shape is (N, 4). + featmap_sizes (Tensor): Feature map size of all layers. + The shape is (5, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: + + - pos_inds (Tensor): Adaptively selected positive sample index. + - cls_labels (Tensor): Corresponding positive class label. 
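The target assignment above turns each gt box into a soft classification target of the form exp(-||p - c||^2 / r), where the radius r grows with the box area via delta and is clamped at hm_min_radius**2. A numeric sketch of that weighting with a made-up box and point:

    import torch

    hm_min_overlap, hm_min_radius = 0.8, 4
    delta = (1 - hm_min_overlap) / (1 + hm_min_overlap)       # ~0.111
    area = torch.tensor(96. * 64.)                            # a 96x64 gt box
    radius = (delta ** 2 * 2 * area).clamp(min=hm_min_radius ** 2)
    point, center = torch.tensor([30., 20.]), torch.tensor([24., 16.])
    weighted_dist = ((point - center) ** 2).sum() / radius
    print(torch.exp(-weighted_dist).item())                   # ~0.71, a soft target in (0, 1]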
+ """ + outputs = self._get_center3x3_region_index_targets( + batch_gt_instances, featmap_sizes) + cls_labels, fpn_level_masks, center3x3_inds, \ + center3x3_bbox_targets, center3x3_masks = outputs + + num_gts, total_level, K = cls_labels.shape[0], len( + self.strides), center3x3_masks.shape[-1] + + if num_gts == 0: + return None, None + + # The out-of-bounds index is forcibly set to 0 + # to prevent loss calculation errors + center3x3_inds[center3x3_masks == 0] = 0 + reg_pred_center3x3 = flatten_bbox_preds[center3x3_inds] + center3x3_points = flatten_points[center3x3_inds].view(-1, 2) + + center3x3_bbox_targets_expand = center3x3_bbox_targets.view( + -1, 4).clamp(min=0) + + pos_decoded_bbox_preds = self.bbox_coder.decode( + center3x3_points, reg_pred_center3x3.view(-1, 4)) + pos_decoded_target_preds = self.bbox_coder.decode( + center3x3_points, center3x3_bbox_targets_expand) + center3x3_bbox_loss = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + None, + reduction_override='none').view(num_gts, total_level, + K) / self.loss_bbox.loss_weight + + # Invalid index Loss set to infinity + center3x3_bbox_loss[center3x3_masks == 0] = INF + + # 4 is the center point of the sampled 9 points, the center point + # of gt bbox after discretization. + # The center point of gt bbox after discretization + # must be a positive sample, so we force its loss to be set to 0. + center3x3_bbox_loss.view(-1, K)[fpn_level_masks.view(-1), 4] = 0 + center3x3_bbox_loss = center3x3_bbox_loss.view(num_gts, -1) + + loss_thr = torch.kthvalue( + center3x3_bbox_loss, self.more_pos_topk, dim=1)[0] + + loss_thr[loss_thr > self.more_pos_thresh] = self.more_pos_thresh + new_pos = center3x3_bbox_loss < loss_thr.view(num_gts, 1) + pos_inds = center3x3_inds.view(num_gts, -1)[new_pos] + cls_labels = cls_labels.view(num_gts, + 1).expand(num_gts, + total_level * K)[new_pos] + return pos_inds, cls_labels + + def _create_heatmaps_from_dist(self, weighted_dist: Tensor, + cls_labels: Tensor) -> Tensor: + """Generate heatmaps of classification branch based on weighted + distance map.""" + heatmaps = weighted_dist.new_zeros( + (weighted_dist.shape[0], self.num_classes)) + for c in range(self.num_classes): + inds = (cls_labels == c) # N + if inds.int().sum() == 0: + continue + heatmaps[:, c] = torch.exp(-weighted_dist[:, inds].min(dim=1)[0]) + zeros = heatmaps[:, c] < 1e-4 + heatmaps[zeros, c] = 0 + return heatmaps + + def _get_center3x3_region_index_targets(self, + bacth_gt_instances: InstanceList, + shapes_per_level: Tensor) -> tuple: + """Get the center (and the 3x3 region near center) locations and target + of each objects.""" + cls_labels = [] + inside_fpn_level_masks = [] + center3x3_inds = [] + center3x3_masks = [] + center3x3_bbox_targets = [] + + total_levels = len(self.strides) + batch = len(bacth_gt_instances) + + shapes_per_level = shapes_per_level.long() + area_per_level = (shapes_per_level[:, 0] * shapes_per_level[:, 1]) + + # Select a total of 9 positions of 3x3 in the center of the gt bbox + # as candidate positive samples + K = 9 + dx = shapes_per_level.new_tensor([-1, 0, 1, -1, 0, 1, -1, 0, + 1]).view(1, 1, K) + dy = shapes_per_level.new_tensor([-1, -1, -1, 0, 0, 0, 1, 1, + 1]).view(1, 1, K) + + regress_ranges = shapes_per_level.new_tensor(self.regress_ranges).view( + len(self.regress_ranges), 2) # L x 2 + strides = shapes_per_level.new_tensor(self.strides) + + start_coord_pre_level = [] + _start = 0 + for level in range(total_levels): + start_coord_pre_level.append(_start) + _start = _start + batch 
* area_per_level[level] + start_coord_pre_level = shapes_per_level.new_tensor( + start_coord_pre_level).view(1, total_levels, 1) + area_per_level = area_per_level.view(1, total_levels, 1) + + for im_i in range(batch): + gt_instance = bacth_gt_instances[im_i] + gt_bboxes = gt_instance.bboxes + gt_labels = gt_instance.labels + num_gts = gt_bboxes.shape[0] + if num_gts == 0: + continue + + cls_labels.append(gt_labels) + + gt_bboxes = gt_bboxes[:, None].expand(num_gts, total_levels, 4) + expanded_strides = strides[None, :, + None].expand(num_gts, total_levels, 2) + expanded_regress_ranges = regress_ranges[None].expand( + num_gts, total_levels, 2) + expanded_shapes_per_level = shapes_per_level[None].expand( + num_gts, total_levels, 2) + + # calc reg_target + centers = ((gt_bboxes[..., [0, 1]] + gt_bboxes[..., [2, 3]]) / 2) + centers_inds = (centers / expanded_strides).long() + centers_discret = centers_inds * expanded_strides \ + + expanded_strides // 2 + + bbox_target = bbox2distance(centers_discret, + gt_bboxes) # M x N x 4 + + # calc inside_fpn_level_mask + bbox_target_wh = bbox_target[..., :2] + bbox_target[..., 2:] + crit = (bbox_target_wh**2).sum(dim=2)**0.5 / 2 + inside_fpn_level_mask = \ + (crit >= expanded_regress_ranges[..., 0]) & \ + (crit <= expanded_regress_ranges[..., 1]) + + inside_gt_bbox_mask = bbox_target.min(dim=2)[0] >= 0 + inside_fpn_level_mask = inside_gt_bbox_mask & inside_fpn_level_mask + inside_fpn_level_masks.append(inside_fpn_level_mask) + + # calc center3x3_ind and mask + expand_ws = expanded_shapes_per_level[..., 1:2].expand( + num_gts, total_levels, K) + expand_hs = expanded_shapes_per_level[..., 0:1].expand( + num_gts, total_levels, K) + centers_inds_x = centers_inds[..., 0:1] + centers_inds_y = centers_inds[..., 1:2] + + center3x3_idx = start_coord_pre_level + \ + im_i * area_per_level + \ + (centers_inds_y + dy) * expand_ws + \ + (centers_inds_x + dx) + center3x3_mask = \ + ((centers_inds_y + dy) < expand_hs) & \ + ((centers_inds_y + dy) >= 0) & \ + ((centers_inds_x + dx) < expand_ws) & \ + ((centers_inds_x + dx) >= 0) + + # recalc center3x3 region reg target + bbox_target = bbox_target / expanded_strides.repeat(1, 1, 2) + center3x3_bbox_target = bbox_target[..., None, :].expand( + num_gts, total_levels, K, 4).clone() + center3x3_bbox_target[..., 0] += dx + center3x3_bbox_target[..., 1] += dy + center3x3_bbox_target[..., 2] -= dx + center3x3_bbox_target[..., 3] -= dy + # update center3x3_mask + center3x3_mask = center3x3_mask & ( + center3x3_bbox_target.min(dim=3)[0] >= 0) # n x L x K + + center3x3_inds.append(center3x3_idx) + center3x3_masks.append(center3x3_mask) + center3x3_bbox_targets.append(center3x3_bbox_target) + + if len(inside_fpn_level_masks) > 0: + cls_labels = torch.cat(cls_labels, dim=0) + inside_fpn_level_masks = torch.cat(inside_fpn_level_masks, dim=0) + center3x3_inds = torch.cat(center3x3_inds, dim=0).long() + center3x3_bbox_targets = torch.cat(center3x3_bbox_targets, dim=0) + center3x3_masks = torch.cat(center3x3_masks, dim=0) + else: + cls_labels = shapes_per_level.new_zeros(0).long() + inside_fpn_level_masks = shapes_per_level.new_zeros( + (0, total_levels)).bool() + center3x3_inds = shapes_per_level.new_zeros( + (0, total_levels, K)).long() + center3x3_bbox_targets = shapes_per_level.new_zeros( + (0, total_levels, K, 4)).float() + center3x3_masks = shapes_per_level.new_zeros( + (0, total_levels, K)).bool() + return cls_labels, inside_fpn_level_masks, center3x3_inds, \ + center3x3_bbox_targets, center3x3_masks diff --git 
a/mmdetection/mmdet/models/dense_heads/centripetal_head.py b/mmdetection/mmdet/models/dense_heads/centripetal_head.py new file mode 100644 index 0000000..18f6601 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/centripetal_head.py @@ -0,0 +1,459 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import DeformConv2d +from mmengine.model import normal_init +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, OptInstanceList, + OptMultiConfig) +from ..utils import multi_apply +from .corner_head import CornerHead + + +@MODELS.register_module() +class CentripetalHead(CornerHead): + """Head of CentripetalNet: Pursuing High-quality Keypoint Pairs for Object + Detection. + + CentripetalHead inherits from :class:`CornerHead`. It removes the + embedding branch and adds guiding shift and centripetal shift branches. + More details can be found in the `paper + `_ . + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + num_feat_levels (int): Levels of feature from the previous module. + 2 for HourglassNet-104 and 1 for HourglassNet-52. HourglassNet-104 + outputs the final feature and intermediate supervision feature and + HourglassNet-52 only outputs the final feature. Defaults to 2. + corner_emb_channels (int): Channel of embedding vector. Defaults to 1. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Useless in CornerHead, but we keep this variable for + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + CornerHead. + loss_heatmap (:obj:`ConfigDict` or dict): Config of corner heatmap + loss. Defaults to GaussianFocalLoss. + loss_embedding (:obj:`ConfigDict` or dict): Config of corner embedding + loss. Defaults to AssociativeEmbeddingLoss. + loss_offset (:obj:`ConfigDict` or dict): Config of corner offset loss. + Defaults to SmoothL1Loss. + loss_guiding_shift (:obj:`ConfigDict` or dict): Config of + guiding shift loss. Defaults to SmoothL1Loss. + loss_centripetal_shift (:obj:`ConfigDict` or dict): Config of + centripetal shift loss. Defaults to SmoothL1Loss. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. 
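An illustrative config stanza for plugging this head into an mmdet detector. The two shift-loss entries mirror the defaults documented above; num_classes, in_channels and the heatmap/offset loss weights are placeholder values, not read from a particular config file:

    bbox_head = dict(
        type='CentripetalHead',
        num_classes=80,
        in_channels=256,
        num_feat_levels=2,   # 2 for HourglassNet-104, 1 for HourglassNet-52
        loss_heatmap=dict(type='GaussianFocalLoss', loss_weight=1.0),
        loss_offset=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
        loss_guiding_shift=dict(type='SmoothL1Loss', beta=1.0, loss_weight=0.05),
        loss_centripetal_shift=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))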
+ """ + + def __init__(self, + *args, + centripetal_shift_channels: int = 2, + guiding_shift_channels: int = 2, + feat_adaption_conv_kernel: int = 3, + loss_guiding_shift: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=0.05), + loss_centripetal_shift: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1), + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + assert centripetal_shift_channels == 2, ( + 'CentripetalHead only support centripetal_shift_channels == 2') + self.centripetal_shift_channels = centripetal_shift_channels + assert guiding_shift_channels == 2, ( + 'CentripetalHead only support guiding_shift_channels == 2') + self.guiding_shift_channels = guiding_shift_channels + self.feat_adaption_conv_kernel = feat_adaption_conv_kernel + super().__init__(*args, init_cfg=init_cfg, **kwargs) + self.loss_guiding_shift = MODELS.build(loss_guiding_shift) + self.loss_centripetal_shift = MODELS.build(loss_centripetal_shift) + + def _init_centripetal_layers(self) -> None: + """Initialize centripetal layers. + + Including feature adaption deform convs (feat_adaption), deform offset + prediction convs (dcn_off), guiding shift (guiding_shift) and + centripetal shift ( centripetal_shift). Each branch has two parts: + prefix `tl_` for top-left and `br_` for bottom-right. + """ + self.tl_feat_adaption = nn.ModuleList() + self.br_feat_adaption = nn.ModuleList() + self.tl_dcn_offset = nn.ModuleList() + self.br_dcn_offset = nn.ModuleList() + self.tl_guiding_shift = nn.ModuleList() + self.br_guiding_shift = nn.ModuleList() + self.tl_centripetal_shift = nn.ModuleList() + self.br_centripetal_shift = nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_feat_adaption.append( + DeformConv2d(self.in_channels, self.in_channels, + self.feat_adaption_conv_kernel, 1, 1)) + self.br_feat_adaption.append( + DeformConv2d(self.in_channels, self.in_channels, + self.feat_adaption_conv_kernel, 1, 1)) + + self.tl_guiding_shift.append( + self._make_layers( + out_channels=self.guiding_shift_channels, + in_channels=self.in_channels)) + self.br_guiding_shift.append( + self._make_layers( + out_channels=self.guiding_shift_channels, + in_channels=self.in_channels)) + + self.tl_dcn_offset.append( + ConvModule( + self.guiding_shift_channels, + self.feat_adaption_conv_kernel**2 * + self.guiding_shift_channels, + 1, + bias=False, + act_cfg=None)) + self.br_dcn_offset.append( + ConvModule( + self.guiding_shift_channels, + self.feat_adaption_conv_kernel**2 * + self.guiding_shift_channels, + 1, + bias=False, + act_cfg=None)) + + self.tl_centripetal_shift.append( + self._make_layers( + out_channels=self.centripetal_shift_channels, + in_channels=self.in_channels)) + self.br_centripetal_shift.append( + self._make_layers( + out_channels=self.centripetal_shift_channels, + in_channels=self.in_channels)) + + def _init_layers(self) -> None: + """Initialize layers for CentripetalHead. 
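The wiring set up in _init_centripetal_layers above (pooled corner feature -> 2-channel guiding shift -> kernel**2 * 2 = 18 DCN offsets -> deformable feature adaption) can be sanity-checked in a few lines. torchvision's deform_conv2d is used here purely as a stand-in for mmcv.ops.DeformConv2d, and all sizes are illustrative:

    import torch
    import torch.nn as nn
    from torchvision.ops import deform_conv2d   # stand-in for mmcv.ops.DeformConv2d

    feat_channels, kernel = 256, 3
    pool_feat = torch.randn(1, feat_channels, 32, 32)              # e.g. tl_pool
    guiding_shift = nn.Conv2d(feat_channels, 2, 3, padding=1)(pool_feat)
    # 1x1 conv maps the 2-channel guiding shift to the 18 offsets a 3x3 deformable conv expects
    dcn_offset = nn.Conv2d(2, kernel * kernel * 2, 1, bias=False)(guiding_shift.detach())
    weight = torch.randn(feat_channels, feat_channels, kernel, kernel)
    adapted = deform_conv2d(pool_feat, dcn_offset, weight, padding=(1, 1))
    print(adapted.shape)                                           # torch.Size([1, 256, 32, 32])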
+ + Including two parts: CornerHead layers and CentripetalHead layers + """ + super()._init_layers() # using _init_layers in CornerHead + self._init_centripetal_layers() + + def init_weights(self) -> None: + super().init_weights() + for i in range(self.num_feat_levels): + normal_init(self.tl_feat_adaption[i], std=0.01) + normal_init(self.br_feat_adaption[i], std=0.01) + normal_init(self.tl_dcn_offset[i].conv, std=0.1) + normal_init(self.br_dcn_offset[i].conv, std=0.1) + _ = [x.conv.reset_parameters() for x in self.tl_guiding_shift[i]] + _ = [x.conv.reset_parameters() for x in self.br_guiding_shift[i]] + _ = [ + x.conv.reset_parameters() for x in self.tl_centripetal_shift[i] + ] + _ = [ + x.conv.reset_parameters() for x in self.br_centripetal_shift[i] + ] + + def forward_single(self, x: Tensor, lvl_ind: int) -> List[Tensor]: + """Forward feature of a single level. + + Args: + x (Tensor): Feature of a single level. + lvl_ind (int): Level index of current feature. + + Returns: + tuple[Tensor]: A tuple of CentripetalHead's output for current + feature level. Containing the following Tensors: + + - tl_heat (Tensor): Predicted top-left corner heatmap. + - br_heat (Tensor): Predicted bottom-right corner heatmap. + - tl_off (Tensor): Predicted top-left offset heatmap. + - br_off (Tensor): Predicted bottom-right offset heatmap. + - tl_guiding_shift (Tensor): Predicted top-left guiding shift + heatmap. + - br_guiding_shift (Tensor): Predicted bottom-right guiding + shift heatmap. + - tl_centripetal_shift (Tensor): Predicted top-left centripetal + shift heatmap. + - br_centripetal_shift (Tensor): Predicted bottom-right + centripetal shift heatmap. + """ + tl_heat, br_heat, _, _, tl_off, br_off, tl_pool, br_pool = super( + ).forward_single( + x, lvl_ind, return_pool=True) + + tl_guiding_shift = self.tl_guiding_shift[lvl_ind](tl_pool) + br_guiding_shift = self.br_guiding_shift[lvl_ind](br_pool) + + tl_dcn_offset = self.tl_dcn_offset[lvl_ind](tl_guiding_shift.detach()) + br_dcn_offset = self.br_dcn_offset[lvl_ind](br_guiding_shift.detach()) + + tl_feat_adaption = self.tl_feat_adaption[lvl_ind](tl_pool, + tl_dcn_offset) + br_feat_adaption = self.br_feat_adaption[lvl_ind](br_pool, + br_dcn_offset) + + tl_centripetal_shift = self.tl_centripetal_shift[lvl_ind]( + tl_feat_adaption) + br_centripetal_shift = self.br_centripetal_shift[lvl_ind]( + br_feat_adaption) + + result_list = [ + tl_heat, br_heat, tl_off, br_off, tl_guiding_shift, + br_guiding_shift, tl_centripetal_shift, br_centripetal_shift + ] + return result_list + + def loss_by_feat( + self, + tl_heats: List[Tensor], + br_heats: List[Tensor], + tl_offs: List[Tensor], + br_offs: List[Tensor], + tl_guiding_shifts: List[Tensor], + br_guiding_shifts: List[Tensor], + tl_centripetal_shifts: List[Tensor], + br_centripetal_shifts: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). 
+ tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each + level with shape (N, guiding_shift_channels, H, W). + br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for + each level with shape (N, guiding_shift_channels, H, W). + tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts + for each level with shape (N, centripetal_shift_channels, H, + W). + br_centripetal_shifts (list[Tensor]): Bottom-right centripetal + shifts for each level with shape (N, + centripetal_shift_channels, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Specify which bounding boxes can be ignored when computing + the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. Containing the + following losses: + + - det_loss (list[Tensor]): Corner keypoint losses of all + feature levels. + - off_loss (list[Tensor]): Corner offset losses of all feature + levels. + - guiding_loss (list[Tensor]): Guiding shift losses of all + feature levels. + - centripetal_loss (list[Tensor]): Centripetal shift losses of + all feature levels. + """ + gt_bboxes = [ + gt_instances.bboxes for gt_instances in batch_gt_instances + ] + gt_labels = [ + gt_instances.labels for gt_instances in batch_gt_instances + ] + + targets = self.get_targets( + gt_bboxes, + gt_labels, + tl_heats[-1].shape, + batch_img_metas[0]['batch_input_shape'], + with_corner_emb=self.with_corner_emb, + with_guiding_shift=True, + with_centripetal_shift=True) + mlvl_targets = [targets for _ in range(self.num_feat_levels)] + [det_losses, off_losses, guiding_losses, centripetal_losses + ] = multi_apply(self.loss_by_feat_single, tl_heats, br_heats, tl_offs, + br_offs, tl_guiding_shifts, br_guiding_shifts, + tl_centripetal_shifts, br_centripetal_shifts, + mlvl_targets) + loss_dict = dict( + det_loss=det_losses, + off_loss=off_losses, + guiding_loss=guiding_losses, + centripetal_loss=centripetal_losses) + return loss_dict + + def loss_by_feat_single(self, tl_hmp: Tensor, br_hmp: Tensor, + tl_off: Tensor, br_off: Tensor, + tl_guiding_shift: Tensor, br_guiding_shift: Tensor, + tl_centripetal_shift: Tensor, + br_centripetal_shift: Tensor, + targets: dict) -> Tuple[Tensor, ...]: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + tl_hmp (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_hmp (Tensor): Bottom-right corner heatmap for current level with + shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + tl_guiding_shift (Tensor): Top-left guiding shift for current level + with shape (N, guiding_shift_channels, H, W). + br_guiding_shift (Tensor): Bottom-right guiding shift for current + level with shape (N, guiding_shift_channels, H, W). + tl_centripetal_shift (Tensor): Top-left centripetal shift for + current level with shape (N, centripetal_shift_channels, H, W). + br_centripetal_shift (Tensor): Bottom-right centripetal shift for + current level with shape (N, centripetal_shift_channels, H, W). + targets (dict): Corner target generated by `get_targets`. 
+ + Returns: + tuple[torch.Tensor]: Losses of the head's different branches + containing the following losses: + + - det_loss (Tensor): Corner keypoint loss. + - off_loss (Tensor): Corner offset loss. + - guiding_loss (Tensor): Guiding shift loss. + - centripetal_loss (Tensor): Centripetal shift loss. + """ + targets['corner_embedding'] = None + + det_loss, _, _, off_loss = super().loss_by_feat_single( + tl_hmp, br_hmp, None, None, tl_off, br_off, targets) + + gt_tl_guiding_shift = targets['topleft_guiding_shift'] + gt_br_guiding_shift = targets['bottomright_guiding_shift'] + gt_tl_centripetal_shift = targets['topleft_centripetal_shift'] + gt_br_centripetal_shift = targets['bottomright_centripetal_shift'] + + gt_tl_heatmap = targets['topleft_heatmap'] + gt_br_heatmap = targets['bottomright_heatmap'] + # We only compute the offset loss at the real corner position. + # The value of real corner would be 1 in heatmap ground truth. + # The mask is computed in class agnostic mode and its shape is + # batch * 1 * width * height. + tl_mask = gt_tl_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_tl_heatmap) + br_mask = gt_br_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_br_heatmap) + + # Guiding shift loss + tl_guiding_loss = self.loss_guiding_shift( + tl_guiding_shift, + gt_tl_guiding_shift, + tl_mask, + avg_factor=tl_mask.sum()) + br_guiding_loss = self.loss_guiding_shift( + br_guiding_shift, + gt_br_guiding_shift, + br_mask, + avg_factor=br_mask.sum()) + guiding_loss = (tl_guiding_loss + br_guiding_loss) / 2.0 + # Centripetal shift loss + tl_centripetal_loss = self.loss_centripetal_shift( + tl_centripetal_shift, + gt_tl_centripetal_shift, + tl_mask, + avg_factor=tl_mask.sum()) + br_centripetal_loss = self.loss_centripetal_shift( + br_centripetal_shift, + gt_br_centripetal_shift, + br_mask, + avg_factor=br_mask.sum()) + centripetal_loss = (tl_centripetal_loss + br_centripetal_loss) / 2.0 + + return det_loss, off_loss, guiding_loss, centripetal_loss + + def predict_by_feat(self, + tl_heats: List[Tensor], + br_heats: List[Tensor], + tl_offs: List[Tensor], + br_offs: List[Tensor], + tl_guiding_shifts: List[Tensor], + br_guiding_shifts: List[Tensor], + tl_centripetal_shifts: List[Tensor], + br_centripetal_shifts: List[Tensor], + batch_img_metas: Optional[List[dict]] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each + level with shape (N, guiding_shift_channels, H, W). Useless in + this function, we keep this arg because it's the raw output + from CentripetalHead. + br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for + each level with shape (N, guiding_shift_channels, H, W). + Useless in this function, we keep this arg because it's the + raw output from CentripetalHead. + tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts + for each level with shape (N, centripetal_shift_channels, H, + W). 
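A toy check of the class-agnostic corner mask built in loss_by_feat_single above: only positions whose gt heatmap value is exactly 1 (real corner locations) receive weight in the guiding and centripetal shift losses:

    import torch

    gt_tl_heatmap = torch.zeros(1, 3, 4, 4)   # (N, num_classes, H, W)
    gt_tl_heatmap[0, 1, 2, 3] = 1.0           # one real top-left corner for class 1
    tl_mask = gt_tl_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as(gt_tl_heatmap)
    print(tl_mask.shape, tl_mask.sum().item())  # torch.Size([1, 1, 4, 4]) 1.0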
+ br_centripetal_shifts (list[Tensor]): Bottom-right centripetal + shifts for each level with shape (N, + centripetal_shift_channels, H, W). + batch_img_metas (list[dict], optional): Batch image meta info. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len( + batch_img_metas) + result_list = [] + for img_id in range(len(batch_img_metas)): + result_list.append( + self._predict_by_feat_single( + tl_heats[-1][img_id:img_id + 1, :], + br_heats[-1][img_id:img_id + 1, :], + tl_offs[-1][img_id:img_id + 1, :], + br_offs[-1][img_id:img_id + 1, :], + batch_img_metas[img_id], + tl_emb=None, + br_emb=None, + tl_centripetal_shift=tl_centripetal_shifts[-1][ + img_id:img_id + 1, :], + br_centripetal_shift=br_centripetal_shifts[-1][ + img_id:img_id + 1, :], + rescale=rescale, + with_nms=with_nms)) + + return result_list diff --git a/mmdetection/mmdet/models/dense_heads/condinst_head.py b/mmdetection/mmdet/models/dense_heads/condinst_head.py new file mode 100644 index 0000000..35a25e6 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/condinst_head.py @@ -0,0 +1,1226 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, kaiming_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import cat_boxes +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import MlvlPointGenerator +from ..utils import (aligned_bilinear, filter_scores_and_topk, multi_apply, + relative_coordinate_maps, select_single_mlvl) +from ..utils.misc import empty_instances +from .base_mask_head import BaseMaskHead +from .fcos_head import FCOSHead + +INF = 1e8 + + +@MODELS.register_module() +class CondInstBboxHead(FCOSHead): + """CondInst box head used in https://arxiv.org/abs/1904.02689. + + Note that CondInst Bbox Head is a extension of FCOS head. + Two differences are described as follows: + + 1. CondInst box head predicts a set of params for each instance. + 2. CondInst box head return the pos_gt_inds and pos_inds. + + Args: + num_params (int): Number of params for instance segmentation. + """ + + def __init__(self, *args, num_params: int = 169, **kwargs) -> None: + self.num_params = num_params + super().__init__(*args, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + super()._init_layers() + self.controller = nn.Conv2d( + self.feat_channels, self.num_params, 3, padding=1) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Forward features of a single scale level. 
+ + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox predictions, centerness + predictions and param predictions of input feature maps. + """ + cls_score, bbox_pred, cls_feat, reg_feat = \ + super(FCOSHead, self).forward_single(x) + if self.centerness_on_reg: + centerness = self.conv_centerness(reg_feat) + else: + centerness = self.conv_centerness(cls_feat) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + if self.norm_on_bbox: + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + else: + bbox_pred = bbox_pred.exp() + param_pred = self.controller(reg_feat) + return cls_score, bbox_pred, centerness, param_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + param_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + centernesses (list[Tensor]): centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + param_preds (List[Tensor]): param_pred for each scale level, each + is a 4D-tensor, the channel number is num_params. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + assert len(cls_scores) == len(bbox_preds) == len(centernesses) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + # Need stride for rel coord compute + all_level_points_strides = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device, + with_stride=True) + all_level_points = [i[:, :2] for i in all_level_points_strides] + all_level_strides = [i[:, 2] for i in all_level_points_strides] + labels, bbox_targets, pos_inds_list, pos_gt_inds_list = \ + self.get_targets(all_level_points, batch_gt_instances) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels >= 0) + & (flatten_labels < bg_class_ind)).nonzero().reshape(-1) + num_pos = torch.tensor( + len(pos_inds), dtype=torch.float, device=bbox_preds[0].device) + num_pos = max(reduce_mean(num_pos), 1.0) + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, avg_factor=num_pos) + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_centerness_targets = self.centerness_target(pos_bbox_targets) + # centerness weighted iou loss + centerness_denorm = max( + reduce_mean(pos_centerness_targets.sum().detach()), 1e-6) + + if len(pos_inds) > 0: + pos_points = flatten_points[pos_inds] + pos_decoded_bbox_preds = self.bbox_coder.decode( + pos_points, pos_bbox_preds) + pos_decoded_target_preds = self.bbox_coder.decode( + pos_points, pos_bbox_targets) + loss_bbox = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + weight=pos_centerness_targets, + avg_factor=centerness_denorm) + loss_centerness = self.loss_centerness( + pos_centerness, pos_centerness_targets, avg_factor=num_pos) + else: + loss_bbox = pos_bbox_preds.sum() + loss_centerness = pos_centerness.sum() + + self._raw_positive_infos.update(cls_scores=cls_scores) + self._raw_positive_infos.update(centernesses=centernesses) + self._raw_positive_infos.update(param_preds=param_preds) + self._raw_positive_infos.update(all_level_points=all_level_points) + self._raw_positive_infos.update(all_level_strides=all_level_strides) + self._raw_positive_infos.update(pos_gt_inds_list=pos_gt_inds_list) + self._raw_positive_infos.update(pos_inds_list=pos_inds_list) + + return dict( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_centerness=loss_centerness) + + def get_targets( + self, points: List[Tensor], batch_gt_instances: InstanceList + ) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]: + """Compute regression, classification and centerness targets for points + in multiple images. 
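The centerness_target call in loss_by_feat above is inherited from FCOSHead; the standard FCOS definition is reproduced here as a standalone sketch for reference, taking the (left, top, right, bottom) distances from a point to its gt box:

    import torch

    def centerness_target(ltrb: torch.Tensor) -> torch.Tensor:
        """ltrb: (N, 4) distances (left, top, right, bottom) to the gt box sides."""
        lr, tb = ltrb[:, [0, 2]], ltrb[:, [1, 3]]
        c = (lr.min(dim=-1)[0] / lr.max(dim=-1)[0]) * (tb.min(dim=-1)[0] / tb.max(dim=-1)[0])
        return torch.sqrt(c)

    print(centerness_target(torch.tensor([[4., 4., 4., 4.], [1., 8., 7., 2.]])))
    # tensor([1.0000, 0.1890])  -- centered points score 1, off-center points much less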
+ + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: Targets of each level. + + - concat_lvl_labels (list[Tensor]): Labels of each level. + - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. + - pos_inds_list (list[Tensor]): pos_inds of each image. + - pos_gt_inds_list (List[Tensor]): pos_gt_inds of each image. + """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + # get labels and bbox_targets of each image + labels_list, bbox_targets_list, pos_inds_list, pos_gt_inds_list = \ + multi_apply( + self._get_targets_single, + batch_gt_instances, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + labels_list = [labels.split(num_points, 0) for labels in labels_list] + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + + # concat per level image + concat_lvl_labels = [] + concat_lvl_bbox_targets = [] + for i in range(num_levels): + concat_lvl_labels.append( + torch.cat([labels[i] for labels in labels_list])) + bbox_targets = torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list]) + if self.norm_on_bbox: + bbox_targets = bbox_targets / self.strides[i] + concat_lvl_bbox_targets.append(bbox_targets) + return (concat_lvl_labels, concat_lvl_bbox_targets, pos_inds_list, + pos_gt_inds_list) + + def _get_targets_single( + self, gt_instances: InstanceData, points: Tensor, + regress_ranges: Tensor, num_points_per_lvl: List[int] + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Compute regression and classification targets for a single image.""" + num_points = points.size(0) + num_gts = len(gt_instances) + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + gt_masks = gt_instances.get('masks', None) + + if num_gts == 0: + return gt_labels.new_full((num_points,), self.num_classes), \ + gt_bboxes.new_zeros((num_points, 4)), \ + gt_bboxes.new_zeros((0,), dtype=torch.int64), \ + gt_bboxes.new_zeros((0,), dtype=torch.int64) + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # TODO: figure out why these two are different + # areas = areas[None].expand(num_points, num_gts) + areas = areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, :].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + if self.center_sampling: + # condition1: inside a `center bbox` + radius = self.center_sample_radius + # if gt_mask not None, use gt mask's centroid to 
determine + # the center region rather than gt_bbox center + if gt_masks is None: + center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2 + center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2 + else: + h, w = gt_masks.height, gt_masks.width + masks = gt_masks.to_tensor( + dtype=torch.bool, device=gt_bboxes.device) + yys = torch.arange( + 0, h, dtype=torch.float32, device=masks.device) + xxs = torch.arange( + 0, w, dtype=torch.float32, device=masks.device) + # m00/m10/m01 represent the moments of a contour + # centroid is computed by m00/m10 and m00/m01 + m00 = masks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6) + m10 = (masks * xxs).sum(dim=-1).sum(dim=-1) + m01 = (masks * yys[:, None]).sum(dim=-1).sum(dim=-1) + center_xs = m10 / m00 + center_ys = m01 / m00 + + center_xs = center_xs[None].expand(num_points, num_gts) + center_ys = center_ys[None].expand(num_points, num_gts) + center_gts = torch.zeros_like(gt_bboxes) + stride = center_xs.new_zeros(center_xs.shape) + + # project the points on current lvl back to the `original` sizes + lvl_begin = 0 + for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): + lvl_end = lvl_begin + num_points_lvl + stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius + lvl_begin = lvl_end + + x_mins = center_xs - stride + y_mins = center_ys - stride + x_maxs = center_xs + stride + y_maxs = center_ys + stride + center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0], + x_mins, gt_bboxes[..., 0]) + center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1], + y_mins, gt_bboxes[..., 1]) + center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2], + gt_bboxes[..., 2], x_maxs) + center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3], + gt_bboxes[..., 3], y_maxs) + + cb_dist_left = xs - center_gts[..., 0] + cb_dist_right = center_gts[..., 2] - xs + cb_dist_top = ys - center_gts[..., 1] + cb_dist_bottom = center_gts[..., 3] - ys + center_bbox = torch.stack( + (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + else: + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 + + # condition2: limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + (max_regress_distance >= regress_ranges[..., 0]) + & (max_regress_distance <= regress_ranges[..., 1])) + + # if there are still more than one objects for a location, + # we choose the one with minimal area + areas[inside_gt_bbox_mask == 0] = INF + areas[inside_regress_range == 0] = INF + min_area, min_area_inds = areas.min(dim=1) + + labels = gt_labels[min_area_inds] + labels[min_area == INF] = self.num_classes # set as BG + bbox_targets = bbox_targets[range(num_points), min_area_inds] + + # return pos_inds & pos_gt_inds + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().reshape(-1) + pos_gt_inds = min_area_inds[labels < self.num_classes] + return labels, bbox_targets, pos_inds, pos_gt_inds + + def get_positive_infos(self) -> InstanceList: + """Get positive information from sampling results. + + Returns: + list[:obj:`InstanceData`]: Positive information of each image, + usually including positive bboxes, positive labels, positive + priors, etc. 
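The centroid-from-moments trick in the center-sampling branch above reduces to two weighted sums over the binary mask. A toy example with an 8x8 mask:

    import torch

    mask = torch.zeros(8, 8, dtype=torch.bool)
    mask[2:5, 3:7] = True                        # a 3x4 blob
    ys, xs = torch.arange(8.), torch.arange(8.)
    m00 = mask.float().sum()                     # mask area
    m10 = (mask.float() * xs).sum()              # sum of x coordinates inside the mask
    m01 = (mask.float() * ys[:, None]).sum()     # sum of y coordinates inside the mask
    cx, cy = m10 / m00, m01 / m00
    print(cx.item(), cy.item())                  # 4.5 3.0, the blob's centroid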
+ """ + assert len(self._raw_positive_infos) > 0 + + pos_gt_inds_list = self._raw_positive_infos['pos_gt_inds_list'] + pos_inds_list = self._raw_positive_infos['pos_inds_list'] + num_imgs = len(pos_gt_inds_list) + + cls_score_list = [] + centerness_list = [] + param_pred_list = [] + point_list = [] + stride_list = [] + for cls_score_per_lvl, centerness_per_lvl, param_pred_per_lvl,\ + point_per_lvl, stride_per_lvl in \ + zip(self._raw_positive_infos['cls_scores'], + self._raw_positive_infos['centernesses'], + self._raw_positive_infos['param_preds'], + self._raw_positive_infos['all_level_points'], + self._raw_positive_infos['all_level_strides']): + cls_score_per_lvl = \ + cls_score_per_lvl.permute( + 0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes) + centerness_per_lvl = \ + centerness_per_lvl.permute( + 0, 2, 3, 1).reshape(num_imgs, -1, 1) + param_pred_per_lvl = \ + param_pred_per_lvl.permute( + 0, 2, 3, 1).reshape(num_imgs, -1, self.num_params) + point_per_lvl = point_per_lvl.unsqueeze(0).repeat(num_imgs, 1, 1) + stride_per_lvl = stride_per_lvl.unsqueeze(0).repeat(num_imgs, 1) + + cls_score_list.append(cls_score_per_lvl) + centerness_list.append(centerness_per_lvl) + param_pred_list.append(param_pred_per_lvl) + point_list.append(point_per_lvl) + stride_list.append(stride_per_lvl) + cls_scores = torch.cat(cls_score_list, dim=1) + centernesses = torch.cat(centerness_list, dim=1) + param_preds = torch.cat(param_pred_list, dim=1) + all_points = torch.cat(point_list, dim=1) + all_strides = torch.cat(stride_list, dim=1) + + positive_infos = [] + for i, (pos_gt_inds, + pos_inds) in enumerate(zip(pos_gt_inds_list, pos_inds_list)): + pos_info = InstanceData() + pos_info.points = all_points[i][pos_inds] + pos_info.strides = all_strides[i][pos_inds] + pos_info.scores = cls_scores[i][pos_inds] + pos_info.centernesses = centernesses[i][pos_inds] + pos_info.param_preds = param_preds[i][pos_inds] + pos_info.pos_assigned_gt_inds = pos_gt_inds + pos_info.pos_inds = pos_inds + positive_infos.append(pos_info) + return positive_infos + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + score_factors: Optional[List[Tensor]] = None, + param_preds: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + param_preds (list[Tensor], optional): Params for all scale + level, each is a 4D-tensor, has shape + (batch_size, num_priors * num_params, H, W) + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. 
+ Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + + if score_factors is None: + # e.g. Retina, FreeAnchor, Foveabox, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, AutoAssign, etc. + with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + all_level_points_strides = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device, + with_stride=True) + all_level_points = [i[:, :2] for i in all_level_points_strides] + all_level_strides = [i[:, 2] for i in all_level_points_strides] + + result_list = [] + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + cls_score_list = select_single_mlvl( + cls_scores, img_id, detach=True) + bbox_pred_list = select_single_mlvl( + bbox_preds, img_id, detach=True) + if with_score_factors: + score_factor_list = select_single_mlvl( + score_factors, img_id, detach=True) + else: + score_factor_list = [None for _ in range(num_levels)] + param_pred_list = select_single_mlvl( + param_preds, img_id, detach=True) + + results = self._predict_by_feat_single( + cls_score_list=cls_score_list, + bbox_pred_list=bbox_pred_list, + score_factor_list=score_factor_list, + param_pred_list=param_pred_list, + mlvl_points=all_level_points, + mlvl_strides=all_level_strides, + img_meta=img_meta, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + param_pred_list: List[Tensor], + mlvl_points: List[Tensor], + mlvl_strides: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + param_pred_list (List[Tensor]): Param predition from all scale + levels of a single image, each item has shape + (num_priors * num_params, H, W). + mlvl_points (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. + It has shape (num_priors, 2) + mlvl_strides (List[Tensor]): Each element in the list is + the stride of a single level in feature pyramid. + It has shape (num_priors, 1) + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. 
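The per-image loop in predict_by_feat above relies on select_single_mlvl; a simplified stand-in that conveys its behavior (pick image img_id from each level's batched tensor, optionally detached) looks like this:

    import torch

    def select_single_mlvl(mlvl_tensors, img_id, detach=True):
        """Simplified stand-in: one tensor per FPN level, batched along dim 0."""
        return [t[img_id].detach() if detach else t[img_id] for t in mlvl_tensors]

    levels = [torch.randn(2, 80, s, s) for s in (100, 50, 25)]
    per_img = select_single_mlvl(levels, img_id=0)
    print([t.shape for t in per_img])   # per-level maps for image 0, batch dim removed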
+ with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. + with_score_factors = True + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_param_preds = [] + mlvl_valid_points = [] + mlvl_valid_strides = [] + mlvl_scores = [] + mlvl_labels = [] + if with_score_factors: + mlvl_score_factors = [] + else: + mlvl_score_factors = None + for level_idx, (cls_score, bbox_pred, score_factor, + param_pred, points, strides) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + score_factor_list, param_pred_list, + mlvl_points, mlvl_strides)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) + if with_score_factors: + score_factor = score_factor.permute(1, 2, + 0).reshape(-1).sigmoid() + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = cls_score.softmax(-1)[:, :-1] + + param_pred = param_pred.permute(1, 2, + 0).reshape(-1, self.num_params) + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. 
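The per-level filtering that follows is a "threshold, then keep at most nms_pre candidates" step. A plain-torch sketch of that logic, as a simplified stand-in for mmdet's filter_scores_and_topk (the real helper also filters and returns the auxiliary dict of bbox_pred/param_pred/points/strides, omitted here):

    import torch

    scores = torch.rand(1000, 80)          # (num_priors, num_classes), already sigmoided
    score_thr, nms_pre = 0.05, 100
    cand = scores > score_thr              # boolean mask over (prior, class) pairs
    cand_scores = scores[cand]
    prior_idx, class_idx = cand.nonzero(as_tuple=True)
    if cand_scores.numel() > nms_pre:
        cand_scores, topk = cand_scores.topk(nms_pre)
        prior_idx, class_idx = prior_idx[topk], class_idx[topk]
    # prior_idx then indexes back into this level's bbox_pred / param_pred / points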
+ score_thr = cfg.get('score_thr', 0) + + results = filter_scores_and_topk( + scores, score_thr, nms_pre, + dict( + bbox_pred=bbox_pred, + param_pred=param_pred, + points=points, + strides=strides)) + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + param_pred = filtered_results['param_pred'] + points = filtered_results['points'] + strides = filtered_results['strides'] + + if with_score_factors: + score_factor = score_factor[keep_idxs] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_param_preds.append(param_pred) + mlvl_valid_points.append(points) + mlvl_valid_strides.append(strides) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + if with_score_factors: + mlvl_score_factors.append(score_factor) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_points) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + results.param_preds = torch.cat(mlvl_param_preds) + results.points = torch.cat(mlvl_valid_points) + results.strides = torch.cat(mlvl_valid_strides) + if with_score_factors: + results.score_factors = torch.cat(mlvl_score_factors) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + +class MaskFeatModule(BaseModule): + """CondInst mask feature map branch used in \ + https://arxiv.org/abs/1904.02689. + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + start_level (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + end_level (int): The ending feature map level from rpn that + will be used to predict the mask feature map. + out_channels (int): Number of output channels of the mask feature + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. + mask_stride (int): Downsample factor of the mask feature map output. + Defaults to 4. + num_stacked_convs (int): Number of convs in mask feature branch. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. 
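+
+    Example:
+        A minimal construction sketch; the argument values below are purely
+        illustrative and not taken from any particular config:
+
+        >>> mask_feat_module = MaskFeatModule(
+        ...     in_channels=256,
+        ...     feat_channels=128,
+        ...     start_level=0,
+        ...     end_level=2,
+        ...     out_channels=8)
+        >>> # forward() consumes the selected feature levels and returns one
+        >>> # map with `out_channels` channels at the spatial size of the
+        >>> # first selected level.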
+ """ + + def __init__(self, + in_channels: int, + feat_channels: int, + start_level: int, + end_level: int, + out_channels: int, + mask_stride: int = 4, + num_stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01) + ], + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.start_level = start_level + self.end_level = end_level + self.mask_stride = mask_stride + self.num_stacked_convs = num_stacked_convs + assert start_level >= 0 and end_level >= start_level + self.out_channels = out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.convs_all_levels = nn.ModuleList() + for i in range(self.start_level, self.end_level + 1): + convs_per_level = nn.Sequential() + convs_per_level.add_module( + f'conv{i}', + ConvModule( + self.in_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False, + bias=False)) + self.convs_all_levels.append(convs_per_level) + + conv_branch = [] + for _ in range(self.num_stacked_convs): + conv_branch.append( + ConvModule( + self.feat_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=False)) + self.conv_branch = nn.Sequential(*conv_branch) + + self.conv_pred = nn.Conv2d( + self.feat_channels, self.out_channels, 1, stride=1) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + super().init_weights() + kaiming_init(self.convs_all_levels, a=1, distribution='uniform') + kaiming_init(self.conv_branch, a=1, distribution='uniform') + kaiming_init(self.conv_pred, a=1, distribution='uniform') + + def forward(self, x: Tuple[Tensor]) -> Tensor: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + Tensor: The predicted mask feature map. + """ + inputs = x[self.start_level:self.end_level + 1] + assert len(inputs) == (self.end_level - self.start_level + 1) + feature_add_all_level = self.convs_all_levels[0](inputs[0]) + target_h, target_w = feature_add_all_level.size()[2:] + for i in range(1, len(inputs)): + input_p = inputs[i] + x_p = self.convs_all_levels[i](input_p) + h, w = x_p.size()[2:] + factor_h = target_h // h + factor_w = target_w // w + assert factor_h == factor_w + feature_per_level = aligned_bilinear(x_p, factor_h) + feature_add_all_level = feature_add_all_level + \ + feature_per_level + + feature_add_all_level = self.conv_branch(feature_add_all_level) + feature_pred = self.conv_pred(feature_add_all_level) + return feature_pred + + +@MODELS.register_module() +class CondInstMaskHead(BaseMaskHead): + """CondInst mask head used in https://arxiv.org/abs/1904.02689. + + This head outputs the mask for CondInst. + + Args: + mask_feature_head (dict): Config of CondInstMaskFeatHead. + num_layers (int): Number of dynamic conv layers. + feat_channels (int): Number of channels in the dynamic conv. + mask_out_stride (int): The stride of the mask feat. + size_of_interest (int): The size of the region used in rel coord. + max_masks_to_train (int): Maximum number of masks to train for + each image. + loss_segm (:obj:`ConfigDict` or dict, optional): Config of + segmentation loss. 
+ train_cfg (:obj:`ConfigDict` or dict, optional): Training config + of head. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + head. + """ + + def __init__(self, + mask_feature_head: ConfigType, + num_layers: int = 3, + feat_channels: int = 8, + mask_out_stride: int = 4, + size_of_interest: int = 8, + max_masks_to_train: int = -1, + topk_masks_per_img: int = -1, + loss_mask: ConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None) -> None: + super().__init__() + self.mask_feature_head = MaskFeatModule(**mask_feature_head) + self.mask_feat_stride = self.mask_feature_head.mask_stride + self.in_channels = self.mask_feature_head.out_channels + self.num_layers = num_layers + self.feat_channels = feat_channels + self.size_of_interest = size_of_interest + self.mask_out_stride = mask_out_stride + self.max_masks_to_train = max_masks_to_train + self.topk_masks_per_img = topk_masks_per_img + self.prior_generator = MlvlPointGenerator([self.mask_feat_stride]) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.loss_mask = MODELS.build(loss_mask) + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + weight_nums, bias_nums = [], [] + for i in range(self.num_layers): + if i == 0: + weight_nums.append((self.in_channels + 2) * self.feat_channels) + bias_nums.append(self.feat_channels) + elif i == self.num_layers - 1: + weight_nums.append(self.feat_channels * 1) + bias_nums.append(1) + else: + weight_nums.append(self.feat_channels * self.feat_channels) + bias_nums.append(self.feat_channels) + + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_params = sum(weight_nums) + sum(bias_nums) + + def parse_dynamic_params( + self, params: Tensor) -> Tuple[List[Tensor], List[Tensor]]: + """parse the dynamic params for dynamic conv.""" + num_insts = params.size(0) + params_splits = list( + torch.split_with_sizes( + params, self.weight_nums + self.bias_nums, dim=1)) + weight_splits = params_splits[:self.num_layers] + bias_splits = params_splits[self.num_layers:] + for i in range(self.num_layers): + if i < self.num_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + num_insts * self.in_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(num_insts * + self.in_channels) + else: + # out_channels x in_channels x 1 x 1 + weight_splits[i] = weight_splits[i].reshape( + num_insts * 1, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(num_insts) + + return weight_splits, bias_splits + + def dynamic_conv_forward(self, features: Tensor, weights: List[Tensor], + biases: List[Tensor], num_insts: int) -> Tensor: + """dynamic forward, each layer follow a relu.""" + n_layers = len(weights) + x = features + for i, (w, b) in enumerate(zip(weights, biases)): + x = F.conv2d(x, w, bias=b, stride=1, padding=0, groups=num_insts) + if i < n_layers - 1: + x = F.relu(x) + return x + + def forward(self, x: tuple, positive_infos: InstanceList) -> tuple: + """Forward feature from the upstream network to get prototypes and + linearly combine the prototypes, using masks coefficients, into + instance masks. Finally, crop the instance masks with given bboxes. + + Args: + x (Tuple[Tensor]): Feature from the upstream network, which is + a 4D-tensor. + positive_infos (List[:obj:``InstanceData``]): Positive information + that calculate from detect head. 
+ + Returns: + tuple: Predicted instance segmentation masks + """ + mask_feats = self.mask_feature_head(x) + return multi_apply(self.forward_single, mask_feats, positive_infos) + + def forward_single(self, mask_feat: Tensor, + positive_info: InstanceData) -> Tensor: + """Forward features of a each image.""" + pos_param_preds = positive_info.get('param_preds') + pos_points = positive_info.get('points') + pos_strides = positive_info.get('strides') + + num_inst = pos_param_preds.shape[0] + mask_feat = mask_feat[None].repeat(num_inst, 1, 1, 1) + _, _, H, W = mask_feat.size() + if num_inst == 0: + return (pos_param_preds.new_zeros((0, 1, H, W)), ) + + locations = self.prior_generator.single_level_grid_priors( + mask_feat.size()[2:], 0, device=mask_feat.device) + + rel_coords = relative_coordinate_maps(locations, pos_points, + pos_strides, + self.size_of_interest, + mask_feat.size()[2:]) + mask_head_inputs = torch.cat([rel_coords, mask_feat], dim=1) + mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W) + + weights, biases = self.parse_dynamic_params(pos_param_preds) + mask_preds = self.dynamic_conv_forward(mask_head_inputs, weights, + biases, num_inst) + mask_preds = mask_preds.reshape(-1, H, W) + mask_preds = aligned_bilinear( + mask_preds.unsqueeze(0), + int(self.mask_feat_stride / self.mask_out_stride)).squeeze(0) + + return (mask_preds, ) + + def loss_by_feat(self, mask_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], positive_infos: InstanceList, + **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (list[Tensor]): List of predicted masks, each has + shape (num_classes, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + positive_infos (List[:obj:``InstanceData``]): Information of + positive samples of each image that are assigned in detection + head. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert positive_infos is not None, \ + 'positive_infos should not be None in `CondInstMaskHead`' + losses = dict() + + loss_mask = 0. + num_imgs = len(mask_preds) + total_pos = 0 + + for idx in range(num_imgs): + (mask_pred, pos_mask_targets, num_pos) = \ + self._get_targets_single( + mask_preds[idx], batch_gt_instances[idx], + positive_infos[idx]) + # mask loss + total_pos += num_pos + if num_pos == 0 or pos_mask_targets is None: + loss = mask_pred.new_zeros(1).mean() + else: + loss = self.loss_mask( + mask_pred, pos_mask_targets, + reduction_override='none').sum() + loss_mask += loss + + if total_pos == 0: + total_pos += 1 # avoid nan + loss_mask = loss_mask / total_pos + losses.update(loss_mask=loss_mask) + return losses + + def _get_targets_single(self, mask_preds: Tensor, + gt_instances: InstanceData, + positive_info: InstanceData): + """Compute targets for predictions of single image. + + Args: + mask_preds (Tensor): Predicted prototypes with shape + (num_classes, H, W). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + positive_info (:obj:`InstanceData`): Information of positive + samples that are assigned in detection head. It usually + contains following keys. 
+ + - pos_assigned_gt_inds (Tensor): Assigner GT indexes of + positive proposals, has shape (num_pos, ) + - pos_inds (Tensor): Positive index of image, has + shape (num_pos, ). + - param_pred (Tensor): Positive param preditions + with shape (num_pos, num_params). + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - mask_preds (Tensor): Positive predicted mask with shape + (num_pos, mask_h, mask_w). + - pos_mask_targets (Tensor): Positive mask targets with shape + (num_pos, mask_h, mask_w). + - num_pos (int): Positive numbers. + """ + gt_bboxes = gt_instances.bboxes + device = gt_bboxes.device + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device).float() + + # process with mask targets + pos_assigned_gt_inds = positive_info.get('pos_assigned_gt_inds') + scores = positive_info.get('scores') + centernesses = positive_info.get('centernesses') + num_pos = pos_assigned_gt_inds.size(0) + + if gt_masks.size(0) == 0 or num_pos == 0: + return mask_preds, None, 0 + # Since we're producing (near) full image masks, + # it'd take too much vram to backprop on every single mask. + # Thus we select only a subset. + if (self.max_masks_to_train != -1) and \ + (num_pos > self.max_masks_to_train): + perm = torch.randperm(num_pos) + select = perm[:self.max_masks_to_train] + mask_preds = mask_preds[select] + pos_assigned_gt_inds = pos_assigned_gt_inds[select] + num_pos = self.max_masks_to_train + elif self.topk_masks_per_img != -1: + unique_gt_inds = pos_assigned_gt_inds.unique() + num_inst_per_gt = max( + int(self.topk_masks_per_img / len(unique_gt_inds)), 1) + + keep_mask_preds = [] + keep_pos_assigned_gt_inds = [] + for gt_ind in unique_gt_inds: + per_inst_pos_inds = (pos_assigned_gt_inds == gt_ind) + mask_preds_per_inst = mask_preds[per_inst_pos_inds] + gt_inds_per_inst = pos_assigned_gt_inds[per_inst_pos_inds] + if sum(per_inst_pos_inds) > num_inst_per_gt: + per_inst_scores = scores[per_inst_pos_inds].sigmoid().max( + dim=1)[0] + per_inst_centerness = centernesses[ + per_inst_pos_inds].sigmoid().reshape(-1, ) + select = (per_inst_scores * per_inst_centerness).topk( + k=num_inst_per_gt, dim=0)[1] + mask_preds_per_inst = mask_preds_per_inst[select] + gt_inds_per_inst = gt_inds_per_inst[select] + keep_mask_preds.append(mask_preds_per_inst) + keep_pos_assigned_gt_inds.append(gt_inds_per_inst) + mask_preds = torch.cat(keep_mask_preds) + pos_assigned_gt_inds = torch.cat(keep_pos_assigned_gt_inds) + num_pos = pos_assigned_gt_inds.size(0) + + # Follow the origin implement + start = int(self.mask_out_stride // 2) + gt_masks = gt_masks[:, start::self.mask_out_stride, + start::self.mask_out_stride] + gt_masks = gt_masks.gt(0.5).float() + pos_mask_targets = gt_masks[pos_assigned_gt_inds] + + return (mask_preds, pos_mask_targets, num_pos) + + def predict_by_feat(self, + mask_preds: List[Tensor], + results_list: InstanceList, + batch_img_metas: List[dict], + rescale: bool = True, + **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mask_preds (list[Tensor]): Predicted prototypes with shape + (num_classes, H, W). + results_list (List[:obj:``InstanceData``]): BBoxHead results. + batch_img_metas (list[dict]): Meta information of all images. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images.Each :obj:`InstanceData` usually contains + following keys. 
+ + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + assert len(mask_preds) == len(results_list) == len(batch_img_metas) + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + results = results_list[img_id] + bboxes = results.bboxes + mask_pred = mask_preds[img_id] + if bboxes.shape[0] == 0 or mask_pred.shape[0] == 0: + results_list[img_id] = empty_instances( + [img_meta], + bboxes.device, + task_type='mask', + instance_results=[results])[0] + else: + im_mask = self._predict_by_feat_single( + mask_preds=mask_pred, + bboxes=bboxes, + img_meta=img_meta, + rescale=rescale) + results.masks = im_mask + return results_list + + def _predict_by_feat_single(self, + mask_preds: Tensor, + bboxes: Tensor, + img_meta: dict, + rescale: bool, + cfg: OptConfigType = None): + """Transform a single image's features extracted from the head into + mask results. + + Args: + mask_preds (Tensor): Predicted prototypes, has shape [H, W, N]. + img_meta (dict): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If rescale is False, then returned masks will + fit the scale of imgs[0]. + cfg (dict, optional): Config used in test phase. + Defaults to None. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + cfg = self.test_cfg if cfg is None else cfg + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + img_h, img_w = img_meta['img_shape'][:2] + ori_h, ori_w = img_meta['ori_shape'][:2] + + mask_preds = mask_preds.sigmoid().unsqueeze(0) + mask_preds = aligned_bilinear(mask_preds, self.mask_out_stride) + mask_preds = mask_preds[:, :, :img_h, :img_w] + if rescale: # in-placed rescale the bboxes + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + bboxes /= scale_factor + + masks = F.interpolate( + mask_preds, (ori_h, ori_w), + mode='bilinear', + align_corners=False).squeeze(0) > cfg.mask_thr + else: + masks = mask_preds.squeeze(0) > cfg.mask_thr + + return masks diff --git a/mmdetection/mmdet/models/dense_heads/conditional_detr_head.py b/mmdetection/mmdet/models/dense_heads/conditional_detr_head.py new file mode 100644 index 0000000..cc2df2c --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/conditional_detr_head.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +import torch.nn as nn +from mmengine.model import bias_init_with_prob +from torch import Tensor + +from mmdet.models.layers.transformer import inverse_sigmoid +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList +from .detr_head import DETRHead + + +@MODELS.register_module() +class ConditionalDETRHead(DETRHead): + """Head of Conditional DETR. Conditional DETR: Conditional DETR for Fast + Training Convergence. More details can be found in the `paper. + + `_ . 
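+
+    Note:
+        This head reuses the layers built by :class:`DETRHead`; it only
+        overrides ``init_weights``, ``forward``, ``loss``,
+        ``loss_and_predict`` and ``predict`` so that the decoder
+        ``references`` can be added to the first two (cx, cy) channels of the
+        un-sigmoided regression output before the final sigmoid.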
+ """ + + def init_weights(self): + """Initialize weights of the transformer head.""" + super().init_weights() + # The initialization below for transformer head is very + # important as we use Focal_loss for loss_cls + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + nn.init.constant_(self.fc_cls.bias, bias_init) + + def forward(self, hidden_states: Tensor, + references: Tensor) -> Tuple[Tensor, Tensor]: + """"Forward function. + + Args: + hidden_states (Tensor): Features from transformer decoder. If + `return_intermediate_dec` is True output has shape + (num_decoder_layers, bs, num_queries, dim), else has shape (1, + bs, num_queries, dim) which only contains the last layer + outputs. + references (Tensor): References from transformer decoder, has + shape (bs, num_queries, 2). + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - layers_cls_scores (Tensor): Outputs from the classification head, + shape (num_decoder_layers, bs, num_queries, cls_out_channels). + Note cls_out_channels should include background. + - layers_bbox_preds (Tensor): Sigmoid outputs from the regression + head with normalized coordinate format (cx, cy, w, h), has shape + (num_decoder_layers, bs, num_queries, 4). + """ + + references_unsigmoid = inverse_sigmoid(references) + layers_bbox_preds = [] + for layer_id in range(hidden_states.shape[0]): + tmp_reg_preds = self.fc_reg( + self.activate(self.reg_ffn(hidden_states[layer_id]))) + tmp_reg_preds[..., :2] += references_unsigmoid + outputs_coord = tmp_reg_preds.sigmoid() + layers_bbox_preds.append(outputs_coord) + layers_bbox_preds = torch.stack(layers_bbox_preds) + + layers_cls_scores = self.fc_cls(hidden_states) + return layers_cls_scores, layers_bbox_preds + + def loss(self, hidden_states: Tensor, references: Tensor, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + hidden_states (Tensor): Features from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, dim). + references (Tensor): References from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, 2). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_and_predict( + self, hidden_states: Tensor, references: Tensor, + batch_data_samples: SampleList) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. Over-write because + img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (Tensor): Features from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, dim). + references (Tensor): References from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, 2). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. 
+ + Returns: + tuple: The return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas) + return losses, predictions + + def predict(self, + hidden_states: Tensor, + references: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. Over-write + because img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (Tensor): Features from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, dim). + references (Tensor): References from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, 2). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + last_layer_hidden_state = hidden_states[-1].unsqueeze(0) + outs = self(last_layer_hidden_state, references) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + + return predictions diff --git a/mmdetection/mmdet/models/dense_heads/corner_head.py b/mmdetection/mmdet/models/dense_heads/corner_head.py new file mode 100644 index 0000000..0cec71d --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/corner_head.py @@ -0,0 +1,1084 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from logging import warning +from math import ceil, log +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import CornerPool, batched_nms +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from ..utils import (gather_feat, gaussian_radius, gen_gaussian_target, + get_local_maximum, get_topk_from_heatmap, multi_apply, + transpose_and_gather_feat) +from .base_dense_head import BaseDenseHead + + +class BiCornerPool(BaseModule): + """Bidirectional Corner Pooling Module (TopLeft, BottomRight, etc.) + + Args: + in_channels (int): Input channels of module. + directions (list[str]): Directions of two CornerPools. + out_channels (int): Output channels of module. + feat_channels (int): Feature channels of module. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct + and config norm layer. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to + control the initialization. 
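+
+    Example:
+        An illustrative sketch only; it assumes the compiled
+        ``mmcv.ops.CornerPool`` operator is available in the environment:
+
+        >>> import torch
+        >>> tl_pool = BiCornerPool(
+        ...     in_channels=256, directions=['top', 'left'],
+        ...     feat_channels=128, out_channels=256)
+        >>> x = torch.rand(1, 256, 32, 32)
+        >>> out = tl_pool(x)
+        >>> assert out.shape == (1, 256, 32, 32)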
+ """ + + def __init__(self, + in_channels: int, + directions: List[int], + feat_channels: int = 128, + out_channels: int = 128, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg) + self.direction1_conv = ConvModule( + in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg) + self.direction2_conv = ConvModule( + in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg) + + self.aftpool_conv = ConvModule( + feat_channels, + out_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=None) + + self.conv1 = ConvModule( + in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.conv2 = ConvModule( + in_channels, out_channels, 3, padding=1, norm_cfg=norm_cfg) + + self.direction1_pool = CornerPool(directions[0]) + self.direction2_pool = CornerPool(directions[1]) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward features from the upstream network. + + Args: + x (tensor): Input feature of BiCornerPool. + + Returns: + conv2 (tensor): Output feature of BiCornerPool. + """ + direction1_conv = self.direction1_conv(x) + direction2_conv = self.direction2_conv(x) + direction1_feat = self.direction1_pool(direction1_conv) + direction2_feat = self.direction2_pool(direction2_conv) + aftpool_conv = self.aftpool_conv(direction1_feat + direction2_feat) + conv1 = self.conv1(x) + relu = self.relu(aftpool_conv + conv1) + conv2 = self.conv2(relu) + return conv2 + + +@MODELS.register_module() +class CornerHead(BaseDenseHead): + """Head of CornerNet: Detecting Objects as Paired Keypoints. + + Code is modified from the `official github repo + `_ . + + More details can be found in the `paper + `_ . + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + num_feat_levels (int): Levels of feature from the previous module. + 2 for HourglassNet-104 and 1 for HourglassNet-52. Because + HourglassNet-104 outputs the final feature and intermediate + supervision feature and HourglassNet-52 only outputs the final + feature. Defaults to 2. + corner_emb_channels (int): Channel of embedding vector. Defaults to 1. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Useless in CornerHead, but we keep this variable for + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + CornerHead. + loss_heatmap (:obj:`ConfigDict` or dict): Config of corner heatmap + loss. Defaults to GaussianFocalLoss. + loss_embedding (:obj:`ConfigDict` or dict): Config of corner embedding + loss. Defaults to AssociativeEmbeddingLoss. + loss_offset (:obj:`ConfigDict` or dict): Config of corner offset loss. + Defaults to SmoothL1Loss. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. 
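+
+    Example:
+        A construction sketch with illustrative values only; running the
+        forward pass additionally assumes the ``mmcv.ops.CornerPool``
+        operator is available:
+
+        >>> import torch
+        >>> corner_head = CornerHead(num_classes=80, in_channels=256,
+        ...                          num_feat_levels=2)
+        >>> feats = [torch.rand(1, 256, 32, 32) for _ in range(2)]
+        >>> outs = corner_head(feats)
+        >>> tl_heats, br_heats, tl_embs, br_embs, tl_offs, br_offs = outs
+        >>> assert tl_heats[0].shape == (1, 80, 32, 32)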
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + num_feat_levels: int = 2, + corner_emb_channels: int = 1, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + loss_heatmap: ConfigType = dict( + type='GaussianFocalLoss', + alpha=2.0, + gamma=4.0, + loss_weight=1), + loss_embedding: ConfigType = dict( + type='AssociativeEmbeddingLoss', + pull_weight=0.25, + push_weight=0.25), + loss_offset: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1), + init_cfg: OptMultiConfig = None) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.in_channels = in_channels + self.corner_emb_channels = corner_emb_channels + self.with_corner_emb = self.corner_emb_channels > 0 + self.corner_offset_channels = 2 + self.num_feat_levels = num_feat_levels + self.loss_heatmap = MODELS.build( + loss_heatmap) if loss_heatmap is not None else None + self.loss_embedding = MODELS.build( + loss_embedding) if loss_embedding is not None else None + self.loss_offset = MODELS.build( + loss_offset) if loss_offset is not None else None + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self._init_layers() + + def _make_layers(self, + out_channels: int, + in_channels: int = 256, + feat_channels: int = 256) -> nn.Sequential: + """Initialize conv sequential for CornerHead.""" + return nn.Sequential( + ConvModule(in_channels, feat_channels, 3, padding=1), + ConvModule( + feat_channels, out_channels, 1, norm_cfg=None, act_cfg=None)) + + def _init_corner_kpt_layers(self) -> None: + """Initialize corner keypoint layers. + + Including corner heatmap branch and corner offset branch. Each branch + has two parts: prefix `tl_` for top-left and `br_` for bottom-right. + """ + self.tl_pool, self.br_pool = nn.ModuleList(), nn.ModuleList() + self.tl_heat, self.br_heat = nn.ModuleList(), nn.ModuleList() + self.tl_off, self.br_off = nn.ModuleList(), nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_pool.append( + BiCornerPool( + self.in_channels, ['top', 'left'], + out_channels=self.in_channels)) + self.br_pool.append( + BiCornerPool( + self.in_channels, ['bottom', 'right'], + out_channels=self.in_channels)) + + self.tl_heat.append( + self._make_layers( + out_channels=self.num_classes, + in_channels=self.in_channels)) + self.br_heat.append( + self._make_layers( + out_channels=self.num_classes, + in_channels=self.in_channels)) + + self.tl_off.append( + self._make_layers( + out_channels=self.corner_offset_channels, + in_channels=self.in_channels)) + self.br_off.append( + self._make_layers( + out_channels=self.corner_offset_channels, + in_channels=self.in_channels)) + + def _init_corner_emb_layers(self) -> None: + """Initialize corner embedding layers. + + Only include corner embedding branch with two parts: prefix `tl_` for + top-left and `br_` for bottom-right. + """ + self.tl_emb, self.br_emb = nn.ModuleList(), nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_emb.append( + self._make_layers( + out_channels=self.corner_emb_channels, + in_channels=self.in_channels)) + self.br_emb.append( + self._make_layers( + out_channels=self.corner_emb_channels, + in_channels=self.in_channels)) + + def _init_layers(self) -> None: + """Initialize layers for CornerHead. 
+ + Including two parts: corner keypoint layers and corner embedding layers + """ + self._init_corner_kpt_layers() + if self.with_corner_emb: + self._init_corner_emb_layers() + + def init_weights(self) -> None: + super().init_weights() + bias_init = bias_init_with_prob(0.1) + for i in range(self.num_feat_levels): + # The initialization of parameters are different between + # nn.Conv2d and ConvModule. Our experiments show that + # using the original initialization of nn.Conv2d increases + # the final mAP by about 0.2% + self.tl_heat[i][-1].conv.reset_parameters() + self.tl_heat[i][-1].conv.bias.data.fill_(bias_init) + self.br_heat[i][-1].conv.reset_parameters() + self.br_heat[i][-1].conv.bias.data.fill_(bias_init) + self.tl_off[i][-1].conv.reset_parameters() + self.br_off[i][-1].conv.reset_parameters() + if self.with_corner_emb: + self.tl_emb[i][-1].conv.reset_parameters() + self.br_emb[i][-1].conv.reset_parameters() + + def forward(self, feats: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of corner heatmaps, offset heatmaps and + embedding heatmaps. + - tl_heats (list[Tensor]): Top-left corner heatmaps for all + levels, each is a 4D-tensor, the channels number is + num_classes. + - br_heats (list[Tensor]): Bottom-right corner heatmaps for all + levels, each is a 4D-tensor, the channels number is + num_classes. + - tl_embs (list[Tensor] | list[None]): Top-left embedding + heatmaps for all levels, each is a 4D-tensor or None. + If not None, the channels number is corner_emb_channels. + - br_embs (list[Tensor] | list[None]): Bottom-right embedding + heatmaps for all levels, each is a 4D-tensor or None. + If not None, the channels number is corner_emb_channels. + - tl_offs (list[Tensor]): Top-left offset heatmaps for all + levels, each is a 4D-tensor. The channels number is + corner_offset_channels. + - br_offs (list[Tensor]): Bottom-right offset heatmaps for all + levels, each is a 4D-tensor. The channels number is + corner_offset_channels. + """ + lvl_ind = list(range(self.num_feat_levels)) + return multi_apply(self.forward_single, feats, lvl_ind) + + def forward_single(self, + x: Tensor, + lvl_ind: int, + return_pool: bool = False) -> List[Tensor]: + """Forward feature of a single level. + + Args: + x (Tensor): Feature of a single level. + lvl_ind (int): Level index of current feature. + return_pool (bool): Return corner pool feature or not. + Defaults to False. + + Returns: + tuple[Tensor]: A tuple of CornerHead's output for current feature + level. Containing the following Tensors: + + - tl_heat (Tensor): Predicted top-left corner heatmap. + - br_heat (Tensor): Predicted bottom-right corner heatmap. + - tl_emb (Tensor | None): Predicted top-left embedding heatmap. + None for `self.with_corner_emb == False`. + - br_emb (Tensor | None): Predicted bottom-right embedding + heatmap. None for `self.with_corner_emb == False`. + - tl_off (Tensor): Predicted top-left offset heatmap. + - br_off (Tensor): Predicted bottom-right offset heatmap. + - tl_pool (Tensor): Top-left corner pool feature. Not must + have. + - br_pool (Tensor): Bottom-right corner pool feature. Not must + have. 
+ """ + tl_pool = self.tl_pool[lvl_ind](x) + tl_heat = self.tl_heat[lvl_ind](tl_pool) + br_pool = self.br_pool[lvl_ind](x) + br_heat = self.br_heat[lvl_ind](br_pool) + + tl_emb, br_emb = None, None + if self.with_corner_emb: + tl_emb = self.tl_emb[lvl_ind](tl_pool) + br_emb = self.br_emb[lvl_ind](br_pool) + + tl_off = self.tl_off[lvl_ind](tl_pool) + br_off = self.br_off[lvl_ind](br_pool) + + result_list = [tl_heat, br_heat, tl_emb, br_emb, tl_off, br_off] + if return_pool: + result_list.append(tl_pool) + result_list.append(br_pool) + + return result_list + + def get_targets(self, + gt_bboxes: List[Tensor], + gt_labels: List[Tensor], + feat_shape: Sequence[int], + img_shape: Sequence[int], + with_corner_emb: bool = False, + with_guiding_shift: bool = False, + with_centripetal_shift: bool = False) -> dict: + """Generate corner targets. + + Including corner heatmap, corner offset. + + Optional: corner embedding, corner guiding shift, centripetal shift. + + For CornerNet, we generate corner heatmap, corner offset and corner + embedding from this function. + + For CentripetalNet, we generate corner heatmap, corner offset, guiding + shift and centripetal shift from this function. + + Args: + gt_bboxes (list[Tensor]): Ground truth bboxes of each image, each + has shape (num_gt, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, each has + shape (num_gt, ). + feat_shape (Sequence[int]): Shape of output feature, + [batch, channel, height, width]. + img_shape (Sequence[int]): Shape of input image, + [height, width, channel]. + with_corner_emb (bool): Generate corner embedding target or not. + Defaults to False. + with_guiding_shift (bool): Generate guiding shift target or not. + Defaults to False. + with_centripetal_shift (bool): Generate centripetal shift target or + not. Defaults to False. + + Returns: + dict: Ground truth of corner heatmap, corner offset, corner + embedding, guiding shift and centripetal shift. Containing the + following keys: + + - topleft_heatmap (Tensor): Ground truth top-left corner + heatmap. + - bottomright_heatmap (Tensor): Ground truth bottom-right + corner heatmap. + - topleft_offset (Tensor): Ground truth top-left corner offset. + - bottomright_offset (Tensor): Ground truth bottom-right corner + offset. + - corner_embedding (list[list[list[int]]]): Ground truth corner + embedding. Not must have. + - topleft_guiding_shift (Tensor): Ground truth top-left corner + guiding shift. Not must have. + - bottomright_guiding_shift (Tensor): Ground truth bottom-right + corner guiding shift. Not must have. + - topleft_centripetal_shift (Tensor): Ground truth top-left + corner centripetal shift. Not must have. + - bottomright_centripetal_shift (Tensor): Ground truth + bottom-right corner centripetal shift. Not must have. 
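+
+        Example:
+            Illustrative numbers only: with a 512x512 input image and a
+            feature map of spatial size 128x128, ``width_ratio`` is 0.25, so
+            a ground-truth left edge at ``x = 101`` maps to
+            ``scale_left = 25.25``, falls into feature column
+            ``left_idx = 25``, and the remainder ``0.25`` is stored as the
+            top-left x-offset target at that cell.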
+ """ + batch_size, _, height, width = feat_shape + img_h, img_w = img_shape[:2] + + width_ratio = float(width / img_w) + height_ratio = float(height / img_h) + + gt_tl_heatmap = gt_bboxes[-1].new_zeros( + [batch_size, self.num_classes, height, width]) + gt_br_heatmap = gt_bboxes[-1].new_zeros( + [batch_size, self.num_classes, height, width]) + gt_tl_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width]) + gt_br_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width]) + + if with_corner_emb: + match = [] + + # Guiding shift is a kind of offset, from center to corner + if with_guiding_shift: + gt_tl_guiding_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + gt_br_guiding_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + # Centripetal shift is also a kind of offset, from center to corner + # and normalized by log. + if with_centripetal_shift: + gt_tl_centripetal_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + gt_br_centripetal_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + + for batch_id in range(batch_size): + # Ground truth of corner embedding per image is a list of coord set + corner_match = [] + for box_id in range(len(gt_labels[batch_id])): + left, top, right, bottom = gt_bboxes[batch_id][box_id] + center_x = (left + right) / 2.0 + center_y = (top + bottom) / 2.0 + label = gt_labels[batch_id][box_id] + + # Use coords in the feature level to generate ground truth + scale_left = left * width_ratio + scale_right = right * width_ratio + scale_top = top * height_ratio + scale_bottom = bottom * height_ratio + scale_center_x = center_x * width_ratio + scale_center_y = center_y * height_ratio + + # Int coords on feature map/ground truth tensor + left_idx = int(min(scale_left, width - 1)) + right_idx = int(min(scale_right, width - 1)) + top_idx = int(min(scale_top, height - 1)) + bottom_idx = int(min(scale_bottom, height - 1)) + + # Generate gaussian heatmap + scale_box_width = ceil(scale_right - scale_left) + scale_box_height = ceil(scale_bottom - scale_top) + radius = gaussian_radius((scale_box_height, scale_box_width), + min_overlap=0.3) + radius = max(0, int(radius)) + gt_tl_heatmap[batch_id, label] = gen_gaussian_target( + gt_tl_heatmap[batch_id, label], [left_idx, top_idx], + radius) + gt_br_heatmap[batch_id, label] = gen_gaussian_target( + gt_br_heatmap[batch_id, label], [right_idx, bottom_idx], + radius) + + # Generate corner offset + left_offset = scale_left - left_idx + top_offset = scale_top - top_idx + right_offset = scale_right - right_idx + bottom_offset = scale_bottom - bottom_idx + gt_tl_offset[batch_id, 0, top_idx, left_idx] = left_offset + gt_tl_offset[batch_id, 1, top_idx, left_idx] = top_offset + gt_br_offset[batch_id, 0, bottom_idx, right_idx] = right_offset + gt_br_offset[batch_id, 1, bottom_idx, + right_idx] = bottom_offset + + # Generate corner embedding + if with_corner_emb: + corner_match.append([[top_idx, left_idx], + [bottom_idx, right_idx]]) + # Generate guiding shift + if with_guiding_shift: + gt_tl_guiding_shift[batch_id, 0, top_idx, + left_idx] = scale_center_x - left_idx + gt_tl_guiding_shift[batch_id, 1, top_idx, + left_idx] = scale_center_y - top_idx + gt_br_guiding_shift[batch_id, 0, bottom_idx, + right_idx] = right_idx - scale_center_x + gt_br_guiding_shift[ + batch_id, 1, bottom_idx, + right_idx] = bottom_idx - scale_center_y + # Generate centripetal shift + if with_centripetal_shift: + gt_tl_centripetal_shift[batch_id, 0, top_idx, + left_idx] = 
log(scale_center_x - + scale_left) + gt_tl_centripetal_shift[batch_id, 1, top_idx, + left_idx] = log(scale_center_y - + scale_top) + gt_br_centripetal_shift[batch_id, 0, bottom_idx, + right_idx] = log(scale_right - + scale_center_x) + gt_br_centripetal_shift[batch_id, 1, bottom_idx, + right_idx] = log(scale_bottom - + scale_center_y) + + if with_corner_emb: + match.append(corner_match) + + target_result = dict( + topleft_heatmap=gt_tl_heatmap, + topleft_offset=gt_tl_offset, + bottomright_heatmap=gt_br_heatmap, + bottomright_offset=gt_br_offset) + + if with_corner_emb: + target_result.update(corner_embedding=match) + if with_guiding_shift: + target_result.update( + topleft_guiding_shift=gt_tl_guiding_shift, + bottomright_guiding_shift=gt_br_guiding_shift) + if with_centripetal_shift: + target_result.update( + topleft_centripetal_shift=gt_tl_centripetal_shift, + bottomright_centripetal_shift=gt_br_centripetal_shift) + + return target_result + + def loss_by_feat( + self, + tl_heats: List[Tensor], + br_heats: List[Tensor], + tl_embs: List[Tensor], + br_embs: List[Tensor], + tl_offs: List[Tensor], + br_offs: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_embs (list[Tensor]): Top-left corner embeddings for each level + with shape (N, corner_emb_channels, H, W). + br_embs (list[Tensor]): Bottom-right corner embeddings for each + level with shape (N, corner_emb_channels, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Specify which bounding boxes can be ignored when computing + the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. Containing the + following losses: + + - det_loss (list[Tensor]): Corner keypoint losses of all + feature levels. + - pull_loss (list[Tensor]): Part one of AssociativeEmbedding + losses of all feature levels. + - push_loss (list[Tensor]): Part two of AssociativeEmbedding + losses of all feature levels. + - off_loss (list[Tensor]): Corner offset losses of all feature + levels. 
+ """ + gt_bboxes = [ + gt_instances.bboxes for gt_instances in batch_gt_instances + ] + gt_labels = [ + gt_instances.labels for gt_instances in batch_gt_instances + ] + + targets = self.get_targets( + gt_bboxes, + gt_labels, + tl_heats[-1].shape, + batch_img_metas[0]['batch_input_shape'], + with_corner_emb=self.with_corner_emb) + mlvl_targets = [targets for _ in range(self.num_feat_levels)] + det_losses, pull_losses, push_losses, off_losses = multi_apply( + self.loss_by_feat_single, tl_heats, br_heats, tl_embs, br_embs, + tl_offs, br_offs, mlvl_targets) + loss_dict = dict(det_loss=det_losses, off_loss=off_losses) + if self.with_corner_emb: + loss_dict.update(pull_loss=pull_losses, push_loss=push_losses) + return loss_dict + + def loss_by_feat_single(self, tl_hmp: Tensor, br_hmp: Tensor, + tl_emb: Optional[Tensor], br_emb: Optional[Tensor], + tl_off: Tensor, br_off: Tensor, + targets: dict) -> Tuple[Tensor, ...]: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + tl_hmp (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_hmp (Tensor): Bottom-right corner heatmap for current level with + shape (N, num_classes, H, W). + tl_emb (Tensor, optional): Top-left corner embedding for current + level with shape (N, corner_emb_channels, H, W). + br_emb (Tensor, optional): Bottom-right corner embedding for + current level with shape (N, corner_emb_channels, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + targets (dict): Corner target generated by `get_targets`. + + Returns: + tuple[torch.Tensor]: Losses of the head's different branches + containing the following losses: + + - det_loss (Tensor): Corner keypoint loss. + - pull_loss (Tensor): Part one of AssociativeEmbedding loss. + - push_loss (Tensor): Part two of AssociativeEmbedding loss. + - off_loss (Tensor): Corner offset loss. + """ + gt_tl_hmp = targets['topleft_heatmap'] + gt_br_hmp = targets['bottomright_heatmap'] + gt_tl_off = targets['topleft_offset'] + gt_br_off = targets['bottomright_offset'] + gt_embedding = targets['corner_embedding'] + + # Detection loss + tl_det_loss = self.loss_heatmap( + tl_hmp.sigmoid(), + gt_tl_hmp, + avg_factor=max(1, + gt_tl_hmp.eq(1).sum())) + br_det_loss = self.loss_heatmap( + br_hmp.sigmoid(), + gt_br_hmp, + avg_factor=max(1, + gt_br_hmp.eq(1).sum())) + det_loss = (tl_det_loss + br_det_loss) / 2.0 + + # AssociativeEmbedding loss + if self.with_corner_emb and self.loss_embedding is not None: + pull_loss, push_loss = self.loss_embedding(tl_emb, br_emb, + gt_embedding) + else: + pull_loss, push_loss = None, None + + # Offset loss + # We only compute the offset loss at the real corner position. + # The value of real corner would be 1 in heatmap ground truth. + # The mask is computed in class agnostic mode and its shape is + # batch * 1 * width * height. 
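+        # `gen_gaussian_target` sets the annotated corner cell to exactly 1,
+        # so `eq(1)` recovers those cells; summing over the class dimension
+        # and thresholding with `gt(0)` yields the class-agnostic mask of
+        # shape (N, 1, H, W) that weights the offset loss.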
+ tl_off_mask = gt_tl_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_tl_hmp) + br_off_mask = gt_br_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_br_hmp) + tl_off_loss = self.loss_offset( + tl_off, + gt_tl_off, + tl_off_mask, + avg_factor=max(1, tl_off_mask.sum())) + br_off_loss = self.loss_offset( + br_off, + gt_br_off, + br_off_mask, + avg_factor=max(1, br_off_mask.sum())) + + off_loss = (tl_off_loss + br_off_loss) / 2.0 + + return det_loss, pull_loss, push_loss, off_loss + + def predict_by_feat(self, + tl_heats: List[Tensor], + br_heats: List[Tensor], + tl_embs: List[Tensor], + br_embs: List[Tensor], + tl_offs: List[Tensor], + br_offs: List[Tensor], + batch_img_metas: Optional[List[dict]] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_embs (list[Tensor]): Top-left corner embeddings for each level + with shape (N, corner_emb_channels, H, W). + br_embs (list[Tensor]): Bottom-right corner embeddings for each + level with shape (N, corner_emb_channels, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + batch_img_metas (list[dict], optional): Batch image meta info. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len( + batch_img_metas) + result_list = [] + for img_id in range(len(batch_img_metas)): + result_list.append( + self._predict_by_feat_single( + tl_heats[-1][img_id:img_id + 1, :], + br_heats[-1][img_id:img_id + 1, :], + tl_offs[-1][img_id:img_id + 1, :], + br_offs[-1][img_id:img_id + 1, :], + batch_img_metas[img_id], + tl_emb=tl_embs[-1][img_id:img_id + 1, :], + br_emb=br_embs[-1][img_id:img_id + 1, :], + rescale=rescale, + with_nms=with_nms)) + + return result_list + + def _predict_by_feat_single(self, + tl_heat: Tensor, + br_heat: Tensor, + tl_off: Tensor, + br_off: Tensor, + img_meta: dict, + tl_emb: Optional[Tensor] = None, + br_emb: Optional[Tensor] = None, + tl_centripetal_shift: Optional[Tensor] = None, + br_centripetal_shift: Optional[Tensor] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + tl_heat (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_heat (Tensor): Bottom-right corner heatmap for current level + with shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). 
+ br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + img_meta (dict): Meta information of current image, e.g., + image size, scaling factor, etc. + tl_emb (Tensor): Top-left corner embedding for current level with + shape (N, corner_emb_channels, H, W). + br_emb (Tensor): Bottom-right corner embedding for current level + with shape (N, corner_emb_channels, H, W). + tl_centripetal_shift: Top-left corner's centripetal shift for + current level with shape (N, 2, H, W). + br_centripetal_shift: Bottom-right corner's centripetal shift for + current level with shape (N, 2, H, W). + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + if isinstance(img_meta, (list, tuple)): + img_meta = img_meta[0] + + batch_bboxes, batch_scores, batch_clses = self._decode_heatmap( + tl_heat=tl_heat.sigmoid(), + br_heat=br_heat.sigmoid(), + tl_off=tl_off, + br_off=br_off, + tl_emb=tl_emb, + br_emb=br_emb, + tl_centripetal_shift=tl_centripetal_shift, + br_centripetal_shift=br_centripetal_shift, + img_meta=img_meta, + k=self.test_cfg.corner_topk, + kernel=self.test_cfg.local_maximum_kernel, + distance_threshold=self.test_cfg.distance_threshold) + + if rescale and 'scale_factor' in img_meta: + batch_bboxes /= batch_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + bboxes = batch_bboxes.view([-1, 4]) + scores = batch_scores.view(-1) + clses = batch_clses.view(-1) + + det_bboxes = torch.cat([bboxes, scores.unsqueeze(-1)], -1) + keepinds = (det_bboxes[:, -1] > -0.1) + det_bboxes = det_bboxes[keepinds] + det_labels = clses[keepinds] + + if with_nms: + det_bboxes, det_labels = self._bboxes_nms(det_bboxes, det_labels, + self.test_cfg) + + results = InstanceData() + results.bboxes = det_bboxes[..., :4] + results.scores = det_bboxes[..., 4] + results.labels = det_labels + return results + + def _bboxes_nms(self, bboxes: Tensor, labels: Tensor, + cfg: ConfigDict) -> Tuple[Tensor, Tensor]: + """bboxes nms.""" + if 'nms_cfg' in cfg: + warning.warn('nms_cfg in test_cfg will be deprecated. ' + 'Please rename it as nms') + if 'nms' not in cfg: + cfg.nms = cfg.nms_cfg + + if labels.numel() > 0: + max_num = cfg.max_per_img + bboxes, keep = batched_nms(bboxes[:, :4], bboxes[:, + -1].contiguous(), + labels, cfg.nms) + if max_num > 0: + bboxes = bboxes[:max_num] + labels = labels[keep][:max_num] + + return bboxes, labels + + def _decode_heatmap(self, + tl_heat: Tensor, + br_heat: Tensor, + tl_off: Tensor, + br_off: Tensor, + tl_emb: Optional[Tensor] = None, + br_emb: Optional[Tensor] = None, + tl_centripetal_shift: Optional[Tensor] = None, + br_centripetal_shift: Optional[Tensor] = None, + img_meta: Optional[dict] = None, + k: int = 100, + kernel: int = 3, + distance_threshold: float = 0.5, + num_dets: int = 1000) -> Tuple[Tensor, Tensor, Tensor]: + """Transform outputs into detections raw bbox prediction. + + Args: + tl_heat (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). 
+ br_heat (Tensor): Bottom-right corner heatmap for current level + with shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + tl_emb (Tensor, Optional): Top-left corner embedding for current + level with shape (N, corner_emb_channels, H, W). + br_emb (Tensor, Optional): Bottom-right corner embedding for + current level with shape (N, corner_emb_channels, H, W). + tl_centripetal_shift (Tensor, Optional): Top-left centripetal shift + for current level with shape (N, 2, H, W). + br_centripetal_shift (Tensor, Optional): Bottom-right centripetal + shift for current level with shape (N, 2, H, W). + img_meta (dict): Meta information of current image, e.g., + image size, scaling factor, etc. + k (int): Get top k corner keypoints from heatmap. + kernel (int): Max pooling kernel for extract local maximum pixels. + distance_threshold (float): Distance threshold. Top-left and + bottom-right corner keypoints with feature distance less than + the threshold will be regarded as keypoints from same object. + num_dets (int): Num of raw boxes before doing nms. + + Returns: + tuple[torch.Tensor]: Decoded output of CornerHead, containing the + following Tensors: + + - bboxes (Tensor): Coords of each box. + - scores (Tensor): Scores of each box. + - clses (Tensor): Categories of each box. + """ + with_embedding = tl_emb is not None and br_emb is not None + with_centripetal_shift = ( + tl_centripetal_shift is not None + and br_centripetal_shift is not None) + assert with_embedding + with_centripetal_shift == 1 + batch, _, height, width = tl_heat.size() + if torch.onnx.is_in_onnx_export(): + inp_h, inp_w = img_meta['pad_shape_for_onnx'][:2] + else: + inp_h, inp_w = img_meta['batch_input_shape'][:2] + + # perform nms on heatmaps + tl_heat = get_local_maximum(tl_heat, kernel=kernel) + br_heat = get_local_maximum(br_heat, kernel=kernel) + + tl_scores, tl_inds, tl_clses, tl_ys, tl_xs = get_topk_from_heatmap( + tl_heat, k=k) + br_scores, br_inds, br_clses, br_ys, br_xs = get_topk_from_heatmap( + br_heat, k=k) + + # We use repeat instead of expand here because expand is a + # shallow-copy function. Thus it could cause unexpected testing result + # sometimes. Using expand will decrease about 10% mAP during testing + # compared to repeat. 
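+        # Pair every top-left candidate with every bottom-right candidate:
+        # top-left coords are broadcast along dim 2 and bottom-right coords
+        # along dim 1, yielding k x k candidate boxes per image.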
+ tl_ys = tl_ys.view(batch, k, 1).repeat(1, 1, k) + tl_xs = tl_xs.view(batch, k, 1).repeat(1, 1, k) + br_ys = br_ys.view(batch, 1, k).repeat(1, k, 1) + br_xs = br_xs.view(batch, 1, k).repeat(1, k, 1) + + tl_off = transpose_and_gather_feat(tl_off, tl_inds) + tl_off = tl_off.view(batch, k, 1, 2) + br_off = transpose_and_gather_feat(br_off, br_inds) + br_off = br_off.view(batch, 1, k, 2) + + tl_xs = tl_xs + tl_off[..., 0] + tl_ys = tl_ys + tl_off[..., 1] + br_xs = br_xs + br_off[..., 0] + br_ys = br_ys + br_off[..., 1] + + if with_centripetal_shift: + tl_centripetal_shift = transpose_and_gather_feat( + tl_centripetal_shift, tl_inds).view(batch, k, 1, 2).exp() + br_centripetal_shift = transpose_and_gather_feat( + br_centripetal_shift, br_inds).view(batch, 1, k, 2).exp() + + tl_ctxs = tl_xs + tl_centripetal_shift[..., 0] + tl_ctys = tl_ys + tl_centripetal_shift[..., 1] + br_ctxs = br_xs - br_centripetal_shift[..., 0] + br_ctys = br_ys - br_centripetal_shift[..., 1] + + # all possible boxes based on top k corners (ignoring class) + tl_xs *= (inp_w / width) + tl_ys *= (inp_h / height) + br_xs *= (inp_w / width) + br_ys *= (inp_h / height) + + if with_centripetal_shift: + tl_ctxs *= (inp_w / width) + tl_ctys *= (inp_h / height) + br_ctxs *= (inp_w / width) + br_ctys *= (inp_h / height) + + x_off, y_off = 0, 0 # no crop + if not torch.onnx.is_in_onnx_export(): + # since `RandomCenterCropPad` is done on CPU with numpy and it's + # not dynamic traceable when exporting to ONNX, thus 'border' + # does not appears as key in 'img_meta'. As a tmp solution, + # we move this 'border' handle part to the postprocess after + # finished exporting to ONNX, which is handle in + # `mmdet/core/export/model_wrappers.py`. Though difference between + # pytorch and exported onnx model, it might be ignored since + # comparable performance is achieved between them (e.g. 
40.4 vs + # 40.6 on COCO val2017, for CornerNet without test-time flip) + if 'border' in img_meta: + x_off = img_meta['border'][2] + y_off = img_meta['border'][0] + + tl_xs -= x_off + tl_ys -= y_off + br_xs -= x_off + br_ys -= y_off + + zeros = tl_xs.new_zeros(*tl_xs.size()) + tl_xs = torch.where(tl_xs > 0.0, tl_xs, zeros) + tl_ys = torch.where(tl_ys > 0.0, tl_ys, zeros) + br_xs = torch.where(br_xs > 0.0, br_xs, zeros) + br_ys = torch.where(br_ys > 0.0, br_ys, zeros) + + bboxes = torch.stack((tl_xs, tl_ys, br_xs, br_ys), dim=3) + area_bboxes = ((br_xs - tl_xs) * (br_ys - tl_ys)).abs() + + if with_centripetal_shift: + tl_ctxs -= x_off + tl_ctys -= y_off + br_ctxs -= x_off + br_ctys -= y_off + + tl_ctxs *= tl_ctxs.gt(0.0).type_as(tl_ctxs) + tl_ctys *= tl_ctys.gt(0.0).type_as(tl_ctys) + br_ctxs *= br_ctxs.gt(0.0).type_as(br_ctxs) + br_ctys *= br_ctys.gt(0.0).type_as(br_ctys) + + ct_bboxes = torch.stack((tl_ctxs, tl_ctys, br_ctxs, br_ctys), + dim=3) + area_ct_bboxes = ((br_ctxs - tl_ctxs) * (br_ctys - tl_ctys)).abs() + + rcentral = torch.zeros_like(ct_bboxes) + # magic nums from paper section 4.1 + mu = torch.ones_like(area_bboxes) / 2.4 + mu[area_bboxes > 3500] = 1 / 2.1 # large bbox have smaller mu + + bboxes_center_x = (bboxes[..., 0] + bboxes[..., 2]) / 2 + bboxes_center_y = (bboxes[..., 1] + bboxes[..., 3]) / 2 + rcentral[..., 0] = bboxes_center_x - mu * (bboxes[..., 2] - + bboxes[..., 0]) / 2 + rcentral[..., 1] = bboxes_center_y - mu * (bboxes[..., 3] - + bboxes[..., 1]) / 2 + rcentral[..., 2] = bboxes_center_x + mu * (bboxes[..., 2] - + bboxes[..., 0]) / 2 + rcentral[..., 3] = bboxes_center_y + mu * (bboxes[..., 3] - + bboxes[..., 1]) / 2 + area_rcentral = ((rcentral[..., 2] - rcentral[..., 0]) * + (rcentral[..., 3] - rcentral[..., 1])).abs() + dists = area_ct_bboxes / area_rcentral + + tl_ctx_inds = (ct_bboxes[..., 0] <= rcentral[..., 0]) | ( + ct_bboxes[..., 0] >= rcentral[..., 2]) + tl_cty_inds = (ct_bboxes[..., 1] <= rcentral[..., 1]) | ( + ct_bboxes[..., 1] >= rcentral[..., 3]) + br_ctx_inds = (ct_bboxes[..., 2] <= rcentral[..., 0]) | ( + ct_bboxes[..., 2] >= rcentral[..., 2]) + br_cty_inds = (ct_bboxes[..., 3] <= rcentral[..., 1]) | ( + ct_bboxes[..., 3] >= rcentral[..., 3]) + + if with_embedding: + tl_emb = transpose_and_gather_feat(tl_emb, tl_inds) + tl_emb = tl_emb.view(batch, k, 1) + br_emb = transpose_and_gather_feat(br_emb, br_inds) + br_emb = br_emb.view(batch, 1, k) + dists = torch.abs(tl_emb - br_emb) + + tl_scores = tl_scores.view(batch, k, 1).repeat(1, 1, k) + br_scores = br_scores.view(batch, 1, k).repeat(1, k, 1) + + scores = (tl_scores + br_scores) / 2 # scores for all possible boxes + + # tl and br should have same class + tl_clses = tl_clses.view(batch, k, 1).repeat(1, 1, k) + br_clses = br_clses.view(batch, 1, k).repeat(1, k, 1) + cls_inds = (tl_clses != br_clses) + + # reject boxes based on distances + dist_inds = dists > distance_threshold + + # reject boxes based on widths and heights + width_inds = (br_xs <= tl_xs) + height_inds = (br_ys <= tl_ys) + + # No use `scores[cls_inds]`, instead we use `torch.where` here. + # Since only 1-D indices with type 'tensor(bool)' are supported + # when exporting to ONNX, any other bool indices with more dimensions + # (e.g. 
2-D bool tensor) as input parameter in node is invalid + negative_scores = -1 * torch.ones_like(scores) + scores = torch.where(cls_inds, negative_scores, scores) + scores = torch.where(width_inds, negative_scores, scores) + scores = torch.where(height_inds, negative_scores, scores) + scores = torch.where(dist_inds, negative_scores, scores) + + if with_centripetal_shift: + scores[tl_ctx_inds] = -1 + scores[tl_cty_inds] = -1 + scores[br_ctx_inds] = -1 + scores[br_cty_inds] = -1 + + scores = scores.view(batch, -1) + scores, inds = torch.topk(scores, num_dets) + scores = scores.unsqueeze(2) + + bboxes = bboxes.view(batch, -1, 4) + bboxes = gather_feat(bboxes, inds) + + clses = tl_clses.contiguous().view(batch, -1, 1) + clses = gather_feat(clses, inds) + + return bboxes, scores, clses diff --git a/mmdetection/mmdet/models/dense_heads/dab_detr_head.py b/mmdetection/mmdet/models/dense_heads/dab_detr_head.py new file mode 100644 index 0000000..892833f --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/dab_detr_head.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch.nn as nn +from mmcv.cnn import Linear +from mmengine.model import bias_init_with_prob, constant_init +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList +from ..layers import MLP, inverse_sigmoid +from .conditional_detr_head import ConditionalDETRHead + + +@MODELS.register_module() +class DABDETRHead(ConditionalDETRHead): + """Head of DAB-DETR. DAB-DETR: Dynamic Anchor Boxes are Better Queries for + DETR. + + More details can be found in the `paper + `_ . + """ + + def _init_layers(self) -> None: + """Initialize layers of the transformer head.""" + # cls branch + self.fc_cls = Linear(self.embed_dims, self.cls_out_channels) + # reg branch + self.fc_reg = MLP(self.embed_dims, self.embed_dims, 4, 3) + + def init_weights(self) -> None: + """initialize weights.""" + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + nn.init.constant_(self.fc_cls.bias, bias_init) + constant_init(self.fc_reg.layers[-1], 0., bias=0.) + + def forward(self, hidden_states: Tensor, + references: Tensor) -> Tuple[Tensor, Tensor]: + """"Forward function. + + Args: + hidden_states (Tensor): Features from transformer decoder. If + `return_intermediate_dec` is True output has shape + (num_decoder_layers, bs, num_queries, dim), else has shape (1, + bs, num_queries, dim) which only contains the last layer + outputs. + references (Tensor): References from transformer decoder. If + `return_intermediate_dec` is True output has shape + (num_decoder_layers, bs, num_queries, 2/4), else has shape (1, + bs, num_queries, 2/4) + which only contains the last layer reference. + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - layers_cls_scores (Tensor): Outputs from the classification head, + shape (num_decoder_layers, bs, num_queries, cls_out_channels). + Note cls_out_channels should include background. + - layers_bbox_preds (Tensor): Sigmoid outputs from the regression + head with normalized coordinate format (cx, cy, w, h), has shape + (num_decoder_layers, bs, num_queries, 4). + """ + layers_cls_scores = self.fc_cls(hidden_states) + references_before_sigmoid = inverse_sigmoid(references, eps=1e-3) + tmp_reg_preds = self.fc_reg(hidden_states) + tmp_reg_preds[..., :references_before_sigmoid. 
+ size(-1)] += references_before_sigmoid + layers_bbox_preds = tmp_reg_preds.sigmoid() + return layers_cls_scores, layers_bbox_preds + + def predict(self, + hidden_states: Tensor, + references: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. Over-write + because img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (Tensor): Feature from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, dim). + references (Tensor): references from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, 2/4). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + last_layer_hidden_state = hidden_states[-1].unsqueeze(0) + last_layer_reference = references[-1].unsqueeze(0) + outs = self(last_layer_hidden_state, last_layer_reference) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + return predictions diff --git a/mmdetection/mmdet/models/dense_heads/ddod_head.py b/mmdetection/mmdet/models/dense_heads/ddod_head.py new file mode 100644 index 0000000..64e91ff --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/ddod_head.py @@ -0,0 +1,794 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale +from mmengine.model import bias_init_with_prob, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import anchor_inside_flags +from ..utils import images_to_levels, multi_apply, unmap +from .anchor_head import AnchorHead + +EPS = 1e-12 + + +@MODELS.register_module() +class DDODHead(AnchorHead): + """Detection Head of `DDOD `_. + + DDOD head decomposes conjunctions lying in most current one-stage + detectors via label assignment disentanglement, spatial feature + disentanglement, and pyramid supervision disentanglement. + + Args: + num_classes (int): Number of categories excluding the + background category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): The number of stacked Conv. Defaults to 4. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + use_dcn (bool): Use dcn, Same as ATSS when False. Defaults to True. + norm_cfg (:obj:`ConfigDict` or dict): Normal config of ddod head. + Defaults to dict(type='GN', num_groups=32, requires_grad=True). + loss_iou (:obj:`ConfigDict` or dict): Config of IoU loss. Defaults to + dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0). 
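Editorial aside: a minimal, hypothetical config fragment for this head is sketched below. It uses only the arguments documented above; the concrete values (80 classes, 256 input channels) are illustrative and not taken from this repository's configs.

bbox_head = dict(
    type='DDODHead',
    num_classes=80,           # categories excluding background
    in_channels=256,          # e.g. channels of the neck outputs
    stacked_convs=4,
    use_dcn=True,             # False falls back to plain convs, as in ATSS
    norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
    loss_iou=dict(
        type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0))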
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + use_dcn: bool = True, + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + loss_iou: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + **kwargs) -> None: + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.use_dcn = use_dcn + super().__init__(num_classes, in_channels, **kwargs) + + if self.train_cfg: + self.cls_assigner = TASK_UTILS.build(self.train_cfg['assigner']) + self.reg_assigner = TASK_UTILS.build( + self.train_cfg['reg_assigner']) + self.loss_iou = MODELS.build(loss_iou) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=dict(type='DCN', deform_groups=1) + if i == 0 and self.use_dcn else self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=dict(type='DCN', deform_groups=1) + if i == 0 and self.use_dcn else self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.atss_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.atss_reg = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + self.atss_iou = nn.Conv2d( + self.feat_channels, self.num_base_priors * 1, 3, padding=1) + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + # we use the global list in loss + self.cls_num_pos_samples_per_level = [ + 0. for _ in range(len(self.prior_generator.strides)) + ] + self.reg_num_pos_samples_per_level = [ + 0. for _ in range(len(self.prior_generator.strides)) + ] + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + normal_init(self.atss_reg, std=0.01) + normal_init(self.atss_iou, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.atss_cls, std=0.01, bias=bias_cls) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores, bbox predictions, + and iou predictions. + + - cls_scores (list[Tensor]): Classification scores for all \ + scale levels, each is a 4D-tensor, the channels number is \ + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all \ + scale levels, each is a 4D-tensor, the channels number is \ + num_base_priors * 4. + - iou_preds (list[Tensor]): IoU scores for all scale levels, \ + each is a 4D-tensor, the channels number is num_base_priors * 1. + """ + return multi_apply(self.forward_single, x, self.scales) + + def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. 
+ + Returns: + tuple: + + - cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_base_priors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for a single \ + scale level, the channels number is num_base_priors * 4. + - iou_pred (Tensor): Iou for a single scale level, the \ + channel number is (N, num_base_priors * 1, H, W). + """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.atss_cls(cls_feat) + # we just follow atss, not apply exp in bbox_pred + bbox_pred = scale(self.atss_reg(reg_feat)).float() + iou_pred = self.atss_iou(reg_feat) + return cls_score, bbox_pred, iou_pred + + def loss_cls_by_feat_single(self, cls_score: Tensor, labels: Tensor, + label_weights: Tensor, + reweight_factor: List[float], + avg_factor: float) -> Tuple[Tensor]: + """Compute cls loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_base_priors * num_classes, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + reweight_factor (List[float]): Reweight factor for cls and reg + loss. + avg_factor (float): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + Tuple[Tensor]: A tuple of loss components. + """ + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + return reweight_factor * loss_cls, + + def loss_reg_by_feat_single(self, anchors: Tensor, bbox_pred: Tensor, + iou_pred: Tensor, labels, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, + reweight_factor: List[float], + avg_factor: float) -> Tuple[Tensor, Tensor]: + """Compute reg loss of a single scale level based on the features + extracted by the detection head. + + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_base_priors * 4, H, W). + iou_pred (Tensor): Iou for a single scale level, the + channel number is (N, num_base_priors * 1, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4) + reweight_factor (List[float]): Reweight factor for cls and reg + loss. + avg_factor (float): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + Returns: + Tuple[Tensor, Tensor]: A tuple of loss components. 
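Editorial aside: the method body below supervises the IoU branch with the aligned overlap between decoded positive predictions and their targets via `bbox_overlaps(..., is_aligned=True)`. The standalone sketch here (pure PyTorch, simplified stand-in for the mmdet helper, with invented boxes) shows what "aligned" IoU means.

import torch

def aligned_iou(boxes1, boxes2, eps=1e-6):
    # boxes in (x1, y1, x2, y2); the i-th box of boxes1 is compared only
    # with the i-th box of boxes2 (aligned, no pairwise matrix)
    lt = torch.max(boxes1[:, :2], boxes2[:, :2])
    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    return inter / (area1 + area2 - inter + eps)

pred = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.]])
target = torch.tensor([[0., 0., 10., 10.], [0., 0., 10., 10.]])
print(aligned_iou(pred, target))  # ~tensor([1.0000, 0.1429])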
+ """ + anchors = anchors.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + iou_pred = iou_pred.permute(0, 2, 3, 1).reshape(-1, ) + bbox_targets = bbox_targets.reshape(-1, 4) + bbox_weights = bbox_weights.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + iou_targets = label_weights.new_zeros(labels.shape) + iou_weights = label_weights.new_zeros(labels.shape) + iou_weights[(bbox_weights.sum(axis=1) > 0).nonzero( + as_tuple=False)] = 1. + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & + (labels < bg_class_ind)).nonzero(as_tuple=False).squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchors, pos_bbox_pred) + pos_decode_bbox_targets = self.bbox_coder.decode( + pos_anchors, pos_bbox_targets) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + avg_factor=avg_factor) + + iou_targets[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + loss_iou = self.loss_iou( + iou_pred, iou_targets, iou_weights, avg_factor=avg_factor) + else: + loss_bbox = bbox_pred.sum() * 0 + loss_iou = iou_pred.sum() * 0 + + return reweight_factor * loss_bbox, reweight_factor * loss_iou + + def calc_reweight_factor(self, labels_list: List[Tensor]) -> List[float]: + """Compute reweight_factor for regression and classification loss.""" + # get pos samples for each level + bg_class_ind = self.num_classes + for ii, each_level_label in enumerate(labels_list): + pos_inds = ((each_level_label >= 0) & + (each_level_label < bg_class_ind)).nonzero( + as_tuple=False).squeeze(1) + self.cls_num_pos_samples_per_level[ii] += len(pos_inds) + # get reweight factor from 1 ~ 2 with bilinear interpolation + min_pos_samples = min(self.cls_num_pos_samples_per_level) + max_pos_samples = max(self.cls_num_pos_samples_per_level) + interval = 1. / (max_pos_samples - min_pos_samples + 1e-10) + reweight_factor_per_level = [] + for pos_samples in self.cls_num_pos_samples_per_level: + factor = 2. - (pos_samples - min_pos_samples) * interval + reweight_factor_per_level.append(factor) + return reweight_factor_per_level + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + iou_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_base_priors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_base_priors * 4, H, W) + iou_preds (list[Tensor]): Score factor for all scale level, + each is a 4D-tensor, has shape (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + # calculate common vars for cls and reg assigners at once + targets_com = self.process_predictions_and_anchors( + anchor_list, valid_flag_list, cls_scores, bbox_preds, + batch_img_metas, batch_gt_instances_ignore) + (anchor_list, valid_flag_list, num_level_anchors_list, cls_score_list, + bbox_pred_list, batch_gt_instances_ignore) = targets_com + + # classification branch assigner + cls_targets = self.get_cls_targets( + anchor_list, + valid_flag_list, + num_level_anchors_list, + cls_score_list, + bbox_pred_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (cls_anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + avg_factor = max(avg_factor, 1.0) + + reweight_factor_per_level = self.calc_reweight_factor(labels_list) + + cls_losses_cls, = multi_apply( + self.loss_cls_by_feat_single, + cls_scores, + labels_list, + label_weights_list, + reweight_factor_per_level, + avg_factor=avg_factor) + + # regression branch assigner + reg_targets = self.get_reg_targets( + anchor_list, + valid_flag_list, + num_level_anchors_list, + cls_score_list, + bbox_pred_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (reg_anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = reg_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + avg_factor = max(avg_factor, 1.0) + + reweight_factor_per_level = self.calc_reweight_factor(labels_list) + + reg_losses_bbox, reg_losses_iou = multi_apply( + self.loss_reg_by_feat_single, + reg_anchor_list, + bbox_preds, + iou_preds, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + reweight_factor_per_level, + avg_factor=avg_factor) + + return dict( + loss_cls=cls_losses_cls, + loss_bbox=reg_losses_bbox, + loss_iou=reg_losses_iou) + + def process_predictions_and_anchors( + self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> tuple: + """Compute common vars for regression and classification targets. + + Args: + anchor_list (List[List[Tensor]]): anchors of each image. + valid_flag_list (List[List[Tensor]]): Valid flags of each image. + cls_scores (List[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Return: + tuple[Tensor]: A tuple of common loss vars. 
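Editorial aside: the body below flattens each per-level prediction map into per-image, per-anchor rows before concatenating across levels. The standalone sketch here (with invented sizes) isolates that permute/reshape step.

import torch

num_imgs, num_priors, num_classes, H, W = 2, 3, 4, 8, 8
cls_score = torch.randn(num_imgs, num_priors * num_classes, H, W)

# (N, priors * classes, H, W) -> (N, H * W, priors * classes)
flat = cls_score.permute(0, 2, 3, 1).reshape(
    num_imgs, -1, num_priors * num_classes)
print(flat.shape)  # torch.Size([2, 64, 12]) -> one row per spatial location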
+ """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + anchor_list_ = [] + valid_flag_list_ = [] + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list_.append(torch.cat(anchor_list[i])) + valid_flag_list_.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None for _ in range(num_imgs)] + + num_levels = len(cls_scores) + cls_score_list = [] + bbox_pred_list = [] + + mlvl_cls_score_list = [ + cls_score.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.num_base_priors * self.cls_out_channels) + for cls_score in cls_scores + ] + mlvl_bbox_pred_list = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_base_priors * 4) + for bbox_pred in bbox_preds + ] + + for i in range(num_imgs): + mlvl_cls_tensor_list = [ + mlvl_cls_score_list[j][i] for j in range(num_levels) + ] + mlvl_bbox_tensor_list = [ + mlvl_bbox_pred_list[j][i] for j in range(num_levels) + ] + cat_mlvl_cls_score = torch.cat(mlvl_cls_tensor_list, dim=0) + cat_mlvl_bbox_pred = torch.cat(mlvl_bbox_tensor_list, dim=0) + cls_score_list.append(cat_mlvl_cls_score) + bbox_pred_list.append(cat_mlvl_bbox_pred) + return (anchor_list_, valid_flag_list_, num_level_anchors_list, + cls_score_list, bbox_pred_list, batch_gt_instances_ignore) + + def get_cls_targets(self, + anchor_list: List[Tensor], + valid_flag_list: List[Tensor], + num_level_anchors_list: List[int], + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Get cls targets for DDOD head. + + This method is almost the same as `AnchorHead.get_targets()`. + Besides returning the targets as the parent method does, + it also returns the anchors as the first element of the + returned tuple. + + Args: + anchor_list (list[Tensor]): anchors of each image. + valid_flag_list (list[Tensor]): Valid flags of each image. + num_level_anchors_list (list[Tensor]): Number of anchors of each + scale level of all image. + cls_score_list (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + bbox_pred_list (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Return: + tuple[Tensor]: A tuple of cls targets components. 
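Editorial aside: the method body regroups the flat per-image targets by scale level with mmdet's `images_to_levels`. The snippet below is a simplified, pure-PyTorch stand-in (level sizes are made up) that shows the idea.

import torch

def to_levels(per_image_targets, num_level_anchors):
    # per_image_targets: list of (num_total_anchors,) tensors, one per image
    stacked = torch.stack(per_image_targets, 0)  # (num_imgs, num_total_anchors)
    level_targets, start = [], 0
    for n in num_level_anchors:
        level_targets.append(stacked[:, start:start + n])
        start += n
    return level_targets

labels_img0 = torch.zeros(6, dtype=torch.long)
labels_img1 = torch.ones(6, dtype=torch.long)
levels = to_levels([labels_img0, labels_img1], num_level_anchors=[4, 2])
print([lvl.shape for lvl in levels])  # [torch.Size([2, 4]), torch.Size([2, 2])]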
+ """ + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + cls_score_list, + bbox_pred_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs, + is_cls_assigner=True) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors_list[0]) + labels_list = images_to_levels(all_labels, num_level_anchors_list[0]) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors_list[0]) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors_list[0]) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors_list[0]) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor) + + def get_reg_targets(self, + anchor_list: List[Tensor], + valid_flag_list: List[Tensor], + num_level_anchors_list: List[int], + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Get reg targets for DDOD head. + + This method is almost the same as `AnchorHead.get_targets()` when + is_cls_assigner is False. Besides returning the targets as the parent + method does, it also returns the anchors as the first element of the + returned tuple. + + Args: + anchor_list (list[Tensor]): anchors of each image. + valid_flag_list (list[Tensor]): Valid flags of each image. + num_level_anchors_list (list[Tensor]): Number of anchors of each + scale level of all image. + cls_score_list (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + bbox_pred_list (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Return: + tuple[Tensor]: A tuple of reg targets components. + """ + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + cls_score_list, + bbox_pred_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs, + is_cls_assigner=False) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. 
+ # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors_list[0]) + labels_list = images_to_levels(all_labels, num_level_anchors_list[0]) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors_list[0]) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors_list[0]) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors_list[0]) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + cls_scores: Tensor, + bbox_preds: Tensor, + num_level_anchors: List[int], + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True, + is_cls_assigner: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, + which are concatenated into a single tensor of shape + (num_base_priors, 4). + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_base_priors,). + cls_scores (Tensor): Classification scores for all scale + levels of the image. + bbox_preds (Tensor): Box energies / deltas for all scale + levels of the image. + num_level_anchors (List[int]): Number of anchors of each + scale level. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + is_cls_assigner (bool): Classification or regression. + Defaults to True. + + Returns: + tuple: N is the number of total anchors in the image. + - anchors (Tensor): all anchors in the image with shape (N, 4). + - labels (Tensor): Labels of all anchors in the image with \ + shape (N, ). + - label_weights (Tensor): Label weights of all anchor in the \ + image with shape (N, ). + - bbox_targets (Tensor): BBox targets of all anchors in the \ + image with shape (N, 4). + - bbox_weights (Tensor): BBox weights of all anchors in the \ + image with shape (N, 4) + - pos_inds (Tensor): Indices of positive anchor with shape \ + (num_pos, ). + - neg_inds (Tensor): Indices of negative anchor with shape \ + (num_neg, ). + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. 
Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + bbox_preds_valid = bbox_preds[inside_flags, :] + cls_scores_valid = cls_scores[inside_flags, :] + + assigner = self.cls_assigner if is_cls_assigner else self.reg_assigner + + # decode prediction out of assigner + bbox_preds_valid = self.bbox_coder.decode(anchors, bbox_preds_valid) + pred_instances = InstanceData( + priors=anchors, bboxes=bbox_preds_valid, scores=cls_scores_valid) + + assign_result = assigner.assign( + pred_instances=pred_instances, + num_level_priors=num_level_anchors_inside, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds, sampling_result) + + def get_num_level_anchors_inside(self, num_level_anchors: List[int], + inside_flags: Tensor) -> List[int]: + """Get the anchors of each scale level inside. + + Args: + num_level_anchors (list[int]): Number of anchors of each + scale level. + inside_flags (Tensor): Multi level inside flags of the image, + which are concatenated into a single tensor of + shape (num_base_priors,). + + Returns: + list[int]: Number of anchors of each scale level inside. + """ + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside diff --git a/mmdetection/mmdet/models/dense_heads/ddq_detr_head.py b/mmdetection/mmdet/models/dense_heads/ddq_detr_head.py new file mode 100644 index 0000000..0580653 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/ddq_detr_head.py @@ -0,0 +1,550 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +from typing import Dict, List, Tuple + +import torch +from mmengine.model import bias_init_with_prob, constant_init +from torch import Tensor, nn + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmdet.utils import InstanceList, OptInstanceList, reduce_mean +from ..layers import inverse_sigmoid +from ..losses import DDQAuxLoss +from ..utils import multi_apply +from .dino_head import DINOHead + + +@MODELS.register_module() +class DDQDETRHead(DINOHead): + r"""Head of DDQDETR: Dense Distinct Query for + End-to-End Object Detection. + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . + + Args: + aux_num_pos (int): Number of positive targets assigned to a + perdicted object. Defaults to 4. + """ + + def __init__(self, *args, aux_num_pos=4, **kwargs): + super(DDQDETRHead, self).__init__(*args, **kwargs) + self.aux_loss_for_dense = DDQAuxLoss( + train_cfg=dict( + assigner=dict(type='TopkHungarianAssigner', topk=aux_num_pos), + alpha=1, + beta=6)) + + def _init_layers(self) -> None: + """Initialize classification branch and regression branch of aux head + for dense queries.""" + super(DDQDETRHead, self)._init_layers() + # If decoder `num_layers` = 6 and `as_two_stage` = True, then: + # 1) 6 main heads are required for + # each decoder output of distinct queries. + # 2) 1 main head is required for `output_memory` of distinct queries. + # 3) 1 aux head is required for `output_memory` of dense queries, + # which is done by code below this comment. + # So 8 heads are required in sum. + # aux head for dense queries on encoder feature map + self.cls_branches.append(copy.deepcopy(self.cls_branches[-1])) + self.reg_branches.append(copy.deepcopy(self.reg_branches[-1])) + + # If decoder `num_layers` = 6 and `as_two_stage` = True, then: + # 6 aux heads are required for each decoder output of dense queries. + # So 8 + 6 = 14 heads and heads are requires in sum. + # self.num_pred_layer is 7 + # aux head for dense queries in decoder + self.aux_cls_branches = nn.ModuleList([ + copy.deepcopy(self.cls_branches[-1]) + for _ in range(self.num_pred_layer - 1) + ]) + self.aux_reg_branches = nn.ModuleList([ + copy.deepcopy(self.reg_branches[-1]) + for _ in range(self.num_pred_layer - 1) + ]) + + def init_weights(self) -> None: + """Initialize weights of the Deformable DETR head.""" + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m.bias, bias_init) + for m in self.aux_cls_branches: + nn.init.constant_(m.bias, bias_init) + for m in self.reg_branches: + constant_init(m[-1], 0, bias=0) + for m in self.reg_branches: + nn.init.constant_(m[-1].bias.data[2:], 0.0) + + for m in self.aux_reg_branches: + constant_init(m[-1], 0, bias=0) + + for m in self.aux_reg_branches: + nn.init.constant_(m[-1].bias.data[2:], 0.0) + + def forward(self, hidden_states: Tensor, + references: List[Tensor]) -> Tuple[Tensor]: + """Forward function. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries_total, + dim), where `num_queries_total` is the sum of + `num_denoising_queries`, `num_queries` and `num_dense_queries` + when `self.training` is `True`, else `num_queries`. + references (list[Tensor]): List of the reference from the decoder. 
+ The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). Each reference has shape (bs, + num_queries_total, 4) with the last dimension arranged as + (cx, cy, w, h). + + Returns: + tuple[Tensor]: results of head containing the following tensors. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries_total, 4) + with the last dimension arranged as (cx, cy, w, h). + """ + all_layers_outputs_classes = [] + all_layers_outputs_coords = [] + if self.training: + num_dense = self.cache_dict['num_dense_queries'] + for layer_id in range(hidden_states.shape[0]): + reference = inverse_sigmoid(references[layer_id]) + hidden_state = hidden_states[layer_id] + if self.training: + dense_hidden_state = hidden_state[:, -num_dense:] + hidden_state = hidden_state[:, :-num_dense] + + outputs_class = self.cls_branches[layer_id](hidden_state) + tmp_reg_preds = self.reg_branches[layer_id](hidden_state) + if self.training: + dense_outputs_class = self.aux_cls_branches[layer_id]( + dense_hidden_state) + dense_tmp_reg_preds = self.aux_reg_branches[layer_id]( + dense_hidden_state) + outputs_class = torch.cat([outputs_class, dense_outputs_class], + dim=1) + tmp_reg_preds = torch.cat([tmp_reg_preds, dense_tmp_reg_preds], + dim=1) + + if reference.shape[-1] == 4: + tmp_reg_preds += reference + else: + assert reference.shape[-1] == 2 + tmp_reg_preds[..., :2] += reference + outputs_coord = tmp_reg_preds.sigmoid() + all_layers_outputs_classes.append(outputs_class) + all_layers_outputs_coords.append(outputs_coord) + + all_layers_outputs_classes = torch.stack(all_layers_outputs_classes) + all_layers_outputs_coords = torch.stack(all_layers_outputs_coords) + + return all_layers_outputs_classes, all_layers_outputs_coords + + def loss(self, + hidden_states: Tensor, + references: List[Tensor], + enc_outputs_class: Tensor, + enc_outputs_coord: Tensor, + batch_data_samples: SampleList, + dn_meta: Dict[str, int], + aux_enc_outputs_class=None, + aux_enc_outputs_coord=None) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries_total, + dim), where `num_queries_total` is the sum of + `num_denoising_queries`, `num_queries` and `num_dense_queries` + when `self.training` is `True`, else `num_queries`. + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). Each reference has shape (bs, + num_queries_total, 4) with the last dimension arranged as + (cx, cy, w, h). + enc_outputs_class (Tensor): The top k classification score of + each point on encoder feature map, has shape (bs, num_queries, + cls_out_channels). + enc_outputs_coord (Tensor): The proposal generated from points + with top k score, has shape (bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. 
It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + aux_enc_outputs_class (Tensor): The `dense_topk` classification + score of each point on encoder feature map, has shape (bs, + num_dense_queries, cls_out_channels). + It is `None` when `self.training` is `False`. + aux_enc_outputs_coord (Tensor): The proposal generated from points + with `dense_topk` score, has shape (bs, num_dense_queries, 4) + with the last dimension arranged as (cx, cy, w, h). + It is `None` when `self.training` is `False`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, + batch_gt_instances, batch_img_metas, dn_meta) + losses = self.loss_by_feat(*loss_inputs) + + aux_enc_outputs_coord = bbox_cxcywh_to_xyxy(aux_enc_outputs_coord) + aux_enc_outputs_coord_list = [] + for img_id in range(len(aux_enc_outputs_coord)): + det_bboxes = aux_enc_outputs_coord[img_id] + img_shape = batch_img_metas[img_id]['img_shape'] + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + aux_enc_outputs_coord_list.append(det_bboxes) + aux_enc_outputs_coord = torch.stack(aux_enc_outputs_coord_list) + aux_loss = self.aux_loss_for_dense.loss( + aux_enc_outputs_class.sigmoid(), aux_enc_outputs_coord, + [item.bboxes for item in batch_gt_instances], + [item.labels for item in batch_gt_instances], batch_img_metas) + for k, v in aux_loss.items(): + losses[f'aux_enc_{k}'] = v + + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels). + all_layers_bbox_preds (Tensor): Bbox coordinates of all decoder + layers. Each has shape (num_decoder_layers, bs, + num_queries_total, 4) with normalized coordinate format + (cx, cy, w, h). + enc_cls_scores (Tensor): The top k score of each point on + encoder feature map, has shape (bs, num_queries, + cls_out_channels). + enc_bbox_preds (Tensor): The proposal generated from points + with top k score, has shape (bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \ + self.split_outputs( + all_layers_cls_scores, all_layers_bbox_preds, dn_meta) + + num_dense_queries = dn_meta['num_dense_queries'] + num_layer = all_layers_matching_bbox_preds.size(0) + dense_all_layers_matching_cls_scores = all_layers_matching_cls_scores[:, :, # noqa: E501 + -num_dense_queries:] # noqa: E501 + dense_all_layers_matching_bbox_preds = all_layers_matching_bbox_preds[:, :, # noqa: E501 + -num_dense_queries:] # noqa: E501 + + all_layers_matching_cls_scores = all_layers_matching_cls_scores[:, :, : # noqa: E501 + -num_dense_queries] # noqa: E501 + all_layers_matching_bbox_preds = all_layers_matching_bbox_preds[:, :, : # noqa: E501 + -num_dense_queries] # noqa: E501 + + loss_dict = self.loss_for_distinct_queries( + all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + batch_gt_instances, batch_img_metas, batch_gt_instances_ignore) + + if enc_cls_scores is not None: + + enc_loss_cls, enc_losses_bbox, enc_losses_iou = \ + self.loss_by_feat_single( + enc_cls_scores, enc_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + loss_dict['enc_loss_iou'] = enc_losses_iou + + if all_layers_denoising_cls_scores is not None: + dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn( + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta) + loss_dict['dn_loss_cls'] = dn_losses_cls[-1] + loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1] + loss_dict['dn_loss_iou'] = dn_losses_iou[-1] + for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \ + enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1], + dn_losses_iou[:-1])): + loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i + + for l_id in range(num_layer): + cls_scores = dense_all_layers_matching_cls_scores[l_id].sigmoid() + bbox_preds = dense_all_layers_matching_bbox_preds[l_id] + + bbox_preds = bbox_cxcywh_to_xyxy(bbox_preds) + bbox_preds_list = [] + for img_id in range(len(bbox_preds)): + det_bboxes = bbox_preds[img_id] + img_shape = batch_img_metas[img_id]['img_shape'] + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + bbox_preds_list.append(det_bboxes) + bbox_preds = torch.stack(bbox_preds_list) + aux_loss = self.aux_loss_for_dense.loss( + cls_scores, bbox_preds, + [item.bboxes for item in batch_gt_instances], + [item.labels for item in batch_gt_instances], batch_img_metas) + for k, v in aux_loss.items(): + loss_dict[f'{l_id}_aux_{k}'] = v + + return loss_dict + + def loss_for_distinct_queries( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss of distinct queries, that is, excluding denoising + and dense 
queries. Only select the distinct queries in decoder for + loss. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + all_layers_bbox_preds (Tensor): Bbox coordinates of all decoder + layers. It has shape (num_decoder_layers, bs, + num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert batch_gt_instances_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + 'for batch_gt_instances_ignore setting to None.' + + losses_cls, losses_bbox, losses_iou = multi_apply( + self._loss_for_distinct_queries_single, + all_layers_cls_scores, + all_layers_bbox_preds, + [i for i in range(len(all_layers_bbox_preds))], + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_iou'] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in \ + zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def _loss_for_distinct_queries_single(self, cls_scores, bbox_preds, l_id, + batch_gt_instances, batch_img_metas): + """Calculate the loss for outputs from a single decoder layer of + distinct queries, that is, excluding denoising and dense queries. Only + select the distinct queries in decoder for loss. + + Args: + cls_scores (Tensor): Classification scores of a single + decoder layer, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Bbox coordinates of a single decoder + layer. It has shape (bs, num_queries, 4) with the last + dimension arranged as (cx, cy, w, h). + l_id (int): Decoder layer index for these outputs. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
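Editorial aside: the first step of the method body keeps only the queries flagged as distinct through the cached boolean mask (`self.cache_dict['distinct_query_mask']`). The toy example below (invented shapes and mask) illustrates that per-image boolean selection.

import torch

num_queries, num_classes = 5, 3
cls_scores = torch.randn(num_queries, num_classes)
bbox_preds = torch.rand(num_queries, 4)
distinct_mask = torch.tensor([True, False, True, True, False])

kept_scores = cls_scores[distinct_mask]  # (3, num_classes)
kept_boxes = bbox_preds[distinct_mask]   # (3, 4)
print(kept_scores.shape, kept_boxes.shape)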
+ """ + num_imgs = cls_scores.size(0) + if 0 < l_id: + batch_mask = [ + self.cache_dict['distinct_query_mask'][l_id - 1][ + img_id * self.cache_dict['num_heads']][0] + for img_id in range(num_imgs) + ] + else: + batch_mask = [ + torch.ones(len(cls_scores[i]), + device=cls_scores.device).bool() + for i in range(num_imgs) + ] + # only select the distinct queries in decoder for loss + cls_scores_list = [ + cls_scores[i][batch_mask[i]] for i in range(num_imgs) + ] + bbox_preds_list = [ + bbox_preds[i][batch_mask[i]] for i in range(num_imgs) + ] + cls_scores = torch.cat(cls_scores_list) + + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + batch_gt_instances, batch_img_metas) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds_list): + img_h, img_w, = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = torch.cat(bbox_preds_list) + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def predict_by_feat(self, + layer_cls_scores: Tensor, + layer_bbox_preds: Tensor, + batch_img_metas: List[dict], + rescale: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + layer_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + layer_bbox_preds (Tensor): Bbox coordinates of all decoder layers. + Each has shape (num_decoder_layers, bs, num_queries, 4) + with normalized coordinate format (cx, cy, w, h). + batch_img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If `True`, return boxes in original + image space. Default `False`. 
+ + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + cls_scores = layer_cls_scores[-1] + bbox_preds = layer_bbox_preds[-1] + + num_imgs = cls_scores.size(0) + # -1 is last layer input query mask + + batch_mask = [ + self.cache_dict['distinct_query_mask'][-1][ + img_id * self.cache_dict['num_heads']][0] + for img_id in range(num_imgs) + ] + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id][batch_mask[img_id]] + bbox_pred = bbox_preds[img_id][batch_mask[img_id]] + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + img_meta, rescale) + result_list.append(results) + return result_list diff --git a/mmdetection/mmdet/models/dense_heads/deformable_detr_head.py b/mmdetection/mmdet/models/dense_heads/deformable_detr_head.py new file mode 100644 index 0000000..adedd4a --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/deformable_detr_head.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import Linear +from mmengine.model import bias_init_with_prob, constant_init +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList, OptInstanceList +from ..layers import inverse_sigmoid +from .detr_head import DETRHead + + +@MODELS.register_module() +class DeformableDETRHead(DETRHead): + r"""Head of DeformDETR: Deformable DETR: Deformable Transformers for + End-to-End Object Detection. + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . + + Args: + share_pred_layer (bool): Whether to share parameters for all the + prediction layers. Defaults to `False`. + num_pred_layer (int): The number of the prediction layers. + Defaults to 6. + as_two_stage (bool, optional): Whether to generate the proposal + from the outputs of encoder. Defaults to `False`. 
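A small sketch of the per-image query selection used in the head above (not from the patch; shapes and the mask are made up, whereas the real mask comes from cache_dict['distinct_query_mask']): a boolean mask keeps only the distinct queries of each image.

    # Illustrative sketch: keep only a subset of queries via a boolean mask,
    # as done per image with the cached distinct-query mask above.
    import torch

    num_queries, num_classes = 6, 3
    cls_scores = torch.randn(num_queries, num_classes)
    bbox_preds = torch.rand(num_queries, 4)

    # True = keep this query (in the head above this comes from the attention cache)
    mask = torch.tensor([True, False, True, True, False, True])

    kept_scores = cls_scores[mask]   # (4, 3)
    kept_boxes = bbox_preds[mask]    # (4, 4)
    print(kept_scores.shape, kept_boxes.shape)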
+ """ + + def __init__(self, + *args, + share_pred_layer: bool = False, + num_pred_layer: int = 6, + as_two_stage: bool = False, + **kwargs) -> None: + self.share_pred_layer = share_pred_layer + self.num_pred_layer = num_pred_layer + self.as_two_stage = as_two_stage + + super().__init__(*args, **kwargs) + + def _init_layers(self) -> None: + """Initialize classification branch and regression branch of head.""" + fc_cls = Linear(self.embed_dims, self.cls_out_channels) + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, 4)) + reg_branch = nn.Sequential(*reg_branch) + + if self.share_pred_layer: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(self.num_pred_layer)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(self.num_pred_layer)]) + else: + self.cls_branches = nn.ModuleList( + [copy.deepcopy(fc_cls) for _ in range(self.num_pred_layer)]) + self.reg_branches = nn.ModuleList([ + copy.deepcopy(reg_branch) for _ in range(self.num_pred_layer) + ]) + + def init_weights(self) -> None: + """Initialize weights of the Deformable DETR head.""" + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias, bias_init) + for m in self.reg_branches: + constant_init(m[-1], 0, bias=0) + nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0) + if self.as_two_stage: + for m in self.reg_branches: + nn.init.constant_(m[-1].bias.data[2:], 0.0) + + def forward(self, hidden_states: Tensor, + references: List[Tensor]) -> Tuple[Tensor, Tensor]: + """Forward function. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries, dim). + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + """ + all_layers_outputs_classes = [] + all_layers_outputs_coords = [] + + for layer_id in range(hidden_states.shape[0]): + reference = inverse_sigmoid(references[layer_id]) + # NOTE The last reference will not be used. + hidden_state = hidden_states[layer_id] + outputs_class = self.cls_branches[layer_id](hidden_state) + tmp_reg_preds = self.reg_branches[layer_id](hidden_state) + if reference.shape[-1] == 4: + # When `layer` is 0 and `as_two_stage` of the detector + # is `True`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `True`. 
+ tmp_reg_preds += reference + else: + # When `layer` is 0 and `as_two_stage` of the detector + # is `False`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `False`. + assert reference.shape[-1] == 2 + tmp_reg_preds[..., :2] += reference + outputs_coord = tmp_reg_preds.sigmoid() + all_layers_outputs_classes.append(outputs_class) + all_layers_outputs_coords.append(outputs_coord) + + all_layers_outputs_classes = torch.stack(all_layers_outputs_classes) + all_layers_outputs_coords = torch.stack(all_layers_outputs_coords) + + return all_layers_outputs_classes, all_layers_outputs_coords + + def loss(self, hidden_states: Tensor, references: List[Tensor], + enc_outputs_class: Tensor, enc_outputs_coord: Tensor, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, num_queries, bs, dim). + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + enc_outputs_class (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + Only when `as_two_stage` is `True` it would be passed in, + otherwise it would be `None`. + enc_outputs_coord (Tensor): The proposal generate from the encode + feature map, has shape (bs, num_feat_points, 4) with the last + dimension arranged as (cx, cy, w, h). Only when `as_two_stage` + is `True` it would be passed in, otherwise it would be `None`. + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, + batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, num_queries, + cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). 
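A compact sketch of the reference-point refinement performed in the forward loop above (not from the patch; the local inverse_sigmoid is a stand-in for the imported layer utility): the regression output is added to the reference in inverse-sigmoid space, refining the whole box for 4-d references and only the centre for 2-d ones, before a final sigmoid.

    # Illustrative sketch of the branch above: add regression deltas to the
    # reference points in logit space, then map back to normalized coordinates.
    import torch
    from torch import Tensor


    def inverse_sigmoid(x: Tensor, eps: float = 1e-5) -> Tensor:
        x = x.clamp(min=eps, max=1 - eps)
        return torch.log(x / (1 - x))


    def refine(reg_out: Tensor, reference: Tensor) -> Tensor:
        ref = inverse_sigmoid(reference)
        if ref.shape[-1] == 4:            # box refinement / two-stage proposals
            out = reg_out + ref
        else:                             # only (cx, cy) reference points
            assert ref.shape[-1] == 2
            out = reg_out.clone()
            out[..., :2] = out[..., :2] + ref
        return out.sigmoid()              # normalized (cx, cy, w, h)


    print(refine(torch.zeros(1, 4), torch.full((1, 2), 0.5)))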
+ enc_cls_scores (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + Only when `as_two_stage` is `True` it would be passes in, + otherwise, it would be `None`. + enc_bbox_preds (Tensor): The proposal generate from the encode + feature map, has shape (bs, num_feat_points, 4) with the last + dimension arranged as (cx, cy, w, h). Only when `as_two_stage` + is `True` it would be passed in, otherwise it would be `None`. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + loss_dict = super().loss_by_feat(all_layers_cls_scores, + all_layers_bbox_preds, + batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + proposal_gt_instances = copy.deepcopy(batch_gt_instances) + for i in range(len(proposal_gt_instances)): + proposal_gt_instances[i].labels = torch.zeros_like( + proposal_gt_instances[i].labels) + enc_loss_cls, enc_losses_bbox, enc_losses_iou = \ + self.loss_by_feat_single( + enc_cls_scores, enc_bbox_preds, + batch_gt_instances=proposal_gt_instances, + batch_img_metas=batch_img_metas) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + loss_dict['enc_loss_iou'] = enc_losses_iou + return loss_dict + + def predict(self, + hidden_states: Tensor, + references: List[Tensor], + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, num_queries, bs, dim). + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): If `True`, return boxes in original + image space. Defaults to `True`. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. 
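A tiny sketch of the label rewrite used for the encoder-proposal loss above (not from the patch; a plain dict stands in for InstanceData): the ground truth is deep-copied and every label is set to class 0, so the encoder branch is trained on class-agnostic objectness while the box targets stay unchanged.

    # Illustrative sketch: class-agnostic targets for the encoder proposal loss.
    import copy
    import torch

    gt = {'labels': torch.tensor([3, 17, 42]),
          'bboxes': torch.rand(3, 4)}

    proposal_gt = copy.deepcopy(gt)
    proposal_gt['labels'] = torch.zeros_like(proposal_gt['labels'])
    print(gt['labels'], proposal_gt['labels'])  # tensor([ 3, 17, 42]) tensor([0, 0, 0])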
+ """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + outs = self(hidden_states, references) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + return predictions + + def predict_by_feat(self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + batch_img_metas: List[Dict], + rescale: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, num_queries, + cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries, + 4) with the last dimension arranged as (cx, cy, w, h). + batch_img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If `True`, return boxes in original + image space. Default `False`. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + cls_scores = all_layers_cls_scores[-1] + bbox_preds = all_layers_bbox_preds[-1] + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + img_meta, rescale) + result_list.append(results) + return result_list diff --git a/mmdetection/mmdet/models/dense_heads/dense_test_mixins.py b/mmdetection/mmdet/models/dense_heads/dense_test_mixins.py new file mode 100644 index 0000000..a7526d4 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/dense_test_mixins.py @@ -0,0 +1,215 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import sys +import warnings +from inspect import signature + +import torch +from mmcv.ops import batched_nms +from mmengine.structures import InstanceData + +from mmdet.structures.bbox import bbox_mapping_back +from ..test_time_augs import merge_aug_proposals + +if sys.version_info >= (3, 7): + from mmdet.utils.contextmanagers import completed + + +class BBoxTestMixin(object): + """Mixin class for testing det bboxes via DenseHead.""" + + def simple_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes without test-time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each + image after the post process. \ + Each item usually contains following keys. \ + + - scores (Tensor): Classification scores, has a shape + (num_instance,) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances,). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + warnings.warn('You are calling `simple_test_bboxes` in ' + '`dense_test_mixins`, but the `dense_test_mixins`' + 'will be deprecated soon. 
Please use ' + '`simple_test` instead.') + outs = self.forward(feats) + results_list = self.get_results( + *outs, img_metas=img_metas, rescale=rescale) + return results_list + + def aug_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes with test time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n,). The length of list should always be 1. + """ + + warnings.warn('You are calling `aug_test_bboxes` in ' + '`dense_test_mixins`, but the `dense_test_mixins`' + 'will be deprecated soon. Please use ' + '`aug_test` instead.') + # check with_nms argument + gb_sig = signature(self.get_results) + gb_args = [p.name for p in gb_sig.parameters.values()] + gbs_sig = signature(self._get_results_single) + gbs_args = [p.name for p in gbs_sig.parameters.values()] + assert ('with_nms' in gb_args) and ('with_nms' in gbs_args), \ + f'{self.__class__.__name__}' \ + ' does not support test-time augmentation' + + aug_bboxes = [] + aug_scores = [] + aug_labels = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + outs = self.forward(x) + bbox_outputs = self.get_results( + *outs, + img_metas=img_meta, + cfg=self.test_cfg, + rescale=False, + with_nms=False)[0] + aug_bboxes.append(bbox_outputs.bboxes) + aug_scores.append(bbox_outputs.scores) + if len(bbox_outputs) >= 3: + aug_labels.append(bbox_outputs.labels) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = self.merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas) + merged_labels = torch.cat(aug_labels, dim=0) if aug_labels else None + + if merged_bboxes.numel() == 0: + det_bboxes = torch.cat([merged_bboxes, merged_scores[:, None]], -1) + return [ + (det_bboxes, merged_labels), + ] + + det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores, + merged_labels, self.test_cfg.nms) + det_bboxes = det_bboxes[:self.test_cfg.max_per_img] + det_labels = merged_labels[keep_idxs][:self.test_cfg.max_per_img] + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= det_bboxes.new_tensor( + img_metas[0][0]['scale_factor']) + + results = InstanceData() + results.bboxes = _det_bboxes[:, :4] + results.scores = _det_bboxes[:, 4] + results.labels = det_labels + return [results] + + def aug_test_rpn(self, feats, img_metas): + """Test with augmentation for only for ``RPNHead`` and its variants, + e.g., ``GARPNHead``, etc. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + + Returns: + list[Tensor]: Proposals of each image, each item has shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). 
+ """ + samples_per_gpu = len(img_metas[0]) + aug_proposals = [[] for _ in range(samples_per_gpu)] + for x, img_meta in zip(feats, img_metas): + results_list = self.simple_test_rpn(x, img_meta) + for i, results in enumerate(results_list): + proposals = torch.cat( + [results.bboxes, results.scores[:, None]], dim=-1) + aug_proposals[i].append(proposals) + # reorganize the order of 'img_metas' to match the dimensions + # of 'aug_proposals' + aug_img_metas = [] + for i in range(samples_per_gpu): + aug_img_meta = [] + for j in range(len(img_metas)): + aug_img_meta.append(img_metas[j][i]) + aug_img_metas.append(aug_img_meta) + # after merging, proposals will be rescaled to the original image size + + merged_proposals = [] + for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas): + merged_proposal = merge_aug_proposals(proposals, aug_img_meta, + self.test_cfg) + results = InstanceData() + results.bboxes = merged_proposal[:, :4] + results.scores = merged_proposal[:, 4] + merged_proposals.append(results) + return merged_proposals + + if sys.version_info >= (3, 7): + + async def async_simple_test_rpn(self, x, img_metas): + sleep_interval = self.test_cfg.pop('async_sleep_interval', 0.025) + async with completed( + __name__, 'rpn_head_forward', + sleep_interval=sleep_interval): + rpn_outs = self(x) + + proposal_list = self.get_results(*rpn_outs, img_metas=img_metas) + return proposal_list + + def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + + Returns: + tuple[Tensor]: ``bboxes`` with shape (n,4), where + 4 represent (tl_x, tl_y, br_x, br_y) + and ``scores`` with shape (n,). + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, + flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.cat(recovered_bboxes, dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.cat(aug_scores, dim=0) + return bboxes, scores diff --git a/mmdetection/mmdet/models/dense_heads/detr_head.py b/mmdetection/mmdet/models/dense_heads/detr_head.py new file mode 100644 index 0000000..9daeb47 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/detr_head.py @@ -0,0 +1,634 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Linear +from mmcv.cnn.bricks.transformer import FFN +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcywh) +from mmdet.utils import (ConfigType, InstanceList, OptInstanceList, + OptMultiConfig, reduce_mean) +from ..losses import QualityFocalLoss +from ..utils import multi_apply + + +@MODELS.register_module() +class DETRHead(BaseModule): + r"""Head of DETR. DETR:End-to-End Object Detection with Transformers. + + More details can be found in the `paper + `_ . + + Args: + num_classes (int): Number of categories excluding the background. 
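A minimal sketch of what the bbox_mapping_back call in merge_aug_bboxes does for a horizontally flipped, resized augmentation (not from the patch; the real helper also handles vertical and diagonal flips): unflip in the augmented frame, then divide by the scale factor so all augmentations share the original-image coordinates.

    # Illustrative sketch: map boxes from an augmented view back to the
    # original image frame (horizontal flip + resize only).
    import torch
    from torch import Tensor


    def map_back_hflip(bboxes: Tensor, img_shape, scale_factor,
                       flipped: bool) -> Tensor:
        boxes = bboxes.clone()
        if flipped:
            img_w = img_shape[1]
            boxes[:, 0], boxes[:, 2] = img_w - bboxes[:, 2], img_w - bboxes[:, 0]
        return boxes / boxes.new_tensor(scale_factor)


    b = torch.tensor([[10., 20., 110., 220.]])
    print(map_back_hflip(b, img_shape=(800, 1333),
                         scale_factor=(2., 2., 2., 2.), flipped=True))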
+ embed_dims (int): The dims of Transformer embedding. + num_reg_fcs (int): Number of fully-connected layers used in `FFN`, + which is then used for the regression head. Defaults to 2. + sync_cls_avg_factor (bool): Whether to sync the `avg_factor` of + all ranks. Default to `False`. + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to `CrossEntropyLoss`. + loss_bbox (:obj:`ConfigDict` or dict): Config of the regression bbox + loss. Defaults to `L1Loss`. + loss_iou (:obj:`ConfigDict` or dict): Config of the regression iou + loss. Defaults to `GIoULoss`. + train_cfg (:obj:`ConfigDict` or dict): Training config of transformer + head. + test_cfg (:obj:`ConfigDict` or dict): Testing config of transformer + head. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + _version = 2 + + def __init__( + self, + num_classes: int, + embed_dims: int = 256, + num_reg_fcs: int = 2, + sync_cls_avg_factor: bool = False, + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox: ConfigType = dict(type='L1Loss', loss_weight=5.0), + loss_iou: ConfigType = dict(type='GIoULoss', loss_weight=2.0), + train_cfg: ConfigType = dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='ClassificationCost', weight=1.), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg: ConfigType = dict(max_per_img=100), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.bg_cls_weight = 0 + self.sync_cls_avg_factor = sync_cls_avg_factor + class_weight = loss_cls.get('class_weight', None) + if class_weight is not None and (self.__class__ is DETRHead): + assert isinstance(class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(class_weight)}.' + # NOTE following the official DETR repo, bg_cls_weight means + # relative classification weight of the no-object class. + bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) + assert isinstance(bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(bg_cls_weight)}.' + class_weight = torch.ones(num_classes + 1) * class_weight + # set background class as the last indice + class_weight[num_classes] = bg_cls_weight + loss_cls.update({'class_weight': class_weight}) + if 'bg_cls_weight' in loss_cls: + loss_cls.pop('bg_cls_weight') + self.bg_cls_weight = bg_cls_weight + + if train_cfg: + assert 'assigner' in train_cfg, 'assigner should be provided ' \ + 'when train_cfg is set.' 
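    # Note (illustration, not from the source): with the defaults above
    # (class_weight=1.0, bg_cls_weight=0.1) and, say, num_classes=80, the tensor
    # handed to the classification loss is torch.ones(81) with index 80, the
    # appended no-object class, set to 0.1, so background is down-weighted
    # relative to the real classes.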
+ assigner = train_cfg['assigner'] + self.assigner = TASK_UTILS.build(assigner) + if train_cfg.get('sampler', None) is not None: + raise RuntimeError('DETR do not build sampler.') + self.num_classes = num_classes + self.embed_dims = embed_dims + self.num_reg_fcs = num_reg_fcs + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_iou = MODELS.build(loss_iou) + + if self.loss_cls.use_sigmoid: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the transformer head.""" + # cls branch + self.fc_cls = Linear(self.embed_dims, self.cls_out_channels) + # reg branch + self.activate = nn.ReLU() + self.reg_ffn = FFN( + self.embed_dims, + self.embed_dims, + self.num_reg_fcs, + dict(type='ReLU', inplace=True), + dropout=0.0, + add_residual=False) + # NOTE the activations of reg_branch here is the same as + # those in transformer, but they are actually different + # in DAB-DETR (prelu in transformer and relu in reg_branch) + self.fc_reg = Linear(self.embed_dims, 4) + + def forward(self, hidden_states: Tensor) -> Tuple[Tensor]: + """"Forward function. + + Args: + hidden_states (Tensor): Features from transformer decoder. If + `return_intermediate_dec` in detr.py is True output has shape + (num_decoder_layers, bs, num_queries, dim), else has shape + (1, bs, num_queries, dim) which only contains the last layer + outputs. + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - layers_cls_scores (Tensor): Outputs from the classification head, + shape (num_decoder_layers, bs, num_queries, cls_out_channels). + Note cls_out_channels should include background. + - layers_bbox_preds (Tensor): Sigmoid outputs from the regression + head with normalized coordinate format (cx, cy, w, h), has shape + (num_decoder_layers, bs, num_queries, 4). + """ + layers_cls_scores = self.fc_cls(hidden_states) + layers_bbox_preds = self.fc_reg( + self.activate(self.reg_ffn(hidden_states))).sigmoid() + return layers_cls_scores, layers_bbox_preds + + def loss(self, hidden_states: Tensor, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + hidden_states (Tensor): Feature from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, cls_out_channels) + or (num_decoder_layers, num_queries, bs, cls_out_channels). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """"Loss function. + + Only outputs from the last feature level are used for computing + losses by default. 
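A short sketch of the two prediction branches built in _init_layers and applied in forward above (not from the patch; a plain two-layer MLP stands in for mmcv's FFN, and the sizes are assumptions): classification is a single Linear on the decoder output, regression is an FFN plus Linear with a sigmoid so boxes land in normalized (cx, cy, w, h).

    # Illustrative sketch of the DETR head branches and output shapes.
    import torch
    import torch.nn as nn

    embed_dims, num_classes = 256, 80
    cls_out_channels = num_classes + 1          # softmax variant keeps a background logit

    fc_cls = nn.Linear(embed_dims, cls_out_channels)
    reg_ffn = nn.Sequential(nn.Linear(embed_dims, embed_dims), nn.ReLU(),
                            nn.Linear(embed_dims, embed_dims), nn.ReLU())
    fc_reg = nn.Linear(embed_dims, 4)

    hidden = torch.randn(6, 2, 100, embed_dims)      # (num_layers, bs, num_queries, dim)
    cls_scores = fc_cls(hidden)                      # (6, 2, 100, 81)
    bbox_preds = fc_reg(reg_ffn(hidden)).sigmoid()   # (6, 2, 100, 4), values in [0, 1]
    print(cls_scores.shape, bbox_preds.shape)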
+ + Args: + all_layers_cls_scores (Tensor): Classification outputs + of each decoder layers. Each is a 4D-tensor, has shape + (num_decoder_layers, bs, num_queries, cls_out_channels). + all_layers_bbox_preds (Tensor): Sigmoid regression + outputs of each decoder layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + (num_decoder_layers, bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert batch_gt_instances_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + 'for batch_gt_instances_ignore setting to None.' + + losses_cls, losses_bbox, losses_iou = multi_apply( + self.loss_by_feat_single, + all_layers_cls_scores, + all_layers_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_iou'] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in \ + zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer of a single + feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape (bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
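The per-layer losses above are produced by mapping a single-layer loss function over the stacked decoder outputs with multi_apply. Roughly, that utility is a map followed by a transpose of the returned tuples, sketched here under that assumption (not the verbatim implementation):

    # Illustrative sketch of the multi_apply pattern: apply `func` elementwise
    # over parallel sequences and regroup the per-call tuples into lists, so
    # losses_cls[i] is the classification loss of decoder layer i.
    from functools import partial


    def multi_apply_sketch(func, *args, **kwargs):
        pfunc = partial(func, **kwargs) if kwargs else func
        map_results = map(pfunc, *args)
        return tuple(map(list, zip(*map_results)))


    def fake_layer_loss(cls_score, bbox_pred, scale=1.0):
        # stands in for a function returning (loss_cls, loss_bbox)
        return cls_score * scale, bbox_pred * scale


    losses_cls, losses_bbox = multi_apply_sketch(
        fake_layer_loss, [1.0, 2.0], [0.1, 0.2], scale=0.5)
    print(losses_cls, losses_bbox)  # [0.5, 1.0] [0.05, 0.1]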
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + batch_gt_instances, batch_img_metas) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if isinstance(self.loss_cls, QualityFocalLoss): + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + scores = label_weights.new_zeros(labels.shape) + pos_bbox_targets = bbox_targets[pos_inds] + pos_decode_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets) + pos_bbox_pred = bbox_preds.reshape(-1, 4)[pos_inds] + pos_decode_bbox_pred = bbox_cxcywh_to_xyxy(pos_bbox_pred) + scores[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + loss_cls = self.loss_cls( + cls_scores, (labels, scores), + label_weights, + avg_factor=cls_avg_factor) + else: + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds): + img_h, img_w, = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def get_targets(self, cls_scores_list: List[Tensor], + bbox_preds_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> tuple: + """Compute regression and classification targets for a batch image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image, has shape [num_queries, + cls_out_channels]. 
+ bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_queries, 4]. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. + - num_total_neg (int): Number of negative samples in all images. + """ + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pos_inds_list, + neg_inds_list) = multi_apply(self._get_targets_single, + cls_scores_list, bbox_preds_list, + batch_gt_instances, batch_img_metas) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> tuple: + """Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_queries, 4]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
+ """ + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + num_bboxes = bbox_pred.size(0) + # convert bbox_pred from xywh, normalized to xyxy, unnormalized + bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + bbox_pred = bbox_pred * factor + + pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred) + # assigner and sampler + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta) + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] + + # label targets + labels = gt_bboxes.new_full((num_bboxes, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights[pos_inds] = 1.0 + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + pos_gt_bboxes_normalized = pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + def loss_and_predict( + self, hidden_states: Tuple[Tensor], + batch_data_samples: SampleList) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. Over-write because + img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (tuple[Tensor]): Feature from the transformer + decoder, has shape (num_decoder_layers, bs, num_queries, dim). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns: + tuple: the return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas) + return losses, predictions + + def predict(self, + hidden_states: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. Over-write + because img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. 
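The _get_targets_single method above delegates the one-to-one matching to self.assigner (a HungarianAssigner under the default train_cfg shown earlier). A minimal sketch of that bipartite matching on a toy cost matrix, using SciPy and only an L1 box cost for brevity (the configured assigner also adds classification and GIoU costs with their own weights):

    # Illustrative sketch: Hungarian matching between predicted and GT boxes.
    import torch
    from scipy.optimize import linear_sum_assignment

    pred_boxes = torch.tensor([[0.2, 0.2, 0.1, 0.1],
                               [0.7, 0.7, 0.2, 0.2],
                               [0.5, 0.5, 0.3, 0.3]])  # normalized cxcywh
    gt_boxes = torch.tensor([[0.68, 0.72, 0.2, 0.2],
                             [0.21, 0.19, 0.1, 0.1]])

    cost = torch.cdist(pred_boxes, gt_boxes, p=1)       # L1 cost, shape (3, 2)
    query_idx, gt_idx = linear_sum_assignment(cost.numpy())
    print(list(zip(query_idx.tolist(), gt_idx.tolist())))  # [(0, 1), (1, 0)]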
It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + last_layer_hidden_state = hidden_states[-1].unsqueeze(0) + outs = self(last_layer_hidden_state) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + + return predictions + + def predict_by_feat(self, + layer_cls_scores: Tensor, + layer_bbox_preds: Tensor, + batch_img_metas: List[dict], + rescale: bool = True) -> InstanceList: + """Transform network outputs for a batch into bbox predictions. + + Args: + layer_cls_scores (Tensor): Classification outputs of the last or + all decoder layer. Each is a 4D-tensor, has shape + (num_decoder_layers, bs, num_queries, cls_out_channels). + layer_bbox_preds (Tensor): Sigmoid regression outputs of the last + or all decoder layer. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and shape + (num_decoder_layers, bs, num_queries, 4). + batch_img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If `True`, return boxes in original + image space. Defaults to `True`. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + # NOTE only using outputs from the last feature level, + # and only the outputs from the last decoder layer is used. + cls_scores = layer_cls_scores[-1] + bbox_preds = layer_bbox_preds[-1] + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + img_meta, rescale) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = True) -> InstanceData: + """Transform outputs from the last decoder layer into bbox predictions + for each image. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_queries, 4]. + img_meta (dict): Image meta info. + rescale (bool): If True, return boxes in original image + space. Default True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + assert len(cls_score) == len(bbox_pred) # num_queries + max_per_img = self.test_cfg.get('max_per_img', len(cls_score)) + img_shape = img_meta['img_shape'] + # exclude background + if self.loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + scores, indexes = cls_score.view(-1).topk(max_per_img) + det_labels = indexes % self.num_classes + bbox_index = indexes // self.num_classes + bbox_pred = bbox_pred[bbox_index] + else: + scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) + scores, bbox_index = scores.topk(max_per_img) + bbox_pred = bbox_pred[bbox_index] + det_labels = det_labels[bbox_index] + + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + assert img_meta.get('scale_factor') is not None + det_bboxes /= det_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + results = InstanceData() + results.bboxes = det_bboxes + results.scores = scores + results.labels = det_labels + return results diff --git a/mmdetection/mmdet/models/dense_heads/dino_head.py b/mmdetection/mmdet/models/dense_heads/dino_head.py new file mode 100644 index 0000000..54f46d1 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/dino_head.py @@ -0,0 +1,479 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcywh) +from mmdet.utils import InstanceList, OptInstanceList, reduce_mean +from ..losses import QualityFocalLoss +from ..utils import multi_apply +from .deformable_detr_head import DeformableDETRHead + + +@MODELS.register_module() +class DINOHead(DeformableDETRHead): + r"""Head of the DINO: DETR with Improved DeNoising Anchor Boxes + for End-to-End Object Detection + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . + """ + + def loss(self, hidden_states: Tensor, references: List[Tensor], + enc_outputs_class: Tensor, enc_outputs_coord: Tensor, + batch_data_samples: SampleList, dn_meta: Dict[str, int]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries_total, + dim), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries` when + `self.training` is `True`, else `num_matching_queries`. + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries_total, 4) and each `inter_reference` has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + enc_outputs_class (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). 
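A tiny sketch of the sigmoid top-k decoding used in the body above (not from the patch; scores and sizes are made up): the (num_queries, num_classes) score map is flattened, a global top-k is taken, and the query index and class label are recovered with integer division and modulo.

    # Illustrative sketch: global top-k over flattened per-class scores.
    import torch

    num_queries, num_classes, max_per_img = 4, 3, 2
    cls_score = torch.tensor([[0.1, 0.9, 0.2],
                              [0.8, 0.1, 0.1],
                              [0.3, 0.2, 0.1],
                              [0.1, 0.1, 0.7]])  # already sigmoid-ed

    scores, indexes = cls_score.view(-1).topk(max_per_img)
    det_labels = indexes % num_classes
    bbox_index = indexes // num_classes
    print(scores, det_labels, bbox_index)
    # tensor([0.9000, 0.8000]) tensor([1, 0]) tensor([0, 1])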
+ enc_outputs_coord (Tensor): The proposal generate from the + encode feature map, has shape (bs, num_feat_points, 4) with the + last dimension arranged as (cx, cy, w, h). + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, + batch_gt_instances, batch_img_metas, dn_meta) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels), where + `num_queries_total` is the sum of `num_denoising_queries` + and `num_matching_queries`. + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + enc_cls_scores (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + enc_bbox_preds (Tensor): The proposal generate from the encode + feature map, has shape (bs, num_feat_points, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + # extract denoising and matching part of outputs + (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \ + self.split_outputs( + all_layers_cls_scores, all_layers_bbox_preds, dn_meta) + + loss_dict = super(DeformableDETRHead, self).loss_by_feat( + all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + batch_gt_instances, batch_img_metas, batch_gt_instances_ignore) + # NOTE DETRHead.loss_by_feat but not DeformableDETRHead.loss_by_feat + # is called, because the encoder loss calculations are different + # between DINO and DeformableDETR. 
+ + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + # NOTE The enc_loss calculation of the DINO is + # different from that of Deformable DETR. + enc_loss_cls, enc_losses_bbox, enc_losses_iou = \ + self.loss_by_feat_single( + enc_cls_scores, enc_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + loss_dict['enc_loss_iou'] = enc_losses_iou + + if all_layers_denoising_cls_scores is not None: + # calculate denoising loss from all decoder layers + dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn( + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta) + # collate denoising loss + loss_dict['dn_loss_cls'] = dn_losses_cls[-1] + loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1] + loss_dict['dn_loss_iou'] = dn_losses_iou[-1] + for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \ + enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1], + dn_losses_iou[:-1])): + loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i + return loss_dict + + def loss_dn(self, all_layers_denoising_cls_scores: Tensor, + all_layers_denoising_bbox_preds: Tensor, + batch_gt_instances: InstanceList, batch_img_metas: List[dict], + dn_meta: Dict[str, int]) -> Tuple[List[Tensor]]: + """Calculate denoising loss. + + Args: + all_layers_denoising_cls_scores (Tensor): Classification scores of + all decoder layers in denoising part, has shape ( + num_decoder_layers, bs, num_denoising_queries, + cls_out_channels). + all_layers_denoising_bbox_preds (Tensor): Regression outputs of all + decoder layers in denoising part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[List[Tensor]]: The loss_dn_cls, loss_dn_bbox, and loss_dn_iou + of each decoder layers. + """ + return multi_apply( + self._loss_dn_single, + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta) + + def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int]) -> Tuple[Tensor]: + """Denoising loss for outputs from a single decoder layer. + + Args: + dn_cls_scores (Tensor): Classification scores of a single decoder + layer in denoising part, has shape (bs, num_denoising_queries, + cls_out_channels). + dn_bbox_preds (Tensor): Regression outputs of a single decoder + layer in denoising part. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and has shape + (bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. 
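A minimal sketch of the denoising / matching split driven by dn_meta in the loss above (not from the patch; all sizes are made up): the first num_denoising_queries entries along the query axis belong to the denoising groups and the rest are the ordinary matching queries, exactly the slicing that split_outputs performs.

    # Illustrative sketch: split stacked decoder outputs into denoising and
    # matching parts along the query dimension.
    import torch

    num_layers, bs, num_dn, num_match, num_classes = 6, 2, 60, 100, 80
    dn_meta = {'num_denoising_queries': num_dn, 'num_denoising_groups': 5}

    all_cls = torch.randn(num_layers, bs, num_dn + num_match, num_classes)
    dn_cls = all_cls[:, :, :dn_meta['num_denoising_queries'], :]
    matching_cls = all_cls[:, :, dn_meta['num_denoising_queries']:, :]
    print(dn_cls.shape, matching_cls.shape)
    # torch.Size([6, 2, 60, 80]) torch.Size([6, 2, 100, 80])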
It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. + """ + cls_reg_targets = self.get_dn_targets(batch_gt_instances, + batch_img_metas, dn_meta) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = \ + num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if len(cls_scores) > 0: + if isinstance(self.loss_cls, QualityFocalLoss): + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + scores = label_weights.new_zeros(labels.shape) + pos_bbox_targets = bbox_targets[pos_inds] + pos_decode_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets) + pos_bbox_pred = dn_bbox_preds.reshape(-1, 4)[pos_inds] + pos_decode_bbox_pred = bbox_cxcywh_to_xyxy(pos_bbox_pred) + scores[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + loss_cls = self.loss_cls( + cls_scores, (labels, scores), + weight=label_weights, + avg_factor=cls_avg_factor) + else: + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) + else: + loss_cls = torch.zeros( + 1, dtype=cls_scores.dtype, device=cls_scores.device) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds): + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. 
So here + # we need to re-scale them for calculating IoU loss + bbox_preds = dn_bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def get_dn_targets(self, batch_gt_instances: InstanceList, + batch_img_metas: dict, dn_meta: Dict[str, + int]) -> tuple: + """Get targets in denoising part for a batch of images. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. + - num_total_neg (int): Number of negative samples in all images. + """ + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pos_inds_list, neg_inds_list) = multi_apply( + self._get_dn_targets_single, + batch_gt_instances, + batch_img_metas, + dn_meta=dn_meta) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def _get_dn_targets_single(self, gt_instances: InstanceData, + img_meta: dict, dn_meta: Dict[str, + int]) -> tuple: + """Get targets in denoising part for one image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
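+
+            A hedged, worked illustration of the index layout produced
+            below (assuming ``num_denoising_groups=2`` and
+            ``num_denoising_queries=8``, i.e. four query slots per group,
+            with two GT boxes in the image):
+
+            >>> import torch
+            >>> t = torch.arange(2)  # one index per GT box
+            >>> pos_inds = (torch.arange(2).unsqueeze(1) * 4 + t).flatten()
+            >>> pos_inds
+            tensor([0, 1, 4, 5])
+            >>> pos_inds + 4 // 2  # paired negative slots, half a group later
+            tensor([2, 3, 6, 7])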
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_groups = dn_meta['num_denoising_groups'] + num_denoising_queries = dn_meta['num_denoising_queries'] + num_queries_each_group = int(num_denoising_queries / num_groups) + device = gt_bboxes.device + + if len(gt_labels) > 0: + t = torch.arange(len(gt_labels), dtype=torch.long, device=device) + t = t.unsqueeze(0).repeat(num_groups, 1) + pos_assigned_gt_inds = t.flatten() + pos_inds = torch.arange( + num_groups, dtype=torch.long, device=device) + pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t + pos_inds = pos_inds.flatten() + else: + pos_inds = pos_assigned_gt_inds = \ + gt_bboxes.new_tensor([], dtype=torch.long) + + neg_inds = pos_inds + num_queries_each_group // 2 + + # label targets + labels = gt_bboxes.new_full((num_denoising_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_denoising_queries) + + # bbox targets + bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights[pos_inds] = 1.0 + img_h, img_w = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + gt_bboxes_normalized = gt_bboxes / factor + gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) + bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + @staticmethod + def split_outputs(all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + dn_meta: Dict[str, int]) -> Tuple[Tensor]: + """Split outputs of the denoising part and the matching part. + + For the total outputs of `num_queries_total` length, the former + `num_denoising_queries` outputs are from denoising queries, and + the rest `num_matching_queries` ones are from matching queries, + where `num_queries_total` is the sum of `num_denoising_queries` and + `num_matching_queries`. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. + + Returns: + Tuple[Tensor]: a tuple containing the following outputs. + + - all_layers_matching_cls_scores (Tensor): Classification scores + of all decoder layers in matching part, has shape + (num_decoder_layers, bs, num_matching_queries, cls_out_channels). + - all_layers_matching_bbox_preds (Tensor): Regression outputs of + all decoder layers in matching part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_matching_queries, 4). + - all_layers_denoising_cls_scores (Tensor): Classification scores + of all decoder layers in denoising part, has shape + (num_decoder_layers, bs, num_denoising_queries, + cls_out_channels). 
+ - all_layers_denoising_bbox_preds (Tensor): Regression outputs of + all decoder layers in denoising part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_denoising_queries, 4). + """ + num_denoising_queries = dn_meta['num_denoising_queries'] + if dn_meta is not None: + all_layers_denoising_cls_scores = \ + all_layers_cls_scores[:, :, : num_denoising_queries, :] + all_layers_denoising_bbox_preds = \ + all_layers_bbox_preds[:, :, : num_denoising_queries, :] + all_layers_matching_cls_scores = \ + all_layers_cls_scores[:, :, num_denoising_queries:, :] + all_layers_matching_bbox_preds = \ + all_layers_bbox_preds[:, :, num_denoising_queries:, :] + else: + all_layers_denoising_cls_scores = None + all_layers_denoising_bbox_preds = None + all_layers_matching_cls_scores = all_layers_cls_scores + all_layers_matching_bbox_preds = all_layers_bbox_preds + return (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds) diff --git a/mmdetection/mmdet/models/dense_heads/embedding_rpn_head.py b/mmdetection/mmdet/models/dense_heads/embedding_rpn_head.py new file mode 100644 index 0000000..97e84fa --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/embedding_rpn_head.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmdet.structures.det_data_sample import SampleList +from mmdet.utils import InstanceList, OptConfigType + + +@MODELS.register_module() +class EmbeddingRPNHead(BaseModule): + """RPNHead in the `Sparse R-CNN `_ . + + Unlike traditional RPNHead, this module does not need FPN input, but just + decode `init_proposal_bboxes` and expand the first dimension of + `init_proposal_bboxes` and `init_proposal_features` to the batch_size. + + Args: + num_proposals (int): Number of init_proposals. Defaults to 100. + proposal_feature_channel (int): Channel number of + init_proposal_feature. Defaults to 256. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. Defaults to None. + """ + + def __init__(self, + num_proposals: int = 100, + proposal_feature_channel: int = 256, + init_cfg: OptConfigType = None, + **kwargs) -> None: + # `**kwargs` is necessary to avoid some potential error. + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.num_proposals = num_proposals + self.proposal_feature_channel = proposal_feature_channel + self._init_layers() + + def _init_layers(self) -> None: + """Initialize a sparse set of proposal boxes and proposal features.""" + self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4) + self.init_proposal_features = nn.Embedding( + self.num_proposals, self.proposal_feature_channel) + + def init_weights(self) -> None: + """Initialize the init_proposal_bboxes as normalized. + + [c_x, c_y, w, h], and we initialize it to the size of the entire + image. 
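+
+        A quick check of the effect, as a hedged sketch (default
+        constructor arguments are assumed; the constants come from the
+        body below):
+
+        >>> head = EmbeddingRPNHead(num_proposals=2)
+        >>> head.init_weights()
+        >>> bool((head.init_proposal_bboxes.weight[:, :2] == 0.5).all())
+        True
+        >>> bool((head.init_proposal_bboxes.weight[:, 2:] == 1.0).all())
+        True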
+ """ + super().init_weights() + nn.init.constant_(self.init_proposal_bboxes.weight[:, :2], 0.5) + nn.init.constant_(self.init_proposal_bboxes.weight[:, 2:], 1) + + def _decode_init_proposals(self, x: List[Tensor], + batch_data_samples: SampleList) -> InstanceList: + """Decode init_proposal_bboxes according to the size of images and + expand dimension of init_proposal_features to batch_size. + + Args: + x (list[Tensor]): List of FPN features. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + List[:obj:`InstanceData`:] Detection results of each image. + Each item usually contains following keys. + + - proposals: Decoded proposal bboxes, + has shape (num_proposals, 4). + - features: init_proposal_features, expanded proposal + features, has shape + (num_proposals, proposal_feature_channel). + - imgs_whwh: Tensor with shape + (num_proposals, 4), the dimension means + [img_width, img_height, img_width, img_height]. + """ + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + + proposals = self.init_proposal_bboxes.weight.clone() + proposals = bbox_cxcywh_to_xyxy(proposals) + imgs_whwh = [] + for meta in batch_img_metas: + h, w = meta['img_shape'][:2] + imgs_whwh.append(x[0].new_tensor([[w, h, w, h]])) + imgs_whwh = torch.cat(imgs_whwh, dim=0) + imgs_whwh = imgs_whwh[:, None, :] + proposals = proposals * imgs_whwh + + rpn_results_list = [] + for idx in range(len(batch_img_metas)): + rpn_results = InstanceData() + rpn_results.bboxes = proposals[idx] + rpn_results.imgs_whwh = imgs_whwh[idx].repeat( + self.num_proposals, 1) + rpn_results.features = self.init_proposal_features.weight.clone() + rpn_results_list.append(rpn_results) + return rpn_results_list + + def loss(self, *args, **kwargs): + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network.""" + raise NotImplementedError( + 'EmbeddingRPNHead does not have `loss`, please use ' + '`predict` or `loss_and_predict` instead.') + + def predict(self, x: List[Tensor], batch_data_samples: SampleList, + **kwargs) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network.""" + # `**kwargs` is necessary to avoid some potential error. + return self._decode_init_proposals( + x=x, batch_data_samples=batch_data_samples) + + def loss_and_predict(self, x: List[Tensor], batch_data_samples: SampleList, + **kwargs) -> tuple: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples.""" + # `**kwargs` is necessary to avoid some potential error. + predictions = self._decode_init_proposals( + x=x, batch_data_samples=batch_data_samples) + + return dict(), predictions diff --git a/mmdetection/mmdet/models/dense_heads/fcos_head.py b/mmdetection/mmdet/models/dense_heads/fcos_head.py new file mode 100644 index 0000000..ba4d464 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/fcos_head.py @@ -0,0 +1,476 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import Scale +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.layers import NormedConv2d +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, + OptInstanceList, RangeType, reduce_mean) +from ..utils import multi_apply +from .anchor_free_head import AnchorFreeHead + +INF = 1e8 + + +@MODELS.register_module() +class FCOSHead(AnchorFreeHead): + """Anchor-free head used in `FCOS `_. + + The FCOS head does not use anchor boxes. Instead bounding boxes are + predicted at each pixel and a centerness measure is used to suppress + low-quality predictions. + Here norm_on_bbox, centerness_on_reg, dcn_on_last_conv are training + tricks used in official repo, which will bring remarkable mAP gains + of up to 4.9. Please see https://github.com/tianzhi0549/FCOS for + more detail. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + strides (Sequence[int] or Sequence[Tuple[int, int]]): Strides of points + in multiple feature levels. Defaults to (4, 8, 16, 32, 64). + regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. + Defaults to False. + center_sample_radius (float): Radius of center sampling. + Defaults to 1.5. + norm_on_bbox (bool): If true, normalize the regression targets with + FPN strides. Defaults to False. + centerness_on_reg (bool): If true, position centerness on the + regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. + Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Defaults to "auto". + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_centerness (:obj:`ConfigDict`, or dict): Config of centerness + loss. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. Defaults to + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + cls_predictor_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config conv_cls. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. 
+ + Example: + >>> self = FCOSHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred, centerness = self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ # noqa: E501 + + def __init__(self, + num_classes: int, + in_channels: int, + regress_ranges: RangeType = ((-1, 64), (64, 128), (128, 256), + (256, 512), (512, INF)), + center_sampling: bool = False, + center_sample_radius: float = 1.5, + norm_on_bbox: bool = False, + centerness_on_reg: bool = False, + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox: ConfigType = dict(type='IoULoss', loss_weight=1.0), + loss_centerness: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + cls_predictor_cfg=None, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='conv_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.regress_ranges = regress_ranges + self.center_sampling = center_sampling + self.center_sample_radius = center_sample_radius + self.norm_on_bbox = norm_on_bbox + self.centerness_on_reg = centerness_on_reg + self.cls_predictor_cfg = cls_predictor_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.loss_centerness = MODELS.build(loss_centerness) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + super()._init_layers() + self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + if self.cls_predictor_cfg is not None: + self.cls_predictor_cfg.pop('type') + self.conv_cls = NormedConv2d( + self.feat_channels, + self.cls_out_channels, + 1, + padding=0, + **self.cls_predictor_cfg) + + def forward( + self, x: Tuple[Tensor] + ) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of each level outputs. + + - cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is \ + num_points * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for each \ + scale level, each is a 4D-tensor, the channel number is \ + num_points * 4. + - centernesses (list[Tensor]): centerness for each scale level, \ + each is a 4D-tensor, the channel number is num_points * 1. + """ + return multi_apply(self.forward_single, x, self.scales, self.strides) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox predictions and centerness + predictions of input feature maps. 
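+
+        A hedged sketch of the two decoding conventions handled below, on
+        a toy prediction with an assumed ``stride`` of 8:
+
+        >>> import torch
+        >>> bbox_pred = torch.tensor([-1.0, 2.0])
+        >>> (bbox_pred.clamp(min=0) * 8).tolist()  # norm_on_bbox=True, test time
+        [0.0, 16.0]
+        >>> [round(v, 4) for v in bbox_pred.exp().tolist()]  # norm_on_bbox=False
+        [0.3679, 7.3891]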
+ """ + cls_score, bbox_pred, cls_feat, reg_feat = super().forward_single(x) + if self.centerness_on_reg: + centerness = self.conv_centerness(reg_feat) + else: + centerness = self.conv_centerness(cls_feat) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + if self.norm_on_bbox: + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + else: + bbox_pred = bbox_pred.exp() + return cls_score, bbox_pred, centerness + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + centernesses (list[Tensor]): centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
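+
+        A hedged shape sketch of the flattening performed below (an
+        assumed batch of 2 images, 80 classes and a single 4x4 feature
+        level):
+
+        >>> import torch
+        >>> cls_score = torch.rand(2, 80, 4, 4)
+        >>> cls_score.permute(0, 2, 3, 1).reshape(-1, 80).shape
+        torch.Size([32, 80])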
+ """ + assert len(cls_scores) == len(bbox_preds) == len(centernesses) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + labels, bbox_targets = self.get_targets(all_level_points, + batch_gt_instances) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + losses = dict() + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels >= 0) + & (flatten_labels < bg_class_ind)).nonzero().reshape(-1) + num_pos = torch.tensor( + len(pos_inds), dtype=torch.float, device=bbox_preds[0].device) + num_pos = max(reduce_mean(num_pos), 1.0) + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, avg_factor=num_pos) + + if getattr(self.loss_cls, 'custom_accuracy', False): + acc = self.loss_cls.get_accuracy(flatten_cls_scores, + flatten_labels) + losses.update(acc) + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_centerness_targets = self.centerness_target(pos_bbox_targets) + # centerness weighted iou loss + centerness_denorm = max( + reduce_mean(pos_centerness_targets.sum().detach()), 1e-6) + + if len(pos_inds) > 0: + pos_points = flatten_points[pos_inds] + pos_decoded_bbox_preds = self.bbox_coder.decode( + pos_points, pos_bbox_preds) + pos_decoded_target_preds = self.bbox_coder.decode( + pos_points, pos_bbox_targets) + loss_bbox = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + weight=pos_centerness_targets, + avg_factor=centerness_denorm) + loss_centerness = self.loss_centerness( + pos_centerness, pos_centerness_targets, avg_factor=num_pos) + else: + loss_bbox = pos_bbox_preds.sum() + loss_centerness = pos_centerness.sum() + + losses['loss_cls'] = loss_cls + losses['loss_bbox'] = loss_bbox + losses['loss_centerness'] = loss_centerness + + return losses + + def get_targets( + self, points: List[Tensor], batch_gt_instances: InstanceList + ) -> Tuple[List[Tensor], List[Tensor]]: + """Compute regression, classification and centerness targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: Targets of each level. + + - concat_lvl_labels (list[Tensor]): Labels of each level. + - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. 
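+
+            A hedged sketch of the per-image, per-level split used below
+            (two assumed levels with 4 and 2 points respectively):
+
+            >>> import torch
+            >>> labels = torch.arange(6)
+            >>> [part.tolist() for part in labels.split([4, 2], 0)]
+            [[0, 1, 2, 3], [4, 5]]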
+ """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + # get labels and bbox_targets of each image + labels_list, bbox_targets_list = multi_apply( + self._get_targets_single, + batch_gt_instances, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + labels_list = [labels.split(num_points, 0) for labels in labels_list] + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + + # concat per level image + concat_lvl_labels = [] + concat_lvl_bbox_targets = [] + for i in range(num_levels): + concat_lvl_labels.append( + torch.cat([labels[i] for labels in labels_list])) + bbox_targets = torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list]) + if self.norm_on_bbox: + bbox_targets = bbox_targets / self.strides[i] + concat_lvl_bbox_targets.append(bbox_targets) + return concat_lvl_labels, concat_lvl_bbox_targets + + def _get_targets_single( + self, gt_instances: InstanceData, points: Tensor, + regress_ranges: Tensor, + num_points_per_lvl: List[int]) -> Tuple[Tensor, Tensor]: + """Compute regression and classification targets for a single image.""" + num_points = points.size(0) + num_gts = len(gt_instances) + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + + if num_gts == 0: + return gt_labels.new_full((num_points,), self.num_classes), \ + gt_bboxes.new_zeros((num_points, 4)) + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # TODO: figure out why these two are different + # areas = areas[None].expand(num_points, num_gts) + areas = areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, :].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + if self.center_sampling: + # condition1: inside a `center bbox` + radius = self.center_sample_radius + center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2 + center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2 + center_gts = torch.zeros_like(gt_bboxes) + stride = center_xs.new_zeros(center_xs.shape) + + # project the points on current lvl back to the `original` sizes + lvl_begin = 0 + for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): + lvl_end = lvl_begin + num_points_lvl + stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius + lvl_begin = lvl_end + + x_mins = center_xs - stride + y_mins = center_ys - stride + x_maxs = center_xs + stride + y_maxs = center_ys + stride + center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0], + x_mins, gt_bboxes[..., 0]) + center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1], + y_mins, gt_bboxes[..., 1]) + center_gts[..., 2] = torch.where(x_maxs > 
gt_bboxes[..., 2], + gt_bboxes[..., 2], x_maxs) + center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3], + gt_bboxes[..., 3], y_maxs) + + cb_dist_left = xs - center_gts[..., 0] + cb_dist_right = center_gts[..., 2] - xs + cb_dist_top = ys - center_gts[..., 1] + cb_dist_bottom = center_gts[..., 3] - ys + center_bbox = torch.stack( + (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + else: + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 + + # condition2: limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + (max_regress_distance >= regress_ranges[..., 0]) + & (max_regress_distance <= regress_ranges[..., 1])) + + # if there are still more than one objects for a location, + # we choose the one with minimal area + areas[inside_gt_bbox_mask == 0] = INF + areas[inside_regress_range == 0] = INF + min_area, min_area_inds = areas.min(dim=1) + + labels = gt_labels[min_area_inds] + labels[min_area == INF] = self.num_classes # set as BG + bbox_targets = bbox_targets[range(num_points), min_area_inds] + + return labels, bbox_targets + + def centerness_target(self, pos_bbox_targets: Tensor) -> Tensor: + """Compute centerness targets. + + Args: + pos_bbox_targets (Tensor): BBox targets of positive bboxes in shape + (num_pos, 4) + + Returns: + Tensor: Centerness target. + """ + # only calculate pos centerness targets, otherwise there may be nan + left_right = pos_bbox_targets[:, [0, 2]] + top_bottom = pos_bbox_targets[:, [1, 3]] + if len(left_right) == 0: + centerness_targets = left_right[..., 0] + else: + centerness_targets = ( + left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * ( + top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) + return torch.sqrt(centerness_targets) diff --git a/mmdetection/mmdet/models/dense_heads/fovea_head.py b/mmdetection/mmdet/models/dense_heads/fovea_head.py new file mode 100644 index 0000000..89353de --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/fovea_head.py @@ -0,0 +1,509 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import DeformConv2d +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig +from ..utils import filter_scores_and_topk, multi_apply +from .anchor_free_head import AnchorFreeHead + +INF = 1e8 + + +class FeatureAlign(BaseModule): + """Feature Align Module. + + Feature Align Module is implemented based on DCN v1. + It uses anchor shape prediction rather than feature map to + predict offsets of deform conv layer. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels in the output feature map. + kernel_size (int): Size of the convolution kernel. + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + deform_groups: (int): Group number of DCN in + FeatureAdaption module. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. 
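+
+    A hedged sketch of the offset bookkeeping (with the default
+    ``kernel_size=3`` and ``deform_groups=4``, the 4-channel bbox
+    prediction is mapped to 3 * 3 * 2 * 4 = 72 offset channels):
+
+    >>> align = FeatureAlign(in_channels=8, out_channels=8)
+    >>> align.conv_offset.out_channels
+    72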
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + deform_groups: int = 4, + init_cfg: OptMultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.1, + override=dict(type='Normal', name='conv_adaption', std=0.01)) + ) -> None: + super().__init__(init_cfg=init_cfg) + offset_channels = kernel_size * kernel_size * 2 + self.conv_offset = nn.Conv2d( + 4, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + deform_groups=deform_groups) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor, shape: Tensor) -> Tensor: + """Forward function of feature align module. + + Args: + x (Tensor): Features from the upstream network. + shape (Tensor): Exponential of bbox predictions. + + Returns: + x (Tensor): The aligned features. + """ + offset = self.conv_offset(shape) + x = self.relu(self.conv_adaption(x, offset)) + return x + + +@MODELS.register_module() +class FoveaHead(AnchorFreeHead): + """Detection Head of `FoveaBox: Beyond Anchor-based Object Detector. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + base_edge_list (list[int]): List of edges. + scale_ranges (list[tuple]): Range of scales. + sigma (float): Super parameter of ``FoveaHead``. + with_deform (bool): Whether use deform conv. + deform_groups (int): Deformable conv group size. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + base_edge_list: List[int] = (16, 32, 64, 128, 256), + scale_ranges: List[tuple] = ((8, 32), (16, 64), (32, 128), + (64, 256), (128, 512)), + sigma: float = 0.4, + with_deform: bool = False, + deform_groups: int = 4, + init_cfg: OptMultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='conv_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.base_edge_list = base_edge_list + self.scale_ranges = scale_ranges + self.sigma = sigma + self.with_deform = with_deform + self.deform_groups = deform_groups + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + # box branch + super()._init_reg_convs() + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + # cls branch + if not self.with_deform: + super()._init_cls_convs() + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + else: + self.cls_convs = nn.ModuleList() + self.cls_convs.append( + ConvModule( + self.feat_channels, (self.feat_channels * 4), + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + self.cls_convs.append( + ConvModule((self.feat_channels * 4), (self.feat_channels * 4), + 1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + self.feature_adaption = FeatureAlign( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.conv_cls = nn.Conv2d( + int(self.feat_channels * 4), + self.cls_out_channels, + 3, + padding=1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward features of a single scale 
level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + + Returns: + tuple: scores for each class and bbox predictions of input + feature maps. + """ + cls_feat = x + reg_feat = x + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + if self.with_deform: + cls_feat = self.feature_adaption(cls_feat, bbox_pred.exp()) + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + return cls_score, bbox_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + num_imgs = cls_scores[0].size(0) + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_labels, flatten_bbox_targets = self.get_targets( + batch_gt_instances, featmap_sizes, priors) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((flatten_labels >= 0) + & (flatten_labels < self.num_classes)).nonzero().view(-1) + num_pos = len(pos_inds) + + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs) + if num_pos > 0: + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_weights = pos_bbox_targets.new_ones(pos_bbox_targets.size()) + loss_bbox = self.loss_bbox( + pos_bbox_preds, + pos_bbox_targets, + pos_weights, + avg_factor=num_pos) + else: + loss_bbox = torch.tensor( + 0, + dtype=flatten_bbox_preds.dtype, + device=flatten_bbox_preds.device) + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox) + + def get_targets( + self, batch_gt_instances: InstanceList, featmap_sizes: List[tuple], + priors_list: List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Compute regression and classification for priors in multiple images. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + featmap_sizes (list[tuple]): Size tuple of feature maps. 
+ priors_list (list[Tensor]): Priors list of each fpn level, each has + shape (num_priors, 2). + + Returns: + tuple: Targets of each level. + + - flatten_labels (list[Tensor]): Labels of each level. + - flatten_bbox_targets (list[Tensor]): BBox targets of each + level. + """ + label_list, bbox_target_list = multi_apply( + self._get_targets_single, + batch_gt_instances, + featmap_size_list=featmap_sizes, + priors_list=priors_list) + flatten_labels = [ + torch.cat([ + labels_level_img.flatten() for labels_level_img in labels_level + ]) for labels_level in zip(*label_list) + ] + flatten_bbox_targets = [ + torch.cat([ + bbox_targets_level_img.reshape(-1, 4) + for bbox_targets_level_img in bbox_targets_level + ]) for bbox_targets_level in zip(*bbox_target_list) + ] + flatten_labels = torch.cat(flatten_labels) + flatten_bbox_targets = torch.cat(flatten_bbox_targets) + return flatten_labels, flatten_bbox_targets + + def _get_targets_single(self, + gt_instances: InstanceData, + featmap_size_list: List[tuple] = None, + priors_list: List[Tensor] = None) -> tuple: + """Compute regression and classification targets for a single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + featmap_size_list (list[tuple]): Size tuple of feature maps. + priors_list (list[Tensor]): Priors of each fpn level, each has + shape (num_priors, 2). + + Returns: + tuple: + + - label_list (list[Tensor]): Labels of all anchors in the image. + - box_target_list (list[Tensor]): BBox targets of all anchors in + the image. + """ + gt_bboxes_raw = gt_instances.bboxes + gt_labels_raw = gt_instances.labels + gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * + (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) + label_list = [] + bbox_target_list = [] + # for each pyramid, find the cls and box target + for base_len, (lower_bound, upper_bound), stride, featmap_size, \ + priors in zip(self.base_edge_list, self.scale_ranges, + self.strides, featmap_size_list, priors_list): + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + priors = priors.view(*featmap_size, 2) + x, y = priors[..., 0], priors[..., 1] + labels = gt_labels_raw.new_full(featmap_size, self.num_classes) + bbox_targets = gt_bboxes_raw.new_ones(featmap_size[0], + featmap_size[1], 4) + # scale assignment + hit_indices = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero().flatten() + if len(hit_indices) == 0: + label_list.append(labels) + bbox_target_list.append(torch.log(bbox_targets)) + continue + _, hit_index_order = torch.sort(-gt_areas[hit_indices]) + hit_indices = hit_indices[hit_index_order] + gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride + gt_labels = gt_labels_raw[hit_indices] + half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0]) + half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # valid fovea area: left, right, top, down + pos_left = torch.ceil( + gt_bboxes[:, 0] + (1 - self.sigma) * half_w - 0.5).long(). \ + clamp(0, featmap_size[1] - 1) + pos_right = torch.floor( + gt_bboxes[:, 0] + (1 + self.sigma) * half_w - 0.5).long(). \ + clamp(0, featmap_size[1] - 1) + pos_top = torch.ceil( + gt_bboxes[:, 1] + (1 - self.sigma) * half_h - 0.5).long(). \ + clamp(0, featmap_size[0] - 1) + pos_down = torch.floor( + gt_bboxes[:, 1] + (1 + self.sigma) * half_h - 0.5).long(). 
\ + clamp(0, featmap_size[0] - 1) + for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \ + zip(pos_left, pos_top, pos_right, pos_down, gt_labels, + gt_bboxes_raw[hit_indices, :]): + labels[py1:py2 + 1, px1:px2 + 1] = label + bbox_targets[py1:py2 + 1, px1:px2 + 1, 0] = \ + (x[py1:py2 + 1, px1:px2 + 1] - gt_x1) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 1] = \ + (y[py1:py2 + 1, px1:px2 + 1] - gt_y1) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 2] = \ + (gt_x2 - x[py1:py2 + 1, px1:px2 + 1]) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 3] = \ + (gt_y2 - y[py1:py2 + 1, px1:px2 + 1]) / base_len + bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.) + label_list.append(labels) + bbox_target_list.append(torch.log(bbox_targets)) + return label_list, bbox_target_list + + # Same as base_dense_head/_predict_by_feat_single except self._bbox_decode + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, has shape + (num_priors, 2). + img_meta (dict): Image meta info. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_score_list) == len(bbox_pred_list) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + for level_idx, (cls_score, bbox_pred, stride, base_len, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, self.strides, + self.base_edge_list, mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. 
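+            # `filter_scores_and_topk` drops candidates whose score is not
+            # above `score_thr`, keeps at most `nms_pre` of the rest, and
+            # slices the extra tensors passed in the dict (bbox_pred,
+            # priors) with the same kept indices.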
+ results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, _, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + + bboxes = self._bbox_decode(priors, bbox_pred, base_len, img_shape) + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def _bbox_decode(self, priors: Tensor, bbox_pred: Tensor, base_len: int, + max_shape: int) -> Tensor: + """Function to decode bbox. + + Args: + priors (Tensor): Center proiors of an image, has shape + (num_instances, 2). + bbox_preds (Tensor): Box energies / deltas for all instances, + has shape (batch_size, num_instances, 4). + base_len (int): The base length. + max_shape (int): The max shape of bbox. + + Returns: + Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has + shape (batch_size, num_instances, 4). + """ + bbox_pred = bbox_pred.exp() + + y = priors[:, 1] + x = priors[:, 0] + x1 = (x - base_len * bbox_pred[:, 0]). \ + clamp(min=0, max=max_shape[1] - 1) + y1 = (y - base_len * bbox_pred[:, 1]). \ + clamp(min=0, max=max_shape[0] - 1) + x2 = (x + base_len * bbox_pred[:, 2]). \ + clamp(min=0, max=max_shape[1] - 1) + y2 = (y + base_len * bbox_pred[:, 3]). \ + clamp(min=0, max=max_shape[0] - 1) + decoded_bboxes = torch.stack([x1, y1, x2, y2], -1) + return decoded_bboxes diff --git a/mmdetection/mmdet/models/dense_heads/free_anchor_retina_head.py b/mmdetection/mmdet/models/dense_heads/free_anchor_retina_head.py new file mode 100644 index 0000000..df6fb92 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/free_anchor_retina_head.py @@ -0,0 +1,312 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import InstanceList, OptConfigType, OptInstanceList +from ..utils import multi_apply +from .retina_head import RetinaHead + +EPS = 1e-12 + + +@MODELS.register_module() +class FreeAnchorRetinaHead(RetinaHead): + """FreeAnchor RetinaHead used in https://arxiv.org/abs/1909.02466. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Defaults to 4. + conv_cfg (:obj:`ConfigDict` or dict, optional): dictionary to + construct and config conv layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to + construct and config norm layer. Defaults to + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). + pre_anchor_topk (int): Number of boxes that be token in each bag. + Defaults to 50 + bbox_thr (float): The threshold of the saturated linear function. + It is usually the same with the IoU threshold used in NMS. + Defaults to 0.6. + gamma (float): Gamma parameter in focal loss. Defaults to 2.0. + alpha (float): Alpha parameter in focal loss. Defaults to 0.5. 
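+
+    A hedged, worked sketch of the saturated linear function used in
+    ``positive_loss_single`` to turn anchor-to-object IoUs into matching
+    probabilities (``bbox_thr=0.6`` as the lower bound and the per-object
+    maximum IoU as the upper bound; the IoU values are illustrative only):
+
+    >>> import torch
+    >>> iou = torch.tensor([0.5, 0.6, 0.7, 0.8])
+    >>> t1, t2 = 0.6, iou.max()
+    >>> probs = ((iou - t1) / (t2 - t1)).clamp(min=0, max=1)
+    >>> [round(v, 2) for v in probs.tolist()]
+    [0.0, 0.0, 0.5, 1.0]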
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + pre_anchor_topk: int = 50, + bbox_thr: float = 0.6, + gamma: float = 2.0, + alpha: float = 0.5, + **kwargs) -> None: + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + stacked_convs=stacked_convs, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs) + + self.pre_anchor_topk = pre_anchor_topk + self.bbox_thr = bbox_thr + self.gamma = gamma + self.alpha = alpha + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, _ = self.get_anchors( + featmap_sizes=featmap_sizes, + batch_img_metas=batch_img_metas, + device=device) + concat_anchor_list = [torch.cat(anchor) for anchor in anchor_list] + + # concatenate each level + cls_scores = [ + cls.permute(0, 2, 3, + 1).reshape(cls.size(0), -1, self.cls_out_channels) + for cls in cls_scores + ] + bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(bbox_pred.size(0), -1, 4) + for bbox_pred in bbox_preds + ] + cls_scores = torch.cat(cls_scores, dim=1) + cls_probs = torch.sigmoid(cls_scores) + bbox_preds = torch.cat(bbox_preds, dim=1) + + box_probs, positive_losses, num_pos_list = multi_apply( + self.positive_loss_single, cls_probs, bbox_preds, + concat_anchor_list, batch_gt_instances) + + num_pos = sum(num_pos_list) + positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos) + + # box_prob: P{a_{j} \in A_{+}} + box_probs = torch.stack(box_probs, dim=0) + + # negative_loss: + # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B|| + negative_loss = self.negative_bag_loss(cls_probs, box_probs).sum() / \ + max(1, num_pos * self.pre_anchor_topk) + + # avoid the absence of gradients in regression subnet + # when no ground-truth in a batch + if num_pos == 0: + positive_loss = bbox_preds.sum() * 0 + + losses = { + 'positive_bag_loss': positive_loss, + 'negative_bag_loss': negative_loss + } + return losses + + def positive_loss_single(self, cls_prob: Tensor, bbox_pred: Tensor, + flat_anchors: Tensor, + gt_instances: InstanceData) -> tuple: + """Compute positive loss. + + Args: + cls_prob (Tensor): Classification probability of shape + (num_anchors, num_classes). + bbox_pred (Tensor): Box probability of shape (num_anchors, 4). 
+ flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: + + - box_prob (Tensor): Box probability of shape (num_anchors, 4). + - positive_loss (Tensor): Positive loss of shape (num_pos, ). + - num_pos (int): positive samples indexes. + """ + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + with torch.no_grad(): + if len(gt_bboxes) == 0: + image_box_prob = torch.zeros( + flat_anchors.size(0), + self.cls_out_channels).type_as(bbox_pred) + else: + # box_localization: a_{j}^{loc}, shape: [j, 4] + pred_boxes = self.bbox_coder.decode(flat_anchors, bbox_pred) + + # object_box_iou: IoU_{ij}^{loc}, shape: [i, j] + object_box_iou = bbox_overlaps(gt_bboxes, pred_boxes) + + # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j] + t1 = self.bbox_thr + t2 = object_box_iou.max( + dim=1, keepdim=True).values.clamp(min=t1 + 1e-12) + object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp( + min=0, max=1) + + # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j] + num_obj = gt_labels.size(0) + indices = torch.stack( + [torch.arange(num_obj).type_as(gt_labels), gt_labels], + dim=0) + object_cls_box_prob = torch.sparse_coo_tensor( + indices, object_box_prob) + + # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j] + """ + from "start" to "end" implement: + image_box_iou = torch.sparse.max(object_cls_box_prob, + dim=0).t() + + """ + # start + box_cls_prob = torch.sparse.sum( + object_cls_box_prob, dim=0).to_dense() + + indices = torch.nonzero(box_cls_prob, as_tuple=False).t_() + if indices.numel() == 0: + image_box_prob = torch.zeros( + flat_anchors.size(0), + self.cls_out_channels).type_as(object_box_prob) + else: + nonzero_box_prob = torch.where( + (gt_labels.unsqueeze(dim=-1) == indices[0]), + object_box_prob[:, indices[1]], + torch.tensor( + [0]).type_as(object_box_prob)).max(dim=0).values + + # upmap to shape [j, c] + image_box_prob = torch.sparse_coo_tensor( + indices.flip([0]), + nonzero_box_prob, + size=(flat_anchors.size(0), + self.cls_out_channels)).to_dense() + # end + box_prob = image_box_prob + + # construct bags for objects + match_quality_matrix = bbox_overlaps(gt_bboxes, flat_anchors) + _, matched = torch.topk( + match_quality_matrix, self.pre_anchor_topk, dim=1, sorted=False) + del match_quality_matrix + + # matched_cls_prob: P_{ij}^{cls} + matched_cls_prob = torch.gather( + cls_prob[matched], 2, + gt_labels.view(-1, 1, 1).repeat(1, self.pre_anchor_topk, + 1)).squeeze(2) + + # matched_box_prob: P_{ij}^{loc} + matched_anchors = flat_anchors[matched] + matched_object_targets = self.bbox_coder.encode( + matched_anchors, + gt_bboxes.unsqueeze(dim=1).expand_as(matched_anchors)) + loss_bbox = self.loss_bbox( + bbox_pred[matched], + matched_object_targets, + reduction_override='none').sum(-1) + matched_box_prob = torch.exp(-loss_bbox) + + # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )} + num_pos = len(gt_bboxes) + positive_loss = self.positive_bag_loss(matched_cls_prob, + matched_box_prob) + + return box_prob, positive_loss, num_pos + + def positive_bag_loss(self, matched_cls_prob: Tensor, + matched_box_prob: Tensor) -> Tensor: + """Compute positive bag loss. + + :math:`-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )`. + + :math:`P_{ij}^{cls}`: matched_cls_prob, classification probability of matched samples. 
+ + :math:`P_{ij}^{loc}`: matched_box_prob, box probability of matched samples. + + Args: + matched_cls_prob (Tensor): Classification probability of matched + samples in shape (num_gt, pre_anchor_topk). + matched_box_prob (Tensor): BBox probability of matched samples, + in shape (num_gt, pre_anchor_topk). + + Returns: + Tensor: Positive bag loss in shape (num_gt,). + """ # noqa: E501, W605 + # bag_prob = Mean-max(matched_prob) + matched_prob = matched_cls_prob * matched_box_prob + weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None) + weight /= weight.sum(dim=1).unsqueeze(dim=-1) + bag_prob = (weight * matched_prob).sum(dim=1) + # positive_bag_loss = -self.alpha * log(bag_prob) + return self.alpha * F.binary_cross_entropy( + bag_prob, torch.ones_like(bag_prob), reduction='none') + + def negative_bag_loss(self, cls_prob: Tensor, box_prob: Tensor) -> Tensor: + """Compute negative bag loss. + + :math:`FL((1 - P_{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}))`. + + :math:`P_{a_{j} \in A_{+}}`: Box_probability of matched samples. + + :math:`P_{j}^{bg}`: Classification probability of negative samples. + + Args: + cls_prob (Tensor): Classification probability, in shape + (num_img, num_anchors, num_classes). + box_prob (Tensor): Box probability, in shape + (num_img, num_anchors, num_classes). + + Returns: + Tensor: Negative bag loss in shape (num_img, num_anchors, + num_classes). + """ # noqa: E501, W605 + prob = cls_prob * (1 - box_prob) + # There are some cases when neg_prob = 0. + # This will cause the neg_prob.log() to be inf without clamp. + prob = prob.clamp(min=EPS, max=1 - EPS) + negative_bag_loss = prob**self.gamma * F.binary_cross_entropy( + prob, torch.zeros_like(prob), reduction='none') + return (1 - self.alpha) * negative_bag_loss diff --git a/mmdetection/mmdet/models/dense_heads/fsaf_head.py b/mmdetection/mmdet/models/dense_heads/fsaf_head.py new file mode 100644 index 0000000..0a01c48 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/fsaf_head.py @@ -0,0 +1,458 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple + +import numpy as np +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig +from ..losses.accuracy import accuracy +from ..losses.utils import weight_reduce_loss +from ..task_modules.prior_generators import anchor_inside_flags +from ..utils import images_to_levels, multi_apply, unmap +from .retina_head import RetinaHead + + +@MODELS.register_module() +class FSAFHead(RetinaHead): + """Anchor-free head used in `FSAF `_. + + The head contains two subnetworks. The first classifies anchor boxes and + the second regresses deltas for the anchors (num_anchors is 1 for anchor- + free methods) + + Args: + *args: Same as its base class in :class:`RetinaHead` + score_threshold (float, optional): The score_threshold to calculate + positive recall. If given, prediction scores lower than this value + is counted as incorrect prediction. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. 
+ **kwargs: Same as its base class in :class:`RetinaHead` + + Example: + >>> import torch + >>> self = FSAFHead(11, 7) + >>> x = torch.rand(1, 7, 32, 32) + >>> cls_score, bbox_pred = self.forward_single(x) + >>> # Each anchor predicts a score for each class except background + >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors + >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors + >>> assert cls_per_anchor == self.num_classes + >>> assert box_per_anchor == 4 + """ + + def __init__(self, + *args, + score_threshold: Optional[float] = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + # The positive bias in self.retina_reg conv is to prevent predicted \ + # bbox with 0 area + if init_cfg is None: + init_cfg = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=[ + dict( + type='Normal', + name='retina_cls', + std=0.01, + bias_prob=0.01), + dict( + type='Normal', name='retina_reg', std=0.01, bias=0.25) + ]) + super().__init__(*args, init_cfg=init_cfg, **kwargs) + self.score_threshold = score_threshold + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward feature map of a single scale level. + + Args: + x (Tensor): Feature map of a single scale level. + + Returns: + tuple[Tensor, Tensor]: + + - cls_score (Tensor): Box scores for each scale level Has \ + shape (N, num_points * num_classes, H, W). + - bbox_pred (Tensor): Box energies / deltas for each scale \ + level with shape (N, num_points * 4, H, W). + """ + cls_score, bbox_pred = super().forward_single(x) + # relu: TBLR encoder only accepts positive bbox_pred + return cls_score, self.relu(bbox_pred) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in a + single image. + + Most of the codes are the same with the base class :obj: `AnchorHead`, + except that it also collects and returns the matched gt index in the + image (from 0 to num_gt-1). If the anchor bbox is not matched to any + gt, the corresponding value in pos_gt_inds is -1. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors, ). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. 
Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # Assign gt and sample anchors + anchors = flat_anchors[inside_flags.type(torch.bool), :] + + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros( + (num_valid_anchors, self.cls_out_channels), dtype=torch.float) + pos_gt_inds = anchors.new_full((num_valid_anchors, ), + -1, + dtype=torch.long) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + else: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, both + # the predicted boxes and regression targets should be with + # absolute coordinate format. + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + # The assigned gt_index for each anchor. (0-based) + pos_gt_inds[pos_inds] = sampling_result.pos_assigned_gt_inds + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # shadowed_labels is a tensor composed of tuples + # (anchor_inds, class_label) that indicate those anchors lying in the + # outer region of a gt or overlapped by another gt with a smaller + # area. + # + # Therefore, only the shadowed labels are ignored for loss calculation. + # the key `shadowed_labels` is defined in :obj:`CenterRegionAssigner` + shadowed_labels = assign_result.get_extra_property('shadowed_labels') + if shadowed_labels is not None and shadowed_labels.numel(): + if len(shadowed_labels.shape) == 2: + idx_, label_ = shadowed_labels[:, 0], shadowed_labels[:, 1] + assert (labels[idx_] != label_).all(), \ + 'One label cannot be both positive and ignored' + label_weights[idx_, label_] = 0 + else: + label_weights[shadowed_labels] = 0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + pos_gt_inds = unmap( + pos_gt_inds, num_total_anchors, inside_flags, fill=-1) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result, pos_gt_inds) + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W). 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + for i in range(len(bbox_preds)): # loop over fpn level + # avoid 0 area of the predicted bbox + bbox_preds[i] = bbox_preds[i].clamp(min=1e-4) + # TODO: It may directly use the base-class loss function. + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + batch_size = len(batch_img_metas) + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + return_sampling_results=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results_list, + pos_assigned_gt_inds_list) = cls_reg_targets + + num_gts = np.array(list(map(len, batch_gt_instances))) + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + + # `pos_assigned_gt_inds_list` (length: fpn_levels) stores the assigned + # gt index of each anchor bbox in each fpn level. + cum_num_gts = list(np.cumsum(num_gts)) # length of batch_size + for i, assign in enumerate(pos_assigned_gt_inds_list): + # loop over fpn levels + for j in range(1, batch_size): + # loop over batch size + # Convert gt indices in each img to those in the batch + assign[j][assign[j] >= 0] += int(cum_num_gts[j - 1]) + pos_assigned_gt_inds_list[i] = assign.flatten() + labels_list[i] = labels_list[i].flatten() + num_gts = num_gts.sum() # total number of gt in the batch + # The unique label index of each gt in the batch + label_sequence = torch.arange(num_gts, device=device) + # Collect the average loss of each gt in each level + with torch.no_grad(): + loss_levels, = multi_apply( + self.collect_loss_level_single, + losses_cls, + losses_bbox, + pos_assigned_gt_inds_list, + labels_seq=label_sequence) + # Shape: (fpn_levels, num_gts). Loss of each gt at each fpn level + loss_levels = torch.stack(loss_levels, dim=0) + # Locate the best fpn level for loss back-propagation + if loss_levels.numel() == 0: # zero gt + argmin = loss_levels.new_empty((num_gts, ), dtype=torch.long) + else: + _, argmin = loss_levels.min(dim=0) + + # Reweight the loss of each (anchor, label) pair, so that only those + # at the best gt level are back-propagated. 
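+ # `argmin` holds, for each gt, the pyramid level with the smallest
+ # cls + reg loss; `reweight_loss_single` keeps the losses of anchors
+ # matched to that gt only at this best level and zeroes their
+ # (anchor, label) weights everywhere else. For example, with
+ # argmin = [2, 0] the anchors of gt 0 only contribute at level 2 and
+ # those of gt 1 only at level 0.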
+ losses_cls, losses_bbox, pos_inds = multi_apply( + self.reweight_loss_single, + losses_cls, + losses_bbox, + pos_assigned_gt_inds_list, + labels_list, + list(range(len(losses_cls))), + min_levels=argmin) + num_pos = torch.cat(pos_inds, 0).sum().float() + pos_recall = self.calculate_pos_recall(cls_scores, labels_list, + pos_inds) + + if num_pos == 0: # No gt + num_total_neg = sum( + [results.num_neg for results in sampling_results_list]) + avg_factor = num_pos + num_total_neg + else: + avg_factor = num_pos + for i in range(len(losses_cls)): + losses_cls[i] /= avg_factor + losses_bbox[i] /= avg_factor + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + num_pos=num_pos / batch_size, + pos_recall=pos_recall) + + def calculate_pos_recall(self, cls_scores: List[Tensor], + labels_list: List[Tensor], + pos_inds: List[Tensor]) -> Tensor: + """Calculate positive recall with score threshold. + + Args: + cls_scores (list[Tensor]): Classification scores at all fpn levels. + Each tensor is in shape (N, num_classes * num_anchors, H, W) + labels_list (list[Tensor]): The label that each anchor is assigned + to. Shape (N * H * W * num_anchors, ) + pos_inds (list[Tensor]): List of bool tensors indicating whether + the anchor is assigned to a positive label. + Shape (N * H * W * num_anchors, ) + + Returns: + Tensor: A single float number indicating the positive recall. + """ + with torch.no_grad(): + num_class = self.num_classes + scores = [ + cls.permute(0, 2, 3, 1).reshape(-1, num_class)[pos] + for cls, pos in zip(cls_scores, pos_inds) + ] + labels = [ + label.reshape(-1)[pos] + for label, pos in zip(labels_list, pos_inds) + ] + scores = torch.cat(scores, dim=0) + labels = torch.cat(labels, dim=0) + if self.use_sigmoid_cls: + scores = scores.sigmoid() + else: + scores = scores.softmax(dim=1) + + return accuracy(scores, labels, thresh=self.score_threshold) + + def collect_loss_level_single(self, cls_loss: Tensor, reg_loss: Tensor, + assigned_gt_inds: Tensor, + labels_seq: Tensor) -> Tensor: + """Get the average loss in each FPN level w.r.t. each gt label. + + Args: + cls_loss (Tensor): Classification loss of each feature map pixel, + shape (num_anchor, num_class) + reg_loss (Tensor): Regression loss of each feature map pixel, + shape (num_anchor, 4) + assigned_gt_inds (Tensor): It indicates which gt the prior is + assigned to (0-based, -1: no assignment). shape (num_anchor), + labels_seq: The rank of labels. shape (num_gt) + + Returns: + Tensor: shape (num_gt), average loss of each gt in this level + """ + if len(reg_loss.shape) == 2: # iou loss has shape (num_prior, 4) + reg_loss = reg_loss.sum(dim=-1) # sum loss in tblr dims + if len(cls_loss.shape) == 2: + cls_loss = cls_loss.sum(dim=-1) # sum loss in class dims + loss = cls_loss + reg_loss + assert loss.size(0) == assigned_gt_inds.size(0) + # Default loss value is 1e6 for a layer where no anchor is positive + # to ensure it will not be chosen to back-propagate gradient + losses_ = loss.new_full(labels_seq.shape, 1e6) + for i, l in enumerate(labels_seq): + match = assigned_gt_inds == l + if match.any(): + losses_[i] = loss[match].mean() + return losses_, + + def reweight_loss_single(self, cls_loss: Tensor, reg_loss: Tensor, + assigned_gt_inds: Tensor, labels: Tensor, + level: int, min_levels: Tensor) -> tuple: + """Reweight loss values at each level. + + Reassign loss values at each level by masking those where the + pre-calculated loss is too large. Then return the reduced losses. 
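+ "Too large" means that the assigned gt reaches its minimum loss at a
+ different pyramid level (see ``min_levels``); such anchors are turned
+ into negatives at the current level and their weights are zeroed.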
+ + Args: + cls_loss (Tensor): Element-wise classification loss. + Shape: (num_anchors, num_classes) + reg_loss (Tensor): Element-wise regression loss. + Shape: (num_anchors, 4) + assigned_gt_inds (Tensor): The gt indices that each anchor bbox + is assigned to. -1 denotes a negative anchor, otherwise it is the + gt index (0-based). Shape: (num_anchors, ), + labels (Tensor): Label assigned to anchors. Shape: (num_anchors, ). + level (int): The current level index in the pyramid + (0-4 for RetinaNet) + min_levels (Tensor): The best-matching level for each gt. + Shape: (num_gts, ), + + Returns: + tuple: + + - cls_loss: Reduced corrected classification loss. Scalar. + - reg_loss: Reduced corrected regression loss. Scalar. + - pos_flags (Tensor): Corrected bool tensor indicating the \ + final positive anchors. Shape: (num_anchors, ). + """ + loc_weight = torch.ones_like(reg_loss) + cls_weight = torch.ones_like(cls_loss) + pos_flags = assigned_gt_inds >= 0 # positive pixel flag + pos_indices = torch.nonzero(pos_flags, as_tuple=False).flatten() + + if pos_flags.any(): # pos pixels exist + pos_assigned_gt_inds = assigned_gt_inds[pos_flags] + zeroing_indices = (min_levels[pos_assigned_gt_inds] != level) + neg_indices = pos_indices[zeroing_indices] + + if neg_indices.numel(): + pos_flags[neg_indices] = 0 + loc_weight[neg_indices] = 0 + # Only the weight corresponding to the label is + # zeroed out if not selected + zeroing_labels = labels[neg_indices] + assert (zeroing_labels >= 0).all() + cls_weight[neg_indices, zeroing_labels] = 0 + + # Weighted loss for both cls and reg loss + cls_loss = weight_reduce_loss(cls_loss, cls_weight, reduction='sum') + reg_loss = weight_reduce_loss(reg_loss, loc_weight, reduction='sum') + + return cls_loss, reg_loss, pos_flags diff --git a/mmdetection/mmdet/models/dense_heads/ga_retina_head.py b/mmdetection/mmdet/models/dense_heads/ga_retina_head.py new file mode 100644 index 0000000..569910b --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/ga_retina_head.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
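+# GARetinaHead first predicts anchor locations (``conv_loc``) and shapes
+# (``conv_shape``), adapts the cls / reg features with deformable
+# convolutions (``FeatureAdaption``), and at test time runs the masked
+# classification / regression convs only where the predicted location
+# score exceeds ``loc_filter_thr``.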
+from typing import Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import MaskedConv2d +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig +from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead + + +@MODELS.register_module() +class GARetinaHead(GuidedAnchorHead): + """Guided-Anchor-based RetinaNet head.""" + + def __init__(self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + if init_cfg is None: + init_cfg = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=[ + dict( + type='Normal', + name='conv_loc', + std=0.01, + bias_prob=0.01), + dict( + type='Normal', + name='retina_cls', + std=0.01, + bias_prob=0.01) + ]) + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + + self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1) + num_anchors = self.square_anchor_generator.num_base_priors[0] + self.conv_shape = nn.Conv2d(self.feat_channels, num_anchors * 2, 1) + self.feature_adaption_cls = FeatureAdaption( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.feature_adaption_reg = FeatureAdaption( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.retina_cls = MaskedConv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.retina_reg = MaskedConv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor]: + """Forward feature map of a single scale level.""" + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + + loc_pred = self.conv_loc(cls_feat) + shape_pred = self.conv_shape(reg_feat) + + cls_feat = self.feature_adaption_cls(cls_feat, shape_pred) + reg_feat = self.feature_adaption_reg(reg_feat, shape_pred) + + if not self.training: + mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr + else: + mask = None + cls_score = self.retina_cls(cls_feat, mask) + bbox_pred = self.retina_reg(reg_feat, mask) + return cls_score, bbox_pred, shape_pred, loc_pred diff --git a/mmdetection/mmdet/models/dense_heads/ga_rpn_head.py b/mmdetection/mmdet/models/dense_heads/ga_rpn_head.py new file mode 100644 index 0000000..9614463 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/ga_rpn_head.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
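+# GARPNHead turns the guided-anchor predictions into RPN proposals: per
+# level it keeps the top ``nms_pre`` scoring, mask-filtered locations,
+# decodes and NMS-es them, then either runs NMS across levels or keeps
+# the ``max_per_img`` highest-scoring proposals.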
+import copy +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.ops import nms +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList +from .guided_anchor_head import GuidedAnchorHead + + +@MODELS.register_module() +class GARPNHead(GuidedAnchorHead): + """Guided-Anchor-based RPN head.""" + + def __init__(self, + in_channels: int, + num_classes: int = 1, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='conv_loc', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.rpn_conv = nn.Conv2d( + self.in_channels, self.feat_channels, 3, padding=1) + super(GARPNHead, self)._init_layers() + + def forward_single(self, x: Tensor) -> Tuple[Tensor]: + """Forward feature of a single scale level.""" + + x = self.rpn_conv(x) + x = F.relu(x, inplace=True) + (cls_score, bbox_pred, shape_pred, + loc_pred) = super().forward_single(x) + return cls_score, bbox_pred, shape_pred, loc_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + shape_preds: List[Tensor], + loc_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + shape_preds (list[Tensor]): shape predictions for each scale + level with shape (N, 1, H, W). + loc_preds (list[Tensor]): location predictions for each scale + level with shape (N, num_anchors * 2, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + losses = super().loss_by_feat( + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + return dict( + loss_rpn_cls=losses['loss_cls'], + loss_rpn_bbox=losses['loss_bbox'], + loss_anchor_shape=losses['loss_shape'], + loss_anchor_loc=losses['loss_loc']) + + def _predict_by_feat_single(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + mlvl_anchors: List[Tensor], + mlvl_masks: List[Tensor], + img_meta: dict, + cfg: ConfigType, + rescale: bool = False) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_scores (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). 
+ bbox_preds (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + mlvl_anchors (list[Tensor]): Each element in the list is + the anchors of a single level in feature pyramid. it has + shape (num_priors, 4). + mlvl_masks (list[Tensor]): Each element in the list is location + masks of a single level. + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict` or dict): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last + dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + assert cfg.nms.get('type', 'nms') == 'nms', 'GARPNHead only support ' \ + 'naive nms.' + + mlvl_proposals = [] + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + anchors = mlvl_anchors[idx] + mask = mlvl_masks[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = rpn_cls_score.softmax(dim=1)[:, :-1] + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, + 4)[mask, :] + if scores.dim() == 0: + rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0) + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. scores + if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: + _, topk_inds = scores.topk(cfg.nms_pre) + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + scores = scores[topk_inds] + # get proposals w.r.t. 
anchors and rpn_bbox_pred + proposals = self.bbox_coder.decode( + anchors, rpn_bbox_pred, max_shape=img_meta['img_shape']) + # filter out too small bboxes + if cfg.min_bbox_size >= 0: + w = proposals[:, 2] - proposals[:, 0] + h = proposals[:, 3] - proposals[:, 1] + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + proposals = proposals[valid_mask] + scores = scores[valid_mask] + + # NMS in current level + proposals, _ = nms(proposals, scores, cfg.nms.iou_threshold) + proposals = proposals[:cfg.nms_post, :] + mlvl_proposals.append(proposals) + proposals = torch.cat(mlvl_proposals, 0) + if cfg.get('nms_across_levels', False): + # NMS across multi levels + proposals, _ = nms(proposals[:, :4], proposals[:, -1], + cfg.nms.iou_threshold) + proposals = proposals[:cfg.max_per_img, :] + else: + scores = proposals[:, 4] + num = min(cfg.max_per_img, proposals.shape[0]) + _, topk_inds = scores.topk(num) + proposals = proposals[topk_inds, :] + + bboxes = proposals[:, :-1] + scores = proposals[:, -1] + if rescale: + assert img_meta.get('scale_factor') is not None + bboxes /= bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + + results = InstanceData() + results.bboxes = bboxes + results.scores = scores + results.labels = scores.new_zeros(scores.size(0), dtype=torch.long) + return results diff --git a/mmdetection/mmdet/models/dense_heads/gfl_head.py b/mmdetection/mmdet/models/dense_heads/gfl_head.py new file mode 100644 index 0000000..be43d9b --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/gfl_head.py @@ -0,0 +1,667 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import anchor_inside_flags +from ..task_modules.samplers import PseudoSampler +from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply, + unmap) +from .anchor_head import AnchorHead + + +class Integral(nn.Module): + """A fixed layer for calculating integral result from distribution. + + This layer calculates the target location by :math: ``sum{P(y_i) * y_i}``, + P(y_i) denotes the softmax vector that represents the discrete distribution + y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max} + + Args: + reg_max (int): The maximal value of the discrete set. Defaults to 16. + You may want to reset it according to your new dataset or related + settings. + """ + + def __init__(self, reg_max: int = 16) -> None: + super().__init__() + self.reg_max = reg_max + self.register_buffer('project', + torch.linspace(0, self.reg_max, self.reg_max + 1)) + + def forward(self, x: Tensor) -> Tensor: + """Forward feature from the regression head to get integral result of + bounding box location. + + Args: + x (Tensor): Features of the regression head, shape (N, 4*(n+1)), + n is self.reg_max. + + Returns: + x (Tensor): Integral result of box locations, i.e., distance + offsets from the box center in four directions, shape (N, 4). 
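+
+ Example:
+ >>> # A minimal sketch of the expected behaviour: uniform logits
+ >>> # integrate to the midpoint of the discrete set {0, ..., 16}.
+ >>> layer = Integral(reg_max=16)
+ >>> layer(torch.zeros(1, 4 * 17))
+ tensor([[8., 8., 8., 8.]])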
+ """ + x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1) + x = F.linear(x, self.project.type_as(x)).reshape(-1, 4) + return x + + +@MODELS.register_module() +class GFLHead(AnchorHead): + """Generalized Focal Loss: Learning Qualified and Distributed Bounding + Boxes for Dense Object Detection. + + GFL head structure is similar with ATSS, however GFL uses + 1) joint representation for classification and localization quality, and + 2) flexible General distribution for bounding box locations, + which are supervised by + Quality Focal Loss (QFL) and Distribution Focal Loss (DFL), respectively + + https://arxiv.org/abs/2006.04388 + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Defaults to 4. + conv_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct + and config conv layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. Default: dict(type='GN', num_groups=32, + requires_grad=True). + loss_qfl (:obj:`ConfigDict` or dict): Config of Quality Focal Loss + (QFL). + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. Defaults + to 'DistancePointBBoxCoder'. + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max}`` + in QFL setting. Defaults to 16. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + Example: + >>> self = GFLHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_quality_score, bbox_pred = self.forward(feats) + >>> assert len(cls_quality_score) == len(self.scales) + """ + + def __init__(self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + loss_dfl: ConfigType = dict( + type='DistributionFocalLoss', loss_weight=0.25), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + reg_max: int = 16, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='gfl_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.reg_max = reg_max + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + bbox_coder=bbox_coder, + init_cfg=init_cfg, + **kwargs) + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self.integral = Integral(self.reg_max) + self.loss_dfl = MODELS.build(loss_dfl) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU() + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + assert self.num_anchors == 1, 'anchor 
free version' + self.gfl_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.gfl_reg = nn.Conv2d( + self.feat_channels, 4 * (self.reg_max + 1), 3, padding=1) + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + + - cls_scores (list[Tensor]): Classification and quality (IoU) + joint scores for all scale levels, each is a 4D-tensor, + the channel number is num_classes. + - bbox_preds (list[Tensor]): Box distribution logits for all + scale levels, each is a 4D-tensor, the channel number is + 4*(n+1), n is max value of integral set. + """ + return multi_apply(self.forward_single, x, self.scales) + + def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + + - cls_score (Tensor): Cls and quality joint scores for a single + scale level the channel number is num_classes. + - bbox_pred (Tensor): Box distribution logits for a single scale + level, the channel number is 4*(n+1), n is max value of + integral set. + """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.gfl_cls(cls_feat) + bbox_pred = scale(self.gfl_reg(reg_feat)).float() + return cls_score, bbox_pred + + def anchor_center(self, anchors: Tensor) -> Tensor: + """Get anchor centers from anchors. + + Args: + anchors (Tensor): Anchor list with shape (N, 4), ``xyxy`` format. + + Returns: + Tensor: Anchor centers with shape (N, 2), ``xy`` format. + """ + anchors_cx = (anchors[..., 2] + anchors[..., 0]) / 2 + anchors_cy = (anchors[..., 3] + anchors[..., 1]) / 2 + return torch.stack([anchors_cx, anchors_cy], dim=-1) + + def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + stride: Tuple[int], avg_factor: int) -> dict: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + cls_score (Tensor): Cls and quality joint scores for each scale + level has shape (N, num_classes, H, W). + bbox_pred (Tensor): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + stride (Tuple[int]): Stride in this scale level. + avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
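+
+ Note that the classification (QFL) target is the tuple
+ ``(labels, score)``, where ``score`` is the IoU between the decoded
+ prediction and its target box, so positive samples are supervised
+ with a soft quality label rather than a hard one.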
+ """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, 4 * (self.reg_max + 1)) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + score = label_weights.new_zeros(labels.shape) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0] + + weight_targets = cls_score.detach().sigmoid() + weight_targets = weight_targets.max(dim=1)[0][pos_inds] + pos_bbox_pred_corners = self.integral(pos_bbox_pred) + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchor_centers, pos_bbox_pred_corners) + pos_decode_bbox_targets = pos_bbox_targets / stride[0] + score[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1) + target_corners = self.bbox_coder.encode(pos_anchor_centers, + pos_decode_bbox_targets, + self.reg_max).reshape(-1) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=weight_targets, + avg_factor=1.0) + + # dfl loss + loss_dfl = self.loss_dfl( + pred_corners, + target_corners, + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0) + else: + loss_bbox = bbox_pred.sum() * 0 + loss_dfl = bbox_pred.sum() * 0 + weight_targets = bbox_pred.new_tensor(0) + + # cls (qfl) loss + loss_cls = self.loss_cls( + cls_score, (labels, score), + weight=label_weights, + avg_factor=avg_factor) + + return loss_cls, loss_bbox, loss_dfl, weight_targets.sum() + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Cls and quality scores for each scale + level has shape (N, num_classes, H, W). + bbox_preds (list[Tensor]): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + losses_cls, losses_bbox, losses_dfl,\ + avg_factor = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + self.prior_generator.strides, + avg_factor=avg_factor) + + avg_factor = sum(avg_factor) + avg_factor = reduce_mean(avg_factor).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / avg_factor, losses_bbox)) + losses_dfl = list(map(lambda x: x / avg_factor, losses_dfl)) + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dfl=losses_dfl) + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image. GFL head does not need this value. + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (:obj: `ConfigDict`): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape + [num_bboxes, 5], where the first 4 columns are bounding + box positions (tl_x, tl_y, br_x, br_y) and the 5-th + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding + box with shape [num_bboxes]. 
+ """ + cfg = self.test_cfg if cfg is None else cfg + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + for level_idx, (cls_score, bbox_pred, stride, priors) in enumerate( + zip(cls_score_list, bbox_pred_list, + self.prior_generator.strides, mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert stride[0] == stride[1] + + bbox_pred = bbox_pred.permute(1, 2, 0) + bbox_pred = self.integral(bbox_pred) * stride[0] + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, _, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + + bboxes = self.bbox_coder.decode( + self.anchor_center(priors), bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def get_targets(self, + anchor_list: List[Tensor], + valid_flag_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs=True) -> tuple: + """Get targets for GFL head. + + This method is almost the same as `AnchorHead.get_targets()`. Besides + returning the targets as the parent method does, it also returns the + anchors as the first element of the returned tuple. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. 
multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + num_level_anchors: List[int], + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors (list[int]): Number of anchors of each scale + level. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: N is the number of total anchors in the image. + + - anchors (Tensor): All anchors in the image with shape (N, 4). + - labels (Tensor): Labels of all anchors in the image with + shape (N,). + - label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + - bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + - bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4). + - pos_inds (Tensor): Indices of positive anchor with shape + (num_pos,). + - neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. 
Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign( + pred_instances=pred_instances, + num_level_priors=num_level_anchors_inside, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds, sampling_result) + + def get_num_level_anchors_inside(self, num_level_anchors: List[int], + inside_flags: Tensor) -> List[int]: + """Get the number of valid anchors in every level.""" + + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside diff --git a/mmdetection/mmdet/models/dense_heads/grounding_dino_head.py b/mmdetection/mmdet/models/dense_heads/grounding_dino_head.py new file mode 100644 index 0000000..3aced62 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/grounding_dino_head.py @@ -0,0 +1,767 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import Linear +from mmengine.model import constant_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.losses import QualityFocalLoss +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from mmdet.utils import InstanceList, reduce_mean +from ..layers import inverse_sigmoid +from .atss_vlfusion_head import convert_grounding_to_cls_scores +from .dino_head import DINOHead + + +class ContrastiveEmbed(nn.Module): + """text visual ContrastiveEmbed layer. + + Args: + max_text_len (int, optional): Maximum length of text. 
+ log_scale (Optional[Union[str, float]]): The initial value of a + learnable parameter to multiply with the similarity + matrix to normalize the output. Defaults to 0.0. + - If set to 'auto', the similarity matrix will be normalized by + a fixed value ``sqrt(d_c)`` where ``d_c`` is the channel number. + - If set to 'none' or ``None``, there is no normalization applied. + - If set to a float number, the similarity matrix will be multiplied + by ``exp(log_scale)``, where ``log_scale`` is learnable. + bias (bool, optional): Whether to add bias to the output. + If set to ``True``, a learnable bias that is initialized as -4.6 + will be added to the output. Useful when training from scratch. + Defaults to False. + """ + + def __init__(self, + max_text_len: int = 256, + log_scale: Optional[Union[str, float]] = None, + bias: bool = False): + super().__init__() + self.max_text_len = max_text_len + self.log_scale = log_scale + if isinstance(log_scale, float): + self.log_scale = nn.Parameter( + torch.Tensor([float(log_scale)]), requires_grad=True) + elif log_scale not in ['auto', 'none', None]: + raise ValueError(f'log_scale should be one of ' + f'"auto", "none", None, but got {log_scale}') + + self.bias = None + if bias: + bias_value = -math.log((1 - 0.01) / 0.01) + self.bias = nn.Parameter( + torch.Tensor([bias_value]), requires_grad=True) + + def forward(self, visual_feat: Tensor, text_feat: Tensor, + text_token_mask: Tensor) -> Tensor: + """Forward function. + + Args: + visual_feat (Tensor): Visual features. + text_feat (Tensor): Text features. + text_token_mask (Tensor): A mask used for text feats. + + Returns: + Tensor: Classification score. + """ + res = visual_feat @ text_feat.transpose(-1, -2) + if isinstance(self.log_scale, nn.Parameter): + res = res * self.log_scale.exp() + elif self.log_scale == 'auto': + # NOTE: similar to the normalizer in self-attention + res = res / math.sqrt(visual_feat.shape[-1]) + if self.bias is not None: + res = res + self.bias + res.masked_fill_(~text_token_mask[:, None, :], float('-inf')) + + new_res = torch.full((*res.shape[:-1], self.max_text_len), + float('-inf'), + device=res.device) + new_res[..., :res.shape[-1]] = res + + return new_res + + +@MODELS.register_module() +class GroundingDINOHead(DINOHead): + """Head of the Grounding DINO: Marrying DINO with Grounded Pre-Training for + Open-Set Object Detection. + + Args: + contrastive_cfg (dict, optional): Contrastive config that contains + keys like ``max_text_len``. Defaults to dict(max_text_len=256). + """ + + def __init__(self, contrastive_cfg=dict(max_text_len=256), **kwargs): + self.contrastive_cfg = contrastive_cfg + self.max_text_len = contrastive_cfg.get('max_text_len', 256) + super().__init__(**kwargs) + + def _init_layers(self) -> None: + """Initialize classification branch and regression branch of head.""" + fc_cls = ContrastiveEmbed(**self.contrastive_cfg) + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, 4)) + reg_branch = nn.Sequential(*reg_branch) + + # NOTE: due to the fc_cls is a contrastive embedding and don't + # have any trainable parameters,we do not need to copy it. 
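+ # When `share_pred_layer` is True the same branch instances are reused
+ # for every decoder layer; otherwise each layer receives independent
+ # deep copies of both branches.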
+ if self.share_pred_layer: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(self.num_pred_layer)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(self.num_pred_layer)]) + else: + self.cls_branches = nn.ModuleList( + [copy.deepcopy(fc_cls) for _ in range(self.num_pred_layer)]) + self.reg_branches = nn.ModuleList([ + copy.deepcopy(reg_branch) for _ in range(self.num_pred_layer) + ]) + + def init_weights(self) -> None: + """Initialize weights of the Deformable DETR head.""" + for m in self.reg_branches: + constant_init(m[-1], 0, bias=0) + nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0) + if self.as_two_stage: + for m in self.reg_branches: + nn.init.constant_(m[-1].bias.data[2:], 0.0) + + def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> tuple: + """Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_queries, 4]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + num_bboxes = bbox_pred.size(0) + # convert bbox_pred from xywh, normalized to xyxy, unnormalized + bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + bbox_pred = bbox_pred * factor + + pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred) + # assigner and sampler + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta) + gt_bboxes = gt_instances.bboxes + + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] + + # Major changes. The labels are 0-1 binary labels for each bbox + # and text tokens. + labels = gt_bboxes.new_full((num_bboxes, self.max_text_len), + 0, + dtype=torch.float32) + labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights[pos_inds] = 1.0 + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. 
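+ # e.g. in an 800x600 image a gt box (150, 125, 250, 175) in x1y1x2y2
+ # becomes the normalized cxcywh target (0.25, 0.25, 0.125, ~0.083).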
+ pos_gt_bboxes_normalized = pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + def forward( + self, + hidden_states: Tensor, + references: List[Tensor], + memory_text: Tensor, + text_token_mask: Tensor, + ) -> Tuple[Tensor]: + """Forward function. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries, dim). + references (List[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + memory_text (Tensor): Memory text. It has shape (bs, len_text, + text_embed_dims). + text_token_mask (Tensor): Text token mask. It has shape (bs, + len_text). + + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + """ + all_layers_outputs_classes = [] + all_layers_outputs_coords = [] + + for layer_id in range(hidden_states.shape[0]): + reference = inverse_sigmoid(references[layer_id]) + # NOTE The last reference will not be used. + hidden_state = hidden_states[layer_id] + outputs_class = self.cls_branches[layer_id](hidden_state, + memory_text, + text_token_mask) + tmp_reg_preds = self.reg_branches[layer_id](hidden_state) + if reference.shape[-1] == 4: + # When `layer` is 0 and `as_two_stage` of the detector + # is `True`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `True`. + tmp_reg_preds += reference + else: + # When `layer` is 0 and `as_two_stage` of the detector + # is `False`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `False`. + assert reference.shape[-1] == 2 + tmp_reg_preds[..., :2] += reference + outputs_coord = tmp_reg_preds.sigmoid() + all_layers_outputs_classes.append(outputs_class) + all_layers_outputs_coords.append(outputs_coord) + + all_layers_outputs_classes = torch.stack(all_layers_outputs_classes) + all_layers_outputs_coords = torch.stack(all_layers_outputs_coords) + + return all_layers_outputs_classes, all_layers_outputs_coords + + def predict(self, + hidden_states: Tensor, + references: List[Tensor], + memory_text: Tensor, + text_token_mask: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, num_queries, bs, dim). + references (List[Tensor]): List of the reference from the decoder. 
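Editor's note: the per-layer refinement in `forward()` below adds the regression output to the inverse-sigmoid of the previous reference box and squashes the sum back to [0, 1]. A self-contained toy of that single step (random stand-ins for the hidden state and regression branch):

import torch

def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=eps, max=1 - eps)
    return torch.log(x / (1 - x))

reference = torch.rand(2, 900, 4)        # previous layer's boxes in [0, 1]
delta = torch.randn(2, 900, 4) * 0.1     # stand-in for reg_branch(hidden_state)
refined = (delta + inverse_sigmoid(reference)).sigmoid()
print(refined.min().item() >= 0, refined.max().item() <= 1)  # True True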
+ The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + memory_text (Tensor): Memory text. It has shape (bs, len_text, + text_embed_dims). + text_token_mask (Tensor): Text token mask. It has shape (bs, + len_text). + batch_data_samples (SampleList): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): If `True`, return boxes in original + image space. Defaults to `True`. + + Returns: + InstanceList: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + batch_token_positive_maps = [ + data_samples.token_positive_map + for data_samples in batch_data_samples + ] + + outs = self(hidden_states, references, memory_text, text_token_mask) + + predictions = self.predict_by_feat( + *outs, + batch_img_metas=batch_img_metas, + batch_token_positive_maps=batch_token_positive_maps, + rescale=rescale) + return predictions + + def predict_by_feat(self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + batch_img_metas: List[Dict], + batch_token_positive_maps: Optional[List[dict]] = None, + rescale: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, num_queries, + cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries, + 4) with the last dimension arranged as (cx, cy, w, h). + batch_img_metas (List[Dict]): _description_ + batch_token_positive_maps (list[dict], Optional): Batch token + positive map. Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
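Editor's note: the `token_positive_maps` consumed by `predict()` above map each prompt-defined class id to the token positions of its phrase; `convert_grounding_to_cls_scores` then pools per-token grounding scores into ordinary per-class scores. The helper below is only a rough approximation of that pooling (averaging over each class's token positions); the names and values are made up for illustration.

import torch

def pool_grounding_scores(token_scores, token_positive_map):
    """Average per-token scores over each class's token positions (sketch)."""
    num_queries = token_scores.size(0)
    num_classes = len(token_positive_map)
    cls_scores = token_scores.new_zeros(num_queries, num_classes)
    for cls_idx, (_, token_ids) in enumerate(sorted(token_positive_map.items())):
        cls_scores[:, cls_idx] = token_scores[:, token_ids].mean(dim=-1)
    return cls_scores

scores = torch.rand(900, 256)               # sigmoid'ed grounding logits
token_positive_map = {1: [1, 2], 2: [4]}    # e.g. class 1 -> tokens 1-2, class 2 -> token 4
print(pool_grounding_scores(scores, token_positive_map).shape)  # (900, 2)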
+ """ + cls_scores = all_layers_cls_scores[-1] + bbox_preds = all_layers_bbox_preds[-1] + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_meta = batch_img_metas[img_id] + token_positive_maps = batch_token_positive_maps[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + token_positive_maps, + img_meta, rescale) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score: Tensor, + bbox_pred: Tensor, + token_positive_maps: dict, + img_meta: dict, + rescale: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_queries, 4]. + token_positive_maps (dict): Token positive map. + img_meta (dict): Image meta info. + rescale (bool, optional): If True, return boxes in original image + space. Default True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_score) == len(bbox_pred) # num_queries + max_per_img = self.test_cfg.get('max_per_img', len(cls_score)) + img_shape = img_meta['img_shape'] + + cls_score = convert_grounding_to_cls_scores( + logits=cls_score.sigmoid()[None], + positive_maps=[token_positive_maps])[0] + scores, indexes = cls_score.view(-1).topk(max_per_img) + num_classes = cls_score.shape[-1] + det_labels = indexes % num_classes + bbox_index = indexes // num_classes + bbox_pred = bbox_pred[bbox_index] + + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + assert img_meta.get('scale_factor') is not None + det_bboxes /= det_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + results = InstanceData() + results.bboxes = det_bboxes + results.scores = scores + results.labels = det_labels + return results + + def loss(self, hidden_states: Tensor, references: List[Tensor], + memory_text: Tensor, text_token_mask: Tensor, + enc_outputs_class: Tensor, enc_outputs_coord: Tensor, + batch_data_samples: SampleList, dn_meta: Dict[str, int]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries_total, + dim), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries` when + `self.training` is `True`, else `num_matching_queries`. + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). 
The `init_reference` has shape (bs, + num_queries_total, 4) and each `inter_reference` has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + memory_text (Tensor): Memory text. It has shape (bs, len_text, + text_embed_dims). + enc_outputs_class (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + enc_outputs_coord (Tensor): The proposal generate from the + encode feature map, has shape (bs, num_feat_points, 4) with the + last dimension arranged as (cx, cy, w, h). + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references, memory_text, text_token_mask) + self.text_masks = text_token_mask + loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, + batch_gt_instances, batch_img_metas, dn_meta) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer of a single + feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape (bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. + """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + with torch.no_grad(): + cls_reg_targets = self.get_targets(cls_scores_list, + bbox_preds_list, + batch_gt_instances, + batch_img_metas) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.stack(labels_list, 0) + label_weights = torch.stack(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # ===== this change ===== + # Loss is not computed for the padded regions of the text. 
+ assert (self.text_masks.dim() == 2) + text_masks = self.text_masks.new_zeros( + (self.text_masks.size(0), self.max_text_len)) + text_masks[:, :self.text_masks.size(1)] = self.text_masks + text_mask = (text_masks > 0).unsqueeze(1) + text_mask = text_mask.repeat(1, cls_scores.size(1), 1) + cls_scores = torch.masked_select(cls_scores, text_mask).contiguous() + + labels = torch.masked_select(labels, text_mask) + label_weights = label_weights[..., + None].repeat(1, 1, text_mask.size(-1)) + label_weights = torch.masked_select(label_weights, text_mask) + + # classification loss + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if isinstance(self.loss_cls, QualityFocalLoss): + raise NotImplementedError( + 'QualityFocalLoss for GroundingDINOHead is not supported yet.') + else: + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds): + img_h, img_w, = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int]) -> Tuple[Tensor]: + """Denoising loss for outputs from a single decoder layer. + + Args: + dn_cls_scores (Tensor): Classification scores of a single decoder + layer in denoising part, has shape (bs, num_denoising_queries, + cls_out_channels). + dn_bbox_preds (Tensor): Regression outputs of a single decoder + layer in denoising part. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and has shape + (bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
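Editor's note: the "this change" block in `loss_by_feat_single` above pads the token mask to `max_text_len`, broadcasts it over the queries, and drops the padded positions from both scores and targets before the classification loss. A toy run of that masking with tiny shapes:

import torch

bs, num_queries, max_text_len, len_text = 2, 5, 8, 3
cls_scores = torch.randn(bs, num_queries, max_text_len)
labels = torch.zeros(bs, num_queries, max_text_len)
text_token_mask = torch.ones(bs, len_text, dtype=torch.bool)  # real tokens only

text_masks = text_token_mask.new_zeros(bs, max_text_len)
text_masks[:, :len_text] = text_token_mask
text_mask = (text_masks > 0).unsqueeze(1).repeat(1, num_queries, 1)

# Only the first len_text token positions survive; padded logits never
# reach the focal/BCE loss.
kept_scores = torch.masked_select(cls_scores, text_mask)
kept_labels = torch.masked_select(labels, text_mask)
print(kept_scores.numel())  # bs * num_queries * len_text = 30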
+ """ + cls_reg_targets = self.get_dn_targets(batch_gt_instances, + batch_img_metas, dn_meta) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.stack(labels_list, 0) + label_weights = torch.stack(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + # ===== this change ===== + # Loss is not computed for the padded regions of the text. + assert (self.text_masks.dim() == 2) + text_masks = self.text_masks.new_zeros( + (self.text_masks.size(0), self.max_text_len)) + text_masks[:, :self.text_masks.size(1)] = self.text_masks + text_mask = (text_masks > 0).unsqueeze(1) + text_mask = text_mask.repeat(1, dn_cls_scores.size(1), 1) + cls_scores = torch.masked_select(dn_cls_scores, text_mask).contiguous() + labels = torch.masked_select(labels, text_mask) + label_weights = label_weights[..., + None].repeat(1, 1, text_mask.size(-1)) + label_weights = torch.masked_select(label_weights, text_mask) + # ======================= + + # classification loss + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = \ + num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if len(cls_scores) > 0: + if isinstance(self.loss_cls, QualityFocalLoss): + raise NotImplementedError('QualityFocalLoss is not supported') + else: + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) + else: + loss_cls = torch.zeros( + 1, dtype=cls_scores.dtype, device=cls_scores.device) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds): + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = dn_bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def _get_dn_targets_single(self, gt_instances: InstanceData, + img_meta: dict, dn_meta: Dict[str, + int]) -> tuple: + """Get targets in denoising part for one image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. 
It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_groups = dn_meta['num_denoising_groups'] + num_denoising_queries = dn_meta['num_denoising_queries'] + num_queries_each_group = int(num_denoising_queries / num_groups) + device = gt_bboxes.device + + if len(gt_labels) > 0: + t = torch.arange(len(gt_labels), dtype=torch.long, device=device) + t = t.unsqueeze(0).repeat(num_groups, 1) + pos_assigned_gt_inds = t.flatten() + pos_inds = torch.arange( + num_groups, dtype=torch.long, device=device) + pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t + pos_inds = pos_inds.flatten() + else: + pos_inds = pos_assigned_gt_inds = \ + gt_bboxes.new_tensor([], dtype=torch.long) + + neg_inds = pos_inds + num_queries_each_group // 2 + # label targets + # this change + labels = gt_bboxes.new_full((num_denoising_queries, self.max_text_len), + 0, + dtype=torch.float32) + labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_denoising_queries) + + # bbox targets + bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights[pos_inds] = 1.0 + img_h, img_w = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + gt_bboxes_normalized = gt_bboxes / factor + gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) + bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) diff --git a/mmdetection/mmdet/models/dense_heads/guided_anchor_head.py b/mmdetection/mmdet/models/dense_heads/guided_anchor_head.py new file mode 100644 index 0000000..59f6dd3 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/guided_anchor_head.py @@ -0,0 +1,994 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv.ops import DeformConv2d, MaskedConv2d +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList) +from ..layers import multiclass_nms +from ..task_modules.prior_generators import anchor_inside_flags, calc_region +from ..task_modules.samplers import PseudoSampler +from ..utils import images_to_levels, multi_apply, unmap +from .anchor_head import AnchorHead + + +class FeatureAdaption(BaseModule): + """Feature Adaption Module. + + Feature Adaption Module is implemented based on DCN v1. + It uses anchor shape prediction rather than feature map to + predict offsets of deform conv layer. 
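Editor's note: the index arithmetic in `_get_dn_targets_single` above places one positive copy of every GT in each denoising group, with the second half of each group reserved for negatives (assuming the usual DINO layout of 2 * num_gts queries per group). With hypothetical numbers:

import torch

num_gts, num_groups = 3, 2
num_queries_each_group = 2 * num_gts      # positives + negatives per group
t = torch.arange(num_gts).unsqueeze(0).repeat(num_groups, 1)
pos_assigned_gt_inds = t.flatten()        # tensor([0, 1, 2, 0, 1, 2])
pos_inds = (torch.arange(num_groups).unsqueeze(1) * num_queries_each_group + t).flatten()
neg_inds = pos_inds + num_queries_each_group // 2
print(pos_inds.tolist())   # [0, 1, 2, 6, 7, 8]
print(neg_inds.tolist())   # [3, 4, 5, 9, 10, 11]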
+ + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels in the output feature map. + kernel_size (int): Deformable conv kernel size. Defaults to 3. + deform_groups (int): Deformable conv group size. Defaults to 4. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict], optional): Initialization config dict. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + deform_groups: int = 4, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.1, + override=dict(type='Normal', name='conv_adaption', std=0.01)) + ) -> None: + super().__init__(init_cfg=init_cfg) + offset_channels = kernel_size * kernel_size * 2 + self.conv_offset = nn.Conv2d( + 2, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + deform_groups=deform_groups) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor, shape: Tensor) -> Tensor: + offset = self.conv_offset(shape.detach()) + x = self.relu(self.conv_adaption(x, offset)) + return x + + +@MODELS.register_module() +class GuidedAnchorHead(AnchorHead): + """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.). + + This GuidedAnchorHead will predict high-quality feature guided + anchors and locations where anchors will be kept in inference. + There are mainly 3 categories of bounding-boxes. + + - Sampled 9 pairs for target assignment. (approxes) + - The square boxes where the predicted anchors are based on. (squares) + - Guided anchors. + + Please refer to https://arxiv.org/abs/1901.03278 for more details. + + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Defaults to 256. + approx_anchor_generator (:obj:`ConfigDict` or dict): Config dict + for approx generator + square_anchor_generator (:obj:`ConfigDict` or dict): Config dict + for square generator + anchor_coder (:obj:`ConfigDict` or dict): Config dict for anchor coder + bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Defaults to False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + deform_groups: (int): Group number of DCN in FeatureAdaption module. + Defaults to 4. + loc_filter_thr (float): Threshold to filter out unconcerned regions. + Defaults to 0.01. + loss_loc (:obj:`ConfigDict` or dict): Config of location loss. + loss_shape (:obj:`ConfigDict` or dict): Config of anchor shape loss. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of bbox regression loss. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict], optional): Initialization config dict. 
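Editor's note: a minimal, self-contained variant of the FeatureAdaption idea above, where the deformable-conv offsets are predicted from the 2-channel anchor-shape map rather than from the feature map itself. To stay runnable without the compiled mmcv op, this sketch swaps in `torchvision.ops.DeformConv2d` and a single offset group; that substitution is an assumption, not what the patch uses.

import torch
import torch.nn as nn
from torchvision.ops import DeformConv2d  # stand-in for mmcv.ops.DeformConv2d

class TinyFeatureAdaption(nn.Module):
    """Sketch: DCN offsets come from the shape prediction, not the features."""

    def __init__(self, channels=256, kernel_size=3):
        super().__init__()
        offset_channels = kernel_size * kernel_size * 2   # (dx, dy) per tap
        self.conv_offset = nn.Conv2d(2, offset_channels, 1, bias=False)
        self.conv_adaption = DeformConv2d(
            channels, channels, kernel_size, padding=(kernel_size - 1) // 2)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x, shape_pred):
        offset = self.conv_offset(shape_pred.detach())    # stop-grad on shapes
        return self.relu(self.conv_adaption(x, offset))

feat = torch.randn(1, 256, 32, 32)
shape_pred = torch.randn(1, 2, 32, 32)
print(TinyFeatureAdaption()(feat, shape_pred).shape)  # (1, 256, 32, 32)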
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + approx_anchor_generator: ConfigType = dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator: ConfigType = dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + bbox_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + reg_decoded_bbox: bool = False, + deform_groups: int = 4, + loc_filter_thr: float = 0.01, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + loss_loc: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape: ConfigType = dict( + type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1.0), + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', name='conv_loc', std=0.01, lbias_prob=0.01)) + ) -> None: + super(AnchorHead, self).__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.deform_groups = deform_groups + self.loc_filter_thr = loc_filter_thr + + # build approx_anchor_generator and square_anchor_generator + assert (approx_anchor_generator['octave_base_scale'] == + square_anchor_generator['scales'][0]) + assert (approx_anchor_generator['strides'] == + square_anchor_generator['strides']) + self.approx_anchor_generator = TASK_UTILS.build( + approx_anchor_generator) + self.square_anchor_generator = TASK_UTILS.build( + square_anchor_generator) + self.approxs_per_octave = self.approx_anchor_generator \ + .num_base_priors[0] + + self.reg_decoded_bbox = reg_decoded_bbox + + # one anchor per location + self.num_base_priors = self.square_anchor_generator.num_base_priors[0] + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.loc_focal_loss = loss_loc['type'] in ['FocalLoss'] + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + + # build bbox_coder + self.anchor_coder = TASK_UTILS.build(anchor_coder) + self.bbox_coder = TASK_UTILS.build(bbox_coder) + + # build losses + self.loss_loc = MODELS.build(loss_loc) + self.loss_shape = MODELS.build(loss_shape) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + # use PseudoSampler when no sampler in train_cfg + if train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler() + + self.ga_assigner = TASK_UTILS.build(self.train_cfg['ga_assigner']) + if train_cfg.get('ga_sampler', None) is not None: + self.ga_sampler = TASK_UTILS.build( + self.train_cfg['ga_sampler'], + default_args=dict(context=self)) + else: + self.ga_sampler = PseudoSampler() + + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of 
the head.""" + self.relu = nn.ReLU(inplace=True) + self.conv_loc = nn.Conv2d(self.in_channels, 1, 1) + self.conv_shape = nn.Conv2d(self.in_channels, self.num_base_priors * 2, + 1) + self.feature_adaption = FeatureAdaption( + self.in_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.conv_cls = MaskedConv2d( + self.feat_channels, self.num_base_priors * self.cls_out_channels, + 1) + self.conv_reg = MaskedConv2d(self.feat_channels, + self.num_base_priors * 4, 1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor]: + """Forward feature of a single scale level.""" + loc_pred = self.conv_loc(x) + shape_pred = self.conv_shape(x) + x = self.feature_adaption(x, shape_pred) + # masked conv is only used during inference for speed-up + if not self.training: + mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr + else: + mask = None + cls_score = self.conv_cls(x, mask) + bbox_pred = self.conv_reg(x, mask) + return cls_score, bbox_pred, shape_pred, loc_pred + + def forward(self, x: List[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network.""" + return multi_apply(self.forward_single, x) + + def get_sampled_approxs(self, + featmap_sizes: List[Tuple[int, int]], + batch_img_metas: List[dict], + device: str = 'cuda') -> tuple: + """Get sampled approxs and inside flags according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + device (str): device for returned tensors + + Returns: + tuple: approxes of each image, inside flags of each image + """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # approxes for one time + multi_level_approxs = self.approx_anchor_generator.grid_priors( + featmap_sizes, device=device) + approxs_list = [multi_level_approxs for _ in range(num_imgs)] + + # for each image, we compute inside flags of multi level approxes + inside_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = [] + multi_level_approxs = approxs_list[img_id] + + # obtain valid flags for each approx first + multi_level_approx_flags = self.approx_anchor_generator \ + .valid_flags(featmap_sizes, + img_meta['pad_shape'], + device=device) + + for i, flags in enumerate(multi_level_approx_flags): + approxs = multi_level_approxs[i] + inside_flags_list = [] + for j in range(self.approxs_per_octave): + split_valid_flags = flags[j::self.approxs_per_octave] + split_approxs = approxs[j::self.approxs_per_octave, :] + inside_flags = anchor_inside_flags( + split_approxs, split_valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + inside_flags_list.append(inside_flags) + # inside_flag for a position is true if any anchor in this + # position is true + inside_flags = ( + torch.stack(inside_flags_list, 0).sum(dim=0) > 0) + multi_level_flags.append(inside_flags) + inside_flag_list.append(multi_level_flags) + return approxs_list, inside_flag_list + + def get_anchors(self, + featmap_sizes: List[Tuple[int, int]], + shape_preds: List[Tensor], + loc_preds: List[Tensor], + batch_img_metas: List[dict], + use_loc_filter: bool = False, + device: str = 'cuda') -> tuple: + """Get squares according to feature map sizes and guided anchors. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + shape_preds (list[tensor]): Multi-level shape predictions. + loc_preds (list[tensor]): Multi-level location predictions. 
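Editor's note: `forward_single` above only evaluates the masked cls/reg convolutions at test time on locations whose predicted objectness clears `loc_filter_thr`. The sketch below shows just the mask construction on random values; `MaskedConv2d` itself is left out since it needs the mmcv extension.

import torch

loc_filter_thr = 0.01
loc_pred = torch.randn(1, 1, 32, 32)             # raw location logits

# At inference, only positions above the threshold are kept; MaskedConv2d
# then skips the remaining positions entirely for speed.
mask = loc_pred.sigmoid()[0] >= loc_filter_thr   # (1, 32, 32) boolean mask
keep_ratio = mask.float().mean().item()
print(f'{keep_ratio:.1%} of the feature-map locations survive the loc filter')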
+ batch_img_metas (list[dict]): Image meta info. + use_loc_filter (bool): Use loc filter or not. Defaults to False + device (str): device for returned tensors. + Defaults to `cuda`. + + Returns: + tuple: square approxs of each image, guided anchors of each image, + loc masks of each image. + """ + num_imgs = len(batch_img_metas) + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # squares for one time + multi_level_squares = self.square_anchor_generator.grid_priors( + featmap_sizes, device=device) + squares_list = [multi_level_squares for _ in range(num_imgs)] + + # for each image, we compute multi level guided anchors + guided_anchors_list = [] + loc_mask_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_guided_anchors = [] + multi_level_loc_mask = [] + for i in range(num_levels): + squares = squares_list[img_id][i] + shape_pred = shape_preds[i][img_id] + loc_pred = loc_preds[i][img_id] + guided_anchors, loc_mask = self._get_guided_anchors_single( + squares, + shape_pred, + loc_pred, + use_loc_filter=use_loc_filter) + multi_level_guided_anchors.append(guided_anchors) + multi_level_loc_mask.append(loc_mask) + guided_anchors_list.append(multi_level_guided_anchors) + loc_mask_list.append(multi_level_loc_mask) + return squares_list, guided_anchors_list, loc_mask_list + + def _get_guided_anchors_single( + self, + squares: Tensor, + shape_pred: Tensor, + loc_pred: Tensor, + use_loc_filter: bool = False) -> Tuple[Tensor]: + """Get guided anchors and loc masks for a single level. + + Args: + squares (tensor): Squares of a single level. + shape_pred (tensor): Shape predictions of a single level. + loc_pred (tensor): Loc predictions of a single level. + use_loc_filter (list[tensor]): Use loc filter or not. + Defaults to False. + + Returns: + tuple: guided anchors, location masks + """ + # calculate location filtering mask + loc_pred = loc_pred.sigmoid().detach() + if use_loc_filter: + loc_mask = loc_pred >= self.loc_filter_thr + else: + loc_mask = loc_pred >= 0.0 + mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_base_priors) + mask = mask.contiguous().view(-1) + # calculate guided anchors + squares = squares[mask] + anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view( + -1, 2).detach()[mask] + bbox_deltas = anchor_deltas.new_full(squares.size(), 0) + bbox_deltas[:, 2:] = anchor_deltas + guided_anchors = self.anchor_coder.decode( + squares, bbox_deltas, wh_ratio_clip=1e-6) + return guided_anchors, mask + + def ga_loc_targets(self, batch_gt_instances: InstanceList, + featmap_sizes: List[Tuple[int, int]]) -> tuple: + """Compute location targets for guided anchoring. + + Each feature map is divided into positive, negative and ignore regions. + - positive regions: target 1, weight 1 + - ignore regions: target 0, weight 0 + - negative regions: target 0, weight 0.1 + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + featmap_sizes (list[tuple]): Multi level sizes of each feature + maps. + + Returns: + tuple: Returns a tuple containing location targets. + """ + anchor_scale = self.approx_anchor_generator.octave_base_scale + anchor_strides = self.approx_anchor_generator.strides + # Currently only supports same stride in x and y direction. 
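Editor's note: `_get_guided_anchors_single` above keeps the square anchor centres fixed and only rescales width/height (the first two delta channels stay zero before `anchor_coder.decode`). The sketch below reproduces that decode by hand for two hypothetical squares, mirroring the DeltaXYWH formula with unit means/stds.

import torch

squares = torch.tensor([[10., 10., 50., 50.],     # (x1, y1, x2, y2)
                        [20., 20., 40., 40.]])
anchor_deltas = torch.tensor([[0.2, -0.1],        # predicted (dw, dh)
                              [0.0, 0.5]])

cx, cy = (squares[:, 0] + squares[:, 2]) / 2, (squares[:, 1] + squares[:, 3]) / 2
w, h = squares[:, 2] - squares[:, 0], squares[:, 3] - squares[:, 1]
new_w, new_h = w * anchor_deltas[:, 0].exp(), h * anchor_deltas[:, 1].exp()
guided = torch.stack([cx - new_w / 2, cy - new_h / 2,
                      cx + new_w / 2, cy + new_h / 2], dim=-1)
print(guided)   # centres unchanged, widths/heights rescaled by exp(dw), exp(dh)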
+ for stride in anchor_strides: + assert (stride[0] == stride[1]) + anchor_strides = [stride[0] for stride in anchor_strides] + + center_ratio = self.train_cfg['center_ratio'] + ignore_ratio = self.train_cfg['ignore_ratio'] + img_per_gpu = len(batch_gt_instances) + num_lvls = len(featmap_sizes) + r1 = (1 - center_ratio) / 2 + r2 = (1 - ignore_ratio) / 2 + all_loc_targets = [] + all_loc_weights = [] + all_ignore_map = [] + for lvl_id in range(num_lvls): + h, w = featmap_sizes[lvl_id] + loc_targets = torch.zeros( + img_per_gpu, + 1, + h, + w, + device=batch_gt_instances[0].bboxes.device, + dtype=torch.float32) + loc_weights = torch.full_like(loc_targets, -1) + ignore_map = torch.zeros_like(loc_targets) + all_loc_targets.append(loc_targets) + all_loc_weights.append(loc_weights) + all_ignore_map.append(ignore_map) + for img_id in range(img_per_gpu): + gt_bboxes = batch_gt_instances[img_id].bboxes + scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + min_anchor_size = scale.new_full( + (1, ), float(anchor_scale * anchor_strides[0])) + # assign gt bboxes to different feature levels w.r.t. their scales + target_lvls = torch.floor( + torch.log2(scale) - torch.log2(min_anchor_size) + 0.5) + target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long() + for gt_id in range(gt_bboxes.size(0)): + lvl = target_lvls[gt_id].item() + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl] + # calculate ignore regions + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[lvl]) + # calculate positive (center) regions + ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region( + gt_, r1, featmap_sizes[lvl]) + all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, + ctr_x1:ctr_x2 + 1] = 1 + all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 0 + all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, + ctr_x1:ctr_x2 + 1] = 1 + # calculate ignore map on nearby low level feature + if lvl > 0: + d_lvl = lvl - 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[d_lvl]) + all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 1 + # calculate ignore map on nearby high level feature + if lvl < num_lvls - 1: + u_lvl = lvl + 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[u_lvl]) + all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 1 + for lvl_id in range(num_lvls): + # ignore negative regions w.r.t. ignore map + all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0) + & (all_ignore_map[lvl_id] > 0)] = 0 + # set negative regions with weight 0.1 + all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1 + # loc average factor to balance loss + loc_avg_factor = sum( + [t.size(0) * t.size(-1) * t.size(-2) + for t in all_loc_targets]) / 200 + return all_loc_targets, all_loc_weights, loc_avg_factor + + def _ga_shape_target_single(self, + flat_approxs: Tensor, + inside_flags: Tensor, + flat_squares: Tensor, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData], + img_meta: dict, + unmap_outputs: bool = True) -> tuple: + """Compute guided anchoring targets. 
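Editor's note: `ga_loc_targets` above routes each GT box to a pyramid level by comparing its geometric-mean scale against the smallest anchor size, one level per doubling. A quick numeric check of that rule with the default `octave_base_scale=8` and `strides[0]=4`:

import torch

anchor_scale, base_stride = 8, 4
min_anchor_size = float(anchor_scale * base_stride)   # 32 px

gt_wh = torch.tensor([[30., 30.], [100., 60.], [500., 400.]])
scale = torch.sqrt(gt_wh[:, 0] * gt_wh[:, 1])

# log2(scale) - log2(min_anchor_size) + 0.5, floored and clamped to valid levels
target_lvls = torch.floor(torch.log2(scale / min_anchor_size) + 0.5)
target_lvls = target_lvls.clamp(min=0, max=4).long()
print(target_lvls.tolist())  # [0, 1, 4]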
+ + This function returns sampled anchors and gt bboxes directly + rather than calculates regression targets. + + Args: + flat_approxs (Tensor): flat approxs of a single image, + shape (n, 4) + inside_flags (Tensor): inside flags of a single image, + shape (n, ). + flat_squares (Tensor): flat squares of a single image, + shape (approxs_per_octave * n, 4) + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + img_meta (dict): Meta info of a single image. + unmap_outputs (bool): unmap outputs or not. + + Returns: + tuple: Returns a tuple containing shape targets of each image. + """ + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + num_square = flat_squares.size(0) + approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4) + approxs = approxs[inside_flags, ...] + squares = flat_squares[inside_flags, :] + + pred_instances = InstanceData() + pred_instances.priors = squares + pred_instances.approxs = approxs + + assign_result = self.ga_assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + sampling_result = self.ga_sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + + bbox_anchors = torch.zeros_like(squares) + bbox_gts = torch.zeros_like(squares) + bbox_weights = torch.zeros_like(squares) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes + bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes + bbox_weights[pos_inds, :] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_squares.size(0) + bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags) + bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds, + sampling_result) + + def ga_shape_targets(self, + approx_list: List[List[Tensor]], + inside_flag_list: List[List[Tensor]], + square_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Compute guided anchoring targets. + + Args: + approx_list (list[list[Tensor]]): Multi level approxs of each + image. + inside_flag_list (list[list[Tensor]]): Multi level inside flags + of each image. + square_list (list[list[Tensor]]): Multi level squares of each + image. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): unmap outputs or not. Defaults to None. 
+ + Returns: + tuple: Returns a tuple containing shape targets. + """ + num_imgs = len(batch_img_metas) + assert len(approx_list) == len(inside_flag_list) == len( + square_list) == num_imgs + # anchor number of multi levels + num_level_squares = [squares.size(0) for squares in square_list[0]] + # concat all level anchors and flags to a single tensor + inside_flag_flat_list = [] + approx_flat_list = [] + square_flat_list = [] + for i in range(num_imgs): + assert len(square_list[i]) == len(inside_flag_list[i]) + inside_flag_flat_list.append(torch.cat(inside_flag_list[i])) + approx_flat_list.append(torch.cat(approx_list[i])) + square_flat_list.append(torch.cat(square_list[i])) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None for _ in range(num_imgs)] + (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list, + neg_inds_list, sampling_results_list) = multi_apply( + self._ga_shape_target_single, + approx_flat_list, + inside_flag_flat_list, + square_flat_list, + batch_gt_instances, + batch_gt_instances_ignore, + batch_img_metas, + unmap_outputs=unmap_outputs) + # sampled anchors of all images + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + bbox_anchors_list = images_to_levels(all_bbox_anchors, + num_level_squares) + bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_squares) + return (bbox_anchors_list, bbox_gts_list, bbox_weights_list, + avg_factor) + + def loss_shape_single(self, shape_pred: Tensor, bbox_anchors: Tensor, + bbox_gts: Tensor, anchor_weights: Tensor, + avg_factor: int) -> Tensor: + """Compute shape loss in single level.""" + shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2) + bbox_anchors = bbox_anchors.contiguous().view(-1, 4) + bbox_gts = bbox_gts.contiguous().view(-1, 4) + anchor_weights = anchor_weights.contiguous().view(-1, 4) + bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0) + bbox_deltas[:, 2:] += shape_pred + # filter out negative samples to speed-up weighted_bounded_iou_loss + inds = torch.nonzero( + anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1) + bbox_deltas_ = bbox_deltas[inds] + bbox_anchors_ = bbox_anchors[inds] + bbox_gts_ = bbox_gts[inds] + anchor_weights_ = anchor_weights[inds] + pred_anchors_ = self.anchor_coder.decode( + bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6) + loss_shape = self.loss_shape( + pred_anchors_, bbox_gts_, anchor_weights_, avg_factor=avg_factor) + return loss_shape + + def loss_loc_single(self, loc_pred: Tensor, loc_target: Tensor, + loc_weight: Tensor, avg_factor: float) -> Tensor: + """Compute location loss in single level.""" + loss_loc = self.loss_loc( + loc_pred.reshape(-1, 1), + loc_target.reshape(-1).long(), + loc_weight.reshape(-1), + avg_factor=avg_factor) + return loss_loc + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + shape_preds: List[Tensor], + loc_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). 
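Editor's note: `ga_shape_targets` above regroups per-image, all-level target tensors into per-level, batched tensors before the per-level losses run. A rough stand-in for `images_to_levels` (the real helper lives in `..utils`), with invented anchor counts:

import torch

def images_to_levels_sketch(per_image_targets, num_level_anchors):
    """Stack per-image tensors, then split along the anchor dim per level."""
    stacked = torch.stack(per_image_targets, dim=0)   # (num_imgs, total, ...)
    level_targets, start = [], 0
    for n in num_level_anchors:
        level_targets.append(stacked[:, start:start + n])
        start += n
    return level_targets

num_level_anchors = [16, 4, 1]                        # anchors per FPN level
per_image = [torch.randn(sum(num_level_anchors), 4) for _ in range(2)]
levels = images_to_levels_sketch(per_image, num_level_anchors)
print([tuple(t.shape) for t in levels])  # [(2, 16, 4), (2, 4, 4), (2, 1, 4)]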
+ shape_preds (list[Tensor]): shape predictions for each scale + level with shape (N, 1, H, W). + loc_preds (list[Tensor]): location predictions for each scale + level with shape (N, num_anchors * 2, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.approx_anchor_generator.num_levels + + device = cls_scores[0].device + + # get loc targets + loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets( + batch_gt_instances, featmap_sizes) + + # get sampled approxes + approxs_list, inside_flag_list = self.get_sampled_approxs( + featmap_sizes, batch_img_metas, device=device) + # get squares and guided anchors + squares_list, guided_anchors_list, _ = self.get_anchors( + featmap_sizes, + shape_preds, + loc_preds, + batch_img_metas, + device=device) + + # get shape targets + shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list, + squares_list, batch_gt_instances, + batch_img_metas) + (bbox_anchors_list, bbox_gts_list, anchor_weights_list, + ga_avg_factor) = shape_targets + + # get anchor targets + cls_reg_targets = self.get_targets( + guided_anchors_list, + inside_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor) = cls_reg_targets + + # anchor number of multi levels + num_level_anchors = [ + anchors.size(0) for anchors in guided_anchors_list[0] + ] + # concat all level anchors to a single tensor + concat_anchor_list = [] + for i in range(len(guided_anchors_list)): + concat_anchor_list.append(torch.cat(guided_anchors_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + # get classification and bbox regression losses + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + + # get anchor location loss + losses_loc = [] + for i in range(len(loc_preds)): + loss_loc = self.loss_loc_single( + loc_preds[i], + loc_targets[i], + loc_weights[i], + avg_factor=loc_avg_factor) + losses_loc.append(loss_loc) + + # get anchor shape loss + losses_shape = [] + for i in range(len(shape_preds)): + loss_shape = self.loss_shape_single( + shape_preds[i], + bbox_anchors_list[i], + bbox_gts_list[i], + anchor_weights_list[i], + avg_factor=ga_avg_factor) + losses_shape.append(loss_shape) + + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_shape=losses_shape, + loss_loc=losses_loc) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + shape_preds: List[Tensor], + loc_preds: List[Tensor], + batch_img_metas: List[dict], + cfg: OptConfigType = None, + rescale: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. 
+ + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + shape_preds (list[Tensor]): shape predictions for each scale + level with shape (N, 1, H, W). + loc_preds (list[Tensor]): location predictions for each scale + level with shape (N, num_anchors * 2, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last + dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len( + loc_preds) + num_levels = len(cls_scores) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + device = cls_scores[0].device + # get guided anchors + _, guided_anchors, loc_masks = self.get_anchors( + featmap_sizes, + shape_preds, + loc_preds, + batch_img_metas, + use_loc_filter=not self.training, + device=device) + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + guided_anchor_list = [ + guided_anchors[img_id][i].detach() for i in range(num_levels) + ] + loc_mask_list = [ + loc_masks[img_id][i].detach() for i in range(num_levels) + ] + proposals = self._predict_by_feat_single( + cls_scores=cls_score_list, + bbox_preds=bbox_pred_list, + mlvl_anchors=guided_anchor_list, + mlvl_masks=loc_mask_list, + img_meta=batch_img_metas[img_id], + cfg=cfg, + rescale=rescale) + result_list.append(proposals) + return result_list + + def _predict_by_feat_single(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + mlvl_anchors: List[Tensor], + mlvl_masks: List[Tensor], + img_meta: dict, + cfg: ConfigType, + rescale: bool = False) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_scores (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + mlvl_anchors (list[Tensor]): Each element in the list is + the anchors of a single level in feature pyramid. it has + shape (num_priors, 4). + mlvl_masks (list[Tensor]): Each element in the list is location + masks of a single level. + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict` or dict): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. 
+ + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last + dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bbox_preds = [] + mlvl_valid_anchors = [] + mlvl_scores = [] + for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds, + mlvl_anchors, + mlvl_masks): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + # reshape scores and bbox_pred + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask, :] + bbox_pred = bbox_pred[mask, :] + if scores.dim() == 0: + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + bbox_pred = bbox_pred.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. scores + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_anchors.append(anchors) + mlvl_scores.append(scores) + + mlvl_bbox_preds = torch.cat(mlvl_bbox_preds) + mlvl_anchors = torch.cat(mlvl_valid_anchors) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_bboxes = self.bbox_coder.decode( + mlvl_anchors, mlvl_bbox_preds, max_shape=img_meta['img_shape']) + + if rescale: + assert img_meta.get('scale_factor') is not None + mlvl_bboxes /= mlvl_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + # multi class NMS + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + + results = InstanceData() + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results diff --git a/mmdetection/mmdet/models/dense_heads/lad_head.py b/mmdetection/mmdet/models/dense_heads/lad_head.py new file mode 100644 index 0000000..d1218e1 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/lad_head.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Optional + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import InstanceList, OptInstanceList +from ..utils import levels_to_images, multi_apply, unpack_gt_instances +from .paa_head import PAAHead + + +@MODELS.register_module() +class LADHead(PAAHead): + """Label Assignment Head from the paper: `Improving Object Detection by + Label Assignment Distillation `_""" + + def get_label_assignment( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + iou_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> tuple: + """Get label assignment (from teacher). + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + iou_preds (list[Tensor]): iou_preds for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + tuple: Returns a tuple containing label assignment variables. + + - labels (Tensor): Labels of all anchors, each with + shape (num_anchors,). + - labels_weight (Tensor): Label weights of all anchor. + each with shape (num_anchors,). + - bboxes_target (Tensor): BBox targets of all anchors. + each with shape (num_anchors, 4). + - bboxes_weight (Tensor): BBox weights of all anchors. + each with shape (num_anchors, 4). + - pos_inds_flatten (Tensor): Contains all index of positive + sample in all anchor. + - pos_anchors (Tensor): Positive anchors. + - num_pos (int): Number of positive anchors. 
+ """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + ) + (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds, + pos_gt_index) = cls_reg_targets + cls_scores = levels_to_images(cls_scores) + cls_scores = [ + item.reshape(-1, self.cls_out_channels) for item in cls_scores + ] + bbox_preds = levels_to_images(bbox_preds) + bbox_preds = [item.reshape(-1, 4) for item in bbox_preds] + pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list, + cls_scores, bbox_preds, labels, + labels_weight, bboxes_target, + bboxes_weight, pos_inds) + + with torch.no_grad(): + reassign_labels, reassign_label_weight, \ + reassign_bbox_weights, num_pos = multi_apply( + self.paa_reassign, + pos_losses_list, + labels, + labels_weight, + bboxes_weight, + pos_inds, + pos_gt_index, + anchor_list) + num_pos = sum(num_pos) + # convert all tensor list to a flatten tensor + labels = torch.cat(reassign_labels, 0).view(-1) + flatten_anchors = torch.cat( + [torch.cat(item, 0) for item in anchor_list]) + labels_weight = torch.cat(reassign_label_weight, 0).view(-1) + bboxes_target = torch.cat(bboxes_target, + 0).view(-1, bboxes_target[0].size(-1)) + + pos_inds_flatten = ((labels >= 0) + & + (labels < self.num_classes)).nonzero().reshape(-1) + + if num_pos: + pos_anchors = flatten_anchors[pos_inds_flatten] + else: + pos_anchors = None + + label_assignment_results = (labels, labels_weight, bboxes_target, + bboxes_weight, pos_inds_flatten, + pos_anchors, num_pos) + return label_assignment_results + + def loss(self, x: List[Tensor], label_assignment_results: tuple, + batch_data_samples: SampleList) -> dict: + """Forward train with the available label assignment (student receives + from teacher). + + Args: + x (list[Tensor]): Features from FPN. + label_assignment_results (tuple): As the outputs defined in the + function `self.get_label_assignment`. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + losses: (dict[str, Tensor]): A dictionary of loss components. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + outs = self(x) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat( + *loss_inputs, + batch_gt_instances_ignore=batch_gt_instances_ignore, + label_assignment_results=label_assignment_results) + return losses + + def loss_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + iou_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + label_assignment_results: Optional[tuple] = None) -> dict: + """Compute losses of the head. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + iou_preds (list[Tensor]): iou_preds for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + label_assignment_results (tuple, optional): As the outputs defined + in the function `self.get_ + label_assignment`. + + Returns: + dict[str, Tensor]: A dictionary of loss gmm_assignment. + """ + + (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds_flatten, + pos_anchors, num_pos) = label_assignment_results + + cls_scores = levels_to_images(cls_scores) + cls_scores = [ + item.reshape(-1, self.cls_out_channels) for item in cls_scores + ] + bbox_preds = levels_to_images(bbox_preds) + bbox_preds = [item.reshape(-1, 4) for item in bbox_preds] + iou_preds = levels_to_images(iou_preds) + iou_preds = [item.reshape(-1, 1) for item in iou_preds] + + # convert all tensor list to a flatten tensor + cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1)) + bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1)) + iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1)) + + losses_cls = self.loss_cls( + cls_scores, + labels, + labels_weight, + avg_factor=max(num_pos, len(batch_img_metas))) # avoid num_pos=0 + if num_pos: + pos_bbox_pred = self.bbox_coder.decode( + pos_anchors, bbox_preds[pos_inds_flatten]) + pos_bbox_target = bboxes_target[pos_inds_flatten] + iou_target = bbox_overlaps( + pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True) + losses_iou = self.loss_centerness( + iou_preds[pos_inds_flatten], + iou_target.unsqueeze(-1), + avg_factor=num_pos) + losses_bbox = self.loss_bbox( + pos_bbox_pred, pos_bbox_target, avg_factor=num_pos) + + else: + losses_iou = iou_preds.sum() * 0 + losses_bbox = bbox_preds.sum() * 0 + + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou) diff --git a/mmdetection/mmdet/models/dense_heads/ld_head.py b/mmdetection/mmdet/models/dense_heads/ld_head.py new file mode 100644 index 0000000..2558fac --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/ld_head.py @@ -0,0 +1,257 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean +from ..utils import multi_apply, unpack_gt_instances +from .gfl_head import GFLHead + + +@MODELS.register_module() +class LDHead(GFLHead): + """Localization distillation Head. (Short description) + + It utilizes the learned bbox distributions to transfer the localization + dark knowledge from teacher to student. Original paper: `Localization + Distillation for Object Detection. `_ + + Args: + num_classes (int): Number of categories excluding the background + category. 
+ in_channels (int): Number of channels in the input feature map. + loss_ld (:obj:`ConfigDict` or dict): Config of Localization + Distillation Loss (LD), T is the temperature for distillation. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_ld: ConfigType = dict( + type='LocalizationDistillationLoss', + loss_weight=0.25, + T=10), + **kwargs) -> dict: + + super().__init__( + num_classes=num_classes, in_channels=in_channels, **kwargs) + self.loss_ld = MODELS.build(loss_ld) + + def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + stride: Tuple[int], soft_targets: Tensor, + avg_factor: int): + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + cls_score (Tensor): Cls and quality joint scores for each scale + level has shape (N, num_classes, H, W). + bbox_pred (Tensor): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + stride (tuple): Stride in this scale level. + soft_targets (Tensor): Soft BBox regression targets. + avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + dict[tuple, Tensor]: Loss components and weight targets. + """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' 
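+        # The per-image maps are flattened below into per-anchor rows:
+        # cls_score becomes (num_anchors, cls_out_channels) and both the
+        # student bbox_pred and the teacher soft_targets become
+        # (num_anchors, 4 * (reg_max + 1)), i.e. one discrete distribution
+        # per box side. For positive anchors the LD term is then computed
+        # between the student corner distributions and the teacher's soft
+        # corners, weighted by the detached classification confidence.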
+ anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, 4 * (self.reg_max + 1)) + soft_targets = soft_targets.permute(0, 2, 3, + 1).reshape(-1, + 4 * (self.reg_max + 1)) + + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + score = label_weights.new_zeros(labels.shape) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0] + + weight_targets = cls_score.detach().sigmoid() + weight_targets = weight_targets.max(dim=1)[0][pos_inds] + pos_bbox_pred_corners = self.integral(pos_bbox_pred) + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchor_centers, pos_bbox_pred_corners) + pos_decode_bbox_targets = pos_bbox_targets / stride[0] + score[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1) + pos_soft_targets = soft_targets[pos_inds] + soft_corners = pos_soft_targets.reshape(-1, self.reg_max + 1) + + target_corners = self.bbox_coder.encode(pos_anchor_centers, + pos_decode_bbox_targets, + self.reg_max).reshape(-1) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=weight_targets, + avg_factor=1.0) + + # dfl loss + loss_dfl = self.loss_dfl( + pred_corners, + target_corners, + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0) + + # ld loss + loss_ld = self.loss_ld( + pred_corners, + soft_corners, + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0) + + else: + loss_ld = bbox_pred.sum() * 0 + loss_bbox = bbox_pred.sum() * 0 + loss_dfl = bbox_pred.sum() * 0 + weight_targets = bbox_pred.new_tensor(0) + + # cls (qfl) loss + loss_cls = self.loss_cls( + cls_score, (labels, score), + weight=label_weights, + avg_factor=avg_factor) + + return loss_cls, loss_bbox, loss_dfl, loss_ld, weight_targets.sum() + + def loss(self, x: List[Tensor], out_teacher: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + """ + Args: + x (list[Tensor]): Features from FPN. + out_teacher (tuple[Tensor]): The output of teacher. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + tuple[dict, list]: The loss components and proposals of each image. + + - losses (dict[str, Tensor]): A dictionary of loss components. + - proposal_list (list[Tensor]): Proposals of each image. 
+ """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + outs = self(x) + soft_targets = out_teacher[1] + loss_inputs = outs + (batch_gt_instances, batch_img_metas, + soft_targets) + losses = self.loss_by_feat( + *loss_inputs, batch_gt_instances_ignore=batch_gt_instances_ignore) + + return losses + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + soft_targets: List[Tensor], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Cls and quality scores for each scale + level has shape (N, num_classes, H, W). + bbox_preds (list[Tensor]): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + soft_targets (list[Tensor]): Soft BBox regression targets. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + losses_cls, losses_bbox, losses_dfl, losses_ld, \ + avg_factor = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + self.prior_generator.strides, + soft_targets, + avg_factor=avg_factor) + + avg_factor = sum(avg_factor) + 1e-6 + avg_factor = reduce_mean(avg_factor).item() + losses_bbox = [x / avg_factor for x in losses_bbox] + losses_dfl = [x / avg_factor for x in losses_dfl] + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_dfl=losses_dfl, + loss_ld=losses_ld) diff --git a/mmdetection/mmdet/models/dense_heads/mask2former_head.py b/mmdetection/mmdet/models/dense_heads/mask2former_head.py new file mode 100644 index 0000000..12d47c6 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/mask2former_head.py @@ -0,0 +1,459 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
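The `LDHead` above expects the teacher's bbox distribution logits as the second element of `out_teacher` and routes them into `loss_by_feat` as `soft_targets`. A minimal training-step sketch under that contract; `student_detector` and `teacher_model` are hypothetical names, and the detector-level calls (`extract_feat`, `bbox_head`) are only assumed to follow the usual mmdet layout:

import torch


def ld_distill_step(student_detector, teacher_model, batch_inputs,
                    batch_data_samples):
    """Illustrative only: route teacher head outputs into LDHead.loss()."""
    x = student_detector.extract_feat(batch_inputs)
    with torch.no_grad():
        # Teacher head outputs are assumed to be (cls_scores, bbox_preds);
        # index 1 supplies the soft bbox distributions used above.
        teacher_x = teacher_model.extract_feat(batch_inputs)
        out_teacher = teacher_model.bbox_head(teacher_x)
    return student_detector.bbox_head.loss(x, out_teacher, batch_data_samples)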
+import copy +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d +from mmcv.ops import point_sample +from mmengine.model import ModuleList, caffe2_xavier_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig, reduce_mean +from ..layers import Mask2FormerTransformerDecoder, SinePositionalEncoding +from ..utils import get_uncertain_point_coords_with_randomness +from .anchor_free_head import AnchorFreeHead +from .maskformer_head import MaskFormerHead + + +@MODELS.register_module() +class Mask2FormerHead(MaskFormerHead): + """Implements the Mask2Former head. + + See `Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + num_things_classes (int): Number of things. + num_stuff_classes (int): Number of stuff. + num_queries (int): Number of query in Transformer decoder. + pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel + decoder. Defaults to None. + enforce_decoder_input_project (bool, optional): Whether to add + a layer to change the embed_dim of tranformer encoder in + pixel decoder to the embed_dim of transformer decoder. + Defaults to False. + transformer_decoder (:obj:`ConfigDict` or dict): Config for + transformer decoder. Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer decoder position encoding. Defaults to + dict(num_feats=128, normalize=True). + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to None. + loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss. + Defaults to None. + loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss. + Defaults to None. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + Mask2Former head. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + Mask2Former head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + feat_channels: int, + out_channels: int, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + num_queries: int = 100, + num_transformer_feat_level: int = 3, + pixel_decoder: ConfigType = ..., + enforce_decoder_input_project: bool = False, + transformer_decoder: ConfigType = ..., + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * 133 + [0.1]), + loss_mask: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice: ConfigType = dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(AnchorFreeHead, self).__init__(init_cfg=init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = self.num_things_classes + self.num_stuff_classes + self.num_queries = num_queries + self.num_transformer_feat_level = num_transformer_feat_level + self.num_heads = transformer_decoder.layer_cfg.cross_attn_cfg.num_heads + self.num_transformer_decoder_layers = transformer_decoder.num_layers + assert pixel_decoder.encoder.layer_cfg. \ + self_attn_cfg.num_levels == num_transformer_feat_level + pixel_decoder_ = copy.deepcopy(pixel_decoder) + pixel_decoder_.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = MODELS.build(pixel_decoder_) + self.transformer_decoder = Mask2FormerTransformerDecoder( + **transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + + self.decoder_input_projs = ModuleList() + # from low resolution to high resolution + for _ in range(num_transformer_feat_level): + if (self.decoder_embed_dims != feat_channels + or enforce_decoder_input_project): + self.decoder_input_projs.append( + Conv2d( + feat_channels, self.decoder_embed_dims, kernel_size=1)) + else: + self.decoder_input_projs.append(nn.Identity()) + self.decoder_positional_encoding = SinePositionalEncoding( + **positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, feat_channels) + self.query_feat = nn.Embedding(self.num_queries, feat_channels) + # from low resolution to high resolution + self.level_embed = nn.Embedding(self.num_transformer_feat_level, + feat_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + self.num_points = self.train_cfg.get('num_points', 12544) + self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) + self.importance_sample_ratio = self.train_cfg.get( + 'importance_sample_ratio', 0.75) + + self.class_weight = loss_cls.class_weight + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.loss_dice = MODELS.build(loss_dice) + + def 
init_weights(self) -> None: + for m in self.decoder_input_projs: + if isinstance(m, Conv2d): + caffe2_xavier_init(m, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> Tuple[Tensor]: + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_instances (:obj:`InstanceData`): It contains ``labels`` and + ``masks``. + img_meta (dict): Image informtation. + + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each \ + image. + - neg_inds (Tensor): Sampled negative indices for each \ + image. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + gt_labels = gt_instances.labels + gt_masks = gt_instances.masks + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + # shape (num_queries, num_points) + mask_points_pred = point_sample( + mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, + 1)).squeeze(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample( + gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, + 1)).squeeze(1) + + sampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_points_masks) + sampled_pred_instances = InstanceData( + scores=cls_score, masks=mask_points_pred) + # assign and sample + assign_result = self.assigner.assign( + pred_instances=sampled_pred_instances, + gt_instances=sampled_gt_instances, + img_meta=img_meta) + pred_instances = InstanceData(scores=cls_score, masks=mask_pred) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones((self.num_queries, )) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds, sampling_result) + + def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. 
+ mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single \ + decoder layer. + """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + avg_factor) = self.get_targets(cls_scores_list, mask_preds_list, + batch_gt_instances, batch_img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.unsqueeze(1), None, self.num_points, + self.oversample_ratio, self.importance_sample_ratio) + # shape (num_total_gts, h, w) -> (num_total_gts, num_points) + mask_point_targets = point_sample( + mask_targets.unsqueeze(1).float(), points_coords).squeeze(1) + # shape (num_queries, h, w) -> (num_queries, num_points) + mask_point_preds = point_sample( + mask_preds.unsqueeze(1), points_coords).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_point_preds, mask_point_targets, avg_factor=num_total_masks) + + # mask loss + # shape (num_queries, num_points) -> (num_queries * num_points, ) + mask_point_preds = mask_point_preds.reshape(-1) + # shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) + mask_point_targets = mask_point_targets.reshape(-1) + loss_mask = self.loss_mask( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks * self.num_points) + + return loss_cls, loss_mask, loss_dice + + def _forward_head(self, decoder_out: Tensor, mask_feature: Tensor, + attn_mask_target_size: Tuple[int, int]) -> Tuple[Tensor]: + """Forward for head part which is called after every decoder layer. + + Args: + decoder_out (Tensor): in shape (batch_size, num_queries, c). + mask_feature (Tensor): in shape (batch_size, c, h, w). + attn_mask_target_size (tuple[int, int]): target attention + mask size. + + Returns: + tuple: A tuple contain three elements. + + - cls_pred (Tensor): Classification scores in shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should includes background. + - mask_pred (Tensor): Mask scores in shape \ + (batch_size, num_queries,h, w). 
+ - attn_mask (Tensor): Attention mask in shape \ + (batch_size * num_heads, num_queries, h, w). + """ + decoder_out = self.transformer_decoder.post_norm(decoder_out) + # shape (num_queries, batch_size, c) + cls_pred = self.cls_embed(decoder_out) + # shape (num_queries, batch_size, c) + mask_embed = self.mask_embed(decoder_out) + # shape (num_queries, batch_size, h, w) + mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature) + attn_mask = F.interpolate( + mask_pred, + attn_mask_target_size, + mode='bilinear', + align_corners=False) + # shape (num_queries, batch_size, h, w) -> + # (batch_size * num_head, num_queries, h, w) + attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat( + (1, self.num_heads, 1, 1)).flatten(0, 1) + attn_mask = attn_mask.sigmoid() < 0.5 + attn_mask = attn_mask.detach() + + return cls_pred, mask_pred, attn_mask + + def forward(self, x: List[Tensor], + batch_data_samples: SampleList) -> Tuple[List[Tensor]]: + """Forward function. + + Args: + x (list[Tensor]): Multi scale Features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + tuple[list[Tensor]]: A tuple contains two elements. + + - cls_pred_list (list[Tensor)]: Classification logits \ + for each decoder layer. Each is a 3D-tensor with shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should includes background. + - mask_pred_list (list[Tensor]): Mask logits for each \ + decoder layer. Each with shape (batch_size, num_queries, \ + h, w). + """ + batch_size = x[0].shape[0] + mask_features, multi_scale_memorys = self.pixel_decoder(x) + # multi_scale_memorys (from low resolution to high resolution) + decoder_inputs = [] + decoder_positional_encodings = [] + for i in range(self.num_transformer_feat_level): + decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i]) + # shape (batch_size, c, h, w) -> (batch_size, h*w, c) + decoder_input = decoder_input.flatten(2).permute(0, 2, 1) + level_embed = self.level_embed.weight[i].view(1, 1, -1) + decoder_input = decoder_input + level_embed + # shape (batch_size, c, h, w) -> (batch_size, h*w, c) + mask = decoder_input.new_zeros( + (batch_size, ) + multi_scale_memorys[i].shape[-2:], + dtype=torch.bool) + decoder_positional_encoding = self.decoder_positional_encoding( + mask) + decoder_positional_encoding = decoder_positional_encoding.flatten( + 2).permute(0, 2, 1) + decoder_inputs.append(decoder_input) + decoder_positional_encodings.append(decoder_positional_encoding) + # shape (num_queries, c) -> (batch_size, num_queries, c) + query_feat = self.query_feat.weight.unsqueeze(0).repeat( + (batch_size, 1, 1)) + query_embed = self.query_embed.weight.unsqueeze(0).repeat( + (batch_size, 1, 1)) + + cls_pred_list = [] + mask_pred_list = [] + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[0].shape[-2:]) + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + for i in range(self.num_transformer_decoder_layers): + level_idx = i % self.num_transformer_feat_level + # if a mask is all True(all background), then set it all False. 
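+            # attn_mask is True where attention should be blocked: it was
+            # built in `_forward_head` as `sigmoid() < 0.5`, i.e. locations
+            # the query currently predicts as background. If every location
+            # of a query is blocked, cross-attention would see no valid key,
+            # so such rows are reset to all False (attend everywhere).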
+ mask_sum = (attn_mask.sum(-1) != attn_mask.shape[-1]).unsqueeze(-1) + attn_mask = attn_mask & mask_sum + # cross_attn + self_attn + layer = self.transformer_decoder.layers[i] + query_feat = layer( + query=query_feat, + key=decoder_inputs[level_idx], + value=decoder_inputs[level_idx], + query_pos=query_embed, + key_pos=decoder_positional_encodings[level_idx], + cross_attn_mask=attn_mask, + query_key_padding_mask=None, + # here we do not apply masking on padded region + key_padding_mask=None) + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[ + (i + 1) % self.num_transformer_feat_level].shape[-2:]) + + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + return cls_pred_list, mask_pred_list diff --git a/mmdetection/mmdet/models/dense_heads/maskformer_head.py b/mmdetection/mmdet/models/dense_heads/maskformer_head.py new file mode 100644 index 0000000..24c0655 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/maskformer_head.py @@ -0,0 +1,601 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d +from mmengine.model import caffe2_xavier_init +from mmengine.structures import InstanceData, PixelData +from torch import Tensor + +from mmdet.models.layers.pixel_decoder import PixelDecoder +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptMultiConfig, reduce_mean) +from ..layers import DetrTransformerDecoder, SinePositionalEncoding +from ..utils import multi_apply, preprocess_panoptic_gt +from .anchor_free_head import AnchorFreeHead + + +@MODELS.register_module() +class MaskFormerHead(AnchorFreeHead): + """Implements the MaskFormer head. + + See `Per-Pixel Classification is Not All You Need for Semantic + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for feature. + out_channels (int): Number of channels for output. + num_things_classes (int): Number of things. + num_stuff_classes (int): Number of stuff. + num_queries (int): Number of query in Transformer. + pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel + decoder. + enforce_decoder_input_project (bool): Whether to add a layer + to change the embed_dim of transformer encoder in pixel decoder to + the embed_dim of transformer decoder. Defaults to False. + transformer_decoder (:obj:`ConfigDict` or dict): Config for + transformer decoder. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer decoder position encoding. + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to `CrossEntropyLoss`. + loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss. + Defaults to `FocalLoss`. + loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss. + Defaults to `DiceLoss`. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + MaskFormer head. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + MaskFormer head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + feat_channels: int, + out_channels: int, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + num_queries: int = 100, + pixel_decoder: ConfigType = ..., + enforce_decoder_input_project: bool = False, + transformer_decoder: ConfigType = ..., + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + class_weight=[1.0] * 133 + [0.1]), + loss_mask: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=20.0), + loss_dice: ConfigType = dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + naive_dice=True, + loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(AnchorFreeHead, self).__init__(init_cfg=init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = self.num_things_classes + self.num_stuff_classes + self.num_queries = num_queries + + pixel_decoder.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = MODELS.build(pixel_decoder) + self.transformer_decoder = DetrTransformerDecoder( + **transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + if type(self.pixel_decoder) == PixelDecoder and ( + self.decoder_embed_dims != in_channels[-1] + or enforce_decoder_input_project): + self.decoder_input_proj = Conv2d( + in_channels[-1], self.decoder_embed_dims, kernel_size=1) + else: + self.decoder_input_proj = nn.Identity() + self.decoder_pe = SinePositionalEncoding(**positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, out_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = TASK_UTILS.build(train_cfg['assigner']) + self.sampler = TASK_UTILS.build( + train_cfg['sampler'], default_args=dict(context=self)) + + self.class_weight = loss_cls.class_weight + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.loss_dice = MODELS.build(loss_dice) + + def init_weights(self) -> None: + if isinstance(self.decoder_input_proj, Conv2d): + caffe2_xavier_init(self.decoder_input_proj, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def preprocess_gt( + self, batch_gt_instances: InstanceList, + batch_gt_semantic_segs: List[Optional[PixelData]]) -> InstanceList: + """Preprocess the ground truth for all images. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + ground truth labels of each bbox, with shape (num_gts, ) + and ``masks``, each is ground truth masks of each instances + of a image, shape (num_gts, h, w). + gt_semantic_seg (list[Optional[PixelData]]): Ground truth of + semantic segmentation, each with the shape (1, h, w). + [0, num_thing_class - 1] means things, + [num_thing_class, num_class-1] means stuff, + 255 means VOID. It's None when training instance segmentation. 
+ + Returns: + list[obj:`InstanceData`]: each contains the following keys + + - labels (Tensor): Ground truth class indices\ + for a image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in a image. + - masks (Tensor): Ground truth mask for a\ + image, with shape (n, h, w). + """ + num_things_list = [self.num_things_classes] * len(batch_gt_instances) + num_stuff_list = [self.num_stuff_classes] * len(batch_gt_instances) + gt_labels_list = [ + gt_instances['labels'] for gt_instances in batch_gt_instances + ] + gt_masks_list = [ + gt_instances['masks'] for gt_instances in batch_gt_instances + ] + gt_semantic_segs = [ + None if gt_semantic_seg is None else gt_semantic_seg.sem_seg + for gt_semantic_seg in batch_gt_semantic_segs + ] + targets = multi_apply(preprocess_panoptic_gt, gt_labels_list, + gt_masks_list, gt_semantic_segs, num_things_list, + num_stuff_list) + labels, masks = targets + batch_gt_instances = [ + InstanceData(labels=label, masks=mask) + for label, mask in zip(labels, masks) + ] + return batch_gt_instances + + def get_targets( + self, + cls_scores_list: List[Tensor], + mask_preds_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + return_sampling_results: bool = False + ) -> Tuple[List[Union[Tensor, int]]]: + """Compute classification and mask targets for all images for a decoder + layer. + + Args: + cls_scores_list (list[Tensor]): Mask score logits from a single + decoder layer for all images. Each with shape (num_queries, + cls_out_channels). + mask_preds_list (list[Tensor]): Mask logits from a single decoder + layer for all images. Each with shape (num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + return_sampling_results (bool): Whether to return the sampling + results. Defaults to False. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels of all images.\ + Each with shape (num_queries, ). + - label_weights_list (list[Tensor]): Label weights\ + of all images. Each with shape (num_queries, ). + - mask_targets_list (list[Tensor]): Mask targets of\ + all images. Each with shape (num_queries, h, w). + - mask_weights_list (list[Tensor]): Mask weights of\ + all images. Each with shape (num_queries, ). + - avg_factor (int): Average factor that is used to average\ + the loss. When using sampling method, avg_factor is + usually the sum of positive and negative priors. When + using `MaskPseudoSampler`, `avg_factor` is usually equal + to the number of positive priors. + + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated after the end. 
+ """ + results = multi_apply(self._get_targets_single, cls_scores_list, + mask_preds_list, batch_gt_instances, + batch_img_metas) + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + pos_inds_list, neg_inds_list, sampling_results_list) = results[:7] + rest_results = list(results[7:]) + + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + + res = (labels_list, label_weights_list, mask_targets_list, + mask_weights_list, avg_factor) + if return_sampling_results: + res = res + (sampling_results_list) + + return res + tuple(rest_results) + + def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> Tuple[Tensor]: + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_instances (:obj:`InstanceData`): It contains ``labels`` and + ``masks``. + img_meta (dict): Image informtation. + + Returns: + tuple: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + gt_masks = gt_instances.masks + gt_labels = gt_instances.labels + + target_shape = mask_pred.shape[-2:] + if gt_masks.shape[0] > 0: + gt_masks_downsampled = F.interpolate( + gt_masks.unsqueeze(1).float(), target_shape, + mode='nearest').squeeze(1).long() + else: + gt_masks_downsampled = gt_masks + + pred_instances = InstanceData(scores=cls_score, masks=mask_pred) + downsampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_masks_downsampled) + # assign and sample + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=downsampled_gt_instances, + img_meta=img_meta) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones(self.num_queries) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds, sampling_result) + + def loss_by_feat(self, all_cls_scores: Tensor, all_mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_cls_scores (Tensor): Classification scores for all decoder + layers with shape (num_decoder, batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. 
+ all_mask_preds (Tensor): Mask scores for all decoder layers with + shape (num_decoder, batch_size, num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_dec_layers = len(all_cls_scores) + batch_gt_instances_list = [ + batch_gt_instances for _ in range(num_dec_layers) + ] + img_metas_list = [batch_img_metas for _ in range(num_dec_layers)] + losses_cls, losses_mask, losses_dice = multi_apply( + self._loss_by_feat_single, all_cls_scores, all_mask_preds, + batch_gt_instances_list, img_metas_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_mask'] = losses_mask[-1] + loss_dict['loss_dice'] = losses_dice[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_mask_i, loss_dice_i in zip( + losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i + loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i + num_dec_layer += 1 + return loss_dict + + def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single decoder\ + layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + avg_factor) = self.get_targets(cls_scores_list, mask_preds_list, + batch_gt_instances, batch_img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + target_shape = mask_targets.shape[-2:] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + # upsample to shape of target + # shape (num_total_gts, h, w) + mask_preds = F.interpolate( + mask_preds.unsqueeze(1), + target_shape, + mode='bilinear', + align_corners=False).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_preds, mask_targets, avg_factor=num_total_masks) + + # mask loss + # FocalLoss support input of shape (n, num_class) + h, w = mask_preds.shape[-2:] + # shape (num_total_gts, h, w) -> (num_total_gts * h * w, 1) + mask_preds = mask_preds.reshape(-1, 1) + # shape (num_total_gts, h, w) -> (num_total_gts * h * w) + mask_targets = mask_targets.reshape(-1) + # target is (1 - mask_targets) !!! + loss_mask = self.loss_mask( + mask_preds, 1 - mask_targets, avg_factor=num_total_masks * h * w) + + return loss_cls, loss_mask, loss_dice + + def forward(self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> Tuple[Tensor]: + """Forward function. + + Args: + x (tuple[Tensor]): Features from the upstream network, each + is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + tuple[Tensor]: a tuple contains two elements. + + - all_cls_scores (Tensor): Classification scores for each\ + scale level. Each is a 4D-tensor with shape\ + (num_decoder, batch_size, num_queries, cls_out_channels).\ + Note `cls_out_channels` should includes background. + - all_mask_preds (Tensor): Mask scores for each decoder\ + layer. Each with shape (num_decoder, batch_size,\ + num_queries, h, w). 
+ """ + batch_img_metas = [ + data_sample.metainfo for data_sample in batch_data_samples + ] + batch_size = x[0].shape[0] + input_img_h, input_img_w = batch_img_metas[0]['batch_input_shape'] + padding_mask = x[-1].new_ones((batch_size, input_img_h, input_img_w), + dtype=torch.float32) + for i in range(batch_size): + img_h, img_w = batch_img_metas[i]['img_shape'] + padding_mask[i, :img_h, :img_w] = 0 + padding_mask = F.interpolate( + padding_mask.unsqueeze(1), size=x[-1].shape[-2:], + mode='nearest').to(torch.bool).squeeze(1) + # when backbone is swin, memory is output of last stage of swin. + # when backbone is r50, memory is output of tranformer encoder. + mask_features, memory = self.pixel_decoder(x, batch_img_metas) + pos_embed = self.decoder_pe(padding_mask) + memory = self.decoder_input_proj(memory) + # shape (batch_size, c, h, w) -> (batch_size, h*w, c) + memory = memory.flatten(2).permute(0, 2, 1) + pos_embed = pos_embed.flatten(2).permute(0, 2, 1) + # shape (batch_size, h * w) + padding_mask = padding_mask.flatten(1) + # shape = (num_queries, embed_dims) + query_embed = self.query_embed.weight + # shape = (batch_size, num_queries, embed_dims) + query_embed = query_embed.unsqueeze(0).repeat(batch_size, 1, 1) + target = torch.zeros_like(query_embed) + # shape (num_decoder, num_queries, batch_size, embed_dims) + out_dec = self.transformer_decoder( + query=target, + key=memory, + value=memory, + query_pos=query_embed, + key_pos=pos_embed, + key_padding_mask=padding_mask) + + # cls_scores + all_cls_scores = self.cls_embed(out_dec) + + # mask_preds + mask_embed = self.mask_embed(out_dec) + all_mask_preds = torch.einsum('lbqc,bchw->lbqhw', mask_embed, + mask_features) + + return all_cls_scores, all_mask_preds + + def loss( + self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + ) -> Dict[str, Tensor]: + """Perform forward propagation and loss calculation of the panoptic + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + batch_img_metas = [] + batch_gt_instances = [] + batch_gt_semantic_segs = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + if 'gt_sem_seg' in data_sample: + batch_gt_semantic_segs.append(data_sample.gt_sem_seg) + else: + batch_gt_semantic_segs.append(None) + + # forward + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + + # preprocess ground truth + batch_gt_instances = self.preprocess_gt(batch_gt_instances, + batch_gt_semantic_segs) + + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> Tuple[Tensor]: + """Test without augmentaton. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two tensors. + + - mask_cls_results (Tensor): Mask classification logits,\ + shape (batch_size, num_queries, cls_out_channels). 
+ Note `cls_out_channels` should includes background. + - mask_pred_results (Tensor): Mask logits, shape \ + (batch_size, num_queries, h, w). + """ + batch_img_metas = [ + data_sample.metainfo for data_sample in batch_data_samples + ] + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + + # upsample masks + img_shape = batch_img_metas[0]['batch_input_shape'] + mask_pred_results = F.interpolate( + mask_pred_results, + size=(img_shape[0], img_shape[1]), + mode='bilinear', + align_corners=False) + + return mask_cls_results, mask_pred_results diff --git a/mmdetection/mmdet/models/dense_heads/nasfcos_head.py b/mmdetection/mmdet/models/dense_heads/nasfcos_head.py new file mode 100644 index 0000000..14ee62a --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/nasfcos_head.py @@ -0,0 +1,114 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale + +from mmdet.models.dense_heads.fcos_head import FCOSHead +from mmdet.registry import MODELS +from mmdet.utils import OptMultiConfig + + +@MODELS.register_module() +class NASFCOSHead(FCOSHead): + """Anchor-free head used in `NASFCOS `_. + + It is quite similar with FCOS head, except for the searched structure of + classification branch and bbox regression branch, where a structure of + "dconv3x3, conv3x3, dconv3x3, conv1x1" is utilized instead. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + strides (Sequence[int] or Sequence[Tuple[int, int]]): Strides of points + in multiple feature levels. Defaults to (4, 8, 16, 32, 64). + regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. + Defaults to False. + center_sample_radius (float): Radius of center sampling. + Defaults to 1.5. + norm_on_bbox (bool): If true, normalize the regression targets with + FPN strides. Defaults to False. + centerness_on_reg (bool): If true, position centerness on the + regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. + Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Defaults to "auto". + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_centerness (:obj:`ConfigDict`, or dict): Config of centerness + loss. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. Defaults to + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], opitonal): Initialization config dict. 
+ """ # noqa: E501 + + def __init__(self, + *args, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + if init_cfg is None: + init_cfg = [ + dict(type='Caffe2Xavier', layer=['ConvModule', 'Conv2d']), + dict( + type='Normal', + std=0.01, + override=[ + dict(name='conv_reg'), + dict(name='conv_centerness'), + dict( + name='conv_cls', + type='Normal', + std=0.01, + bias_prob=0.01) + ]), + ] + super().__init__(*args, init_cfg=init_cfg, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + dconv3x3_config = dict( + type='DCNv2', + kernel_size=3, + use_bias=True, + deform_groups=2, + padding=1) + conv3x3_config = dict(type='Conv', kernel_size=3, padding=1) + conv1x1_config = dict(type='Conv', kernel_size=1) + + self.arch_config = [ + dconv3x3_config, conv3x3_config, dconv3x3_config, conv1x1_config + ] + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i, op_ in enumerate(self.arch_config): + op = copy.deepcopy(op_) + chn = self.in_channels if i == 0 else self.feat_channels + assert isinstance(op, dict) + use_bias = op.pop('use_bias', False) + padding = op.pop('padding', 0) + kernel_size = op.pop('kernel_size') + module = ConvModule( + chn, + self.feat_channels, + kernel_size, + stride=1, + padding=padding, + norm_cfg=self.norm_cfg, + bias=use_bias, + conv_cfg=op) + + self.cls_convs.append(copy.deepcopy(module)) + self.reg_convs.append(copy.deepcopy(module)) + + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) + + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) diff --git a/mmdetection/mmdet/models/dense_heads/paa_head.py b/mmdetection/mmdet/models/dense_heads/paa_head.py new file mode 100644 index 0000000..3c1f453 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/paa_head.py @@ -0,0 +1,730 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import numpy as np +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList) +from ..layers import multiclass_nms +from ..utils import levels_to_images, multi_apply +from . import ATSSHead + +EPS = 1e-12 +try: + import sklearn.mixture as skm +except ImportError: + skm = None + + +@MODELS.register_module() +class PAAHead(ATSSHead): + """Head of PAAAssignment: Probabilistic Anchor Assignment with IoU + Prediction for Object Detection. + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . + + Args: + topk (int): Select topk samples with smallest loss in + each level. + score_voting (bool): Whether to use score voting in post-process. + covariance_type : String describing the type of covariance parameters + to be used in :class:`sklearn.mixture.GaussianMixture`. + It must be one of: + + - 'full': each component has its own general covariance matrix + - 'tied': all components share the same general covariance matrix + - 'diag': each component has its own diagonal covariance matrix + - 'spherical': each component has its own single variance + Default: 'diag'. From 'full' to 'spherical', the gmm fitting + process is faster yet the performance could be influenced. For most + cases, 'diag' should be a good choice. 
+ """ + + def __init__(self, + *args, + topk: int = 9, + score_voting: bool = True, + covariance_type: str = 'diag', + **kwargs): + # topk used in paa reassign process + self.topk = topk + self.with_score_voting = score_voting + self.covariance_type = covariance_type + super().__init__(*args, **kwargs) + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + iou_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + iou_preds (list[Tensor]): iou_preds for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss gmm_assignment. + """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + ) + (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds, + pos_gt_index) = cls_reg_targets + cls_scores = levels_to_images(cls_scores) + cls_scores = [ + item.reshape(-1, self.cls_out_channels) for item in cls_scores + ] + bbox_preds = levels_to_images(bbox_preds) + bbox_preds = [item.reshape(-1, 4) for item in bbox_preds] + iou_preds = levels_to_images(iou_preds) + iou_preds = [item.reshape(-1, 1) for item in iou_preds] + pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list, + cls_scores, bbox_preds, labels, + labels_weight, bboxes_target, + bboxes_weight, pos_inds) + + with torch.no_grad(): + reassign_labels, reassign_label_weight, \ + reassign_bbox_weights, num_pos = multi_apply( + self.paa_reassign, + pos_losses_list, + labels, + labels_weight, + bboxes_weight, + pos_inds, + pos_gt_index, + anchor_list) + num_pos = sum(num_pos) + # convert all tensor list to a flatten tensor + cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1)) + bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1)) + iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1)) + labels = torch.cat(reassign_labels, 0).view(-1) + flatten_anchors = torch.cat( + [torch.cat(item, 0) for item in anchor_list]) + labels_weight = torch.cat(reassign_label_weight, 0).view(-1) + bboxes_target = torch.cat(bboxes_target, + 0).view(-1, bboxes_target[0].size(-1)) + + pos_inds_flatten = ((labels >= 0) + & + (labels < self.num_classes)).nonzero().reshape(-1) + + losses_cls = self.loss_cls( + cls_scores, + labels, + labels_weight, + avg_factor=max(num_pos, len(batch_img_metas))) # avoid num_pos=0 + 
if num_pos: + pos_bbox_pred = self.bbox_coder.decode( + flatten_anchors[pos_inds_flatten], + bbox_preds[pos_inds_flatten]) + pos_bbox_target = bboxes_target[pos_inds_flatten] + iou_target = bbox_overlaps( + pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True) + losses_iou = self.loss_centerness( + iou_preds[pos_inds_flatten], + iou_target.unsqueeze(-1), + avg_factor=num_pos) + losses_bbox = self.loss_bbox( + pos_bbox_pred, + pos_bbox_target, + iou_target.clamp(min=EPS), + avg_factor=iou_target.sum()) + else: + losses_iou = iou_preds.sum() * 0 + losses_bbox = bbox_preds.sum() * 0 + + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou) + + def get_pos_loss(self, anchors: List[Tensor], cls_score: Tensor, + bbox_pred: Tensor, label: Tensor, label_weight: Tensor, + bbox_target: dict, bbox_weight: Tensor, + pos_inds: Tensor) -> Tensor: + """Calculate loss of all potential positive samples obtained from first + match process. + + Args: + anchors (list[Tensor]): Anchors of each scale. + cls_score (Tensor): Box scores of single image with shape + (num_anchors, num_classes) + bbox_pred (Tensor): Box energies / deltas of single image + with shape (num_anchors, 4) + label (Tensor): classification target of each anchor with + shape (num_anchors,) + label_weight (Tensor): Classification loss weight of each + anchor with shape (num_anchors). + bbox_target (dict): Regression target of each anchor with + shape (num_anchors, 4). + bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + pos_inds (Tensor): Index of all positive samples got from + first assign process. + + Returns: + Tensor: Losses of all positive samples in single image. + """ + if not len(pos_inds): + return cls_score.new([]), + anchors_all_level = torch.cat(anchors, 0) + pos_scores = cls_score[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_label = label[pos_inds] + pos_label_weight = label_weight[pos_inds] + pos_bbox_target = bbox_target[pos_inds] + pos_bbox_weight = bbox_weight[pos_inds] + pos_anchors = anchors_all_level[pos_inds] + pos_bbox_pred = self.bbox_coder.decode(pos_anchors, pos_bbox_pred) + + # to keep loss dimension + loss_cls = self.loss_cls( + pos_scores, + pos_label, + pos_label_weight, + avg_factor=1.0, + reduction_override='none') + + loss_bbox = self.loss_bbox( + pos_bbox_pred, + pos_bbox_target, + pos_bbox_weight, + avg_factor=1.0, # keep same loss weight before reassign + reduction_override='none') + + loss_cls = loss_cls.sum(-1) + pos_loss = loss_bbox + loss_cls + return pos_loss, + + def paa_reassign(self, pos_losses: Tensor, label: Tensor, + label_weight: Tensor, bbox_weight: Tensor, + pos_inds: Tensor, pos_gt_inds: Tensor, + anchors: List[Tensor]) -> tuple: + """Fit loss to GMM distribution and separate positive, ignore, negative + samples again with GMM model. + + Args: + pos_losses (Tensor): Losses of all positive samples in + single image. + label (Tensor): classification target of each anchor with + shape (num_anchors,) + label_weight (Tensor): Classification loss weight of each + anchor with shape (num_anchors). + bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + pos_inds (Tensor): Index of all positive samples got from + first assign process. + pos_gt_inds (Tensor): Gt_index of all positive samples got + from first assign process. + anchors (list[Tensor]): Anchors of each scale. + + Returns: + tuple: Usually returns a tuple containing learning targets. 
+ + - label (Tensor): classification target of each anchor after + paa assign, with shape (num_anchors,) + - label_weight (Tensor): Classification loss weight of each + anchor after paa assign, with shape (num_anchors). + - bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + - num_pos (int): The number of positive samples after paa + assign. + """ + if not len(pos_inds): + return label, label_weight, bbox_weight, 0 + label = label.clone() + label_weight = label_weight.clone() + bbox_weight = bbox_weight.clone() + num_gt = pos_gt_inds.max() + 1 + num_level = len(anchors) + num_anchors_each_level = [item.size(0) for item in anchors] + num_anchors_each_level.insert(0, 0) + inds_level_interval = np.cumsum(num_anchors_each_level) + pos_level_mask = [] + for i in range(num_level): + mask = (pos_inds >= inds_level_interval[i]) & ( + pos_inds < inds_level_interval[i + 1]) + pos_level_mask.append(mask) + pos_inds_after_paa = [label.new_tensor([])] + ignore_inds_after_paa = [label.new_tensor([])] + for gt_ind in range(num_gt): + pos_inds_gmm = [] + pos_loss_gmm = [] + gt_mask = pos_gt_inds == gt_ind + for level in range(num_level): + level_mask = pos_level_mask[level] + level_gt_mask = level_mask & gt_mask + value, topk_inds = pos_losses[level_gt_mask].topk( + min(level_gt_mask.sum(), self.topk), largest=False) + pos_inds_gmm.append(pos_inds[level_gt_mask][topk_inds]) + pos_loss_gmm.append(value) + pos_inds_gmm = torch.cat(pos_inds_gmm) + pos_loss_gmm = torch.cat(pos_loss_gmm) + # fix gmm need at least two sample + if len(pos_inds_gmm) < 2: + continue + device = pos_inds_gmm.device + pos_loss_gmm, sort_inds = pos_loss_gmm.sort() + pos_inds_gmm = pos_inds_gmm[sort_inds] + pos_loss_gmm = pos_loss_gmm.view(-1, 1).cpu().numpy() + min_loss, max_loss = pos_loss_gmm.min(), pos_loss_gmm.max() + means_init = np.array([min_loss, max_loss]).reshape(2, 1) + weights_init = np.array([0.5, 0.5]) + precisions_init = np.array([1.0, 1.0]).reshape(2, 1, 1) # full + if self.covariance_type == 'spherical': + precisions_init = precisions_init.reshape(2) + elif self.covariance_type == 'diag': + precisions_init = precisions_init.reshape(2, 1) + elif self.covariance_type == 'tied': + precisions_init = np.array([[1.0]]) + if skm is None: + raise ImportError('Please run "pip install sklearn" ' + 'to install sklearn first.') + gmm = skm.GaussianMixture( + 2, + weights_init=weights_init, + means_init=means_init, + precisions_init=precisions_init, + covariance_type=self.covariance_type) + gmm.fit(pos_loss_gmm) + gmm_assignment = gmm.predict(pos_loss_gmm) + scores = gmm.score_samples(pos_loss_gmm) + gmm_assignment = torch.from_numpy(gmm_assignment).to(device) + scores = torch.from_numpy(scores).to(device) + + pos_inds_temp, ignore_inds_temp = self.gmm_separation_scheme( + gmm_assignment, scores, pos_inds_gmm) + pos_inds_after_paa.append(pos_inds_temp) + ignore_inds_after_paa.append(ignore_inds_temp) + + pos_inds_after_paa = torch.cat(pos_inds_after_paa) + ignore_inds_after_paa = torch.cat(ignore_inds_after_paa) + reassign_mask = (pos_inds.unsqueeze(1) != pos_inds_after_paa).all(1) + reassign_ids = pos_inds[reassign_mask] + label[reassign_ids] = self.num_classes + label_weight[ignore_inds_after_paa] = 0 + bbox_weight[reassign_ids] = 0 + num_pos = len(pos_inds_after_paa) + return label, label_weight, bbox_weight, num_pos + + def gmm_separation_scheme(self, gmm_assignment: Tensor, scores: Tensor, + pos_inds_gmm: Tensor) -> Tuple[Tensor, Tensor]: + """A general separation scheme for gmm model. 
+ + It separates a GMM distribution of candidate samples into three + parts, 0 1 and uncertain areas, and you can implement other + separation schemes by rewriting this function. + + Args: + gmm_assignment (Tensor): The prediction of GMM which is of shape + (num_samples,). The 0/1 value indicates the distribution + that each sample comes from. + scores (Tensor): The probability of sample coming from the + fit GMM distribution. The tensor is of shape (num_samples,). + pos_inds_gmm (Tensor): All the indexes of samples which are used + to fit GMM model. The tensor is of shape (num_samples,) + + Returns: + tuple[Tensor, Tensor]: The indices of positive and ignored samples. + + - pos_inds_temp (Tensor): Indices of positive samples. + - ignore_inds_temp (Tensor): Indices of ignore samples. + """ + # The implementation is (c) in Fig.3 in origin paper instead of (b). + # You can refer to issues such as + # https://github.com/kkhoot/PAA/issues/8 and + # https://github.com/kkhoot/PAA/issues/9. + fgs = gmm_assignment == 0 + pos_inds_temp = fgs.new_tensor([], dtype=torch.long) + ignore_inds_temp = fgs.new_tensor([], dtype=torch.long) + if fgs.nonzero().numel(): + _, pos_thr_ind = scores[fgs].topk(1) + pos_inds_temp = pos_inds_gmm[fgs][:pos_thr_ind + 1] + ignore_inds_temp = pos_inds_gmm.new_tensor([]) + return pos_inds_temp, ignore_inds_temp + + def get_targets(self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Get targets for PAA head. + + This method is almost the same as `AnchorHead.get_targets()`. We direct + return the results from _get_targets_single instead map it to levels + by images_to_levels function. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - labels (list[Tensor]): Labels of all anchors, each with + shape (num_anchors,). + - label_weights (list[Tensor]): Label weights of all anchor. + each with shape (num_anchors,). + - bbox_targets (list[Tensor]): BBox targets of all anchors. + each with shape (num_anchors, 4). + - bbox_weights (list[Tensor]): BBox weights of all anchors. + each with shape (num_anchors, 4). + - pos_inds (list[Tensor]): Contains all index of positive + sample in all anchor. + - gt_inds (list[Tensor]): Contains all gt_index of positive + sample in all anchor. 
+ """ + + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + concat_anchor_list = [] + concat_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + concat_anchor_list.append(torch.cat(anchor_list[i])) + concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + results = multi_apply( + self._get_targets_single, + concat_anchor_list, + concat_valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + + (labels, label_weights, bbox_targets, bbox_weights, valid_pos_inds, + valid_neg_inds, sampling_result) = results + + # Due to valid flag of anchors, we have to calculate the real pos_inds + # in origin anchor set. + pos_inds = [] + for i, single_labels in enumerate(labels): + pos_mask = (0 <= single_labels) & ( + single_labels < self.num_classes) + pos_inds.append(pos_mask.nonzero().view(-1)) + + gt_inds = [item.pos_assigned_gt_inds for item in sampling_result] + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + gt_inds) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in a + single image. + + This method is same as `AnchorHead._get_targets_single()`. + """ + assert unmap_outputs, 'We must map outputs back to the original' \ + 'set of anchors in PAAhead' + return super(ATSSHead, self)._get_targets_single( + flat_anchors, + valid_flags, + gt_instances, + img_meta, + gt_instances_ignore, + unmap_outputs=True) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + score_factors: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: OptConfigType = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + This method is same as `BaseDenseHead.get_results()`. + """ + assert with_nms, 'PAA only supports "with_nms=True" now and it ' \ + 'means PAAHead does not support ' \ + 'test-time augmentation' + return super().predict_by_feat( + cls_scores=cls_scores, + bbox_preds=bbox_preds, + score_factors=score_factors, + batch_img_metas=batch_img_metas, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: OptConfigType = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factors from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). 
+ mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict` or dict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_score_factors = [] + for level_idx, (cls_score, bbox_pred, score_factor, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + score_factor_list, mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid() + + if 0 < nms_pre < scores.shape[0]: + max_scores, _ = (scores * + score_factor[:, None]).sqrt().max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + priors = priors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + score_factor = score_factor[topk_inds] + + bboxes = self.bbox_coder.decode( + priors, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_score_factors.append(score_factor) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.score_factors = torch.cat(mlvl_score_factors) + + return self._bbox_post_process(results, cfg, rescale, with_nms, + img_meta) + + def _bbox_post_process(self, + results: InstanceData, + cfg: ConfigType, + rescale: bool = False, + with_nms: bool = True, + img_meta: Optional[dict] = None): + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually with_nms is False is used for aug test. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). + cfg (:obj:`ConfigDict` or dict): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + if rescale: + results.bboxes /= results.bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = results.scores.new_zeros(results.scores.shape[0], 1) + mlvl_scores = torch.cat([results.scores, padding], dim=1) + + mlvl_nms_scores = (mlvl_scores * results.score_factors[:, None]).sqrt() + det_bboxes, det_labels = multiclass_nms( + results.bboxes, + mlvl_nms_scores, + cfg.score_thr, + cfg.nms, + cfg.max_per_img, + score_factors=None) + if self.with_score_voting and len(det_bboxes) > 0: + det_bboxes, det_labels = self.score_voting(det_bboxes, det_labels, + results.bboxes, + mlvl_nms_scores, + cfg.score_thr) + nms_results = InstanceData() + nms_results.bboxes = det_bboxes[:, :-1] + nms_results.scores = det_bboxes[:, -1] + nms_results.labels = det_labels + return nms_results + + def score_voting(self, det_bboxes: Tensor, det_labels: Tensor, + mlvl_bboxes: Tensor, mlvl_nms_scores: Tensor, + score_thr: float) -> Tuple[Tensor, Tensor]: + """Implementation of score voting method works on each remaining boxes + after NMS procedure. + + Args: + det_bboxes (Tensor): Remaining boxes after NMS procedure, + with shape (k, 5), each dimension means + (x1, y1, x2, y2, score). + det_labels (Tensor): The label of remaining boxes, with shape + (k, 1),Labels are 0-based. + mlvl_bboxes (Tensor): All boxes before the NMS procedure, + with shape (num_anchors,4). + mlvl_nms_scores (Tensor): The scores of all boxes which is used + in the NMS procedure, with shape (num_anchors, num_class) + score_thr (float): The score threshold of bboxes. + + Returns: + tuple: Usually returns a tuple containing voting results. + + - det_bboxes_voted (Tensor): Remaining boxes after + score voting procedure, with shape (k, 5), each + dimension means (x1, y1, x2, y2, score). + - det_labels_voted (Tensor): Label of remaining bboxes + after voting, with shape (num_anchors,). 
+ """ + candidate_mask = mlvl_nms_scores > score_thr + candidate_mask_nonzeros = candidate_mask.nonzero(as_tuple=False) + candidate_inds = candidate_mask_nonzeros[:, 0] + candidate_labels = candidate_mask_nonzeros[:, 1] + candidate_bboxes = mlvl_bboxes[candidate_inds] + candidate_scores = mlvl_nms_scores[candidate_mask] + det_bboxes_voted = [] + det_labels_voted = [] + for cls in range(self.cls_out_channels): + candidate_cls_mask = candidate_labels == cls + if not candidate_cls_mask.any(): + continue + candidate_cls_scores = candidate_scores[candidate_cls_mask] + candidate_cls_bboxes = candidate_bboxes[candidate_cls_mask] + det_cls_mask = det_labels == cls + det_cls_bboxes = det_bboxes[det_cls_mask].view( + -1, det_bboxes.size(-1)) + det_candidate_ious = bbox_overlaps(det_cls_bboxes[:, :4], + candidate_cls_bboxes) + for det_ind in range(len(det_cls_bboxes)): + single_det_ious = det_candidate_ious[det_ind] + pos_ious_mask = single_det_ious > 0.01 + pos_ious = single_det_ious[pos_ious_mask] + pos_bboxes = candidate_cls_bboxes[pos_ious_mask] + pos_scores = candidate_cls_scores[pos_ious_mask] + pis = (torch.exp(-(1 - pos_ious)**2 / 0.025) * + pos_scores)[:, None] + voted_box = torch.sum( + pis * pos_bboxes, dim=0) / torch.sum( + pis, dim=0) + voted_score = det_cls_bboxes[det_ind][-1:][None, :] + det_bboxes_voted.append( + torch.cat((voted_box[None, :], voted_score), dim=1)) + det_labels_voted.append(cls) + + det_bboxes_voted = torch.cat(det_bboxes_voted, dim=0) + det_labels_voted = det_labels.new_tensor(det_labels_voted) + return det_bboxes_voted, det_labels_voted diff --git a/mmdetection/mmdet/models/dense_heads/pisa_retinanet_head.py b/mmdetection/mmdet/models/dense_heads/pisa_retinanet_head.py new file mode 100644 index 0000000..85fd54f --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/pisa_retinanet_head.py @@ -0,0 +1,154 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptInstanceList +from ..losses import carl_loss, isr_p +from ..utils import images_to_levels +from .retina_head import RetinaHead + + +@MODELS.register_module() +class PISARetinaHead(RetinaHead): + """PISA Retinanet Head. + + The head owns the same structure with Retinanet Head, but differs in two + aspects: + 1. Importance-based Sample Reweighting Positive (ISR-P) is applied to + change the positive loss weights. + 2. Classification-aware regression loss is adopted as a third loss. + """ + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. 
+ + Returns: + dict: Loss dict, comprise classification loss, regression loss and + carl loss. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + return_sampling_results=True) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results_list) = cls_reg_targets + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + num_imgs = len(batch_img_metas) + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, label_channels) + for cls_score in cls_scores + ] + flatten_cls_scores = torch.cat( + flatten_cls_scores, dim=1).reshape(-1, + flatten_cls_scores[0].size(-1)) + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat( + flatten_bbox_preds, dim=1).view(-1, flatten_bbox_preds[0].size(-1)) + flatten_labels = torch.cat(labels_list, dim=1).reshape(-1) + flatten_label_weights = torch.cat( + label_weights_list, dim=1).reshape(-1) + flatten_anchors = torch.cat(all_anchor_list, dim=1).reshape(-1, 4) + flatten_bbox_targets = torch.cat( + bbox_targets_list, dim=1).reshape(-1, 4) + flatten_bbox_weights = torch.cat( + bbox_weights_list, dim=1).reshape(-1, 4) + + # Apply ISR-P + isr_cfg = self.train_cfg.get('isr', None) + if isr_cfg is not None: + all_targets = (flatten_labels, flatten_label_weights, + flatten_bbox_targets, flatten_bbox_weights) + with torch.no_grad(): + all_targets = isr_p( + flatten_cls_scores, + flatten_bbox_preds, + all_targets, + flatten_anchors, + sampling_results_list, + bbox_coder=self.bbox_coder, + loss_cls=self.loss_cls, + num_class=self.num_classes, + **self.train_cfg['isr']) + (flatten_labels, flatten_label_weights, flatten_bbox_targets, + flatten_bbox_weights) = all_targets + + # For convenience we compute loss once instead separating by fpn level, + # so that we don't need to separate the weights by level again. 
+ # The result should be the same + losses_cls = self.loss_cls( + flatten_cls_scores, + flatten_labels, + flatten_label_weights, + avg_factor=avg_factor) + losses_bbox = self.loss_bbox( + flatten_bbox_preds, + flatten_bbox_targets, + flatten_bbox_weights, + avg_factor=avg_factor) + loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + # CARL Loss + carl_cfg = self.train_cfg.get('carl', None) + if carl_cfg is not None: + loss_carl = carl_loss( + flatten_cls_scores, + flatten_labels, + flatten_bbox_preds, + flatten_bbox_targets, + self.loss_bbox, + **self.train_cfg['carl'], + avg_factor=avg_factor, + sigmoid=True, + num_class=self.num_classes) + loss_dict.update(loss_carl) + + return loss_dict diff --git a/mmdetection/mmdet/models/dense_heads/pisa_ssd_head.py b/mmdetection/mmdet/models/dense_heads/pisa_ssd_head.py new file mode 100644 index 0000000..ec09cb4 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/pisa_ssd_head.py @@ -0,0 +1,182 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Union + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptInstanceList +from ..losses import CrossEntropyLoss, SmoothL1Loss, carl_loss, isr_p +from ..utils import multi_apply +from .ssd_head import SSDHead + + +# TODO: add loss evaluator for SSD +@MODELS.register_module() +class PISASSDHead(SSDHead): + """Implementation of `PISA SSD head `_ + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Sequence[int]): Number of channels in the input feature + map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Defaults to 0. + feat_channels (int): Number of hidden channels when stacked_convs + > 0. Defaults to 256. + use_depthwise (bool): Whether to use DepthwiseSeparableConv. + Defaults to False. + conv_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config conv layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config norm layer. Defaults to None. + act_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config activation layer. Defaults to None. + anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor + generator. + bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Defaults to False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of + anchor head. + test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of + anchor head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], Optional): Initialization config dict. + """ # noqa: W605 + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Union[List[Tensor], Tensor]]: + """Compute losses of the head. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Union[List[Tensor], Tensor]]: A dictionary of loss + components. the dict has components below: + + - loss_cls (list[Tensor]): A list containing each feature map \ + classification loss. + - loss_bbox (list[Tensor]): A list containing each feature map \ + regression loss. + - loss_carl (Tensor): The loss of CARL. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + unmap_outputs=False, + return_sampling_results=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results_list) = cls_reg_targets + + num_images = len(batch_img_metas) + all_cls_scores = torch.cat([ + s.permute(0, 2, 3, 1).reshape( + num_images, -1, self.cls_out_channels) for s in cls_scores + ], 1) + all_labels = torch.cat(labels_list, -1).view(num_images, -1) + all_label_weights = torch.cat(label_weights_list, + -1).view(num_images, -1) + all_bbox_preds = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) + for b in bbox_preds + ], -2) + all_bbox_targets = torch.cat(bbox_targets_list, + -2).view(num_images, -1, 4) + all_bbox_weights = torch.cat(bbox_weights_list, + -2).view(num_images, -1, 4) + + # concat all level anchors to a single tensor + all_anchors = [] + for i in range(num_images): + all_anchors.append(torch.cat(anchor_list[i])) + + isr_cfg = self.train_cfg.get('isr', None) + all_targets = (all_labels.view(-1), all_label_weights.view(-1), + all_bbox_targets.view(-1, + 4), all_bbox_weights.view(-1, 4)) + # apply ISR-P + if isr_cfg is not None: + all_targets = isr_p( + all_cls_scores.view(-1, all_cls_scores.size(-1)), + all_bbox_preds.view(-1, 4), + all_targets, + torch.cat(all_anchors), + sampling_results_list, + loss_cls=CrossEntropyLoss(), + bbox_coder=self.bbox_coder, + **self.train_cfg['isr'], + num_class=self.num_classes) + (new_labels, new_label_weights, new_bbox_targets, + new_bbox_weights) = all_targets + all_labels = new_labels.view(all_labels.shape) + all_label_weights = new_label_weights.view(all_label_weights.shape) + all_bbox_targets = new_bbox_targets.view(all_bbox_targets.shape) + all_bbox_weights = new_bbox_weights.view(all_bbox_weights.shape) + + # add CARL loss + carl_loss_cfg = self.train_cfg.get('carl', None) + if carl_loss_cfg is not None: + loss_carl = carl_loss( + all_cls_scores.view(-1, all_cls_scores.size(-1)), + all_targets[0], + all_bbox_preds.view(-1, 4), + all_targets[2], + SmoothL1Loss(beta=1.), + **self.train_cfg['carl'], + avg_factor=avg_factor, + 
num_class=self.num_classes) + + # check NaN and Inf + assert torch.isfinite(all_cls_scores).all().item(), \ + 'classification scores become infinite or NaN!' + assert torch.isfinite(all_bbox_preds).all().item(), \ + 'bbox predications become infinite or NaN!' + + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + all_cls_scores, + all_bbox_preds, + all_anchors, + all_labels, + all_label_weights, + all_bbox_targets, + all_bbox_weights, + avg_factor=avg_factor) + loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + if carl_loss_cfg is not None: + loss_dict.update(loss_carl) + return loss_dict diff --git a/mmdetection/mmdet/models/dense_heads/reppoints_head.py b/mmdetection/mmdet/models/dense_heads/reppoints_head.py new file mode 100644 index 0000000..22f3e34 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/reppoints_head.py @@ -0,0 +1,885 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Sequence, Tuple + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import DeformConv2d +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList +from ..task_modules.prior_generators import MlvlPointGenerator +from ..task_modules.samplers import PseudoSampler +from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply, + unmap) +from .anchor_free_head import AnchorFreeHead + + +@MODELS.register_module() +class RepPointsHead(AnchorFreeHead): + """RepPoint head. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + point_feat_channels (int): Number of channels of points features. + num_points (int): Number of points. + gradient_mul (float): The multiplier to gradients from + points refinement and recognition. + point_strides (Sequence[int]): points strides. + point_base_scale (int): bbox scale for assigning labels. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox_init (:obj:`ConfigDict` or dict): Config of initial points + loss. + loss_bbox_refine (:obj:`ConfigDict` or dict): Config of points loss in + refinement. + use_grid_points (bool): If we use bounding box representation, the + reppoints is represented as grid points on the bounding box. + center_init (bool): Whether to use center point assignment. + transform_method (str): The methods to transform RepPoints to bbox. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. 
+ """ # noqa: W605 + + def __init__(self, + num_classes: int, + in_channels: int, + point_feat_channels: int = 256, + num_points: int = 9, + gradient_mul: float = 0.1, + point_strides: Sequence[int] = [8, 16, 32, 64, 128], + point_base_scale: int = 4, + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init: ConfigType = dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5), + loss_bbox_refine: ConfigType = dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + use_grid_points: bool = False, + center_init: bool = True, + transform_method: str = 'moment', + moment_mul: float = 0.01, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='reppoints_cls_out', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.num_points = num_points + self.point_feat_channels = point_feat_channels + self.use_grid_points = use_grid_points + self.center_init = center_init + + # we use deform conv to extract points features + self.dcn_kernel = int(np.sqrt(num_points)) + self.dcn_pad = int((self.dcn_kernel - 1) / 2) + assert self.dcn_kernel * self.dcn_kernel == num_points, \ + 'The points number should be a square number.' + assert self.dcn_kernel % 2 == 1, \ + 'The points number should be an odd square number.' + dcn_base = np.arange(-self.dcn_pad, + self.dcn_pad + 1).astype(np.float64) + dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) + dcn_base_x = np.tile(dcn_base, self.dcn_kernel) + dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( + (-1)) + self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) + + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + init_cfg=init_cfg, + **kwargs) + + self.gradient_mul = gradient_mul + self.point_base_scale = point_base_scale + self.point_strides = point_strides + self.prior_generator = MlvlPointGenerator( + self.point_strides, offset=0.) 
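+ # Assigners and the sampler are only built when a train_cfg is given:
+ # the init and refine stages use separate assigners, and a PseudoSampler
+ # (no actual sampling) is used unless a sampler is configured explicitly.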
+ + if self.train_cfg: + self.init_assigner = TASK_UTILS.build( + self.train_cfg['init']['assigner']) + self.refine_assigner = TASK_UTILS.build( + self.train_cfg['refine']['assigner']) + + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self.transform_method = transform_method + if self.transform_method == 'moment': + self.moment_transfer = nn.Parameter( + data=torch.zeros(2), requires_grad=True) + self.moment_mul = moment_mul + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + self.loss_bbox_init = MODELS.build(loss_bbox_init) + self.loss_bbox_refine = MODELS.build(loss_bbox_refine) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + pts_out_dim = 4 if self.use_grid_points else 2 * self.num_points + self.reppoints_cls_conv = DeformConv2d(self.feat_channels, + self.point_feat_channels, + self.dcn_kernel, 1, + self.dcn_pad) + self.reppoints_cls_out = nn.Conv2d(self.point_feat_channels, + self.cls_out_channels, 1, 1, 0) + self.reppoints_pts_init_conv = nn.Conv2d(self.feat_channels, + self.point_feat_channels, 3, + 1, 1) + self.reppoints_pts_init_out = nn.Conv2d(self.point_feat_channels, + pts_out_dim, 1, 1, 0) + self.reppoints_pts_refine_conv = DeformConv2d(self.feat_channels, + self.point_feat_channels, + self.dcn_kernel, 1, + self.dcn_pad) + self.reppoints_pts_refine_out = nn.Conv2d(self.point_feat_channels, + pts_out_dim, 1, 1, 0) + + def points2bbox(self, pts: Tensor, y_first: bool = True) -> Tensor: + """Converting the points set into bounding box. + + Args: + pts (Tensor): the input points sets (fields), each points + set (fields) is represented as 2n scalar. + y_first (bool): if y_first=True, the point set is + represented as [y1, x1, y2, x2 ... yn, xn], otherwise + the point set is represented as + [x1, y1, x2, y2 ... xn, yn]. Defaults to True. + + Returns: + Tensor: each points set is converting to a bbox [x1, y1, x2, y2]. + """ + pts_reshape = pts.view(pts.shape[0], -1, 2, *pts.shape[2:]) + pts_y = pts_reshape[:, :, 0, ...] if y_first else pts_reshape[:, :, 1, + ...] + pts_x = pts_reshape[:, :, 1, ...] if y_first else pts_reshape[:, :, 0, + ...] + if self.transform_method == 'minmax': + bbox_left = pts_x.min(dim=1, keepdim=True)[0] + bbox_right = pts_x.max(dim=1, keepdim=True)[0] + bbox_up = pts_y.min(dim=1, keepdim=True)[0] + bbox_bottom = pts_y.max(dim=1, keepdim=True)[0] + bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom], + dim=1) + elif self.transform_method == 'partial_minmax': + pts_y = pts_y[:, :4, ...] + pts_x = pts_x[:, :4, ...] 
+ bbox_left = pts_x.min(dim=1, keepdim=True)[0] + bbox_right = pts_x.max(dim=1, keepdim=True)[0] + bbox_up = pts_y.min(dim=1, keepdim=True)[0] + bbox_bottom = pts_y.max(dim=1, keepdim=True)[0] + bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom], + dim=1) + elif self.transform_method == 'moment': + pts_y_mean = pts_y.mean(dim=1, keepdim=True) + pts_x_mean = pts_x.mean(dim=1, keepdim=True) + pts_y_std = torch.std(pts_y - pts_y_mean, dim=1, keepdim=True) + pts_x_std = torch.std(pts_x - pts_x_mean, dim=1, keepdim=True) + moment_transfer = (self.moment_transfer * self.moment_mul) + ( + self.moment_transfer.detach() * (1 - self.moment_mul)) + moment_width_transfer = moment_transfer[0] + moment_height_transfer = moment_transfer[1] + half_width = pts_x_std * torch.exp(moment_width_transfer) + half_height = pts_y_std * torch.exp(moment_height_transfer) + bbox = torch.cat([ + pts_x_mean - half_width, pts_y_mean - half_height, + pts_x_mean + half_width, pts_y_mean + half_height + ], + dim=1) + else: + raise NotImplementedError + return bbox + + def gen_grid_from_reg(self, reg: Tensor, + previous_boxes: Tensor) -> Tuple[Tensor]: + """Base on the previous bboxes and regression values, we compute the + regressed bboxes and generate the grids on the bboxes. + + Args: + reg (Tensor): the regression value to previous bboxes. + previous_boxes (Tensor): previous bboxes. + + Returns: + Tuple[Tensor]: generate grids on the regressed bboxes. + """ + b, _, h, w = reg.shape + bxy = (previous_boxes[:, :2, ...] + previous_boxes[:, 2:, ...]) / 2. + bwh = (previous_boxes[:, 2:, ...] - + previous_boxes[:, :2, ...]).clamp(min=1e-6) + grid_topleft = bxy + bwh * reg[:, :2, ...] - 0.5 * bwh * torch.exp( + reg[:, 2:, ...]) + grid_wh = bwh * torch.exp(reg[:, 2:, ...]) + grid_left = grid_topleft[:, [0], ...] + grid_top = grid_topleft[:, [1], ...] + grid_width = grid_wh[:, [0], ...] + grid_height = grid_wh[:, [1], ...] + intervel = torch.linspace(0., 1., self.dcn_kernel).view( + 1, self.dcn_kernel, 1, 1).type_as(reg) + grid_x = grid_left + grid_width * intervel + grid_x = grid_x.unsqueeze(1).repeat(1, self.dcn_kernel, 1, 1, 1) + grid_x = grid_x.view(b, -1, h, w) + grid_y = grid_top + grid_height * intervel + grid_y = grid_y.unsqueeze(2).repeat(1, 1, self.dcn_kernel, 1, 1) + grid_y = grid_y.view(b, -1, h, w) + grid_yx = torch.stack([grid_y, grid_x], dim=2) + grid_yx = grid_yx.view(b, -1, h, w) + regressed_bbox = torch.cat([ + grid_left, grid_top, grid_left + grid_width, grid_top + grid_height + ], 1) + return grid_yx, regressed_bbox + + def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]: + return multi_apply(self.forward_single, feats) + + def forward_single(self, x: Tensor) -> Tuple[Tensor]: + """Forward feature map of a single FPN level.""" + dcn_base_offset = self.dcn_base_offset.type_as(x) + # If we use center_init, the initial reppoints is from center points. + # If we use bounding bbox representation, the initial reppoints is + # from regular grid placed on a pre-defined bbox. 
+ if self.use_grid_points or not self.center_init: + scale = self.point_base_scale / 2 + points_init = dcn_base_offset / dcn_base_offset.max() * scale + bbox_init = x.new_tensor([-scale, -scale, scale, + scale]).view(1, 4, 1, 1) + else: + points_init = 0 + cls_feat = x + pts_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + pts_feat = reg_conv(pts_feat) + # initialize reppoints + pts_out_init = self.reppoints_pts_init_out( + self.relu(self.reppoints_pts_init_conv(pts_feat))) + if self.use_grid_points: + pts_out_init, bbox_out_init = self.gen_grid_from_reg( + pts_out_init, bbox_init.detach()) + else: + pts_out_init = pts_out_init + points_init + # refine and classify reppoints + pts_out_init_grad_mul = (1 - self.gradient_mul) * pts_out_init.detach( + ) + self.gradient_mul * pts_out_init + dcn_offset = pts_out_init_grad_mul - dcn_base_offset + cls_out = self.reppoints_cls_out( + self.relu(self.reppoints_cls_conv(cls_feat, dcn_offset))) + pts_out_refine = self.reppoints_pts_refine_out( + self.relu(self.reppoints_pts_refine_conv(pts_feat, dcn_offset))) + if self.use_grid_points: + pts_out_refine, bbox_out_refine = self.gen_grid_from_reg( + pts_out_refine, bbox_out_init.detach()) + else: + pts_out_refine = pts_out_refine + pts_out_init.detach() + + if self.training: + return cls_out, pts_out_init, pts_out_refine + else: + return cls_out, self.points2bbox(pts_out_refine) + + def get_points(self, featmap_sizes: List[Tuple[int]], + batch_img_metas: List[dict], device: str) -> tuple: + """Get points according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + + Returns: + tuple: points of each image, valid flags of each image + """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # points center for one time + multi_level_points = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + points_list = [[point.clone() for point in multi_level_points] + for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level grids + valid_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = self.prior_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device=device) + valid_flag_list.append(multi_level_flags) + + return points_list, valid_flag_list + + def centers_to_bboxes(self, point_list: List[Tensor]) -> List[Tensor]: + """Get bboxes according to center points. + + Only used in :class:`MaxIoUAssigner`. 
+ """ + bbox_list = [] + for i_img, point in enumerate(point_list): + bbox = [] + for i_lvl in range(len(self.point_strides)): + scale = self.point_base_scale * self.point_strides[i_lvl] * 0.5 + bbox_shift = torch.Tensor([-scale, -scale, scale, + scale]).view(1, 4).type_as(point[0]) + bbox_center = torch.cat( + [point[i_lvl][:, :2], point[i_lvl][:, :2]], dim=1) + bbox.append(bbox_center + bbox_shift) + bbox_list.append(bbox) + return bbox_list + + def offset_to_pts(self, center_list: List[Tensor], + pred_list: List[Tensor]) -> List[Tensor]: + """Change from point offset to point coordinate.""" + pts_list = [] + for i_lvl in range(len(self.point_strides)): + pts_lvl = [] + for i_img in range(len(center_list)): + pts_center = center_list[i_img][i_lvl][:, :2].repeat( + 1, self.num_points) + pts_shift = pred_list[i_lvl][i_img] + yx_pts_shift = pts_shift.permute(1, 2, 0).view( + -1, 2 * self.num_points) + y_pts_shift = yx_pts_shift[..., 0::2] + x_pts_shift = yx_pts_shift[..., 1::2] + xy_pts_shift = torch.stack([x_pts_shift, y_pts_shift], -1) + xy_pts_shift = xy_pts_shift.view(*yx_pts_shift.shape[:-1], -1) + pts = xy_pts_shift * self.point_strides[i_lvl] + pts_center + pts_lvl.append(pts) + pts_lvl = torch.stack(pts_lvl, 0) + pts_list.append(pts_lvl) + return pts_list + + def _get_targets_single(self, + flat_proposals: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + gt_instances_ignore: InstanceData, + stage: str = 'init', + unmap_outputs: bool = True) -> tuple: + """Compute corresponding GT box and classification targets for + proposals. + + Args: + flat_proposals (Tensor): Multi level points of a image. + valid_flags (Tensor): Multi level valid flags of a image. + gt_instances (InstanceData): It usually includes ``bboxes`` and + ``labels`` attributes. + gt_instances_ignore (InstanceData): It includes ``bboxes`` + attribute data that is ignored during training and testing. + stage (str): 'init' or 'refine'. Generate target for + init stage or refine stage. Defaults to 'init'. + unmap_outputs (bool): Whether to map outputs back to + the original set of anchors. Defaults to True. + + Returns: + tuple: + + - labels (Tensor): Labels of each level. + - label_weights (Tensor): Label weights of each level. + - bbox_targets (Tensor): BBox targets of each level. + - bbox_weights (Tensor): BBox weights of each level. + - pos_inds (Tensor): positive samples indexes. + - neg_inds (Tensor): negative samples indexes. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = valid_flags + if not inside_flags.any(): + raise ValueError( + 'There is no valid proposal inside the image boundary. 
Please ' + 'check the image size.') + # assign gt and sample proposals + proposals = flat_proposals[inside_flags, :] + pred_instances = InstanceData(priors=proposals) + + if stage == 'init': + assigner = self.init_assigner + pos_weight = self.train_cfg['init']['pos_weight'] + else: + assigner = self.refine_assigner + pos_weight = self.train_cfg['refine']['pos_weight'] + + assign_result = assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_proposals = proposals.shape[0] + bbox_gt = proposals.new_zeros([num_valid_proposals, 4]) + pos_proposals = torch.zeros_like(proposals) + proposals_weights = proposals.new_zeros([num_valid_proposals, 4]) + labels = proposals.new_full((num_valid_proposals, ), + self.num_classes, + dtype=torch.long) + label_weights = proposals.new_zeros( + num_valid_proposals, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + bbox_gt[pos_inds, :] = sampling_result.pos_gt_bboxes + pos_proposals[pos_inds, :] = proposals[pos_inds, :] + proposals_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of proposals + if unmap_outputs: + num_total_proposals = flat_proposals.size(0) + labels = unmap( + labels, + num_total_proposals, + inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_proposals, + inside_flags) + bbox_gt = unmap(bbox_gt, num_total_proposals, inside_flags) + pos_proposals = unmap(pos_proposals, num_total_proposals, + inside_flags) + proposals_weights = unmap(proposals_weights, num_total_proposals, + inside_flags) + + return (labels, label_weights, bbox_gt, pos_proposals, + proposals_weights, pos_inds, neg_inds, sampling_result) + + def get_targets(self, + proposals_list: List[Tensor], + valid_flag_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + stage: str = 'init', + unmap_outputs: bool = True, + return_sampling_results: bool = False) -> tuple: + """Compute corresponding GT box and classification targets for + proposals. + + Args: + proposals_list (list[Tensor]): Multi level points/bboxes of each + image. + valid_flag_list (list[Tensor]): Multi level valid flags of each + image. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + stage (str): 'init' or 'refine'. Generate target for init stage or + refine stage. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + return_sampling_results (bool): Whether to return the sampling + results. Defaults to False. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_gt_list (list[Tensor]): Ground truth bbox of each level. 
+ - proposals_list (list[Tensor]): Proposals(points/bboxes) of + each level. + - proposal_weights_list (list[Tensor]): Proposal weights of + each level. + - avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + """ + assert stage in ['init', 'refine'] + num_imgs = len(batch_img_metas) + assert len(proposals_list) == len(valid_flag_list) == num_imgs + + # points number of multi levels + num_level_proposals = [points.size(0) for points in proposals_list[0]] + + # concat all level points and flags to a single tensor + for i in range(num_imgs): + assert len(proposals_list[i]) == len(valid_flag_list[i]) + proposals_list[i] = torch.cat(proposals_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + (all_labels, all_label_weights, all_bbox_gt, all_proposals, + all_proposal_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + proposals_list, + valid_flag_list, + batch_gt_instances, + batch_gt_instances_ignore, + stage=stage, + unmap_outputs=unmap_outputs) + + # sampled points of all images + avg_refactor = sum( + [results.avg_factor for results in sampling_results_list]) + labels_list = images_to_levels(all_labels, num_level_proposals) + label_weights_list = images_to_levels(all_label_weights, + num_level_proposals) + bbox_gt_list = images_to_levels(all_bbox_gt, num_level_proposals) + proposals_list = images_to_levels(all_proposals, num_level_proposals) + proposal_weights_list = images_to_levels(all_proposal_weights, + num_level_proposals) + res = (labels_list, label_weights_list, bbox_gt_list, proposals_list, + proposal_weights_list, avg_refactor) + if return_sampling_results: + res = res + (sampling_results_list, ) + + return res + + def loss_by_feat_single(self, cls_score: Tensor, pts_pred_init: Tensor, + pts_pred_refine: Tensor, labels: Tensor, + label_weights, bbox_gt_init: Tensor, + bbox_weights_init: Tensor, bbox_gt_refine: Tensor, + bbox_weights_refine: Tensor, stride: int, + avg_factor_init: int, + avg_factor_refine: int) -> Tuple[Tensor]: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_classes, h_i, w_i). + pts_pred_init (Tensor): Points of shape + (batch_size, h_i * w_i, num_points * 2). + pts_pred_refine (Tensor): Points refined of shape + (batch_size, h_i * w_i, num_points * 2). + labels (Tensor): Ground truth class indices with shape + (batch_size, h_i * w_i). + label_weights (Tensor): Label weights of shape + (batch_size, h_i * w_i). + bbox_gt_init (Tensor): BBox regression targets in the init stage + of shape (batch_size, h_i * w_i, 4). + bbox_weights_init (Tensor): BBox regression loss weights in the + init stage of shape (batch_size, h_i * w_i, 4). + bbox_gt_refine (Tensor): BBox regression targets in the refine + stage of shape (batch_size, h_i * w_i, 4). + bbox_weights_refine (Tensor): BBox regression loss weights in the + refine stage of shape (batch_size, h_i * w_i, 4). + stride (int): Point stride. + avg_factor_init (int): Average factor that is used to average + the loss in the init stage. 
+ avg_factor_refine (int): Average factor that is used to average + the loss in the refine stage. + + Returns: + Tuple[Tensor]: loss components. + """ + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + cls_score = cls_score.contiguous() + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor_refine) + + # points loss + bbox_gt_init = bbox_gt_init.reshape(-1, 4) + bbox_weights_init = bbox_weights_init.reshape(-1, 4) + bbox_pred_init = self.points2bbox( + pts_pred_init.reshape(-1, 2 * self.num_points), y_first=False) + bbox_gt_refine = bbox_gt_refine.reshape(-1, 4) + bbox_weights_refine = bbox_weights_refine.reshape(-1, 4) + bbox_pred_refine = self.points2bbox( + pts_pred_refine.reshape(-1, 2 * self.num_points), y_first=False) + normalize_term = self.point_base_scale * stride + loss_pts_init = self.loss_bbox_init( + bbox_pred_init / normalize_term, + bbox_gt_init / normalize_term, + bbox_weights_init, + avg_factor=avg_factor_init) + loss_pts_refine = self.loss_bbox_refine( + bbox_pred_refine / normalize_term, + bbox_gt_refine / normalize_term, + bbox_weights_refine, + avg_factor=avg_factor_refine) + return loss_cls, loss_pts_init, loss_pts_refine + + def loss_by_feat( + self, + cls_scores: List[Tensor], + pts_preds_init: List[Tensor], + pts_preds_refine: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, of shape (batch_size, num_classes, h, w). + pts_preds_init (list[Tensor]): Points for each scale level, each is + a 3D-tensor, of shape (batch_size, h_i * w_i, num_points * 2). + pts_preds_refine (list[Tensor]): Points refined for each scale + level, each is a 3D-tensor, of shape + (batch_size, h_i * w_i, num_points * 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
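+
+        Example (illustrative only; ``head`` is a hypothetical, already
+            constructed instance of this head class, and the keys match
+            ``loss_dict_all`` built at the end of this method):
+
+            >>> # losses = head.loss_by_feat(cls_scores, pts_preds_init,
+            >>> #     pts_preds_refine, batch_gt_instances, batch_img_metas)
+            >>> # losses['loss_cls'], losses['loss_pts_init'] and
+            >>> # losses['loss_pts_refine'] are each a list with one loss
+            >>> # tensor per FPN level.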
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + device = cls_scores[0].device + + # target for initial stage + center_list, valid_flag_list = self.get_points(featmap_sizes, + batch_img_metas, device) + pts_coordinate_preds_init = self.offset_to_pts(center_list, + pts_preds_init) + if self.train_cfg['init']['assigner']['type'] == 'PointAssigner': + # Assign target for center list + candidate_list = center_list + else: + # transform center list to bbox list and + # assign target for bbox list + bbox_list = self.centers_to_bboxes(center_list) + candidate_list = bbox_list + cls_reg_targets_init = self.get_targets( + proposals_list=candidate_list, + valid_flag_list=valid_flag_list, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + stage='init', + return_sampling_results=False) + (*_, bbox_gt_list_init, candidate_list_init, bbox_weights_list_init, + avg_factor_init) = cls_reg_targets_init + + # target for refinement stage + center_list, valid_flag_list = self.get_points(featmap_sizes, + batch_img_metas, device) + pts_coordinate_preds_refine = self.offset_to_pts( + center_list, pts_preds_refine) + bbox_list = [] + for i_img, center in enumerate(center_list): + bbox = [] + for i_lvl in range(len(pts_preds_refine)): + bbox_preds_init = self.points2bbox( + pts_preds_init[i_lvl].detach()) + bbox_shift = bbox_preds_init * self.point_strides[i_lvl] + bbox_center = torch.cat( + [center[i_lvl][:, :2], center[i_lvl][:, :2]], dim=1) + bbox.append(bbox_center + + bbox_shift[i_img].permute(1, 2, 0).reshape(-1, 4)) + bbox_list.append(bbox) + cls_reg_targets_refine = self.get_targets( + proposals_list=bbox_list, + valid_flag_list=valid_flag_list, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + stage='refine', + return_sampling_results=False) + (labels_list, label_weights_list, bbox_gt_list_refine, + candidate_list_refine, bbox_weights_list_refine, + avg_factor_refine) = cls_reg_targets_refine + + # compute loss + losses_cls, losses_pts_init, losses_pts_refine = multi_apply( + self.loss_by_feat_single, + cls_scores, + pts_coordinate_preds_init, + pts_coordinate_preds_refine, + labels_list, + label_weights_list, + bbox_gt_list_init, + bbox_weights_list_init, + bbox_gt_list_refine, + bbox_weights_list_refine, + self.point_strides, + avg_factor_init=avg_factor_init, + avg_factor_refine=avg_factor_refine) + loss_dict_all = { + 'loss_cls': losses_cls, + 'loss_pts_init': losses_pts_init, + 'loss_pts_refine': losses_pts_refine + } + return loss_dict_all + + # Same as base_dense_head/_get_bboxes_single except self._bbox_decode + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform outputs of a single image into bbox predictions. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image. RepPoints head does not need + this value. 
+ mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, has shape + (num_priors, 2). + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict`): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_score_list) == len(bbox_pred_list) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + for level_idx, (cls_score, bbox_pred, priors) in enumerate( + zip(cls_score_list, bbox_pred_list, mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1)[:, :-1] + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, _, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + + bboxes = self._bbox_decode(priors, bbox_pred, + self.point_strides[level_idx], + img_shape) + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def _bbox_decode(self, points: Tensor, bbox_pred: Tensor, stride: int, + max_shape: Tuple[int, int]) -> Tensor: + """Decode the prediction to bounding box. + + Args: + points (Tensor): shape (h_i * w_i, 2). + bbox_pred (Tensor): shape (h_i * w_i, 4). + stride (int): Stride for bbox_pred in different level. + max_shape (Tuple[int, int]): image shape. + + Returns: + Tensor: Bounding boxes decoded. 
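+
+        Example (an illustrative sketch of the decode rule implemented
+            below; ``stride=8`` and the point/prediction values are
+            assumptions, and the result is shown before clamping, which
+            would not change these numbers):
+
+            >>> import torch
+            >>> points = torch.tensor([[16., 16.]])
+            >>> bbox_pred = torch.tensor([[-1., -1., 1., 1.]])
+            >>> center = torch.cat([points[:, :2], points[:, :2]], dim=1)
+            >>> bbox_pred * 8 + center
+            tensor([[ 8.,  8., 24., 24.]])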
+ """ + bbox_pos_center = torch.cat([points[:, :2], points[:, :2]], dim=1) + bboxes = bbox_pred * stride + bbox_pos_center + x1 = bboxes[:, 0].clamp(min=0, max=max_shape[1]) + y1 = bboxes[:, 1].clamp(min=0, max=max_shape[0]) + x2 = bboxes[:, 2].clamp(min=0, max=max_shape[1]) + y2 = bboxes[:, 3].clamp(min=0, max=max_shape[0]) + decoded_bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return decoded_bboxes diff --git a/mmdetection/mmdet/models/dense_heads/retina_head.py b/mmdetection/mmdet/models/dense_heads/retina_head.py new file mode 100644 index 0000000..be3ae74 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/retina_head.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmdet.registry import MODELS +from .anchor_head import AnchorHead + + +@MODELS.register_module() +class RetinaHead(AnchorHead): + r"""An anchor-based head used in `RetinaNet + `_. + + The head contains two subnetworks. The first classifies anchor boxes and + the second regresses deltas for the anchors. + + Example: + >>> import torch + >>> self = RetinaHead(11, 7) + >>> x = torch.rand(1, 7, 32, 32) + >>> cls_score, bbox_pred = self.forward_single(x) + >>> # Each anchor predicts a score for each class except background + >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors + >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors + >>> assert cls_per_anchor == (self.num_classes) + >>> assert box_per_anchor == 4 + """ + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + conv_cfg=None, + norm_cfg=None, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + init_cfg=dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='retina_cls', + std=0.01, + bias_prob=0.01)), + **kwargs): + assert stacked_convs >= 0, \ + '`stacked_convs` must be non-negative integers, ' \ + f'but got {stacked_convs} instead.' + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super(RetinaHead, self).__init__( + num_classes, + in_channels, + anchor_generator=anchor_generator, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + in_channels = self.in_channels + for i in range(self.stacked_convs): + self.cls_convs.append( + ConvModule( + in_channels, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + in_channels, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + in_channels = self.feat_channels + self.retina_cls = nn.Conv2d( + in_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + reg_dim = self.bbox_coder.encode_size + self.retina_reg = nn.Conv2d( + in_channels, self.num_base_priors * reg_dim, 3, padding=1) + + def forward_single(self, x): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale + level, the channels number is num_anchors * 4. 
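+
+        Example (illustrative; with the default ``anchor_generator`` above
+            there are 3 scales x 3 ratios = 9 anchors per location, and the
+            default box deltas are 4-dimensional, matching the class-level
+            example):
+
+            >>> import torch
+            >>> self = RetinaHead(11, 7)
+            >>> x = torch.rand(1, 7, 32, 32)
+            >>> cls_score, bbox_pred = self.forward_single(x)
+            >>> cls_score.shape   # 9 anchors * 11 classes
+            torch.Size([1, 99, 32, 32])
+            >>> bbox_pred.shape   # 9 anchors * 4 deltas
+            torch.Size([1, 36, 32, 32])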
+ """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.retina_cls(cls_feat) + bbox_pred = self.retina_reg(reg_feat) + return cls_score, bbox_pred diff --git a/mmdetection/mmdet/models/dense_heads/retina_sepbn_head.py b/mmdetection/mmdet/models/dense_heads/retina_sepbn_head.py new file mode 100644 index 0000000..681a399 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/retina_sepbn_head.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import bias_init_with_prob, normal_init +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig +from .anchor_head import AnchorHead + + +@MODELS.register_module() +class RetinaSepBNHead(AnchorHead): + """"RetinaHead with separate BN. + + In RetinaHead, conv/norm layers are shared across different FPN levels, + while in RetinaSepBNHead, conv layers are shared across different FPN + levels, but BN layers are separated. + """ + + def __init__(self, + num_classes: int, + num_ins: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.num_ins = num_ins + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.num_ins): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + for j in range(self.stacked_convs): + chn = self.in_channels if j == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + for i in range(self.stacked_convs): + for j in range(1, self.num_ins): + self.cls_convs[j][i].conv = self.cls_convs[0][i].conv + self.reg_convs[j][i].conv = self.reg_convs[0][i].conv + self.retina_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.retina_reg = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + super().init_weights() + for m in self.cls_convs[0]: + normal_init(m.conv, std=0.01) + for m in self.reg_convs[0]: + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.retina_cls, std=0.01, bias=bias_cls) + normal_init(self.retina_reg, std=0.01) + + def forward(self, feats: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + + - cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + cls_scores = [] + bbox_preds = [] + for i, x in enumerate(feats): + cls_feat = feats[i] + reg_feat = feats[i] + for cls_conv in self.cls_convs[i]: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs[i]: + reg_feat = reg_conv(reg_feat) + cls_score = self.retina_cls(cls_feat) + bbox_pred = self.retina_reg(reg_feat) + cls_scores.append(cls_score) + bbox_preds.append(bbox_pred) + return cls_scores, bbox_preds diff --git a/mmdetection/mmdet/models/dense_heads/rpn_head.py b/mmdetection/mmdet/models/dense_heads/rpn_head.py new file mode 100644 index 0000000..6b54400 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/rpn_head.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.ops import batched_nms +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import (cat_boxes, empty_box_as, get_box_tensor, + get_box_wh, scale_boxes) +from mmdet.utils import InstanceList, MultiConfig, OptInstanceList +from .anchor_head import AnchorHead + + +@MODELS.register_module() +class RPNHead(AnchorHead): + """Implementation of RPN head. + + Args: + in_channels (int): Number of channels in the input feature map. + num_classes (int): Number of categories excluding the background + category. Defaults to 1. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict]): Initialization config dict. + num_convs (int): Number of convolution layers in the head. + Defaults to 1. + """ # noqa: W605 + + def __init__(self, + in_channels: int, + num_classes: int = 1, + init_cfg: MultiConfig = dict( + type='Normal', layer='Conv2d', std=0.01), + num_convs: int = 1, + **kwargs) -> None: + self.num_convs = num_convs + assert num_classes == 1 + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + if self.num_convs > 1: + rpn_convs = [] + for i in range(self.num_convs): + if i == 0: + in_channels = self.in_channels + else: + in_channels = self.feat_channels + # use ``inplace=False`` to avoid error: one of the variables + # needed for gradient computation has been modified by an + # inplace operation. + rpn_convs.append( + ConvModule( + in_channels, + self.feat_channels, + 3, + padding=1, + inplace=False)) + self.rpn_conv = nn.Sequential(*rpn_convs) + else: + self.rpn_conv = nn.Conv2d( + self.in_channels, self.feat_channels, 3, padding=1) + self.rpn_cls = nn.Conv2d(self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 1) + reg_dim = self.bbox_coder.encode_size + self.rpn_reg = nn.Conv2d(self.feat_channels, + self.num_base_priors * reg_dim, 1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. 
+ + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_base_priors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_base_priors * 4. + """ + x = self.rpn_conv(x) + x = F.relu(x) + rpn_cls_score = self.rpn_cls(x) + rpn_bbox_pred = self.rpn_reg(x) + return rpn_cls_score, rpn_bbox_pred + + def loss_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) \ + -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[obj:InstanceData]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[obj:InstanceData], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + losses = super().loss_by_feat( + cls_scores, + bbox_preds, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + return dict( + loss_rpn_cls=losses['loss_cls'], loss_rpn_bbox=losses['loss_bbox']) + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Be compatible with + BaseDenseHead. Not used in RPNHead. + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (ConfigDict, optional): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
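+
+        Example (an illustrative test configuration; these are the keys
+            read here and in ``_bbox_post_process``, but the values are
+            typical choices and are an assumption, not fixed by this head):
+
+            >>> from mmengine.config import ConfigDict
+            >>> cfg = ConfigDict(
+            ...     nms_pre=1000,
+            ...     min_bbox_size=0,
+            ...     nms=dict(type='nms', iou_threshold=0.7),
+            ...     max_per_img=1000)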
+ """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + level_ids = [] + for level_idx, (cls_score, bbox_pred, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + reg_dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, reg_dim) + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0] since mmdet v2.0 + # BG cat_id: 1 + scores = cls_score.softmax(-1)[:, :-1] + + scores = torch.squeeze(scores) + if 0 < nms_pre < scores.shape[0]: + # sort is faster than topk + # _, topk_inds = scores.topk(cfg.nms_pre) + ranked_scores, rank_inds = scores.sort(descending=True) + topk_inds = rank_inds[:nms_pre] + scores = ranked_scores[:nms_pre] + bbox_pred = bbox_pred[topk_inds, :] + priors = priors[topk_inds] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + + # use level id to implement the separate level nms + level_ids.append( + scores.new_full((scores.size(0), ), + level_idx, + dtype=torch.long)) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_priors) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.scores = torch.cat(mlvl_scores) + results.level_ids = torch.cat(level_ids) + + return self._bbox_post_process( + results=results, cfg=cfg, rescale=rescale, img_meta=img_meta) + + def _bbox_post_process(self, + results: InstanceData, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True, + img_meta: Optional[dict] = None) -> InstanceData: + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). + cfg (ConfigDict): Test / postprocessing configuration. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Default to True. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
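+
+        Example (an illustrative sketch of the small-box filter applied
+            below; the boxes and ``min_bbox_size=3`` are assumed values):
+
+            >>> import torch
+            >>> bboxes = torch.tensor([[0., 0., 10., 10.],
+            ...                        [0., 0., 2., 1.]])
+            >>> w = bboxes[:, 2] - bboxes[:, 0]
+            >>> h = bboxes[:, 3] - bboxes[:, 1]
+            >>> (w > 3) & (h > 3)   # only the first box survives
+            tensor([ True, False])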
+ """ + assert with_nms, '`with_nms` must be True in RPNHead' + if rescale: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + if results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.level_ids, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + # TODO: This would unreasonably show the 0th class label + # in visualization + results.labels = results.scores.new_zeros( + len(results), dtype=torch.long) + del results.level_ids + else: + # To avoid some potential error + results_ = InstanceData() + results_.bboxes = empty_box_as(results.bboxes) + results_.scores = results.scores.new_zeros(0) + results_.labels = results.scores.new_zeros(0) + results = results_ + return results diff --git a/mmdetection/mmdet/models/dense_heads/rtmdet_head.py b/mmdetection/mmdet/models/dense_heads/rtmdet_head.py new file mode 100644 index 0000000..ae0ee6d --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/rtmdet_head.py @@ -0,0 +1,692 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule, Scale, is_norm +from mmengine.model import bias_init_with_prob, constant_init, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean +from ..layers.transformer import inverse_sigmoid +from ..task_modules import anchor_inside_flags +from ..utils import (images_to_levels, multi_apply, sigmoid_geometric_mean, + unmap) +from .atss_head import ATSSHead + + +@MODELS.register_module() +class RTMDetHead(ATSSHead): + """Detection Head of RTMDet. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + with_objectness (bool): Whether to add an objectness branch. + Defaults to True. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. 
+ Default: dict(type='ReLU') + """ + + def __init__(self, + num_classes: int, + in_channels: int, + with_objectness: bool = True, + act_cfg: ConfigType = dict(type='ReLU'), + **kwargs) -> None: + self.act_cfg = act_cfg + self.with_objectness = with_objectness + super().__init__(num_classes, in_channels, **kwargs) + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + pred_pad_size = self.pred_kernel_size // 2 + self.rtm_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size) + self.rtm_reg = nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size) + if self.with_objectness: + self.rtm_obj = nn.Conv2d( + self.feat_channels, + 1, + self.pred_kernel_size, + padding=pred_pad_size) + + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.rtm_cls, std=0.01, bias=bias_cls) + normal_init(self.rtm_reg, std=0.01) + if self.with_objectness: + normal_init(self.rtm_obj, std=0.01, bias=bias_cls) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + """ + + cls_scores = [] + bbox_preds = [] + for idx, (x, scale, stride) in enumerate( + zip(feats, self.scales, self.prior_generator.strides)): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + + if self.with_objectness: + objectness = self.rtm_obj(reg_feat) + cls_score = inverse_sigmoid( + sigmoid_geometric_mean(cls_score, objectness)) + + reg_dist = scale(self.rtm_reg(reg_feat).exp()).float() * stride[0] + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + return tuple(cls_scores), tuple(bbox_preds) + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + labels: Tensor, label_weights: Tensor, + bbox_targets: Tensor, assign_metrics: Tensor, + stride: List[int]): + """Compute loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). 
+ bbox_pred (Tensor): Decoded bboxes for each scale + level with shape (N, num_anchors * 4, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors). + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + assign_metrics (Tensor): Assign metrics with shape + (N, num_total_anchors). + stride (List[int]): Downsample stride of the feature map. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.reshape(-1, 4) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + assign_metrics = assign_metrics.reshape(-1) + label_weights = label_weights.reshape(-1) + targets = (labels, assign_metrics) + + loss_cls = self.loss_cls( + cls_score, targets, label_weights, avg_factor=1.0) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + + pos_decode_bbox_pred = pos_bbox_pred + pos_decode_bbox_targets = pos_bbox_targets + + # regression loss + pos_bbox_weight = assign_metrics[pos_inds] + + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=pos_bbox_weight, + avg_factor=1.0) + else: + loss_bbox = bbox_pred.sum() * 0 + pos_bbox_weight = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, assign_metrics.sum(), pos_bbox_weight.sum() + + def loss_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
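+
+        Example (an illustrative sketch of the per-prior decoding this
+            method performs with ``distance2bbox`` before computing targets;
+            the point and distance values are assumptions):
+
+            >>> import torch
+            >>> from mmdet.structures.bbox import distance2bbox
+            >>> point = torch.tensor([[16., 16.]])
+            >>> distances = torch.tensor([[4., 4., 4., 4.]])  # l, t, r, b
+            >>> distance2bbox(point, distances)
+            tensor([[12., 12., 20., 20.]])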
+ """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1) + decoded_bboxes = [] + for anchor, bbox_pred in zip(anchor_list[0], bbox_preds): + anchor = anchor.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + bbox_pred = distance2bbox(anchor, bbox_pred) + decoded_bboxes.append(bbox_pred) + + flatten_bboxes = torch.cat(decoded_bboxes, 1) + + cls_reg_targets = self.get_targets( + flatten_cls_scores, + flatten_bboxes, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + assign_metrics_list, sampling_results_list) = cls_reg_targets + + losses_cls, losses_bbox,\ + cls_avg_factors, bbox_avg_factors = multi_apply( + self.loss_by_feat_single, + cls_scores, + decoded_bboxes, + labels_list, + label_weights_list, + bbox_targets_list, + assign_metrics_list, + self.prior_generator.strides) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls)) + + bbox_avg_factor = reduce_mean( + sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + def get_targets(self, + cls_scores: Tensor, + bbox_preds: Tensor, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs=True): + """Compute regression and classification targets for anchors in + multiple images. + + Args: + cls_scores (Tensor): Classification predictions of images, + a 3D-Tensor with shape [num_imgs, num_priors, num_classes]. + bbox_preds (Tensor): Decoded bboxes predictions of one image, + a 3D-Tensor with shape [num_imgs, num_priors, 4] in [tl_x, + tl_y, br_x, br_y] format. + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: a tuple containing learning targets. + + - anchors_list (list[list[Tensor]]): Anchors of each level. 
+ - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - assign_metrics_list (list[Tensor]): alignment metrics of each + level. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + # anchor_list: list(b * [-1, 4]) + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_assign_metrics, sampling_results_list) = multi_apply( + self._get_targets_single, + cls_scores.detach(), + bbox_preds.detach(), + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + assign_metrics_list = images_to_levels(all_assign_metrics, + num_level_anchors) + + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, assign_metrics_list, sampling_results_list) + + def _get_targets_single(self, + cls_scores: Tensor, + bbox_preds: Tensor, + flat_anchors: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs=True): + """Compute regression, classification targets for anchors in a single + image. + + Args: + cls_scores (list(Tensor)): Box scores for each image. + bbox_preds (list(Tensor)): Box energies / deltas for each image. + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: N is the number of total anchors in the image. + + - anchors (Tensor): All anchors in the image with shape (N, 4). + - labels (Tensor): Labels of all anchors in the image with shape + (N,). + - label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + - bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). 
+ - norm_alignment_metrics (Tensor): Normalized alignment metrics + of all priors in the image with shape (N,). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + return (None, ) * 7 + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + pred_instances = InstanceData( + scores=cls_scores[inside_flags, :], + bboxes=bbox_preds[inside_flags, :], + priors=anchors) + + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + assign_metrics = anchors.new_zeros( + num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + # point-based + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + class_assigned_gt_inds = torch.unique( + sampling_result.pos_assigned_gt_inds) + for gt_inds in class_assigned_gt_inds: + gt_class_inds = pos_inds[sampling_result.pos_assigned_gt_inds == + gt_inds] + assign_metrics[gt_class_inds] = assign_result.max_overlaps[ + gt_class_inds] + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + assign_metrics = unmap(assign_metrics, num_total_anchors, + inside_flags) + return (anchors, labels, label_weights, bbox_targets, assign_metrics, + sampling_result) + + def get_anchors(self, + featmap_sizes: List[tuple], + batch_img_metas: List[dict], + device: Union[torch.device, str] = 'cuda') \ + -> Tuple[List[List[Tensor]], List[List[Tensor]]]: + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + device (torch.device or str): Device for returned tensors. + Defaults to cuda. + + Returns: + tuple: + + - anchor_list (list[list[Tensor]]): Anchors of each image. + - valid_flag_list (list[list[Tensor]]): Valid flags of each + image. 
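+
+        Example (illustrative shapes only; the feature-map sizes and the
+            assumption of a single prior per location are typical for
+            RTMDet but are not fixed by this method):
+
+            >>> # featmap_sizes = [(80, 80), (40, 40), (20, 20)]
+            >>> # anchor_list[img_id] is a list of per-level priors with
+            >>> # shapes (6400, 4), (1600, 4) and (400, 4);
+            >>> # valid_flag_list mirrors this nesting with flags of shape
+            >>> # (6400,), (1600,) and (400,).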
+ """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = self.prior_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device) + valid_flag_list.append(multi_level_flags) + return anchor_list, valid_flag_list + + +@MODELS.register_module() +class RTMDetSepBNHead(RTMDetHead): + """RTMDetHead with separated BN layers and shared conv layers. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + use_depthwise (bool): Whether to use depthwise separable convolution in + head. Defaults to False. + norm_cfg (:obj:`ConfigDict` or dict)): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict)): Config dict for activation layer. + Defaults to dict(type='SiLU'). + pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + share_conv: bool = True, + use_depthwise: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + pred_kernel_size: int = 1, + exp_on_reg=False, + **kwargs) -> None: + self.share_conv = share_conv + self.exp_on_reg = exp_on_reg + self.use_depthwise = use_depthwise + super().__init__( + num_classes, + in_channels, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + pred_kernel_size=pred_kernel_size, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + if self.with_objectness: + self.rtm_obj = nn.ModuleList() + for n in range(len(self.prior_generator.strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + if self.with_objectness: + self.rtm_obj.append( + nn.Conv2d( + self.feat_channels, + 1, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + + if self.share_conv: + for n in range(len(self.prior_generator.strides)): + for i in range(self.stacked_convs): + 
self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg in zip(self.rtm_cls, self.rtm_reg): + normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01) + if self.with_objectness: + for rtm_obj in self.rtm_obj: + normal_init(rtm_obj, std=0.01, bias=bias_cls) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + + - cls_scores (tuple[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + - bbox_preds (tuple[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + + cls_scores = [] + bbox_preds = [] + for idx, (x, stride) in enumerate( + zip(feats, self.prior_generator.strides)): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + if self.with_objectness: + objectness = self.rtm_obj[idx](reg_feat) + cls_score = inverse_sigmoid( + sigmoid_geometric_mean(cls_score, objectness)) + if self.exp_on_reg: + reg_dist = self.rtm_reg[idx](reg_feat).exp() * stride[0] + else: + reg_dist = self.rtm_reg[idx](reg_feat) * stride[0] + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + return tuple(cls_scores), tuple(bbox_preds) diff --git a/mmdetection/mmdet/models/dense_heads/rtmdet_ins_head.py b/mmdetection/mmdet/models/dense_heads/rtmdet_ins_head.py new file mode 100644 index 0000000..261a57f --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/rtmdet_ins_head.py @@ -0,0 +1,1034 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, is_norm +from mmcv.ops import batched_nms +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.layers.transformer import inverse_sigmoid +from mmdet.models.utils import (filter_scores_and_topk, multi_apply, + select_single_mlvl, sigmoid_geometric_mean) +from mmdet.registry import MODELS +from mmdet.structures.bbox import (cat_boxes, distance2bbox, get_box_tensor, + get_box_wh, scale_boxes) +from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean +from .rtmdet_head import RTMDetHead + + +@MODELS.register_module() +class RTMDetInsHead(RTMDetHead): + """Detection Head of RTMDet-Ins. + + Args: + num_prototypes (int): Number of mask prototype features extracted + from the mask head. Defaults to 8. + dyconv_channels (int): Channel of the dynamic conv layers. + Defaults to 8. + num_dyconvs (int): Number of the dynamic convolution layers. + Defaults to 3. + mask_loss_stride (int): Down sample stride of the masks for loss + computation. Defaults to 4. 
+ loss_mask (:obj:`ConfigDict` or dict): Config dict for mask loss. + """ + + def __init__(self, + *args, + num_prototypes: int = 8, + dyconv_channels: int = 8, + num_dyconvs: int = 3, + mask_loss_stride: int = 4, + loss_mask=dict( + type='DiceLoss', + loss_weight=2.0, + eps=5e-6, + reduction='mean'), + **kwargs) -> None: + self.num_prototypes = num_prototypes + self.num_dyconvs = num_dyconvs + self.dyconv_channels = dyconv_channels + self.mask_loss_stride = mask_loss_stride + super().__init__(*args, **kwargs) + self.loss_mask = MODELS.build(loss_mask) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + super()._init_layers() + # a branch to predict kernels of dynamic convs + self.kernel_convs = nn.ModuleList() + # calculate num dynamic parameters + weight_nums, bias_nums = [], [] + for i in range(self.num_dyconvs): + if i == 0: + weight_nums.append( + # mask prototype and coordinate features + (self.num_prototypes + 2) * self.dyconv_channels) + bias_nums.append(self.dyconv_channels * 1) + elif i == self.num_dyconvs - 1: + weight_nums.append(self.dyconv_channels * 1) + bias_nums.append(1) + else: + weight_nums.append(self.dyconv_channels * self.dyconv_channels) + bias_nums.append(self.dyconv_channels * 1) + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + pred_pad_size = self.pred_kernel_size // 2 + self.rtm_kernel = nn.Conv2d( + self.feat_channels, + self.num_gen_params, + self.pred_kernel_size, + padding=pred_pad_size) + self.mask_head = MaskFeatModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + stacked_convs=4, + num_levels=len(self.prior_generator.strides), + num_prototypes=self.num_prototypes, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale + levels, each is a 4D-tensor, the channels number is + num_gen_params. + - mask_feat (Tensor): Output feature of the mask head. Each is a + 4D-tensor, the channels number is num_prototypes. 
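+
+        Example (a worked count of the dynamic-conv parameters predicted
+            per prior, using the default ``num_prototypes=8``,
+            ``dyconv_channels=8`` and ``num_dyconvs=3``):
+
+            >>> weight_nums = [(8 + 2) * 8, 8 * 8, 8 * 1]   # [80, 64, 8]
+            >>> bias_nums = [8, 8, 1]
+            >>> sum(weight_nums) + sum(bias_nums)   # == self.num_gen_params
+            169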
+ """ + mask_feat = self.mask_head(feats) + + cls_scores = [] + bbox_preds = [] + kernel_preds = [] + for idx, (x, scale, stride) in enumerate( + zip(feats, self.scales, self.prior_generator.strides)): + cls_feat = x + reg_feat = x + kernel_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls(cls_feat) + + for kernel_layer in self.kernel_convs: + kernel_feat = kernel_layer(kernel_feat) + kernel_pred = self.rtm_kernel(kernel_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + + if self.with_objectness: + objectness = self.rtm_obj(reg_feat) + cls_score = inverse_sigmoid( + sigmoid_geometric_mean(cls_score, objectness)) + + reg_dist = scale(self.rtm_reg(reg_feat)) * stride[0] + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + kernel_preds.append(kernel_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple( + kernel_preds), mask_feat + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + kernel_preds: List[Tensor], + mask_feat: Tensor, + score_factors: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigType] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + kernel_preds (list[Tensor]): Kernel predictions of dynamic + convs for all scale levels, each is a 4D-tensor, has shape + (batch_size, num_params, H, W). + mask_feat (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, num_prototypes, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + assert len(cls_scores) == len(bbox_preds) + + if score_factors is None: + # e.g. Retina, FreeAnchor, Foveabox, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, AutoAssign, etc. 
+ with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + result_list = [] + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + cls_score_list = select_single_mlvl( + cls_scores, img_id, detach=True) + bbox_pred_list = select_single_mlvl( + bbox_preds, img_id, detach=True) + kernel_pred_list = select_single_mlvl( + kernel_preds, img_id, detach=True) + if with_score_factors: + score_factor_list = select_single_mlvl( + score_factors, img_id, detach=True) + else: + score_factor_list = [None for _ in range(num_levels)] + + results = self._predict_by_feat_single( + cls_score_list=cls_score_list, + bbox_pred_list=bbox_pred_list, + kernel_pred_list=kernel_pred_list, + mask_feat=mask_feat[img_id], + score_factor_list=score_factor_list, + mlvl_priors=mlvl_priors, + img_meta=img_meta, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + kernel_pred_list: List[Tensor], + mask_feat: Tensor, + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigType, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox and mask results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + kernel_preds (list[Tensor]): Kernel predictions of dynamic + convs for all scale levels of a single image, each is a + 4D-tensor, has shape (num_params, H, W). + mask_feat (Tensor): Mask prototype features of a single image + extracted from the mask head, has shape (num_prototypes, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. 
+ with_score_factors = True + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_kernels = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_labels = [] + if with_score_factors: + mlvl_score_factors = [] + else: + mlvl_score_factors = None + + for level_idx, (cls_score, bbox_pred, kernel_pred, + score_factor, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, kernel_pred_list, + score_factor_list, mlvl_priors)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) + if with_score_factors: + score_factor = score_factor.permute(1, 2, + 0).reshape(-1).sigmoid() + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + kernel_pred = kernel_pred.permute(1, 2, 0).reshape( + -1, self.num_gen_params) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = cls_score.softmax(-1)[:, :-1] + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + score_thr = cfg.get('score_thr', 0) + + results = filter_scores_and_topk( + scores, score_thr, nms_pre, + dict( + bbox_pred=bbox_pred, + priors=priors, + kernel_pred=kernel_pred)) + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + kernel_pred = filtered_results['kernel_pred'] + + if with_score_factors: + score_factor = score_factor[keep_idxs] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + mlvl_kernels.append(kernel_pred) + + if with_score_factors: + mlvl_score_factors.append(score_factor) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_priors) + bboxes = self.bbox_coder.decode( + priors[..., :2], bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.priors = priors + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + results.kernels = torch.cat(mlvl_kernels) + if with_score_factors: + results.score_factors = torch.cat(mlvl_score_factors) + + return self._bbox_mask_post_process( + results=results, + mask_feat=mask_feat, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def _bbox_mask_post_process( + self, + results: InstanceData, + mask_feat, + cfg: ConfigType, + rescale: bool = False, + with_nms: bool = True, + img_meta: Optional[dict] = None) -> InstanceData: + """bbox and mask post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). + cfg (ConfigDict): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default to False. + with_nms (bool): If True, do nms before return boxes. + Default to True. + img_meta (dict, optional): Image meta info. Defaults to None. 
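# Plain-PyTorch stand-in (not the real mmdet `filter_scores_and_topk`) showing
# how a per-level score map of shape (C, H, W) is flattened to (H*W, C) and
# reduced to at most `nms_pre` candidates in the loop above. The real helper
# also threads the filtered bbox/kernel/prior tensors through; this sketch
# keeps only scores, labels and prior indices.
import torch

def flatten_and_topk(cls_score, score_thr=0.05, topk=100):
    # cls_score: (num_classes, H, W) logits for one level of one image
    num_classes = cls_score.size(0)
    scores = cls_score.permute(1, 2, 0).reshape(-1, num_classes).sigmoid()
    flat_scores = scores.flatten()                    # (H*W*num_classes,)
    valid = flat_scores > score_thr
    valid_scores = flat_scores[valid]
    valid_idxs = valid.nonzero(as_tuple=False).squeeze(1)
    k = min(topk, valid_scores.numel())
    top_scores, order = valid_scores.topk(k)
    top_idxs = valid_idxs[order]
    prior_idxs = top_idxs // num_classes              # which spatial prior
    labels = top_idxs % num_classes                   # which class
    return top_scores, labels, prior_idxs

scores, labels, keep = flatten_and_topk(torch.randn(80, 20, 20))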
+ + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + stride = self.prior_generator.strides[0][0] + if rescale: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + if hasattr(results, 'score_factors'): + # TODO: Add sqrt operation in order to be consistent with + # the paper. + score_factors = results.pop('score_factors') + results.scores = results.scores * score_factors + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg + assert with_nms, 'with_nms must be True for RTMDet-Ins' + if results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + + # process masks + mask_logits = self._mask_predict_by_feat_single( + mask_feat, results.kernels, results.priors) + + mask_logits = F.interpolate( + mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear') + if rescale: + ori_h, ori_w = img_meta['ori_shape'][:2] + mask_logits = F.interpolate( + mask_logits, + size=[ + math.ceil(mask_logits.shape[-2] * scale_factor[0]), + math.ceil(mask_logits.shape[-1] * scale_factor[1]) + ], + mode='bilinear', + align_corners=False)[..., :ori_h, :ori_w] + masks = mask_logits.sigmoid().squeeze(0) + masks = masks > cfg.mask_thr_binary + results.masks = masks + else: + h, w = img_meta['ori_shape'][:2] if rescale else img_meta[ + 'img_shape'][:2] + results.masks = torch.zeros( + size=(results.bboxes.shape[0], h, w), + dtype=torch.bool, + device=results.bboxes.device) + + return results + + def parse_dynamic_params(self, flatten_kernels: Tensor) -> tuple: + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(self.weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, self.weight_nums + self.bias_nums, dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * self.dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst * + self.dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def _mask_predict_by_feat_single(self, mask_feat: Tensor, kernels: Tensor, + priors: Tensor) -> Tensor: + """Generate mask logits from mask features with dynamic convs. + + Args: + mask_feat (Tensor): Mask prototype features. + Has shape (num_prototypes, H, W). + kernels (Tensor): Kernel parameters for each instance. 
+ Has shape (num_instance, num_params) + priors (Tensor): Center priors for each instance. + Has shape (num_instance, 4). + Returns: + Tensor: Instance segmentation masks for each instance. + Has shape (num_instance, H, W). + """ + num_inst = priors.shape[0] + h, w = mask_feat.size()[-2:] + if num_inst < 1: + return torch.empty( + size=(num_inst, h, w), + dtype=mask_feat.dtype, + device=mask_feat.device) + if len(mask_feat.shape) < 4: + mask_feat.unsqueeze(0) + + coord = self.prior_generator.single_level_grid_priors( + (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2) + num_inst = priors.shape[0] + points = priors[:, :2].reshape(-1, 1, 2) + strides = priors[:, 2:].reshape(-1, 1, 2) + relative_coord = (points - coord).permute(0, 2, 1) / ( + strides[..., 0].reshape(-1, 1, 1) * 8) + relative_coord = relative_coord.reshape(num_inst, 2, h, w) + + mask_feat = torch.cat( + [relative_coord, + mask_feat.repeat(num_inst, 1, 1, 1)], dim=1) + weights, biases = self.parse_dynamic_params(kernels) + + n_layers = len(weights) + x = mask_feat.reshape(1, -1, h, w) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + x, weight, bias=bias, stride=1, padding=0, groups=num_inst) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, h, w) + return x + + def loss_mask_by_feat(self, mask_feats: Tensor, flatten_kernels: Tensor, + sampling_results_list: list, + batch_gt_instances: InstanceList) -> Tensor: + """Compute instance segmentation loss. + + Args: + mask_feats (list[Tensor]): Mask prototype features extracted from + the mask head. Has shape (N, num_prototypes, H, W) + flatten_kernels (list[Tensor]): Kernels of the dynamic conv layers. + Has shape (N, num_instances, num_params) + sampling_results_list (list[:obj:`SamplingResults`]) Batch of + assignment results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + Tensor: The mask loss tensor. 
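# Hedged sketch of the grouped-convolution trick used by
# `_mask_predict_by_feat_single` above: per-instance 1x1 kernels are applied to
# a shared prototype feature map in a single call by folding the instance
# dimension into the channel dimension and setting `groups=num_inst`. Shapes
# only; the real head also prepends two relative-coordinate channels and stacks
# several dynamic layers.
import torch
import torch.nn.functional as F

num_inst, channels, h, w = 4, 10, 32, 32
feat = torch.randn(num_inst, channels, h, w)           # per-instance inputs
weight = torch.randn(num_inst * 1, channels, 1, 1)     # one output map each
bias = torch.randn(num_inst)

x = feat.reshape(1, num_inst * channels, h, w)         # fold instances into C
out = F.conv2d(x, weight, bias=bias, groups=num_inst)  # (1, num_inst, h, w)
masks = out.reshape(num_inst, h, w)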
+ """ + batch_pos_mask_logits = [] + pos_gt_masks = [] + for idx, (mask_feat, kernels, sampling_results, + gt_instances) in enumerate( + zip(mask_feats, flatten_kernels, sampling_results_list, + batch_gt_instances)): + pos_priors = sampling_results.pos_priors + pos_inds = sampling_results.pos_inds + pos_kernels = kernels[pos_inds] # n_pos, num_gen_params + pos_mask_logits = self._mask_predict_by_feat_single( + mask_feat, pos_kernels, pos_priors) + if gt_instances.masks.numel() == 0: + gt_masks = torch.empty_like(gt_instances.masks) + else: + gt_masks = gt_instances.masks[ + sampling_results.pos_assigned_gt_inds, :] + batch_pos_mask_logits.append(pos_mask_logits) + pos_gt_masks.append(gt_masks) + + pos_gt_masks = torch.cat(pos_gt_masks, 0) + batch_pos_mask_logits = torch.cat(batch_pos_mask_logits, 0) + + # avg_factor + num_pos = batch_pos_mask_logits.shape[0] + num_pos = reduce_mean(mask_feats.new_tensor([num_pos + ])).clamp_(min=1).item() + + if batch_pos_mask_logits.shape[0] == 0: + return mask_feats.sum() * 0 + + scale = self.prior_generator.strides[0][0] // self.mask_loss_stride + # upsample pred masks + batch_pos_mask_logits = F.interpolate( + batch_pos_mask_logits.unsqueeze(0), + scale_factor=scale, + mode='bilinear', + align_corners=False).squeeze(0) + # downsample gt masks + pos_gt_masks = pos_gt_masks[:, self.mask_loss_stride // + 2::self.mask_loss_stride, + self.mask_loss_stride // + 2::self.mask_loss_stride] + + loss_mask = self.loss_mask( + batch_pos_mask_logits, + pos_gt_masks, + weight=None, + avg_factor=num_pos) + + return loss_mask + + def loss_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + kernel_preds: List[Tensor], + mask_feat: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1) + flatten_kernels = torch.cat([ + kernel_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_gen_params) + for kernel_pred in kernel_preds + ], 1) + decoded_bboxes = [] + for anchor, bbox_pred in zip(anchor_list[0], bbox_preds): + anchor = anchor.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + bbox_pred = distance2bbox(anchor, bbox_pred) + decoded_bboxes.append(bbox_pred) + + flatten_bboxes = torch.cat(decoded_bboxes, 1) + for gt_instances in batch_gt_instances: + gt_instances.masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device) + + cls_reg_targets = self.get_targets( + flatten_cls_scores, + flatten_bboxes, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + assign_metrics_list, sampling_results_list) = cls_reg_targets + + losses_cls, losses_bbox,\ + cls_avg_factors, bbox_avg_factors = multi_apply( + self.loss_by_feat_single, + cls_scores, + decoded_bboxes, + labels_list, + label_weights_list, + bbox_targets_list, + assign_metrics_list, + self.prior_generator.strides) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls)) + + bbox_avg_factor = reduce_mean( + sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + + loss_mask = self.loss_mask_by_feat(mask_feat, flatten_kernels, + sampling_results_list, + batch_gt_instances) + loss = dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_mask=loss_mask) + return loss + + +class MaskFeatModule(BaseModule): + """Mask feature head used in RTMDet-Ins. + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + num_levels (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + num_prototypes (int): Number of output channel of the mask feature + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. + stacked_convs (int): Number of convs in mask feature branch. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True) + norm_cfg (dict): Config dict for normalization layer. Default: None. 
+ """ + + def __init__( + self, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + num_levels: int = 3, + num_prototypes: int = 8, + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_cfg: ConfigType = dict(type='BN') + ) -> None: + super().__init__(init_cfg=None) + self.num_levels = num_levels + self.fusion_conv = nn.Conv2d(num_levels * in_channels, in_channels, 1) + convs = [] + for i in range(stacked_convs): + in_c = in_channels if i == 0 else feat_channels + convs.append( + ConvModule( + in_c, + feat_channels, + 3, + padding=1, + act_cfg=act_cfg, + norm_cfg=norm_cfg)) + self.stacked_convs = nn.Sequential(*convs) + self.projection = nn.Conv2d( + feat_channels, num_prototypes, kernel_size=1) + + def forward(self, features: Tuple[Tensor, ...]) -> Tensor: + # multi-level feature fusion + fusion_feats = [features[0]] + size = features[0].shape[-2:] + for i in range(1, self.num_levels): + f = F.interpolate(features[i], size=size, mode='bilinear') + fusion_feats.append(f) + fusion_feats = torch.cat(fusion_feats, dim=1) + fusion_feats = self.fusion_conv(fusion_feats) + # pred mask feats + mask_features = self.stacked_convs(fusion_feats) + mask_features = self.projection(mask_features) + return mask_features + + +@MODELS.register_module() +class RTMDetInsSepBNHead(RTMDetInsHead): + """Detection Head of RTMDet-Ins with sep-bn layers. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + norm_cfg (:obj:`ConfigDict` or dict)): Config dict for normalization + layer. Defaults to dict(type='BN'). + act_cfg (:obj:`ConfigDict` or dict)): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + share_conv: bool = True, + with_objectness: bool = False, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + pred_kernel_size: int = 1, + **kwargs) -> None: + self.share_conv = share_conv + super().__init__( + num_classes, + in_channels, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + pred_kernel_size=pred_kernel_size, + with_objectness=with_objectness, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.kernel_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + self.rtm_kernel = nn.ModuleList() + self.rtm_obj = nn.ModuleList() + + # calculate num dynamic parameters + weight_nums, bias_nums = [], [] + for i in range(self.num_dyconvs): + if i == 0: + weight_nums.append( + (self.num_prototypes + 2) * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + elif i == self.num_dyconvs - 1: + weight_nums.append(self.dyconv_channels) + bias_nums.append(1) + else: + weight_nums.append(self.dyconv_channels * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + pred_pad_size = self.pred_kernel_size // 2 + + for n in range(len(self.prior_generator.strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + kernel_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(cls_convs) + self.kernel_convs.append(kernel_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_kernel.append( + nn.Conv2d( + self.feat_channels, + self.num_gen_params, + self.pred_kernel_size, + padding=pred_pad_size)) + if self.with_objectness: + self.rtm_obj.append( + nn.Conv2d( + self.feat_channels, + 1, + self.pred_kernel_size, + padding=pred_pad_size)) + + if self.share_conv: + for n in range(len(self.prior_generator.strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + self.mask_head = MaskFeatModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + stacked_convs=4, + num_levels=len(self.prior_generator.strides), + num_prototypes=self.num_prototypes, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + 
constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg, rtm_kernel in zip(self.rtm_cls, self.rtm_reg, + self.rtm_kernel): + normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01, bias=1) + if self.with_objectness: + for rtm_obj in self.rtm_obj: + normal_init(rtm_obj, std=0.01, bias=bias_cls) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale + levels, each is a 4D-tensor, the channels number is + num_gen_params. + - mask_feat (Tensor): Output feature of the mask head. Each is a + 4D-tensor, the channels number is num_prototypes. + """ + mask_feat = self.mask_head(feats) + + cls_scores = [] + bbox_preds = [] + kernel_preds = [] + for idx, (x, stride) in enumerate( + zip(feats, self.prior_generator.strides)): + cls_feat = x + reg_feat = x + kernel_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for kernel_layer in self.kernel_convs[idx]: + kernel_feat = kernel_layer(kernel_feat) + kernel_pred = self.rtm_kernel[idx](kernel_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + if self.with_objectness: + objectness = self.rtm_obj[idx](reg_feat) + cls_score = inverse_sigmoid( + sigmoid_geometric_mean(cls_score, objectness)) + + reg_dist = F.relu(self.rtm_reg[idx](reg_feat)) * stride[0] + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + kernel_preds.append(kernel_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple( + kernel_preds), mask_feat diff --git a/mmdetection/mmdet/models/dense_heads/sabl_retina_head.py b/mmdetection/mmdet/models/dense_heads/sabl_retina_head.py new file mode 100644 index 0000000..8cd1b71 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/sabl_retina_head.py @@ -0,0 +1,706 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList) +from ..task_modules.samplers import PseudoSampler +from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply, + unmap) +from .base_dense_head import BaseDenseHead +from .guided_anchor_head import GuidedAnchorHead + + +@MODELS.register_module() +class SABLRetinaHead(BaseDenseHead): + """Side-Aware Boundary Localization (SABL) for RetinaNet. + + The anchor generation, assigning and sampling in SABLRetinaHead + are the same as GuidedAnchorHead for guided anchoring. + + Please refer to https://arxiv.org/abs/1912.04260 for more details. + + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. 
+ stacked_convs (int): Number of Convs for classification and + regression branches. Defaults to 4. + feat_channels (int): Number of hidden channels. Defaults to 256. + approx_anchor_generator (:obj:`ConfigType` or dict): Config dict for + approx generator. + square_anchor_generator (:obj:`ConfigDict` or dict): Config dict for + square generator. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + ConvModule. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + Norm Layer. Defaults to None. + bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Default False. It should be ``True`` when + using ``IoULoss``, ``GIoULoss``, or ``DIoULoss`` in the bbox head. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + SABLRetinaHead. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + SABLRetinaHead. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox_cls (:obj:`ConfigDict` or dict): Config of classification + loss for bbox branch. + loss_bbox_reg (:obj:`ConfigDict` or dict): Config of regression loss + for bbox branch. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. + """ + + def __init__( + self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + feat_channels: int = 256, + approx_anchor_generator: ConfigType = dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator: ConfigType = dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + bbox_coder: ConfigType = dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + reg_decoded_bbox: bool = False, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg: ConfigType = dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5), + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', name='retina_cls', std=0.01, bias_prob=0.01)) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.num_buckets = bbox_coder['num_buckets'] + self.side_num = int(np.ceil(self.num_buckets / 2)) + + assert (approx_anchor_generator['octave_base_scale'] == + square_anchor_generator['scales'][0]) + assert (approx_anchor_generator['strides'] == + square_anchor_generator['strides']) + + self.approx_anchor_generator = TASK_UTILS.build( + approx_anchor_generator) + self.square_anchor_generator = TASK_UTILS.build( + square_anchor_generator) + self.approxs_per_octave = ( + self.approx_anchor_generator.num_base_priors[0]) + + # one anchor per location + self.num_base_priors = self.square_anchor_generator.num_base_priors[0] + + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.reg_decoded_bbox 
= reg_decoded_bbox + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox_cls = MODELS.build(loss_bbox_cls) + self.loss_bbox_reg = MODELS.build(loss_bbox_reg) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + # use PseudoSampler when sampling is False + if 'sampler' in self.train_cfg: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self._init_layers() + + def _init_layers(self) -> None: + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.retina_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.retina_bbox_reg = nn.Conv2d( + self.feat_channels, self.side_num * 4, 3, padding=1) + self.retina_bbox_cls = nn.Conv2d( + self.feat_channels, self.side_num * 4, 3, padding=1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.retina_cls(cls_feat) + bbox_cls_pred = self.retina_bbox_cls(reg_feat) + bbox_reg_pred = self.retina_bbox_reg(reg_feat) + bbox_pred = (bbox_cls_pred, bbox_reg_pred) + return cls_score, bbox_pred + + def forward(self, feats: List[Tensor]) -> Tuple[List[Tensor]]: + return multi_apply(self.forward_single, feats) + + def get_anchors( + self, + featmap_sizes: List[tuple], + img_metas: List[dict], + device: Union[torch.device, str] = 'cuda' + ) -> Tuple[List[List[Tensor]], List[List[Tensor]]]: + """Get squares according to feature map sizes and guided anchors. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + img_metas (list[dict]): Image meta info. + device (torch.device | str): device for returned tensors + + Returns: + tuple: square approxs of each image + """ + num_imgs = len(img_metas) + + # since feature map sizes of all images are the same, we only compute + # squares for one time + multi_level_squares = self.square_anchor_generator.grid_priors( + featmap_sizes, device=device) + squares_list = [multi_level_squares for _ in range(num_imgs)] + + return squares_list + + def get_targets(self, + approx_list: List[List[Tensor]], + inside_flag_list: List[List[Tensor]], + square_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas, + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs=True) -> tuple: + """Compute bucketing targets. + + Args: + approx_list (list[list[Tensor]]): Multi level approxs of each + image. + inside_flag_list (list[list[Tensor]]): Multi level inside flags of + each image. + square_list (list[list[Tensor]]): Multi level squares of each + image. 
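# Worked sketch of the SABL output channels set up in `_init_layers` above,
# assuming the default BucketingBBoxCoder (num_buckets=14): each of the four
# box sides gets side_num = ceil(14 / 2) = 7 buckets, so both the bucket
# classification and the bucket regression branches predict side_num * 4 = 28
# channels for the single square anchor at every location.
import math
import torch.nn as nn

num_buckets = 14
side_num = int(math.ceil(num_buckets / 2))                     # 7
retina_bbox_cls = nn.Conv2d(256, side_num * 4, 3, padding=1)   # 28 channels
retina_bbox_reg = nn.Conv2d(256, side_num * 4, 3, padding=1)   # 28 channels
assert side_num * 4 == 28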
+ batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: Returns a tuple containing learning targets. + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each level. + - bbox_cls_targets_list (list[Tensor]): BBox cls targets of \ + each level. + - bbox_cls_weights_list (list[Tensor]): BBox cls weights of \ + each level. + - bbox_reg_targets_list (list[Tensor]): BBox reg targets of \ + each level. + - bbox_reg_weights_list (list[Tensor]): BBox reg weights of \ + each level. + - num_total_pos (int): Number of positive samples in all images. + - num_total_neg (int): Number of negative samples in all images. + """ + num_imgs = len(batch_img_metas) + assert len(approx_list) == len(inside_flag_list) == len( + square_list) == num_imgs + # anchor number of multi levels + num_level_squares = [squares.size(0) for squares in square_list[0]] + # concat all level anchors and flags to a single tensor + inside_flag_flat_list = [] + approx_flat_list = [] + square_flat_list = [] + for i in range(num_imgs): + assert len(square_list[i]) == len(inside_flag_list[i]) + inside_flag_flat_list.append(torch.cat(inside_flag_list[i])) + approx_flat_list.append(torch.cat(approx_list[i])) + square_flat_list.append(torch.cat(square_list[i])) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None for _ in range(num_imgs)] + (all_labels, all_label_weights, all_bbox_cls_targets, + all_bbox_cls_weights, all_bbox_reg_targets, all_bbox_reg_weights, + pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply( + self._get_targets_single, + approx_flat_list, + inside_flag_flat_list, + square_flat_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + + # sampled anchors of all images + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + labels_list = images_to_levels(all_labels, num_level_squares) + label_weights_list = images_to_levels(all_label_weights, + num_level_squares) + bbox_cls_targets_list = images_to_levels(all_bbox_cls_targets, + num_level_squares) + bbox_cls_weights_list = images_to_levels(all_bbox_cls_weights, + num_level_squares) + bbox_reg_targets_list = images_to_levels(all_bbox_reg_targets, + num_level_squares) + bbox_reg_weights_list = images_to_levels(all_bbox_reg_weights, + num_level_squares) + return (labels_list, label_weights_list, bbox_cls_targets_list, + bbox_cls_weights_list, bbox_reg_targets_list, + bbox_reg_weights_list, avg_factor) + + def _get_targets_single(self, + flat_approxs: Tensor, + inside_flags: Tensor, + flat_squares: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in a + single image. 
+ + Args: + flat_approxs (Tensor): flat approxs of a single image, + shape (n, 4) + inside_flags (Tensor): inside flags of a single image, + shape (n, ). + flat_squares (Tensor): flat squares of a single image, + shape (approxs_per_octave * n, 4) + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: + + - labels_list (Tensor): Labels in a single image. + - label_weights (Tensor): Label weights in a single image. + - bbox_cls_targets (Tensor): BBox cls targets in a single image. + - bbox_cls_weights (Tensor): BBox cls weights in a single image. + - bbox_reg_targets (Tensor): BBox reg targets in a single image. + - bbox_reg_weights (Tensor): BBox reg weights in a single image. + - num_total_pos (int): Number of positive samples in a single \ + image. + - num_total_neg (int): Number of negative samples in a single \ + image. + - sampling_result (:obj:`SamplingResult`): Sampling result object. + """ + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + num_square = flat_squares.size(0) + approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4) + approxs = approxs[inside_flags, ...] + squares = flat_squares[inside_flags, :] + + pred_instances = InstanceData() + pred_instances.priors = squares + pred_instances.approxs = approxs + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_squares = squares.shape[0] + bbox_cls_targets = squares.new_zeros( + (num_valid_squares, self.side_num * 4)) + bbox_cls_weights = squares.new_zeros( + (num_valid_squares, self.side_num * 4)) + bbox_reg_targets = squares.new_zeros( + (num_valid_squares, self.side_num * 4)) + bbox_reg_weights = squares.new_zeros( + (num_valid_squares, self.side_num * 4)) + labels = squares.new_full((num_valid_squares, ), + self.num_classes, + dtype=torch.long) + label_weights = squares.new_zeros(num_valid_squares, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + (pos_bbox_reg_targets, pos_bbox_reg_weights, pos_bbox_cls_targets, + pos_bbox_cls_weights) = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + + bbox_cls_targets[pos_inds, :] = pos_bbox_cls_targets + bbox_reg_targets[pos_inds, :] = pos_bbox_reg_targets + bbox_cls_weights[pos_inds, :] = pos_bbox_cls_weights + bbox_reg_weights[pos_inds, :] = pos_bbox_reg_weights + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_squares.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, 
fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_cls_targets = unmap(bbox_cls_targets, num_total_anchors, + inside_flags) + bbox_cls_weights = unmap(bbox_cls_weights, num_total_anchors, + inside_flags) + bbox_reg_targets = unmap(bbox_reg_targets, num_total_anchors, + inside_flags) + bbox_reg_weights = unmap(bbox_reg_weights, num_total_anchors, + inside_flags) + return (labels, label_weights, bbox_cls_targets, bbox_cls_weights, + bbox_reg_targets, bbox_reg_weights, pos_inds, neg_inds, + sampling_result) + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + labels: Tensor, label_weights: Tensor, + bbox_cls_targets: Tensor, bbox_cls_weights: Tensor, + bbox_reg_targets: Tensor, bbox_reg_weights: Tensor, + avg_factor: float) -> Tuple[Tensor]: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + labels (Tensor): Labels in a single image. + label_weights (Tensor): Label weights in a single level. + bbox_cls_targets (Tensor): BBox cls targets in a single level. + bbox_cls_weights (Tensor): BBox cls weights in a single level. + bbox_reg_targets (Tensor): BBox reg targets in a single level. + bbox_reg_weights (Tensor): BBox reg weights in a single level. + avg_factor (int): Average factor that is used to average the loss. + + Returns: + tuple: loss components. + """ + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + # regression loss + bbox_cls_targets = bbox_cls_targets.reshape(-1, self.side_num * 4) + bbox_cls_weights = bbox_cls_weights.reshape(-1, self.side_num * 4) + bbox_reg_targets = bbox_reg_targets.reshape(-1, self.side_num * 4) + bbox_reg_weights = bbox_reg_weights.reshape(-1, self.side_num * 4) + (bbox_cls_pred, bbox_reg_pred) = bbox_pred + bbox_cls_pred = bbox_cls_pred.permute(0, 2, 3, 1).reshape( + -1, self.side_num * 4) + bbox_reg_pred = bbox_reg_pred.permute(0, 2, 3, 1).reshape( + -1, self.side_num * 4) + loss_bbox_cls = self.loss_bbox_cls( + bbox_cls_pred, + bbox_cls_targets.long(), + bbox_cls_weights, + avg_factor=avg_factor * 4 * self.side_num) + loss_bbox_reg = self.loss_bbox_reg( + bbox_reg_pred, + bbox_reg_targets, + bbox_reg_weights, + avg_factor=avg_factor * 4 * self.bbox_coder.offset_topk) + return loss_cls, loss_bbox_cls, loss_bbox_reg + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
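# Hedged re-implementation (for illustration only) of the `unmap` helper used
# above: targets computed only for anchors inside the image are scattered back
# to the full anchor set, with `fill` written to the positions that were
# filtered out by `inside_flags`.
import torch

def unmap_sketch(data, count, inds, fill=0):
    if data.dim() == 1:
        out = data.new_full((count, ), fill)
    else:
        out = data.new_full((count, ) + data.shape[1:], fill)
    out[inds.type(torch.bool)] = data
    return out

inside = torch.tensor([True, False, True, True])
labels_inside = torch.tensor([3, 7, 80])
full_labels = unmap_sketch(labels_inside, 4, inside, fill=80)
# full_labels == tensor([ 3, 80,  7, 80])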
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.approx_anchor_generator.num_levels + + device = cls_scores[0].device + + # get sampled approxes + approxs_list, inside_flag_list = GuidedAnchorHead.get_sampled_approxs( + self, featmap_sizes, batch_img_metas, device=device) + + square_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + approxs_list, + inside_flag_list, + square_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (labels_list, label_weights_list, bbox_cls_targets_list, + bbox_cls_weights_list, bbox_reg_targets_list, bbox_reg_weights_list, + avg_factor) = cls_reg_targets + + losses_cls, losses_bbox_cls, losses_bbox_reg = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_cls_targets_list, + bbox_cls_weights_list, + bbox_reg_targets_list, + bbox_reg_weights_list, + avg_factor=avg_factor) + return dict( + loss_cls=losses_cls, + loss_bbox_cls=losses_bbox_cls, + loss_bbox_reg=losses_bbox_reg) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_img_metas: List[dict], + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + cfg (:obj:`ConfigDict`, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
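# Hedged re-implementation (for illustration only) of the `multi_apply` helper
# used in `loss_by_feat` above: it maps a function over per-level inputs and
# transposes the per-call result tuples into a tuple of per-field lists, which
# is why the call site unpacks one list per loss term.
from functools import partial

def multi_apply_sketch(func, *args, **kwargs):
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))

def add_and_mul(a, b, scale=1):
    return a + b, a * b * scale

sums, prods = multi_apply_sketch(add_and_mul, [1, 2, 3], [4, 5, 6], scale=2)
# sums  == [5, 7, 9]
# prods == [8, 20, 36]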
+ """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + + device = cls_scores[0].device + mlvl_anchors = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_cls_pred_list = [ + bbox_preds[i][0][img_id].detach() for i in range(num_levels) + ] + bbox_reg_pred_list = [ + bbox_preds[i][1][img_id].detach() for i in range(num_levels) + ] + proposals = self._predict_by_feat_single( + cls_scores=cls_score_list, + bbox_cls_preds=bbox_cls_pred_list, + bbox_reg_preds=bbox_reg_pred_list, + mlvl_anchors=mlvl_anchors[img_id], + img_meta=batch_img_metas[img_id], + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(proposals) + return result_list + + def _predict_by_feat_single(self, + cls_scores: List[Tensor], + bbox_cls_preds: List[Tensor], + bbox_reg_preds: List[Tensor], + mlvl_anchors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + cfg = self.test_cfg if cfg is None else cfg + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_confids = [] + mlvl_labels = [] + assert len(cls_scores) == len(bbox_cls_preds) == len( + bbox_reg_preds) == len(mlvl_anchors) + for cls_score, bbox_cls_pred, bbox_reg_pred, anchors in zip( + cls_scores, bbox_cls_preds, bbox_reg_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_cls_pred.size( + )[-2:] == bbox_reg_pred.size()[-2::] + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1)[:, :-1] + bbox_cls_pred = bbox_cls_pred.permute(1, 2, 0).reshape( + -1, self.side_num * 4) + bbox_reg_pred = bbox_reg_pred.permute(1, 2, 0).reshape( + -1, self.side_num * 4) + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. 
+ results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict( + anchors=anchors, + bbox_cls_pred=bbox_cls_pred, + bbox_reg_pred=bbox_reg_pred)) + scores, labels, _, filtered_results = results + + anchors = filtered_results['anchors'] + bbox_cls_pred = filtered_results['bbox_cls_pred'] + bbox_reg_pred = filtered_results['bbox_reg_pred'] + + bbox_preds = [ + bbox_cls_pred.contiguous(), + bbox_reg_pred.contiguous() + ] + bboxes, confids = self.bbox_coder.decode( + anchors.contiguous(), + bbox_preds, + max_shape=img_meta['img_shape']) + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_confids.append(confids) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.score_factors = torch.cat(mlvl_confids) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) diff --git a/mmdetection/mmdet/models/dense_heads/solo_head.py b/mmdetection/mmdet/models/dense_heads/solo_head.py new file mode 100644 index 0000000..8cf3384 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/solo_head.py @@ -0,0 +1,1263 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import mmcv +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.utils.misc import floordiv +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType +from ..layers import mask_matrix_nms +from ..utils import center_of_mass, generate_coordinate, multi_apply +from .base_mask_head import BaseMaskHead + + +@MODELS.register_module() +class SOLOHead(BaseMaskHead): + """SOLO mask head used in `SOLO: Segmenting Objects by Locations. + + `_ + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + Defaults to 256. + stacked_convs (int): Number of stacking convs of the head. + Defaults to 4. + strides (tuple): Downsample factor of each feature map. + scale_ranges (tuple[tuple[int, int]]): Area range of multiple + level masks, in the format [(min1, max1), (min2, max2), ...]. + A range of (16, 64) means the area range between (16, 64). + pos_scale (float): Constant scale factor to control the center region. + num_grids (list[int]): Divided image into a uniform grids, each + feature map has a different grid value. The number of output + channels is grid ** 2. Defaults to [40, 36, 24, 16, 12]. + cls_down_index (int): The index of downsample operation in + classification branch. Defaults to 0. + loss_mask (dict): Config of mask loss. + loss_cls (dict): Config of classification loss. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to norm_cfg=dict(type='GN', num_groups=32, + requires_grad=True). + train_cfg (dict): Training config of head. + test_cfg (dict): Testing config of head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
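# Hedged sketch of how `scale_ranges` above is typically used to pick FPN
# levels for a ground-truth instance in SOLO: an object whose sqrt(area) falls
# inside a level's (min, max) range is assigned to that level, and because the
# ranges overlap one object can be assigned to more than one level. The exact
# assignment lives in `_get_targets_single`; this only illustrates the range
# test.
import math

scale_ranges = ((8, 32), (16, 64), (32, 128), (64, 256), (128, 512))
gt_w, gt_h = 40.0, 30.0
gt_scale = math.sqrt(gt_w * gt_h)              # ~34.6
levels = [i for i, (lo, hi) in enumerate(scale_ranges) if lo <= gt_scale <= hi]
# levels == [1, 2] for these numbers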
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + strides: tuple = (4, 8, 16, 32, 64), + scale_ranges: tuple = ((8, 32), (16, 64), (32, 128), (64, 256), (128, + 512)), + pos_scale: float = 0.2, + num_grids: list = [40, 36, 24, 16, 12], + cls_down_index: int = 0, + loss_mask: ConfigType = dict( + type='DiceLoss', use_sigmoid=True, loss_weight=3.0), + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_cls')) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.cls_out_channels = self.num_classes + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.num_grids = num_grids + # number of FPN feats + self.num_levels = len(strides) + assert self.num_levels == len(scale_ranges) == len(num_grids) + self.scale_ranges = scale_ranges + self.pos_scale = pos_scale + + self.cls_down_index = cls_down_index + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.norm_cfg = norm_cfg + self.init_cfg = init_cfg + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.mask_convs = nn.ModuleList() + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels + 2 if i == 0 else self.feat_channels + self.mask_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + self.conv_mask_list = nn.ModuleList() + for num_grid in self.num_grids: + self.conv_mask_list.append( + nn.Conv2d(self.feat_channels, num_grid**2, 1)) + + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def resize_feats(self, x: Tuple[Tensor]) -> List[Tensor]: + """Downsample the first feat and upsample last feat in feats. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + list[Tensor]: Features after resizing, each is a 4D-tensor. + """ + out = [] + for i in range(len(x)): + if i == 0: + out.append( + F.interpolate(x[0], scale_factor=0.5, mode='bilinear')) + elif i == len(x) - 1: + out.append( + F.interpolate( + x[i], size=x[i - 1].shape[-2:], mode='bilinear')) + else: + out.append(x[i]) + return out + + def forward(self, x: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and mask prediction. + + - mlvl_mask_preds (list[Tensor]): Multi-level mask prediction. + Each element in the list has shape + (batch_size, num_grids**2 ,h ,w). + - mlvl_cls_preds (list[Tensor]): Multi-level scores. 
+ Each element in the list has shape + (batch_size, num_classes, num_grids ,num_grids). + """ + assert len(x) == self.num_levels + feats = self.resize_feats(x) + mlvl_mask_preds = [] + mlvl_cls_preds = [] + for i in range(self.num_levels): + x = feats[i] + mask_feat = x + cls_feat = x + # generate and concat the coordinate + coord_feat = generate_coordinate(mask_feat.size(), + mask_feat.device) + mask_feat = torch.cat([mask_feat, coord_feat], 1) + + for mask_layer in (self.mask_convs): + mask_feat = mask_layer(mask_feat) + + mask_feat = F.interpolate( + mask_feat, scale_factor=2, mode='bilinear') + mask_preds = self.conv_mask_list[i](mask_feat) + + # cls branch + for j, cls_layer in enumerate(self.cls_convs): + if j == self.cls_down_index: + num_grid = self.num_grids[i] + cls_feat = F.interpolate( + cls_feat, size=num_grid, mode='bilinear') + cls_feat = cls_layer(cls_feat) + + cls_pred = self.conv_cls(cls_feat) + + if not self.training: + feat_wh = feats[0].size()[-2:] + upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2) + mask_preds = F.interpolate( + mask_preds.sigmoid(), size=upsampled_size, mode='bilinear') + cls_pred = cls_pred.sigmoid() + # get local maximum + local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1) + keep_mask = local_max[:, :, :-1, :-1] == cls_pred + cls_pred = cls_pred * keep_mask + + mlvl_mask_preds.append(mask_preds) + mlvl_cls_preds.append(cls_pred) + return mlvl_mask_preds, mlvl_cls_preds + + def loss_by_feat(self, mlvl_mask_preds: List[Tensor], + mlvl_cls_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mlvl_mask_preds (list[Tensor]): Multi-level mask prediction. + Each element in the list has shape + (batch_size, num_grids**2 ,h ,w). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
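A minimal shape sketch of how this loss flattens the per-level grids into one long list of candidate cells (dummy tensors; num_classes = 80 and batch size 2 are assumptions for illustration, not values taken from this patch):

import torch

# Shape bookkeeping for the default SOLO grids (assumed num_classes = 80,
# batch size 2). Per-level classification maps come out of `forward` as
# (batch_size, num_classes, num_grid, num_grid) during training.
num_grids = [40, 36, 24, 16, 12]
num_classes, batch_size = 80, 2
mlvl_cls_preds = [
    torch.randn(batch_size, num_classes, g, g) for g in num_grids
]

# `loss_by_feat` permutes and reshapes every level so that each grid cell
# becomes one row before the focal loss is applied.
flatten_cls_preds = torch.cat([
    p.permute(0, 2, 3, 1).reshape(-1, num_classes) for p in mlvl_cls_preds
])
print(sum(g * g for g in num_grids))   # 3872 candidate cells per image
print(flatten_cls_preds.shape)         # torch.Size([7744, 80]) for 2 images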
+ """ + num_levels = self.num_levels + num_imgs = len(batch_img_metas) + + featmap_sizes = [featmap.size()[-2:] for featmap in mlvl_mask_preds] + + # `BoolTensor` in `pos_masks` represent + # whether the corresponding point is + # positive + pos_mask_targets, labels, pos_masks = multi_apply( + self._get_targets_single, + batch_gt_instances, + featmap_sizes=featmap_sizes) + + # change from the outside list meaning multi images + # to the outside list meaning multi levels + mlvl_pos_mask_targets = [[] for _ in range(num_levels)] + mlvl_pos_mask_preds = [[] for _ in range(num_levels)] + mlvl_pos_masks = [[] for _ in range(num_levels)] + mlvl_labels = [[] for _ in range(num_levels)] + for img_id in range(num_imgs): + assert num_levels == len(pos_mask_targets[img_id]) + for lvl in range(num_levels): + mlvl_pos_mask_targets[lvl].append( + pos_mask_targets[img_id][lvl]) + mlvl_pos_mask_preds[lvl].append( + mlvl_mask_preds[lvl][img_id, pos_masks[img_id][lvl], ...]) + mlvl_pos_masks[lvl].append(pos_masks[img_id][lvl].flatten()) + mlvl_labels[lvl].append(labels[img_id][lvl].flatten()) + + # cat multiple image + temp_mlvl_cls_preds = [] + for lvl in range(num_levels): + mlvl_pos_mask_targets[lvl] = torch.cat( + mlvl_pos_mask_targets[lvl], dim=0) + mlvl_pos_mask_preds[lvl] = torch.cat( + mlvl_pos_mask_preds[lvl], dim=0) + mlvl_pos_masks[lvl] = torch.cat(mlvl_pos_masks[lvl], dim=0) + mlvl_labels[lvl] = torch.cat(mlvl_labels[lvl], dim=0) + temp_mlvl_cls_preds.append(mlvl_cls_preds[lvl].permute( + 0, 2, 3, 1).reshape(-1, self.cls_out_channels)) + + num_pos = sum(item.sum() for item in mlvl_pos_masks) + # dice loss + loss_mask = [] + for pred, target in zip(mlvl_pos_mask_preds, mlvl_pos_mask_targets): + if pred.size()[0] == 0: + loss_mask.append(pred.sum().unsqueeze(0)) + continue + loss_mask.append( + self.loss_mask(pred, target, reduction_override='none')) + if num_pos > 0: + loss_mask = torch.cat(loss_mask).sum() / num_pos + else: + loss_mask = torch.cat(loss_mask).mean() + + flatten_labels = torch.cat(mlvl_labels) + flatten_cls_preds = torch.cat(temp_mlvl_cls_preds) + loss_cls = self.loss_cls( + flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1) + return dict(loss_mask=loss_mask, loss_cls=loss_cls) + + def _get_targets_single(self, + gt_instances: InstanceData, + featmap_sizes: Optional[list] = None) -> tuple: + """Compute targets for predictions of single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + featmap_sizes (list[:obj:`torch.size`]): Size of each + feature map from feature pyramid, each element + means (feat_h, feat_w). Defaults to None. + + Returns: + Tuple: Usually returns a tuple containing targets for predictions. + + - mlvl_pos_mask_targets (list[Tensor]): Each element represent + the binary mask targets for positive points in this + level, has shape (num_pos, out_h, out_w). + - mlvl_labels (list[Tensor]): Each element is + classification labels for all + points in this level, has shape + (num_grid, num_grid). + - mlvl_pos_masks (list[Tensor]): Each element is + a `BoolTensor` to represent whether the + corresponding point in single level + is positive, has shape (num_grid **2). 
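A worked sketch of the centre-to-grid assignment performed in the body of this method, with made-up numbers (an 800x800 upsampled size, a 40x40 grid, and half-ranges that already include `pos_scale`); the real code uses `floordiv(..., rounding_mode='trunc')`, which reduces to plain truncation for the positive values used here:

# Toy centre-to-grid assignment (made-up numbers).
num_grid = 40
upsampled_h, upsampled_w = 800, 800
center_h, center_w = 420.0, 135.0        # mass centre of one GT mask
pos_h_range, pos_w_range = 30.0, 18.0    # 0.5 * box height/width * pos_scale

coord_h = int(center_h / upsampled_h * num_grid)   # 21
coord_w = int(center_w / upsampled_w * num_grid)   # 6

# Cells covered by the shrunk centre region, clamped to the grid ...
top_box = max(0, int((center_h - pos_h_range) / upsampled_h * num_grid))
down_box = min(num_grid - 1, int((center_h + pos_h_range) / upsampled_h * num_grid))
left_box = max(0, int((center_w - pos_w_range) / upsampled_w * num_grid))
right_box = min(num_grid - 1, int((center_w + pos_w_range) / upsampled_w * num_grid))

# ... and additionally limited to one cell on each side of the centre cell.
top, down = max(top_box, coord_h - 1), min(down_box, coord_h + 1)
left, right = max(left_box, coord_w - 1), min(right_box, coord_w + 1)
print(top, down, left, right)   # 20 22 5 7 -> a 3x3 block around cell (21, 6)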
+ """ + gt_labels = gt_instances.labels + device = gt_labels.device + + gt_bboxes = gt_instances.bboxes + gt_areas = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device) + + mlvl_pos_mask_targets = [] + mlvl_labels = [] + mlvl_pos_masks = [] + for (lower_bound, upper_bound), stride, featmap_size, num_grid \ + in zip(self.scale_ranges, self.strides, + featmap_sizes, self.num_grids): + + mask_target = torch.zeros( + [num_grid**2, featmap_size[0], featmap_size[1]], + dtype=torch.uint8, + device=device) + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + labels = torch.zeros([num_grid, num_grid], + dtype=torch.int64, + device=device) + self.num_classes + pos_mask = torch.zeros([num_grid**2], + dtype=torch.bool, + device=device) + + gt_inds = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero().flatten() + if len(gt_inds) == 0: + mlvl_pos_mask_targets.append( + mask_target.new_zeros(0, featmap_size[0], featmap_size[1])) + mlvl_labels.append(labels) + mlvl_pos_masks.append(pos_mask) + continue + hit_gt_bboxes = gt_bboxes[gt_inds] + hit_gt_labels = gt_labels[gt_inds] + hit_gt_masks = gt_masks[gt_inds, ...] + + pos_w_ranges = 0.5 * (hit_gt_bboxes[:, 2] - + hit_gt_bboxes[:, 0]) * self.pos_scale + pos_h_ranges = 0.5 * (hit_gt_bboxes[:, 3] - + hit_gt_bboxes[:, 1]) * self.pos_scale + + # Make sure hit_gt_masks has a value + valid_mask_flags = hit_gt_masks.sum(dim=-1).sum(dim=-1) > 0 + output_stride = stride / 2 + + for gt_mask, gt_label, pos_h_range, pos_w_range, \ + valid_mask_flag in \ + zip(hit_gt_masks, hit_gt_labels, pos_h_ranges, + pos_w_ranges, valid_mask_flags): + if not valid_mask_flag: + continue + upsampled_size = (featmap_sizes[0][0] * 4, + featmap_sizes[0][1] * 4) + center_h, center_w = center_of_mass(gt_mask) + + coord_w = int( + floordiv((center_w / upsampled_size[1]), (1. / num_grid), + rounding_mode='trunc')) + coord_h = int( + floordiv((center_h / upsampled_size[0]), (1. / num_grid), + rounding_mode='trunc')) + + # left, top, right, down + top_box = max( + 0, + int( + floordiv( + (center_h - pos_h_range) / upsampled_size[0], + (1. / num_grid), + rounding_mode='trunc'))) + down_box = min( + num_grid - 1, + int( + floordiv( + (center_h + pos_h_range) / upsampled_size[0], + (1. / num_grid), + rounding_mode='trunc'))) + left_box = max( + 0, + int( + floordiv( + (center_w - pos_w_range) / upsampled_size[1], + (1. / num_grid), + rounding_mode='trunc'))) + right_box = min( + num_grid - 1, + int( + floordiv( + (center_w + pos_w_range) / upsampled_size[1], + (1. / num_grid), + rounding_mode='trunc'))) + + top = max(top_box, coord_h - 1) + down = min(down_box, coord_h + 1) + left = max(coord_w - 1, left_box) + right = min(right_box, coord_w + 1) + + labels[top:(down + 1), left:(right + 1)] = gt_label + # ins + gt_mask = np.uint8(gt_mask.cpu().numpy()) + # Follow the original implementation, F.interpolate is + # different from cv2 and opencv + gt_mask = mmcv.imrescale(gt_mask, scale=1. / output_stride) + gt_mask = torch.from_numpy(gt_mask).to(device=device) + + for i in range(top, down + 1): + for j in range(left, right + 1): + index = int(i * num_grid + j) + mask_target[index, :gt_mask.shape[0], :gt_mask. 
+ shape[1]] = gt_mask + pos_mask[index] = True + mlvl_pos_mask_targets.append(mask_target[pos_mask]) + mlvl_labels.append(labels) + mlvl_pos_masks.append(pos_mask) + return mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks + + def predict_by_feat(self, mlvl_mask_preds: List[Tensor], + mlvl_cls_scores: List[Tensor], + batch_img_metas: List[dict], **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mlvl_mask_preds (list[Tensor]): Multi-level mask prediction. + Each element in the list has shape + (batch_size, num_grids**2 ,h ,w). + mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes, num_grids ,num_grids). + batch_img_metas (list[dict]): Meta information of all images. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images.Each :obj:`InstanceData` usually contains + following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + mlvl_cls_scores = [ + item.permute(0, 2, 3, 1) for item in mlvl_cls_scores + ] + assert len(mlvl_mask_preds) == len(mlvl_cls_scores) + num_levels = len(mlvl_cls_scores) + + results_list = [] + for img_id in range(len(batch_img_metas)): + cls_pred_list = [ + mlvl_cls_scores[lvl][img_id].view(-1, self.cls_out_channels) + for lvl in range(num_levels) + ] + mask_pred_list = [ + mlvl_mask_preds[lvl][img_id] for lvl in range(num_levels) + ] + + cls_pred_list = torch.cat(cls_pred_list, dim=0) + mask_pred_list = torch.cat(mask_pred_list, dim=0) + img_meta = batch_img_metas[img_id] + + results = self._predict_by_feat_single( + cls_pred_list, mask_pred_list, img_meta=img_meta) + results_list.append(results) + + return results_list + + def _predict_by_feat_single(self, + cls_scores: Tensor, + mask_preds: Tensor, + img_meta: dict, + cfg: OptConfigType = None) -> InstanceData: + """Transform a single image's features extracted from the head into + mask results. + + Args: + cls_scores (Tensor): Classification score of all points + in single image, has shape (num_points, num_classes). + mask_preds (Tensor): Mask prediction of all points in + single image, has shape (num_points, feat_h, feat_w). + img_meta (dict): Meta information of corresponding image. + cfg (dict, optional): Config used in test phase. + Defaults to None. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). 
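The "maskness" re-scoring used inside this method can be isolated as a short sketch (dummy tensors; in the real code, masks whose area does not exceed the stride of their level are dropped first, which also guards the division):

import torch

# Maskness: the mean soft-mask value inside the binarised mask, used to
# down-weight classification scores whose masks are mostly low confidence.
mask_thr = 0.5
mask_preds = torch.rand(3, 100, 168)        # 3 surviving soft masks
masks = mask_preds > mask_thr
sum_masks = masks.sum((1, 2)).float()
maskness = (mask_preds * masks).sum((1, 2)) / sum_masks
cls_scores = torch.rand(3) * maskness       # score actually fed to matrix NMS
print(maskness)                              # values in (0.5, 1.0]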
+ """ + + def empty_results(cls_scores, ori_shape): + """Generate a empty results.""" + results = InstanceData() + results.scores = cls_scores.new_ones(0) + results.masks = cls_scores.new_zeros(0, *ori_shape) + results.labels = cls_scores.new_ones(0) + results.bboxes = cls_scores.new_zeros(0, 4) + return results + + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(mask_preds) + + featmap_size = mask_preds.size()[-2:] + + h, w = img_meta['img_shape'][:2] + upsampled_size = (featmap_size[0] * 4, featmap_size[1] * 4) + + score_mask = (cls_scores > cfg.score_thr) + cls_scores = cls_scores[score_mask] + if len(cls_scores) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + + inds = score_mask.nonzero() + cls_labels = inds[:, 1] + + # Filter the mask mask with an area is smaller than + # stride of corresponding feature level + lvl_interval = cls_labels.new_tensor(self.num_grids).pow(2).cumsum(0) + strides = cls_scores.new_ones(lvl_interval[-1]) + strides[:lvl_interval[0]] *= self.strides[0] + for lvl in range(1, self.num_levels): + strides[lvl_interval[lvl - + 1]:lvl_interval[lvl]] *= self.strides[lvl] + strides = strides[inds[:, 0]] + mask_preds = mask_preds[inds[:, 0]] + + masks = mask_preds > cfg.mask_thr + sum_masks = masks.sum((1, 2)).float() + keep = sum_masks > strides + if keep.sum() == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + masks = masks[keep] + mask_preds = mask_preds[keep] + sum_masks = sum_masks[keep] + cls_scores = cls_scores[keep] + cls_labels = cls_labels[keep] + + # maskness. + mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks + cls_scores *= mask_scores + + scores, labels, _, keep_inds = mask_matrix_nms( + masks, + cls_labels, + cls_scores, + mask_area=sum_masks, + nms_pre=cfg.nms_pre, + max_num=cfg.max_per_img, + kernel=cfg.kernel, + sigma=cfg.sigma, + filter_thr=cfg.filter_thr) + # mask_matrix_nms may return an empty Tensor + if len(keep_inds) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + mask_preds = mask_preds[keep_inds] + mask_preds = F.interpolate( + mask_preds.unsqueeze(0), size=upsampled_size, + mode='bilinear')[:, :, :h, :w] + mask_preds = F.interpolate( + mask_preds, size=img_meta['ori_shape'][:2], + mode='bilinear').squeeze(0) + masks = mask_preds > cfg.mask_thr + + results = InstanceData() + results.masks = masks + results.labels = labels + results.scores = scores + # create an empty bbox in InstanceData to avoid bugs when + # calculating metrics. + results.bboxes = results.scores.new_zeros(len(scores), 4) + return results + + +@MODELS.register_module() +class DecoupledSOLOHead(SOLOHead): + """Decoupled SOLO mask head used in `SOLO: Segmenting Objects by Locations. + + `_ + + Args: + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ + + def __init__(self, + *args, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list_x')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list_y')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_cls')) + ], + **kwargs) -> None: + super().__init__(*args, init_cfg=init_cfg, **kwargs) + + def _init_layers(self) -> None: + self.mask_convs_x = nn.ModuleList() + self.mask_convs_y = nn.ModuleList() + self.cls_convs = nn.ModuleList() + + for i in range(self.stacked_convs): + chn = self.in_channels + 1 if i == 0 else self.feat_channels + self.mask_convs_x.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + self.mask_convs_y.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + + self.conv_mask_list_x = nn.ModuleList() + self.conv_mask_list_y = nn.ModuleList() + for num_grid in self.num_grids: + self.conv_mask_list_x.append( + nn.Conv2d(self.feat_channels, num_grid, 3, padding=1)) + self.conv_mask_list_y.append( + nn.Conv2d(self.feat_channels, num_grid, 3, padding=1)) + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and mask prediction. + + - mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction + from x branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + - mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction + from y branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + - mlvl_cls_preds (list[Tensor]): Multi-level scores. + Each element in the list has shape + (batch_size, num_classes, num_grids ,num_grids). 
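A small sketch of why the decoupled formulation is cheaper and how a cell's mask is recovered from the two branches (dummy spatial sizes; the grid cell indices are arbitrary):

import torch

# Output budget at the largest default grid: the vanilla head needs G**2
# mask channels per level, the decoupled head needs G per branch.
G = 40
print(G ** 2, 2 * G)            # 1600 vs. 80 output channels

# Recovering the mask of grid cell (row i, column j): element-wise product
# of the j-th x-branch map and the i-th y-branch map (both sigmoided at
# test time, as in `_predict_by_feat_single` below).
h, w = 100, 168
mask_preds_x = torch.rand(G, h, w)
mask_preds_y = torch.rand(G, h, w)
i, j = 12, 31
cell_mask = mask_preds_x[j] * mask_preds_y[i]
print(cell_mask.shape)          # torch.Size([100, 168])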
+ """ + assert len(x) == self.num_levels + feats = self.resize_feats(x) + mask_preds_x = [] + mask_preds_y = [] + cls_preds = [] + for i in range(self.num_levels): + x = feats[i] + mask_feat = x + cls_feat = x + # generate and concat the coordinate + coord_feat = generate_coordinate(mask_feat.size(), + mask_feat.device) + mask_feat_x = torch.cat([mask_feat, coord_feat[:, 0:1, ...]], 1) + mask_feat_y = torch.cat([mask_feat, coord_feat[:, 1:2, ...]], 1) + + for mask_layer_x, mask_layer_y in \ + zip(self.mask_convs_x, self.mask_convs_y): + mask_feat_x = mask_layer_x(mask_feat_x) + mask_feat_y = mask_layer_y(mask_feat_y) + + mask_feat_x = F.interpolate( + mask_feat_x, scale_factor=2, mode='bilinear') + mask_feat_y = F.interpolate( + mask_feat_y, scale_factor=2, mode='bilinear') + + mask_pred_x = self.conv_mask_list_x[i](mask_feat_x) + mask_pred_y = self.conv_mask_list_y[i](mask_feat_y) + + # cls branch + for j, cls_layer in enumerate(self.cls_convs): + if j == self.cls_down_index: + num_grid = self.num_grids[i] + cls_feat = F.interpolate( + cls_feat, size=num_grid, mode='bilinear') + cls_feat = cls_layer(cls_feat) + + cls_pred = self.conv_cls(cls_feat) + + if not self.training: + feat_wh = feats[0].size()[-2:] + upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2) + mask_pred_x = F.interpolate( + mask_pred_x.sigmoid(), + size=upsampled_size, + mode='bilinear') + mask_pred_y = F.interpolate( + mask_pred_y.sigmoid(), + size=upsampled_size, + mode='bilinear') + cls_pred = cls_pred.sigmoid() + # get local maximum + local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1) + keep_mask = local_max[:, :, :-1, :-1] == cls_pred + cls_pred = cls_pred * keep_mask + + mask_preds_x.append(mask_pred_x) + mask_preds_y.append(mask_pred_y) + cls_preds.append(cls_pred) + return mask_preds_x, mask_preds_y, cls_preds + + def loss_by_feat(self, mlvl_mask_preds_x: List[Tensor], + mlvl_mask_preds_y: List[Tensor], + mlvl_cls_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction + from x branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction + from y branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes, num_grids ,num_grids). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
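For reference, a generic dice term for one positive sample looks roughly as follows (a simplified stand-in, not necessarily identical to the registered `DiceLoss`); the prediction fed to it is `pred_x.sigmoid() * pred_y.sigmoid()`, as in the body below:

import torch

def dice_loss(pred, target, eps=1e-3):
    """1 - Dice coefficient between a soft mask and a binary target
    (simplified; the registered DiceLoss may differ in details)."""
    pred = pred.flatten()
    target = target.flatten().float()
    inter = (pred * target).sum()
    denom = (pred * pred).sum() + (target * target).sum() + eps
    return 1 - 2 * inter / denom

pred = torch.rand(100, 168)                      # sigmoid(x) * sigmoid(y)
target = (torch.rand(100, 168) > 0.7).float()    # binary mask target
print(dice_loss(pred, target))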
+ """ + num_levels = self.num_levels + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in mlvl_mask_preds_x] + + pos_mask_targets, labels, xy_pos_indexes = multi_apply( + self._get_targets_single, + batch_gt_instances, + featmap_sizes=featmap_sizes) + + # change from the outside list meaning multi images + # to the outside list meaning multi levels + mlvl_pos_mask_targets = [[] for _ in range(num_levels)] + mlvl_pos_mask_preds_x = [[] for _ in range(num_levels)] + mlvl_pos_mask_preds_y = [[] for _ in range(num_levels)] + mlvl_labels = [[] for _ in range(num_levels)] + for img_id in range(num_imgs): + + for lvl in range(num_levels): + mlvl_pos_mask_targets[lvl].append( + pos_mask_targets[img_id][lvl]) + mlvl_pos_mask_preds_x[lvl].append( + mlvl_mask_preds_x[lvl][img_id, + xy_pos_indexes[img_id][lvl][:, 1]]) + mlvl_pos_mask_preds_y[lvl].append( + mlvl_mask_preds_y[lvl][img_id, + xy_pos_indexes[img_id][lvl][:, 0]]) + mlvl_labels[lvl].append(labels[img_id][lvl].flatten()) + + # cat multiple image + temp_mlvl_cls_preds = [] + for lvl in range(num_levels): + mlvl_pos_mask_targets[lvl] = torch.cat( + mlvl_pos_mask_targets[lvl], dim=0) + mlvl_pos_mask_preds_x[lvl] = torch.cat( + mlvl_pos_mask_preds_x[lvl], dim=0) + mlvl_pos_mask_preds_y[lvl] = torch.cat( + mlvl_pos_mask_preds_y[lvl], dim=0) + mlvl_labels[lvl] = torch.cat(mlvl_labels[lvl], dim=0) + temp_mlvl_cls_preds.append(mlvl_cls_preds[lvl].permute( + 0, 2, 3, 1).reshape(-1, self.cls_out_channels)) + + num_pos = 0. + # dice loss + loss_mask = [] + for pred_x, pred_y, target in \ + zip(mlvl_pos_mask_preds_x, + mlvl_pos_mask_preds_y, mlvl_pos_mask_targets): + num_masks = pred_x.size(0) + if num_masks == 0: + # make sure can get grad + loss_mask.append((pred_x.sum() + pred_y.sum()).unsqueeze(0)) + continue + num_pos += num_masks + pred_mask = pred_y.sigmoid() * pred_x.sigmoid() + loss_mask.append( + self.loss_mask(pred_mask, target, reduction_override='none')) + if num_pos > 0: + loss_mask = torch.cat(loss_mask).sum() / num_pos + else: + loss_mask = torch.cat(loss_mask).mean() + + # cate + flatten_labels = torch.cat(mlvl_labels) + flatten_cls_preds = torch.cat(temp_mlvl_cls_preds) + + loss_cls = self.loss_cls( + flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1) + return dict(loss_mask=loss_mask, loss_cls=loss_cls) + + def _get_targets_single(self, + gt_instances: InstanceData, + featmap_sizes: Optional[list] = None) -> tuple: + """Compute targets for predictions of single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + featmap_sizes (list[:obj:`torch.size`]): Size of each + feature map from feature pyramid, each element + means (feat_h, feat_w). Defaults to None. + + Returns: + Tuple: Usually returns a tuple containing targets for predictions. + + - mlvl_pos_mask_targets (list[Tensor]): Each element represent + the binary mask targets for positive points in this + level, has shape (num_pos, out_h, out_w). + - mlvl_labels (list[Tensor]): Each element is + classification labels for all + points in this level, has shape + (num_grid, num_grid). + - mlvl_xy_pos_indexes (list[Tensor]): Each element + in the list contains the index of positive samples in + corresponding level, has shape (num_pos, 2), last + dimension 2 present (index_x, index_y). 
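A tiny sketch of how the positive indexes fall out of the label grid in the implementation below; `nonzero()` returns coordinates in (row, column) order, which is why the loss indexes the x branch with column 1 and the y branch with column 0 (dummy numbers):

import torch

num_classes, num_grid = 80, 12
labels = torch.full((num_grid, num_grid), num_classes, dtype=torch.long)
labels[3, 7] = 15                               # one positive cell
xy_pos_indexes = (labels - num_classes).nonzero()
print(xy_pos_indexes)                           # tensor([[3, 7]]) -> y=3, x=7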
+ """ + mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks = \ + super()._get_targets_single(gt_instances, + featmap_sizes=featmap_sizes) + + mlvl_xy_pos_indexes = [(item - self.num_classes).nonzero() + for item in mlvl_labels] + + return mlvl_pos_mask_targets, mlvl_labels, mlvl_xy_pos_indexes + + def predict_by_feat(self, mlvl_mask_preds_x: List[Tensor], + mlvl_mask_preds_y: List[Tensor], + mlvl_cls_scores: List[Tensor], + batch_img_metas: List[dict], **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction + from x branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction + from y branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes ,num_grids ,num_grids). + batch_img_metas (list[dict]): Meta information of all images. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images.Each :obj:`InstanceData` usually contains + following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + mlvl_cls_scores = [ + item.permute(0, 2, 3, 1) for item in mlvl_cls_scores + ] + assert len(mlvl_mask_preds_x) == len(mlvl_cls_scores) + num_levels = len(mlvl_cls_scores) + + results_list = [] + for img_id in range(len(batch_img_metas)): + cls_pred_list = [ + mlvl_cls_scores[i][img_id].view( + -1, self.cls_out_channels).detach() + for i in range(num_levels) + ] + mask_pred_list_x = [ + mlvl_mask_preds_x[i][img_id] for i in range(num_levels) + ] + mask_pred_list_y = [ + mlvl_mask_preds_y[i][img_id] for i in range(num_levels) + ] + + cls_pred_list = torch.cat(cls_pred_list, dim=0) + mask_pred_list_x = torch.cat(mask_pred_list_x, dim=0) + mask_pred_list_y = torch.cat(mask_pred_list_y, dim=0) + img_meta = batch_img_metas[img_id] + + results = self._predict_by_feat_single( + cls_pred_list, + mask_pred_list_x, + mask_pred_list_y, + img_meta=img_meta) + results_list.append(results) + return results_list + + def _predict_by_feat_single(self, + cls_scores: Tensor, + mask_preds_x: Tensor, + mask_preds_y: Tensor, + img_meta: dict, + cfg: OptConfigType = None) -> InstanceData: + """Transform a single image's features extracted from the head into + mask results. + + Args: + cls_scores (Tensor): Classification score of all points + in single image, has shape (num_points, num_classes). + mask_preds_x (Tensor): Mask prediction of x branch of + all points in single image, has shape + (sum_num_grids, feat_h, feat_w). + mask_preds_y (Tensor): Mask prediction of y branch of + all points in single image, has shape + (sum_num_grids, feat_h, feat_w). + img_meta (dict): Meta information of corresponding image. + cfg (dict): Config used in test phase. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). 
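The index bookkeeping in this method can be illustrated for a single surviving point (default grid sizes; the flattened index is made up):

import torch

num_grids = [40, 36, 24, 16, 12]
lvl_interval = torch.tensor(num_grids).pow(2).cumsum(0)  # [1600, 2896, 3472, ...]
seg_size = torch.tensor(num_grids).cumsum(0)             # [40, 76, 100, 116, 128]

flat_ind = 1700                                           # some surviving point
lvl = int((flat_ind >= lvl_interval).sum())               # level 1 (36x36 grid)
lvl_start = 0 if lvl == 0 else int(lvl_interval[lvl - 1]) # 1600
grid = num_grids[lvl]
row, col = divmod(flat_ind - lvl_start, grid)             # (2, 28) in the grid
mask_start = 0 if lvl == 0 else int(seg_size[lvl - 1])    # 40 maps come before
y_ind, x_ind = mask_start + row, mask_start + col         # 42, 68
print(lvl, (row, col), (y_ind, x_ind))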
+ """ + + def empty_results(cls_scores, ori_shape): + """Generate a empty results.""" + results = InstanceData() + results.scores = cls_scores.new_ones(0) + results.masks = cls_scores.new_zeros(0, *ori_shape) + results.labels = cls_scores.new_ones(0) + results.bboxes = cls_scores.new_zeros(0, 4) + return results + + cfg = self.test_cfg if cfg is None else cfg + + featmap_size = mask_preds_x.size()[-2:] + + h, w = img_meta['img_shape'][:2] + upsampled_size = (featmap_size[0] * 4, featmap_size[1] * 4) + + score_mask = (cls_scores > cfg.score_thr) + cls_scores = cls_scores[score_mask] + inds = score_mask.nonzero() + lvl_interval = inds.new_tensor(self.num_grids).pow(2).cumsum(0) + num_all_points = lvl_interval[-1] + lvl_start_index = inds.new_ones(num_all_points) + num_grids = inds.new_ones(num_all_points) + seg_size = inds.new_tensor(self.num_grids).cumsum(0) + mask_lvl_start_index = inds.new_ones(num_all_points) + strides = inds.new_ones(num_all_points) + + lvl_start_index[:lvl_interval[0]] *= 0 + mask_lvl_start_index[:lvl_interval[0]] *= 0 + num_grids[:lvl_interval[0]] *= self.num_grids[0] + strides[:lvl_interval[0]] *= self.strides[0] + + for lvl in range(1, self.num_levels): + lvl_start_index[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \ + lvl_interval[lvl - 1] + mask_lvl_start_index[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \ + seg_size[lvl - 1] + num_grids[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \ + self.num_grids[lvl] + strides[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \ + self.strides[lvl] + + lvl_start_index = lvl_start_index[inds[:, 0]] + mask_lvl_start_index = mask_lvl_start_index[inds[:, 0]] + num_grids = num_grids[inds[:, 0]] + strides = strides[inds[:, 0]] + + y_lvl_offset = (inds[:, 0] - lvl_start_index) // num_grids + x_lvl_offset = (inds[:, 0] - lvl_start_index) % num_grids + y_inds = mask_lvl_start_index + y_lvl_offset + x_inds = mask_lvl_start_index + x_lvl_offset + + cls_labels = inds[:, 1] + mask_preds = mask_preds_x[x_inds, ...] * mask_preds_y[y_inds, ...] + + masks = mask_preds > cfg.mask_thr + sum_masks = masks.sum((1, 2)).float() + keep = sum_masks > strides + if keep.sum() == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + + masks = masks[keep] + mask_preds = mask_preds[keep] + sum_masks = sum_masks[keep] + cls_scores = cls_scores[keep] + cls_labels = cls_labels[keep] + + # maskness. + mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks + cls_scores *= mask_scores + + scores, labels, _, keep_inds = mask_matrix_nms( + masks, + cls_labels, + cls_scores, + mask_area=sum_masks, + nms_pre=cfg.nms_pre, + max_num=cfg.max_per_img, + kernel=cfg.kernel, + sigma=cfg.sigma, + filter_thr=cfg.filter_thr) + # mask_matrix_nms may return an empty Tensor + if len(keep_inds) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + mask_preds = mask_preds[keep_inds] + mask_preds = F.interpolate( + mask_preds.unsqueeze(0), size=upsampled_size, + mode='bilinear')[:, :, :h, :w] + mask_preds = F.interpolate( + mask_preds, size=img_meta['ori_shape'][:2], + mode='bilinear').squeeze(0) + masks = mask_preds > cfg.mask_thr + + results = InstanceData() + results.masks = masks + results.labels = labels + results.scores = scores + # create an empty bbox in InstanceData to avoid bugs when + # calculating metrics. 
+ results.bboxes = results.scores.new_zeros(len(scores), 4) + + return results + + +@MODELS.register_module() +class DecoupledSOLOLightHead(DecoupledSOLOHead): + """Decoupled Light SOLO mask head used in `SOLO: Segmenting Objects by + Locations `_ + + Args: + with_dcn (bool): Whether use dcn in mask_convs and cls_convs, + Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + *args, + dcn_cfg: OptConfigType = None, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list_x')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list_y')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_cls')) + ], + **kwargs) -> None: + assert dcn_cfg is None or isinstance(dcn_cfg, dict) + self.dcn_cfg = dcn_cfg + super().__init__(*args, init_cfg=init_cfg, **kwargs) + + def _init_layers(self) -> None: + self.mask_convs = nn.ModuleList() + self.cls_convs = nn.ModuleList() + + for i in range(self.stacked_convs): + if self.dcn_cfg is not None \ + and i == self.stacked_convs - 1: + conv_cfg = self.dcn_cfg + else: + conv_cfg = None + + chn = self.in_channels + 2 if i == 0 else self.feat_channels + self.mask_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg)) + + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg)) + + self.conv_mask_list_x = nn.ModuleList() + self.conv_mask_list_y = nn.ModuleList() + for num_grid in self.num_grids: + self.conv_mask_list_x.append( + nn.Conv2d(self.feat_channels, num_grid, 3, padding=1)) + self.conv_mask_list_y.append( + nn.Conv2d(self.feat_channels, num_grid, 3, padding=1)) + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and mask prediction. + + - mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction + from x branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + - mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction + from y branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + - mlvl_cls_preds (list[Tensor]): Multi-level scores. + Each element in the list has shape + (batch_size, num_classes, num_grids ,num_grids). 
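A hypothetical config snippet showing how this light head is typically assembled in the usual MMDetection dict style; all concrete values below are placeholders rather than settings taken from this patch or its configs:

# Hypothetical mask head config (placeholder values).
mask_head = dict(
    type='DecoupledSOLOLightHead',
    num_classes=80,
    in_channels=256,
    stacked_convs=4,
    feat_channels=256,
    strides=[8, 8, 16, 32, 32],
    scale_ranges=((1, 64), (32, 128), (64, 256), (128, 512), (256, 2048)),
    pos_scale=0.2,
    num_grids=[40, 36, 24, 16, 12],
    cls_down_index=0,
    dcn_cfg=dict(type='DCNv2'),   # used only on the last stacked conv here
    loss_mask=dict(type='DiceLoss', use_sigmoid=True, loss_weight=3.0),
    loss_cls=dict(
        type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25,
        loss_weight=1.0),
    norm_cfg=dict(type='GN', num_groups=32, requires_grad=True))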
+ """ + assert len(x) == self.num_levels + feats = self.resize_feats(x) + mask_preds_x = [] + mask_preds_y = [] + cls_preds = [] + for i in range(self.num_levels): + x = feats[i] + mask_feat = x + cls_feat = x + # generate and concat the coordinate + coord_feat = generate_coordinate(mask_feat.size(), + mask_feat.device) + mask_feat = torch.cat([mask_feat, coord_feat], 1) + + for mask_layer in self.mask_convs: + mask_feat = mask_layer(mask_feat) + + mask_feat = F.interpolate( + mask_feat, scale_factor=2, mode='bilinear') + + mask_pred_x = self.conv_mask_list_x[i](mask_feat) + mask_pred_y = self.conv_mask_list_y[i](mask_feat) + + # cls branch + for j, cls_layer in enumerate(self.cls_convs): + if j == self.cls_down_index: + num_grid = self.num_grids[i] + cls_feat = F.interpolate( + cls_feat, size=num_grid, mode='bilinear') + cls_feat = cls_layer(cls_feat) + + cls_pred = self.conv_cls(cls_feat) + + if not self.training: + feat_wh = feats[0].size()[-2:] + upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2) + mask_pred_x = F.interpolate( + mask_pred_x.sigmoid(), + size=upsampled_size, + mode='bilinear') + mask_pred_y = F.interpolate( + mask_pred_y.sigmoid(), + size=upsampled_size, + mode='bilinear') + cls_pred = cls_pred.sigmoid() + # get local maximum + local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1) + keep_mask = local_max[:, :, :-1, :-1] == cls_pred + cls_pred = cls_pred * keep_mask + + mask_preds_x.append(mask_pred_x) + mask_preds_y.append(mask_pred_y) + cls_preds.append(cls_pred) + return mask_preds_x, mask_preds_y, cls_preds diff --git a/mmdetection/mmdet/models/dense_heads/solov2_head.py b/mmdetection/mmdet/models/dense_heads/solov2_head.py new file mode 100644 index 0000000..35b9df0 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/solov2_head.py @@ -0,0 +1,799 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import List, Optional, Tuple + +import mmcv +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.utils.misc import floordiv +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType +from ..layers import mask_matrix_nms +from ..utils import center_of_mass, generate_coordinate, multi_apply +from .solo_head import SOLOHead + + +class MaskFeatModule(BaseModule): + """SOLOv2 mask feature map branch used in `SOLOv2: Dynamic and Fast + Instance Segmentation. `_ + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + start_level (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + end_level (int): The ending feature map level from rpn that + will be used to predict the mask feature map. + out_channels (int): Number of output channels of the mask feature + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. + mask_stride (int): Downsample factor of the mask feature map output. + Defaults to 4. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. 
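A rough shape sketch for this branch, assuming a 256-channel FPN whose levels sit at strides 4 to 64 and assuming start_level=0, end_level=3, mask_stride=4 (all values chosen for illustration):

import torch

# Every used level is convolved and upsampled back to the resolution of the
# first used level, summed, and projected to `out_channels`; the fused map
# therefore lives at the stride of `start_level` (4 here).
fpn_strides = [4, 8, 16, 32, 64]
img_h, img_w = 800, 1216
feats = [torch.randn(1, 256, img_h // s, img_w // s) for s in fpn_strides]
start_level, end_level = 0, 3
fused_h, fused_w = feats[start_level].shape[-2:]
print(fused_h, fused_w)   # 200 304 -> spatial size of the unified mask feature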
+ """ + + def __init__( + self, + in_channels: int, + feat_channels: int, + start_level: int, + end_level: int, + out_channels: int, + mask_stride: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.start_level = start_level + self.end_level = end_level + self.mask_stride = mask_stride + assert start_level >= 0 and end_level >= start_level + self.out_channels = out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self._init_layers() + self.fp16_enabled = False + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.convs_all_levels = nn.ModuleList() + for i in range(self.start_level, self.end_level + 1): + convs_per_level = nn.Sequential() + if i == 0: + convs_per_level.add_module( + f'conv{i}', + ConvModule( + self.in_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False)) + self.convs_all_levels.append(convs_per_level) + continue + + for j in range(i): + if j == 0: + if i == self.end_level: + chn = self.in_channels + 2 + else: + chn = self.in_channels + convs_per_level.add_module( + f'conv{j}', + ConvModule( + chn, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False)) + convs_per_level.add_module( + f'upsample{j}', + nn.Upsample( + scale_factor=2, + mode='bilinear', + align_corners=False)) + continue + + convs_per_level.add_module( + f'conv{j}', + ConvModule( + self.feat_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False)) + convs_per_level.add_module( + f'upsample{j}', + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + + self.convs_all_levels.append(convs_per_level) + + self.conv_pred = ConvModule( + self.feat_channels, + self.out_channels, + 1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + def forward(self, x: Tuple[Tensor]) -> Tensor: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + Tensor: The predicted mask feature map. + """ + inputs = x[self.start_level:self.end_level + 1] + assert len(inputs) == (self.end_level - self.start_level + 1) + feature_add_all_level = self.convs_all_levels[0](inputs[0]) + for i in range(1, len(inputs)): + input_p = inputs[i] + if i == len(inputs) - 1: + coord_feat = generate_coordinate(input_p.size(), + input_p.device) + input_p = torch.cat([input_p, coord_feat], 1) + + feature_add_all_level = feature_add_all_level + \ + self.convs_all_levels[i](input_p) + + feature_pred = self.conv_pred(feature_add_all_level) + return feature_pred + + +@MODELS.register_module() +class SOLOV2Head(SOLOHead): + """SOLOv2 mask head used in `SOLOv2: Dynamic and Fast Instance + Segmentation. `_ + + Args: + mask_feature_head (dict): Config of SOLOv2MaskFeatHead. + dynamic_conv_size (int): Dynamic Conv kernel size. Defaults to 1. + dcn_cfg (dict): Dcn conv configurations in kernel_convs and cls_conv. + Defaults to None. + dcn_apply_to_all_conv (bool): Whether to use dcn in every layer of + kernel_convs and cls_convs, or only the last layer. It shall be set + `True` for the normal version of SOLOv2 and `False` for the + light-weight version. Defaults to True. 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + *args, + mask_feature_head: ConfigType, + dynamic_conv_size: int = 1, + dcn_cfg: OptConfigType = None, + dcn_apply_to_all_conv: bool = True, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_cls')) + ], + **kwargs) -> None: + assert dcn_cfg is None or isinstance(dcn_cfg, dict) + self.dcn_cfg = dcn_cfg + self.with_dcn = dcn_cfg is not None + self.dcn_apply_to_all_conv = dcn_apply_to_all_conv + self.dynamic_conv_size = dynamic_conv_size + mask_out_channels = mask_feature_head.get('out_channels') + self.kernel_out_channels = \ + mask_out_channels * self.dynamic_conv_size * self.dynamic_conv_size + + super().__init__(*args, init_cfg=init_cfg, **kwargs) + + # update the in_channels of mask_feature_head + if mask_feature_head.get('in_channels', None) is not None: + if mask_feature_head.in_channels != self.in_channels: + warnings.warn('The `in_channels` of SOLOv2MaskFeatHead and ' + 'SOLOv2Head should be same, changing ' + 'mask_feature_head.in_channels to ' + f'{self.in_channels}') + mask_feature_head.update(in_channels=self.in_channels) + else: + mask_feature_head.update(in_channels=self.in_channels) + + self.mask_feature_head = MaskFeatModule(**mask_feature_head) + self.mask_stride = self.mask_feature_head.mask_stride + self.fp16_enabled = False + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.kernel_convs = nn.ModuleList() + conv_cfg = None + for i in range(self.stacked_convs): + if self.with_dcn: + if self.dcn_apply_to_all_conv: + conv_cfg = self.dcn_cfg + elif i == self.stacked_convs - 1: + # light head + conv_cfg = self.dcn_cfg + + chn = self.in_channels + 2 if i == 0 else self.feat_channels + self.kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + self.conv_kernel = nn.Conv2d( + self.feat_channels, self.kernel_out_channels, 3, padding=1) + + def forward(self, x): + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores, mask prediction, + and mask features. + + - mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel + prediction. The kernel is used to generate instance + segmentation masks by dynamic convolution. Each element in + the list has shape + (batch_size, kernel_out_channels, num_grids, num_grids). + - mlvl_cls_preds (list[Tensor]): Multi-level scores. Each + element in the list has shape + (batch_size, num_classes, num_grids, num_grids). + - mask_feats (Tensor): Unified mask feature map used to + generate instance segmentation masks by dynamic convolution. + Has shape (batch_size, mask_out_channels, h, w). 
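The dynamic convolution that turns predicted kernels into instance masks can be sketched in a few lines (dummy sizes; `dynamic_conv_size=1` is the documented default, while `mask_out_channels=256` is an assumed config value):

import torch
import torch.nn.functional as F

# Each selected grid cell contributes one predicted kernel; reshaping the
# kernels and running a single conv2d over the shared mask feature yields
# one soft mask per instance at the mask feature's resolution.
dynamic_conv_size = 1
mask_out_channels = 256
num_inst = 5

mask_feats = torch.randn(1, mask_out_channels, 200, 304)
kernel_preds = torch.randn(num_inst,
                           mask_out_channels * dynamic_conv_size ** 2)
kernels = kernel_preds.view(num_inst, mask_out_channels,
                            dynamic_conv_size, dynamic_conv_size)
mask_preds = F.conv2d(mask_feats, kernels, stride=1).squeeze(0).sigmoid()
print(mask_preds.shape)   # torch.Size([5, 200, 304])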
+ """ + assert len(x) == self.num_levels + mask_feats = self.mask_feature_head(x) + ins_kernel_feats = self.resize_feats(x) + mlvl_kernel_preds = [] + mlvl_cls_preds = [] + for i in range(self.num_levels): + ins_kernel_feat = ins_kernel_feats[i] + # ins branch + # concat coord + coord_feat = generate_coordinate(ins_kernel_feat.size(), + ins_kernel_feat.device) + ins_kernel_feat = torch.cat([ins_kernel_feat, coord_feat], 1) + + # kernel branch + kernel_feat = ins_kernel_feat + kernel_feat = F.interpolate( + kernel_feat, + size=self.num_grids[i], + mode='bilinear', + align_corners=False) + + cate_feat = kernel_feat[:, :-2, :, :] + + kernel_feat = kernel_feat.contiguous() + for i, kernel_conv in enumerate(self.kernel_convs): + kernel_feat = kernel_conv(kernel_feat) + kernel_pred = self.conv_kernel(kernel_feat) + + # cate branch + cate_feat = cate_feat.contiguous() + for i, cls_conv in enumerate(self.cls_convs): + cate_feat = cls_conv(cate_feat) + cate_pred = self.conv_cls(cate_feat) + + mlvl_kernel_preds.append(kernel_pred) + mlvl_cls_preds.append(cate_pred) + + return mlvl_kernel_preds, mlvl_cls_preds, mask_feats + + def _get_targets_single(self, + gt_instances: InstanceData, + featmap_sizes: Optional[list] = None) -> tuple: + """Compute targets for predictions of single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + featmap_sizes (list[:obj:`torch.size`]): Size of each + feature map from feature pyramid, each element + means (feat_h, feat_w). Defaults to None. + + Returns: + Tuple: Usually returns a tuple containing targets for predictions. + + - mlvl_pos_mask_targets (list[Tensor]): Each element represent + the binary mask targets for positive points in this + level, has shape (num_pos, out_h, out_w). + - mlvl_labels (list[Tensor]): Each element is + classification labels for all + points in this level, has shape + (num_grid, num_grid). + - mlvl_pos_masks (list[Tensor]): Each element is + a `BoolTensor` to represent whether the + corresponding point in single level + is positive, has shape (num_grid **2). + - mlvl_pos_indexes (list[list]): Each element + in the list contains the positive index in + corresponding level, has shape (num_pos). + """ + gt_labels = gt_instances.labels + device = gt_labels.device + + gt_bboxes = gt_instances.bboxes + gt_areas = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device) + + mlvl_pos_mask_targets = [] + mlvl_pos_indexes = [] + mlvl_labels = [] + mlvl_pos_masks = [] + for (lower_bound, upper_bound), num_grid \ + in zip(self.scale_ranges, self.num_grids): + mask_target = [] + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_index = [] + labels = torch.zeros([num_grid, num_grid], + dtype=torch.int64, + device=device) + self.num_classes + pos_mask = torch.zeros([num_grid**2], + dtype=torch.bool, + device=device) + + gt_inds = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero().flatten() + if len(gt_inds) == 0: + mlvl_pos_mask_targets.append( + torch.zeros([0, featmap_sizes[0], featmap_sizes[1]], + dtype=torch.uint8, + device=device)) + mlvl_labels.append(labels) + mlvl_pos_masks.append(pos_mask) + mlvl_pos_indexes.append([]) + continue + hit_gt_bboxes = gt_bboxes[gt_inds] + hit_gt_labels = gt_labels[gt_inds] + hit_gt_masks = gt_masks[gt_inds, ...] 
+ + pos_w_ranges = 0.5 * (hit_gt_bboxes[:, 2] - + hit_gt_bboxes[:, 0]) * self.pos_scale + pos_h_ranges = 0.5 * (hit_gt_bboxes[:, 3] - + hit_gt_bboxes[:, 1]) * self.pos_scale + + # Make sure hit_gt_masks has a value + valid_mask_flags = hit_gt_masks.sum(dim=-1).sum(dim=-1) > 0 + + for gt_mask, gt_label, pos_h_range, pos_w_range, \ + valid_mask_flag in \ + zip(hit_gt_masks, hit_gt_labels, pos_h_ranges, + pos_w_ranges, valid_mask_flags): + if not valid_mask_flag: + continue + upsampled_size = (featmap_sizes[0] * self.mask_stride, + featmap_sizes[1] * self.mask_stride) + center_h, center_w = center_of_mass(gt_mask) + + coord_w = int( + floordiv((center_w / upsampled_size[1]), (1. / num_grid), + rounding_mode='trunc')) + coord_h = int( + floordiv((center_h / upsampled_size[0]), (1. / num_grid), + rounding_mode='trunc')) + + # left, top, right, down + top_box = max( + 0, + int( + floordiv( + (center_h - pos_h_range) / upsampled_size[0], + (1. / num_grid), + rounding_mode='trunc'))) + down_box = min( + num_grid - 1, + int( + floordiv( + (center_h + pos_h_range) / upsampled_size[0], + (1. / num_grid), + rounding_mode='trunc'))) + left_box = max( + 0, + int( + floordiv( + (center_w - pos_w_range) / upsampled_size[1], + (1. / num_grid), + rounding_mode='trunc'))) + right_box = min( + num_grid - 1, + int( + floordiv( + (center_w + pos_w_range) / upsampled_size[1], + (1. / num_grid), + rounding_mode='trunc'))) + + top = max(top_box, coord_h - 1) + down = min(down_box, coord_h + 1) + left = max(coord_w - 1, left_box) + right = min(right_box, coord_w + 1) + + labels[top:(down + 1), left:(right + 1)] = gt_label + # ins + gt_mask = np.uint8(gt_mask.cpu().numpy()) + # Follow the original implementation, F.interpolate is + # different from cv2 and opencv + gt_mask = mmcv.imrescale(gt_mask, scale=1. / self.mask_stride) + gt_mask = torch.from_numpy(gt_mask).to(device=device) + + for i in range(top, down + 1): + for j in range(left, right + 1): + index = int(i * num_grid + j) + this_mask_target = torch.zeros( + [featmap_sizes[0], featmap_sizes[1]], + dtype=torch.uint8, + device=device) + this_mask_target[:gt_mask.shape[0], :gt_mask. + shape[1]] = gt_mask + mask_target.append(this_mask_target) + pos_mask[index] = True + pos_index.append(index) + if len(mask_target) == 0: + mask_target = torch.zeros( + [0, featmap_sizes[0], featmap_sizes[1]], + dtype=torch.uint8, + device=device) + else: + mask_target = torch.stack(mask_target, 0) + mlvl_pos_mask_targets.append(mask_target) + mlvl_labels.append(labels) + mlvl_pos_masks.append(pos_mask) + mlvl_pos_indexes.append(pos_index) + return (mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks, + mlvl_pos_indexes) + + def loss_by_feat(self, mlvl_kernel_preds: List[Tensor], + mlvl_cls_preds: List[Tensor], mask_feats: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel + prediction. The kernel is used to generate instance + segmentation masks by dynamic convolution. Each element in the + list has shape + (batch_size, kernel_out_channels, num_grids, num_grids). + mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes, num_grids, num_grids). + mask_feats (Tensor): Unified mask feature map used to generate + instance segmentation masks by dynamic convolution. Has shape + (batch_size, mask_out_channels, h, w). 
+ batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = mask_feats.size()[-2:] + + pos_mask_targets, labels, pos_masks, pos_indexes = multi_apply( + self._get_targets_single, + batch_gt_instances, + featmap_sizes=featmap_sizes) + + mlvl_mask_targets = [ + torch.cat(lvl_mask_targets, 0) + for lvl_mask_targets in zip(*pos_mask_targets) + ] + + mlvl_pos_kernel_preds = [] + for lvl_kernel_preds, lvl_pos_indexes in zip(mlvl_kernel_preds, + zip(*pos_indexes)): + lvl_pos_kernel_preds = [] + for img_lvl_kernel_preds, img_lvl_pos_indexes in zip( + lvl_kernel_preds, lvl_pos_indexes): + img_lvl_pos_kernel_preds = img_lvl_kernel_preds.view( + img_lvl_kernel_preds.shape[0], -1)[:, img_lvl_pos_indexes] + lvl_pos_kernel_preds.append(img_lvl_pos_kernel_preds) + mlvl_pos_kernel_preds.append(lvl_pos_kernel_preds) + + # make multilevel mlvl_mask_pred + mlvl_mask_preds = [] + for lvl_pos_kernel_preds in mlvl_pos_kernel_preds: + lvl_mask_preds = [] + for img_id, img_lvl_pos_kernel_pred in enumerate( + lvl_pos_kernel_preds): + if img_lvl_pos_kernel_pred.size()[-1] == 0: + continue + img_mask_feats = mask_feats[[img_id]] + h, w = img_mask_feats.shape[-2:] + num_kernel = img_lvl_pos_kernel_pred.shape[1] + img_lvl_mask_pred = F.conv2d( + img_mask_feats, + img_lvl_pos_kernel_pred.permute(1, 0).view( + num_kernel, -1, self.dynamic_conv_size, + self.dynamic_conv_size), + stride=1).view(-1, h, w) + lvl_mask_preds.append(img_lvl_mask_pred) + if len(lvl_mask_preds) == 0: + lvl_mask_preds = None + else: + lvl_mask_preds = torch.cat(lvl_mask_preds, 0) + mlvl_mask_preds.append(lvl_mask_preds) + # dice loss + num_pos = 0 + for img_pos_masks in pos_masks: + for lvl_img_pos_masks in img_pos_masks: + # Fix `Tensor` object has no attribute `count_nonzero()` + # in PyTorch 1.6, the type of `lvl_img_pos_masks` + # should be `torch.bool`. + num_pos += lvl_img_pos_masks.nonzero().numel() + loss_mask = [] + for lvl_mask_preds, lvl_mask_targets in zip(mlvl_mask_preds, + mlvl_mask_targets): + if lvl_mask_preds is None: + continue + loss_mask.append( + self.loss_mask( + lvl_mask_preds, + lvl_mask_targets, + reduction_override='none')) + if num_pos > 0: + loss_mask = torch.cat(loss_mask).sum() / num_pos + else: + loss_mask = mask_feats.sum() * 0 + + # cate + flatten_labels = [ + torch.cat( + [img_lvl_labels.flatten() for img_lvl_labels in lvl_labels]) + for lvl_labels in zip(*labels) + ] + flatten_labels = torch.cat(flatten_labels) + + flatten_cls_preds = [ + lvl_cls_preds.permute(0, 2, 3, 1).reshape(-1, self.num_classes) + for lvl_cls_preds in mlvl_cls_preds + ] + flatten_cls_preds = torch.cat(flatten_cls_preds) + + loss_cls = self.loss_cls( + flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1) + return dict(loss_mask=loss_mask, loss_cls=loss_cls) + + def predict_by_feat(self, mlvl_kernel_preds: List[Tensor], + mlvl_cls_scores: List[Tensor], mask_feats: Tensor, + batch_img_metas: List[dict], **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel + prediction. The kernel is used to generate instance + segmentation masks by dynamic convolution. Each element in the + list has shape + (batch_size, kernel_out_channels, num_grids, num_grids). 
+ mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes, num_grids, num_grids). + mask_feats (Tensor): Unified mask feature map used to generate + instance segmentation masks by dynamic convolution. Has shape + (batch_size, mask_out_channels, h, w). + batch_img_metas (list[dict]): Meta information of all images. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images.Each :obj:`InstanceData` usually contains + following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + num_levels = len(mlvl_cls_scores) + assert len(mlvl_kernel_preds) == len(mlvl_cls_scores) + + for lvl in range(num_levels): + cls_scores = mlvl_cls_scores[lvl] + cls_scores = cls_scores.sigmoid() + local_max = F.max_pool2d(cls_scores, 2, stride=1, padding=1) + keep_mask = local_max[:, :, :-1, :-1] == cls_scores + cls_scores = cls_scores * keep_mask + mlvl_cls_scores[lvl] = cls_scores.permute(0, 2, 3, 1) + + result_list = [] + for img_id in range(len(batch_img_metas)): + img_cls_pred = [ + mlvl_cls_scores[lvl][img_id].view(-1, self.cls_out_channels) + for lvl in range(num_levels) + ] + img_mask_feats = mask_feats[[img_id]] + img_kernel_pred = [ + mlvl_kernel_preds[lvl][img_id].permute(1, 2, 0).view( + -1, self.kernel_out_channels) for lvl in range(num_levels) + ] + img_cls_pred = torch.cat(img_cls_pred, dim=0) + img_kernel_pred = torch.cat(img_kernel_pred, dim=0) + result = self._predict_by_feat_single( + img_kernel_pred, + img_cls_pred, + img_mask_feats, + img_meta=batch_img_metas[img_id]) + result_list.append(result) + return result_list + + def _predict_by_feat_single(self, + kernel_preds: Tensor, + cls_scores: Tensor, + mask_feats: Tensor, + img_meta: dict, + cfg: OptConfigType = None) -> InstanceData: + """Transform a single image's features extracted from the head into + mask results. + + Args: + kernel_preds (Tensor): Dynamic kernel prediction of all points + in single image, has shape + (num_points, kernel_out_channels). + cls_scores (Tensor): Classification score of all points + in single image, has shape (num_points, num_classes). + mask_feats (Tensor): Mask prediction of all points in + single image, has shape (num_points, feat_h, feat_w). + img_meta (dict): Meta information of corresponding image. + cfg (dict, optional): Config used in test phase. + Defaults to None. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + + def empty_results(cls_scores, ori_shape): + """Generate a empty results.""" + results = InstanceData() + results.scores = cls_scores.new_ones(0) + results.masks = cls_scores.new_zeros(0, *ori_shape) + results.labels = cls_scores.new_ones(0) + results.bboxes = cls_scores.new_zeros(0, 4) + return results + + cfg = self.test_cfg if cfg is None else cfg + assert len(kernel_preds) == len(cls_scores) + + featmap_size = mask_feats.size()[-2:] + + # overall info + h, w = img_meta['img_shape'][:2] + upsampled_size = (featmap_size[0] * self.mask_stride, + featmap_size[1] * self.mask_stride) + + # process. 
+ score_mask = (cls_scores > cfg.score_thr) + cls_scores = cls_scores[score_mask] + if len(cls_scores) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + + # cate_labels & kernel_preds + inds = score_mask.nonzero() + cls_labels = inds[:, 1] + kernel_preds = kernel_preds[inds[:, 0]] + + # trans vector. + lvl_interval = cls_labels.new_tensor(self.num_grids).pow(2).cumsum(0) + strides = kernel_preds.new_ones(lvl_interval[-1]) + + strides[:lvl_interval[0]] *= self.strides[0] + for lvl in range(1, self.num_levels): + strides[lvl_interval[lvl - + 1]:lvl_interval[lvl]] *= self.strides[lvl] + strides = strides[inds[:, 0]] + + # mask encoding. + kernel_preds = kernel_preds.view( + kernel_preds.size(0), -1, self.dynamic_conv_size, + self.dynamic_conv_size) + mask_preds = F.conv2d( + mask_feats, kernel_preds, stride=1).squeeze(0).sigmoid() + # mask. + masks = mask_preds > cfg.mask_thr + sum_masks = masks.sum((1, 2)).float() + keep = sum_masks > strides + if keep.sum() == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + masks = masks[keep] + mask_preds = mask_preds[keep] + sum_masks = sum_masks[keep] + cls_scores = cls_scores[keep] + cls_labels = cls_labels[keep] + + # maskness. + mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks + cls_scores *= mask_scores + + scores, labels, _, keep_inds = mask_matrix_nms( + masks, + cls_labels, + cls_scores, + mask_area=sum_masks, + nms_pre=cfg.nms_pre, + max_num=cfg.max_per_img, + kernel=cfg.kernel, + sigma=cfg.sigma, + filter_thr=cfg.filter_thr) + if len(keep_inds) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + mask_preds = mask_preds[keep_inds] + mask_preds = F.interpolate( + mask_preds.unsqueeze(0), + size=upsampled_size, + mode='bilinear', + align_corners=False)[:, :, :h, :w] + mask_preds = F.interpolate( + mask_preds, + size=img_meta['ori_shape'][:2], + mode='bilinear', + align_corners=False).squeeze(0) + masks = mask_preds > cfg.mask_thr + + results = InstanceData() + results.masks = masks + results.labels = labels + results.scores = scores + # create an empty bbox in InstanceData to avoid bugs when + # calculating metrics. + results.bboxes = results.scores.new_zeros(len(scores), 4) + + return results diff --git a/mmdetection/mmdet/models/dense_heads/ssd_head.py b/mmdetection/mmdet/models/dense_heads/ssd_head.py new file mode 100644 index 0000000..950df29 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/ssd_head.py @@ -0,0 +1,362 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList +from ..losses import smooth_l1_loss +from ..task_modules.samplers import PseudoSampler +from ..utils import multi_apply +from .anchor_head import AnchorHead + + +# TODO: add loss evaluator for SSD +@MODELS.register_module() +class SSDHead(AnchorHead): + """Implementation of `SSD head `_ + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Sequence[int]): Number of channels in the input feature + map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Defaults to 0. + feat_channels (int): Number of hidden channels when stacked_convs + > 0. Defaults to 256. 
+ use_depthwise (bool): Whether to use DepthwiseSeparableConv. + Defaults to False. + conv_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config conv layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config norm layer. Defaults to None. + act_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config activation layer. Defaults to None. + anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor + generator. + bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Defaults to False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of + anchor head. + test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of + anchor head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], Optional): Initialization config dict. + """ # noqa: W605 + + def __init__( + self, + num_classes: int = 80, + in_channels: Sequence[int] = (512, 1024, 512, 256, 256, 256), + stacked_convs: int = 0, + feat_channels: int = 256, + use_depthwise: bool = False, + conv_cfg: Optional[ConfigType] = None, + norm_cfg: Optional[ConfigType] = None, + act_cfg: Optional[ConfigType] = None, + anchor_generator: ConfigType = dict( + type='SSDAnchorGenerator', + scale_major=False, + input_size=300, + strides=[8, 16, 32, 64, 100, 300], + ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]), + basesize_ratio_range=(0.1, 0.9)), + bbox_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + ), + reg_decoded_bbox: bool = False, + train_cfg: Optional[ConfigType] = None, + test_cfg: Optional[ConfigType] = None, + init_cfg: MultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform', bias=0) + ) -> None: + super(AnchorHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.in_channels = in_channels + self.stacked_convs = stacked_convs + self.feat_channels = feat_channels + self.use_depthwise = use_depthwise + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.cls_out_channels = num_classes + 1 # add background class + self.prior_generator = TASK_UTILS.build(anchor_generator) + + # Usually the numbers of anchors for each level are the same + # except SSD detectors. 
So it is an int in the most dense + # heads but a list of int in SSDHead + self.num_base_priors = self.prior_generator.num_base_priors + + self._init_layers() + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.reg_decoded_bbox = reg_decoded_bbox + self.use_sigmoid_cls = False + self.cls_focal_loss = False + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + # TODO: Use registry to choose ConvModule type + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + + for channel, num_base_priors in zip(self.in_channels, + self.num_base_priors): + cls_layers = [] + reg_layers = [] + in_channel = channel + # build stacked conv tower, not used in default ssd + for i in range(self.stacked_convs): + cls_layers.append( + conv( + in_channel, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_layers.append( + conv( + in_channel, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + in_channel = self.feat_channels + # SSD-Lite head + if self.use_depthwise: + cls_layers.append( + ConvModule( + in_channel, + in_channel, + 3, + padding=1, + groups=in_channel, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_layers.append( + ConvModule( + in_channel, + in_channel, + 3, + padding=1, + groups=in_channel, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + cls_layers.append( + nn.Conv2d( + in_channel, + num_base_priors * self.cls_out_channels, + kernel_size=1 if self.use_depthwise else 3, + padding=0 if self.use_depthwise else 1)) + reg_layers.append( + nn.Conv2d( + in_channel, + num_base_priors * 4, + kernel_size=1 if self.use_depthwise else 3, + padding=0 if self.use_depthwise else 1)) + self.cls_convs.append(nn.Sequential(*cls_layers)) + self.reg_convs.append(nn.Sequential(*reg_layers)) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple[list[Tensor], list[Tensor]]: A tuple of cls_scores list and + bbox_preds list. + + - cls_scores (list[Tensor]): Classification scores for all scale \ + levels, each is a 4D-tensor, the channels number is \ + num_anchors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale \ + levels, each is a 4D-tensor, the channels number is \ + num_anchors * 4. + """ + cls_scores = [] + bbox_preds = [] + for feat, reg_conv, cls_conv in zip(x, self.reg_convs, self.cls_convs): + cls_scores.append(cls_conv(feat)) + bbox_preds.append(reg_conv(feat)) + return cls_scores, bbox_preds + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchor: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, + avg_factor: int) -> Tuple[Tensor, Tensor]: + """Compute loss of a single image. 
+ + Args: + cls_score (Tensor): Box scores for eachimage + Has shape (num_total_anchors, num_classes). + bbox_pred (Tensor): Box energies / deltas for each image + level with shape (num_total_anchors, 4). + anchors (Tensor): Box reference for each scale level with shape + (num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (num_total_anchors,). + label_weights (Tensor): Label weights of each anchor with shape + (num_total_anchors,) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (num_total_anchors, 4). + avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + Tuple[Tensor, Tensor]: A tuple of cls loss and bbox loss of one + feature map. + """ + + loss_cls_all = F.cross_entropy( + cls_score, labels, reduction='none') * label_weights + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero( + as_tuple=False).reshape(-1) + neg_inds = (labels == self.num_classes).nonzero( + as_tuple=False).view(-1) + + num_pos_samples = pos_inds.size(0) + num_neg_samples = self.train_cfg['neg_pos_ratio'] * num_pos_samples + if num_neg_samples > neg_inds.size(0): + num_neg_samples = neg_inds.size(0) + topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) + loss_cls_pos = loss_cls_all[pos_inds].sum() + loss_cls_neg = topk_loss_cls_neg.sum() + loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor + + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + bbox_pred = self.bbox_coder.decode(anchor, bbox_pred) + + loss_bbox = smooth_l1_loss( + bbox_pred, + bbox_targets, + bbox_weights, + beta=self.train_cfg['smoothl1_beta'], + avg_factor=avg_factor) + return loss_cls[None], loss_bbox + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, List[Tensor]]: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, list[Tensor]]: A dictionary of loss components. the dict + has components below: + + - loss_cls (list[Tensor]): A list containing each feature map \ + classification loss. + - loss_bbox (list[Tensor]): A list containing each feature map \ + regression loss. 
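The classification term above relies on online hard negative mining: every positive anchor contributes, but only the neg_pos_ratio hardest negatives (largest cross-entropy) are kept. A compact sketch of that selection with a toy loss vector and hypothetical indices:

import torch

loss_cls_all = torch.rand(100)                      # toy per-anchor cross-entropy values
pos_inds = torch.tensor([3, 17, 42])                # hypothetical positive anchors
neg_inds = torch.tensor([i for i in range(100) if i not in (3, 17, 42)])

neg_pos_ratio = 3                                   # typical SSD setting from train_cfg
num_neg = min(neg_pos_ratio * pos_inds.numel(), neg_inds.numel())
topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg)      # hardest negatives only

loss_cls_pos = loss_cls_all[pos_inds].sum()
loss_cls_neg = topk_loss_cls_neg.sum()
# the head then divides (loss_cls_pos + loss_cls_neg) by avg_factor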
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + unmap_outputs=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor) = cls_reg_targets + + num_images = len(batch_img_metas) + all_cls_scores = torch.cat([ + s.permute(0, 2, 3, 1).reshape( + num_images, -1, self.cls_out_channels) for s in cls_scores + ], 1) + all_labels = torch.cat(labels_list, -1).view(num_images, -1) + all_label_weights = torch.cat(label_weights_list, + -1).view(num_images, -1) + all_bbox_preds = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) + for b in bbox_preds + ], -2) + all_bbox_targets = torch.cat(bbox_targets_list, + -2).view(num_images, -1, 4) + all_bbox_weights = torch.cat(bbox_weights_list, + -2).view(num_images, -1, 4) + + # concat all level anchors to a single tensor + all_anchors = [] + for i in range(num_images): + all_anchors.append(torch.cat(anchor_list[i])) + + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + all_cls_scores, + all_bbox_preds, + all_anchors, + all_labels, + all_label_weights, + all_bbox_targets, + all_bbox_weights, + avg_factor=avg_factor) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) diff --git a/mmdetection/mmdet/models/dense_heads/tood_head.py b/mmdetection/mmdet/models/dense_heads/tood_head.py new file mode 100644 index 0000000..8c59598 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/tood_head.py @@ -0,0 +1,805 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale +from mmcv.ops import deform_conv2d +from mmengine import MessageHub +from mmengine.config import ConfigDict +from mmengine.model import bias_init_with_prob, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import anchor_inside_flags +from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply, + sigmoid_geometric_mean, unmap) +from .atss_head import ATSSHead + + +class TaskDecomposition(nn.Module): + """Task decomposition module in task-aligned predictor of TOOD. + + Args: + feat_channels (int): Number of feature channels in TOOD head. + stacked_convs (int): Number of conv layers in TOOD head. + la_down_rate (int): Downsample rate of layer attention. + Defaults to 8. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Defaults to None. 
+ """ + + def __init__(self, + feat_channels: int, + stacked_convs: int, + la_down_rate: int = 8, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None) -> None: + super().__init__() + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.in_channels = self.feat_channels * self.stacked_convs + self.norm_cfg = norm_cfg + self.layer_attention = nn.Sequential( + nn.Conv2d(self.in_channels, self.in_channels // la_down_rate, 1), + nn.ReLU(inplace=True), + nn.Conv2d( + self.in_channels // la_down_rate, + self.stacked_convs, + 1, + padding=0), nn.Sigmoid()) + + self.reduction_conv = ConvModule( + self.in_channels, + self.feat_channels, + 1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=norm_cfg is None) + + def init_weights(self) -> None: + """Initialize the parameters.""" + for m in self.layer_attention.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + normal_init(self.reduction_conv.conv, std=0.01) + + def forward(self, + feat: Tensor, + avg_feat: Optional[Tensor] = None) -> Tensor: + """Forward function of task decomposition module.""" + b, c, h, w = feat.shape + if avg_feat is None: + avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) + weight = self.layer_attention(avg_feat) + + # here we first compute the product between layer attention weight and + # conv weight, and then compute the convolution between new conv weight + # and feature map, in order to save memory and FLOPs. + conv_weight = weight.reshape( + b, 1, self.stacked_convs, + 1) * self.reduction_conv.conv.weight.reshape( + 1, self.feat_channels, self.stacked_convs, self.feat_channels) + conv_weight = conv_weight.reshape(b, self.feat_channels, + self.in_channels) + feat = feat.reshape(b, self.in_channels, h * w) + feat = torch.bmm(conv_weight, feat).reshape(b, self.feat_channels, h, + w) + if self.norm_cfg is not None: + feat = self.reduction_conv.norm(feat) + feat = self.reduction_conv.activate(feat) + + return feat + + +@MODELS.register_module() +class TOODHead(ATSSHead): + """TOODHead used in `TOOD: Task-aligned One-stage Object Detection. + + `_. + + TOOD uses Task-aligned head (T-head) and is optimized by Task Alignment + Learning (TAL). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + num_dcn (int): Number of deformable convolution in the head. + Defaults to 0. + anchor_type (str): If set to ``anchor_free``, the head will use centers + to regress bboxes. If set to ``anchor_based``, the head will + regress bboxes based on anchors. Defaults to ``anchor_free``. + initial_loss_cls (:obj:`ConfigDict` or dict): Config of initial loss. 
+ + Example: + >>> self = TOODHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred = self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ + + def __init__(self, + num_classes: int, + in_channels: int, + num_dcn: int = 0, + anchor_type: str = 'anchor_free', + initial_loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + activated=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + **kwargs) -> None: + assert anchor_type in ['anchor_free', 'anchor_based'] + self.num_dcn = num_dcn + self.anchor_type = anchor_type + super().__init__( + num_classes=num_classes, in_channels=in_channels, **kwargs) + + if self.train_cfg: + self.initial_epoch = self.train_cfg['initial_epoch'] + self.initial_assigner = TASK_UTILS.build( + self.train_cfg['initial_assigner']) + self.initial_loss_cls = MODELS.build(initial_loss_cls) + self.assigner = self.initial_assigner + self.alignment_assigner = TASK_UTILS.build( + self.train_cfg['assigner']) + self.alpha = self.train_cfg['alpha'] + self.beta = self.train_cfg['beta'] + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.inter_convs = nn.ModuleList() + for i in range(self.stacked_convs): + if i < self.num_dcn: + conv_cfg = dict(type='DCNv2', deform_groups=4) + else: + conv_cfg = self.conv_cfg + chn = self.in_channels if i == 0 else self.feat_channels + self.inter_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg)) + + self.cls_decomp = TaskDecomposition(self.feat_channels, + self.stacked_convs, + self.stacked_convs * 8, + self.conv_cfg, self.norm_cfg) + self.reg_decomp = TaskDecomposition(self.feat_channels, + self.stacked_convs, + self.stacked_convs * 8, + self.conv_cfg, self.norm_cfg) + + self.tood_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.tood_reg = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + + self.cls_prob_module = nn.Sequential( + nn.Conv2d(self.feat_channels * self.stacked_convs, + self.feat_channels // 4, 1), nn.ReLU(inplace=True), + nn.Conv2d(self.feat_channels // 4, 1, 3, padding=1)) + self.reg_offset_module = nn.Sequential( + nn.Conv2d(self.feat_channels * self.stacked_convs, + self.feat_channels // 4, 1), nn.ReLU(inplace=True), + nn.Conv2d(self.feat_channels // 4, 4 * 2, 3, padding=1)) + + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + bias_cls = bias_init_with_prob(0.01) + for m in self.inter_convs: + normal_init(m.conv, std=0.01) + for m in self.cls_prob_module: + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.01) + for m in self.reg_offset_module: + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + normal_init(self.cls_prob_module[-1], std=0.01, bias=bias_cls) + + self.cls_decomp.init_weights() + self.reg_decomp.init_weights() + + normal_init(self.tood_cls, std=0.01, bias=bias_cls) + normal_init(self.tood_reg, std=0.01) + + def forward(self, feats: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + bbox_preds (list[Tensor]): Decoded box for all scale levels, + each is a 4D-tensor, the channels number is + num_anchors * 4. In [tl_x, tl_y, br_x, br_y] format. + """ + cls_scores = [] + bbox_preds = [] + for idx, (x, scale, stride) in enumerate( + zip(feats, self.scales, self.prior_generator.strides)): + b, c, h, w = x.shape + anchor = self.prior_generator.single_level_grid_priors( + (h, w), idx, device=x.device) + anchor = torch.cat([anchor for _ in range(b)]) + # extract task interactive features + inter_feats = [] + for inter_conv in self.inter_convs: + x = inter_conv(x) + inter_feats.append(x) + feat = torch.cat(inter_feats, 1) + + # task decomposition + avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) + cls_feat = self.cls_decomp(feat, avg_feat) + reg_feat = self.reg_decomp(feat, avg_feat) + + # cls prediction and alignment + cls_logits = self.tood_cls(cls_feat) + cls_prob = self.cls_prob_module(feat) + cls_score = sigmoid_geometric_mean(cls_logits, cls_prob) + + # reg prediction and alignment + if self.anchor_type == 'anchor_free': + reg_dist = scale(self.tood_reg(reg_feat).exp()).float() + reg_dist = reg_dist.permute(0, 2, 3, 1).reshape(-1, 4) + reg_bbox = distance2bbox( + self.anchor_center(anchor) / stride[0], + reg_dist).reshape(b, h, w, 4).permute(0, 3, 1, + 2) # (b, c, h, w) + elif self.anchor_type == 'anchor_based': + reg_dist = scale(self.tood_reg(reg_feat)).float() + reg_dist = reg_dist.permute(0, 2, 3, 1).reshape(-1, 4) + reg_bbox = self.bbox_coder.decode(anchor, reg_dist).reshape( + b, h, w, 4).permute(0, 3, 1, 2) / stride[0] + else: + raise NotImplementedError( + f'Unknown anchor type: {self.anchor_type}.' + f'Please use `anchor_free` or `anchor_based`.') + reg_offset = self.reg_offset_module(feat) + bbox_pred = self.deform_sampling(reg_bbox.contiguous(), + reg_offset.contiguous()) + + # After deform_sampling, some boxes will become invalid (The + # left-top point is at the right or bottom of the right-bottom + # point), which will make the GIoULoss negative. + invalid_bbox_idx = (bbox_pred[:, [0]] > bbox_pred[:, [2]]) | \ + (bbox_pred[:, [1]] > bbox_pred[:, [3]]) + invalid_bbox_idx = invalid_bbox_idx.expand_as(bbox_pred) + bbox_pred = torch.where(invalid_bbox_idx, reg_bbox, bbox_pred) + + cls_scores.append(cls_score) + bbox_preds.append(bbox_pred) + return tuple(cls_scores), tuple(bbox_preds) + + def deform_sampling(self, feat: Tensor, offset: Tensor) -> Tensor: + """Sampling the feature x according to offset. + + Args: + feat (Tensor): Feature + offset (Tensor): Spatial offset for feature sampling + """ + # it is an equivalent implementation of bilinear interpolation + b, c, h, w = feat.shape + weight = feat.new_ones(c, 1, 1, 1) + y = deform_conv2d(feat, offset, weight, 1, 0, 1, c, c) + return y + + def anchor_center(self, anchors: Tensor) -> Tensor: + """Get anchor centers from anchors. + + Args: + anchors (Tensor): Anchor list with shape (N, 4), "xyxy" format. + + Returns: + Tensor: Anchor centers with shape (N, 2), "xy" format. 
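In the forward pass above, the per-class logits and the class-agnostic probability map are fused with sigmoid_geometric_mean; its forward value is the geometric mean of the two sigmoid maps (the library wraps the computation in a custom autograd function). A reference computation with hypothetical shapes:

import torch

cls_logits = torch.randn(2, 80, 32, 32)         # (b, num_classes, h, w) from tood_cls
cls_prob = torch.randn(2, 1, 32, 32)            # (b, 1, h, w) from cls_prob_module

# geometric mean of the two sigmoids; broadcasting applies the
# class-agnostic map to every class channel
cls_score = (cls_logits.sigmoid() * cls_prob.sigmoid()).sqrt()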
+ """ + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + return torch.stack([anchors_cx, anchors_cy], dim=-1) + + def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + alignment_metrics: Tensor, + stride: Tuple[int, int]) -> dict: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Decoded bboxes for each scale + level with shape (N, num_anchors * 4, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors). + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + alignment_metrics (Tensor): Alignment metrics with shape + (N, num_total_anchors). + stride (Tuple[int, int]): Downsample stride of the feature map. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + alignment_metrics = alignment_metrics.reshape(-1) + label_weights = label_weights.reshape(-1) + targets = labels if self.epoch < self.initial_epoch else ( + labels, alignment_metrics) + cls_loss_func = self.initial_loss_cls \ + if self.epoch < self.initial_epoch else self.loss_cls + + loss_cls = cls_loss_func( + cls_score, targets, label_weights, avg_factor=1.0) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + + pos_decode_bbox_pred = pos_bbox_pred + pos_decode_bbox_targets = pos_bbox_targets / stride[0] + + # regression loss + pos_bbox_weight = self.centerness_target( + pos_anchors, pos_bbox_targets + ) if self.epoch < self.initial_epoch else alignment_metrics[ + pos_inds] + + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=pos_bbox_weight, + avg_factor=1.0) + else: + loss_bbox = bbox_pred.sum() * 0 + pos_bbox_weight = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, alignment_metrics.sum( + ), pos_bbox_weight.sum() + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. 
It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1) + flatten_bbox_preds = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) * stride[0] + for bbox_pred, stride in zip(bbox_preds, + self.prior_generator.strides) + ], 1) + + cls_reg_targets = self.get_targets( + flatten_cls_scores, + flatten_bbox_preds, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + alignment_metrics_list) = cls_reg_targets + + losses_cls, losses_bbox, \ + cls_avg_factors, bbox_avg_factors = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + alignment_metrics_list, + self.prior_generator.strides) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls)) + + bbox_avg_factor = reduce_mean( + sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict`, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. 
If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape \ + [num_bboxes, 5], where the first 4 columns are bounding \ + box positions (tl_x, tl_y, br_x, br_y) and the 5-th \ + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding \ + box with shape [num_bboxes]. + """ + + cfg = self.test_cfg if cfg is None else cfg + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + for cls_score, bbox_pred, priors, stride in zip( + cls_score_list, bbox_pred_list, mlvl_priors, + self.prior_generator.strides): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) * stride[0] + scores = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, keep_idxs, filtered_results = results + + bboxes = filtered_results['bbox_pred'] + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def get_targets(self, + cls_scores: List[List[Tensor]], + bbox_preds: List[List[Tensor]], + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in + multiple images. + + Args: + cls_scores (list[list[Tensor]]): Classification predictions of + images, a 3D-Tensor with shape [num_imgs, num_priors, + num_classes]. + bbox_preds (list[list[Tensor]]): Decoded bboxes predictions of one + image, a 3D-Tensor with shape [num_imgs, num_priors, 4] in + [tl_x, tl_y, br_x, br_y] format. + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. 
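Per level, filter_scores_and_topk above discards scores below score_thr and keeps at most nms_pre candidates before NMS. A conceptually equivalent pure-PyTorch sketch (not the mmdet utility itself) that makes the selection explicit:

import torch

scores = torch.rand(1000, 80)                   # (num_priors, num_classes) scores of one level
score_thr, nms_pre = 0.05, 100

valid = scores > score_thr                      # candidate (prior, class) pairs
valid_scores = scores[valid]
prior_idx, labels = valid.nonzero(as_tuple=True)

num_keep = min(nms_pre, valid_scores.numel())
top_scores, topk_idx = valid_scores.topk(num_keep)
keep_priors, keep_labels = prior_idx[topk_idx], labels[topk_idx]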
+ Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: a tuple containing learning targets. + + - anchors_list (list[list[Tensor]]): Anchors of each level. + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - norm_alignment_metrics_list (list[Tensor]): Normalized + alignment metrics of each level. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + # anchor_list: list(b * [-1, 4]) + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + self.epoch = message_hub.get_info('epoch') + + if self.epoch < self.initial_epoch: + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_result) = multi_apply( + super()._get_targets_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + all_assign_metrics = [ + weight[..., 0] for weight in all_bbox_weights + ] + else: + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_assign_metrics) = multi_apply( + self._get_targets_single, + cls_scores, + bbox_preds, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + norm_alignment_metrics_list = images_to_levels(all_assign_metrics, + num_level_anchors) + + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, norm_alignment_metrics_list) + + def _get_targets_single(self, + cls_scores: Tensor, + bbox_preds: Tensor, + flat_anchors: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + cls_scores (Tensor): Box scores for each image. + bbox_preds (Tensor): Box energies / deltas for each image. + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. 
+ gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + anchors (Tensor): All anchors in the image with shape (N, 4). + labels (Tensor): Labels of all anchors in the image with shape + (N,). + label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + norm_alignment_metrics (Tensor): Normalized alignment metrics + of all priors in the image with shape (N,). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + pred_instances = InstanceData( + priors=anchors, + scores=cls_scores[inside_flags, :], + bboxes=bbox_preds[inside_flags, :]) + assign_result = self.alignment_assigner.assign(pred_instances, + gt_instances, + gt_instances_ignore, + self.alpha, self.beta) + assign_ious = assign_result.max_overlaps + assign_metrics = assign_result.assign_metrics + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + norm_alignment_metrics = anchors.new_zeros( + num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + # point-based + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + class_assigned_gt_inds = torch.unique( + sampling_result.pos_assigned_gt_inds) + for gt_inds in class_assigned_gt_inds: + gt_class_inds = pos_inds[sampling_result.pos_assigned_gt_inds == + gt_inds] + pos_alignment_metrics = assign_metrics[gt_class_inds] + pos_ious = assign_ious[gt_class_inds] + pos_norm_alignment_metrics = pos_alignment_metrics / ( + pos_alignment_metrics.max() + 10e-8) * pos_ious.max() + norm_alignment_metrics[gt_class_inds] = pos_norm_alignment_metrics + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + norm_alignment_metrics = unmap(norm_alignment_metrics, + num_total_anchors, inside_flags) + return (anchors, labels, label_weights, bbox_targets, + norm_alignment_metrics) diff --git a/mmdetection/mmdet/models/dense_heads/vfnet_head.py 
b/mmdetection/mmdet/models/dense_heads/vfnet_head.py new file mode 100644 index 0000000..430b06d --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/vfnet_head.py @@ -0,0 +1,722 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale +from mmcv.ops import DeformConv2d +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, + OptInstanceList, RangeType, reduce_mean) +from ..task_modules.prior_generators import MlvlPointGenerator +from ..task_modules.samplers import PseudoSampler +from ..utils import multi_apply +from .atss_head import ATSSHead +from .fcos_head import FCOSHead + +INF = 1e8 + + +@MODELS.register_module() +class VFNetHead(ATSSHead, FCOSHead): + """Head of `VarifocalNet (VFNet): An IoU-aware Dense Object + Detector.`_. + + The VFNet predicts IoU-aware classification scores which mix the + object presence confidence and object localization accuracy as the + detection score. It is built on the FCOS architecture and uses ATSS + for defining positive/negative training examples. The VFNet is trained + with Varifocal Loss and empolys star-shaped deformable convolution to + extract features for a bbox. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. Defaults to False. + center_sample_radius (float): Radius of center sampling. Defaults to 1.5. + sync_num_pos (bool): If true, synchronize the number of positive + examples across GPUs. Defaults to True + gradient_mul (float): The multiplier to gradients from bbox refinement + and recognition. Defaults to 0.1. + bbox_norm_type (str): The bbox normalization type, 'reg_denom' or + 'stride'. Defaults to reg_denom + loss_cls_fl (:obj:`ConfigDict` or dict): Config of focal loss. + use_vfl (bool): If true, use varifocal loss for training. + Defaults to True. + loss_cls (:obj:`ConfigDict` or dict): Config of varifocal loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss, + GIoU Loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization + refinement loss, GIoU Loss. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. Defaults to norm_cfg=dict(type='GN', + num_groups=32, requires_grad=True). + use_atss (bool): If true, use ATSS to define positive/negative + examples. Defaults to True. + anchor_generator (:obj:`ConfigDict` or dict): Config of anchor + generator for ATSS. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
+ + Example: + >>> self = VFNetHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred, bbox_pred_refine= self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ # noqa: E501 + + def __init__(self, + num_classes: int, + in_channels: int, + regress_ranges: RangeType = ((-1, 64), (64, 128), (128, 256), + (256, 512), (512, INF)), + center_sampling: bool = False, + center_sample_radius: float = 1.5, + sync_num_pos: bool = True, + gradient_mul: float = 0.1, + bbox_norm_type: str = 'reg_denom', + loss_cls_fl: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + use_vfl: bool = True, + loss_cls: ConfigType = dict( + type='VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='GIoULoss', loss_weight=1.5), + loss_bbox_refine: ConfigType = dict( + type='GIoULoss', loss_weight=2.0), + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + use_atss: bool = True, + reg_decoded_bbox: bool = True, + anchor_generator: ConfigType = dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + center_offset=0.0, + strides=[8, 16, 32, 64, 128]), + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='vfnet_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + # dcn base offsets, adapted from reppoints_head.py + self.num_dconv_points = 9 + self.dcn_kernel = int(np.sqrt(self.num_dconv_points)) + self.dcn_pad = int((self.dcn_kernel - 1) / 2) + dcn_base = np.arange(-self.dcn_pad, + self.dcn_pad + 1).astype(np.float64) + dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) + dcn_base_x = np.tile(dcn_base, self.dcn_kernel) + dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( + (-1)) + self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) + + super(FCOSHead, self).__init__( + num_classes=num_classes, + in_channels=in_channels, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.regress_ranges = regress_ranges + self.reg_denoms = [ + regress_range[-1] for regress_range in regress_ranges + ] + self.reg_denoms[-1] = self.reg_denoms[-2] * 2 + self.center_sampling = center_sampling + self.center_sample_radius = center_sample_radius + self.sync_num_pos = sync_num_pos + self.bbox_norm_type = bbox_norm_type + self.gradient_mul = gradient_mul + self.use_vfl = use_vfl + if self.use_vfl: + self.loss_cls = MODELS.build(loss_cls) + else: + self.loss_cls = MODELS.build(loss_cls_fl) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_bbox_refine = MODELS.build(loss_bbox_refine) + + # for getting ATSS targets + self.use_atss = use_atss + self.reg_decoded_bbox = reg_decoded_bbox + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + + self.anchor_center_offset = anchor_generator['center_offset'] + + self.num_base_priors = self.prior_generator.num_base_priors[0] + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler() + # only be used in `get_atss_targets` when `use_atss` is True + self.atss_prior_generator = TASK_UTILS.build(anchor_generator) + + self.fcos_prior_generator = MlvlPointGenerator( + anchor_generator['strides'], + 
self.anchor_center_offset if self.use_atss else 0.5) + + # In order to reuse the `get_bboxes` in `BaseDenseHead. + # Only be used in testing phase. + self.prior_generator = self.fcos_prior_generator + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + super(FCOSHead, self)._init_cls_convs() + super(FCOSHead, self)._init_reg_convs() + self.relu = nn.ReLU() + self.vfnet_reg_conv = ConvModule( + self.feat_channels, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias) + self.vfnet_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + self.vfnet_reg_refine_dconv = DeformConv2d( + self.feat_channels, + self.feat_channels, + self.dcn_kernel, + 1, + padding=self.dcn_pad) + self.vfnet_reg_refine = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.scales_refine = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + self.vfnet_cls_dconv = DeformConv2d( + self.feat_channels, + self.feat_channels, + self.dcn_kernel, + 1, + padding=self.dcn_pad) + self.vfnet_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + + - cls_scores (list[Tensor]): Box iou-aware scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_classes. + - bbox_preds (list[Tensor]): Box offsets for each + scale level, each is a 4D-tensor, the channel number is + num_points * 4. + - bbox_preds_refine (list[Tensor]): Refined Box offsets for + each scale level, each is a 4D-tensor, the channel + number is num_points * 4. + """ + return multi_apply(self.forward_single, x, self.scales, + self.scales_refine, self.strides, self.reg_denoms) + + def forward_single(self, x: Tensor, scale: Scale, scale_refine: Scale, + stride: int, reg_denom: int) -> tuple: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + scale_refine (:obj: `mmcv.cnn.Scale`): Learnable scale module to + resize the refined bbox prediction. + stride (int): The corresponding stride for feature maps, + used to normalize the bbox prediction when + bbox_norm_type = 'stride'. + reg_denom (int): The corresponding regression range for feature + maps, only used to normalize the bbox prediction when + bbox_norm_type = 'reg_denom'. + + Returns: + tuple: iou-aware cls scores for each box, bbox predictions and + refined bbox predictions of input feature maps. 
+ """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + + # predict the bbox_pred of different level + reg_feat_init = self.vfnet_reg_conv(reg_feat) + if self.bbox_norm_type == 'reg_denom': + bbox_pred = scale( + self.vfnet_reg(reg_feat_init)).float().exp() * reg_denom + elif self.bbox_norm_type == 'stride': + bbox_pred = scale( + self.vfnet_reg(reg_feat_init)).float().exp() * stride + else: + raise NotImplementedError + + # compute star deformable convolution offsets + # converting dcn_offset to reg_feat.dtype thus VFNet can be + # trained with FP16 + dcn_offset = self.star_dcn_offset(bbox_pred, self.gradient_mul, + stride).to(reg_feat.dtype) + + # refine the bbox_pred + reg_feat = self.relu(self.vfnet_reg_refine_dconv(reg_feat, dcn_offset)) + bbox_pred_refine = scale_refine( + self.vfnet_reg_refine(reg_feat)).float().exp() + bbox_pred_refine = bbox_pred_refine * bbox_pred.detach() + + # predict the iou-aware cls score + cls_feat = self.relu(self.vfnet_cls_dconv(cls_feat, dcn_offset)) + cls_score = self.vfnet_cls(cls_feat) + + if self.training: + return cls_score, bbox_pred, bbox_pred_refine + else: + return cls_score, bbox_pred_refine + + def star_dcn_offset(self, bbox_pred: Tensor, gradient_mul: float, + stride: int) -> Tensor: + """Compute the star deformable conv offsets. + + Args: + bbox_pred (Tensor): Predicted bbox distance offsets (l, r, t, b). + gradient_mul (float): Gradient multiplier. + stride (int): The corresponding stride for feature maps, + used to project the bbox onto the feature map. + + Returns: + Tensor: The offsets for deformable convolution. + """ + dcn_base_offset = self.dcn_base_offset.type_as(bbox_pred) + bbox_pred_grad_mul = (1 - gradient_mul) * bbox_pred.detach() + \ + gradient_mul * bbox_pred + # map to the feature map scale + bbox_pred_grad_mul = bbox_pred_grad_mul / stride + N, C, H, W = bbox_pred.size() + + x1 = bbox_pred_grad_mul[:, 0, :, :] + y1 = bbox_pred_grad_mul[:, 1, :, :] + x2 = bbox_pred_grad_mul[:, 2, :, :] + y2 = bbox_pred_grad_mul[:, 3, :, :] + bbox_pred_grad_mul_offset = bbox_pred.new_zeros( + N, 2 * self.num_dconv_points, H, W) + bbox_pred_grad_mul_offset[:, 0, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 1, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 2, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 4, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 5, :, :] = x2 # x2 + bbox_pred_grad_mul_offset[:, 7, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 11, :, :] = x2 # x2 + bbox_pred_grad_mul_offset[:, 12, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 13, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 14, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 16, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 17, :, :] = x2 # x2 + dcn_offset = bbox_pred_grad_mul_offset - dcn_base_offset + + return dcn_offset + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + bbox_preds_refine: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box offsets for each + scale level, each is a 4D-tensor, the channel number is + num_points * 4. 
+ bbox_preds_refine (list[Tensor]): Refined Box offsets for + each scale level, each is a 4D-tensor, the channel + number is num_points * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.fcos_prior_generator.grid_priors( + featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device) + labels, label_weights, bbox_targets, bbox_weights = self.get_targets( + cls_scores, + all_level_points, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and bbox_preds_refine + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, + 1).reshape(-1, + self.cls_out_channels).contiguous() + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4).contiguous() + for bbox_pred in bbox_preds + ] + flatten_bbox_preds_refine = [ + bbox_pred_refine.permute(0, 2, 3, 1).reshape(-1, 4).contiguous() + for bbox_pred_refine in bbox_preds_refine + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_bbox_preds_refine = torch.cat(flatten_bbox_preds_refine) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = torch.where( + ((flatten_labels >= 0) & (flatten_labels < bg_class_ind)) > 0)[0] + num_pos = len(pos_inds) + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_bbox_preds_refine = flatten_bbox_preds_refine[pos_inds] + pos_labels = flatten_labels[pos_inds] + + # sync num_pos across all gpus + if self.sync_num_pos: + num_pos_avg_per_gpu = reduce_mean( + pos_inds.new_tensor(num_pos).float()).item() + num_pos_avg_per_gpu = max(num_pos_avg_per_gpu, 1.0) + else: + num_pos_avg_per_gpu = num_pos + + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_points = flatten_points[pos_inds] + + pos_decoded_bbox_preds = self.bbox_coder.decode( + pos_points, pos_bbox_preds) + pos_decoded_target_preds = self.bbox_coder.decode( + pos_points, pos_bbox_targets) + iou_targets_ini = bbox_overlaps( + pos_decoded_bbox_preds, + pos_decoded_target_preds.detach(), + is_aligned=True).clamp(min=1e-6) + bbox_weights_ini = iou_targets_ini.clone().detach() + bbox_avg_factor_ini = reduce_mean( + bbox_weights_ini.sum()).clamp_(min=1).item() + + pos_decoded_bbox_preds_refine = \ + self.bbox_coder.decode(pos_points, pos_bbox_preds_refine) + iou_targets_rf = bbox_overlaps( + pos_decoded_bbox_preds_refine, + pos_decoded_target_preds.detach(), + is_aligned=True).clamp(min=1e-6) + bbox_weights_rf = iou_targets_rf.clone().detach() + bbox_avg_factor_rf = reduce_mean( + bbox_weights_rf.sum()).clamp_(min=1).item() + + if num_pos > 0: + loss_bbox 
= self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds.detach(), + weight=bbox_weights_ini, + avg_factor=bbox_avg_factor_ini) + + loss_bbox_refine = self.loss_bbox_refine( + pos_decoded_bbox_preds_refine, + pos_decoded_target_preds.detach(), + weight=bbox_weights_rf, + avg_factor=bbox_avg_factor_rf) + + # build IoU-aware cls_score targets + if self.use_vfl: + pos_ious = iou_targets_rf.clone().detach() + cls_iou_targets = torch.zeros_like(flatten_cls_scores) + cls_iou_targets[pos_inds, pos_labels] = pos_ious + else: + loss_bbox = pos_bbox_preds.sum() * 0 + loss_bbox_refine = pos_bbox_preds_refine.sum() * 0 + if self.use_vfl: + cls_iou_targets = torch.zeros_like(flatten_cls_scores) + + if self.use_vfl: + loss_cls = self.loss_cls( + flatten_cls_scores, + cls_iou_targets, + avg_factor=num_pos_avg_per_gpu) + else: + loss_cls = self.loss_cls( + flatten_cls_scores, + flatten_labels, + weight=label_weights, + avg_factor=num_pos_avg_per_gpu) + + return dict( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_bbox_rf=loss_bbox_refine) + + def get_targets( + self, + cls_scores: List[Tensor], + mlvl_points: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> tuple: + """A wrapper for computing ATSS and FCOS targets for points in multiple + images. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level with shape (N, num_points * num_classes, H, W). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights (Tensor/None): Label weights of all levels. + - bbox_targets_list (list[Tensor]): Regression targets of each + level, (l, t, r, b). + - bbox_weights (Tensor/None): Bbox weights of all levels. + """ + if self.use_atss: + return self.get_atss_targets(cls_scores, mlvl_points, + batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + else: + self.norm_on_bbox = False + return self.get_fcos_targets(mlvl_points, batch_gt_instances) + + def _get_targets_single(self, *args, **kwargs): + """Avoid ambiguity in multiple inheritance.""" + if self.use_atss: + return ATSSHead._get_targets_single(self, *args, **kwargs) + else: + return FCOSHead._get_targets_single(self, *args, **kwargs) + + def get_fcos_targets(self, points: List[Tensor], + batch_gt_instances: InstanceList) -> tuple: + """Compute FCOS regression and classification targets for points in + multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: + + - labels (list[Tensor]): Labels of each level. + - label_weights: None, to be compatible with ATSS targets. + - bbox_targets (list[Tensor]): BBox targets of each level. + - bbox_weights: None, to be compatible with ATSS targets. 
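When use_vfl is on, the classification target built above is not a one-hot label: the IoU between the refined box and its matched ground truth is written into the positive class channel and everything else stays zero, which is what Varifocal Loss expects. A short sketch of that target construction with hypothetical indices:

import torch

num_points, num_classes = 6, 80
flatten_cls_scores = torch.zeros(num_points, num_classes)   # stands in for the prediction shape

pos_inds = torch.tensor([1, 4])                 # hypothetical positive point indices
pos_labels = torch.tensor([17, 3])              # their assigned classes
pos_ious = torch.tensor([0.82, 0.64])           # IoU of refined boxes vs. matched GT

cls_iou_targets = torch.zeros_like(flatten_cls_scores)
cls_iou_targets[pos_inds, pos_labels] = pos_ious            # soft, IoU-aware positives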
+ """ + labels, bbox_targets = FCOSHead.get_targets(self, points, + batch_gt_instances) + label_weights = None + bbox_weights = None + return labels, label_weights, bbox_targets, bbox_weights + + def get_anchors(self, + featmap_sizes: List[Tuple], + batch_img_metas: List[dict], + device: str = 'cuda') -> tuple: + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + device (str): Device for returned tensors + + Returns: + tuple: + + - anchor_list (list[Tensor]): Anchors of each image. + - valid_flag_list (list[Tensor]): Valid flags of each image. + """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.atss_prior_generator.grid_priors( + featmap_sizes, device=device) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = self.atss_prior_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device=device) + valid_flag_list.append(multi_level_flags) + + return anchor_list, valid_flag_list + + def get_atss_targets( + self, + cls_scores: List[Tensor], + mlvl_points: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> tuple: + """A wrapper for computing ATSS targets for points in multiple images. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level with shape (N, num_points * num_classes, H, W). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights (Tensor): Label weights of all levels. + - bbox_targets_list (list[Tensor]): Regression targets of each + level, (l, t, r, b). + - bbox_weights (Tensor): Bbox weights of all levels. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len( + featmap_sizes + ) == self.atss_prior_generator.num_levels == \ + self.fcos_prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = ATSSHead.get_targets( + self, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=True) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + + bbox_targets_list = [ + bbox_targets.reshape(-1, 4) for bbox_targets in bbox_targets_list + ] + + num_imgs = len(batch_img_metas) + # transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format + bbox_targets_list = self.transform_bbox_targets( + bbox_targets_list, mlvl_points, num_imgs) + + labels_list = [labels.reshape(-1) for labels in labels_list] + label_weights_list = [ + label_weights.reshape(-1) for label_weights in label_weights_list + ] + bbox_weights_list = [ + bbox_weights.reshape(-1) for bbox_weights in bbox_weights_list + ] + label_weights = torch.cat(label_weights_list) + bbox_weights = torch.cat(bbox_weights_list) + return labels_list, label_weights, bbox_targets_list, bbox_weights + + def transform_bbox_targets(self, decoded_bboxes: List[Tensor], + mlvl_points: List[Tensor], + num_imgs: int) -> List[Tensor]: + """Transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format. + + Args: + decoded_bboxes (list[Tensor]): Regression targets of each level, + in the form of (x1, y1, x2, y2). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + num_imgs (int): the number of images in a batch. + + Returns: + bbox_targets (list[Tensor]): Regression targets of each level in + the form of (l, t, r, b). + """ + # TODO: Re-implemented in Class PointCoder + assert len(decoded_bboxes) == len(mlvl_points) + num_levels = len(decoded_bboxes) + mlvl_points = [points.repeat(num_imgs, 1) for points in mlvl_points] + bbox_targets = [] + for i in range(num_levels): + bbox_target = self.bbox_coder.encode(mlvl_points[i], + decoded_bboxes[i]) + bbox_targets.append(bbox_target) + + return bbox_targets + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> None: + """Override the method in the parent class to avoid changing para's + name.""" + pass diff --git a/mmdetection/mmdet/models/dense_heads/yolact_head.py b/mmdetection/mmdet/models/dense_heads/yolact_head.py new file mode 100644 index 0000000..3390c13 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/yolact_head.py @@ -0,0 +1,1193 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
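+
+# Conceptual sketch of what this module implements (shapes are illustrative,
+# not normative): the box head below predicts ``num_protos`` mask coefficients
+# per detection and the protonet predicts the same number of prototype maps;
+# instance masks come from a linear combination of the two,
+#
+#     masks = sigmoid(prototypes @ coeffs.t())  # (H, W, K) @ (K, N) -> (H, W, N)
+#
+# See ``YOLACTProtonet.forward`` further down for the actual combination and
+# the subsequent box-based cropping.
+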
+import copy +from typing import List, Optional + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from ..layers import fast_nms +from ..utils import images_to_levels, multi_apply, select_single_mlvl +from ..utils.misc import empty_instances +from .anchor_head import AnchorHead +from .base_mask_head import BaseMaskHead + + +@MODELS.register_module() +class YOLACTHead(AnchorHead): + """YOLACT box head used in https://arxiv.org/abs/1904.02689. + + Note that YOLACT head is a light version of RetinaNet head. + Four differences are described as follows: + + 1. YOLACT box head has three-times fewer anchors. + 2. YOLACT box head shares the convs for box and cls branches. + 3. YOLACT box head uses OHEM instead of Focal loss. + 4. YOLACT box head predicts a set of mask coefficients for each box. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + anchor_generator (:obj:`ConfigDict` or dict): Config dict for + anchor generator + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + num_head_convs (int): Number of the conv layers shared by + box and cls branches. + num_protos (int): Number of the mask coefficients. + use_ohem (bool): If true, ``loss_single_OHEM`` will be used for + cls loss calculation. If false, ``loss_single`` will be used. + conv_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to + construct and config conv layer. + norm_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to + construct and config norm layer. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. 
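+
+    Example:
+        A minimal, illustrative head config; the values simply restate the
+        defaults above and are assumptions for the sketch, not
+        recommendations::
+
+            bbox_head = dict(
+                type='YOLACTHead',
+                num_classes=80,
+                in_channels=256,
+                num_head_convs=1,
+                num_protos=32,
+                use_ohem=True)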
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + anchor_generator: ConfigType = dict( + type='AnchorGenerator', + octave_base_scale=3, + scales_per_octave=1, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + reduction='none', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1.5), + num_head_convs: int = 1, + num_protos: int = 32, + use_ohem: bool = True, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = dict( + type='Xavier', + distribution='uniform', + bias=0, + layer='Conv2d'), + **kwargs) -> None: + self.num_head_convs = num_head_convs + self.num_protos = num_protos + self.use_ohem = use_ohem + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + anchor_generator=anchor_generator, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.head_convs = ModuleList() + for i in range(self.num_head_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.head_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.conv_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.conv_reg = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + self.conv_coeff = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.num_protos, + 3, + padding=1) + + def forward_single(self, x: Tensor) -> tuple: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + + - cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for a single scale + level, the channels number is num_anchors * 4. + - coeff_pred (Tensor): Mask coefficients for a single scale + level, the channels number is num_anchors * num_protos. + """ + for head_conv in self.head_convs: + x = head_conv(x) + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + coeff_pred = self.conv_coeff(x).tanh() + return cls_score, bbox_pred, coeff_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + coeff_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the bbox head. + + When ``self.use_ohem == True``, it functions like ``SSDHead.loss``, + otherwise, it follows ``AnchorHead.loss``. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + coeff_preds (list[Tensor]): Mask coefficients for each scale + level with shape (N, num_anchors * num_protos, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + unmap_outputs=not self.use_ohem, + return_sampling_results=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results) = cls_reg_targets + + if self.use_ohem: + num_images = len(batch_img_metas) + all_cls_scores = torch.cat([ + s.permute(0, 2, 3, 1).reshape( + num_images, -1, self.cls_out_channels) for s in cls_scores + ], 1) + all_labels = torch.cat(labels_list, -1).view(num_images, -1) + all_label_weights = torch.cat(label_weights_list, + -1).view(num_images, -1) + all_bbox_preds = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) + for b in bbox_preds + ], -2) + all_bbox_targets = torch.cat(bbox_targets_list, + -2).view(num_images, -1, 4) + all_bbox_weights = torch.cat(bbox_weights_list, + -2).view(num_images, -1, 4) + + # concat all level anchors to a single tensor + all_anchors = [] + for i in range(num_images): + all_anchors.append(torch.cat(anchor_list[i])) + + # check NaN and Inf + assert torch.isfinite(all_cls_scores).all().item(), \ + 'classification scores become infinite or NaN!' + assert torch.isfinite(all_bbox_preds).all().item(), \ + 'bbox predications become infinite or NaN!' + + losses_cls, losses_bbox = multi_apply( + self.OHEMloss_by_feat_single, + all_cls_scores, + all_bbox_preds, + all_anchors, + all_labels, + all_label_weights, + all_bbox_targets, + all_bbox_weights, + avg_factor=avg_factor) + else: + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + losses = dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + # update `_raw_positive_infos`, which will be used when calling + # `get_positive_infos`. + self._raw_positive_infos.update(coeff_preds=coeff_preds) + return losses + + def OHEMloss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchors: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, + avg_factor: int) -> tuple: + """Compute loss of a single image. Similar to + func:``SSDHead.loss_by_feat_single`` + + Args: + cls_score (Tensor): Box scores for eachimage + Has shape (num_total_anchors, num_classes). + bbox_pred (Tensor): Box energies / deltas for each image + level with shape (num_total_anchors, 4). + anchors (Tensor): Box reference for each scale level with shape + (num_total_anchors, 4). 
+ labels (Tensor): Labels of each anchors with shape + (num_total_anchors,). + label_weights (Tensor): Label weights of each anchor with shape + (num_total_anchors,) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (num_total_anchors, 4). + avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + Tuple[Tensor, Tensor]: A tuple of cls loss and bbox loss of one + feature map. + """ + + loss_cls_all = self.loss_cls(cls_score, labels, label_weights) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero( + as_tuple=False).reshape(-1) + neg_inds = (labels == self.num_classes).nonzero( + as_tuple=False).view(-1) + + num_pos_samples = pos_inds.size(0) + if num_pos_samples == 0: + num_neg_samples = neg_inds.size(0) + else: + num_neg_samples = self.train_cfg['neg_pos_ratio'] * \ + num_pos_samples + if num_neg_samples > neg_inds.size(0): + num_neg_samples = neg_inds.size(0) + topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) + loss_cls_pos = loss_cls_all[pos_inds].sum() + loss_cls_neg = topk_loss_cls_neg.sum() + loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + loss_bbox = self.loss_bbox( + bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) + return loss_cls[None], loss_bbox + + def get_positive_infos(self) -> InstanceList: + """Get positive information from sampling results. + + Returns: + list[:obj:`InstanceData`]: Positive Information of each image, + usually including positive bboxes, positive labels, positive + priors, positive coeffs, etc. + """ + assert len(self._raw_positive_infos) > 0 + sampling_results = self._raw_positive_infos['sampling_results'] + num_imgs = len(sampling_results) + + coeff_pred_list = [] + for coeff_pred_per_level in self._raw_positive_infos['coeff_preds']: + coeff_pred_per_level = \ + coeff_pred_per_level.permute( + 0, 2, 3, 1).reshape(num_imgs, -1, self.num_protos) + coeff_pred_list.append(coeff_pred_per_level) + coeff_preds = torch.cat(coeff_pred_list, dim=1) + + pos_info_list = [] + for idx, sampling_result in enumerate(sampling_results): + pos_info = InstanceData() + coeff_preds_single = coeff_preds[idx] + pos_info.pos_assigned_gt_inds = \ + sampling_result.pos_assigned_gt_inds + pos_info.pos_inds = sampling_result.pos_inds + pos_info.coeffs = coeff_preds_single[sampling_result.pos_inds] + pos_info.bboxes = sampling_result.pos_gt_bboxes + pos_info_list.append(pos_info) + return pos_info_list + + def predict_by_feat(self, + cls_scores, + bbox_preds, + coeff_preds, + batch_img_metas, + cfg=None, + rescale=True, + **kwargs): + """Similar to func:``AnchorHead.get_bboxes``, but additionally + processes coeff_preds. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + with shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + coeff_preds (list[Tensor]): Mask coefficients for each scale + level with shape (N, num_anchors * num_protos, H, W) + batch_img_metas (list[dict]): Batch image meta info. + cfg (:obj:`Config` | None): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - coeffs (Tensor): the predicted mask coefficients of + instance inside the corresponding box has a shape + (n, num_protos). + """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, device=device) + + result_list = [] + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + cls_score_list = select_single_mlvl(cls_scores, img_id) + bbox_pred_list = select_single_mlvl(bbox_preds, img_id) + coeff_pred_list = select_single_mlvl(coeff_preds, img_id) + results = self._predict_by_feat_single( + cls_score_list=cls_score_list, + bbox_pred_list=bbox_pred_list, + coeff_preds_list=coeff_pred_list, + mlvl_priors=mlvl_priors, + img_meta=img_meta, + cfg=cfg, + rescale=rescale) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + coeff_preds_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigType, + rescale: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. Similar to func:``AnchorHead._predict_by_feat_single``, + but additionally processes coeff_preds_list and uses fast NMS instead + of traditional NMS. + + Args: + cls_score_list (list[Tensor]): Box scores for a single scale level + Has shape (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas for a single + scale level with shape (num_priors * 4, H, W). + coeff_preds_list (list[Tensor]): Mask coefficients for a single + scale level with shape (num_priors * num_protos, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, + has shape (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ - coeffs (Tensor): the predicted mask coefficients of + instance inside the corresponding box has a shape + (n, num_protos). + """ + assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_priors) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_coeffs = [] + for cls_score, bbox_pred, coeff_pred, priors in \ + zip(cls_score_list, bbox_pred_list, + coeff_preds_list, mlvl_priors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + coeff_pred = coeff_pred.permute(1, 2, + 0).reshape(-1, self.num_protos) + + if 0 < nms_pre < scores.shape[0]: + # Get maximum scores for foreground classes. + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + priors = priors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + coeff_pred = coeff_pred[topk_inds, :] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + mlvl_coeffs.append(coeff_pred) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = torch.cat(mlvl_valid_priors) + multi_bboxes = self.bbox_coder.decode( + priors, bbox_pred, max_shape=img_shape) + + multi_scores = torch.cat(mlvl_scores) + multi_coeffs = torch.cat(mlvl_coeffs) + + return self._bbox_post_process( + multi_bboxes=multi_bboxes, + multi_scores=multi_scores, + multi_coeffs=multi_coeffs, + cfg=cfg, + rescale=rescale, + img_meta=img_meta) + + def _bbox_post_process(self, + multi_bboxes: Tensor, + multi_scores: Tensor, + multi_coeffs: Tensor, + cfg: ConfigType, + rescale: bool = False, + img_meta: Optional[dict] = None, + **kwargs) -> InstanceData: + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + multi_bboxes (Tensor): Predicted bbox that concat all levels. + multi_scores (Tensor): Bbox scores that concat all levels. + multi_coeffs (Tensor): Mask coefficients that concat all levels. + cfg (ConfigDict): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default to False. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - coeffs (Tensor): the predicted mask coefficients of + instance inside the corresponding box has a shape + (n, num_protos). 
+ """ + if rescale: + assert img_meta.get('scale_factor') is not None + multi_bboxes /= multi_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + # mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + + padding = multi_scores.new_zeros(multi_scores.shape[0], 1) + multi_scores = torch.cat([multi_scores, padding], dim=1) + det_bboxes, det_labels, det_coeffs = fast_nms( + multi_bboxes, multi_scores, multi_coeffs, cfg.score_thr, + cfg.iou_thr, cfg.top_k, cfg.max_per_img) + results = InstanceData() + results.bboxes = det_bboxes[:, :4] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + results.coeffs = det_coeffs + return results + + +@MODELS.register_module() +class YOLACTProtonet(BaseMaskHead): + """YOLACT mask head used in https://arxiv.org/abs/1904.02689. + + This head outputs the mask prototypes for YOLACT. + + Args: + in_channels (int): Number of channels in the input feature map. + proto_channels (tuple[int]): Output channels of protonet convs. + proto_kernel_sizes (tuple[int]): Kernel sizes of protonet convs. + include_last_relu (bool): If keep the last relu of protonet. + num_protos (int): Number of prototypes. + num_classes (int): Number of categories excluding the background + category. + loss_mask_weight (float): Reweight the mask loss by this factor. + max_masks_to_train (int): Maximum number of masks to train for + each image. + with_seg_branch (bool): Whether to apply a semantic segmentation + branch and calculate loss during training to increase + performance with no speed penalty. Defaults to True. + loss_segm (:obj:`ConfigDict` or dict, optional): Config of + semantic segmentation loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config + of head. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + head. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. 
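+
+    Example:
+        An illustrative mask head config that restates the defaults above
+        (values are not prescriptive)::
+
+            mask_head = dict(
+                type='YOLACTProtonet',
+                num_classes=80,
+                in_channels=256,
+                num_protos=32,
+                proto_channels=(256, 256, 256, None, 256, 32),
+                proto_kernel_sizes=(3, 3, 3, -2, 3, 1),
+                with_seg_branch=True)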
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int = 256, + proto_channels: tuple = (256, 256, 256, None, 256, 32), + proto_kernel_sizes: tuple = (3, 3, 3, -2, 3, 1), + include_last_relu: bool = True, + num_protos: int = 32, + loss_mask_weight: float = 1.0, + max_masks_to_train: int = 100, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + with_seg_branch: bool = True, + loss_segm: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + init_cfg=dict( + type='Xavier', + distribution='uniform', + override=dict(name='protonet')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.proto_channels = proto_channels + self.proto_kernel_sizes = proto_kernel_sizes + self.include_last_relu = include_last_relu + + # Segmentation branch + self.with_seg_branch = with_seg_branch + self.segm_branch = SegmentationModule( + num_classes=num_classes, in_channels=in_channels) \ + if with_seg_branch else None + self.loss_segm = MODELS.build(loss_segm) if with_seg_branch else None + + self.loss_mask_weight = loss_mask_weight + self.num_protos = num_protos + self.num_classes = num_classes + self.max_masks_to_train = max_masks_to_train + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + # Possible patterns: + # ( 256, 3) -> conv + # ( 256,-2) -> deconv + # (None,-2) -> bilinear interpolate + in_channels = self.in_channels + protonets = ModuleList() + for num_channels, kernel_size in zip(self.proto_channels, + self.proto_kernel_sizes): + if kernel_size > 0: + layer = nn.Conv2d( + in_channels, + num_channels, + kernel_size, + padding=kernel_size // 2) + else: + if num_channels is None: + layer = InterpolateModule( + scale_factor=-kernel_size, + mode='bilinear', + align_corners=False) + else: + layer = nn.ConvTranspose2d( + in_channels, + num_channels, + -kernel_size, + padding=kernel_size // 2) + protonets.append(layer) + protonets.append(nn.ReLU(inplace=True)) + in_channels = num_channels if num_channels is not None \ + else in_channels + if not self.include_last_relu: + protonets = protonets[:-1] + self.protonet = nn.Sequential(*protonets) + + def forward(self, x: tuple, positive_infos: InstanceList) -> tuple: + """Forward feature from the upstream network to get prototypes and + linearly combine the prototypes, using masks coefficients, into + instance masks. Finally, crop the instance masks with given bboxes. + + Args: + x (Tuple[Tensor]): Feature from the upstream network, which is + a 4D-tensor. + positive_infos (List[:obj:``InstanceData``]): Positive information + that calculate from detect head. + + Returns: + tuple: Predicted instance segmentation masks and + semantic segmentation map. + """ + # YOLACT used single feature map to get segmentation masks + single_x = x[0] + + # YOLACT segmentation branch, if not training or segmentation branch + # is None, will not process the forward function. 
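+        # Note: the semantic segmentation logits are only consumed as an
+        # auxiliary training loss in ``loss_by_feat``; at test time
+        # ``segm_preds`` stays ``None``.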
+ if self.segm_branch is not None and self.training: + segm_preds = self.segm_branch(single_x) + else: + segm_preds = None + # YOLACT mask head + prototypes = self.protonet(single_x) + prototypes = prototypes.permute(0, 2, 3, 1).contiguous() + + num_imgs = single_x.size(0) + + mask_pred_list = [] + for idx in range(num_imgs): + cur_prototypes = prototypes[idx] + pos_coeffs = positive_infos[idx].coeffs + + # Linearly combine the prototypes with the mask coefficients + mask_preds = cur_prototypes @ pos_coeffs.t() + mask_preds = torch.sigmoid(mask_preds) + mask_pred_list.append(mask_preds) + return mask_pred_list, segm_preds + + def loss_by_feat(self, mask_preds: List[Tensor], segm_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], positive_infos: InstanceList, + **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (list[Tensor]): List of predicted prototypes, each has + shape (num_classes, H, W). + segm_preds (Tensor): Predicted semantic segmentation map with + shape (N, num_classes, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + positive_infos (List[:obj:``InstanceData``]): Information of + positive samples of each image that are assigned in detection + head. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert positive_infos is not None, \ + 'positive_infos should not be None in `YOLACTProtonet`' + losses = dict() + + # crop + croped_mask_pred = self.crop_mask_preds(mask_preds, batch_img_metas, + positive_infos) + + loss_mask = [] + loss_segm = [] + num_imgs, _, mask_h, mask_w = segm_preds.size() + assert num_imgs == len(croped_mask_pred) + segm_avg_factor = num_imgs * mask_h * mask_w + total_pos = 0 + + if self.segm_branch is not None: + assert segm_preds is not None + + for idx in range(num_imgs): + img_meta = batch_img_metas[idx] + + (mask_preds, pos_mask_targets, segm_targets, num_pos, + gt_bboxes_for_reweight) = self._get_targets_single( + croped_mask_pred[idx], segm_preds[idx], + batch_gt_instances[idx], positive_infos[idx]) + + # segmentation loss + if self.with_seg_branch: + if segm_targets is None: + loss = segm_preds[idx].sum() * 0. + else: + loss = self.loss_segm( + segm_preds[idx], + segm_targets, + avg_factor=segm_avg_factor) + loss_segm.append(loss) + # mask loss + total_pos += num_pos + if num_pos == 0 or pos_mask_targets is None: + loss = mask_preds.sum() * 0. + else: + mask_preds = torch.clamp(mask_preds, 0, 1) + loss = F.binary_cross_entropy( + mask_preds, pos_mask_targets, + reduction='none') * self.loss_mask_weight + + h, w = img_meta['img_shape'][:2] + gt_bboxes_width = (gt_bboxes_for_reweight[:, 2] - + gt_bboxes_for_reweight[:, 0]) / w + gt_bboxes_height = (gt_bboxes_for_reweight[:, 3] - + gt_bboxes_for_reweight[:, 1]) / h + loss = loss.mean(dim=(1, + 2)) / gt_bboxes_width / gt_bboxes_height + loss = torch.sum(loss) + loss_mask.append(loss) + + if total_pos == 0: + total_pos += 1 # avoid nan + loss_mask = [x / total_pos for x in loss_mask] + + losses.update(loss_mask=loss_mask) + if self.with_seg_branch: + losses.update(loss_segm=loss_segm) + + return losses + + def _get_targets_single(self, mask_preds: Tensor, segm_pred: Tensor, + gt_instances: InstanceData, + positive_info: InstanceData): + """Compute targets for predictions of single image. 
+ + Args: + mask_preds (Tensor): Predicted prototypes with shape + (num_classes, H, W). + segm_pred (Tensor): Predicted semantic segmentation map + with shape (num_classes, H, W). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + positive_info (:obj:`InstanceData`): Information of positive + samples that are assigned in detection head. It usually + contains following keys. + + - pos_assigned_gt_inds (Tensor): Assigner GT indexes of + positive proposals, has shape (num_pos, ) + - pos_inds (Tensor): Positive index of image, has + shape (num_pos, ). + - coeffs (Tensor): Positive mask coefficients + with shape (num_pos, num_protos). + - bboxes (Tensor): Positive bboxes with shape + (num_pos, 4) + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - mask_preds (Tensor): Positive predicted mask with shape + (num_pos, mask_h, mask_w). + - pos_mask_targets (Tensor): Positive mask targets with shape + (num_pos, mask_h, mask_w). + - segm_targets (Tensor): Semantic segmentation targets with shape + (num_classes, segm_h, segm_w). + - num_pos (int): Positive numbers. + - gt_bboxes_for_reweight (Tensor): GT bboxes that match to the + positive priors has shape (num_pos, 4). + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + device = gt_bboxes.device + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device).float() + if gt_masks.size(0) == 0: + return mask_preds, None, None, 0, None + + # process with semantic segmentation targets + if segm_pred is not None: + num_classes, segm_h, segm_w = segm_pred.size() + with torch.no_grad(): + downsampled_masks = F.interpolate( + gt_masks.unsqueeze(0), (segm_h, segm_w), + mode='bilinear', + align_corners=False).squeeze(0) + downsampled_masks = downsampled_masks.gt(0.5).float() + segm_targets = torch.zeros_like(segm_pred, requires_grad=False) + for obj_idx in range(downsampled_masks.size(0)): + segm_targets[gt_labels[obj_idx] - 1] = torch.max( + segm_targets[gt_labels[obj_idx] - 1], + downsampled_masks[obj_idx]) + else: + segm_targets = None + # process with mask targets + pos_assigned_gt_inds = positive_info.pos_assigned_gt_inds + num_pos = pos_assigned_gt_inds.size(0) + # Since we're producing (near) full image masks, + # it'd take too much vram to backprop on every single mask. + # Thus we select only a subset. + if num_pos > self.max_masks_to_train: + perm = torch.randperm(num_pos) + select = perm[:self.max_masks_to_train] + mask_preds = mask_preds[select] + pos_assigned_gt_inds = pos_assigned_gt_inds[select] + num_pos = self.max_masks_to_train + + gt_bboxes_for_reweight = gt_bboxes[pos_assigned_gt_inds] + + mask_h, mask_w = mask_preds.shape[-2:] + gt_masks = F.interpolate( + gt_masks.unsqueeze(0), (mask_h, mask_w), + mode='bilinear', + align_corners=False).squeeze(0) + gt_masks = gt_masks.gt(0.5).float() + pos_mask_targets = gt_masks[pos_assigned_gt_inds] + + return (mask_preds, pos_mask_targets, segm_targets, num_pos, + gt_bboxes_for_reweight) + + def crop_mask_preds(self, mask_preds: List[Tensor], + batch_img_metas: List[dict], + positive_infos: InstanceList) -> list: + """Crop predicted masks by zeroing out everything not in the predicted + bbox. + + Args: + mask_preds (list[Tensor]): Predicted prototypes with shape + (num_classes, H, W). + batch_img_metas (list[dict]): Meta information of multiple images. 
+ positive_infos (List[:obj:``InstanceData``]): Positive + information that calculate from detect head. + + Returns: + list: The cropped masks. + """ + croped_mask_preds = [] + for img_meta, mask_preds, cur_info in zip(batch_img_metas, mask_preds, + positive_infos): + bboxes_for_cropping = copy.deepcopy(cur_info.bboxes) + h, w = img_meta['img_shape'][:2] + bboxes_for_cropping[:, 0::2] /= w + bboxes_for_cropping[:, 1::2] /= h + mask_preds = self.crop_single(mask_preds, bboxes_for_cropping) + mask_preds = mask_preds.permute(2, 0, 1).contiguous() + croped_mask_preds.append(mask_preds) + return croped_mask_preds + + def crop_single(self, + masks: Tensor, + boxes: Tensor, + padding: int = 1) -> Tensor: + """Crop single predicted masks by zeroing out everything not in the + predicted bbox. + + Args: + masks (Tensor): Predicted prototypes, has shape [H, W, N]. + boxes (Tensor): Bbox coords in relative point form with + shape [N, 4]. + padding (int): Image padding size. + + Return: + Tensor: The cropped masks. + """ + h, w, n = masks.size() + x1, x2 = self.sanitize_coordinates( + boxes[:, 0], boxes[:, 2], w, padding, cast=False) + y1, y2 = self.sanitize_coordinates( + boxes[:, 1], boxes[:, 3], h, padding, cast=False) + + rows = torch.arange( + w, device=masks.device, dtype=x1.dtype).view(1, -1, + 1).expand(h, w, n) + cols = torch.arange( + h, device=masks.device, dtype=x1.dtype).view(-1, 1, + 1).expand(h, w, n) + + masks_left = rows >= x1.view(1, 1, -1) + masks_right = rows < x2.view(1, 1, -1) + masks_up = cols >= y1.view(1, 1, -1) + masks_down = cols < y2.view(1, 1, -1) + + crop_mask = masks_left * masks_right * masks_up * masks_down + + return masks * crop_mask.float() + + def sanitize_coordinates(self, + x1: Tensor, + x2: Tensor, + img_size: int, + padding: int = 0, + cast: bool = True) -> tuple: + """Sanitizes the input coordinates so that x1 < x2, x1 != x2, x1 >= 0, + and x2 <= image_size. Also converts from relative to absolute + coordinates and casts the results to long tensors. + + Warning: this does things in-place behind the scenes so + copy if necessary. + + Args: + x1 (Tensor): shape (N, ). + x2 (Tensor): shape (N, ). + img_size (int): Size of the input image. + padding (int): x1 >= padding, x2 <= image_size-padding. + cast (bool): If cast is false, the result won't be cast to longs. + + Returns: + tuple: + + - x1 (Tensor): Sanitized _x1. + - x2 (Tensor): Sanitized _x2. + """ + x1 = x1 * img_size + x2 = x2 * img_size + if cast: + x1 = x1.long() + x2 = x2.long() + x1 = torch.min(x1, x2) + x2 = torch.max(x1, x2) + x1 = torch.clamp(x1 - padding, min=0) + x2 = torch.clamp(x2 + padding, max=img_size) + return x1, x2 + + def predict_by_feat(self, + mask_preds: List[Tensor], + segm_preds: Tensor, + results_list: InstanceList, + batch_img_metas: List[dict], + rescale: bool = True, + **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mask_preds (list[Tensor]): Predicted prototypes with shape + (num_classes, H, W). + results_list (List[:obj:``InstanceData``]): BBoxHead results. + batch_img_metas (list[dict]): Meta information of all images. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images.Each :obj:`InstanceData` usually contains + following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). 
+ - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + assert len(mask_preds) == len(results_list) == len(batch_img_metas) + + croped_mask_pred = self.crop_mask_preds(mask_preds, batch_img_metas, + results_list) + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + results = results_list[img_id] + bboxes = results.bboxes + mask_preds = croped_mask_pred[img_id] + if bboxes.shape[0] == 0 or mask_preds.shape[0] == 0: + results_list[img_id] = empty_instances( + [img_meta], + bboxes.device, + task_type='mask', + instance_results=[results])[0] + else: + im_mask = self._predict_by_feat_single( + mask_preds=croped_mask_pred[img_id], + bboxes=bboxes, + img_meta=img_meta, + rescale=rescale) + results.masks = im_mask + return results_list + + def _predict_by_feat_single(self, + mask_preds: Tensor, + bboxes: Tensor, + img_meta: dict, + rescale: bool, + cfg: OptConfigType = None): + """Transform a single image's features extracted from the head into + mask results. + + Args: + mask_preds (Tensor): Predicted prototypes, has shape [H, W, N]. + bboxes (Tensor): Bbox coords in relative point form with + shape [N, 4]. + img_meta (dict): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If rescale is False, then returned masks will + fit the scale of imgs[0]. + cfg (dict, optional): Config used in test phase. + Defaults to None. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + cfg = self.test_cfg if cfg is None else cfg + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + img_h, img_w = img_meta['ori_shape'][:2] + if rescale: # in-placed rescale the bboxes + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + bboxes /= scale_factor + else: + w_scale, h_scale = scale_factor[0, 0], scale_factor[0, 1] + img_h = np.round(img_h * h_scale.item()).astype(np.int32) + img_w = np.round(img_w * w_scale.item()).astype(np.int32) + + masks = F.interpolate( + mask_preds.unsqueeze(0), (img_h, img_w), + mode='bilinear', + align_corners=False).squeeze(0) > cfg.mask_thr + + if cfg.mask_thr_binary < 0: + # for visualization and debugging + masks = (masks * 255).to(dtype=torch.uint8) + + return masks + + +class SegmentationModule(BaseModule): + """YOLACT segmentation branch used in `_ + + In mmdet v2.x `segm_loss` is calculated in YOLACTSegmHead, while in + mmdet v3.x `SegmentationModule` is used to obtain the predicted semantic + segmentation map and `segm_loss` is calculated in YOLACTProtonet. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + init_cfg (dict or list[dict], optional): Initialization config dict. 
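+
+    Example:
+        Shape sketch with illustrative sizes::
+
+            >>> import torch
+            >>> segm = SegmentationModule(num_classes=80, in_channels=256)
+            >>> segm(torch.rand(2, 256, 64, 64)).shape
+            torch.Size([2, 80, 64, 64])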
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int = 256, + init_cfg: ConfigType = dict( + type='Xavier', + distribution='uniform', + override=dict(name='segm_conv')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.segm_conv = nn.Conv2d( + self.in_channels, self.num_classes, kernel_size=1) + + def forward(self, x: Tensor) -> Tensor: + """Forward feature from the upstream network. + + Args: + x (Tensor): Feature from the upstream network, which is + a 4D-tensor. + + Returns: + Tensor: Predicted semantic segmentation map with shape + (N, num_classes, H, W). + """ + return self.segm_conv(x) + + +class InterpolateModule(BaseModule): + """This is a module version of F.interpolate. + + Any arguments you give it just get passed along for the ride. + """ + + def __init__(self, *args, init_cfg=None, **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.args = args + self.kwargs = kwargs + + def forward(self, x: Tensor) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tensor): Feature from the upstream network, which is + a 4D-tensor. + + Returns: + Tensor: A 4D-tensor feature map. + """ + return F.interpolate(x, *self.args, **self.kwargs) diff --git a/mmdetection/mmdet/models/dense_heads/yolo_head.py b/mmdetection/mmdet/models/dense_heads/yolo_head.py new file mode 100644 index 0000000..0f63afb --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/yolo_head.py @@ -0,0 +1,527 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) 2019 Western Digital Corporation or its affiliates. + +import copy +import warnings +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, is_norm +from mmengine.model import bias_init_with_prob, constant_init, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList) +from ..task_modules.samplers import PseudoSampler +from ..utils import filter_scores_and_topk, images_to_levels, multi_apply +from .base_dense_head import BaseDenseHead + + +@MODELS.register_module() +class YOLOV3Head(BaseDenseHead): + """YOLOV3Head Paper link: https://arxiv.org/abs/1804.02767. + + Args: + num_classes (int): The number of object classes (w/o background) + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (Sequence[int]): The number of output channels per scale + before the final 1x1 layer. Default: (1024, 512, 256). + anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor + generator. + bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder. + featmap_strides (Sequence[int]): The stride of each scale. + Should be in descending order. Defaults to (32, 16, 8). + one_hot_smoother (float): Set a non-zero value to enable label-smooth + Defaults to 0. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). 
+ loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_conf (:obj:`ConfigDict` or dict): Config of confidence loss. + loss_xy (:obj:`ConfigDict` or dict): Config of xy coordinate loss. + loss_wh (:obj:`ConfigDict` or dict): Config of wh coordinate loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + YOLOV3 head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + YOLOV3 head. Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Sequence[int], + out_channels: Sequence[int] = (1024, 512, 256), + anchor_generator: ConfigType = dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder: ConfigType = dict(type='YOLOBBoxCoder'), + featmap_strides: Sequence[int] = (32, 16, 8), + one_hot_smoother: float = 0., + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict( + type='LeakyReLU', negative_slope=0.1), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_conf: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_xy: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_wh: ConfigType = dict(type='MSELoss', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None) -> None: + super().__init__(init_cfg=None) + # Check params + assert (len(in_channels) == len(out_channels) == len(featmap_strides)) + + self.num_classes = num_classes + self.in_channels = in_channels + self.out_channels = out_channels + self.featmap_strides = featmap_strides + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], context=self) + else: + self.sampler = PseudoSampler() + + self.one_hot_smoother = one_hot_smoother + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + + self.prior_generator = TASK_UTILS.build(anchor_generator) + + self.loss_cls = MODELS.build(loss_cls) + self.loss_conf = MODELS.build(loss_conf) + self.loss_xy = MODELS.build(loss_xy) + self.loss_wh = MODELS.build(loss_wh) + + self.num_base_priors = self.prior_generator.num_base_priors[0] + assert len( + self.prior_generator.num_base_priors) == len(featmap_strides) + self._init_layers() + + @property + def num_levels(self) -> int: + """int: number of feature map levels""" + return len(self.featmap_strides) + + @property + def num_attrib(self) -> int: + """int: number of attributes in pred_map, bboxes (4) + + objectness (1) + num_classes""" + + return 5 + self.num_classes + + def _init_layers(self) -> None: + """initialize conv layers in YOLOv3 head.""" + self.convs_bridge = nn.ModuleList() + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_bridge = ConvModule( + self.in_channels[i], + self.out_channels[i], + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + conv_pred = nn.Conv2d(self.out_channels[i], + self.num_base_priors * self.num_attrib, 1) + + self.convs_bridge.append(conv_bridge) + self.convs_pred.append(conv_pred) + + def init_weights(self) 
-> None: + """initialize weights.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + + # Use prior in model initialization to improve stability + for conv_pred, stride in zip(self.convs_pred, self.featmap_strides): + bias = conv_pred.bias.reshape(self.num_base_priors, -1) + # init objectness with prior of 8 objects per feature map + # refer to https://github.com/ultralytics/yolov3 + nn.init.constant_(bias.data[:, 4], + bias_init_with_prob(8 / (608 / stride)**2)) + nn.init.constant_(bias.data[:, 5:], bias_init_with_prob(0.01)) + + def forward(self, x: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple[Tensor]: A tuple of multi-level predication map, each is a + 4D-tensor of shape (batch_size, 5+num_classes, height, width). + """ + + assert len(x) == self.num_levels + pred_maps = [] + for i in range(self.num_levels): + feat = x[i] + feat = self.convs_bridge[i](feat) + pred_map = self.convs_pred[i](feat) + pred_maps.append(pred_map) + + return tuple(pred_maps), + + def predict_by_feat(self, + pred_maps: Sequence[Tensor], + batch_img_metas: Optional[List[dict]], + cfg: OptConfigType = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. It has been accelerated since PR #5991. + + Args: + pred_maps (Sequence[Tensor]): Raw predictions for a batch of + images. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (:obj:`ConfigDict` or dict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + assert len(pred_maps) == self.num_levels + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + num_imgs = len(batch_img_metas) + featmap_sizes = [pred_map.shape[-2:] for pred_map in pred_maps] + + mlvl_anchors = self.prior_generator.grid_priors( + featmap_sizes, device=pred_maps[0].device) + flatten_preds = [] + flatten_strides = [] + for pred, stride in zip(pred_maps, self.featmap_strides): + pred = pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_attrib) + pred[..., :2].sigmoid_() + flatten_preds.append(pred) + flatten_strides.append( + pred.new_tensor(stride).expand(pred.size(1))) + + flatten_preds = torch.cat(flatten_preds, dim=1) + flatten_bbox_preds = flatten_preds[..., :4] + flatten_objectness = flatten_preds[..., 4].sigmoid() + flatten_cls_scores = flatten_preds[..., 5:].sigmoid() + flatten_anchors = torch.cat(mlvl_anchors) + flatten_strides = torch.cat(flatten_strides) + flatten_bboxes = self.bbox_coder.decode(flatten_anchors, + flatten_bbox_preds, + flatten_strides.unsqueeze(-1)) + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + # Filtering out all predictions with conf < conf_thr + conf_thr = cfg.get('conf_thr', -1) + if conf_thr > 0: + conf_inds = objectness >= conf_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + score_thr = cfg.get('score_thr', 0) + nms_pre = cfg.get('nms_pre', -1) + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + score_factors=objectness[keep_idxs], + ) + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + results_list.append(results) + return results_list + + def loss_by_feat( + self, + pred_maps: Sequence[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + pred_maps (list[Tensor]): Prediction map for each scale level, + shape (N, num_anchors * num_attrib, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. 
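+
+        Example:
+            The returned dict has one entry per loss term, each a list with
+            one tensor per feature level (an illustrative layout)::
+
+                dict(loss_cls=[...], loss_conf=[...], loss_xy=[...],
+                     loss_wh=[...])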
+ """ + num_imgs = len(batch_img_metas) + device = pred_maps[0][0].device + + featmap_sizes = [ + pred_maps[i].shape[-2:] for i in range(self.num_levels) + ] + mlvl_anchors = self.prior_generator.grid_priors( + featmap_sizes, device=device) + anchor_list = [mlvl_anchors for _ in range(num_imgs)] + + responsible_flag_list = [] + for img_id in range(num_imgs): + responsible_flag_list.append( + self.responsible_flags(featmap_sizes, + batch_gt_instances[img_id].bboxes, + device)) + + target_maps_list, neg_maps_list = self.get_targets( + anchor_list, responsible_flag_list, batch_gt_instances) + + losses_cls, losses_conf, losses_xy, losses_wh = multi_apply( + self.loss_by_feat_single, pred_maps, target_maps_list, + neg_maps_list) + + return dict( + loss_cls=losses_cls, + loss_conf=losses_conf, + loss_xy=losses_xy, + loss_wh=losses_wh) + + def loss_by_feat_single(self, pred_map: Tensor, target_map: Tensor, + neg_map: Tensor) -> tuple: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + pred_map (Tensor): Raw predictions for a single level. + target_map (Tensor): The Ground-Truth target for a single level. + neg_map (Tensor): The negative masks for a single level. + + Returns: + tuple: + loss_cls (Tensor): Classification loss. + loss_conf (Tensor): Confidence loss. + loss_xy (Tensor): Regression loss of x, y coordinate. + loss_wh (Tensor): Regression loss of w, h coordinate. + """ + + num_imgs = len(pred_map) + pred_map = pred_map.permute(0, 2, 3, + 1).reshape(num_imgs, -1, self.num_attrib) + neg_mask = neg_map.float() + pos_mask = target_map[..., 4] + pos_and_neg_mask = neg_mask + pos_mask + pos_mask = pos_mask.unsqueeze(dim=-1) + if torch.max(pos_and_neg_mask) > 1.: + warnings.warn('There is overlap between pos and neg sample.') + pos_and_neg_mask = pos_and_neg_mask.clamp(min=0., max=1.) + + pred_xy = pred_map[..., :2] + pred_wh = pred_map[..., 2:4] + pred_conf = pred_map[..., 4] + pred_label = pred_map[..., 5:] + + target_xy = target_map[..., :2] + target_wh = target_map[..., 2:4] + target_conf = target_map[..., 4] + target_label = target_map[..., 5:] + + loss_cls = self.loss_cls(pred_label, target_label, weight=pos_mask) + loss_conf = self.loss_conf( + pred_conf, target_conf, weight=pos_and_neg_mask) + loss_xy = self.loss_xy(pred_xy, target_xy, weight=pos_mask) + loss_wh = self.loss_wh(pred_wh, target_wh, weight=pos_mask) + + return loss_cls, loss_conf, loss_xy, loss_wh + + def get_targets(self, anchor_list: List[List[Tensor]], + responsible_flag_list: List[List[Tensor]], + batch_gt_instances: List[InstanceData]) -> tuple: + """Compute target maps for anchors in multiple images. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_total_anchors, 4). + responsible_flag_list (list[list[Tensor]]): Multi level responsible + flags of each image. Each element is a tensor of shape + (num_total_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: Usually returns a tuple containing learning targets. + - target_map_list (list[Tensor]): Target map of each level. + - neg_map_list (list[Tensor]): Negative map of each level. 
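+
+        Example:
+            With ``num_classes=80`` each row of a target map stores
+            ``4 (box) + 1 (objectness) + 80 (classes) = 85`` values,
+            matching ``self.num_attrib``.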
+ """ + num_imgs = len(anchor_list) + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + + results = multi_apply(self._get_targets_single, anchor_list, + responsible_flag_list, batch_gt_instances) + + all_target_maps, all_neg_maps = results + assert num_imgs == len(all_target_maps) == len(all_neg_maps) + target_maps_list = images_to_levels(all_target_maps, num_level_anchors) + neg_maps_list = images_to_levels(all_neg_maps, num_level_anchors) + + return target_maps_list, neg_maps_list + + def _get_targets_single(self, anchors: List[Tensor], + responsible_flags: List[Tensor], + gt_instances: InstanceData) -> tuple: + """Generate matching bounding box prior and converted GT. + + Args: + anchors (List[Tensor]): Multi-level anchors of the image. + responsible_flags (List[Tensor]): Multi-level responsible flags of + anchors + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: + target_map (Tensor): Predication target map of each + scale level, shape (num_total_anchors, + 5+num_classes) + neg_map (Tensor): Negative map of each scale level, + shape (num_total_anchors,) + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + anchor_strides = [] + for i in range(len(anchors)): + anchor_strides.append( + torch.tensor(self.featmap_strides[i], + device=gt_bboxes.device).repeat(len(anchors[i]))) + concat_anchors = torch.cat(anchors) + concat_responsible_flags = torch.cat(responsible_flags) + + anchor_strides = torch.cat(anchor_strides) + assert len(anchor_strides) == len(concat_anchors) == \ + len(concat_responsible_flags) + pred_instances = InstanceData( + priors=concat_anchors, responsible_flags=concat_responsible_flags) + + assign_result = self.assigner.assign(pred_instances, gt_instances) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + target_map = concat_anchors.new_zeros( + concat_anchors.size(0), self.num_attrib) + + target_map[sampling_result.pos_inds, :4] = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes, + anchor_strides[sampling_result.pos_inds]) + + target_map[sampling_result.pos_inds, 4] = 1 + + gt_labels_one_hot = F.one_hot( + gt_labels, num_classes=self.num_classes).float() + if self.one_hot_smoother != 0: # label smooth + gt_labels_one_hot = gt_labels_one_hot * ( + 1 - self.one_hot_smoother + ) + self.one_hot_smoother / self.num_classes + target_map[sampling_result.pos_inds, 5:] = gt_labels_one_hot[ + sampling_result.pos_assigned_gt_inds] + + neg_map = concat_anchors.new_zeros( + concat_anchors.size(0), dtype=torch.uint8) + neg_map[sampling_result.neg_inds] = 1 + + return target_map, neg_map + + def responsible_flags(self, featmap_sizes: List[tuple], gt_bboxes: Tensor, + device: str) -> List[Tensor]: + """Generate responsible anchor flags of grid cells in multiple scales. + + Args: + featmap_sizes (List[tuple]): List of feature map sizes in multiple + feature levels. + gt_bboxes (Tensor): Ground truth boxes, shape (n, 4). + device (str): Device where the anchors will be put on. 
+ + Return: + List[Tensor]: responsible flags of anchors in multiple level + """ + assert self.num_levels == len(featmap_sizes) + multi_level_responsible_flags = [] + for i in range(self.num_levels): + anchor_stride = self.prior_generator.strides[i] + feat_h, feat_w = featmap_sizes[i] + gt_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5).to(device) + gt_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5).to(device) + gt_grid_x = torch.floor(gt_cx / anchor_stride[0]).long() + gt_grid_y = torch.floor(gt_cy / anchor_stride[1]).long() + # row major indexing + gt_bboxes_grid_idx = gt_grid_y * feat_w + gt_grid_x + + responsible_grid = torch.zeros( + feat_h * feat_w, dtype=torch.uint8, device=device) + responsible_grid[gt_bboxes_grid_idx] = 1 + + responsible_grid = responsible_grid[:, None].expand( + responsible_grid.size(0), + self.prior_generator.num_base_priors[i]).contiguous().view(-1) + + multi_level_responsible_flags.append(responsible_grid) + return multi_level_responsible_flags diff --git a/mmdetection/mmdet/models/dense_heads/yolof_head.py b/mmdetection/mmdet/models/dense_heads/yolof_head.py new file mode 100644 index 0000000..b5e5e6b --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/yolof_head.py @@ -0,0 +1,399 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, is_norm +from mmengine.model import bias_init_with_prob, constant_init, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean +from ..task_modules.prior_generators import anchor_inside_flags +from ..utils import levels_to_images, multi_apply, unmap +from .anchor_head import AnchorHead + +INF = 1e8 + + +@MODELS.register_module() +class YOLOFHead(AnchorHead): + """Detection Head of `YOLOF `_ + + Args: + num_classes (int): The number of object classes (w/o background) + in_channels (list[int]): The number of input channels per scale. + cls_num_convs (int): The number of convolutions of cls branch. + Defaults to 2. + reg_num_convs (int): The number of convolutions of reg branch. + Defaults to 4. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN', requires_grad=True)``. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: List[int], + num_cls_convs: int = 2, + num_reg_convs: int = 4, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + **kwargs) -> None: + self.num_cls_convs = num_cls_convs + self.num_reg_convs = num_reg_convs + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, in_channels=in_channels, **kwargs) + + def _init_layers(self) -> None: + cls_subnet = [] + bbox_subnet = [] + for i in range(self.num_cls_convs): + cls_subnet.append( + ConvModule( + self.in_channels, + self.in_channels, + kernel_size=3, + padding=1, + norm_cfg=self.norm_cfg)) + for i in range(self.num_reg_convs): + bbox_subnet.append( + ConvModule( + self.in_channels, + self.in_channels, + kernel_size=3, + padding=1, + norm_cfg=self.norm_cfg)) + self.cls_subnet = nn.Sequential(*cls_subnet) + self.bbox_subnet = nn.Sequential(*bbox_subnet) + self.cls_score = nn.Conv2d( + self.in_channels, + self.num_base_priors * self.num_classes, + kernel_size=3, + stride=1, + padding=1) + self.bbox_pred = nn.Conv2d( + self.in_channels, + self.num_base_priors * 4, + kernel_size=3, + stride=1, + padding=1) + self.object_pred = nn.Conv2d( + self.in_channels, + self.num_base_priors, + kernel_size=3, + stride=1, + padding=1) + + def init_weights(self) -> None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + + # Use prior in model initialization to improve stability + bias_cls = bias_init_with_prob(0.01) + torch.nn.init.constant_(self.cls_score.bias, bias_cls) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + normalized_cls_score (Tensor): Normalized Cls scores for a \ + single scale level, the channels number is \ + num_base_priors * num_classes. + bbox_reg (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_base_priors * 4. + """ + cls_score = self.cls_score(self.cls_subnet(x)) + N, _, H, W = cls_score.shape + cls_score = cls_score.view(N, -1, self.num_classes, H, W) + + reg_feat = self.bbox_subnet(x) + bbox_reg = self.bbox_pred(reg_feat) + objectness = self.object_pred(reg_feat) + + # implicit objectness + objectness = objectness.view(N, -1, 1, H, W) + normalized_cls_score = cls_score + objectness - torch.log( + 1. + torch.clamp(cls_score.exp(), max=INF) + + torch.clamp(objectness.exp(), max=INF)) + normalized_cls_score = normalized_cls_score.view(N, -1, H, W) + return normalized_cls_score, bbox_reg + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. 
It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + assert len(cls_scores) == 1 + assert self.prior_generator.num_levels == 1 + + device = cls_scores[0].device + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + # The output level is always 1 + anchor_list = [anchors[0] for anchors in anchor_list] + valid_flag_list = [valid_flags[0] for valid_flags in valid_flag_list] + + cls_scores_list = levels_to_images(cls_scores) + bbox_preds_list = levels_to_images(bbox_preds) + + cls_reg_targets = self.get_targets( + cls_scores_list, + bbox_preds_list, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + if cls_reg_targets is None: + return None + (batch_labels, batch_label_weights, avg_factor, batch_bbox_weights, + batch_pos_predicted_boxes, batch_target_boxes) = cls_reg_targets + + flatten_labels = batch_labels.reshape(-1) + batch_label_weights = batch_label_weights.reshape(-1) + cls_score = cls_scores[0].permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + # classification loss + loss_cls = self.loss_cls( + cls_score, + flatten_labels, + batch_label_weights, + avg_factor=avg_factor) + + # regression loss + if batch_pos_predicted_boxes.shape[0] == 0: + # no pos sample + loss_bbox = batch_pos_predicted_boxes.sum() * 0 + else: + loss_bbox = self.loss_bbox( + batch_pos_predicted_boxes, + batch_target_boxes, + batch_bbox_weights.float(), + avg_factor=avg_factor) + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox) + + def get_targets(self, + cls_scores_list: List[Tensor], + bbox_preds_list: List[Tensor], + anchor_list: List[Tensor], + valid_flag_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True): + """Compute regression and classification targets for anchors in + multiple images. + + Args: + cls_scores_list (list[Tensor]): Classification scores of + each image. each is a 4D-tensor, the shape is + (h * w, num_anchors * num_classes). + bbox_preds_list (list[Tensor]): Bbox preds of each image. + each is a 4D-tensor, the shape is (h * w, num_anchors * 4). + anchor_list (list[Tensor]): Anchors of each image. Each element of + is a tensor of shape (h * w * num_anchors, 4). + valid_flag_list (list[Tensor]): Valid flags of each image. Each + element of is a tensor of shape (h * w * num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - batch_labels (Tensor): Label of all images. 
Each element \
+                is a tensor of shape (batch, h * w * num_anchors).
+            - batch_label_weights (Tensor): Label weights of all images, \
+                a tensor of shape (batch, h * w * num_anchors).
+            - num_total_pos (int): Number of positive samples in all \
+                images.
+            - num_total_neg (int): Number of negative samples in all \
+                images.
+            additional_returns: This function enables user-defined returns from
+                `self._get_targets_single`. These returns are currently refined
+                to properties at each feature map (i.e. having HxW dimension).
+                The results will be concatenated at the end.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        results = multi_apply(
+            self._get_targets_single,
+            bbox_preds_list,
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore,
+            unmap_outputs=unmap_outputs)
+        (all_labels, all_label_weights, pos_inds, neg_inds,
+         sampling_results_list) = results[:5]
+        # Get `avg_factor` of all images, which is calculated in
+        # `SamplingResult`. When using a sampling method, avg_factor is
+        # usually the sum of positive and negative priors. When using
+        # `PseudoSampler`, `avg_factor` is usually equal to the number of
+        # positive priors.
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        rest_results = list(results[5:])  # user-added return values
+
+        batch_labels = torch.stack(all_labels, 0)
+        batch_label_weights = torch.stack(all_label_weights, 0)
+
+        res = (batch_labels, batch_label_weights, avg_factor)
+        for i, rests in enumerate(rest_results):  # user-added return values
+            rest_results[i] = torch.cat(rests, 0)
+
+        return res + tuple(rest_results)
+
+    def _get_targets_single(self,
+                            bbox_preds: Tensor,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Args:
+            bbox_preds (Tensor): Bbox predictions of the image, with shape
+                (h * w, 4).
+            flat_anchors (Tensor): Anchors of the image, with shape
+                (h * w * num_anchors, 4).
+            valid_flags (Tensor): Valid flags of the image, with shape
+                (h * w * num_anchors,).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should include ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors.
+
+        Returns:
+            tuple:
+                labels (Tensor): Labels of the image, with shape
+                    (h * w * num_anchors, ).
+                label_weights (Tensor): Label weights of the image, with shape
+                    (h * w * num_anchors, ).
+                pos_inds (Tensor): Pos index of image.
+                neg_inds (Tensor): Neg index of image.
+                sampling_result (obj:`SamplingResult`): Sampling result.
+                pos_bbox_weights (Tensor): The weights used to calculate
+                    the bbox branch loss, with shape (num, ).
+                pos_predicted_boxes (Tensor): Predicted boxes used to
+                    calculate the bbox branch loss, with shape
+                    (num, 4).
+ pos_target_boxes (Tensor): boxes target value of + using to calculate the bbox branch loss, which shape is + (num, 4). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + bbox_preds = bbox_preds.reshape(-1, 4) + bbox_preds = bbox_preds[inside_flags, :] + + # decoded bbox + decoder_bbox_preds = self.bbox_coder.decode(anchors, bbox_preds) + pred_instances = InstanceData( + priors=anchors, decoder_priors=decoder_bbox_preds) + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + + pos_bbox_weights = assign_result.get_extra_property('pos_idx') + pos_predicted_boxes = assign_result.get_extra_property( + 'pos_predicted_boxes') + pos_target_boxes = assign_result.get_extra_property('target_boxes') + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + num_valid_anchors = anchors.shape[0] + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + + return (labels, label_weights, pos_inds, neg_inds, sampling_result, + pos_bbox_weights, pos_predicted_boxes, pos_target_boxes) diff --git a/mmdetection/mmdet/models/dense_heads/yolox_head.py b/mmdetection/mmdet/models/dense_heads/yolox_head.py new file mode 100644 index 0000000..00fe1e4 --- /dev/null +++ b/mmdetection/mmdet/models/dense_heads/yolox_head.py @@ -0,0 +1,618 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmcv.ops.nms import batched_nms +from mmengine.config import ConfigDict +from mmengine.model import bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from ..task_modules.prior_generators import MlvlPointGenerator +from ..task_modules.samplers import PseudoSampler +from ..utils import multi_apply +from .base_dense_head import BaseDenseHead + + +@MODELS.register_module() +class YOLOXHead(BaseDenseHead): + """YOLOXHead head used in `YOLOX `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. 
+        feat_channels (int): Number of hidden channels in stacking convs.
+            Defaults to 256.
+        stacked_convs (int): Number of stacking convs of the head.
+            Defaults to 2.
+        strides (Sequence[int]): Downsample factor of each feature map.
+            Defaults to (8, 16, 32).
+        use_depthwise (bool): Whether to use depthwise separable convolutions
+            in blocks. Defaults to False.
+        dcn_on_last_conv (bool): If true, use dcn in the last layer of
+            towers. Defaults to False.
+        conv_bias (bool or str): If specified as `auto`, it will be decided by
+            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
+            None, otherwise False. Defaults to "auto".
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001).
+        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
+            Defaults to dict(type='Swish').
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss.
+        loss_l1 (:obj:`ConfigDict` or dict): Config of L1 loss.
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
+            anchor head. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
+            anchor head. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(
+        self,
+        num_classes: int,
+        in_channels: int,
+        feat_channels: int = 256,
+        stacked_convs: int = 2,
+        strides: Sequence[int] = (8, 16, 32),
+        use_depthwise: bool = False,
+        dcn_on_last_conv: bool = False,
+        conv_bias: Union[bool, str] = 'auto',
+        conv_cfg: OptConfigType = None,
+        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
+        act_cfg: ConfigType = dict(type='Swish'),
+        loss_cls: ConfigType = dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            loss_weight=1.0),
+        loss_bbox: ConfigType = dict(
+            type='IoULoss',
+            mode='square',
+            eps=1e-16,
+            reduction='sum',
+            loss_weight=5.0),
+        loss_obj: ConfigType = dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            loss_weight=1.0),
+        loss_l1: ConfigType = dict(
+            type='L1Loss', reduction='sum', loss_weight=1.0),
+        train_cfg: OptConfigType = None,
+        test_cfg: OptConfigType = None,
+        init_cfg: OptMultiConfig = dict(
+            type='Kaiming',
+            layer='Conv2d',
+            a=math.sqrt(5),
+            distribution='uniform',
+            mode='fan_in',
+            nonlinearity='leaky_relu')
+    ) -> None:
+
+        super().__init__(init_cfg=init_cfg)
+        self.num_classes = num_classes
+        self.cls_out_channels = num_classes
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides
+        self.use_depthwise = use_depthwise
+        self.dcn_on_last_conv = dcn_on_last_conv
+        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
+        self.conv_bias = conv_bias
+        self.use_sigmoid_cls = True
+
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+
+        self.loss_cls: nn.Module = MODELS.build(loss_cls)
+        self.loss_bbox: nn.Module = MODELS.build(loss_bbox)
+        self.loss_obj: nn.Module = MODELS.build(loss_obj)
+
+        self.use_l1 = False  # This flag will be modified by hooks.
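+        # With the default configs above, the cls/bbox/obj/L1 losses all use
+        # reduction='sum'; `loss_by_feat` later divides each term by
+        # `num_total_samples` (the across-GPU mean of the number of positive
+        # priors, clamped to at least 1), so the effective reduction is a mean
+        # over positive samples rather than over all priors.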
+ self.loss_l1: nn.Module = MODELS.build(loss_l1) + + self.prior_generator = MlvlPointGenerator(strides, offset=0) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + # YOLOX does not support sampling + self.sampler = PseudoSampler() + + self._init_layers() + + def _init_layers(self) -> None: + """Initialize heads for all level feature maps.""" + self.multi_level_cls_convs = nn.ModuleList() + self.multi_level_reg_convs = nn.ModuleList() + self.multi_level_conv_cls = nn.ModuleList() + self.multi_level_conv_reg = nn.ModuleList() + self.multi_level_conv_obj = nn.ModuleList() + for _ in self.strides: + self.multi_level_cls_convs.append(self._build_stacked_convs()) + self.multi_level_reg_convs.append(self._build_stacked_convs()) + conv_cls, conv_reg, conv_obj = self._build_predictor() + self.multi_level_conv_cls.append(conv_cls) + self.multi_level_conv_reg.append(conv_reg) + self.multi_level_conv_obj.append(conv_obj) + + def _build_stacked_convs(self) -> nn.Sequential: + """Initialize conv layers of a single level head.""" + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + stacked_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + return nn.Sequential(*stacked_convs) + + def _build_predictor(self) -> Tuple[nn.Module, nn.Module, nn.Module]: + """Initialize predictor layers of a single level head.""" + conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) + conv_reg = nn.Conv2d(self.feat_channels, 4, 1) + conv_obj = nn.Conv2d(self.feat_channels, 1, 1) + return conv_cls, conv_reg, conv_obj + + def init_weights(self) -> None: + """Initialize weights of the head.""" + super(YOLOXHead, self).init_weights() + # Use prior in model initialization to improve stability + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(self.multi_level_conv_cls, + self.multi_level_conv_obj): + conv_cls.bias.data.fill_(bias_init) + conv_obj.bias.data.fill_(bias_init) + + def forward_single(self, x: Tensor, cls_convs: nn.Module, + reg_convs: nn.Module, conv_cls: nn.Module, + conv_reg: nn.Module, + conv_obj: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + cls_feat = cls_convs(x) + reg_feat = reg_convs(x) + + cls_score = conv_cls(cls_feat) + bbox_pred = conv_reg(reg_feat) + objectness = conv_obj(reg_feat) + + return cls_score, bbox_pred, objectness + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. 
+ """ + + return multi_apply(self.forward_single, x, self.multi_level_cls_convs, + self.multi_level_reg_convs, + self.multi_level_conv_cls, + self.multi_level_conv_reg, + self.multi_level_conv_obj) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]], + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) == len(objectnesses) + cfg = self.test_cfg if cfg is None else cfg + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + flatten_priors = torch.cat(mlvl_priors) + + flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds) + + result_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + max_scores, labels = torch.max(flatten_cls_scores[img_id], 1) + valid_mask = flatten_objectness[ + img_id] * max_scores >= cfg.score_thr + results = InstanceData( + bboxes=flatten_bboxes[img_id][valid_mask], + scores=max_scores[valid_mask] * + flatten_objectness[img_id][valid_mask], + labels=labels[valid_mask]) + + result_list.append( + self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta)) + + return result_list + + def _bbox_decode(self, priors: Tensor, bbox_preds: Tensor) -> Tensor: + """Decode regression results 
(delta_x, delta_y, w, h) to bboxes (tl_x,
+        tl_y, br_x, br_y).
+
+        Args:
+            priors (Tensor): Center priors of an image, has shape
+                (num_instances, 2).
+            bbox_preds (Tensor): Box energies / deltas for all instances,
+                has shape (batch_size, num_instances, 4).
+
+        Returns:
+            Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has
+            shape (batch_size, num_instances, 4).
+        """
+        xys = (bbox_preds[..., :2] * priors[:, 2:]) + priors[:, :2]
+        whs = bbox_preds[..., 2:].exp() * priors[:, 2:]
+
+        tl_x = (xys[..., 0] - whs[..., 0] / 2)
+        tl_y = (xys[..., 1] - whs[..., 1] / 2)
+        br_x = (xys[..., 0] + whs[..., 0] / 2)
+        br_y = (xys[..., 1] + whs[..., 1] / 2)
+
+        decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1)
+        return decoded_bboxes
+
+    def _bbox_post_process(self,
+                           results: InstanceData,
+                           cfg: ConfigDict,
+                           rescale: bool = False,
+                           with_nms: bool = True,
+                           img_meta: Optional[dict] = None) -> InstanceData:
+        """bbox post-processing method.
+
+        The boxes are rescaled to the original image scale and NMS is
+        applied. Usually `with_nms` is set to False for aug test.
+
+        Args:
+            results (:obj:`InstanceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            cfg (mmengine.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains the following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
+        """
+
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            results.bboxes /= results.bboxes.new_tensor(
+                img_meta['scale_factor']).repeat((1, 2))
+
+        if with_nms and results.bboxes.numel() > 0:
+            det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores,
+                                                results.labels, cfg.nms)
+            results = results[keep_idxs]
+            # some nms variants reweight the score, such as softnms
+            results.scores = det_bboxes[:, -1]
+        return results
+
+    def loss_by_feat(
+            self,
+            cls_scores: Sequence[Tensor],
+            bbox_preds: Sequence[Tensor],
+            objectnesses: Sequence[Tensor],
+            batch_gt_instances: Sequence[InstanceData],
+            batch_img_metas: Sequence[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (Sequence[Tensor]): Box scores for each scale level,
+                each is a 4D-tensor, the channel number is
+                num_priors * num_classes.
+            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
+                level, each is a 4D-tensor, the channel number is
+                num_priors * 4.
+            objectnesses (Sequence[Tensor]): Score factors for
+                all scale levels, each is a 4D-tensor, has shape
+                (batch_size, 1, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore.
It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_pred in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_objectness = torch.cat(flatten_objectness, dim=1) + flatten_priors = torch.cat(mlvl_priors) + flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds) + + (pos_masks, cls_targets, obj_targets, bbox_targets, l1_targets, + num_fg_imgs) = multi_apply( + self._get_targets_single, + flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1), + flatten_cls_preds.detach(), flatten_bboxes.detach(), + flatten_objectness.detach(), batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # The experimental results show that 'reduce_mean' can improve + # performance on the COCO dataset. + num_pos = torch.tensor( + sum(num_fg_imgs), + dtype=torch.float, + device=flatten_cls_preds.device) + num_total_samples = max(reduce_mean(num_pos), 1.0) + + pos_masks = torch.cat(pos_masks, 0) + cls_targets = torch.cat(cls_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + bbox_targets = torch.cat(bbox_targets, 0) + if self.use_l1: + l1_targets = torch.cat(l1_targets, 0) + + loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), + obj_targets) / num_total_samples + if num_pos > 0: + loss_cls = self.loss_cls( + flatten_cls_preds.view(-1, self.num_classes)[pos_masks], + cls_targets) / num_total_samples + loss_bbox = self.loss_bbox( + flatten_bboxes.view(-1, 4)[pos_masks], + bbox_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_cls = flatten_cls_preds.sum() * 0 + loss_bbox = flatten_bboxes.sum() * 0 + + loss_dict = dict( + loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj) + + if self.use_l1: + if num_pos > 0: + loss_l1 = self.loss_l1( + flatten_bbox_preds.view(-1, 4)[pos_masks], + l1_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_l1 = flatten_bbox_preds.sum() * 0 + loss_dict.update(loss_l1=loss_l1) + + return loss_dict + + @torch.no_grad() + def _get_targets_single( + self, + priors: Tensor, + cls_preds: Tensor, + decoded_bboxes: Tensor, + objectness: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None) -> tuple: + """Compute classification, regression, and objectness targets for + priors in a single image. 
+
+        Args:
+            priors (Tensor): All priors of one image, a 2D-Tensor with shape
+                [num_priors, 4] in [cx, cy, stride_w, stride_h] format.
+            cls_preds (Tensor): Classification predictions of one image,
+                a 2D-Tensor with shape [num_priors, num_classes].
+            decoded_bboxes (Tensor): Decoded bbox predictions of one image,
+                a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y,
+                br_x, br_y] format.
+            objectness (Tensor): Objectness predictions of one image,
+                a 1D-Tensor with shape [num_priors].
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should include ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+        Returns:
+            tuple:
+                foreground_mask (Tensor): Binary mask of foreground
+                targets.
+                cls_target (Tensor): Classification targets of an image.
+                obj_target (Tensor): Objectness targets of an image.
+                bbox_target (Tensor): BBox targets of an image.
+                l1_target (Tensor): BBox L1 targets of an image.
+                num_pos_per_img (int): Number of positive samples in an image.
+        """
+
+        num_priors = priors.size(0)
+        num_gts = len(gt_instances)
+        # No target
+        if num_gts == 0:
+            cls_target = cls_preds.new_zeros((0, self.num_classes))
+            bbox_target = cls_preds.new_zeros((0, 4))
+            l1_target = cls_preds.new_zeros((0, 4))
+            obj_target = cls_preds.new_zeros((num_priors, 1))
+            foreground_mask = cls_preds.new_zeros(num_priors).bool()
+            return (foreground_mask, cls_target, obj_target, bbox_target,
+                    l1_target, 0)
+
+        # YOLOX uses center priors with a 0.5 offset to assign targets,
+        # but uses center priors without the offset to regress bboxes.
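+        # `priors` come from MlvlPointGenerator(strides, offset=0) with
+        # with_stride=True, so each row is (x_tl, y_tl, stride_w, stride_h)
+        # anchored at the top-left corner of its grid cell. Adding half a
+        # stride below shifts each point to its cell centre before it is
+        # handed to the assigner, while the original un-shifted priors are
+        # kept for building the L1 regression targets in `_get_l1_target`.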
+ offset_priors = torch.cat( + [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1) + + scores = cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid() + pred_instances = InstanceData( + bboxes=decoded_bboxes, scores=scores.sqrt_(), priors=offset_priors) + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + pos_inds = sampling_result.pos_inds + num_pos_per_img = pos_inds.size(0) + + pos_ious = assign_result.max_overlaps[pos_inds] + # IOU aware classification score + cls_target = F.one_hot(sampling_result.pos_gt_labels, + self.num_classes) * pos_ious.unsqueeze(-1) + obj_target = torch.zeros_like(objectness).unsqueeze(-1) + obj_target[pos_inds] = 1 + bbox_target = sampling_result.pos_gt_bboxes + l1_target = cls_preds.new_zeros((num_pos_per_img, 4)) + if self.use_l1: + l1_target = self._get_l1_target(l1_target, bbox_target, + priors[pos_inds]) + foreground_mask = torch.zeros_like(objectness).to(torch.bool) + foreground_mask[pos_inds] = 1 + return (foreground_mask, cls_target, obj_target, bbox_target, + l1_target, num_pos_per_img) + + def _get_l1_target(self, + l1_target: Tensor, + gt_bboxes: Tensor, + priors: Tensor, + eps: float = 1e-8) -> Tensor: + """Convert gt bboxes to center offset and log width height.""" + gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes) + l1_target[:, :2] = (gt_cxcywh[:, :2] - priors[:, :2]) / priors[:, 2:] + l1_target[:, 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps) + return l1_target diff --git a/mmdetection/mmdet/models/detectors/__init__.py b/mmdetection/mmdet/models/detectors/__init__.py new file mode 100644 index 0000000..e5a06d2 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/__init__.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
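The decode/encode pair in the YOLOX head above is easiest to see on a toy example. The snippet below is a minimal, self-contained sketch in plain PyTorch of the round trip performed by `_bbox_decode` and `_get_l1_target`; it is not part of the patch, and the single prior and raw prediction values are invented for illustration. Predicting centre offsets relative to the stride and width/height in log space is commonly chosen so that the regression targets stay at a similar scale across feature levels.

import torch

# One invented prior stored as (cx, cy, stride_w, stride_h) on a stride-16
# level, and one invented raw regression output.
priors = torch.tensor([[8., 8., 16., 16.]])
bbox_pred = torch.tensor([[0.5, -0.25, 0.0, 0.6931]])

# Decode (as in _bbox_decode): offsets are scaled by the stride,
# width/height are exponentiated.
xys = bbox_pred[..., :2] * priors[:, 2:] + priors[:, :2]
whs = bbox_pred[..., 2:].exp() * priors[:, 2:]
boxes_xyxy = torch.cat([xys - whs / 2, xys + whs / 2], dim=-1)

# Encode the L1 target (as in _get_l1_target): invert the transform.
cxcy = (boxes_xyxy[..., :2] + boxes_xyxy[..., 2:]) / 2
wh = boxes_xyxy[..., 2:] - boxes_xyxy[..., :2]
l1_target = torch.cat([(cxcy - priors[:, :2]) / priors[:, 2:],
                       torch.log(wh / priors[:, 2:] + 1e-8)], dim=-1)

# The round trip recovers the raw prediction (up to the eps term).
assert torch.allclose(l1_target, bbox_pred, atol=1e-4)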
+from .atss import ATSS +from .autoassign import AutoAssign +from .base import BaseDetector +from .base_detr import DetectionTransformer +from .boxinst import BoxInst +from .cascade_rcnn import CascadeRCNN +from .centernet import CenterNet +from .condinst import CondInst +from .conditional_detr import ConditionalDETR +from .cornernet import CornerNet +from .crowddet import CrowdDet +from .d2_wrapper import Detectron2Wrapper +from .dab_detr import DABDETR +from .ddod import DDOD +from .ddq_detr import DDQDETR +from .deformable_detr import DeformableDETR +from .detr import DETR +from .dino import DINO +from .fast_rcnn import FastRCNN +from .faster_rcnn import FasterRCNN +from .fcos import FCOS +from .fovea import FOVEA +from .fsaf import FSAF +from .gfl import GFL +from .glip import GLIP +from .grid_rcnn import GridRCNN +from .grounding_dino import GroundingDINO +from .htc import HybridTaskCascade +from .kd_one_stage import KnowledgeDistillationSingleStageDetector +from .lad import LAD +from .mask2former import Mask2Former +from .mask_rcnn import MaskRCNN +from .mask_scoring_rcnn import MaskScoringRCNN +from .maskformer import MaskFormer +from .nasfcos import NASFCOS +from .paa import PAA +from .panoptic_fpn import PanopticFPN +from .panoptic_two_stage_segmentor import TwoStagePanopticSegmentor +from .point_rend import PointRend +from .queryinst import QueryInst +from .reppoints_detector import RepPointsDetector +from .retinanet import RetinaNet +from .rpn import RPN +from .rtmdet import RTMDet +from .scnet import SCNet +from .semi_base import SemiBaseDetector +from .single_stage import SingleStageDetector +from .soft_teacher import SoftTeacher +from .solo import SOLO +from .solov2 import SOLOv2 +from .sparse_rcnn import SparseRCNN +from .tood import TOOD +from .trident_faster_rcnn import TridentFasterRCNN +from .two_stage import TwoStageDetector +from .vfnet import VFNet +from .yolact import YOLACT +from .yolo import YOLOV3 +from .yolof import YOLOF +from .yolox import YOLOX + +__all__ = [ + 'ATSS', 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', + 'KnowledgeDistillationSingleStageDetector', 'FastRCNN', 'FasterRCNN', + 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', 'RetinaNet', 'FCOS', + 'GridRCNN', 'MaskScoringRCNN', 'RepPointsDetector', 'FOVEA', 'FSAF', + 'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA', 'YOLOV3', 'YOLACT', + 'VFNet', 'DETR', 'TridentFasterRCNN', 'SparseRCNN', 'SCNet', 'SOLO', + 'SOLOv2', 'DeformableDETR', 'AutoAssign', 'YOLOF', 'CenterNet', 'YOLOX', + 'TwoStagePanopticSegmentor', 'PanopticFPN', 'QueryInst', 'LAD', 'TOOD', + 'MaskFormer', 'DDOD', 'Mask2Former', 'SemiBaseDetector', 'SoftTeacher', + 'RTMDet', 'Detectron2Wrapper', 'CrowdDet', 'CondInst', 'BoxInst', + 'DetectionTransformer', 'ConditionalDETR', 'DINO', 'DABDETR', 'GLIP', + 'DDQDETR', 'GroundingDINO' +] diff --git a/mmdetection/mmdet/models/detectors/atss.py b/mmdetection/mmdet/models/detectors/atss.py new file mode 100644 index 0000000..0bfcc72 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/atss.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class ATSS(SingleStageDetector): + """Implementation of `ATSS `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. 
+ bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of ATSS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of ATSS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/autoassign.py b/mmdetection/mmdet/models/detectors/autoassign.py new file mode 100644 index 0000000..a0b3570 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/autoassign.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class AutoAssign(SingleStageDetector): + """Implementation of `AutoAssign: Differentiable Label Assignment for Dense + Object Detection `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of AutoAssign. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of AutoAssign. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/base.py b/mmdetection/mmdet/models/detectors/base.py new file mode 100644 index 0000000..1a193b0 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/base.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple, Union + +import torch +from mmengine.model import BaseModel +from torch import Tensor + +from mmdet.structures import DetDataSample, OptSampleList, SampleList +from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig +from ..utils import samplelist_boxtype2tensor + +ForwardResults = Union[Dict[str, torch.Tensor], List[DetDataSample], + Tuple[torch.Tensor], torch.Tensor] + + +class BaseDetector(BaseModel, metaclass=ABCMeta): + """Base class for detectors. 
+ + Args: + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. Defaults to None. + """ + + def __init__(self, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + @property + def with_neck(self) -> bool: + """bool: whether the detector has a neck""" + return hasattr(self, 'neck') and self.neck is not None + + # TODO: these properties need to be carefully handled + # for both single stage & two stage detectors + @property + def with_shared_head(self) -> bool: + """bool: whether the detector has a shared head in the RoI Head""" + return hasattr(self, 'roi_head') and self.roi_head.with_shared_head + + @property + def with_bbox(self) -> bool: + """bool: whether the detector has a bbox head""" + return ((hasattr(self, 'roi_head') and self.roi_head.with_bbox) + or (hasattr(self, 'bbox_head') and self.bbox_head is not None)) + + @property + def with_mask(self) -> bool: + """bool: whether the detector has a mask head""" + return ((hasattr(self, 'roi_head') and self.roi_head.with_mask) + or (hasattr(self, 'mask_head') and self.mask_head is not None)) + + def forward(self, + inputs: torch.Tensor, + data_samples: OptSampleList = None, + mode: str = 'tensor') -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`DetDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle either back propagation or + parameter update, which are supposed to be done in :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (list[:obj:`DetDataSample`], optional): A batch of + data samples that contain annotations and predictions. + Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`DetDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + elif mode == 'tensor': + return self._forward(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + @abstractmethod + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples.""" + pass + + @abstractmethod + def predict(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing.""" + pass + + @abstractmethod + def _forward(self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None): + """Network forward process. 
+ + Usually includes backbone, neck and head forward without any post- + processing. + """ + pass + + @abstractmethod + def extract_feat(self, batch_inputs: Tensor): + """Extract features from images.""" + pass + + def add_pred_to_datasample(self, data_samples: SampleList, + results_list: InstanceList) -> SampleList: + """Add predictions to `DetDataSample`. + + Args: + data_samples (list[:obj:`DetDataSample`], optional): A batch of + data samples that contain annotations and predictions. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + for data_sample, pred_instances in zip(data_samples, results_list): + data_sample.pred_instances = pred_instances + samplelist_boxtype2tensor(data_samples) + return data_samples diff --git a/mmdetection/mmdet/models/detectors/base_detr.py b/mmdetection/mmdet/models/detectors/base_detr.py new file mode 100644 index 0000000..88f00ec --- /dev/null +++ b/mmdetection/mmdet/models/detectors/base_detr.py @@ -0,0 +1,332 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple, Union + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + + +@MODELS.register_module() +class DetectionTransformer(BaseDetector, metaclass=ABCMeta): + r"""Base class for Detection Transformer. + + In Detection Transformer, an encoder is used to process output features of + neck, then several queries interact with the encoder features using a + decoder and do the regression and classification with the bounding box + head. + + Args: + backbone (:obj:`ConfigDict` or dict): Config of the backbone. + neck (:obj:`ConfigDict` or dict, optional): Config of the neck. + Defaults to None. + encoder (:obj:`ConfigDict` or dict, optional): Config of the + Transformer encoder. Defaults to None. + decoder (:obj:`ConfigDict` or dict, optional): Config of the + Transformer decoder. Defaults to None. + bbox_head (:obj:`ConfigDict` or dict, optional): Config for the + bounding box head module. Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict, optional): Config + of the positional encoding module. Defaults to None. + num_queries (int, optional): Number of decoder query in Transformer. + Defaults to 100. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + the bounding box head module. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + the bounding box head module. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + encoder: OptConfigType = None, + decoder: OptConfigType = None, + bbox_head: OptConfigType = None, + positional_encoding: OptConfigType = None, + num_queries: int = 100, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + # process args + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.encoder = encoder + self.decoder = decoder + self.positional_encoding = positional_encoding + self.num_queries = num_queries + + # init model layers + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + self.bbox_head = MODELS.build(bbox_head) + self._init_layers() + + @abstractmethod + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + pass + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (bs, dim, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components + """ + img_feats = self.extract_feat(batch_inputs) + head_inputs_dict = self.forward_transformer(img_feats, + batch_data_samples) + losses = self.bbox_head.loss( + **head_inputs_dict, batch_data_samples=batch_data_samples) + + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs, has shape (bs, dim, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the input images. + Each DetDataSample usually contain 'pred_instances'. And the + `pred_instances` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + img_feats = self.extract_feat(batch_inputs) + head_inputs_dict = self.forward_transformer(img_feats, + batch_data_samples) + results_list = self.bbox_head.predict( + **head_inputs_dict, + rescale=rescale, + batch_data_samples=batch_data_samples) + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples + + def _forward( + self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs, has shape (bs, dim, H, W). 
+ batch_data_samples (List[:obj:`DetDataSample`], optional): The + batch data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[Tensor]: A tuple of features from ``bbox_head`` forward. + """ + img_feats = self.extract_feat(batch_inputs) + head_inputs_dict = self.forward_transformer(img_feats, + batch_data_samples) + results = self.bbox_head.forward(**head_inputs_dict) + return results + + def forward_transformer(self, + img_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None) -> Dict: + """Forward process of Transformer, which includes four steps: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'. We + summarized the parameters flow of the existing DETR-like detector, + which can be illustrated as follow: + + .. code:: text + + img_feats & batch_data_samples + | + V + +-----------------+ + | pre_transformer | + +-----------------+ + | | + | V + | +-----------------+ + | | forward_encoder | + | +-----------------+ + | | + | V + | +---------------+ + | | pre_decoder | + | +---------------+ + | | | + V V | + +-----------------+ | + | forward_decoder | | + +-----------------+ | + | | + V V + head_inputs_dict + + Args: + img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each + feature map has shape (bs, dim, H, W). + batch_data_samples (list[:obj:`DetDataSample`], optional): The + batch data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + dict: The dictionary of bbox_head function inputs, which always + includes the `hidden_states` of the decoder output and may contain + `references` including the initial and intermediate references. + """ + encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( + img_feats, batch_data_samples) + + encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict) + + tmp_dec_in, head_inputs_dict = self.pre_decoder(**encoder_outputs_dict) + decoder_inputs_dict.update(tmp_dec_in) + + decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) + head_inputs_dict.update(decoder_outputs_dict) + return head_inputs_dict + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor, has shape (bs, dim, H, W). + + Returns: + tuple[Tensor]: Tuple of feature maps from neck. Each feature map + has shape (bs, dim, H, W). + """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x + + @abstractmethod + def pre_transformer( + self, + img_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None) -> Tuple[Dict, Dict]: + """Process image features before feeding them to the transformer. + + Args: + img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each + feature map has shape (bs, dim, H, W). + batch_data_samples (list[:obj:`DetDataSample`], optional): The + batch data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict, dict]: The first dict contains the inputs of encoder + and the second dict contains the inputs of decoder. + + - encoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_encoder()`, which includes 'feat', 'feat_mask', + 'feat_pos', and other algorithm-specific arguments. 
+ - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'memory_mask', and + other algorithm-specific arguments. + """ + pass + + @abstractmethod + def forward_encoder(self, feat: Tensor, feat_mask: Tensor, + feat_pos: Tensor, **kwargs) -> Dict: + """Forward with Transformer encoder. + + Args: + feat (Tensor): Sequential features, has shape (bs, num_feat_points, + dim). + feat_mask (Tensor): ByteTensor, the padding mask of the features, + has shape (bs, num_feat_points). + feat_pos (Tensor): The positional embeddings of the features, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of encoder outputs, which includes the + `memory` of the encoder output and other algorithm-specific + arguments. + """ + pass + + @abstractmethod + def pre_decoder(self, memory: Tensor, **kwargs) -> Tuple[Dict, Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`, and `reference_points`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + + Returns: + tuple[dict, dict]: The first dict contains the inputs of decoder + and the second dict contains the inputs of the bbox_head function. + + - decoder_inputs_dict (dict): The keyword dictionary args of + `self.forward_decoder()`, which includes 'query', 'query_pos', + 'memory', and other algorithm-specific arguments. + - head_inputs_dict (dict): The keyword dictionary args of the + bbox_head functions, which is usually empty, or includes + `enc_outputs_class` and `enc_outputs_class` when the detector + support 'two stage' or 'query selection' strategies. + """ + pass + + @abstractmethod + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + **kwargs) -> Dict: + """Forward with Transformer decoder. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` of the decoder output, `references` including + the initial and intermediate reference_points, and other + algorithm-specific arguments. + """ + pass diff --git a/mmdetection/mmdet/models/detectors/boxinst.py b/mmdetection/mmdet/models/detectors/boxinst.py new file mode 100644 index 0000000..ca6b0bd --- /dev/null +++ b/mmdetection/mmdet/models/detectors/boxinst.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
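
# Editor's note: illustrative sketch, not part of the patch above. It mimics the
# control flow of DetectionTransformer.forward_transformer() ('pre_transformer'
# -> 'encoder' -> 'pre_decoder' -> 'decoder') with plain torch tensors so the
# hook contract is easier to follow. All class and helper names here are
# hypothetical stand-ins, not mmdet API.
import torch


class ToyDetectionTransformer:
    """Minimal mock of the pre_transformer -> encoder -> pre_decoder -> decoder flow."""

    def pre_transformer(self, img_feats):
        # Flatten a (bs, c, h, w) feature map into a (bs, h*w, c) token sequence.
        feat = img_feats[0].flatten(2).permute(0, 2, 1)
        encoder_inputs = dict(feat=feat)
        decoder_inputs = dict(memory_mask=None)
        return encoder_inputs, decoder_inputs

    def forward_encoder(self, feat):
        # A real encoder runs self-attention; identity is enough to show the flow.
        return dict(memory=feat)

    def pre_decoder(self, memory):
        # Derive decoder queries from the encoder memory (learned embeddings in DETR).
        query = memory.new_zeros(memory.size(0), 10, memory.size(-1))
        return dict(query=query, memory=memory), dict()

    def forward_decoder(self, query, memory, memory_mask=None):
        return dict(hidden_states=query + memory.mean(dim=1, keepdim=True))

    def forward_transformer(self, img_feats):
        encoder_inputs, decoder_inputs = self.pre_transformer(img_feats)
        encoder_outputs = self.forward_encoder(**encoder_inputs)
        tmp_dec_in, head_inputs = self.pre_decoder(**encoder_outputs)
        decoder_inputs.update(tmp_dec_in)
        head_inputs.update(self.forward_decoder(**decoder_inputs))
        return head_inputs


if __name__ == '__main__':
    feats = (torch.rand(2, 256, 8, 8),)
    out = ToyDetectionTransformer().forward_transformer(feats)
    print(out['hidden_states'].shape)  # torch.Size([2, 10, 256])
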
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class BoxInst(SingleStageInstanceSegmentor): + """Implementation of `BoxInst `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + mask_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/cascade_rcnn.py b/mmdetection/mmdet/models/detectors/cascade_rcnn.py new file mode 100644 index 0000000..ecf733f --- /dev/null +++ b/mmdetection/mmdet/models/detectors/cascade_rcnn.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class CascadeRCNN(TwoStageDetector): + r"""Implementation of `Cascade R-CNN: Delving into High Quality Object + Detection `_""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/centernet.py b/mmdetection/mmdet/models/detectors/centernet.py new file mode 100644 index 0000000..9c6622d --- /dev/null +++ b/mmdetection/mmdet/models/detectors/centernet.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class CenterNet(SingleStageDetector): + """Implementation of CenterNet(Objects as Points) + + . + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/condinst.py b/mmdetection/mmdet/models/detectors/condinst.py new file mode 100644 index 0000000..ed2dc99 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/condinst.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
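
# Editor's note: illustrative sketch, not part of the patch above. Detectors such
# as BoxInst, CascadeRCNN and CenterNet are thin wrappers that only forward their
# config to the parent class; they become usable because @MODELS.register_module()
# puts them in the MMEngine registry, so a config dict with a matching ``type``
# key can build them. The toy registry below demonstrates the same mechanism
# without a full detector config; the class and field names are made up.
from mmengine.registry import Registry

TOY_MODELS = Registry('toy_models')


@TOY_MODELS.register_module()
class ToyDetector:

    def __init__(self, backbone: str, num_classes: int = 80) -> None:
        self.backbone = backbone
        self.num_classes = num_classes


# The real detectors are built the same way, e.g. roughly
# MODELS.build(dict(type='CascadeRCNN', backbone=..., rpn_head=..., roi_head=...)),
# with the full config normally coming from a file under mmdetection/configs/.
toy = TOY_MODELS.build(dict(type='ToyDetector', backbone='resnet50'))
print(toy.backbone, toy.num_classes)  # resnet50 80
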
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class CondInst(SingleStageInstanceSegmentor): + """Implementation of `CondInst `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + mask_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/conditional_detr.py b/mmdetection/mmdet/models/detectors/conditional_detr.py new file mode 100644 index 0000000..d57868e --- /dev/null +++ b/mmdetection/mmdet/models/detectors/conditional_detr.py @@ -0,0 +1,74 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from ..layers import (ConditionalDetrTransformerDecoder, + DetrTransformerEncoder, SinePositionalEncoding) +from .detr import DETR + + +@MODELS.register_module() +class ConditionalDETR(DETR): + r"""Implementation of `Conditional DETR for Fast Training Convergence. + + `_. + + Code is modified from the `official github repo + `_. + """ + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DetrTransformerEncoder(**self.encoder) + self.decoder = ConditionalDetrTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + # NOTE The embed_dims is typically passed from the inside out. + # For example in DETR, The embed_dims is passed as + # self_attn -> the first encoder layer -> encoder -> detector. + self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + f'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + memory_mask: Tensor, memory_pos: Tensor) -> Dict: + """Forward with Transformer decoder. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + memory_pos (Tensor): The positional embeddings of memory, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` and `references` of the decoder output. 
+ + - hidden_states (Tensor): Has shape + (num_decoder_layers, bs, num_queries, dim) + - references (Tensor): Has shape + (bs, num_queries, 2) + """ + + hidden_states, references = self.decoder( + query=query, + key=memory, + query_pos=query_pos, + key_pos=memory_pos, + key_padding_mask=memory_mask) + head_inputs_dict = dict( + hidden_states=hidden_states, references=references) + return head_inputs_dict diff --git a/mmdetection/mmdet/models/detectors/cornernet.py b/mmdetection/mmdet/models/detectors/cornernet.py new file mode 100644 index 0000000..946af4d --- /dev/null +++ b/mmdetection/mmdet/models/detectors/cornernet.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class CornerNet(SingleStageDetector): + """CornerNet. + + This detector is the implementation of the paper `CornerNet: Detecting + Objects as Paired Keypoints `_ . + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/crowddet.py b/mmdetection/mmdet/models/detectors/crowddet.py new file mode 100644 index 0000000..4f43bc0 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/crowddet.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class CrowdDet(TwoStageDetector): + """Implementation of `CrowdDet `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + rpn_head (:obj:`ConfigDict` or dict): The rpn config. + roi_head (:obj:`ConfigDict` or dict): The roi config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of FCOS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of FCOS. Defaults to None. + neck (:obj:`ConfigDict` or dict): The neck config. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + data_preprocessor=data_preprocessor) diff --git a/mmdetection/mmdet/models/detectors/d2_wrapper.py b/mmdetection/mmdet/models/detectors/d2_wrapper.py new file mode 100644 index 0000000..3a2daa4 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/d2_wrapper.py @@ -0,0 +1,291 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
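
# Editor's note: illustrative sketch, not part of the patch above. Several DETR
# variants above assert ``num_feats * 2 == embed_dims`` for SinePositionalEncoding:
# the x- and y-coordinates each receive ``num_feats`` sinusoidal channels and the
# two halves are concatenated. This simplified (unmasked, unnormalized) version
# only demonstrates that shape relation; it is not the mmdet implementation.
import torch


def toy_sine_positional_encoding(h: int, w: int, num_feats: int = 128,
                                 temperature: float = 10000.0) -> torch.Tensor:
    y = torch.arange(h, dtype=torch.float32).view(h, 1).expand(h, w)
    x = torch.arange(w, dtype=torch.float32).view(1, w).expand(h, w)
    dim_t = torch.arange(num_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * (dim_t // 2) / num_feats)
    pos_x = x[..., None] / dim_t  # (h, w, num_feats)
    pos_y = y[..., None] / dim_t
    pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()),
                        dim=-1).flatten(2)
    pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()),
                        dim=-1).flatten(2)
    # Concatenating the y- and x-halves gives embed_dims = 2 * num_feats.
    return torch.cat((pos_y, pos_x), dim=-1).permute(2, 0, 1)  # (2*num_feats, h, w)


pos = toy_sine_positional_encoding(8, 8, num_feats=128)
print(pos.shape)  # torch.Size([256, 8, 8])
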
+from typing import Union + +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import BaseBoxes +from mmdet.structures.mask import BitmapMasks, PolygonMasks +from mmdet.utils import ConfigType +from .base import BaseDetector + +try: + import detectron2 + from detectron2.config import get_cfg + from detectron2.modeling import build_model + from detectron2.structures.masks import BitMasks as D2_BitMasks + from detectron2.structures.masks import PolygonMasks as D2_PolygonMasks + from detectron2.utils.events import EventStorage +except ImportError: + detectron2 = None + + +def _to_cfgnode_list(cfg: ConfigType, + config_list: list = [], + father_name: str = 'MODEL') -> tuple: + """Convert the key and value of mmengine.ConfigDict into a list. + + Args: + cfg (ConfigDict): The detectron2 model config. + config_list (list): A list contains the key and value of ConfigDict. + Defaults to []. + father_name (str): The father name add before the key. + Defaults to "MODEL". + + Returns: + tuple: + + - config_list: A list contains the key and value of ConfigDict. + - father_name (str): The father name add before the key. + Defaults to "MODEL". + """ + for key, value in cfg.items(): + name = f'{father_name}.{key.upper()}' + if isinstance(value, ConfigDict) or isinstance(value, dict): + config_list, fater_name = \ + _to_cfgnode_list(value, config_list, name) + else: + config_list.append(name) + config_list.append(value) + + return config_list, father_name + + +def convert_d2_pred_to_datasample(data_samples: SampleList, + d2_results_list: list) -> SampleList: + """Convert the Detectron2's result to DetDataSample. + + Args: + data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + d2_results_list (list): The list of the results of Detectron2's model. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(data_samples) == len(d2_results_list) + for data_sample, d2_results in zip(data_samples, d2_results_list): + d2_instance = d2_results['instances'] + + results = InstanceData() + results.bboxes = d2_instance.pred_boxes.tensor + results.scores = d2_instance.scores + results.labels = d2_instance.pred_classes + + if d2_instance.has('pred_masks'): + results.masks = d2_instance.pred_masks + data_sample.pred_instances = results + + return data_samples + + +@MODELS.register_module() +class Detectron2Wrapper(BaseDetector): + """Wrapper of a Detectron2 model. Input/output formats of this class follow + MMDetection's convention, so a Detectron2 model can be trained and + evaluated in MMDetection. + + Args: + detector (:obj:`ConfigDict` or dict): The module config of + Detectron2. + bgr_to_rgb (bool): whether to convert image from BGR to RGB. + Defaults to False. + rgb_to_bgr (bool): whether to convert image from RGB to BGR. + Defaults to False. 
+ """ + + def __init__(self, + detector: ConfigType, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False) -> None: + if detectron2 is None: + raise ImportError('Please install Detectron2 first') + assert not (bgr_to_rgb and rgb_to_bgr), ( + '`bgr2rgb` and `rgb2bgr` cannot be set to True at the same time') + super().__init__() + self._channel_conversion = rgb_to_bgr or bgr_to_rgb + cfgnode_list, _ = _to_cfgnode_list(detector) + self.cfg = get_cfg() + self.cfg.merge_from_list(cfgnode_list) + self.d2_model = build_model(self.cfg) + self.storage = EventStorage() + + def init_weights(self) -> None: + """Initialization Backbone. + + NOTE: The initialization of other layers are in Detectron2, + if users want to change the initialization way, please + change the code in Detectron2. + """ + from detectron2.checkpoint import DetectionCheckpointer + checkpointer = DetectionCheckpointer(model=self.d2_model) + checkpointer.load(self.cfg.MODEL.WEIGHTS, checkpointables=[]) + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples. + + The inputs will first convert to the Detectron2 type and feed into + D2 models. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + d2_batched_inputs = self._convert_to_d2_inputs( + batch_inputs=batch_inputs, + batch_data_samples=batch_data_samples, + training=True) + + with self.storage as storage: # noqa + losses = self.d2_model(d2_batched_inputs) + # storage contains some training information, such as cls_accuracy. + # you can use storage.latest() to get the detail information + return losses + + def predict(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + The inputs will first convert to the Detectron2 type and feed into + D2 models. And the results will convert back to the MMDet type. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + d2_batched_inputs = self._convert_to_d2_inputs( + batch_inputs=batch_inputs, + batch_data_samples=batch_data_samples, + training=False) + # results in detectron2 has already rescale + d2_results_list = self.d2_model(d2_batched_inputs) + batch_data_samples = convert_d2_pred_to_datasample( + data_samples=batch_data_samples, d2_results_list=d2_results_list) + + return batch_data_samples + + def _forward(self, *args, **kwargs): + """Network forward process. 
+ + Usually includes backbone, neck and head forward without any post- + processing. + """ + raise NotImplementedError( + f'`_forward` is not implemented in {self.__class__.__name__}') + + def extract_feat(self, *args, **kwargs): + """Extract features from images. + + `extract_feat` will not be used in obj:``Detectron2Wrapper``. + """ + pass + + def _convert_to_d2_inputs(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + training=True) -> list: + """Convert inputs type to support Detectron2's model. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + training (bool): Whether to enable training time processing. + + Returns: + list[dict]: A list of dict, which will be fed into Detectron2's + model. And the dict usually contains following keys. + + - image (Tensor): Image in (C, H, W) format. + - instances (Instances): GT Instance. + - height (int): the output height resolution of the model + - width (int): the output width resolution of the model + """ + from detectron2.data.detection_utils import filter_empty_instances + from detectron2.structures import Boxes, Instances + + batched_d2_inputs = [] + for image, data_samples in zip(batch_inputs, batch_data_samples): + d2_inputs = dict() + # deal with metainfo + meta_info = data_samples.metainfo + d2_inputs['file_name'] = meta_info['img_path'] + d2_inputs['height'], d2_inputs['width'] = meta_info['ori_shape'] + d2_inputs['image_id'] = meta_info['img_id'] + # deal with image + if self._channel_conversion: + image = image[[2, 1, 0], ...] + d2_inputs['image'] = image + # deal with gt_instances + gt_instances = data_samples.gt_instances + d2_instances = Instances(meta_info['img_shape']) + + gt_boxes = gt_instances.bboxes + # TODO: use mmdet.structures.box.get_box_tensor after PR 8658 + # has merged + if isinstance(gt_boxes, BaseBoxes): + gt_boxes = gt_boxes.tensor + d2_instances.gt_boxes = Boxes(gt_boxes) + + d2_instances.gt_classes = gt_instances.labels + if gt_instances.get('masks', None) is not None: + gt_masks = gt_instances.masks + if isinstance(gt_masks, PolygonMasks): + d2_instances.gt_masks = D2_PolygonMasks(gt_masks.masks) + elif isinstance(gt_masks, BitmapMasks): + d2_instances.gt_masks = D2_BitMasks(gt_masks.masks) + else: + raise TypeError('The type of `gt_mask` can be ' + '`PolygonMasks` or `BitMasks`, but get ' + f'{type(gt_masks)}.') + # convert to cpu and convert back to cuda to avoid + # some potential error + if training: + device = gt_boxes.device + d2_instances = filter_empty_instances( + d2_instances.to('cpu')).to(device) + d2_inputs['instances'] = d2_instances + batched_d2_inputs.append(d2_inputs) + + return batched_d2_inputs diff --git a/mmdetection/mmdet/models/detectors/dab_detr.py b/mmdetection/mmdet/models/detectors/dab_detr.py new file mode 100644 index 0000000..b61301c --- /dev/null +++ b/mmdetection/mmdet/models/detectors/dab_detr.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
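
# Editor's note: illustrative sketch, not part of the patch above. It shows what
# ``_to_cfgnode_list`` in d2_wrapper.py produces: a nested config is flattened
# into the alternating ``[KEY, value, KEY, value, ...]`` list that detectron2's
# ``cfg.merge_from_list`` expects. The helper below is a simplified restatement
# using plain dicts (not the mmdet function itself), and the config values are
# made-up examples.
from typing import Any, Dict, List


def flatten_for_merge_from_list(cfg: Dict[str, Any],
                                prefix: str = 'MODEL') -> List[Any]:
    flat: List[Any] = []
    for key, value in cfg.items():
        name = f'{prefix}.{key.upper()}'
        if isinstance(value, dict):
            # Recurse into nested dicts, extending the dotted key path.
            flat.extend(flatten_for_merge_from_list(value, name))
        else:
            flat.extend([name, value])
    return flat


cfg = dict(weights='detectron2://ImageNetPretrained/MSRA/R-50.pkl',
           roi_heads=dict(num_classes=10, score_thresh_test=0.05))
print(flatten_for_merge_from_list(cfg))
# ['MODEL.WEIGHTS', 'detectron2://ImageNetPretrained/MSRA/R-50.pkl',
#  'MODEL.ROI_HEADS.NUM_CLASSES', 10,
#  'MODEL.ROI_HEADS.SCORE_THRESH_TEST', 0.05]
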
+from typing import Dict, Tuple + +from mmengine.model import uniform_init +from torch import Tensor, nn + +from mmdet.registry import MODELS +from ..layers import SinePositionalEncoding +from ..layers.transformer import (DABDetrTransformerDecoder, + DABDetrTransformerEncoder, inverse_sigmoid) +from .detr import DETR + + +@MODELS.register_module() +class DABDETR(DETR): + r"""Implementation of `DAB-DETR: + Dynamic Anchor Boxes are Better Queries for DETR. + + `_. + + Code is modified from the `official github repo + `_. + + Args: + with_random_refpoints (bool): Whether to randomly initialize query + embeddings and not update them during training. + Defaults to False. + num_patterns (int): Inspired by Anchor-DETR. Defaults to 0. + """ + + def __init__(self, + *args, + with_random_refpoints: bool = False, + num_patterns: int = 0, + **kwargs) -> None: + self.with_random_refpoints = with_random_refpoints + assert isinstance(num_patterns, int), \ + f'num_patterns should be int but {num_patterns}.' + self.num_patterns = num_patterns + + super().__init__(*args, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DABDetrTransformerEncoder(**self.encoder) + self.decoder = DABDetrTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + self.query_dim = self.decoder.query_dim + self.query_embedding = nn.Embedding(self.num_queries, self.query_dim) + if self.num_patterns > 0: + self.patterns = nn.Embedding(self.num_patterns, self.embed_dims) + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + f'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super(DABDETR, self).init_weights() + if self.with_random_refpoints: + uniform_init(self.query_embedding) + self.query_embedding.weight.data[:, :2] = \ + inverse_sigmoid(self.query_embedding.weight.data[:, :2]) + self.query_embedding.weight.data[:, :2].requires_grad = False + + def pre_decoder(self, memory: Tensor) -> Tuple[Dict, Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + + Returns: + tuple[dict, dict]: The first dict contains the inputs of decoder + and the second dict contains the inputs of the bbox_head function. + + - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'query', 'query_pos', + 'memory' and 'reg_branches'. + - head_inputs_dict (dict): The keyword args dictionary of the + bbox_head functions, which is usually empty, or includes + `enc_outputs_class` and `enc_outputs_class` when the detector + support 'two stage' or 'query selection' strategies. 
+ """ + batch_size = memory.size(0) + query_pos = self.query_embedding.weight + query_pos = query_pos.unsqueeze(0).repeat(batch_size, 1, 1) + if self.num_patterns == 0: + query = query_pos.new_zeros(batch_size, self.num_queries, + self.embed_dims) + else: + query = self.patterns.weight[:, None, None, :]\ + .repeat(1, self.num_queries, batch_size, 1)\ + .view(-1, batch_size, self.embed_dims)\ + .permute(1, 0, 2) + query_pos = query_pos.repeat(1, self.num_patterns, 1) + + decoder_inputs_dict = dict( + query_pos=query_pos, query=query, memory=memory) + head_inputs_dict = dict() + return decoder_inputs_dict, head_inputs_dict + + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + memory_mask: Tensor, memory_pos: Tensor) -> Dict: + """Forward with Transformer decoder. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + memory_pos (Tensor): The positional embeddings of memory, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` and `references` of the decoder output. + """ + + hidden_states, references = self.decoder( + query=query, + key=memory, + query_pos=query_pos, + key_pos=memory_pos, + key_padding_mask=memory_mask, + reg_branches=self.bbox_head. + fc_reg # iterative refinement for anchor boxes + ) + head_inputs_dict = dict( + hidden_states=hidden_states, references=references) + return head_inputs_dict diff --git a/mmdetection/mmdet/models/detectors/ddod.py b/mmdetection/mmdet/models/detectors/ddod.py new file mode 100644 index 0000000..3503a40 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/ddod.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class DDOD(SingleStageDetector): + """Implementation of `DDOD `_. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of ATSS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of ATSS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/ddq_detr.py b/mmdetection/mmdet/models/detectors/ddq_detr.py new file mode 100644 index 0000000..57d4959 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/ddq_detr.py @@ -0,0 +1,274 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.ops import MultiScaleDeformableAttention, batched_nms +from torch import Tensor, nn +from torch.nn.init import normal_ + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmdet.utils import OptConfigType +from ..layers import DDQTransformerDecoder +from ..utils import align_tensor +from .deformable_detr import DeformableDETR +from .dino import DINO + + +@MODELS.register_module() +class DDQDETR(DINO): + r"""Implementation of `Dense Distinct Query for + End-to-End Object Detection `_ + + Code is modified from the `official github repo + `_. + + Args: + dense_topk_ratio (float): Ratio of num_dense queries to num_queries. + Defaults to 1.5. + dqs_cfg (:obj:`ConfigDict` or dict, optional): Config of + Distinct Queries Selection. Defaults to nms with + `iou_threshold` = 0.8. + """ + + def __init__(self, + *args, + dense_topk_ratio: float = 1.5, + dqs_cfg: OptConfigType = dict(type='nms', iou_threshold=0.8), + **kwargs): + self.dense_topk_ratio = dense_topk_ratio + self.decoder_cfg = kwargs['decoder'] + self.dqs_cfg = dqs_cfg + super().__init__(*args, **kwargs) + + # a share dict in all moduls + # pass some intermediate results and config parameters + cache_dict = dict() + for m in self.modules(): + m.cache_dict = cache_dict + # first element is the start index of matching queries + # second element is the number of matching queries + self.cache_dict['dis_query_info'] = [0, 0] + + # mask for distinct queries in each decoder layer + self.cache_dict['distinct_query_mask'] = [] + # pass to decoder do the dqs + self.cache_dict['cls_branches'] = self.bbox_head.cls_branches + # Used to construct the attention mask after dqs + self.cache_dict['num_heads'] = self.encoder.layers[ + 0].self_attn.num_heads + # pass to decoder to do the dqs + self.cache_dict['dqs_cfg'] = self.dqs_cfg + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + super(DDQDETR, self)._init_layers() + self.decoder = DDQTransformerDecoder(**self.decoder_cfg) + self.query_embedding = None + self.query_map = nn.Linear(self.embed_dims, self.embed_dims) + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super(DeformableDETR, self).init_weights() + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + nn.init.xavier_uniform_(self.memory_trans_fc.weight) + normal_(self.level_embed) + + def pre_decoder( + self, + memory: Tensor, + memory_mask: Tensor, + spatial_shapes: Tensor, + 
batch_data_samples: OptSampleList = None, + ) -> Tuple[Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `memory`, and `reference_points`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). Will only be used when + `as_two_stage` is `True`. + spatial_shapes (Tensor): Spatial shapes of features in all levels. + With shape (num_levels, 2), last dimension represents (h, w). + Will only be used when `as_two_stage` is `True`. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict]: The decoder_inputs_dict and head_inputs_dict. + + - decoder_inputs_dict (dict): The keyword dictionary args of + `self.forward_decoder()`, which includes 'query', 'memory', + `reference_points`, and `dn_mask`. The reference points of + decoder input here are 4D boxes, although it has `points` + in its name. + - head_inputs_dict (dict): The keyword dictionary args of the + bbox_head functions, which includes `topk_score`, `topk_coords`, + `dense_topk_score`, `dense_topk_coords`, + and `dn_meta`, when `self.training` is `True`, else is empty. + """ + bs, _, c = memory.shape + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + if self.training: + # aux dense branch particularly in DDQ DETR, which doesn't exist + # in DINO. + # -1 is the aux head for the encoder + dense_enc_outputs_class = self.bbox_head.cls_branches[-1]( + output_memory) + dense_enc_outputs_coord_unact = self.bbox_head.reg_branches[-1]( + output_memory) + output_proposals + + topk = self.num_queries + dense_topk = int(topk * self.dense_topk_ratio) + + proposals = enc_outputs_coord_unact.sigmoid() + proposals = bbox_cxcywh_to_xyxy(proposals) + scores = enc_outputs_class.max(-1)[0].sigmoid() + + if self.training: + # aux dense branch particularly in DDQ DETR, which doesn't exist + # in DINO. + dense_proposals = dense_enc_outputs_coord_unact.sigmoid() + dense_proposals = bbox_cxcywh_to_xyxy(dense_proposals) + dense_scores = dense_enc_outputs_class.max(-1)[0].sigmoid() + + num_imgs = len(scores) + topk_score = [] + topk_coords_unact = [] + # Distinct query. + query = [] + + dense_topk_score = [] + dense_topk_coords_unact = [] + dense_query = [] + + for img_id in range(num_imgs): + single_proposals = proposals[img_id] + single_scores = scores[img_id] + + # `batched_nms` of class scores and bbox coordinations is used + # particularly by DDQ DETR for region proposal generation, + # instead of `topk` of class scores by DINO. + _, keep_idxs = batched_nms( + single_proposals, single_scores, + torch.ones(len(single_scores), device=single_scores.device), + self.cache_dict['dqs_cfg']) + + if self.training: + # aux dense branch particularly in DDQ DETR, which doesn't + # exist in DINO. + dense_single_proposals = dense_proposals[img_id] + dense_single_scores = dense_scores[img_id] + # sort according the score + # Only sort by classification score, neither nms nor topk is + # required. So input parameter `nms_cfg` = None. 
+ _, dense_keep_idxs = batched_nms( + dense_single_proposals, dense_single_scores, + torch.ones( + len(dense_single_scores), + device=dense_single_scores.device), None) + + dense_topk_score.append(dense_enc_outputs_class[img_id] + [dense_keep_idxs][:dense_topk]) + dense_topk_coords_unact.append( + dense_enc_outputs_coord_unact[img_id][dense_keep_idxs] + [:dense_topk]) + + topk_score.append(enc_outputs_class[img_id][keep_idxs][:topk]) + + # Instead of initializing the content part with transformed + # coordinates in Deformable DETR, we fuse the feature map + # embedding of distinct positions as the content part, which + # makes the initial queries more distinct. + topk_coords_unact.append( + enc_outputs_coord_unact[img_id][keep_idxs][:topk]) + + map_memory = self.query_map(memory[img_id].detach()) + query.append(map_memory[keep_idxs][:topk]) + if self.training: + # aux dense branch particularly in DDQ DETR, which doesn't + # exist in DINO. + dense_query.append(map_memory[dense_keep_idxs][:dense_topk]) + + topk_score = align_tensor(topk_score, topk) + topk_coords_unact = align_tensor(topk_coords_unact, topk) + query = align_tensor(query, topk) + if self.training: + dense_topk_score = align_tensor(dense_topk_score) + dense_topk_coords_unact = align_tensor(dense_topk_coords_unact) + + dense_query = align_tensor(dense_query) + num_dense_queries = dense_query.size(1) + if self.training: + query = torch.cat([query, dense_query], dim=1) + topk_coords_unact = torch.cat( + [topk_coords_unact, dense_topk_coords_unact], dim=1) + + topk_coords = topk_coords_unact.sigmoid() + if self.training: + dense_topk_coords = topk_coords[:, -num_dense_queries:] + topk_coords = topk_coords[:, :-num_dense_queries] + + topk_coords_unact = topk_coords_unact.detach() + + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = \ + self.dn_query_generator(batch_data_samples) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], + dim=1) + + # Update `dn_mask` to add mask for dense queries. + ori_size = dn_mask.size(-1) + new_size = dn_mask.size(-1) + num_dense_queries + new_dn_mask = dn_mask.new_ones((new_size, new_size)).bool() + dense_mask = torch.zeros(num_dense_queries, + num_dense_queries).bool() + self.cache_dict['dis_query_info'] = [dn_label_query.size(1), topk] + + new_dn_mask[ori_size:, ori_size:] = dense_mask + new_dn_mask[:ori_size, :ori_size] = dn_mask + dn_meta['num_dense_queries'] = num_dense_queries + dn_mask = new_dn_mask + self.cache_dict['num_dense_queries'] = num_dense_queries + self.decoder.aux_reg_branches = self.bbox_head.aux_reg_branches + + else: + self.cache_dict['dis_query_info'] = [0, topk] + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + + reference_points = reference_points.sigmoid() + + decoder_inputs_dict = dict( + query=query, + memory=memory, + reference_points=reference_points, + dn_mask=dn_mask) + head_inputs_dict = dict( + enc_outputs_class=topk_score, + enc_outputs_coord=topk_coords, + aux_enc_outputs_class=dense_topk_score, + aux_enc_outputs_coord=dense_topk_coords, + dn_meta=dn_meta) if self.training else dict() + + return decoder_inputs_dict, head_inputs_dict diff --git a/mmdetection/mmdet/models/detectors/deformable_detr.py b/mmdetection/mmdet/models/detectors/deformable_detr.py new file mode 100644 index 0000000..0eb5cd2 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/deformable_detr.py @@ -0,0 +1,572 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
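
# Editor's note: illustrative sketch, not part of the patch above. DDQDETR
# replaces DINO's plain top-k query selection with a class-agnostic NMS
# (``batched_nms`` with all-ones labels), so the selected queries correspond to
# distinct regions rather than near-duplicate high-score proposals. The toy
# boxes and scores below are made up; only the selection mechanism is shown.
import torch
from mmcv.ops import batched_nms

# Three heavily overlapping proposals plus one separate proposal (x1, y1, x2, y2).
proposals = torch.tensor([[0., 0.0, 10., 10.0],
                          [0., 0.0, 10., 10.5],
                          [0., 0.5, 10., 10.5],
                          [50., 50., 60., 60.]])
scores = torch.tensor([0.9, 0.8, 0.7, 0.6])

# Plain top-k keeps two near-duplicates of the same region ...
print(scores.topk(2).indices.tolist())  # [0, 1]

# ... while class-agnostic NMS first removes duplicates, so the top slice of the
# surviving indices yields distinct queries (as in DDQ's pre_decoder).
_, keep_idxs = batched_nms(proposals, scores, torch.ones(len(scores)),
                           dict(type='nms', iou_threshold=0.8))
print(keep_idxs[:2].tolist())  # [0, 3]
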
+import math +from typing import Dict, Tuple + +import torch +import torch.nn.functional as F +from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention +from mmengine.model import xavier_init +from torch import Tensor, nn +from torch.nn.init import normal_ + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList +from mmdet.utils import OptConfigType +from ..layers import (DeformableDetrTransformerDecoder, + DeformableDetrTransformerEncoder, SinePositionalEncoding) +from .base_detr import DetectionTransformer + + +@MODELS.register_module() +class DeformableDETR(DetectionTransformer): + r"""Implementation of `Deformable DETR: Deformable Transformers for + End-to-End Object Detection `_ + + Code is modified from the `official github repo + `_. + + Args: + decoder (:obj:`ConfigDict` or dict, optional): Config of the + Transformer decoder. Defaults to None. + bbox_head (:obj:`ConfigDict` or dict, optional): Config for the + bounding box head module. Defaults to None. + with_box_refine (bool, optional): Whether to refine the references + in the decoder. Defaults to `False`. + as_two_stage (bool, optional): Whether to generate the proposal + from the outputs of encoder. Defaults to `False`. + num_feature_levels (int, optional): Number of feature levels. + Defaults to 4. + """ + + def __init__(self, + *args, + decoder: OptConfigType = None, + bbox_head: OptConfigType = None, + with_box_refine: bool = False, + as_two_stage: bool = False, + num_feature_levels: int = 4, + **kwargs) -> None: + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + self.num_feature_levels = num_feature_levels + + if bbox_head is not None: + assert 'share_pred_layer' not in bbox_head and \ + 'num_pred_layer' not in bbox_head and \ + 'as_two_stage' not in bbox_head, \ + 'The two keyword args `share_pred_layer`, `num_pred_layer`, ' \ + 'and `as_two_stage are set in `detector.__init__()`, users ' \ + 'should not set them in `bbox_head` config.' + # The last prediction layer is used to generate proposal + # from encode feature map when `as_two_stage` is `True`. + # And all the prediction layers should share parameters + # when `with_box_refine` is `True`. + bbox_head['share_pred_layer'] = not with_box_refine + bbox_head['num_pred_layer'] = (decoder['num_layers'] + 1) \ + if self.as_two_stage else decoder['num_layers'] + bbox_head['as_two_stage'] = as_two_stage + + super().__init__(*args, decoder=decoder, bbox_head=bbox_head, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DeformableDetrTransformerEncoder(**self.encoder) + self.decoder = DeformableDetrTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + if not self.as_two_stage: + self.query_embedding = nn.Embedding(self.num_queries, + self.embed_dims * 2) + # NOTE The query_embedding will be split into query and query_pos + # in self.pre_decoder, hence, the embed_dims are doubled. + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + 'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' 
+ + self.level_embed = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + + if self.as_two_stage: + self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims) + self.memory_trans_norm = nn.LayerNorm(self.embed_dims) + self.pos_trans_fc = nn.Linear(self.embed_dims * 2, + self.embed_dims * 2) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) + else: + self.reference_points_fc = nn.Linear(self.embed_dims, 2) + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super().init_weights() + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + if self.as_two_stage: + nn.init.xavier_uniform_(self.memory_trans_fc.weight) + nn.init.xavier_uniform_(self.pos_trans_fc.weight) + else: + xavier_init( + self.reference_points_fc, distribution='uniform', bias=0.) + normal_(self.level_embed) + + def pre_transformer( + self, + mlvl_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None) -> Tuple[Dict]: + """Process image features before feeding them to the transformer. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + mlvl_feats (tuple[Tensor]): Multi-level features that may have + different resolutions, output from neck. Each feature has + shape (bs, dim, h_lvl, w_lvl), where 'lvl' means 'layer'. + batch_data_samples (list[:obj:`DetDataSample`], optional): The + batch data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict]: The first dict contains the inputs of encoder and the + second dict contains the inputs of decoder. + + - encoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_encoder()`, which includes 'feat', 'feat_mask', + and 'feat_pos'. + - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'memory_mask'. + """ + batch_size = mlvl_feats[0].size(0) + + # construct binary masks for the transformer. + assert batch_data_samples is not None + batch_input_shape = batch_data_samples[0].batch_input_shape + input_img_h, input_img_w = batch_input_shape + img_shape_list = [sample.img_shape for sample in batch_data_samples] + same_shape_flag = all([ + s[0] == input_img_h and s[1] == input_img_w for s in img_shape_list + ]) + # support torch2onnx without feeding masks + if torch.onnx.is_in_onnx_export() or same_shape_flag: + mlvl_masks = [] + mlvl_pos_embeds = [] + for feat in mlvl_feats: + mlvl_masks.append(None) + mlvl_pos_embeds.append( + self.positional_encoding(None, input=feat)) + else: + masks = mlvl_feats[0].new_ones( + (batch_size, input_img_h, input_img_w)) + for img_id in range(batch_size): + img_h, img_w = img_shape_list[img_id] + masks[img_id, :img_h, :img_w] = 0 + # NOTE following the official DETR repo, non-zero + # values representing ignored positions, while + # zero values means valid positions. 
+ + mlvl_masks = [] + mlvl_pos_embeds = [] + for feat in mlvl_feats: + mlvl_masks.append( + F.interpolate(masks[None], size=feat.shape[-2:]).to( + torch.bool).squeeze(0)) + mlvl_pos_embeds.append( + self.positional_encoding(mlvl_masks[-1])) + + feat_flatten = [] + lvl_pos_embed_flatten = [] + mask_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + batch_size, c, h, w = feat.shape + spatial_shape = torch._shape_as_tensor(feat)[2:].to(feat.device) + # [bs, c, h_lvl, w_lvl] -> [bs, h_lvl*w_lvl, c] + feat = feat.view(batch_size, c, -1).permute(0, 2, 1) + pos_embed = pos_embed.view(batch_size, c, -1).permute(0, 2, 1) + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + # [bs, h_lvl, w_lvl] -> [bs, h_lvl*w_lvl] + if mask is not None: + mask = mask.flatten(1) + + feat_flatten.append(feat) + lvl_pos_embed_flatten.append(lvl_pos_embed) + mask_flatten.append(mask) + spatial_shapes.append(spatial_shape) + + # (bs, num_feat_points, dim) + feat_flatten = torch.cat(feat_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + # (bs, num_feat_points), where num_feat_points = sum_lvl(h_lvl*w_lvl) + if mask_flatten[0] is not None: + mask_flatten = torch.cat(mask_flatten, 1) + else: + mask_flatten = None + + # (num_level, 2) + spatial_shapes = torch.cat(spatial_shapes).view(-1, 2) + level_start_index = torch.cat(( + spatial_shapes.new_zeros((1, )), # (num_level) + spatial_shapes.prod(1).cumsum(0)[:-1])) + if mlvl_masks[0] is not None: + valid_ratios = torch.stack( # (bs, num_level, 2) + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + else: + valid_ratios = mlvl_feats[0].new_ones(batch_size, len(mlvl_feats), + 2) + + encoder_inputs_dict = dict( + feat=feat_flatten, + feat_mask=mask_flatten, + feat_pos=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios) + decoder_inputs_dict = dict( + memory_mask=mask_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios) + return encoder_inputs_dict, decoder_inputs_dict + + def forward_encoder(self, feat: Tensor, feat_mask: Tensor, + feat_pos: Tensor, spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor) -> Dict: + """Forward with Transformer encoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + feat (Tensor): Sequential features, has shape (bs, num_feat_points, + dim). + feat_mask (Tensor): ByteTensor, the padding mask of the features, + has shape (bs, num_feat_points). + feat_pos (Tensor): The positional embeddings of the features, has + shape (bs, num_feat_points, dim). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + + Returns: + dict: The dictionary of encoder outputs, which includes the + `memory` of the encoder output. 
+ """ + memory = self.encoder( + query=feat, + query_pos=feat_pos, + key_padding_mask=feat_mask, # for self_attn + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios) + encoder_outputs_dict = dict( + memory=memory, + memory_mask=feat_mask, + spatial_shapes=spatial_shapes) + return encoder_outputs_dict + + def pre_decoder(self, memory: Tensor, memory_mask: Tensor, + spatial_shapes: Tensor) -> Tuple[Dict, Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`, and `reference_points`. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). It will only be used when + `as_two_stage` is `True`. + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + It will only be used when `as_two_stage` is `True`. + + Returns: + tuple[dict, dict]: The decoder_inputs_dict and head_inputs_dict. + + - decoder_inputs_dict (dict): The keyword dictionary args of + `self.forward_decoder()`, which includes 'query', 'query_pos', + 'memory', and `reference_points`. The reference_points of + decoder input here are 4D boxes when `as_two_stage` is `True`, + otherwise 2D points, although it has `points` in its name. + The reference_points in encoder is always 2D points. + - head_inputs_dict (dict): The keyword dictionary args of the + bbox_head functions, which includes `enc_outputs_class` and + `enc_outputs_coord`. They are both `None` when 'as_two_stage' + is `False`. The dict is empty when `self.training` is `False`. + """ + batch_size, _, c = memory.shape + if self.as_two_stage: + output_memory, output_proposals = \ + self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + enc_outputs_coord = enc_outputs_coord_unact.sigmoid() + # We only use the first channel in enc_outputs_class as foreground, + # the other (num_classes - 1) channels are actually not used. + # Its targets are set to be 0s, which indicates the first + # class (foreground) because we use [0, num_classes - 1] to + # indicate class labels, background class is indicated by + # num_classes (similar convention in RPN). + # See https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/deformable_detr_head.py#L241 # noqa + # This follows the official implementation of Deformable DETR. 
+ topk_proposals = torch.topk( + enc_outputs_class[..., 0], self.num_queries, dim=1)[1] + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords_unact = topk_coords_unact.detach() + reference_points = topk_coords_unact.sigmoid() + pos_trans_out = self.pos_trans_fc( + self.get_proposal_pos_embed(topk_coords_unact)) + pos_trans_out = self.pos_trans_norm(pos_trans_out) + query_pos, query = torch.split(pos_trans_out, c, dim=2) + else: + enc_outputs_class, enc_outputs_coord = None, None + query_embed = self.query_embedding.weight + query_pos, query = torch.split(query_embed, c, dim=1) + query_pos = query_pos.unsqueeze(0).expand(batch_size, -1, -1) + query = query.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points_fc(query_pos).sigmoid() + + decoder_inputs_dict = dict( + query=query, + query_pos=query_pos, + memory=memory, + reference_points=reference_points) + head_inputs_dict = dict( + enc_outputs_class=enc_outputs_class, + enc_outputs_coord=enc_outputs_coord) if self.training else dict() + return decoder_inputs_dict, head_inputs_dict + + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + memory_mask: Tensor, reference_points: Tensor, + spatial_shapes: Tensor, level_start_index: Tensor, + valid_ratios: Tensor) -> Dict: + """Forward with Transformer decoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h) when `as_two_stage` is `True`, otherwise has + shape (bs, num_queries, 2) with the last dimension arranged as + (cx, cy). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` of the decoder output and `references` including + the initial and intermediate reference_points. 
+ """ + inter_states, inter_references = self.decoder( + query=query, + value=memory, + query_pos=query_pos, + key_padding_mask=memory_mask, # for cross_attn + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=self.bbox_head.reg_branches + if self.with_box_refine else None) + references = [reference_points, *inter_references] + decoder_outputs_dict = dict( + hidden_states=inter_states, references=references) + return decoder_outputs_dict + + @staticmethod + def get_valid_ratio(mask: Tensor) -> Tensor: + """Get the valid radios of feature map in a level. + + .. code:: text + + |---> valid_W <---| + ---+-----------------+-----+--- + A | | | A + | | | | | + | | | | | + valid_H | | | | + | | | | H + | | | | | + V | | | | + ---+-----------------+ | | + | | V + +-----------------------+--- + |---------> W <---------| + + The valid_ratios are defined as: + r_h = valid_H / H, r_w = valid_W / W + They are the factors to re-normalize the relative coordinates of the + image to the relative coordinates of the current level feature map. + + Args: + mask (Tensor): Binary mask of a feature map, has shape (bs, H, W). + + Returns: + Tensor: valid ratios [r_w, r_h] of a feature map, has shape (1, 2). + """ + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def gen_encoder_output_proposals( + self, memory: Tensor, memory_mask: Tensor, + spatial_shapes: Tensor) -> Tuple[Tensor, Tensor]: + """Generate proposals from encoded memory. The function will only be + used when `as_two_stage` is `True`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + + Returns: + tuple: A tuple of transformed memory and proposals. + + - output_memory (Tensor): The transformed memory for obtaining + top-k proposals, has shape (bs, num_feat_points, dim). + - output_proposals (Tensor): The inverse-normalized proposal, has + shape (batch_size, num_keys, 4) with the last dimension arranged + as (cx, cy, w, h). 
+ """ + + bs = memory.size(0) + proposals = [] + _cur = 0 # start index in the sequence of the current level + for lvl, HW in enumerate(spatial_shapes): + H, W = HW + + if memory_mask is not None: + mask_flatten_ = memory_mask[:, _cur:(_cur + H * W)].view( + bs, H, W, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], + 1).unsqueeze(-1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], + 1).unsqueeze(-1) + scale = torch.cat([valid_W, valid_H], 1).view(bs, 1, 1, 2) + else: + if not isinstance(HW, torch.Tensor): + HW = memory.new_tensor(HW) + scale = HW.unsqueeze(0).flip(dims=[0, 1]).view(1, 1, 1, 2) + grid_y, grid_x = torch.meshgrid( + torch.linspace( + 0, H - 1, H, dtype=torch.float32, device=memory.device), + torch.linspace( + 0, W - 1, W, dtype=torch.float32, device=memory.device)) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + grid = (grid.unsqueeze(0).expand(bs, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + proposal = torch.cat((grid, wh), -1).view(bs, -1, 4) + proposals.append(proposal) + _cur += (H * W) + output_proposals = torch.cat(proposals, 1) + # do not use `all` to make it exportable to onnx + output_proposals_valid = ( + (output_proposals > 0.01) & (output_proposals < 0.99)).sum( + -1, keepdim=True) == output_proposals.shape[-1] + # inverse_sigmoid + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + if memory_mask is not None: + output_proposals = output_proposals.masked_fill( + memory_mask.unsqueeze(-1), float('inf')) + output_proposals = output_proposals.masked_fill( + ~output_proposals_valid, float('inf')) + + output_memory = memory + if memory_mask is not None: + output_memory = output_memory.masked_fill( + memory_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, + float(0)) + output_memory = self.memory_trans_fc(output_memory) + output_memory = self.memory_trans_norm(output_memory) + # [bs, sum(hw), 2] + return output_memory, output_proposals + + @staticmethod + def get_proposal_pos_embed(proposals: Tensor, + num_pos_feats: int = 128, + temperature: int = 10000) -> Tensor: + """Get the position embedding of the proposal. + + Args: + proposals (Tensor): Not normalized proposals, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + num_pos_feats (int, optional): The feature dimension for each + position along x, y, w, and h-axis. Note the final returned + dimension for each position is 4 times of num_pos_feats. + Default to 128. + temperature (int, optional): The temperature used for scaling the + position embedding. Defaults to 10000. + + Returns: + Tensor: The position embedding of proposal, has shape + (bs, num_queries, num_pos_feats * 4), with the last dimension + arranged as (cx, cy, w, h) + """ + scale = 2 * math.pi + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) + # N, L, 4 + proposals = proposals.sigmoid() * scale + # N, L, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 4, 64, 2 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), + dim=4).flatten(2) + return pos diff --git a/mmdetection/mmdet/models/detectors/detr.py b/mmdetection/mmdet/models/detectors/detr.py new file mode 100644 index 0000000..7895e9e --- /dev/null +++ b/mmdetection/mmdet/models/detectors/detr.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList +from ..layers import (DetrTransformerDecoder, DetrTransformerEncoder, + SinePositionalEncoding) +from .base_detr import DetectionTransformer + + +@MODELS.register_module() +class DETR(DetectionTransformer): + r"""Implementation of `DETR: End-to-End Object Detection with Transformers. + + `_. + + Code is modified from the `official github repo + `_. + """ + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DetrTransformerEncoder(**self.encoder) + self.decoder = DetrTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + # NOTE The embed_dims is typically passed from the inside out. + # For example in DETR, The embed_dims is passed as + # self_attn -> the first encoder layer -> encoder -> detector. + self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + 'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super().init_weights() + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def pre_transformer( + self, + img_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None) -> Tuple[Dict, Dict]: + """Prepare the inputs of the Transformer. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + img_feats (Tuple[Tensor]): Tuple of features output from the neck, + has shape (bs, c, h, w). + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such as + `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict, dict]: The first dict contains the inputs of encoder + and the second dict contains the inputs of decoder. + + - encoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_encoder()`, which includes 'feat', 'feat_mask', + and 'feat_pos'. + - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'memory_mask', + and 'memory_pos'. + """ + + feat = img_feats[-1] # NOTE img_feats contains only one feature. + batch_size, feat_dim, _, _ = feat.shape + # construct binary masks which for the transformer. 
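+        # Illustrative sketch (toy sizes, not part of the original code):
+        # the padding mask marks padded pixels with 1 and valid pixels
+        # with 0, and is then resized to the feature-map resolution, e.g.
+        # for a 64x64 padded canvas with a 48x32 valid region and an 8x8
+        # feature map:
+        #   >>> import torch
+        #   >>> import torch.nn.functional as F
+        #   >>> masks = torch.ones(1, 64, 64)
+        #   >>> masks[0, :48, :32] = 0
+        #   >>> feat_mask = F.interpolate(
+        #   ...     masks.unsqueeze(1), size=(8, 8)).to(torch.bool).squeeze(1)
+        #   >>> feat_mask.shape
+        #   torch.Size([1, 8, 8])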
+ assert batch_data_samples is not None + batch_input_shape = batch_data_samples[0].batch_input_shape + input_img_h, input_img_w = batch_input_shape + img_shape_list = [sample.img_shape for sample in batch_data_samples] + same_shape_flag = all([ + s[0] == input_img_h and s[1] == input_img_w for s in img_shape_list + ]) + if torch.onnx.is_in_onnx_export() or same_shape_flag: + masks = None + # [batch_size, embed_dim, h, w] + pos_embed = self.positional_encoding(masks, input=feat) + else: + masks = feat.new_ones((batch_size, input_img_h, input_img_w)) + for img_id in range(batch_size): + img_h, img_w = img_shape_list[img_id] + masks[img_id, :img_h, :img_w] = 0 + # NOTE following the official DETR repo, non-zero values represent + # ignored positions, while zero values mean valid positions. + + masks = F.interpolate( + masks.unsqueeze(1), + size=feat.shape[-2:]).to(torch.bool).squeeze(1) + # [batch_size, embed_dim, h, w] + pos_embed = self.positional_encoding(masks) + + # use `view` instead of `flatten` for dynamically exporting to ONNX + # [bs, c, h, w] -> [bs, h*w, c] + feat = feat.view(batch_size, feat_dim, -1).permute(0, 2, 1) + pos_embed = pos_embed.view(batch_size, feat_dim, -1).permute(0, 2, 1) + # [bs, h, w] -> [bs, h*w] + if masks is not None: + masks = masks.view(batch_size, -1) + + # prepare transformer_inputs_dict + encoder_inputs_dict = dict( + feat=feat, feat_mask=masks, feat_pos=pos_embed) + decoder_inputs_dict = dict(memory_mask=masks, memory_pos=pos_embed) + return encoder_inputs_dict, decoder_inputs_dict + + def forward_encoder(self, feat: Tensor, feat_mask: Tensor, + feat_pos: Tensor) -> Dict: + """Forward with Transformer encoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + feat (Tensor): Sequential features, has shape (bs, num_feat_points, + dim). + feat_mask (Tensor): ByteTensor, the padding mask of the features, + has shape (bs, num_feat_points). + feat_pos (Tensor): The positional embeddings of the features, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of encoder outputs, which includes the + `memory` of the encoder output. + """ + memory = self.encoder( + query=feat, query_pos=feat_pos, + key_padding_mask=feat_mask) # for self_attn + encoder_outputs_dict = dict(memory=memory) + return encoder_outputs_dict + + def pre_decoder(self, memory: Tensor) -> Tuple[Dict, Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + + Returns: + tuple[dict, dict]: The first dict contains the inputs of decoder + and the second dict contains the inputs of the bbox_head function. + + - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'query', 'query_pos', + 'memory'. + - head_inputs_dict (dict): The keyword args dictionary of the + bbox_head functions, which is usually empty, or includes + `enc_outputs_class` and `enc_outputs_class` when the detector + support 'two stage' or 'query selection' strategies. 
+ """ + + batch_size = memory.size(0) # (bs, num_feat_points, dim) + query_pos = self.query_embedding.weight + # (num_queries, dim) -> (bs, num_queries, dim) + query_pos = query_pos.unsqueeze(0).repeat(batch_size, 1, 1) + query = torch.zeros_like(query_pos) + + decoder_inputs_dict = dict( + query_pos=query_pos, query=query, memory=memory) + head_inputs_dict = dict() + return decoder_inputs_dict, head_inputs_dict + + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + memory_mask: Tensor, memory_pos: Tensor) -> Dict: + """Forward with Transformer decoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + memory_pos (Tensor): The positional embeddings of memory, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` of the decoder output. + + - hidden_states (Tensor): Has shape + (num_decoder_layers, bs, num_queries, dim) + """ + + hidden_states = self.decoder( + query=query, + key=memory, + value=memory, + query_pos=query_pos, + key_pos=memory_pos, + key_padding_mask=memory_mask) # for cross_attn + + head_inputs_dict = dict(hidden_states=hidden_states) + return head_inputs_dict diff --git a/mmdetection/mmdet/models/detectors/dino.py b/mmdetection/mmdet/models/detectors/dino.py new file mode 100644 index 0000000..ade47f5 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/dino.py @@ -0,0 +1,287 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Tuple + +import torch +from torch import Tensor, nn +from torch.nn.init import normal_ + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList +from mmdet.utils import OptConfigType +from ..layers import (CdnQueryGenerator, DeformableDetrTransformerEncoder, + DinoTransformerDecoder, SinePositionalEncoding) +from .deformable_detr import DeformableDETR, MultiScaleDeformableAttention + + +@MODELS.register_module() +class DINO(DeformableDETR): + r"""Implementation of `DINO: DETR with Improved DeNoising Anchor Boxes + for End-to-End Object Detection `_ + + Code is modified from the `official github repo + `_. + + Args: + dn_cfg (:obj:`ConfigDict` or dict, optional): Config of denoising + query generator. Defaults to `None`. + """ + + def __init__(self, *args, dn_cfg: OptConfigType = None, **kwargs) -> None: + super().__init__(*args, **kwargs) + assert self.as_two_stage, 'as_two_stage must be True for DINO' + assert self.with_box_refine, 'with_box_refine must be True for DINO' + + if dn_cfg is not None: + assert 'num_classes' not in dn_cfg and \ + 'num_queries' not in dn_cfg and \ + 'hidden_dim' not in dn_cfg, \ + 'The three keyword args `num_classes`, `embed_dims`, and ' \ + '`num_matching_queries` are set in `detector.__init__()`, ' \ + 'users should not set them in `dn_cfg` config.' 
+ dn_cfg['num_classes'] = self.bbox_head.num_classes + dn_cfg['embed_dims'] = self.embed_dims + dn_cfg['num_matching_queries'] = self.num_queries + self.dn_query_generator = CdnQueryGenerator(**dn_cfg) + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DeformableDetrTransformerEncoder(**self.encoder) + self.decoder = DinoTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) + # NOTE In DINO, the query_embedding only contains content + # queries, while in Deformable DETR, the query_embedding + # contains both content and spatial queries, and in DETR, + # it only contains spatial queries. + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + f'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + self.level_embed = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims) + self.memory_trans_norm = nn.LayerNorm(self.embed_dims) + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super(DeformableDETR, self).init_weights() + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + nn.init.xavier_uniform_(self.memory_trans_fc.weight) + nn.init.xavier_uniform_(self.query_embedding.weight) + normal_(self.level_embed) + + def forward_transformer( + self, + img_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None, + ) -> Dict: + """Forward process of Transformer. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + The difference is that the ground truth in `batch_data_samples` is + required for the `pre_decoder` to prepare the query of DINO. + Additionally, DINO inherits the `pre_transformer` method and the + `forward_encoder` method of DeformableDETR. More details about the + two methods can be found in `mmdet/detector/deformable_detr.py`. + + Args: + img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each + feature map has shape (bs, dim, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + dict: The dictionary of bbox_head function inputs, which always + includes the `hidden_states` of the decoder output and may contain + `references` including the initial and intermediate references. 
+ """ + encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( + img_feats, batch_data_samples) + + encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict) + + tmp_dec_in, head_inputs_dict = self.pre_decoder( + **encoder_outputs_dict, batch_data_samples=batch_data_samples) + decoder_inputs_dict.update(tmp_dec_in) + + decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) + head_inputs_dict.update(decoder_outputs_dict) + return head_inputs_dict + + def pre_decoder( + self, + memory: Tensor, + memory_mask: Tensor, + spatial_shapes: Tensor, + batch_data_samples: OptSampleList = None, + ) -> Tuple[Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`, and `reference_points`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). Will only be used when + `as_two_stage` is `True`. + spatial_shapes (Tensor): Spatial shapes of features in all levels. + With shape (num_levels, 2), last dimension represents (h, w). + Will only be used when `as_two_stage` is `True`. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict]: The decoder_inputs_dict and head_inputs_dict. + + - decoder_inputs_dict (dict): The keyword dictionary args of + `self.forward_decoder()`, which includes 'query', 'memory', + `reference_points`, and `dn_mask`. The reference points of + decoder input here are 4D boxes, although it has `points` + in its name. + - head_inputs_dict (dict): The keyword dictionary args of the + bbox_head functions, which includes `topk_score`, `topk_coords`, + and `dn_meta` when `self.training` is `True`, else is empty. + """ + bs, _, c = memory.shape + cls_out_features = self.bbox_head.cls_branches[ + self.decoder.num_layers].out_features + + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + # NOTE The DINO selects top-k proposals according to scores of + # multi-class classification, while DeformDETR, where the input + # is `enc_outputs_class[..., 0]` selects according to scores of + # binary classification. 
+ topk_indices = torch.topk( + enc_outputs_class.max(-1)[0], k=self.num_queries, dim=1)[1] + topk_score = torch.gather( + enc_outputs_class, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords = topk_coords_unact.sigmoid() + topk_coords_unact = topk_coords_unact.detach() + + query = self.query_embedding.weight[:, None, :] + query = query.repeat(1, bs, 1).transpose(0, 1) + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = \ + self.dn_query_generator(batch_data_samples) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], + dim=1) + else: + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + reference_points = reference_points.sigmoid() + + decoder_inputs_dict = dict( + query=query, + memory=memory, + reference_points=reference_points, + dn_mask=dn_mask) + # NOTE DINO calculates encoder losses on scores and coordinates + # of selected top-k encoder queries, while DeformDETR is of all + # encoder queries. + head_inputs_dict = dict( + enc_outputs_class=topk_score, + enc_outputs_coord=topk_coords, + dn_meta=dn_meta) if self.training else dict() + return decoder_inputs_dict, head_inputs_dict + + def forward_decoder(self, + query: Tensor, + memory: Tensor, + memory_mask: Tensor, + reference_points: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + dn_mask: Optional[Tensor] = None, + **kwargs) -> Dict: + """Forward with Transformer decoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries_total, dim), where `num_queries_total` is the + sum of `num_denoising_queries` and `num_matching_queries` when + `self.training` is `True`, else `num_matching_queries`. + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + reference_points (Tensor): The initial reference, has shape + (bs, num_queries_total, 4) with the last dimension arranged as + (cx, cy, w, h). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + dn_mask (Tensor, optional): The attention mask to prevent + information leakage from different denoising groups and + matching parts, will be used as `self_attn_mask` of the + `self.decoder`, has shape (num_queries_total, + num_queries_total). + It is `None` when `self.training` is `False`. + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` of the decoder output and `references` including + the initial and intermediate reference_points. 
+ """ + inter_states, references = self.decoder( + query=query, + value=memory, + key_padding_mask=memory_mask, + self_attn_mask=dn_mask, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=self.bbox_head.reg_branches, + **kwargs) + + if len(query) == self.num_queries: + # NOTE: This is to make sure label_embeding can be involved to + # produce loss even if there is no denoising query (no ground truth + # target in this GPU), otherwise, this will raise runtime error in + # distributed training. + inter_states[0] += \ + self.dn_query_generator.label_embedding.weight[0, 0] * 0.0 + + decoder_outputs_dict = dict( + hidden_states=inter_states, references=list(references)) + return decoder_outputs_dict diff --git a/mmdetection/mmdet/models/detectors/fast_rcnn.py b/mmdetection/mmdet/models/detectors/fast_rcnn.py new file mode 100644 index 0000000..5b39050 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/fast_rcnn.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class FastRCNN(TwoStageDetector): + """Implementation of `Fast R-CNN `_""" + + def __init__(self, + backbone: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + data_preprocessor=data_preprocessor) diff --git a/mmdetection/mmdet/models/detectors/faster_rcnn.py b/mmdetection/mmdet/models/detectors/faster_rcnn.py new file mode 100644 index 0000000..36109e3 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/faster_rcnn.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class FasterRCNN(TwoStageDetector): + """Implementation of `Faster R-CNN `_""" + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + data_preprocessor=data_preprocessor) diff --git a/mmdetection/mmdet/models/detectors/fcos.py b/mmdetection/mmdet/models/detectors/fcos.py new file mode 100644 index 0000000..c628059 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/fcos.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class FCOS(SingleStageDetector): + """Implementation of `FCOS `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. 
+ train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of FCOS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of FCOS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/fovea.py b/mmdetection/mmdet/models/detectors/fovea.py new file mode 100644 index 0000000..5e4f21c --- /dev/null +++ b/mmdetection/mmdet/models/detectors/fovea.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class FOVEA(SingleStageDetector): + """Implementation of `FoveaBox `_ + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of FOVEA. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of FOVEA. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/fsaf.py b/mmdetection/mmdet/models/detectors/fsaf.py new file mode 100644 index 0000000..01b4027 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/fsaf.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
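+# Illustrative sketch (config values are placeholders, not a tested
+# recipe): single-stage wrappers such as FSAF only forward their config to
+# SingleStageDetector, so they are normally built through the registry:
+#   from mmdet.registry import MODELS
+#   cfg = dict(
+#       type='FSAF',
+#       backbone=dict(type='ResNet', depth=50, ...),
+#       neck=dict(type='FPN', ...),
+#       bbox_head=dict(type='FSAFHead', ...),
+#       train_cfg=None,
+#       test_cfg=None)
+#   detector = MODELS.build(cfg)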
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class FSAF(SingleStageDetector): + """Implementation of `FSAF `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/gfl.py b/mmdetection/mmdet/models/detectors/gfl.py new file mode 100644 index 0000000..c26821a --- /dev/null +++ b/mmdetection/mmdet/models/detectors/gfl.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class GFL(SingleStageDetector): + """Implementation of `GFL `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of GFL. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of GFL. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/glip.py b/mmdetection/mmdet/models/detectors/glip.py new file mode 100644 index 0000000..e076a55 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/glip.py @@ -0,0 +1,403 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import re +import warnings +from typing import Tuple, Union + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +def find_noun_phrases(caption: str) -> list: + """Find noun phrases in a caption using nltk. + Args: + caption (str): The caption to analyze. + + Returns: + list: List of noun phrases found in the caption. + + Examples: + >>> caption = 'There is two cat and a remote in the picture' + >>> find_noun_phrases(caption) # ['cat', 'a remote', 'the picture'] + """ + try: + import nltk + nltk.download('punkt') + nltk.download('averaged_perceptron_tagger') + except ImportError: + raise RuntimeError('nltk is not installed, please install it by: ' + 'pip install nltk.') + + caption = caption.lower() + tokens = nltk.word_tokenize(caption) + pos_tags = nltk.pos_tag(tokens) + + grammar = 'NP: {
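+    # The NP chunk grammar matches an optional determiner, any number of
+    # adjectives and one or more nouns; in the reference GLIP code it
+    # appears as 'NP: {<DT>?<JJ.*>*<NN.*>+}'.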
    ?*+}' + cp = nltk.RegexpParser(grammar) + result = cp.parse(pos_tags) + + noun_phrases = [] + for subtree in result.subtrees(): + if subtree.label() == 'NP': + noun_phrases.append(' '.join(t[0] for t in subtree.leaves())) + + return noun_phrases + + +def remove_punctuation(text: str) -> str: + """Remove punctuation from a text. + Args: + text (str): The input text. + + Returns: + str: The text with punctuation removed. + """ + punctuation = [ + '|', ':', ';', '@', '(', ')', '[', ']', '{', '}', '^', '\'', '\"', '’', + '`', '?', '$', '%', '#', '!', '&', '*', '+', ',', '.' + ] + for p in punctuation: + text = text.replace(p, '') + return text.strip() + + +def run_ner(caption: str) -> Tuple[list, list]: + """Run NER on a caption and return the tokens and noun phrases. + Args: + caption (str): The input caption. + + Returns: + Tuple[List, List]: A tuple containing the tokens and noun phrases. + - tokens_positive (List): A list of token positions. + - noun_phrases (List): A list of noun phrases. + """ + noun_phrases = find_noun_phrases(caption) + noun_phrases = [remove_punctuation(phrase) for phrase in noun_phrases] + noun_phrases = [phrase for phrase in noun_phrases if phrase != ''] + relevant_phrases = noun_phrases + labels = noun_phrases + + tokens_positive = [] + for entity, label in zip(relevant_phrases, labels): + try: + # search all occurrences and mark them as different entities + # TODO: Not Robust + for m in re.finditer(entity, caption.lower()): + tokens_positive.append([[m.start(), m.end()]]) + except Exception: + print('noun entities:', noun_phrases) + print('entity:', entity) + print('caption:', caption.lower()) + return tokens_positive, noun_phrases + + +def create_positive_map(tokenized, + tokens_positive: list, + max_num_entities: int = 256) -> Tensor: + """construct a map such that positive_map[i,j] = True + if box i is associated to token j + + Args: + tokenized: The tokenized input. + tokens_positive (list): A list of token ranges + associated with positive boxes. + max_num_entities (int, optional): The maximum number of entities. + Defaults to 256. + + Returns: + torch.Tensor: The positive map. + + Raises: + Exception: If an error occurs during token-to-char mapping. + """ + positive_map = torch.zeros((len(tokens_positive), max_num_entities), + dtype=torch.float) + + for j, tok_list in enumerate(tokens_positive): + for (beg, end) in tok_list: + try: + beg_pos = tokenized.char_to_token(beg) + end_pos = tokenized.char_to_token(end - 1) + except Exception as e: + print('beg:', beg, 'end:', end) + print('token_positive:', tokens_positive) + raise e + if beg_pos is None: + try: + beg_pos = tokenized.char_to_token(beg + 1) + if beg_pos is None: + beg_pos = tokenized.char_to_token(beg + 2) + except Exception: + beg_pos = None + if end_pos is None: + try: + end_pos = tokenized.char_to_token(end - 2) + if end_pos is None: + end_pos = tokenized.char_to_token(end - 3) + except Exception: + end_pos = None + if beg_pos is None or end_pos is None: + continue + + assert beg_pos is not None and end_pos is not None + positive_map[j, beg_pos:end_pos + 1].fill_(1) + return positive_map / (positive_map.sum(-1)[:, None] + 1e-6) + + +def create_positive_map_label_to_token(positive_map: Tensor, + plus: int = 0) -> dict: + """Create a dictionary mapping the label to the token. + Args: + positive_map (Tensor): The positive map tensor. + plus (int, optional): Value added to the label for indexing. + Defaults to 0. + + Returns: + dict: The dictionary mapping the label to the token. 
+ """ + positive_map_label_to_token = {} + for i in range(len(positive_map)): + positive_map_label_to_token[i + plus] = torch.nonzero( + positive_map[i], as_tuple=True)[0].tolist() + return positive_map_label_to_token + + +@MODELS.register_module() +class GLIP(SingleStageDetector): + """Implementation of `GLIP `_ + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + language_model (:obj:`ConfigDict` or dict): The language model config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of GLIP. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of GLIP. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + language_model: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + self.language_model = MODELS.build(language_model) + + self._special_tokens = '. ' + + def get_tokens_and_prompts( + self, + original_caption: Union[str, list, tuple], + custom_entities: bool = False) -> Tuple[dict, str, list, list]: + """Get the tokens positive and prompts for the caption.""" + if isinstance(original_caption, (list, tuple)) or custom_entities: + if custom_entities and isinstance(original_caption, str): + original_caption = original_caption.strip(self._special_tokens) + original_caption = original_caption.split(self._special_tokens) + original_caption = list( + filter(lambda x: len(x) > 0, original_caption)) + + caption_string = '' + tokens_positive = [] + for idx, word in enumerate(original_caption): + tokens_positive.append( + [[len(caption_string), + len(caption_string) + len(word)]]) + caption_string += word + if idx != len(original_caption) - 1: + caption_string += self._special_tokens + tokenized = self.language_model.tokenizer([caption_string], + return_tensors='pt') + entities = original_caption + else: + original_caption = original_caption.strip(self._special_tokens) + tokenized = self.language_model.tokenizer([original_caption], + return_tensors='pt') + tokens_positive, noun_phrases = run_ner(original_caption) + entities = noun_phrases + caption_string = original_caption + + return tokenized, caption_string, tokens_positive, entities + + def get_positive_map(self, tokenized, tokens_positive): + positive_map = create_positive_map(tokenized, tokens_positive) + positive_map_label_to_token = create_positive_map_label_to_token( + positive_map, plus=1) + return positive_map_label_to_token, positive_map + + def get_tokens_positive_and_prompts( + self, + original_caption: Union[str, list, tuple], + custom_entities: bool = False) -> Tuple[dict, str, Tensor, list]: + tokenized, caption_string, tokens_positive, entities = \ + self.get_tokens_and_prompts( + original_caption, custom_entities) + positive_map_label_to_token, positive_map = self.get_positive_map( + tokenized, 
tokens_positive) + return positive_map_label_to_token, caption_string, \ + positive_map, entities + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + # TODO: Only open vocabulary tasks are supported for training now. + text_prompts = [ + data_samples.text for data_samples in batch_data_samples + ] + + gt_labels = [ + data_samples.gt_instances.labels + for data_samples in batch_data_samples + ] + + new_text_prompts = [] + positive_maps = [] + if len(set(text_prompts)) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. + tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompts[0], True) + new_text_prompts = [caption_string] * len(batch_inputs) + for gt_label in gt_labels: + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + else: + for text_prompt, gt_label in zip(text_prompts, gt_labels): + tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompt, True) + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + new_text_prompts.append(caption_string) + + language_dict_features = self.language_model(new_text_prompts) + for i, data_samples in enumerate(batch_data_samples): + # .bool().float() is very important + positive_map = positive_maps[i].to( + batch_inputs.device).bool().float() + data_samples.gt_instances.positive_maps = positive_map + + visual_features = self.extract_feat(batch_inputs) + + losses = self.bbox_head.loss(visual_features, language_dict_features, + batch_data_samples) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - label_names (List[str]): Label names of bboxes. + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + text_prompts = [ + data_samples.text for data_samples in batch_data_samples + ] + + if 'custom_entities' in batch_data_samples[0]: + # Assuming that the `custom_entities` flag + # inside a batch is always the same. For single image inference + custom_entities = batch_data_samples[0].custom_entities + else: + custom_entities = False + + if len(set(text_prompts)) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. 
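+            # Hand-worked example (hypothetical inputs): with
+            # custom_entities=True and text ['person', 'dog'],
+            # `get_tokens_and_prompts` produces
+            #   caption_string  -> 'person. dog'
+            #   tokens_positive -> [[[0, 6]], [[8, 11]]]
+            # and `get_positive_map` then maps each entity id (numbered
+            # from 1) to the token positions of its span in the tokenized
+            # caption.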
+ _positive_maps_and_prompts = [ + self.get_tokens_positive_and_prompts(text_prompts[0], + custom_entities) + ] * len(batch_inputs) + else: + _positive_maps_and_prompts = [ + self.get_tokens_positive_and_prompts(text_prompt, + custom_entities) + for text_prompt in text_prompts + ] + + token_positive_maps, text_prompts, _, entities = zip( + *_positive_maps_and_prompts) + + language_dict_features = self.language_model(list(text_prompts)) + + for i, data_samples in enumerate(batch_data_samples): + data_samples.token_positive_map = token_positive_maps[i] + + visual_features = self.extract_feat(batch_inputs) + + results_list = self.bbox_head.predict( + visual_features, + language_dict_features, + batch_data_samples, + rescale=rescale) + + for data_sample, pred_instances, entity in zip(batch_data_samples, + results_list, entities): + if len(pred_instances) > 0: + label_names = [] + for labels in pred_instances.labels: + if labels >= len(entity): + warnings.warn( + 'The unexpected output indicates an issue with ' + 'named entity recognition. You can try ' + 'setting custom_entities=True and running ' + 'again to see if it helps.') + label_names.append('unobject') + else: + label_names.append(entity[labels]) + # for visualization + pred_instances.label_names = label_names + data_sample.pred_instances = pred_instances + return batch_data_samples diff --git a/mmdetection/mmdet/models/detectors/grid_rcnn.py b/mmdetection/mmdet/models/detectors/grid_rcnn.py new file mode 100644 index 0000000..7bcb5b0 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/grid_rcnn.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class GridRCNN(TwoStageDetector): + """Grid R-CNN. + + This detector is the implementation of: + - Grid R-CNN (https://arxiv.org/abs/1811.12030) + - Grid R-CNN Plus: Faster and Better (https://arxiv.org/abs/1906.05688) + """ + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/grounding_dino.py b/mmdetection/mmdet/models/detectors/grounding_dino.py new file mode 100644 index 0000000..69d398b --- /dev/null +++ b/mmdetection/mmdet/models/detectors/grounding_dino.py @@ -0,0 +1,384 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Dict, Tuple, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from ..layers import SinePositionalEncoding +from ..layers.transformer.grounding_dino_layers import ( + GroundingDinoTransformerDecoder, GroundingDinoTransformerEncoder) +from .dino import DINO +from .glip import (create_positive_map, create_positive_map_label_to_token, + run_ner) + + +@MODELS.register_module() +class GroundingDINO(DINO): + """Implementation of `Grounding DINO: Marrying DINO with Grounded Pre- + Training for Open-Set Object Detection. + + `_ + + Code is modified from the `official github repo + `_. 
+ """ + + def __init__(self, language_model, *args, **kwargs) -> None: + + self.language_model_cfg = language_model + self._special_tokens = '. ' + super().__init__(*args, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = GroundingDinoTransformerEncoder(**self.encoder) + self.decoder = GroundingDinoTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + f'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + self.level_embed = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims) + self.memory_trans_norm = nn.LayerNorm(self.embed_dims) + + # text modules + self.language_model = MODELS.build(self.language_model_cfg) + self.text_feat_map = nn.Linear( + self.language_model.language_backbone.body.language_dim, + self.embed_dims, + bias=True) + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super().init_weights() + nn.init.constant_(self.text_feat_map.bias.data, 0) + nn.init.xavier_uniform_(self.text_feat_map.weight.data) + + def get_tokens_and_prompts( + self, + original_caption: Union[str, list, tuple], + custom_entities: bool = False) -> Tuple[dict, str, list]: + """Get the tokens positive and prompts for the caption.""" + if isinstance(original_caption, (list, tuple)) or custom_entities: + if custom_entities and isinstance(original_caption, str): + original_caption = original_caption.strip(self._special_tokens) + original_caption = original_caption.split(self._special_tokens) + original_caption = list( + filter(lambda x: len(x) > 0, original_caption)) + + caption_string = '' + tokens_positive = [] + for idx, word in enumerate(original_caption): + tokens_positive.append( + [[len(caption_string), + len(caption_string) + len(word)]]) + caption_string += word + caption_string += self._special_tokens + # NOTE: Tokenizer in Grounding DINO is different from + # that in GLIP. The tokenizer in GLIP will pad the + # caption_string to max_length, while the tokenizer + # in Grounding DINO will not. + tokenized = self.language_model.tokenizer( + [caption_string], + padding='max_length' + if self.language_model.pad_to_max else 'longest', + return_tensors='pt') + entities = original_caption + else: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + # NOTE: Tokenizer in Grounding DINO is different from + # that in GLIP. The tokenizer in GLIP will pad the + # caption_string to max_length, while the tokenizer + # in Grounding DINO will not. 
+ tokenized = self.language_model.tokenizer( + [original_caption], + padding='max_length' + if self.language_model.pad_to_max else 'longest', + return_tensors='pt') + tokens_positive, noun_phrases = run_ner(original_caption) + entities = noun_phrases + caption_string = original_caption + + return tokenized, caption_string, tokens_positive, entities + + def get_positive_map(self, tokenized, tokens_positive): + positive_map = create_positive_map(tokenized, tokens_positive) + positive_map_label_to_token = create_positive_map_label_to_token( + positive_map, plus=1) + return positive_map_label_to_token, positive_map + + def get_tokens_positive_and_prompts( + self, + original_caption: Union[str, list, tuple], + custom_entities: bool = False) -> Tuple[dict, str, Tensor, list]: + """Get the tokens positive and prompts for the caption. + + Args: + original_caption (str): The original caption, e.g. 'bench . car .' + custom_entities (bool, optional): Whether to use custom entities. + If ``True``, the ``original_caption`` should be a list of + strings, each of which is a word. Defaults to False. + + Returns: + Tuple[dict, str, dict, str]: The dict is a mapping from each entity + id, which is numbered from 1, to its positive token id. + The str represents the prompts. + """ + tokenized, caption_string, tokens_positive, entities = \ + self.get_tokens_and_prompts( + original_caption, custom_entities) + positive_map_label_to_token, positive_map = self.get_positive_map( + tokenized, tokens_positive) + return positive_map_label_to_token, caption_string, \ + positive_map, entities + + def forward_transformer( + self, + img_feats: Tuple[Tensor], + text_dict: Dict, + batch_data_samples: OptSampleList = None, + ) -> Dict: + encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( + img_feats, batch_data_samples) + + encoder_outputs_dict = self.forward_encoder( + **encoder_inputs_dict, text_dict=text_dict) + + tmp_dec_in, head_inputs_dict = self.pre_decoder( + **encoder_outputs_dict, batch_data_samples=batch_data_samples) + decoder_inputs_dict.update(tmp_dec_in) + + decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) + head_inputs_dict.update(decoder_outputs_dict) + return head_inputs_dict + + def forward_encoder(self, feat: Tensor, feat_mask: Tensor, + feat_pos: Tensor, spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios: Tensor, + text_dict: Dict) -> Dict: + text_token_mask = text_dict['text_token_mask'] + memory, memory_text = self.encoder( + query=feat, + query_pos=feat_pos, + key_padding_mask=feat_mask, # for self_attn + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + # for text encoder + memory_text=text_dict['embedded'], + text_attention_mask=~text_token_mask, + position_ids=text_dict['position_ids'], + text_self_attention_masks=text_dict['masks']) + encoder_outputs_dict = dict( + memory=memory, + memory_mask=feat_mask, + spatial_shapes=spatial_shapes, + memory_text=memory_text, + text_token_mask=text_token_mask) + return encoder_outputs_dict + + def pre_decoder( + self, + memory: Tensor, + memory_mask: Tensor, + spatial_shapes: Tensor, + memory_text: Tensor, + text_token_mask: Tensor, + batch_data_samples: OptSampleList = None, + ) -> Tuple[Dict]: + bs, _, c = memory.shape + + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers](output_memory, memory_text, + text_token_mask) + 
cls_out_features = self.bbox_head.cls_branches[ + self.decoder.num_layers].max_text_len + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + # NOTE The DINO selects top-k proposals according to scores of + # multi-class classification, while DeformDETR, where the input + # is `enc_outputs_class[..., 0]` selects according to scores of + # binary classification. + topk_indices = torch.topk( + enc_outputs_class.max(-1)[0], k=self.num_queries, dim=1)[1] + + topk_score = torch.gather( + enc_outputs_class, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords = topk_coords_unact.sigmoid() + topk_coords_unact = topk_coords_unact.detach() + + query = self.query_embedding.weight[:, None, :] + query = query.repeat(1, bs, 1).transpose(0, 1) + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = \ + self.dn_query_generator(batch_data_samples) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], + dim=1) + else: + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + reference_points = reference_points.sigmoid() + + decoder_inputs_dict = dict( + query=query, + memory=memory, + reference_points=reference_points, + dn_mask=dn_mask, + memory_text=memory_text, + text_attention_mask=~text_token_mask, + ) + # NOTE DINO calculates encoder losses on scores and coordinates + # of selected top-k encoder queries, while DeformDETR is of all + # encoder queries. + head_inputs_dict = dict( + enc_outputs_class=topk_score, + enc_outputs_coord=topk_coords, + dn_meta=dn_meta) if self.training else dict() + # append text_feats to head_inputs_dict + head_inputs_dict['memory_text'] = memory_text + head_inputs_dict['text_token_mask'] = text_token_mask + return decoder_inputs_dict, head_inputs_dict + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + # TODO: Only open vocabulary tasks are supported for training now. + text_prompts = [ + data_samples.text for data_samples in batch_data_samples + ] + + gt_labels = [ + data_samples.gt_instances.labels + for data_samples in batch_data_samples + ] + + new_text_prompts = [] + positive_maps = [] + if len(set(text_prompts)) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. 
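+            # Hand-worked example (hypothetical inputs): with the shared
+            # caption built from ['person', 'dog'] and
+            # gt_label = tensor([1, 0, 0]) for one image, the per-label
+            # lookup below gives
+            #   new_tokens_positive = [[[8, 11]], [[0, 6]], [[0, 6]]]
+            # i.e. one character span per ground-truth box, which
+            # `get_positive_map` turns into a (num_gt, 256) soft assignment
+            # over token positions.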
+ tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompts[0], True) + new_text_prompts = [caption_string] * len(batch_inputs) + for gt_label in gt_labels: + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + else: + for text_prompt, gt_label in zip(text_prompts, gt_labels): + tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompt, True) + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + new_text_prompts.append(caption_string) + + text_dict = self.language_model(new_text_prompts) + if self.text_feat_map is not None: + text_dict['embedded'] = self.text_feat_map(text_dict['embedded']) + + for i, data_samples in enumerate(batch_data_samples): + positive_map = positive_maps[i].to( + batch_inputs.device).bool().float() + text_token_mask = text_dict['text_token_mask'][i] + data_samples.gt_instances.positive_maps = positive_map + data_samples.gt_instances.text_token_mask = \ + text_token_mask.unsqueeze(0).repeat( + len(positive_map), 1) + + visual_features = self.extract_feat(batch_inputs) + head_inputs_dict = self.forward_transformer(visual_features, text_dict, + batch_data_samples) + + losses = self.bbox_head.loss( + **head_inputs_dict, batch_data_samples=batch_data_samples) + return losses + + def predict(self, batch_inputs, batch_data_samples, rescale: bool = True): + text_prompts = [ + data_samples.text for data_samples in batch_data_samples + ] + if 'custom_entities' in batch_data_samples[0]: + # Assuming that the `custom_entities` flag + # inside a batch is always the same. For single image inference + custom_entities = batch_data_samples[0].custom_entities + else: + custom_entities = False + if len(text_prompts) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. + _positive_maps_and_prompts = [ + self.get_tokens_positive_and_prompts(text_prompts[0], + custom_entities) + ] * len(batch_inputs) + else: + _positive_maps_and_prompts = [ + self.get_tokens_positive_and_prompts(text_prompt, + custom_entities) + for text_prompt in text_prompts + ] + token_positive_maps, text_prompts, _, entities = zip( + *_positive_maps_and_prompts) + # extract text feats + text_dict = self.language_model(list(text_prompts)) + # text feature map layer + if self.text_feat_map is not None: + text_dict['embedded'] = self.text_feat_map(text_dict['embedded']) + + for i, data_samples in enumerate(batch_data_samples): + data_samples.token_positive_map = token_positive_maps[i] + + # image feature extraction + visual_feats = self.extract_feat(batch_inputs) + + head_inputs_dict = self.forward_transformer(visual_feats, text_dict, + batch_data_samples) + results_list = self.bbox_head.predict( + **head_inputs_dict, + rescale=rescale, + batch_data_samples=batch_data_samples) + for data_sample, pred_instances, entity in zip(batch_data_samples, + results_list, entities): + if len(pred_instances) > 0: + label_names = [] + for labels in pred_instances.labels: + if labels >= len(entity): + warnings.warn( + 'The unexpected output indicates an issue with ' + 'named entity recognition. 
You can try ' + 'setting custom_entities=True and running ' + 'again to see if it helps.') + label_names.append('unobject') + else: + label_names.append(entity[labels]) + # for visualization + pred_instances.label_names = label_names + data_sample.pred_instances = pred_instances + return batch_data_samples diff --git a/mmdetection/mmdet/models/detectors/htc.py b/mmdetection/mmdet/models/detectors/htc.py new file mode 100644 index 0000000..22a2aa8 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/htc.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from .cascade_rcnn import CascadeRCNN + + +@MODELS.register_module() +class HybridTaskCascade(CascadeRCNN): + """Implementation of `HTC `_""" + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + @property + def with_semantic(self) -> bool: + """bool: whether the detector has a semantic head""" + return self.roi_head.with_semantic diff --git a/mmdetection/mmdet/models/detectors/kd_one_stage.py b/mmdetection/mmdet/models/detectors/kd_one_stage.py new file mode 100644 index 0000000..8a4a1bb --- /dev/null +++ b/mmdetection/mmdet/models/detectors/kd_one_stage.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from pathlib import Path +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +from mmengine.config import Config +from mmengine.runner import load_checkpoint +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class KnowledgeDistillationSingleStageDetector(SingleStageDetector): + r"""Implementation of `Distilling the Knowledge in a Neural Network. + `_. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + teacher_config (:obj:`ConfigDict` | dict | str | Path): Config file + path or the config object of teacher model. + teacher_ckpt (str, optional): Checkpoint path of teacher model. + If left as None, the model will not load any weights. + Defaults to True. + eval_teacher (bool): Set the train mode for teacher. + Defaults to True. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of ATSS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of ATSS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. 
+ """ + + def __init__( + self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + teacher_config: Union[ConfigType, str, Path], + teacher_ckpt: Optional[str] = None, + eval_teacher: bool = True, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + ) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor) + self.eval_teacher = eval_teacher + # Build teacher model + if isinstance(teacher_config, (str, Path)): + teacher_config = Config.fromfile(teacher_config) + self.teacher_model = MODELS.build(teacher_config['model']) + if teacher_ckpt is not None: + load_checkpoint( + self.teacher_model, teacher_ckpt, map_location='cpu') + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + with torch.no_grad(): + teacher_x = self.teacher_model.extract_feat(batch_inputs) + out_teacher = self.teacher_model.bbox_head(teacher_x) + losses = self.bbox_head.loss(x, out_teacher, batch_data_samples) + return losses + + def cuda(self, device: Optional[str] = None) -> nn.Module: + """Since teacher_model is registered as a plain object, it is necessary + to put the teacher model to cuda when calling ``cuda`` function.""" + self.teacher_model.cuda(device=device) + return super().cuda(device=device) + + def to(self, device: Optional[str] = None) -> nn.Module: + """Since teacher_model is registered as a plain object, it is necessary + to put the teacher model to other device when calling ``to`` + function.""" + self.teacher_model.to(device=device) + return super().to(device=device) + + def train(self, mode: bool = True) -> None: + """Set the same train mode for teacher and student model.""" + if self.eval_teacher: + self.teacher_model.train(False) + else: + self.teacher_model.train(mode) + super().train(mode) + + def __setattr__(self, name: str, value: Any) -> None: + """Set attribute, i.e. self.name = value + + This reloading prevent the teacher model from being registered as a + nn.Module. The teacher module is registered as a plain object, so that + the teacher parameters will not show up when calling + ``self.parameters``, ``self.modules``, ``self.children`` methods. + """ + if name == 'teacher_model': + object.__setattr__(self, name, value) + else: + super().__setattr__(name, value) diff --git a/mmdetection/mmdet/models/detectors/lad.py b/mmdetection/mmdet/models/detectors/lad.py new file mode 100644 index 0000000..008f898 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/lad.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch +import torch.nn as nn +from mmengine.runner import load_checkpoint +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType +from ..utils.misc import unpack_gt_instances +from .kd_one_stage import KnowledgeDistillationSingleStageDetector + + +@MODELS.register_module() +class LAD(KnowledgeDistillationSingleStageDetector): + """Implementation of `LAD `_.""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + teacher_backbone: ConfigType, + teacher_neck: ConfigType, + teacher_bbox_head: ConfigType, + teacher_ckpt: Optional[str] = None, + eval_teacher: bool = True, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None) -> None: + super(KnowledgeDistillationSingleStageDetector, self).__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor) + self.eval_teacher = eval_teacher + self.teacher_model = nn.Module() + self.teacher_model.backbone = MODELS.build(teacher_backbone) + if teacher_neck is not None: + self.teacher_model.neck = MODELS.build(teacher_neck) + teacher_bbox_head.update(train_cfg=train_cfg) + teacher_bbox_head.update(test_cfg=test_cfg) + self.teacher_model.bbox_head = MODELS.build(teacher_bbox_head) + if teacher_ckpt is not None: + load_checkpoint( + self.teacher_model, teacher_ckpt, map_location='cpu') + + @property + def with_teacher_neck(self) -> bool: + """bool: whether the detector has a teacher_neck""" + return hasattr(self.teacher_model, 'neck') and \ + self.teacher_model.neck is not None + + def extract_teacher_feat(self, batch_inputs: Tensor) -> Tensor: + """Directly extract teacher features from the backbone+neck.""" + x = self.teacher_model.backbone(batch_inputs) + if self.with_teacher_neck: + x = self.teacher_model.neck(x) + return x + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + # get label assignment from the teacher + with torch.no_grad(): + x_teacher = self.extract_teacher_feat(batch_inputs) + outs_teacher = self.teacher_model.bbox_head(x_teacher) + label_assignment_results = \ + self.teacher_model.bbox_head.get_label_assignment( + *outs_teacher, batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # the student use the label assignment from the teacher to learn + x = self.extract_feat(batch_inputs) + losses = self.bbox_head.loss(x, label_assignment_results, + batch_data_samples) + return losses diff --git a/mmdetection/mmdet/models/detectors/mask2former.py b/mmdetection/mmdet/models/detectors/mask2former.py new file mode 100644 index 0000000..4f38ef4 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/mask2former.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
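Editor's aside: `LAD.loss` above blocks gradient flow through the teacher with `torch.no_grad()` and only lets the teacher's outputs choose the label assignment that supervises the student. The toy below sketches that hand-off under invented shapes, modules and an invented "assignment" rule; it is not the real LAD assignment logic.

import torch
import torch.nn as nn
import torch.nn.functional as F

teacher = nn.Linear(8, 4)
student = nn.Linear(8, 4)
x = torch.randn(2, 8)
targets = torch.randn(2, 4)

# Teacher only decides *which* target column each sample is assigned to.
with torch.no_grad():
    assignment = teacher(x).argmax(dim=1)        # (2,), no gradient recorded

# Student is optimised against the teacher-chosen assignment.
student_out = student(x)                         # (2, 4)
loss = F.mse_loss(
    student_out.gather(1, assignment.unsqueeze(1)),
    targets.gather(1, assignment.unsqueeze(1)))
loss.backward()

print(student.weight.grad is not None)           # True: the student learns
print(teacher.weight.grad)                       # None: the teacher is untouched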
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .maskformer import MaskFormer + + +@MODELS.register_module() +class Mask2Former(MaskFormer): + r"""Implementation of `Masked-attention Mask + Transformer for Universal Image Segmentation + `_.""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + panoptic_head: OptConfigType = None, + panoptic_fusion_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + panoptic_head=panoptic_head, + panoptic_fusion_head=panoptic_fusion_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/mask_rcnn.py b/mmdetection/mmdet/models/detectors/mask_rcnn.py new file mode 100644 index 0000000..880ee1e --- /dev/null +++ b/mmdetection/mmdet/models/detectors/mask_rcnn.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import ConfigDict + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class MaskRCNN(TwoStageDetector): + """Implementation of `Mask R-CNN `_""" + + def __init__(self, + backbone: ConfigDict, + rpn_head: ConfigDict, + roi_head: ConfigDict, + train_cfg: ConfigDict, + test_cfg: ConfigDict, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + data_preprocessor=data_preprocessor) diff --git a/mmdetection/mmdet/models/detectors/mask_scoring_rcnn.py b/mmdetection/mmdet/models/detectors/mask_scoring_rcnn.py new file mode 100644 index 0000000..e09d3a1 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/mask_scoring_rcnn.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class MaskScoringRCNN(TwoStageDetector): + """Mask Scoring RCNN. + + https://arxiv.org/abs/1903.00241 + """ + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/maskformer.py b/mmdetection/mmdet/models/detectors/maskformer.py new file mode 100644 index 0000000..7493c00 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/maskformer.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
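Editor's aside: Mask2Former, Mask R-CNN and Mask Scoring R-CNN above, like every detector in this patch, are thin wrappers that get registered with `@MODELS.register_module()` and are later instantiated from config dicts through `MODELS.build`. The toy registry below shows only that mechanism; `ToyDetector` and its `depth` field are invented and no real backbone or head configs are involved.

from mmengine.registry import Registry

TOY_MODELS = Registry('toy_models')

@TOY_MODELS.register_module()
class ToyDetector:
    def __init__(self, depth: int = 50):
        self.depth = depth

# The 'type' key selects the registered class; remaining keys become kwargs.
cfg = dict(type='ToyDetector', depth=101)
det = TOY_MODELS.build(cfg)
print(type(det).__name__, det.depth)   # ToyDetector 101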
+from typing import Dict, List, Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class MaskFormer(SingleStageDetector): + r"""Implementation of `Per-Pixel Classification is + NOT All You Need for Semantic Segmentation + `_.""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + panoptic_head: OptConfigType = None, + panoptic_fusion_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(SingleStageDetector, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + + panoptic_head_ = panoptic_head.deepcopy() + panoptic_head_.update(train_cfg=train_cfg) + panoptic_head_.update(test_cfg=test_cfg) + self.panoptic_head = MODELS.build(panoptic_head_) + + panoptic_fusion_head_ = panoptic_fusion_head.deepcopy() + panoptic_fusion_head_.update(test_cfg=test_cfg) + self.panoptic_fusion_head = MODELS.build(panoptic_fusion_head_) + + self.num_things_classes = self.panoptic_head.num_things_classes + self.num_stuff_classes = self.panoptic_head.num_stuff_classes + self.num_classes = self.panoptic_head.num_classes + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Dict[str, Tensor]: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + x = self.extract_feat(batch_inputs) + losses = self.panoptic_head.loss(x, batch_data_samples) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances' and `pred_panoptic_seg`. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + + And the ``pred_panoptic_seg`` contains the following key + + - sem_seg (Tensor): panoptic segmentation mask, has a + shape (1, h, w). 
+ """ + feats = self.extract_feat(batch_inputs) + mask_cls_results, mask_pred_results = self.panoptic_head.predict( + feats, batch_data_samples) + results_list = self.panoptic_fusion_head.predict( + mask_cls_results, + mask_pred_results, + batch_data_samples, + rescale=rescale) + results = self.add_pred_to_datasample(batch_data_samples, results_list) + + return results + + def add_pred_to_datasample(self, data_samples: SampleList, + results_list: List[dict]) -> SampleList: + """Add predictions to `DetDataSample`. + + Args: + data_samples (list[:obj:`DetDataSample`], optional): A batch of + data samples that contain annotations and predictions. + results_list (List[dict]): Instance segmentation, segmantic + segmentation and panoptic segmentation results. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances' and `pred_panoptic_seg`. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + + And the ``pred_panoptic_seg`` contains the following key + + - sem_seg (Tensor): panoptic segmentation mask, has a + shape (1, h, w). + """ + for data_sample, pred_results in zip(data_samples, results_list): + if 'pan_results' in pred_results: + data_sample.pred_panoptic_seg = pred_results['pan_results'] + + if 'ins_results' in pred_results: + data_sample.pred_instances = pred_results['ins_results'] + + assert 'sem_results' not in pred_results, 'segmantic ' \ + 'segmentation results are not supported yet.' + + return data_samples + + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Tuple[List[Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + tuple[List[Tensor]]: A tuple of features from ``panoptic_head`` + forward. + """ + feats = self.extract_feat(batch_inputs) + results = self.panoptic_head.forward(feats, batch_data_samples) + return results diff --git a/mmdetection/mmdet/models/detectors/nasfcos.py b/mmdetection/mmdet/models/detectors/nasfcos.py new file mode 100644 index 0000000..da2b911 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/nasfcos.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class NASFCOS(SingleStageDetector): + """Implementation of `NAS-FCOS: Fast Neural Architecture Search for Object + Detection. `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of NASFCOS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of NASFCOS. Defaults to None. 
+ data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/paa.py b/mmdetection/mmdet/models/detectors/paa.py new file mode 100644 index 0000000..094306b --- /dev/null +++ b/mmdetection/mmdet/models/detectors/paa.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class PAA(SingleStageDetector): + """Implementation of `PAA `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of PAA. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of PAA. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/panoptic_fpn.py b/mmdetection/mmdet/models/detectors/panoptic_fpn.py new file mode 100644 index 0000000..ae63ccc --- /dev/null +++ b/mmdetection/mmdet/models/detectors/panoptic_fpn.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
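Editor's aside: the panoptic segmentors that follow (`PanopticFPN`, `TwoStagePanopticSegmentor`) and `MaskFormer` above all finish by attaching their predictions to `DetDataSample` objects in `add_pred_to_datasample`. A minimal sketch of that data-structure contract, assuming `mmdet` and `mmengine` are importable as in this patch:

import torch
from mmengine.structures import PixelData
from mmdet.structures import DetDataSample

data_sample = DetDataSample()
# The panoptic prediction is a PixelData whose `sem_seg` has shape (1, H, W).
data_sample.pred_panoptic_seg = PixelData(
    sem_seg=torch.zeros(1, 4, 4, dtype=torch.long))
print(data_sample.pred_panoptic_seg.sem_seg.shape)   # torch.Size([1, 4, 4])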
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .panoptic_two_stage_segmentor import TwoStagePanopticSegmentor + + +@MODELS.register_module() +class PanopticFPN(TwoStagePanopticSegmentor): + r"""Implementation of `Panoptic feature pyramid + networks `_""" + + def __init__( + self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + # for panoptic segmentation + semantic_head: OptConfigType = None, + panoptic_fusion_head: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg, + semantic_head=semantic_head, + panoptic_fusion_head=panoptic_fusion_head) diff --git a/mmdetection/mmdet/models/detectors/panoptic_two_stage_segmentor.py b/mmdetection/mmdet/models/detectors/panoptic_two_stage_segmentor.py new file mode 100644 index 0000000..879edbe --- /dev/null +++ b/mmdetection/mmdet/models/detectors/panoptic_two_stage_segmentor.py @@ -0,0 +1,234 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List + +import torch +from mmengine.structures import PixelData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class TwoStagePanopticSegmentor(TwoStageDetector): + """Base class of Two-stage Panoptic Segmentor. + + As well as the components in TwoStageDetector, Panoptic Segmentor has extra + semantic_head and panoptic_fusion_head. 
+ """ + + def __init__( + self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + # for panoptic segmentation + semantic_head: OptConfigType = None, + panoptic_fusion_head: OptConfigType = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + if semantic_head is not None: + self.semantic_head = MODELS.build(semantic_head) + + if panoptic_fusion_head is not None: + panoptic_cfg = test_cfg.panoptic if test_cfg is not None else None + panoptic_fusion_head_ = panoptic_fusion_head.deepcopy() + panoptic_fusion_head_.update(test_cfg=panoptic_cfg) + self.panoptic_fusion_head = MODELS.build(panoptic_fusion_head_) + + self.num_things_classes = self.panoptic_fusion_head.\ + num_things_classes + self.num_stuff_classes = self.panoptic_fusion_head.\ + num_stuff_classes + self.num_classes = self.panoptic_fusion_head.num_classes + + @property + def with_semantic_head(self) -> bool: + """bool: whether the detector has semantic head""" + return hasattr(self, + 'semantic_head') and self.semantic_head is not None + + @property + def with_panoptic_fusion_head(self) -> bool: + """bool: whether the detector has panoptic fusion head""" + return hasattr(self, 'panoptic_fusion_head') and \ + self.panoptic_fusion_head is not None + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + + losses = dict() + + # RPN forward and loss + if self.with_rpn: + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + rpn_data_samples = copy.deepcopy(batch_data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in list(keys): + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + else: + # TODO: Not support currently, should have a check at Fast R-CNN + assert batch_data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + roi_losses = self.roi_head.loss(x, rpn_results_list, + batch_data_samples) + losses.update(roi_losses) + + semantic_loss = self.semantic_head.loss(x, batch_data_samples) + losses.update(semantic_loss) + + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. 
+ + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + List[:obj:`DetDataSample`]: Return the packed panoptic segmentation + results of input images. Each DetDataSample usually contains + 'pred_panoptic_seg'. And the 'pred_panoptic_seg' has a key + ``sem_seg``, which is a tensor of shape (1, h, w). + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + x = self.extract_feat(batch_inputs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + rpn_results_list = self.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + results_list = self.roi_head.predict( + x, rpn_results_list, batch_data_samples, rescale=rescale) + + seg_preds = self.semantic_head.predict(x, batch_img_metas, rescale) + + results_list = self.panoptic_fusion_head.predict( + results_list, seg_preds) + + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples + + # TODO the code has not been verified and needs to be refactored later. + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + + Returns: + tuple: A tuple of features from ``rpn_head``, ``roi_head`` and + ``semantic_head`` forward. + """ + results = () + x = self.extract_feat(batch_inputs) + rpn_outs = self.rpn_head.forward(x) + results = results + (rpn_outs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + rpn_results_list = self.rpn_head.predict_by_feat( + *rpn_outs, batch_img_metas=batch_img_metas, rescale=False) + else: + # TODO: Not checked currently. + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + # roi_head + roi_outs = self.roi_head(x, rpn_results_list) + results = results + (roi_outs) + + # semantic_head + sem_outs = self.semantic_head.forward(x) + results = results + (sem_outs['seg_preds'], ) + + return results + + def add_pred_to_datasample(self, data_samples: SampleList, + results_list: List[PixelData]) -> SampleList: + """Add predictions to `DetDataSample`. + + Args: + data_samples (list[:obj:`DetDataSample`]): The + annotation data of every samples. + results_list (List[PixelData]): Panoptic segmentation results of + each image. + + Returns: + List[:obj:`DetDataSample`]: Return the packed panoptic segmentation + results of input images. Each DetDataSample usually contains + 'pred_panoptic_seg'. And the 'pred_panoptic_seg' has a key + ``sem_seg``, which is a tensor of shape (1, h, w). 
+ """ + + for data_sample, pred_panoptic_seg in zip(data_samples, results_list): + data_sample.pred_panoptic_seg = pred_panoptic_seg + return data_samples diff --git a/mmdetection/mmdet/models/detectors/point_rend.py b/mmdetection/mmdet/models/detectors/point_rend.py new file mode 100644 index 0000000..5062ac0 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/point_rend.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import ConfigDict + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class PointRend(TwoStageDetector): + """PointRend: Image Segmentation as Rendering + + This detector is the implementation of + `PointRend `_. + + """ + + def __init__(self, + backbone: ConfigDict, + rpn_head: ConfigDict, + roi_head: ConfigDict, + train_cfg: ConfigDict, + test_cfg: ConfigDict, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + data_preprocessor=data_preprocessor) diff --git a/mmdetection/mmdet/models/detectors/queryinst.py b/mmdetection/mmdet/models/detectors/queryinst.py new file mode 100644 index 0000000..400ce20 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/queryinst.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .sparse_rcnn import SparseRCNN + + +@MODELS.register_module() +class QueryInst(SparseRCNN): + r"""Implementation of + `Instances as Queries `_""" + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/reppoints_detector.py b/mmdetection/mmdet/models/detectors/reppoints_detector.py new file mode 100644 index 0000000..d86cec2 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/reppoints_detector.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class RepPointsDetector(SingleStageDetector): + """RepPoints: Point Set Representation for Object Detection. 
+ + This detector is the implementation of: + - RepPoints detector (https://arxiv.org/pdf/1904.11490) + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/retinanet.py b/mmdetection/mmdet/models/detectors/retinanet.py new file mode 100644 index 0000000..03e3cb2 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/retinanet.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class RetinaNet(SingleStageDetector): + """Implementation of `RetinaNet `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/rpn.py b/mmdetection/mmdet/models/detectors/rpn.py new file mode 100644 index 0000000..72fe852 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/rpn.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class RPN(SingleStageDetector): + """Implementation of Region Proposal Network. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + rpn_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(SingleStageDetector, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + self.neck = MODELS.build(neck) if neck is not None else None + rpn_train_cfg = train_cfg['rpn'] if train_cfg is not None else None + rpn_head_num_classes = rpn_head.get('num_classes', 1) + if rpn_head_num_classes != 1: + warnings.warn('The `num_classes` should be 1 in RPN, but get ' + f'{rpn_head_num_classes}, please set ' + 'rpn_head.num_classes = 1 in your config file.') + rpn_head.update(num_classes=1) + rpn_head.update(train_cfg=rpn_train_cfg) + rpn_head.update(test_cfg=test_cfg['rpn']) + self.bbox_head = MODELS.build(rpn_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + + # set cat_id of gt_labels to 0 in RPN + rpn_data_samples = copy.deepcopy(batch_data_samples) + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + losses = self.bbox_head.loss(x, rpn_data_samples) + return losses diff --git a/mmdetection/mmdet/models/detectors/rtmdet.py b/mmdetection/mmdet/models/detectors/rtmdet.py new file mode 100644 index 0000000..b43e053 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/rtmdet.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.dist import get_world_size +from mmengine.logging import print_log + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class RTMDet(SingleStageDetector): + """Implementation of RTMDet. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of ATSS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of ATSS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + use_syncbn (bool): Whether to use SyncBatchNorm. Defaults to True. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + use_syncbn: bool = True) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + # TODO: Waiting for mmengine support + if use_syncbn and get_world_size() > 1: + torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) + print_log('Using SyncBatchNorm()', 'current') diff --git a/mmdetection/mmdet/models/detectors/scnet.py b/mmdetection/mmdet/models/detectors/scnet.py new file mode 100644 index 0000000..606a020 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/scnet.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from .cascade_rcnn import CascadeRCNN + + +@MODELS.register_module() +class SCNet(CascadeRCNN): + """Implementation of `SCNet `_""" + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) diff --git a/mmdetection/mmdet/models/detectors/semi_base.py b/mmdetection/mmdet/models/detectors/semi_base.py new file mode 100644 index 0000000..f3f0c8c --- /dev/null +++ b/mmdetection/mmdet/models/detectors/semi_base.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.models.utils import (filter_gt_instances, rename_loss_dict, + reweight_loss_dict) +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_project +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + + +@MODELS.register_module() +class SemiBaseDetector(BaseDetector): + """Base class for semi-supervised detectors. + + Semi-supervised detectors typically consisting of a teacher model + updated by exponential moving average and a student model updated + by gradient descent. + + Args: + detector (:obj:`ConfigDict` or dict): The detector config. + semi_train_cfg (:obj:`ConfigDict` or dict, optional): + The semi-supervised training config. + semi_test_cfg (:obj:`ConfigDict` or dict, optional): + The semi-supervised testing config. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + detector: ConfigType, + semi_train_cfg: OptConfigType = None, + semi_test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.student = MODELS.build(detector) + self.teacher = MODELS.build(detector) + self.semi_train_cfg = semi_train_cfg + self.semi_test_cfg = semi_test_cfg + if self.semi_train_cfg.get('freeze_teacher', True) is True: + self.freeze(self.teacher) + + @staticmethod + def freeze(model: nn.Module): + """Freeze the model.""" + model.eval() + for param in model.parameters(): + param.requires_grad = False + + def loss(self, multi_batch_inputs: Dict[str, Tensor], + multi_batch_data_samples: Dict[str, SampleList]) -> dict: + """Calculate losses from multi-branch inputs and data samples. + + Args: + multi_batch_inputs (Dict[str, Tensor]): The dict of multi-branch + input images, each value with shape (N, C, H, W). + Each value should usually be mean centered and std scaled. + multi_batch_data_samples (Dict[str, List[:obj:`DetDataSample`]]): + The dict of multi-branch data samples. + + Returns: + dict: A dictionary of loss components + """ + losses = dict() + losses.update(**self.loss_by_gt_instances( + multi_batch_inputs['sup'], multi_batch_data_samples['sup'])) + + origin_pseudo_data_samples, batch_info = self.get_pseudo_instances( + multi_batch_inputs['unsup_teacher'], + multi_batch_data_samples['unsup_teacher']) + multi_batch_data_samples[ + 'unsup_student'] = self.project_pseudo_instances( + origin_pseudo_data_samples, + multi_batch_data_samples['unsup_student']) + losses.update(**self.loss_by_pseudo_instances( + multi_batch_inputs['unsup_student'], + multi_batch_data_samples['unsup_student'], batch_info)) + return losses + + def loss_by_gt_instances(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and ground-truth data + samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components + """ + + losses = self.student.loss(batch_inputs, batch_data_samples) + sup_weight = self.semi_train_cfg.get('sup_weight', 1.) + return rename_loss_dict('sup_', reweight_loss_dict(losses, sup_weight)) + + def loss_by_pseudo_instances(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + batch_info: Optional[dict] = None) -> dict: + """Calculate losses from a batch of inputs and pseudo data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + batch_info (dict): Batch information of teacher model + forward propagation process. Defaults to None. 
+ + Returns: + dict: A dictionary of loss components + """ + batch_data_samples = filter_gt_instances( + batch_data_samples, score_thr=self.semi_train_cfg.cls_pseudo_thr) + losses = self.student.loss(batch_inputs, batch_data_samples) + pseudo_instances_num = sum([ + len(data_samples.gt_instances) + for data_samples in batch_data_samples + ]) + unsup_weight = self.semi_train_cfg.get( + 'unsup_weight', 1.) if pseudo_instances_num > 0 else 0. + return rename_loss_dict('unsup_', + reweight_loss_dict(losses, unsup_weight)) + + @torch.no_grad() + def get_pseudo_instances( + self, batch_inputs: Tensor, batch_data_samples: SampleList + ) -> Tuple[SampleList, Optional[dict]]: + """Get pseudo instances from teacher model.""" + self.teacher.eval() + results_list = self.teacher.predict( + batch_inputs, batch_data_samples, rescale=False) + batch_info = {} + for data_samples, results in zip(batch_data_samples, results_list): + data_samples.gt_instances = results.pred_instances + data_samples.gt_instances.bboxes = bbox_project( + data_samples.gt_instances.bboxes, + torch.from_numpy(data_samples.homography_matrix).inverse().to( + self.data_preprocessor.device), data_samples.ori_shape) + return batch_data_samples, batch_info + + def project_pseudo_instances(self, batch_pseudo_instances: SampleList, + batch_data_samples: SampleList) -> SampleList: + """Project pseudo instances.""" + for pseudo_instances, data_samples in zip(batch_pseudo_instances, + batch_data_samples): + data_samples.gt_instances = copy.deepcopy( + pseudo_instances.gt_instances) + data_samples.gt_instances.bboxes = bbox_project( + data_samples.gt_instances.bboxes, + torch.tensor(data_samples.homography_matrix).to( + self.data_preprocessor.device), data_samples.img_shape) + wh_thr = self.semi_train_cfg.get('min_pseudo_bbox_wh', (1e-2, 1e-2)) + return filter_gt_instances(batch_data_samples, wh_thr=wh_thr) + + def predict(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Return the detection results of the + input images. The returns value is DetDataSample, + which usually contain 'pred_instances'. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + if self.semi_test_cfg.get('predict_on', 'teacher') == 'teacher': + return self.teacher( + batch_inputs, batch_data_samples, mode='predict') + else: + return self.student( + batch_inputs, batch_data_samples, mode='predict') + + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + + Returns: + tuple: A tuple of features from ``rpn_head`` and ``roi_head`` + forward. 
+ """ + if self.semi_test_cfg.get('forward_on', 'teacher') == 'teacher': + return self.teacher( + batch_inputs, batch_data_samples, mode='tensor') + else: + return self.student( + batch_inputs, batch_data_samples, mode='tensor') + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). + + Returns: + tuple[Tensor]: Multi-level features that may have + different resolutions. + """ + if self.semi_test_cfg.get('extract_feat_on', 'teacher') == 'teacher': + return self.teacher.extract_feat(batch_inputs) + else: + return self.student.extract_feat(batch_inputs) + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> None: + """Add teacher and student prefixes to model parameter names.""" + if not any([ + 'student' in key or 'teacher' in key + for key in state_dict.keys() + ]): + keys = list(state_dict.keys()) + state_dict.update({'teacher.' + k: state_dict[k] for k in keys}) + state_dict.update({'student.' + k: state_dict[k] for k in keys}) + for k in keys: + state_dict.pop(k) + return super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) diff --git a/mmdetection/mmdet/models/detectors/single_stage.py b/mmdetection/mmdet/models/detectors/single_stage.py new file mode 100644 index 0000000..06c0740 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/single_stage.py @@ -0,0 +1,149 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + + +@MODELS.register_module() +class SingleStageDetector(BaseDetector): + """Base class for single-stage detectors. + + Single-stage detectors directly and densely predict bounding boxes on the + output features of the backbone+neck. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + bbox_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = MODELS.build(bbox_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> None: + """Exchange bbox_head key to rpn_head key when loading two-stage + weights into single-stage model.""" + bbox_head_prefix = prefix + '.bbox_head' if prefix else 'bbox_head' + bbox_head_keys = [ + k for k in state_dict.keys() if k.startswith(bbox_head_prefix) + ] + rpn_head_prefix = prefix + '.rpn_head' if prefix else 'rpn_head' + rpn_head_keys = [ + k for k in state_dict.keys() if k.startswith(rpn_head_prefix) + ] + if len(bbox_head_keys) == 0 and len(rpn_head_keys) != 0: + for rpn_head_key in rpn_head_keys: + bbox_head_key = bbox_head_prefix + \ + rpn_head_key[len(rpn_head_prefix):] + state_dict[bbox_head_key] = state_dict.pop(rpn_head_key) + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + losses = self.bbox_head.loss(x, batch_data_samples) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + x = self.extract_feat(batch_inputs) + results_list = self.bbox_head.predict( + x, batch_data_samples, rescale=rescale) + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples + + def _forward( + self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns: + tuple[list]: A tuple of features from ``bbox_head`` forward. + """ + x = self.extract_feat(batch_inputs) + results = self.bbox_head.forward(x) + return results + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). + + Returns: + tuple[Tensor]: Multi-level features that may have + different resolutions. + """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x diff --git a/mmdetection/mmdet/models/detectors/single_stage_instance_seg.py b/mmdetection/mmdet/models/detectors/single_stage_instance_seg.py new file mode 100644 index 0000000..acb5f0d --- /dev/null +++ b/mmdetection/mmdet/models/detectors/single_stage_instance_seg.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + +INF = 1e8 + + +@MODELS.register_module() +class SingleStageInstanceSegmentor(BaseDetector): + """Base class for single-stage instance segmentors.""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + bbox_head: OptConfigType = None, + mask_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + else: + self.neck = None + if bbox_head is not None: + bbox_head.update(train_cfg=copy.deepcopy(train_cfg)) + bbox_head.update(test_cfg=copy.deepcopy(test_cfg)) + self.bbox_head = MODELS.build(bbox_head) + else: + self.bbox_head = None + + assert mask_head, f'`mask_head` must ' \ + f'be implemented in {self.__class__.__name__}' + mask_head.update(train_cfg=copy.deepcopy(train_cfg)) + mask_head.update(test_cfg=copy.deepcopy(test_cfg)) + self.mask_head = MODELS.build(mask_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). + + Returns: + tuple[Tensor]: Multi-level features that may have different + resolutions. + """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x + + def _forward(self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None, + **kwargs) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. 
+ + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + + Returns: + tuple: A tuple of features from ``bbox_head`` forward. + """ + outs = () + # backbone + x = self.extract_feat(batch_inputs) + # bbox_head + positive_infos = None + if self.with_bbox: + assert batch_data_samples is not None + bbox_outs = self.bbox_head.forward(x) + outs = outs + (bbox_outs, ) + # It is necessary to use `bbox_head.loss` to update + # `_raw_positive_infos` which will be used in `get_positive_infos` + # positive_infos will be used in the following mask head. + _ = self.bbox_head.loss(x, batch_data_samples, **kwargs) + positive_infos = self.bbox_head.get_positive_infos() + # mask_head + if positive_infos is None: + mask_outs = self.mask_head.forward(x) + else: + mask_outs = self.mask_head.forward(x, positive_infos) + outs = outs + (mask_outs, ) + return outs + + def loss(self, batch_inputs: Tensor, batch_data_samples: SampleList, + **kwargs) -> dict: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + losses = dict() + + positive_infos = None + # CondInst and YOLACT have bbox_head + if self.with_bbox: + bbox_losses = self.bbox_head.loss(x, batch_data_samples, **kwargs) + losses.update(bbox_losses) + # get positive information from bbox head, which will be used + # in the following mask head. + positive_infos = self.bbox_head.get_positive_infos() + + mask_loss = self.mask_head.loss( + x, batch_data_samples, positive_infos=positive_infos, **kwargs) + # avoid loss override + assert not set(mask_loss.keys()) & set(losses.keys()) + + losses.update(mask_loss) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Perform forward propagation of the mask head and predict mask + results on the features of the upstream network. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to False. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + x = self.extract_feat(batch_inputs) + if self.with_bbox: + # the bbox branch does not need to be scaled to the original + # image scale, because the mask branch will scale both bbox + # and mask at the same time. 
+ bbox_rescale = rescale if not self.with_mask else False + results_list = self.bbox_head.predict( + x, batch_data_samples, rescale=bbox_rescale) + else: + results_list = None + + results_list = self.mask_head.predict( + x, batch_data_samples, rescale=rescale, results_list=results_list) + + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples diff --git a/mmdetection/mmdet/models/detectors/soft_teacher.py b/mmdetection/mmdet/models/detectors/soft_teacher.py new file mode 100644 index 0000000..80853f1 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/soft_teacher.py @@ -0,0 +1,378 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Optional, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.utils import (filter_gt_instances, rename_loss_dict, + reweight_loss_dict) +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi, bbox_project +from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig +from ..utils.misc import unpack_gt_instances +from .semi_base import SemiBaseDetector + + +@MODELS.register_module() +class SoftTeacher(SemiBaseDetector): + r"""Implementation of `End-to-End Semi-Supervised Object Detection + with Soft Teacher `_ + + Args: + detector (:obj:`ConfigDict` or dict): The detector config. + semi_train_cfg (:obj:`ConfigDict` or dict, optional): + The semi-supervised training config. + semi_test_cfg (:obj:`ConfigDict` or dict, optional): + The semi-supervised testing config. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + detector: ConfigType, + semi_train_cfg: OptConfigType = None, + semi_test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + detector=detector, + semi_train_cfg=semi_train_cfg, + semi_test_cfg=semi_test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + def loss_by_pseudo_instances(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + batch_info: Optional[dict] = None) -> dict: + """Calculate losses from a batch of inputs and pseudo data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + batch_info (dict): Batch information of teacher model + forward propagation process. Defaults to None. 
+ + Returns: + dict: A dictionary of loss components + """ + + x = self.student.extract_feat(batch_inputs) + + losses = {} + rpn_losses, rpn_results_list = self.rpn_loss_by_pseudo_instances( + x, batch_data_samples) + losses.update(**rpn_losses) + losses.update(**self.rcnn_cls_loss_by_pseudo_instances( + x, rpn_results_list, batch_data_samples, batch_info)) + losses.update(**self.rcnn_reg_loss_by_pseudo_instances( + x, rpn_results_list, batch_data_samples)) + unsup_weight = self.semi_train_cfg.get('unsup_weight', 1.) + return rename_loss_dict('unsup_', + reweight_loss_dict(losses, unsup_weight)) + + @torch.no_grad() + def get_pseudo_instances( + self, batch_inputs: Tensor, batch_data_samples: SampleList + ) -> Tuple[SampleList, Optional[dict]]: + """Get pseudo instances from teacher model.""" + assert self.teacher.with_bbox, 'Bbox head must be implemented.' + x = self.teacher.extract_feat(batch_inputs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + rpn_results_list = self.teacher.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + results_list = self.teacher.roi_head.predict( + x, rpn_results_list, batch_data_samples, rescale=False) + + for data_samples, results in zip(batch_data_samples, results_list): + data_samples.gt_instances = results + + batch_data_samples = filter_gt_instances( + batch_data_samples, + score_thr=self.semi_train_cfg.pseudo_label_initial_score_thr) + + reg_uncs_list = self.compute_uncertainty_with_aug( + x, batch_data_samples) + + for data_samples, reg_uncs in zip(batch_data_samples, reg_uncs_list): + data_samples.gt_instances['reg_uncs'] = reg_uncs + data_samples.gt_instances.bboxes = bbox_project( + data_samples.gt_instances.bboxes, + torch.from_numpy(data_samples.homography_matrix).inverse().to( + self.data_preprocessor.device), data_samples.ori_shape) + + batch_info = { + 'feat': x, + 'img_shape': [], + 'homography_matrix': [], + 'metainfo': [] + } + for data_samples in batch_data_samples: + batch_info['img_shape'].append(data_samples.img_shape) + batch_info['homography_matrix'].append( + torch.from_numpy(data_samples.homography_matrix).to( + self.data_preprocessor.device)) + batch_info['metainfo'].append(data_samples.metainfo) + return batch_data_samples, batch_info + + def rpn_loss_by_pseudo_instances(self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + """Calculate rpn loss from a batch of inputs and pseudo data samples. + + Args: + x (tuple[Tensor]): Features from FPN. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. 
+ Returns: + dict: A dictionary of rpn loss components + """ + + rpn_data_samples = copy.deepcopy(batch_data_samples) + rpn_data_samples = filter_gt_instances( + rpn_data_samples, score_thr=self.semi_train_cfg.rpn_pseudo_thr) + proposal_cfg = self.student.train_cfg.get('rpn_proposal', + self.student.test_cfg.rpn) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = self.student.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) + for key in rpn_losses.keys(): + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + return rpn_losses, rpn_results_list + + def rcnn_cls_loss_by_pseudo_instances(self, x: Tuple[Tensor], + unsup_rpn_results_list: InstanceList, + batch_data_samples: SampleList, + batch_info: dict) -> dict: + """Calculate classification loss from a batch of inputs and pseudo data + samples. + + Args: + x (tuple[Tensor]): List of multi-level img features. + unsup_rpn_results_list (list[:obj:`InstanceData`]): + List of region proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + batch_info (dict): Batch information of teacher model + forward propagation process. + + Returns: + dict[str, Tensor]: A dictionary of rcnn + classification loss components + """ + rpn_results_list = copy.deepcopy(unsup_rpn_results_list) + cls_data_samples = copy.deepcopy(batch_data_samples) + cls_data_samples = filter_gt_instances( + cls_data_samples, score_thr=self.semi_train_cfg.cls_pseudo_thr) + + outputs = unpack_gt_instances(cls_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(cls_data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + assign_result = self.student.roi_head.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.student.roi_head.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + selected_bboxes = [res.priors for res in sampling_results] + rois = bbox2roi(selected_bboxes) + bbox_results = self.student.roi_head._bbox_forward(x, rois) + # cls_reg_targets is a tuple of labels, label_weights, + # and bbox_targets, bbox_weights + cls_reg_targets = self.student.roi_head.bbox_head.get_targets( + sampling_results, self.student.train_cfg.rcnn) + + selected_results_list = [] + for bboxes, data_samples, teacher_matrix, teacher_img_shape in zip( + selected_bboxes, batch_data_samples, + batch_info['homography_matrix'], batch_info['img_shape']): + student_matrix = torch.tensor( + data_samples.homography_matrix, device=teacher_matrix.device) + homography_matrix = teacher_matrix @ student_matrix.inverse() + projected_bboxes = bbox_project(bboxes, homography_matrix, + teacher_img_shape) + selected_results_list.append(InstanceData(bboxes=projected_bboxes)) + + with torch.no_grad(): + results_list = self.teacher.roi_head.predict_bbox( + batch_info['feat'], + batch_info['metainfo'], 
+ selected_results_list, + rcnn_test_cfg=None, + rescale=False) + bg_score = torch.cat( + [results.scores[:, -1] for results in results_list]) + # cls_reg_targets[0] is labels + neg_inds = cls_reg_targets[ + 0] == self.student.roi_head.bbox_head.num_classes + # cls_reg_targets[1] is label_weights + cls_reg_targets[1][neg_inds] = bg_score[neg_inds].detach() + + losses = self.student.roi_head.bbox_head.loss( + bbox_results['cls_score'], bbox_results['bbox_pred'], rois, + *cls_reg_targets) + # cls_reg_targets[1] is label_weights + losses['loss_cls'] = losses['loss_cls'] * len( + cls_reg_targets[1]) / max(sum(cls_reg_targets[1]), 1.0) + return losses + + def rcnn_reg_loss_by_pseudo_instances( + self, x: Tuple[Tensor], unsup_rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Calculate rcnn regression loss from a batch of inputs and pseudo + data samples. + + Args: + x (tuple[Tensor]): List of multi-level img features. + unsup_rpn_results_list (list[:obj:`InstanceData`]): + List of region proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + + Returns: + dict[str, Tensor]: A dictionary of rcnn + regression loss components + """ + rpn_results_list = copy.deepcopy(unsup_rpn_results_list) + reg_data_samples = copy.deepcopy(batch_data_samples) + for data_samples in reg_data_samples: + if data_samples.gt_instances.bboxes.shape[0] > 0: + data_samples.gt_instances = data_samples.gt_instances[ + data_samples.gt_instances.reg_uncs < + self.semi_train_cfg.reg_pseudo_thr] + roi_losses = self.student.roi_head.loss(x, rpn_results_list, + reg_data_samples) + return {'loss_bbox': roi_losses['loss_bbox']} + + def compute_uncertainty_with_aug( + self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> List[Tensor]: + """Compute uncertainty with augmented bboxes. + + Args: + x (tuple[Tensor]): List of multi-level img features. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + + Returns: + list[Tensor]: A list of uncertainty for pseudo bboxes. 
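+
+        In brief: every pseudo box is jittered ``jitter_times`` times, the
+        teacher's RoI head refines each jittered copy, and the standard
+        deviation of the refined boxes (normalised by box width/height)
+        is taken as the regression uncertainty. A toy, tensor-only
+        illustration of that statistic (the shapes are invented for the
+        example and do not come from the detector API)::
+
+            import torch
+            refined = torch.randn(10, 5, 4)  # (jitter_times, num_boxes, 4)
+            unc = refined.std(dim=0)         # per-coordinate std
+            boxes = refined.mean(dim=0)
+            wh = (boxes[:, 2:4] - boxes[:, :2]).clamp(min=1.0)
+            reg_unc = (unc / wh.repeat(1, 2)).mean(dim=-1)  # (num_boxes,)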
+ """ + auged_results_list = self.aug_box(batch_data_samples, + self.semi_train_cfg.jitter_times, + self.semi_train_cfg.jitter_scale) + # flatten + auged_results_list = [ + InstanceData(bboxes=auged.reshape(-1, auged.shape[-1])) + for auged in auged_results_list + ] + + self.teacher.roi_head.test_cfg = None + results_list = self.teacher.roi_head.predict( + x, auged_results_list, batch_data_samples, rescale=False) + self.teacher.roi_head.test_cfg = self.teacher.test_cfg.rcnn + + reg_channel = max( + [results.bboxes.shape[-1] for results in results_list]) // 4 + bboxes = [ + results.bboxes.reshape(self.semi_train_cfg.jitter_times, -1, + results.bboxes.shape[-1]) + if results.bboxes.numel() > 0 else results.bboxes.new_zeros( + self.semi_train_cfg.jitter_times, 0, 4 * reg_channel).float() + for results in results_list + ] + + box_unc = [bbox.std(dim=0) for bbox in bboxes] + bboxes = [bbox.mean(dim=0) for bbox in bboxes] + labels = [ + data_samples.gt_instances.labels + for data_samples in batch_data_samples + ] + if reg_channel != 1: + bboxes = [ + bbox.reshape(bbox.shape[0], reg_channel, + 4)[torch.arange(bbox.shape[0]), label] + for bbox, label in zip(bboxes, labels) + ] + box_unc = [ + unc.reshape(unc.shape[0], reg_channel, + 4)[torch.arange(unc.shape[0]), label] + for unc, label in zip(box_unc, labels) + ] + + box_shape = [(bbox[:, 2:4] - bbox[:, :2]).clamp(min=1.0) + for bbox in bboxes] + box_unc = [ + torch.mean( + unc / wh[:, None, :].expand(-1, 2, 2).reshape(-1, 4), dim=-1) + if wh.numel() > 0 else unc for unc, wh in zip(box_unc, box_shape) + ] + return box_unc + + @staticmethod + def aug_box(batch_data_samples, times, frac): + """Augment bboxes with jitter.""" + + def _aug_single(box): + box_scale = box[:, 2:4] - box[:, :2] + box_scale = ( + box_scale.clamp(min=1)[:, None, :].expand(-1, 2, + 2).reshape(-1, 4)) + aug_scale = box_scale * frac # [n,4] + + offset = ( + torch.randn(times, box.shape[0], 4, device=box.device) * + aug_scale[None, ...]) + new_box = box.clone()[None, ...].expand(times, box.shape[0], + -1) + offset + return new_box + + return [ + _aug_single(data_samples.gt_instances.bboxes) + for data_samples in batch_data_samples + ] diff --git a/mmdetection/mmdet/models/detectors/solo.py b/mmdetection/mmdet/models/detectors/solo.py new file mode 100644 index 0000000..6bf47ba --- /dev/null +++ b/mmdetection/mmdet/models/detectors/solo.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class SOLO(SingleStageInstanceSegmentor): + """`SOLO: Segmenting Objects by Locations + `_ + + """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + bbox_head: OptConfigType = None, + mask_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/solov2.py b/mmdetection/mmdet/models/detectors/solov2.py new file mode 100644 index 0000000..1eefe4c --- /dev/null +++ b/mmdetection/mmdet/models/detectors/solov2.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class SOLOv2(SingleStageInstanceSegmentor): + """`SOLOv2: Dynamic and Fast Instance Segmentation + `_ + + """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + bbox_head: OptConfigType = None, + mask_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/sparse_rcnn.py b/mmdetection/mmdet/models/detectors/sparse_rcnn.py new file mode 100644 index 0000000..75442a6 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/sparse_rcnn.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class SparseRCNN(TwoStageDetector): + r"""Implementation of `Sparse R-CNN: End-to-End Object Detection with + Learnable Proposals `_""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + assert self.with_rpn, 'Sparse R-CNN and QueryInst ' \ + 'do not support external proposals' diff --git a/mmdetection/mmdet/models/detectors/tood.py b/mmdetection/mmdet/models/detectors/tood.py new file mode 100644 index 0000000..3872048 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/tood.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class TOOD(SingleStageDetector): + r"""Implementation of `TOOD: Task-aligned One-stage Object Detection. + `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of TOOD. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of TOOD. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/trident_faster_rcnn.py b/mmdetection/mmdet/models/detectors/trident_faster_rcnn.py new file mode 100644 index 0000000..4244925 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/trident_faster_rcnn.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .faster_rcnn import FasterRCNN + + +@MODELS.register_module() +class TridentFasterRCNN(FasterRCNN): + """Implementation of `TridentNet `_""" + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + assert self.backbone.num_branch == self.roi_head.num_branch + assert self.backbone.test_branch_idx == self.roi_head.test_branch_idx + self.num_branch = self.backbone.num_branch + self.test_branch_idx = self.backbone.test_branch_idx + + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> tuple: + """copy the ``batch_data_samples`` to fit multi-branch.""" + num_branch = self.num_branch \ + if self.training or self.test_branch_idx == -1 else 1 + trident_data_samples = batch_data_samples * num_branch + return super()._forward( + batch_inputs=batch_inputs, batch_data_samples=trident_data_samples) + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """copy the ``batch_data_samples`` to fit multi-branch.""" + num_branch = self.num_branch \ + if self.training or self.test_branch_idx == -1 else 1 + trident_data_samples = batch_data_samples * num_branch + return super().loss( + batch_inputs=batch_inputs, batch_data_samples=trident_data_samples) + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """copy the ``batch_data_samples`` to fit multi-branch.""" + num_branch = self.num_branch \ + if self.training or self.test_branch_idx == -1 else 1 + trident_data_samples = batch_data_samples * num_branch + return super().predict( + batch_inputs=batch_inputs, + batch_data_samples=trident_data_samples, + rescale=rescale) + + # TODO need to refactor + def aug_test(self, imgs, img_metas, rescale=False): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. 
+ """ + x = self.extract_feats(imgs) + num_branch = (self.num_branch if self.test_branch_idx == -1 else 1) + trident_img_metas = [img_metas * num_branch for img_metas in img_metas] + proposal_list = self.rpn_head.aug_test_rpn(x, trident_img_metas) + return self.roi_head.aug_test( + x, proposal_list, img_metas, rescale=rescale) diff --git a/mmdetection/mmdet/models/detectors/two_stage.py b/mmdetection/mmdet/models/detectors/two_stage.py new file mode 100644 index 0000000..4e83df9 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/two_stage.py @@ -0,0 +1,243 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from typing import List, Tuple, Union + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + + +@MODELS.register_module() +class TwoStageDetector(BaseDetector): + """Base class for two-stage detectors. + + Two-stage detectors typically consisting of a region proposal network and a + task-specific regression head. + """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + + if neck is not None: + self.neck = MODELS.build(neck) + + if rpn_head is not None: + rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None + rpn_head_ = rpn_head.copy() + rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) + rpn_head_num_classes = rpn_head_.get('num_classes', None) + if rpn_head_num_classes is None: + rpn_head_.update(num_classes=1) + else: + if rpn_head_num_classes != 1: + warnings.warn( + 'The `num_classes` should be 1 in RPN, but get ' + f'{rpn_head_num_classes}, please set ' + 'rpn_head.num_classes = 1 in your config file.') + rpn_head_.update(num_classes=1) + self.rpn_head = MODELS.build(rpn_head_) + + if roi_head is not None: + # update train and test cfg here for now + # TODO: refactor assigner & sampler + rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None + roi_head.update(train_cfg=rcnn_train_cfg) + roi_head.update(test_cfg=test_cfg.rcnn) + self.roi_head = MODELS.build(roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> None: + """Exchange bbox_head key to rpn_head key when loading single-stage + weights into two-stage model.""" + bbox_head_prefix = prefix + '.bbox_head' if prefix else 'bbox_head' + bbox_head_keys = [ + k for k in state_dict.keys() if k.startswith(bbox_head_prefix) + ] + rpn_head_prefix = prefix + '.rpn_head' if prefix else 'rpn_head' + rpn_head_keys = [ + k for k in state_dict.keys() if k.startswith(rpn_head_prefix) + ] + if len(bbox_head_keys) != 0 and len(rpn_head_keys) == 0: + for bbox_head_key in bbox_head_keys: + rpn_head_key = rpn_head_prefix + \ + bbox_head_key[len(bbox_head_prefix):] + state_dict[rpn_head_key] = state_dict.pop(bbox_head_key) + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, 
unexpected_keys, + error_msgs) + + @property + def with_rpn(self) -> bool: + """bool: whether the detector has RPN""" + return hasattr(self, 'rpn_head') and self.rpn_head is not None + + @property + def with_roi_head(self) -> bool: + """bool: whether the detector has a RoI head""" + return hasattr(self, 'roi_head') and self.roi_head is not None + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). + + Returns: + tuple[Tensor]: Multi-level features that may have + different resolutions. + """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x + + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns: + tuple: A tuple of features from ``rpn_head`` and ``roi_head`` + forward. + """ + results = () + x = self.extract_feat(batch_inputs) + + if self.with_rpn: + rpn_results_list = self.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + assert batch_data_samples[0].get('proposals', None) is not None + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + roi_outs = self.roi_head.forward(x, rpn_results_list, + batch_data_samples) + results = results + (roi_outs, ) + return results + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components + """ + x = self.extract_feat(batch_inputs) + + losses = dict() + + # RPN forward and loss + if self.with_rpn: + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + rpn_data_samples = copy.deepcopy(batch_data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in list(keys): + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + else: + assert batch_data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + roi_losses = self.roi_head.loss(x, rpn_results_list, + batch_data_samples) + losses.update(roi_losses) + + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). 
+ batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Return the detection results of the + input images. The returns value is DetDataSample, + which usually contain 'pred_instances'. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + + assert self.with_bbox, 'Bbox head must be implemented.' + x = self.extract_feat(batch_inputs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + rpn_results_list = self.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + results_list = self.roi_head.predict( + x, rpn_results_list, batch_data_samples, rescale=rescale) + + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples diff --git a/mmdetection/mmdet/models/detectors/vfnet.py b/mmdetection/mmdet/models/detectors/vfnet.py new file mode 100644 index 0000000..a695513 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/vfnet.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class VFNet(SingleStageDetector): + """Implementation of `VarifocalNet + (VFNet).`_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of VFNet. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of VFNet. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/yolact.py b/mmdetection/mmdet/models/detectors/yolact.py new file mode 100644 index 0000000..f15fb7b --- /dev/null +++ b/mmdetection/mmdet/models/detectors/yolact.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class YOLACT(SingleStageInstanceSegmentor): + """Implementation of `YOLACT `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + mask_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/yolo.py b/mmdetection/mmdet/models/detectors/yolo.py new file mode 100644 index 0000000..5cb9a9c --- /dev/null +++ b/mmdetection/mmdet/models/detectors/yolo.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) 2019 Western Digital Corporation or its affiliates. + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class YOLOV3(SingleStageDetector): + r"""Implementation of `Yolov3: An incremental improvement + `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of YOLOX. Default: None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of YOLOX. Default: None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): + Model preprocessing config for processing the input data. + it usually includes ``to_rgb``, ``pad_size_divisor``, + ``pad_value``, ``mean`` and ``std``. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/yolof.py b/mmdetection/mmdet/models/detectors/yolof.py new file mode 100644 index 0000000..c6d98b9 --- /dev/null +++ b/mmdetection/mmdet/models/detectors/yolof.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class YOLOF(SingleStageDetector): + r"""Implementation of `You Only Look One-level Feature + `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of YOLOF. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of YOLOF. Defaults to None. 
+ data_preprocessor (:obj:`ConfigDict` or dict, optional): + Model preprocessing config for processing the input data. + it usually includes ``to_rgb``, ``pad_size_divisor``, + ``pad_value``, ``mean`` and ``std``. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/detectors/yolox.py b/mmdetection/mmdet/models/detectors/yolox.py new file mode 100644 index 0000000..df9190c --- /dev/null +++ b/mmdetection/mmdet/models/detectors/yolox.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class YOLOX(SingleStageDetector): + r"""Implementation of `YOLOX: Exceeding YOLO Series in 2021 + `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of YOLOX. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of YOLOX. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/mmdet/models/language_models/__init__.py b/mmdetection/mmdet/models/language_models/__init__.py new file mode 100644 index 0000000..70f1a22 --- /dev/null +++ b/mmdetection/mmdet/models/language_models/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bert import BertModel + +__all__ = ['BertModel'] diff --git a/mmdetection/mmdet/models/language_models/bert.py b/mmdetection/mmdet/models/language_models/bert.py new file mode 100644 index 0000000..efb0f46 --- /dev/null +++ b/mmdetection/mmdet/models/language_models/bert.py @@ -0,0 +1,231 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
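+# NOTE: this module depends on the optional HuggingFace `transformers`
+# package; the guarded imports below fall back to None when it is missing
+# and BertModel raises a RuntimeError at construction time, so any config
+# that uses these language models is expected to have `transformers`
+# installed.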
+from collections import OrderedDict +from typing import Sequence + +import torch +from mmengine.model import BaseModel +from torch import nn + +try: + from transformers import AutoTokenizer, BertConfig + from transformers import BertModel as HFBertModel +except ImportError: + AutoTokenizer = None + HFBertModel = None + +from mmdet.registry import MODELS + + +def generate_masks_with_special_tokens_and_transfer_map( + tokenized, special_tokens_list): + """Generate attention mask between each pair of special tokens. + + Only token pairs in between two special tokens are attended to + and thus the attention mask for these pairs is positive. + + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + + Returns: + Tuple(Tensor, Tensor): + - attention_mask is the attention mask between each tokens. + Only token pairs in between two special tokens are positive. + Shape: [bs, num_token, num_token]. + - position_ids is the position id of tokens within each valid sentence. + The id starts from 0 whenenver a special token is encountered. + Shape: [bs, num_token] + """ + input_ids = tokenized['input_ids'] + bs, num_token = input_ids.shape + # special_tokens_mask: + # bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), + device=input_ids.device).bool() + + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, + device=input_ids.device).bool().unsqueeze(0).repeat( + bs, 1, 1)) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1:col + 1, + previous_col + 1:col + 1] = True + position_ids[row, previous_col + 1:col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device) + previous_col = col + + return attention_mask, position_ids.to(torch.long) + + +@MODELS.register_module() +class BertModel(BaseModel): + """BERT model for language embedding only encoder. + + Args: + name (str, optional): name of the pretrained BERT model from + HuggingFace. Defaults to bert-base-uncased. + max_tokens (int, optional): maximum number of tokens to be + used for BERT. Defaults to 256. + pad_to_max (bool, optional): whether to pad the tokens to max_tokens. + Defaults to True. + use_sub_sentence_represent (bool, optional): whether to use sub + sentence represent introduced in `Grounding DINO + `. Defaults to False. + special_tokens_list (list, optional): special tokens used to split + subsentence. It cannot be None when `use_sub_sentence_represent` + is True. Defaults to None. + add_pooling_layer (bool, optional): whether to adding pooling + layer in bert encoder. Defaults to False. + num_layers_of_embedded (int, optional): number of layers of + the embedded model. Defaults to 1. + use_checkpoint (bool, optional): whether to use gradient checkpointing. + Defaults to False. 
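+
+    A hedged usage sketch (assumes the `transformers` package is installed
+    and the `bert-base-uncased` weights are available locally or
+    downloadable; the captions are arbitrary examples)::
+
+        lang_model = BertModel(name='bert-base-uncased', max_tokens=256)
+        feats = lang_model(['a photo of a cat', 'two dogs on the grass'])
+        # feats['embedded']: (batch, num_tokens, hidden_size) text features
+        # feats['masks']: the attention mask
+        # feats['hidden']: the last hidden layer of the encoder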
+ """ + + def __init__(self, + name: str = 'bert-base-uncased', + max_tokens: int = 256, + pad_to_max: bool = True, + use_sub_sentence_represent: bool = False, + special_tokens_list: list = None, + add_pooling_layer: bool = False, + num_layers_of_embedded: int = 1, + use_checkpoint: bool = False, + **kwargs) -> None: + + super().__init__(**kwargs) + self.max_tokens = max_tokens + self.pad_to_max = pad_to_max + + if AutoTokenizer is None: + raise RuntimeError( + 'transformers is not installed, please install it by: ' + 'pip install transformers.') + + self.tokenizer = AutoTokenizer.from_pretrained(name) + self.language_backbone = nn.Sequential( + OrderedDict([('body', + BertEncoder( + name, + add_pooling_layer=add_pooling_layer, + num_layers_of_embedded=num_layers_of_embedded, + use_checkpoint=use_checkpoint))])) + + self.use_sub_sentence_represent = use_sub_sentence_represent + if self.use_sub_sentence_represent: + assert special_tokens_list is not None, \ + 'special_tokens should not be None \ + if use_sub_sentence_represent is True' + + self.special_tokens = self.tokenizer.convert_tokens_to_ids( + special_tokens_list) + + def forward(self, captions: Sequence[str], **kwargs) -> dict: + """Forward function.""" + device = next(self.language_backbone.parameters()).device + tokenized = self.tokenizer.batch_encode_plus( + captions, + max_length=self.max_tokens, + padding='max_length' if self.pad_to_max else 'longest', + return_special_tokens_mask=True, + return_tensors='pt', + truncation=True).to(device) + input_ids = tokenized.input_ids + if self.use_sub_sentence_represent: + attention_mask, position_ids = \ + generate_masks_with_special_tokens_and_transfer_map( + tokenized, self.special_tokens) + token_type_ids = tokenized['token_type_ids'] + + else: + attention_mask = tokenized.attention_mask + position_ids = None + token_type_ids = None + + tokenizer_input = { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'position_ids': position_ids, + 'token_type_ids': token_type_ids + } + language_dict_features = self.language_backbone(tokenizer_input) + if self.use_sub_sentence_represent: + language_dict_features['position_ids'] = position_ids + language_dict_features[ + 'text_token_mask'] = tokenized.attention_mask.bool() + return language_dict_features + + +class BertEncoder(nn.Module): + """BERT encoder for language embedding. + + Args: + name (str): name of the pretrained BERT model from HuggingFace. + Defaults to bert-base-uncased. + add_pooling_layer (bool): whether to add a pooling layer. + num_layers_of_embedded (int): number of layers of the embedded model. + Defaults to 1. + use_checkpoint (bool): whether to use gradient checkpointing. + Defaults to False. 
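+
+    The forward pass averages the last ``num_layers_of_embedded`` hidden
+    states of the underlying HuggingFace encoder and masks out padding.
+    A toy, tensor-only illustration of that pooling (the shapes are
+    invented for the example)::
+
+        import torch
+        hidden_states = [torch.randn(2, 7, 768) for _ in range(12)]
+        n = 1                                   # num_layers_of_embedded
+        feats = torch.stack(hidden_states[-n:], 1).mean(1) / n
+        mask = torch.ones(2, 7)
+        embedded = feats * mask.unsqueeze(-1).float()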
+ """ + + def __init__(self, + name: str, + add_pooling_layer: bool = False, + num_layers_of_embedded: int = 1, + use_checkpoint: bool = False): + super().__init__() + if BertConfig is None: + raise RuntimeError( + 'transformers is not installed, please install it by: ' + 'pip install transformers.') + config = BertConfig.from_pretrained(name) + config.gradient_checkpointing = use_checkpoint + # only encoder + self.model = HFBertModel.from_pretrained( + name, add_pooling_layer=add_pooling_layer, config=config) + self.language_dim = config.hidden_size + self.num_layers_of_embedded = num_layers_of_embedded + + def forward(self, x) -> dict: + mask = x['attention_mask'] + + outputs = self.model( + input_ids=x['input_ids'], + attention_mask=mask, + position_ids=x['position_ids'], + token_type_ids=x['token_type_ids'], + output_hidden_states=True, + ) + + # outputs has 13 layers, 1 input layer and 12 hidden layers + encoded_layers = outputs.hidden_states[1:] + features = torch.stack(encoded_layers[-self.num_layers_of_embedded:], + 1).mean(1) + # language embedding has shape [len(phrase), seq_len, language_dim] + features = features / self.num_layers_of_embedded + if mask.dim() == 2: + embedded = features * mask.unsqueeze(-1).float() + else: + embedded = features + + results = { + 'embedded': embedded, + 'masks': mask, + 'hidden': encoded_layers[-1] + } + return results diff --git a/mmdetection/mmdet/models/layers/__init__.py b/mmdetection/mmdet/models/layers/__init__.py new file mode 100644 index 0000000..e3c41f6 --- /dev/null +++ b/mmdetection/mmdet/models/layers/__init__.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .activations import SiLU +from .bbox_nms import fast_nms, multiclass_nms +from .brick_wrappers import (AdaptiveAvgPool2d, FrozenBatchNorm2d, + adaptive_avg_pool2d) +from .conv_upsample import ConvUpsample +from .csp_layer import CSPLayer +from .dropblock import DropBlock +from .ema import ExpMomentumEMA +from .inverted_residual import InvertedResidual +from .matrix_nms import mask_matrix_nms +from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder +from .normed_predictor import NormedConv2d, NormedLinear +from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder +from .positional_encoding import (LearnedPositionalEncoding, + SinePositionalEncoding, + SinePositionalEncoding3D) +from .res_layer import ResLayer, SimplifiedBasicBlock +from .se_layer import ChannelAttention, DyReLU, SELayer +# yapf: disable +from .transformer import (MLP, AdaptivePadding, CdnQueryGenerator, + ConditionalAttention, + ConditionalDetrTransformerDecoder, + ConditionalDetrTransformerDecoderLayer, + DABDetrTransformerDecoder, + DABDetrTransformerDecoderLayer, + DABDetrTransformerEncoder, DDQTransformerDecoder, + DeformableDetrTransformerDecoder, + DeformableDetrTransformerDecoderLayer, + DeformableDetrTransformerEncoder, + DeformableDetrTransformerEncoderLayer, + DetrTransformerDecoder, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DetrTransformerEncoderLayer, + DinoTransformerDecoder, DynamicConv, + Mask2FormerTransformerDecoder, + Mask2FormerTransformerDecoderLayer, + Mask2FormerTransformerEncoder, PatchEmbed, + PatchMerging, coordinate_to_encoding, + inverse_sigmoid, nchw_to_nlc, nlc_to_nchw) + +# yapf: enable + +__all__ = [ + 'fast_nms', 'multiclass_nms', 'mask_matrix_nms', 'DropBlock', + 'PixelDecoder', 'TransformerEncoderPixelDecoder', + 'MSDeformAttnPixelDecoder', 'ResLayer', 'PatchMerging', + 'SinePositionalEncoding', 
'LearnedPositionalEncoding', 'DynamicConv', + 'SimplifiedBasicBlock', 'NormedLinear', 'NormedConv2d', 'InvertedResidual', + 'SELayer', 'ConvUpsample', 'CSPLayer', 'adaptive_avg_pool2d', + 'AdaptiveAvgPool2d', 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'DyReLU', + 'ExpMomentumEMA', 'inverse_sigmoid', 'ChannelAttention', 'SiLU', 'MLP', + 'DetrTransformerEncoderLayer', 'DetrTransformerDecoderLayer', + 'DetrTransformerEncoder', 'DetrTransformerDecoder', + 'DeformableDetrTransformerEncoder', 'DeformableDetrTransformerDecoder', + 'DeformableDetrTransformerEncoderLayer', + 'DeformableDetrTransformerDecoderLayer', 'AdaptivePadding', + 'coordinate_to_encoding', 'ConditionalAttention', + 'DABDetrTransformerDecoderLayer', 'DABDetrTransformerDecoder', + 'DABDetrTransformerEncoder', 'DDQTransformerDecoder', + 'ConditionalDetrTransformerDecoder', + 'ConditionalDetrTransformerDecoderLayer', 'DinoTransformerDecoder', + 'CdnQueryGenerator', 'Mask2FormerTransformerEncoder', + 'Mask2FormerTransformerDecoderLayer', 'Mask2FormerTransformerDecoder', + 'SinePositionalEncoding3D', 'FrozenBatchNorm2d' +] diff --git a/mmdetection/mmdet/models/layers/activations.py b/mmdetection/mmdet/models/layers/activations.py new file mode 100644 index 0000000..9e73ef4 --- /dev/null +++ b/mmdetection/mmdet/models/layers/activations.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.utils import digit_version + +from mmdet.registry import MODELS + +if digit_version(torch.__version__) >= digit_version('1.7.0'): + from torch.nn import SiLU +else: + + class SiLU(nn.Module): + """Sigmoid Weighted Liner Unit.""" + + def __init__(self, inplace=True): + super().__init__() + + def forward(self, inputs) -> torch.Tensor: + return inputs * torch.sigmoid(inputs) + + +MODELS.register_module(module=SiLU, name='SiLU') diff --git a/mmdetection/mmdet/models/layers/bbox_nms.py b/mmdetection/mmdet/models/layers/bbox_nms.py new file mode 100644 index 0000000..fd67a45 --- /dev/null +++ b/mmdetection/mmdet/models/layers/bbox_nms.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import torch +from mmcv.ops.nms import batched_nms +from torch import Tensor + +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import ConfigType + + +def multiclass_nms( + multi_bboxes: Tensor, + multi_scores: Tensor, + score_thr: float, + nms_cfg: ConfigType, + max_num: int = -1, + score_factors: Optional[Tensor] = None, + return_inds: bool = False, + box_dim: int = 4 +) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]: + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_cfg (Union[:obj:`ConfigDict`, dict]): a dict that contains + the arguments of nms operations. + max_num (int, optional): if there are more than max_num bboxes after + NMS, only top max_num will be kept. Default to -1. + score_factors (Tensor, optional): The factors multiplied to scores + before applying NMS. Default to None. + return_inds (bool, optional): Whether return the indices of kept + bboxes. Default to False. + box_dim (int): The dimension of boxes. Defaults to 4. 
+ + Returns: + Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]: + (dets, labels, indices (optional)), tensors of shape (k, 5), + (k), and (k). Dets are boxes with scores. Labels are 0-based. + """ + num_classes = multi_scores.size(1) - 1 + # exclude background category + if multi_bboxes.shape[1] > box_dim: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, box_dim) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, box_dim) + + scores = multi_scores[:, :-1] + + labels = torch.arange(num_classes, dtype=torch.long, device=scores.device) + labels = labels.view(1, -1).expand_as(scores) + + bboxes = bboxes.reshape(-1, box_dim) + scores = scores.reshape(-1) + labels = labels.reshape(-1) + + if not torch.onnx.is_in_onnx_export(): + # NonZero not supported in TensorRT + # remove low scoring boxes + valid_mask = scores > score_thr + # multiply score_factor after threshold to preserve more bboxes, improve + # mAP by 1% for YOLOv3 + if score_factors is not None: + # expand the shape to match original shape of score + score_factors = score_factors.view(-1, 1).expand( + multi_scores.size(0), num_classes) + score_factors = score_factors.reshape(-1) + scores = scores * score_factors + + if not torch.onnx.is_in_onnx_export(): + # NonZero not supported in TensorRT + inds = valid_mask.nonzero(as_tuple=False).squeeze(1) + bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds] + else: + # TensorRT NMS plugin has invalid output filled with -1 + # add dummy data to make detection output correct. + bboxes = torch.cat([bboxes, bboxes.new_zeros(1, box_dim)], dim=0) + scores = torch.cat([scores, scores.new_zeros(1)], dim=0) + labels = torch.cat([labels, labels.new_zeros(1)], dim=0) + + if bboxes.numel() == 0: + if torch.onnx.is_in_onnx_export(): + raise RuntimeError('[ONNX Error] Can not record NMS ' + 'as it has not been executed this time') + dets = torch.cat([bboxes, scores[:, None]], -1) + if return_inds: + return dets, labels, inds + else: + return dets, labels + + dets, keep = batched_nms(bboxes, scores, labels, nms_cfg) + + if max_num > 0: + dets = dets[:max_num] + keep = keep[:max_num] + + if return_inds: + return dets, labels[keep], inds[keep] + else: + return dets, labels[keep] + + +def fast_nms( + multi_bboxes: Tensor, + multi_scores: Tensor, + multi_coeffs: Tensor, + score_thr: float, + iou_thr: float, + top_k: int, + max_num: int = -1 +) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]: + """Fast NMS in `YOLACT `_. + + Fast NMS allows already-removed detections to suppress other detections so + that every instance can be decided to be kept or discarded in parallel, + which is not possible in traditional NMS. This relaxation allows us to + implement Fast NMS entirely in standard GPU-accelerated matrix operations. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class+1), where the last column + contains scores of the background class, but this will be ignored. + multi_coeffs (Tensor): shape (n, #class*coeffs_dim). + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + iou_thr (float): IoU threshold to be considered as conflicted. + top_k (int): if there are more than top_k bboxes before NMS, + only top top_k will be kept. + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. If -1, keep all the bboxes. + Default: -1. 
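+
+    Note:
+        Suppression is computed for all classes in parallel: a detection is
+        kept only if no higher-scoring detection of the same class overlaps
+        it by more than ``iou_thr`` and its own score exceeds ``score_thr``.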
+ + Returns: + Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]: + (dets, labels, coefficients), tensors of shape (k, 5), (k, 1), + and (k, coeffs_dim). Dets are boxes with scores. + Labels are 0-based. + """ + + scores = multi_scores[:, :-1].t() # [#class, n] + scores, idx = scores.sort(1, descending=True) + + idx = idx[:, :top_k].contiguous() + scores = scores[:, :top_k] # [#class, topk] + num_classes, num_dets = idx.size() + boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4) + coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1) + + iou = bbox_overlaps(boxes, boxes) # [#class, topk, topk] + iou.triu_(diagonal=1) + iou_max, _ = iou.max(dim=1) + + # Now just filter out the ones higher than the threshold + keep = iou_max <= iou_thr + + # Second thresholding introduces 0.2 mAP gain at negligible time cost + keep *= scores > score_thr + + # Assign each kept detection to its corresponding class + classes = torch.arange( + num_classes, device=boxes.device)[:, None].expand_as(keep) + classes = classes[keep] + + boxes = boxes[keep] + coeffs = coeffs[keep] + scores = scores[keep] + + # Only keep the top max_num highest scores across all classes + scores, idx = scores.sort(0, descending=True) + if max_num > 0: + idx = idx[:max_num] + scores = scores[:max_num] + + classes = classes[idx] + boxes = boxes[idx] + coeffs = coeffs[idx] + + cls_dets = torch.cat([boxes, scores[:, None]], dim=1) + return cls_dets, classes, coeffs diff --git a/mmdetection/mmdet/models/layers/brick_wrappers.py b/mmdetection/mmdet/models/layers/brick_wrappers.py new file mode 100644 index 0000000..5ecb849 --- /dev/null +++ b/mmdetection/mmdet/models/layers/brick_wrappers.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn.bricks.wrappers import NewEmptyTensorOp, obsolete_torch_version + +from mmdet.registry import MODELS + +if torch.__version__ == 'parrots': + TORCH_VERSION = torch.__version__ +else: + # torch.__version__ could be 1.3.1+cu92, we only need the first two + # for comparison + TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) + + +def adaptive_avg_pool2d(input, output_size): + """Handle empty batch dimension to adaptive_avg_pool2d. + + Args: + input (tensor): 4D tensor. + output_size (int, tuple[int,int]): the target output size. 
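+
+    Returns:
+        Tensor: The pooled tensor. For an empty input on old PyTorch
+        versions an empty tensor with the expected output shape is returned
+        directly instead of calling ``F.adaptive_avg_pool2d``.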
+ """ + if input.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): + if isinstance(output_size, int): + output_size = [output_size, output_size] + output_size = [*input.shape[:2], *output_size] + empty = NewEmptyTensorOp.apply(input, output_size) + return empty + else: + return F.adaptive_avg_pool2d(input, output_size) + + +class AdaptiveAvgPool2d(nn.AdaptiveAvgPool2d): + """Handle empty batch dimension to AdaptiveAvgPool2d.""" + + def forward(self, x): + # PyTorch 1.9 does not support empty tensor inference yet + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): + output_size = self.output_size + if isinstance(output_size, int): + output_size = [output_size, output_size] + else: + output_size = [ + v if v is not None else d + for v, d in zip(output_size, + x.size()[-2:]) + ] + output_size = [*x.shape[:2], *output_size] + empty = NewEmptyTensorOp.apply(x, output_size) + return empty + + return super().forward(x) + + +# Modified from +# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py#L13 # noqa +@MODELS.register_module('FrozenBN') +class FrozenBatchNorm2d(nn.Module): + """BatchNorm2d where the batch statistics and the affine parameters are + fixed. + + It contains non-trainable buffers called + "weight" and "bias", "running_mean", "running_var", + initialized to perform identity transformation. + Args: + num_features (int): :math:`C` from an expected input of size + :math:`(N, C, H, W)`. + eps (float): a value added to the denominator for numerical stability. + Default: 1e-5 + """ + + def __init__(self, num_features, eps=1e-5, **kwargs): + super().__init__() + self.num_features = num_features + self.eps = eps + self.register_buffer('weight', torch.ones(num_features)) + self.register_buffer('bias', torch.zeros(num_features)) + self.register_buffer('running_mean', torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features) - eps) + + def forward(self, x): + if x.requires_grad: + # When gradients are needed, F.batch_norm will use extra memory + # because its backward op computes gradients for weight/bias + # as well. + scale = self.weight * (self.running_var + self.eps).rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + out_dtype = x.dtype # may be half + return x * scale.to(out_dtype) + bias.to(out_dtype) + else: + # When gradients are not needed, F.batch_norm is a single fused op + # and provide more optimization opportunities. + return F.batch_norm( + x, + self.running_mean, + self.running_var, + self.weight, + self.bias, + training=False, + eps=self.eps, + ) + + def __repr__(self): + return 'FrozenBatchNorm2d(num_features={}, eps={})'.format( + self.num_features, self.eps) + + @classmethod + def convert_frozen_batchnorm(cls, module): + """Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm. + + Args: + module (torch.nn.Module): + Returns: + If module is BatchNorm/SyncBatchNorm, returns a new module. + Otherwise, in-place convert module and return it. 
+ Similar to convert_sync_batchnorm in + https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py + """ + bn_module = nn.modules.batchnorm + bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm) + res = module + if isinstance(module, bn_module): + res = cls(module.num_features) + if module.affine: + res.weight.data = module.weight.data.clone().detach() + res.bias.data = module.bias.data.clone().detach() + res.running_mean.data = module.running_mean.data + res.running_var.data = module.running_var.data + res.eps = module.eps + else: + for name, child in module.named_children(): + new_child = cls.convert_frozen_batchnorm(child) + if new_child is not child: + res.add_module(name, new_child) + return res diff --git a/mmdetection/mmdet/models/layers/conv_upsample.py b/mmdetection/mmdet/models/layers/conv_upsample.py new file mode 100644 index 0000000..3250587 --- /dev/null +++ b/mmdetection/mmdet/models/layers/conv_upsample.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList + + +class ConvUpsample(BaseModule): + """ConvUpsample performs 2x upsampling after Conv. + + There are several `ConvModule` layers. In the first few layers, upsampling + will be applied after each layer of convolution. The number of upsampling + must be no more than the number of ConvModule layers. + + Args: + in_channels (int): Number of channels in the input feature map. + inner_channels (int): Number of channels produced by the convolution. + num_layers (int): Number of convolution layers. + num_upsample (int | optional): Number of upsampling layer. Must be no + more than num_layers. Upsampling will be applied after the first + ``num_upsample`` layers of convolution. Default: ``num_layers``. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + init_cfg (dict): Config dict for initialization. Default: None. + kwargs (key word augments): Other augments used in ConvModule. + """ + + def __init__(self, + in_channels, + inner_channels, + num_layers=1, + num_upsample=None, + conv_cfg=None, + norm_cfg=None, + init_cfg=None, + **kwargs): + super(ConvUpsample, self).__init__(init_cfg) + if num_upsample is None: + num_upsample = num_layers + assert num_upsample <= num_layers, \ + f'num_upsample({num_upsample})must be no more than ' \ + f'num_layers({num_layers})' + self.num_layers = num_layers + self.num_upsample = num_upsample + self.conv = ModuleList() + for i in range(num_layers): + self.conv.append( + ConvModule( + in_channels, + inner_channels, + 3, + padding=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + in_channels = inner_channels + + def forward(self, x): + num_upsample = self.num_upsample + for i in range(self.num_layers): + x = self.conv[i](x) + if num_upsample > 0: + num_upsample -= 1 + x = F.interpolate( + x, scale_factor=2, mode='bilinear', align_corners=False) + return x diff --git a/mmdetection/mmdet/models/layers/csp_layer.py b/mmdetection/mmdet/models/layers/csp_layer.py new file mode 100644 index 0000000..c8b547b --- /dev/null +++ b/mmdetection/mmdet/models/layers/csp_layer.py @@ -0,0 +1,246 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
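+# This module implements the Cross Stage Partial building blocks
+# (DarknetBottleneck, CSPNeXtBlock and CSPLayer) used by CSP-style
+# backbones and necks such as CSPDarknet and CSPNeXt.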
+import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .se_layer import ChannelAttention + + +class DarknetBottleneck(BaseModule): + """The basic bottleneck block used in Darknet. + + Each ResBlock consists of two ConvModules and the input is added to the + final output. Each ConvModule is composed of Conv, BN, and LeakyReLU. + The first convLayer has filter size of 1x1 and the second one has the + filter size of 3x3. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (float): The kernel size of the convolution. + Defaults to 0.5. + add_identity (bool): Whether to add identity to the out. + Defaults to True. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + add_identity: bool = True, + use_depthwise: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.conv1 = ConvModule( + in_channels, + hidden_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = conv( + hidden_channels, + out_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + identity = x + out = self.conv1(x) + out = self.conv2(out) + + if self.add_identity: + return out + identity + else: + return out + + +class CSPNeXtBlock(BaseModule): + """The basic bottleneck block used in CSPNeXt. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (float): Expand ratio of the hidden channel. Defaults to 0.5. + add_identity (bool): Whether to add identity to the out. Only works + when in_channels == out_channels. Defaults to True. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + kernel_size (int): The kernel size of the second convolution layer. + Defaults to 5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU'). + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. 
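+
+    Example:
+        A minimal illustrative sketch; the tensor shapes below are assumed
+        rather than taken from an existing test.
+
+        >>> import torch
+        >>> block = CSPNeXtBlock(64, 64)
+        >>> block(torch.rand(1, 64, 32, 32)).shape
+        torch.Size([1, 64, 32, 32])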
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + add_identity: bool = True, + use_depthwise: bool = False, + kernel_size: int = 5, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.conv1 = conv( + in_channels, + hidden_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = DepthwiseSeparableConvModule( + hidden_channels, + out_channels, + kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + identity = x + out = self.conv1(x) + out = self.conv2(out) + + if self.add_identity: + return out + identity + else: + return out + + +class CSPLayer(BaseModule): + """Cross Stage Partial Layer. + + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + num_blocks (int): Number of blocks. Defaults to 1. + add_identity (bool): Whether to add identity in blocks. + Defaults to True. + use_cspnext_block (bool): Whether to use CSPNeXt block. + Defaults to False. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish') + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. 
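+
+    Example:
+        A minimal illustrative sketch; the channel sizes and input shape
+        below are assumed rather than taken from an existing config.
+
+        >>> import torch
+        >>> layer = CSPLayer(64, 128, num_blocks=2)
+        >>> layer(torch.rand(1, 64, 32, 32)).shape
+        torch.Size([1, 128, 32, 32])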
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + num_blocks: int = 1, + add_identity: bool = True, + use_depthwise: bool = False, + use_cspnext_block: bool = False, + channel_attention: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck + mid_channels = int(out_channels * expand_ratio) + self.channel_attention = channel_attention + self.main_conv = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.short_conv = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.final_conv = ConvModule( + 2 * mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.Sequential(*[ + block( + mid_channels, + mid_channels, + 1.0, + add_identity, + use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) for _ in range(num_blocks) + ]) + if channel_attention: + self.attention = ChannelAttention(2 * mid_channels) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + x_short = self.short_conv(x) + + x_main = self.main_conv(x) + x_main = self.blocks(x_main) + + x_final = torch.cat((x_main, x_short), dim=1) + + if self.channel_attention: + x_final = self.attention(x_final) + return self.final_conv(x_final) diff --git a/mmdetection/mmdet/models/layers/dropblock.py b/mmdetection/mmdet/models/layers/dropblock.py new file mode 100644 index 0000000..7938199 --- /dev/null +++ b/mmdetection/mmdet/models/layers/dropblock.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.registry import MODELS + +eps = 1e-6 + + +@MODELS.register_module() +class DropBlock(nn.Module): + """Randomly drop some regions of feature maps. + + Please refer to the method proposed in `DropBlock + `_ for details. + + Args: + drop_prob (float): The probability of dropping each block. + block_size (int): The size of dropped blocks. + warmup_iters (int): The drop probability will linearly increase + from `0` to `drop_prob` during the first `warmup_iters` iterations. + Default: 2000. + """ + + def __init__(self, drop_prob, block_size, warmup_iters=2000, **kwargs): + super(DropBlock, self).__init__() + assert block_size % 2 == 1 + assert 0 < drop_prob <= 1 + assert warmup_iters >= 0 + self.drop_prob = drop_prob + self.block_size = block_size + self.warmup_iters = warmup_iters + self.iter_cnt = 0 + + def forward(self, x): + """ + Args: + x (Tensor): Input feature map on which some areas will be randomly + dropped. + + Returns: + Tensor: The tensor after DropBlock layer. 
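+
+        Example:
+            A minimal illustrative sketch; the drop probability, block size
+            and input shape below are assumed.
+
+            >>> import torch
+            >>> drop_block = DropBlock(0.1, 3, warmup_iters=0).train()
+            >>> drop_block(torch.rand(2, 8, 16, 16)).shape
+            torch.Size([2, 8, 16, 16])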
+ """ + if not self.training: + return x + self.iter_cnt += 1 + N, C, H, W = list(x.shape) + gamma = self._compute_gamma((H, W)) + mask_shape = (N, C, H - self.block_size + 1, W - self.block_size + 1) + mask = torch.bernoulli(torch.full(mask_shape, gamma, device=x.device)) + + mask = F.pad(mask, [self.block_size // 2] * 4, value=0) + mask = F.max_pool2d( + input=mask, + stride=(1, 1), + kernel_size=(self.block_size, self.block_size), + padding=self.block_size // 2) + mask = 1 - mask + x = x * mask * mask.numel() / (eps + mask.sum()) + return x + + def _compute_gamma(self, feat_size): + """Compute the value of gamma according to paper. gamma is the + parameter of bernoulli distribution, which controls the number of + features to drop. + + gamma = (drop_prob * fm_area) / (drop_area * keep_area) + + Args: + feat_size (tuple[int, int]): The height and width of feature map. + + Returns: + float: The value of gamma. + """ + gamma = (self.drop_prob * feat_size[0] * feat_size[1]) + gamma /= ((feat_size[0] - self.block_size + 1) * + (feat_size[1] - self.block_size + 1)) + gamma /= (self.block_size**2) + factor = (1.0 if self.iter_cnt > self.warmup_iters else self.iter_cnt / + self.warmup_iters) + return gamma * factor + + def extra_repr(self): + return (f'drop_prob={self.drop_prob}, block_size={self.block_size}, ' + f'warmup_iters={self.warmup_iters}') diff --git a/mmdetection/mmdet/models/layers/ema.py b/mmdetection/mmdet/models/layers/ema.py new file mode 100644 index 0000000..bce503c --- /dev/null +++ b/mmdetection/mmdet/models/layers/ema.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import torch +import torch.nn as nn +from mmengine.model import ExponentialMovingAverage +from torch import Tensor + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class ExpMomentumEMA(ExponentialMovingAverage): + """Exponential moving average (EMA) with exponential momentum strategy, + which is used in YOLOX. + + Args: + model (nn.Module): The model to be averaged. + momentum (float): The momentum used for updating ema parameter. + Ema's parameter are updated with the formula: + `averaged_param = (1-momentum) * averaged_param + momentum * + source_param`. Defaults to 0.0002. + gamma (int): Use a larger momentum early in training and gradually + annealing to a smaller value to update the ema model smoothly. The + momentum is calculated as + `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`. + Defaults to 2000. + interval (int): Interval between two updates. Defaults to 1. + device (torch.device, optional): If provided, the averaged model will + be stored on the :attr:`device`. Defaults to None. + update_buffers (bool): if True, it will compute running averages for + both the parameters and the buffers of the model. Defaults to + False. + """ + + def __init__(self, + model: nn.Module, + momentum: float = 0.0002, + gamma: int = 2000, + interval=1, + device: Optional[torch.device] = None, + update_buffers: bool = False) -> None: + super().__init__( + model=model, + momentum=momentum, + interval=interval, + device=device, + update_buffers=update_buffers) + assert gamma > 0, f'gamma must be greater than 0, but got {gamma}' + self.gamma = gamma + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int) -> None: + """Compute the moving average of the parameters using the exponential + momentum strategy. + + Args: + averaged_param (Tensor): The averaged parameters. 
+ source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + momentum = (1 - self.momentum) * math.exp( + -float(1 + steps) / self.gamma) + self.momentum + averaged_param.mul_(1 - momentum).add_(source_param, alpha=momentum) diff --git a/mmdetection/mmdet/models/layers/inverted_residual.py b/mmdetection/mmdet/models/layers/inverted_residual.py new file mode 100644 index 0000000..a174ccc --- /dev/null +++ b/mmdetection/mmdet/models/layers/inverted_residual.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import DropPath +from mmengine.model import BaseModule + +from .se_layer import SELayer + + +class InvertedResidual(BaseModule): + """Inverted Residual Block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + stride (int): The stride of the depthwise convolution. Default: 1. + se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. + Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + drop_path_rate (float): stochastic depth rate. Defaults to 0. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + drop_path_rate=0., + with_cp=False, + init_cfg=None): + super(InvertedResidual, self).__init__(init_cfg) + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' 
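+        # A residual shortcut is only used when stride == 1 and
+        # in_channels == out_channels; DropPath then randomly drops the
+        # transformed branch (stochastic depth) during training.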
+ self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=mid_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + self.drop_path(out) + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out diff --git a/mmdetection/mmdet/models/layers/matrix_nms.py b/mmdetection/mmdet/models/layers/matrix_nms.py new file mode 100644 index 0000000..9dc8c4f --- /dev/null +++ b/mmdetection/mmdet/models/layers/matrix_nms.py @@ -0,0 +1,121 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def mask_matrix_nms(masks, + labels, + scores, + filter_thr=-1, + nms_pre=-1, + max_num=-1, + kernel='gaussian', + sigma=2.0, + mask_area=None): + """Matrix NMS for multi-class masks. + + Args: + masks (Tensor): Has shape (num_instances, h, w) + labels (Tensor): Labels of corresponding masks, + has shape (num_instances,). + scores (Tensor): Mask scores of corresponding masks, + has shape (num_instances). + filter_thr (float): Score threshold to filter the masks + after matrix nms. Default: -1, which means do not + use filter_thr. + nms_pre (int): The max number of instances to do the matrix nms. + Default: -1, which means do not use nms_pre. + max_num (int, optional): If there are more than max_num masks after + matrix, only top max_num will be kept. Default: -1, which means + do not use max_num. + kernel (str): 'linear' or 'gaussian'. + sigma (float): std in gaussian method. + mask_area (Tensor): The sum of seg_masks. + + Returns: + tuple(Tensor): Processed mask results. + + - scores (Tensor): Updated scores, has shape (n,). + - labels (Tensor): Remained labels, has shape (n,). + - masks (Tensor): Remained masks, has shape (n, w, h). + - keep_inds (Tensor): The indices number of + the remaining mask in the input mask, has shape (n,). 
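+
+    Example:
+        A minimal illustrative sketch; the masks, labels and scores below
+        are random placeholders.
+
+        >>> import torch
+        >>> masks = torch.rand(3, 28, 28) > 0.5
+        >>> labels = torch.tensor([0, 0, 1])
+        >>> scores = torch.tensor([0.9, 0.8, 0.7])
+        >>> scores, labels, masks, keep_inds = mask_matrix_nms(
+        ...     masks, labels, scores)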
+ """ + assert len(labels) == len(masks) == len(scores) + if len(labels) == 0: + return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros( + 0, *masks.shape[-2:]), labels.new_zeros(0) + if mask_area is None: + mask_area = masks.sum((1, 2)).float() + else: + assert len(masks) == len(mask_area) + + # sort and keep top nms_pre + scores, sort_inds = torch.sort(scores, descending=True) + + keep_inds = sort_inds + if nms_pre > 0 and len(sort_inds) > nms_pre: + sort_inds = sort_inds[:nms_pre] + keep_inds = keep_inds[:nms_pre] + scores = scores[:nms_pre] + masks = masks[sort_inds] + mask_area = mask_area[sort_inds] + labels = labels[sort_inds] + + num_masks = len(labels) + flatten_masks = masks.reshape(num_masks, -1).float() + # inter. + inter_matrix = torch.mm(flatten_masks, flatten_masks.transpose(1, 0)) + expanded_mask_area = mask_area.expand(num_masks, num_masks) + # Upper triangle iou matrix. + iou_matrix = (inter_matrix / + (expanded_mask_area + expanded_mask_area.transpose(1, 0) - + inter_matrix)).triu(diagonal=1) + # label_specific matrix. + expanded_labels = labels.expand(num_masks, num_masks) + # Upper triangle label matrix. + label_matrix = (expanded_labels == expanded_labels.transpose( + 1, 0)).triu(diagonal=1) + + # IoU compensation + compensate_iou, _ = (iou_matrix * label_matrix).max(0) + compensate_iou = compensate_iou.expand(num_masks, + num_masks).transpose(1, 0) + + # IoU decay + decay_iou = iou_matrix * label_matrix + + # Calculate the decay_coefficient + if kernel == 'gaussian': + decay_matrix = torch.exp(-1 * sigma * (decay_iou**2)) + compensate_matrix = torch.exp(-1 * sigma * (compensate_iou**2)) + decay_coefficient, _ = (decay_matrix / compensate_matrix).min(0) + elif kernel == 'linear': + decay_matrix = (1 - decay_iou) / (1 - compensate_iou) + decay_coefficient, _ = decay_matrix.min(0) + else: + raise NotImplementedError( + f'{kernel} kernel is not supported in matrix nms!') + # update the score. + scores = scores * decay_coefficient + + if filter_thr > 0: + keep = scores >= filter_thr + keep_inds = keep_inds[keep] + if not keep.any(): + return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros( + 0, *masks.shape[-2:]), labels.new_zeros(0) + masks = masks[keep] + scores = scores[keep] + labels = labels[keep] + + # sort and keep top max_num + scores, sort_inds = torch.sort(scores, descending=True) + keep_inds = keep_inds[sort_inds] + if max_num > 0 and len(sort_inds) > max_num: + sort_inds = sort_inds[:max_num] + keep_inds = keep_inds[:max_num] + scores = scores[:max_num] + masks = masks[sort_inds] + labels = labels[sort_inds] + + return scores, labels, masks, keep_inds diff --git a/mmdetection/mmdet/models/layers/msdeformattn_pixel_decoder.py b/mmdetection/mmdet/models/layers/msdeformattn_pixel_decoder.py new file mode 100644 index 0000000..a67dc3c --- /dev/null +++ b/mmdetection/mmdet/models/layers/msdeformattn_pixel_decoder.py @@ -0,0 +1,246 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
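+# This pixel decoder flattens the top `num_encoder_levels` feature maps,
+# refines them with a multi-scale deformable-attention encoder and fuses the
+# remaining levels through an FPN-style top-down path, as used by
+# Mask2Former-style heads.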
+from typing import List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, ConvModule +from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention +from mmengine.model import (BaseModule, ModuleList, caffe2_xavier_init, + normal_init, xavier_init) +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from ..task_modules.prior_generators import MlvlPointGenerator +from .positional_encoding import SinePositionalEncoding +from .transformer import Mask2FormerTransformerEncoder + + +@MODELS.register_module() +class MSDeformAttnPixelDecoder(BaseModule): + """Pixel decoder with multi-scale deformable attention. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + strides (list[int] | tuple[int]): Output strides of feature from + backbone. + feat_channels (int): Number of channels for feature. + out_channels (int): Number of channels for output. + num_outs (int): Number of output scales. + norm_cfg (:obj:`ConfigDict` or dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (:obj:`ConfigDict` or dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (:obj:`ConfigDict` or dict): Config for transformer + encoder. Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer encoder position encoding. Defaults to + dict(num_feats=128, normalize=True). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. + """ + + def __init__(self, + in_channels: Union[List[int], + Tuple[int]] = [256, 512, 1024, 2048], + strides: Union[List[int], Tuple[int]] = [4, 8, 16, 32], + feat_channels: int = 256, + out_channels: int = 256, + num_outs: int = 3, + norm_cfg: ConfigType = dict(type='GN', num_groups=32), + act_cfg: ConfigType = dict(type='ReLU'), + encoder: ConfigType = None, + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.strides = strides + self.num_input_levels = len(in_channels) + self.num_encoder_levels = \ + encoder.layer_cfg.self_attn_cfg.num_levels + assert self.num_encoder_levels >= 1, \ + 'num_levels in attn_cfgs must be at least one' + input_conv_list = [] + # from top to down (low to high resolution) + for i in range(self.num_input_levels - 1, + self.num_input_levels - self.num_encoder_levels - 1, + -1): + input_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True) + input_conv_list.append(input_conv) + self.input_convs = ModuleList(input_conv_list) + + self.encoder = Mask2FormerTransformerEncoder(**encoder) + self.postional_encoding = SinePositionalEncoding(**positional_encoding) + # high resolution to low resolution + self.level_encoding = nn.Embedding(self.num_encoder_levels, + feat_channels) + + # fpn-like structure + self.lateral_convs = ModuleList() + self.output_convs = ModuleList() + self.use_bias = norm_cfg is None + # from top to down (low to high resolution) + # fpn for the rest features that didn't pass in encoder + for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1, + -1): + lateral_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=None) + output_conv = ConvModule( + 
feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.lateral_convs.append(lateral_conv) + self.output_convs.append(output_conv) + + self.mask_feature = Conv2d( + feat_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.num_outs = num_outs + self.point_generator = MlvlPointGenerator(strides) + + def init_weights(self) -> None: + """Initialize weights.""" + for i in range(0, self.num_encoder_levels): + xavier_init( + self.input_convs[i].conv, + gain=1, + bias=0, + distribution='uniform') + + for i in range(0, self.num_input_levels - self.num_encoder_levels): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + + normal_init(self.level_encoding, mean=0, std=1) + for p in self.encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + # init_weights defined in MultiScaleDeformableAttention + for m in self.encoder.layers.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + + def forward(self, feats: List[Tensor]) -> Tuple[Tensor, Tensor]: + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + + Returns: + tuple: A tuple containing the following: + + - mask_feature (Tensor): shape (batch_size, c, h, w). + - multi_scale_features (list[Tensor]): Multi scale \ + features, each in shape (batch_size, c, h, w). + """ + # generate padding mask for each level, for each image + batch_size = feats[0].shape[0] + encoder_input_list = [] + padding_mask_list = [] + level_positional_encoding_list = [] + spatial_shapes = [] + reference_points_list = [] + for i in range(self.num_encoder_levels): + level_idx = self.num_input_levels - i - 1 + feat = feats[level_idx] + feat_projected = self.input_convs[i](feat) + feat_hw = torch._shape_as_tensor(feat)[2:].to(feat.device) + + # no padding + padding_mask_resized = feat.new_zeros( + (batch_size, ) + feat.shape[-2:], dtype=torch.bool) + pos_embed = self.postional_encoding(padding_mask_resized) + level_embed = self.level_encoding.weight[i] + level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed + # (h_i * w_i, 2) + reference_points = self.point_generator.single_level_grid_priors( + feat.shape[-2:], level_idx, device=feat.device) + # normalize + feat_wh = feat_hw.unsqueeze(0).flip(dims=[0, 1]) + factor = feat_wh * self.strides[level_idx] + reference_points = reference_points / factor + + # shape (batch_size, c, h_i, w_i) -> (h_i * w_i, batch_size, c) + feat_projected = feat_projected.flatten(2).permute(0, 2, 1) + level_pos_embed = level_pos_embed.flatten(2).permute(0, 2, 1) + padding_mask_resized = padding_mask_resized.flatten(1) + + encoder_input_list.append(feat_projected) + padding_mask_list.append(padding_mask_resized) + level_positional_encoding_list.append(level_pos_embed) + spatial_shapes.append(feat_hw) + reference_points_list.append(reference_points) + # shape (batch_size, total_num_queries), + # total_num_queries=sum([., h_i * w_i,.]) + padding_masks = torch.cat(padding_mask_list, dim=1) + # shape (total_num_queries, batch_size, c) + encoder_inputs = torch.cat(encoder_input_list, dim=1) + level_positional_encodings = torch.cat( + level_positional_encoding_list, dim=1) + # shape (num_encoder_levels, 2), from low + # resolution to high resolution + num_queries_per_level = [e[0] * e[1] for e in spatial_shapes] + spatial_shapes = 
torch.cat(spatial_shapes).view(-1, 2) + # shape (0, h_0*w_0, h_0*w_0+h_1*w_1, ...) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = torch.cat(reference_points_list, dim=0) + reference_points = reference_points[None, :, None].repeat( + batch_size, 1, self.num_encoder_levels, 1) + valid_radios = reference_points.new_ones( + (batch_size, self.num_encoder_levels, 2)) + # shape (num_total_queries, batch_size, c) + memory = self.encoder( + query=encoder_inputs, + query_pos=level_positional_encodings, + key_padding_mask=padding_masks, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_radios) + # (batch_size, c, num_total_queries) + memory = memory.permute(0, 2, 1) + + # from low resolution to high resolution + outs = torch.split(memory, num_queries_per_level, dim=-1) + outs = [ + x.reshape(batch_size, -1, spatial_shapes[i][0], + spatial_shapes[i][1]) for i, x in enumerate(outs) + ] + + for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1, + -1): + x = feats[i] + cur_feat = self.lateral_convs[i](x) + y = cur_feat + F.interpolate( + outs[-1], + size=cur_feat.shape[-2:], + mode='bilinear', + align_corners=False) + y = self.output_convs[i](y) + outs.append(y) + multi_scale_features = outs[:self.num_outs] + + mask_feature = self.mask_feature(outs[-1]) + return mask_feature, multi_scale_features diff --git a/mmdetection/mmdet/models/layers/normed_predictor.py b/mmdetection/mmdet/models/layers/normed_predictor.py new file mode 100644 index 0000000..592194b --- /dev/null +++ b/mmdetection/mmdet/models/layers/normed_predictor.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.utils import digit_version +from torch import Tensor + +from mmdet.registry import MODELS + +MODELS.register_module('Linear', module=nn.Linear) + + +@MODELS.register_module(name='NormedLinear') +class NormedLinear(nn.Linear): + """Normalized Linear Layer. + + Args: + tempeature (float, optional): Tempeature term. Defaults to 20. + power (int, optional): Power term. Defaults to 1.0. + eps (float, optional): The minimal value of divisor to + keep numerical stability. Defaults to 1e-6. + """ + + def __init__(self, + *args, + tempearture: float = 20, + power: int = 1.0, + eps: float = 1e-6, + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.tempearture = tempearture + self.power = power + self.eps = eps + self.init_weights() + + def init_weights(self) -> None: + """Initialize the weights.""" + nn.init.normal_(self.weight, mean=0, std=0.01) + if self.bias is not None: + nn.init.constant_(self.bias, 0) + + def forward(self, x: Tensor) -> Tensor: + """Forward function for `NormedLinear`.""" + weight_ = self.weight / ( + self.weight.norm(dim=1, keepdim=True).pow(self.power) + self.eps) + x_ = x / (x.norm(dim=1, keepdim=True).pow(self.power) + self.eps) + x_ = x_ * self.tempearture + + return F.linear(x_, weight_, self.bias) + + +@MODELS.register_module(name='NormedConv2d') +class NormedConv2d(nn.Conv2d): + """Normalized Conv2d Layer. + + Args: + tempeature (float, optional): Tempeature term. Defaults to 20. + power (int, optional): Power term. Defaults to 1.0. + eps (float, optional): The minimal value of divisor to + keep numerical stability. Defaults to 1e-6. + norm_over_kernel (bool, optional): Normalize over kernel. + Defaults to False. 
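+
+    Example:
+        A minimal illustrative sketch; the channel sizes and input shape
+        below are assumed.
+
+        >>> import torch
+        >>> conv = NormedConv2d(3, 8, kernel_size=3, padding=1)
+        >>> conv(torch.rand(1, 3, 32, 32)).shape
+        torch.Size([1, 8, 32, 32])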
+ """ + + def __init__(self, + *args, + tempearture: float = 20, + power: int = 1.0, + eps: float = 1e-6, + norm_over_kernel: bool = False, + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.tempearture = tempearture + self.power = power + self.norm_over_kernel = norm_over_kernel + self.eps = eps + + def forward(self, x: Tensor) -> Tensor: + """Forward function for `NormedConv2d`.""" + if not self.norm_over_kernel: + weight_ = self.weight / ( + self.weight.norm(dim=1, keepdim=True).pow(self.power) + + self.eps) + else: + weight_ = self.weight / ( + self.weight.view(self.weight.size(0), -1).norm( + dim=1, keepdim=True).pow(self.power)[..., None, None] + + self.eps) + x_ = x / (x.norm(dim=1, keepdim=True).pow(self.power) + self.eps) + x_ = x_ * self.tempearture + + if hasattr(self, 'conv2d_forward'): + x_ = self.conv2d_forward(x_, weight_) + else: + if digit_version(torch.__version__) >= digit_version('1.8'): + x_ = self._conv_forward(x_, weight_, self.bias) + else: + x_ = self._conv_forward(x_, weight_) + return x_ diff --git a/mmdetection/mmdet/models/layers/pixel_decoder.py b/mmdetection/mmdet/models/layers/pixel_decoder.py new file mode 100644 index 0000000..fb61434 --- /dev/null +++ b/mmdetection/mmdet/models/layers/pixel_decoder.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, ConvModule +from mmengine.model import BaseModule, ModuleList, caffe2_xavier_init +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from .positional_encoding import SinePositionalEncoding +from .transformer import DetrTransformerEncoder + + +@MODELS.register_module() +class PixelDecoder(BaseModule): + """Pixel decoder with a structure like fpn. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + feat_channels (int): Number channels for feature. + out_channels (int): Number channels for output. + norm_cfg (:obj:`ConfigDict` or dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (:obj:`ConfigDict` or dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (:obj:`ConfigDict` or dict): Config for transorformer + encoder.Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer encoder position encoding. Defaults to + dict(type='SinePositionalEncoding', num_feats=128, + normalize=True). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
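+
+    Note:
+        This base class does not accept ``encoder`` or
+        ``positional_encoding`` arguments in ``__init__``; they are only
+        consumed by the :class:`TransformerEncoderPixelDecoder` subclass
+        below.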
+ """ + + def __init__(self, + in_channels: Union[List[int], Tuple[int]], + feat_channels: int, + out_channels: int, + norm_cfg: ConfigType = dict(type='GN', num_groups=32), + act_cfg: ConfigType = dict(type='ReLU'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_inputs = len(in_channels) + self.lateral_convs = ModuleList() + self.output_convs = ModuleList() + self.use_bias = norm_cfg is None + for i in range(0, self.num_inputs - 1): + lateral_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=None) + output_conv = ConvModule( + feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.lateral_convs.append(lateral_conv) + self.output_convs.append(output_conv) + + self.last_feat_conv = ConvModule( + in_channels[-1], + feat_channels, + kernel_size=3, + padding=1, + stride=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.mask_feature = Conv2d( + feat_channels, out_channels, kernel_size=3, stride=1, padding=1) + + def init_weights(self) -> None: + """Initialize weights.""" + for i in range(0, self.num_inputs - 2): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + caffe2_xavier_init(self.last_feat_conv, bias=0) + + def forward(self, feats: List[Tensor], + batch_img_metas: List[dict]) -> Tuple[Tensor, Tensor]: + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + batch_img_metas (list[dict]): List of image information. + Pass in for creating more accurate padding mask. Not + used here. + + Returns: + tuple[Tensor, Tensor]: a tuple containing the following: + + - mask_feature (Tensor): Shape (batch_size, c, h, w). + - memory (Tensor): Output of last stage of backbone.\ + Shape (batch_size, c, h, w). + """ + y = self.last_feat_conv(feats[-1]) + for i in range(self.num_inputs - 2, -1, -1): + x = feats[i] + cur_feat = self.lateral_convs[i](x) + y = cur_feat + \ + F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest') + y = self.output_convs[i](y) + + mask_feature = self.mask_feature(y) + memory = feats[-1] + return mask_feature, memory + + +@MODELS.register_module() +class TransformerEncoderPixelDecoder(PixelDecoder): + """Pixel decoder with transormer encoder inside. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + feat_channels (int): Number channels for feature. + out_channels (int): Number channels for output. + norm_cfg (:obj:`ConfigDict` or dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (:obj:`ConfigDict` or dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (:obj:`ConfigDict` or dict): Config for transformer encoder. + Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer encoder position encoding. Defaults to + dict(num_feats=128, normalize=True). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
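+
+    Note:
+        Unlike the base class, the last backbone feature map is first
+        projected and refined by a ``DetrTransformerEncoder`` before the
+        top-down fusion, so ``last_feat_conv`` from the base class is not
+        used here.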
+ """ + + def __init__(self, + in_channels: Union[List[int], Tuple[int]], + feat_channels: int, + out_channels: int, + norm_cfg: ConfigType = dict(type='GN', num_groups=32), + act_cfg: ConfigType = dict(type='ReLU'), + encoder: ConfigType = None, + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + self.last_feat_conv = None + + self.encoder = DetrTransformerEncoder(**encoder) + self.encoder_embed_dims = self.encoder.embed_dims + assert self.encoder_embed_dims == feat_channels, 'embed_dims({}) of ' \ + 'tranformer encoder must equal to feat_channels({})'.format( + feat_channels, self.encoder_embed_dims) + self.positional_encoding = SinePositionalEncoding( + **positional_encoding) + self.encoder_in_proj = Conv2d( + in_channels[-1], feat_channels, kernel_size=1) + self.encoder_out_proj = ConvModule( + feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def init_weights(self) -> None: + """Initialize weights.""" + for i in range(0, self.num_inputs - 2): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + caffe2_xavier_init(self.encoder_in_proj, bias=0) + caffe2_xavier_init(self.encoder_out_proj.conv, bias=0) + + for p in self.encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, feats: List[Tensor], + batch_img_metas: List[dict]) -> Tuple[Tensor, Tensor]: + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + batch_img_metas (list[dict]): List of image information. Pass in + for creating more accurate padding mask. + + Returns: + tuple: a tuple containing the following: + + - mask_feature (Tensor): shape (batch_size, c, h, w). + - memory (Tensor): shape (batch_size, c, h, w). 
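+
+        Note:
+            ``memory`` here is the transformer encoder output reshaped back
+            to (batch_size, c, h, w), not the raw backbone feature returned
+            by the base class.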
+ """ + feat_last = feats[-1] + bs, c, h, w = feat_last.shape + input_img_h, input_img_w = batch_img_metas[0]['batch_input_shape'] + padding_mask = feat_last.new_ones((bs, input_img_h, input_img_w), + dtype=torch.float32) + for i in range(bs): + img_h, img_w = batch_img_metas[i]['img_shape'] + padding_mask[i, :img_h, :img_w] = 0 + padding_mask = F.interpolate( + padding_mask.unsqueeze(1), + size=feat_last.shape[-2:], + mode='nearest').to(torch.bool).squeeze(1) + + pos_embed = self.positional_encoding(padding_mask) + feat_last = self.encoder_in_proj(feat_last) + # (batch_size, c, h, w) -> (batch_size, num_queries, c) + feat_last = feat_last.flatten(2).permute(0, 2, 1) + pos_embed = pos_embed.flatten(2).permute(0, 2, 1) + # (batch_size, h, w) -> (batch_size, h*w) + padding_mask = padding_mask.flatten(1) + memory = self.encoder( + query=feat_last, + query_pos=pos_embed, + key_padding_mask=padding_mask) + # (batch_size, num_queries, c) -> (batch_size, c, h, w) + memory = memory.permute(0, 2, 1).view(bs, self.encoder_embed_dims, h, + w) + y = self.encoder_out_proj(memory) + for i in range(self.num_inputs - 2, -1, -1): + x = feats[i] + cur_feat = self.lateral_convs[i](x) + y = cur_feat + \ + F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest') + y = self.output_convs[i](y) + + mask_feature = self.mask_feature(y) + return mask_feature, memory diff --git a/mmdetection/mmdet/models/layers/positional_encoding.py b/mmdetection/mmdet/models/layers/positional_encoding.py new file mode 100644 index 0000000..87080d8 --- /dev/null +++ b/mmdetection/mmdet/models/layers/positional_encoding.py @@ -0,0 +1,269 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import torch +import torch.nn as nn +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig, OptMultiConfig + + +@MODELS.register_module() +class SinePositionalEncoding(BaseModule): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Defaults to None + """ + + def __init__(self, + num_feats: int, + temperature: int = 10000, + normalize: bool = False, + scale: float = 2 * math.pi, + eps: float = 1e-6, + offset: float = 0., + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask: Tensor, input: Optional[Tensor] = None) -> Tensor: + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + input (Tensor, optional): Input image/feature Tensor. + Shape [bs, c, h, w] + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + assert not (mask is None and input is None) + + if mask is not None: + B, H, W = mask.size() + device = mask.device + # For convenience of exporting to ONNX, + # it's required to convert + # `masks` from bool to int. + mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + else: + # single image or batch image with no padding + B, _, H, W = input.shape + device = input.device + x_embed = torch.arange( + 1, W + 1, dtype=torch.float32, device=device) + x_embed = x_embed.view(1, 1, -1).repeat(B, H, 1) + y_embed = torch.arange( + 1, H + 1, dtype=torch.float32, device=device) + y_embed = y_embed.view(1, -1, 1).repeat(B, 1, W) + if self.normalize: + y_embed = (y_embed + self.offset) / \ + (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self) -> str: + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str + + +@MODELS.register_module() +class LearnedPositionalEncoding(BaseModule): + """Position embedding with learnable embedding weights. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Defaults to 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Defaults to 50. + init_cfg (dict or list[dict], optional): Initialization config dict. 
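+
+    Example:
+        A minimal illustrative sketch; the feature dimension and mask shape
+        below are assumed.
+
+        >>> import torch
+        >>> pos_enc = LearnedPositionalEncoding(num_feats=128)
+        >>> mask = torch.zeros(1, 32, 32)
+        >>> pos_enc(mask).shape
+        torch.Size([1, 256, 32, 32])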
+ """ + + def __init__( + self, + num_feats: int, + row_num_embed: int = 50, + col_num_embed: int = 50, + init_cfg: MultiConfig = dict(type='Uniform', layer='Embedding') + ) -> None: + super().__init__(init_cfg=init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask: Tensor) -> Tensor: + """Forward function for `LearnedPositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = torch.cat( + (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat( + 1, w, 1)), + dim=-1).permute(2, 0, + 1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1) + return pos + + def __repr__(self) -> str: + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'row_num_embed={self.row_num_embed}, ' + repr_str += f'col_num_embed={self.col_num_embed})' + return repr_str + + +@MODELS.register_module() +class SinePositionalEncoding3D(SinePositionalEncoding): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def forward(self, mask: Tensor) -> Tensor: + """Forward function for `SinePositionalEncoding3D`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, t, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + assert mask.dim() == 4,\ + f'{mask.shape} should be a 4-dimensional Tensor,' \ + f' got {mask.dim()}-dimensional Tensor instead ' + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. 
+ mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + z_embed = not_mask.cumsum(1, dtype=torch.float32) + y_embed = not_mask.cumsum(2, dtype=torch.float32) + x_embed = not_mask.cumsum(3, dtype=torch.float32) + if self.normalize: + z_embed = (z_embed + self.offset) / \ + (z_embed[:, -1:, :, :] + self.eps) * self.scale + y_embed = (y_embed + self.offset) / \ + (y_embed[:, :, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + + dim_t_z = torch.arange((self.num_feats * 2), + dtype=torch.float32, + device=mask.device) + dim_t_z = self.temperature**(2 * (dim_t_z // 2) / (self.num_feats * 2)) + + pos_x = x_embed[:, :, :, :, None] / dim_t + pos_y = y_embed[:, :, :, :, None] / dim_t + pos_z = z_embed[:, :, :, :, None] / dim_t_z + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, T, H, W = mask.size() + pos_x = torch.stack( + (pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos_y = torch.stack( + (pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos_z = torch.stack( + (pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) + return pos diff --git a/mmdetection/mmdet/models/layers/res_layer.py b/mmdetection/mmdet/models/layers/res_layer.py new file mode 100644 index 0000000..ff24d3e --- /dev/null +++ b/mmdetection/mmdet/models/layers/res_layer.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule, Sequential +from torch import Tensor +from torch import nn as nn + +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + + +class ResLayer(Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Defaults to 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Defaults to False + conv_cfg (dict): dictionary to construct and config conv layer. + Defaults to None + norm_cfg (dict): dictionary to construct and config norm layer. + Defaults to dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. 
Defaults to True + """ + + def __init__(self, + block: BaseModule, + inplanes: int, + planes: int, + num_blocks: int, + stride: int = 1, + avg_down: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + downsample_first: bool = True, + **kwargs) -> None: + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + inplanes = planes * block.expansion + for _ in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + else: # downsample_first=False is for HourglassModule + for _ in range(num_blocks - 1): + layers.append( + block( + inplanes=inplanes, + planes=inplanes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + super().__init__(*layers) + + +class SimplifiedBasicBlock(BaseModule): + """Simplified version of original basic residual block. This is used in + `SCNet `_. + + - Norm layer is now optional + - Last ReLU in forward function is removed + """ + expansion = 1 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[Sequential] = None, + style: ConfigType = 'pytorch', + with_cp: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + dcn: OptConfigType = None, + plugins: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + assert not with_cp, 'Not implemented yet.' 
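+        # Norm layers are optional here: when `norm_cfg` is given, conv1 and
+        # conv2 are built without bias and norm1/norm2 are registered right
+        # after them; when `norm_cfg` is None, the convolutions keep their
+        # bias terms and no norm layers are created.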
+ self.with_norm = norm_cfg is not None + with_bias = True if norm_cfg is None else False + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=with_bias) + if self.with_norm: + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, planes, postfix=1) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=with_bias) + if self.with_norm: + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, planes, postfix=2) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self) -> Optional[BaseModule]: + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) if self.with_norm else None + + @property + def norm2(self) -> Optional[BaseModule]: + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) if self.with_norm else None + + def forward(self, x: Tensor) -> Tensor: + """Forward function for SimplifiedBasicBlock.""" + + identity = x + + out = self.conv1(x) + if self.with_norm: + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + if self.with_norm: + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out diff --git a/mmdetection/mmdet/models/layers/se_layer.py b/mmdetection/mmdet/models/layers/se_layer.py new file mode 100644 index 0000000..5598dab --- /dev/null +++ b/mmdetection/mmdet/models/layers/se_layer.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from mmengine.utils import digit_version, is_tuple_of +from torch import Tensor + +from mmdet.utils import MultiConfig, OptConfigType, OptMultiConfig + + +class SELayer(BaseModule): + """Squeeze-and-Excitation Module. + + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Defaults to 16. + conv_cfg (None or dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Defaults to (dict(type='ReLU'), dict(type='Sigmoid')) + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Defaults to None + """ + + def __init__(self, + channels: int, + ratio: int = 16, + conv_cfg: OptConfigType = None, + act_cfg: MultiConfig = (dict(type='ReLU'), + dict(type='Sigmoid')), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x: Tensor) -> Tensor: + """Forward function for SELayer.""" + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out + + +class DyReLU(BaseModule): + """Dynamic ReLU (DyReLU) module. + + See `Dynamic ReLU `_ for details. + Current implementation is specialized for task-aware attention in DyHead. + HSigmoid arguments in default act_cfg follow DyHead official code. + https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py + + Args: + channels (int): The input (and output) channels of DyReLU module. + ratio (int): Squeeze ratio in Squeeze-and-Excitation-like module, + the intermediate channel will be ``int(channels/ratio)``. + Defaults to 4. + conv_cfg (None or dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Defaults to (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, + divisor=6.0)) + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None + """ + + def __init__(self, + channels: int, + ratio: int = 4, + conv_cfg: OptConfigType = None, + act_cfg: MultiConfig = (dict(type='ReLU'), + dict( + type='HSigmoid', + bias=3.0, + divisor=6.0)), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert is_tuple_of(act_cfg, dict) + self.channels = channels + self.expansion = 4 # for a1, b1, a2, b2 + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels * self.expansion, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + coeffs = self.global_avgpool(x) + coeffs = self.conv1(coeffs) + coeffs = self.conv2(coeffs) - 0.5 # value range: [-0.5, 0.5] + a1, b1, a2, b2 = torch.split(coeffs, self.channels, dim=1) + a1 = a1 * 2.0 + 1.0 # [-1.0, 1.0] + 1.0 + a2 = a2 * 2.0 # [-1.0, 1.0] + out = torch.max(x * a1 + b1, x * a2 + b2) + return out + + +class ChannelAttention(BaseModule): + """Channel attention Module. + + Args: + channels (int): The input (and output) channels of the attention layer. 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None + """ + + def __init__(self, channels: int, init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True) + if digit_version(torch.__version__) < (1, 7, 0): + self.act = nn.Hardsigmoid() + else: + self.act = nn.Hardsigmoid(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward function for ChannelAttention.""" + with torch.cuda.amp.autocast(enabled=False): + out = self.global_avgpool(x) + out = self.fc(out) + out = self.act(out) + return x * out diff --git a/mmdetection/mmdet/models/layers/transformer/__init__.py b/mmdetection/mmdet/models/layers/transformer/__init__.py new file mode 100644 index 0000000..839d936 --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .conditional_detr_layers import (ConditionalDetrTransformerDecoder, + ConditionalDetrTransformerDecoderLayer) +from .dab_detr_layers import (DABDetrTransformerDecoder, + DABDetrTransformerDecoderLayer, + DABDetrTransformerEncoder) +from .ddq_detr_layers import DDQTransformerDecoder +from .deformable_detr_layers import (DeformableDetrTransformerDecoder, + DeformableDetrTransformerDecoderLayer, + DeformableDetrTransformerEncoder, + DeformableDetrTransformerEncoderLayer) +from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DetrTransformerEncoderLayer) +from .dino_layers import CdnQueryGenerator, DinoTransformerDecoder +from .grounding_dino_layers import (GroundingDinoTransformerDecoder, + GroundingDinoTransformerDecoderLayer, + GroundingDinoTransformerEncoder) +from .mask2former_layers import (Mask2FormerTransformerDecoder, + Mask2FormerTransformerDecoderLayer, + Mask2FormerTransformerEncoder) +from .utils import (MLP, AdaptivePadding, ConditionalAttention, DynamicConv, + PatchEmbed, PatchMerging, coordinate_to_encoding, + inverse_sigmoid, nchw_to_nlc, nlc_to_nchw) + +__all__ = [ + 'nlc_to_nchw', 'nchw_to_nlc', 'AdaptivePadding', 'PatchEmbed', + 'PatchMerging', 'inverse_sigmoid', 'DynamicConv', 'MLP', + 'DetrTransformerEncoder', 'DetrTransformerDecoder', + 'DetrTransformerEncoderLayer', 'DetrTransformerDecoderLayer', + 'DeformableDetrTransformerEncoder', 'DeformableDetrTransformerDecoder', + 'DeformableDetrTransformerEncoderLayer', + 'DeformableDetrTransformerDecoderLayer', 'coordinate_to_encoding', + 'ConditionalAttention', 'DABDetrTransformerDecoderLayer', + 'DABDetrTransformerDecoder', 'DABDetrTransformerEncoder', + 'DDQTransformerDecoder', 'ConditionalDetrTransformerDecoder', + 'ConditionalDetrTransformerDecoderLayer', 'DinoTransformerDecoder', + 'CdnQueryGenerator', 'Mask2FormerTransformerEncoder', + 'Mask2FormerTransformerDecoderLayer', 'Mask2FormerTransformerDecoder', + 'GroundingDinoTransformerDecoderLayer', 'GroundingDinoTransformerEncoder', + 'GroundingDinoTransformerDecoder' +] diff --git a/mmdetection/mmdet/models/layers/transformer/conditional_detr_layers.py b/mmdetection/mmdet/models/layers/transformer/conditional_detr_layers.py new file mode 100644 index 0000000..6db12a1 --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/conditional_detr_layers.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
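A minimal usage sketch (an editor's addition, not part of the patch) for the squeeze-and-excitation style layers defined in se_layer.py above. It only checks output shapes on a dummy tensor; the import path and the channel count of 64 are assumptions based on this tree's layout, and mmcv/mmengine are assumed to be installed.

import torch

from mmdet.models.layers.se_layer import ChannelAttention, DyReLU, SELayer

x = torch.rand(2, 64, 32, 32)  # (batch, channels, h, w)
assert SELayer(channels=64)(x).shape == x.shape           # channel re-weighting
assert DyReLU(channels=64)(x).shape == x.shape            # task-aware activation
assert ChannelAttention(channels=64)(x).shape == x.shape  # hard-sigmoid gating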
+import torch +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN +from torch import Tensor +from torch.nn import ModuleList + +from .detr_layers import DetrTransformerDecoder, DetrTransformerDecoderLayer +from .utils import MLP, ConditionalAttention, coordinate_to_encoding + + +class ConditionalDetrTransformerDecoder(DetrTransformerDecoder): + """Decoder of Conditional DETR.""" + + def _init_layers(self) -> None: + """Initialize decoder layers and other layers.""" + self.layers = ModuleList([ + ConditionalDetrTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + self.post_norm = build_norm_layer(self.post_norm_cfg, + self.embed_dims)[1] + # conditional detr affline + self.query_scale = MLP(self.embed_dims, self.embed_dims, + self.embed_dims, 2) + self.ref_point_head = MLP(self.embed_dims, self.embed_dims, 2, 2) + # we have substitute 'qpos_proj' with 'qpos_sine_proj' except for + # the first decoder layer), so 'qpos_proj' should be deleted + # in other layers. + for layer_id in range(self.num_layers - 1): + self.layers[layer_id + 1].cross_attn.qpos_proj = None + + def forward(self, + query: Tensor, + key: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + key_padding_mask: Tensor = None): + """Forward function of decoder. + + Args: + query (Tensor): The input query with shape + (bs, num_queries, dim). + key (Tensor): The input key with shape (bs, num_keys, dim) If + `None`, the `query` will be used. Defaults to `None`. + query_pos (Tensor): The positional encoding for `query`, with the + same shape as `query`. If not `None`, it will be added to + `query` before forward function. Defaults to `None`. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. If not `None`, it will be added to + `key` before forward function. If `None`, and `query_pos` + has the same shape as `key`, then `query_pos` will be used + as `key_pos`. Defaults to `None`. + key_padding_mask (Tensor): ByteTensor with shape (bs, num_keys). + Defaults to `None`. + Returns: + List[Tensor]: forwarded results with shape (num_decoder_layers, + bs, num_queries, dim) if `return_intermediate` is True, otherwise + with shape (1, bs, num_queries, dim). References with shape + (bs, num_queries, 2). 
+ """ + reference_unsigmoid = self.ref_point_head( + query_pos) # [bs, num_queries, 2] + reference = reference_unsigmoid.sigmoid() + reference_xy = reference[..., :2] + intermediate = [] + for layer_id, layer in enumerate(self.layers): + if layer_id == 0: + pos_transformation = 1 + else: + pos_transformation = self.query_scale(query) + # get sine embedding for the query reference + ref_sine_embed = coordinate_to_encoding(coord_tensor=reference_xy) + # apply transformation + ref_sine_embed = ref_sine_embed * pos_transformation + query = layer( + query, + key=key, + query_pos=query_pos, + key_pos=key_pos, + key_padding_mask=key_padding_mask, + ref_sine_embed=ref_sine_embed, + is_first=(layer_id == 0)) + if self.return_intermediate: + intermediate.append(self.post_norm(query)) + + if self.return_intermediate: + return torch.stack(intermediate), reference + + query = self.post_norm(query) + return query.unsqueeze(0), reference + + +class ConditionalDetrTransformerDecoderLayer(DetrTransformerDecoderLayer): + """Implements decoder layer in Conditional DETR transformer.""" + + def _init_layers(self): + """Initialize self-attention, cross-attention, FFN, and + normalization.""" + self.self_attn = ConditionalAttention(**self.self_attn_cfg) + self.cross_attn = ConditionalAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(3) + ] + self.norms = ModuleList(norms_list) + + def forward(self, + query: Tensor, + key: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + self_attn_masks: Tensor = None, + cross_attn_masks: Tensor = None, + key_padding_mask: Tensor = None, + ref_sine_embed: Tensor = None, + is_first: bool = False): + """ + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim) + key (Tensor, optional): The input key, has shape (bs, num_keys, + dim). If `None`, the `query` will be used. Defaults to `None`. + query_pos (Tensor, optional): The positional encoding for `query`, + has the same shape as `query`. If not `None`, it will be + added to `query` before forward function. Defaults to `None`. + ref_sine_embed (Tensor): The positional encoding for query in + cross attention, with the same shape as `x`. Defaults to None. + key_pos (Tensor, optional): The positional encoding for `key`, has + the same shape as `key`. If not None, it will be added to + `key` before forward function. If None, and `query_pos` has + the same shape as `key`, then `query_pos` will be used for + `key_pos`. Defaults to None. + self_attn_masks (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), Same in `nn.MultiheadAttention. + forward`. Defaults to None. + cross_attn_masks (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), Same in `nn.MultiheadAttention. + forward`. Defaults to None. + key_padding_mask (Tensor, optional): ByteTensor, has shape + (bs, num_keys). Defaults to None. + is_first (bool): A indicator to tell whether the current layer + is the first layer of the decoder. Defaults to False. + + Returns: + Tensor: Forwarded results, has shape (bs, num_queries, dim). 
+ """ + query = self.self_attn( + query=query, + key=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_masks) + query = self.norms[0](query) + query = self.cross_attn( + query=query, + key=key, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=cross_attn_masks, + key_padding_mask=key_padding_mask, + ref_sine_embed=ref_sine_embed, + is_first=is_first) + query = self.norms[1](query) + query = self.ffn(query) + query = self.norms[2](query) + + return query diff --git a/mmdetection/mmdet/models/layers/transformer/dab_detr_layers.py b/mmdetection/mmdet/models/layers/transformer/dab_detr_layers.py new file mode 100644 index 0000000..b8a6e77 --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/dab_detr_layers.py @@ -0,0 +1,298 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN +from mmengine.model import ModuleList +from torch import Tensor + +from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DetrTransformerEncoderLayer) +from .utils import (MLP, ConditionalAttention, coordinate_to_encoding, + inverse_sigmoid) + + +class DABDetrTransformerDecoderLayer(DetrTransformerDecoderLayer): + """Implements decoder layer in DAB-DETR transformer.""" + + def _init_layers(self): + """Initialize self-attention, cross-attention, FFN, normalization and + others.""" + self.self_attn = ConditionalAttention(**self.self_attn_cfg) + self.cross_attn = ConditionalAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(3) + ] + self.norms = ModuleList(norms_list) + self.keep_query_pos = self.cross_attn.keep_query_pos + + def forward(self, + query: Tensor, + key: Tensor, + query_pos: Tensor, + key_pos: Tensor, + ref_sine_embed: Tensor = None, + self_attn_masks: Tensor = None, + cross_attn_masks: Tensor = None, + key_padding_mask: Tensor = None, + is_first: bool = False, + **kwargs) -> Tensor: + """ + Args: + query (Tensor): The input query with shape [bs, num_queries, + dim]. + key (Tensor): The key tensor with shape [bs, num_keys, + dim]. + query_pos (Tensor): The positional encoding for query in self + attention, with the same shape as `x`. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. + ref_sine_embed (Tensor): The positional encoding for query in + cross attention, with the same shape as `x`. + Defaults to None. + self_attn_masks (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + cross_attn_masks (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + is_first (bool): A indicator to tell whether the current layer + is the first layer of the decoder. + Defaults to False. + + Returns: + Tensor: forwarded results with shape + [bs, num_queries, dim]. 
+ """ + + query = self.self_attn( + query=query, + key=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_masks, + **kwargs) + query = self.norms[0](query) + query = self.cross_attn( + query=query, + key=key, + query_pos=query_pos, + key_pos=key_pos, + ref_sine_embed=ref_sine_embed, + attn_mask=cross_attn_masks, + key_padding_mask=key_padding_mask, + is_first=is_first, + **kwargs) + query = self.norms[1](query) + query = self.ffn(query) + query = self.norms[2](query) + + return query + + +class DABDetrTransformerDecoder(DetrTransformerDecoder): + """Decoder of DAB-DETR. + + Args: + query_dim (int): The last dimension of query pos, + 4 for anchor format, 2 for point format. + Defaults to 4. + query_scale_type (str): Type of transformation applied + to content query. Defaults to `cond_elewise`. + with_modulated_hw_attn (bool): Whether to inject h&w info + during cross conditional attention. Defaults to True. + """ + + def __init__(self, + *args, + query_dim: int = 4, + query_scale_type: str = 'cond_elewise', + with_modulated_hw_attn: bool = True, + **kwargs): + + self.query_dim = query_dim + self.query_scale_type = query_scale_type + self.with_modulated_hw_attn = with_modulated_hw_attn + + super().__init__(*args, **kwargs) + + def _init_layers(self): + """Initialize decoder layers and other layers.""" + assert self.query_dim in [2, 4], \ + f'{"dab-detr only supports anchor prior or reference point prior"}' + assert self.query_scale_type in [ + 'cond_elewise', 'cond_scalar', 'fix_elewise' + ] + + self.layers = ModuleList([ + DABDetrTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + + embed_dims = self.layers[0].embed_dims + self.embed_dims = embed_dims + + self.post_norm = build_norm_layer(self.post_norm_cfg, embed_dims)[1] + if self.query_scale_type == 'cond_elewise': + self.query_scale = MLP(embed_dims, embed_dims, embed_dims, 2) + elif self.query_scale_type == 'cond_scalar': + self.query_scale = MLP(embed_dims, embed_dims, 1, 2) + elif self.query_scale_type == 'fix_elewise': + self.query_scale = nn.Embedding(self.num_layers, embed_dims) + else: + raise NotImplementedError('Unknown query_scale_type: {}'.format( + self.query_scale_type)) + + self.ref_point_head = MLP(self.query_dim // 2 * embed_dims, embed_dims, + embed_dims, 2) + + if self.with_modulated_hw_attn and self.query_dim == 4: + self.ref_anchor_head = MLP(embed_dims, embed_dims, 2, 2) + + self.keep_query_pos = self.layers[0].keep_query_pos + if not self.keep_query_pos: + for layer_id in range(self.num_layers - 1): + self.layers[layer_id + 1].cross_attn.qpos_proj = None + + def forward(self, + query: Tensor, + key: Tensor, + query_pos: Tensor, + key_pos: Tensor, + reg_branches: nn.Module, + key_padding_mask: Tensor = None, + **kwargs) -> List[Tensor]: + """Forward function of decoder. + + Args: + query (Tensor): The input query with shape (bs, num_queries, dim). + key (Tensor): The input key with shape (bs, num_keys, dim). + query_pos (Tensor): The positional encoding for `query`, with the + same shape as `query`. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. + reg_branches (nn.Module): The regression branch for dynamically + updating references in each layer. + key_padding_mask (Tensor): ByteTensor with shape (bs, num_keys). + Defaults to `None`. + + Returns: + List[Tensor]: forwarded results with shape (num_decoder_layers, + bs, num_queries, dim) if `return_intermediate` is True, otherwise + with shape (1, bs, num_queries, dim). 
references with shape + (num_decoder_layers, bs, num_queries, 2/4). + """ + output = query + unsigmoid_references = query_pos + + reference_points = unsigmoid_references.sigmoid() + intermediate_reference_points = [reference_points] + + intermediate = [] + for layer_id, layer in enumerate(self.layers): + obj_center = reference_points[..., :self.query_dim] + ref_sine_embed = coordinate_to_encoding( + coord_tensor=obj_center, num_feats=self.embed_dims // 2) + query_pos = self.ref_point_head( + ref_sine_embed) # [bs, nq, 2c] -> [bs, nq, c] + # For the first decoder layer, do not apply transformation + if self.query_scale_type != 'fix_elewise': + if layer_id == 0: + pos_transformation = 1 + else: + pos_transformation = self.query_scale(output) + else: + pos_transformation = self.query_scale.weight[layer_id] + # apply transformation + ref_sine_embed = ref_sine_embed[ + ..., :self.embed_dims] * pos_transformation + # modulated height and weight attention + if self.with_modulated_hw_attn: + assert obj_center.size(-1) == 4 + ref_hw = self.ref_anchor_head(output).sigmoid() + ref_sine_embed[..., self.embed_dims // 2:] *= \ + (ref_hw[..., 0] / obj_center[..., 2]).unsqueeze(-1) + ref_sine_embed[..., : self.embed_dims // 2] *= \ + (ref_hw[..., 1] / obj_center[..., 3]).unsqueeze(-1) + + output = layer( + output, + key, + query_pos=query_pos, + ref_sine_embed=ref_sine_embed, + key_pos=key_pos, + key_padding_mask=key_padding_mask, + is_first=(layer_id == 0), + **kwargs) + # iter update + tmp_reg_preds = reg_branches(output) + tmp_reg_preds[..., :self.query_dim] += inverse_sigmoid( + reference_points) + new_reference_points = tmp_reg_preds[ + ..., :self.query_dim].sigmoid() + if layer_id != self.num_layers - 1: + intermediate_reference_points.append(new_reference_points) + reference_points = new_reference_points.detach() + + if self.return_intermediate: + intermediate.append(self.post_norm(output)) + + output = self.post_norm(output) + + if self.return_intermediate: + return [ + torch.stack(intermediate), + torch.stack(intermediate_reference_points), + ] + else: + return [ + output.unsqueeze(0), + torch.stack(intermediate_reference_points) + ] + + +class DABDetrTransformerEncoder(DetrTransformerEncoder): + """Encoder of DAB-DETR.""" + + def _init_layers(self): + """Initialize encoder layers.""" + self.layers = ModuleList([ + DetrTransformerEncoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + embed_dims = self.layers[0].embed_dims + self.embed_dims = embed_dims + self.query_scale = MLP(embed_dims, embed_dims, embed_dims, 2) + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, **kwargs): + """Forward function of encoder. + + Args: + query (Tensor): Input queries of encoder, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional embeddings of the queries, has + shape (bs, num_feat_points, dim). + key_padding_mask (Tensor): ByteTensor, the key padding mask + of the queries, has shape (bs, num_feat_points). + + Returns: + Tensor: With shape (num_queries, bs, dim). 
+ """ + + for layer in self.layers: + pos_scales = self.query_scale(query) + query = layer( + query, + query_pos=query_pos * pos_scales, + key_padding_mask=key_padding_mask, + **kwargs) + + return query diff --git a/mmdetection/mmdet/models/layers/transformer/ddq_detr_layers.py b/mmdetection/mmdet/models/layers/transformer/ddq_detr_layers.py new file mode 100644 index 0000000..57664c7 --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/ddq_detr_layers.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +from mmcv.ops import batched_nms +from torch import Tensor, nn + +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from .deformable_detr_layers import DeformableDetrTransformerDecoder +from .utils import MLP, coordinate_to_encoding, inverse_sigmoid + + +class DDQTransformerDecoder(DeformableDetrTransformerDecoder): + """Transformer decoder of DDQ.""" + + def _init_layers(self) -> None: + """Initialize encoder layers.""" + super()._init_layers() + self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims, + self.embed_dims, 2) + self.norm = nn.LayerNorm(self.embed_dims) + + def select_distinct_queries(self, reference_points: Tensor, query: Tensor, + self_attn_mask: Tensor, layer_index): + """Get updated `self_attn_mask` for distinct queries selection, it is + used in self attention layers of decoder. + + Args: + reference_points (Tensor): The input reference of decoder, + has shape (bs, num_queries, 4) with the last dimension + arranged as (cx, cy, w, h). + query (Tensor): The input query of decoder, has shape + (bs, num_queries, dims). + self_attn_mask (Tensor): The input self attention mask of + last decoder layer, has shape (bs, num_queries_total, + num_queries_total). + layer_index (int): Last decoder layer index, used to get + classification score of last layer output, for + distinct queries selection. + + Returns: + Tensor: `self_attn_mask` used in self attention layers + of decoder, has shape (bs, num_queries_total, + num_queries_total). + """ + num_imgs = len(reference_points) + dis_start, num_dis = self.cache_dict['dis_query_info'] + # shape of self_attn_mask + # (batch⋅num_heads, num_queries, embed_dims) + dis_mask = self_attn_mask[:, dis_start:dis_start + num_dis, + dis_start:dis_start + num_dis] + # cls_branches from DDQDETRHead + scores = self.cache_dict['cls_branches'][layer_index]( + query[:, dis_start:dis_start + num_dis]).sigmoid().max(-1).values + proposals = reference_points[:, dis_start:dis_start + num_dis] + proposals = bbox_cxcywh_to_xyxy(proposals) + + attn_mask_list = [] + for img_id in range(num_imgs): + single_proposals = proposals[img_id] + single_scores = scores[img_id] + attn_mask = ~dis_mask[img_id * self.cache_dict['num_heads']][0] + # distinct query inds in this layer + ori_index = attn_mask.nonzero().view(-1) + _, keep_idxs = batched_nms(single_proposals[ori_index], + single_scores[ori_index], + torch.ones(len(ori_index)), + self.cache_dict['dqs_cfg']) + + real_keep_index = ori_index[keep_idxs] + + attn_mask = torch.ones_like(dis_mask[0]).bool() + # such a attn_mask give best result + # If it requires to keep index i, then all cells in row or column + # i should be kept in `attn_mask` . For example, if + # `real_keep_index` = [1, 4], and `attn_mask` size = [8, 8], + # then all cells at rows or columns [1, 4] should be kept, and + # all the other cells should be masked out. 
So the value of + # `attn_mask` should be: + # + # target\source 0 1 2 3 4 5 6 7 + # 0 [ 0 1 0 0 1 0 0 0 ] + # 1 [ 1 1 1 1 1 1 1 1 ] + # 2 [ 0 1 0 0 1 0 0 0 ] + # 3 [ 0 1 0 0 1 0 0 0 ] + # 4 [ 1 1 1 1 1 1 1 1 ] + # 5 [ 0 1 0 0 1 0 0 0 ] + # 6 [ 0 1 0 0 1 0 0 0 ] + # 7 [ 0 1 0 0 1 0 0 0 ] + attn_mask[real_keep_index] = False + attn_mask[:, real_keep_index] = False + + attn_mask = attn_mask[None].repeat(self.cache_dict['num_heads'], 1, + 1) + attn_mask_list.append(attn_mask) + attn_mask = torch.cat(attn_mask_list) + self_attn_mask = copy.deepcopy(self_attn_mask) + self_attn_mask[:, dis_start:dis_start + num_dis, + dis_start:dis_start + num_dis] = attn_mask + # will be used in loss and inference + self.cache_dict['distinct_query_mask'].append(~attn_mask) + return self_attn_mask + + def forward(self, query: Tensor, value: Tensor, key_padding_mask: Tensor, + self_attn_mask: Tensor, reference_points: Tensor, + spatial_shapes: Tensor, level_start_index: Tensor, + valid_ratios: Tensor, reg_branches: nn.ModuleList, + **kwargs) -> Tensor: + """Forward function of Transformer decoder. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, + dims). + value (Tensor): The input values, has shape (bs, num_value, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn` + input. ByteTensor, has shape (bs, num_value). + self_attn_mask (Tensor): The attention mask to prevent information + leakage from different denoising groups, distinct queries and + dense queries, has shape (num_queries_total, + num_queries_total). It will be updated for distinct queries + selection in this forward function. It is `None` when + `self.training` is `False`. + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + reg_branches: (obj:`nn.ModuleList`): Used for refining the + regression results. + + Returns: + tuple[Tensor]: Output queries and references of Transformer + decoder + + - query (Tensor): Output embeddings of the last decoder, has + shape (bs, num_queries, embed_dims) when `return_intermediate` + is `False`. Otherwise, Intermediate output embeddings of all + decoder layers, has shape (num_decoder_layers, bs, num_queries, + embed_dims). + - reference_points (Tensor): The reference of the last decoder + layer, has shape (bs, num_queries, 4) when `return_intermediate` + is `False`. Otherwise, Intermediate references of all decoder + layers, has shape (1 + num_decoder_layers, bs, num_queries, 4). + The coordinates are arranged as (cx, cy, w, h). 
+ """ + intermediate = [] + intermediate_reference_points = [reference_points] + self.cache_dict['distinct_query_mask'] = [] + if self_attn_mask is None: + self_attn_mask = torch.zeros((query.size(1), query.size(1)), + device=query.device).bool() + # shape is (batch*number_heads, num_queries, num_queries) + self_attn_mask = self_attn_mask[None].repeat( + len(query) * self.cache_dict['num_heads'], 1, 1) + for layer_index, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = \ + reference_points[:, :, None] * torch.cat( + [valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = \ + reference_points[:, :, None] * valid_ratios[:, None] + + query_sine_embed = coordinate_to_encoding( + reference_points_input[:, :, 0, :], + num_feats=self.embed_dims // 2) + query_pos = self.ref_point_head(query_sine_embed) + + query = layer( + query, + query_pos=query_pos, + value=value, + key_padding_mask=key_padding_mask, + self_attn_mask=self_attn_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points_input, + **kwargs) + + if not self.training: + tmp = reg_branches[layer_index](query) + assert reference_points.shape[-1] == 4 + new_reference_points = tmp + inverse_sigmoid( + reference_points, eps=1e-3) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + if layer_index < (len(self.layers) - 1): + self_attn_mask = self.select_distinct_queries( + reference_points, query, self_attn_mask, layer_index) + + else: + num_dense = self.cache_dict['num_dense_queries'] + tmp = reg_branches[layer_index](query[:, :-num_dense]) + tmp_dense = self.aux_reg_branches[layer_index]( + query[:, -num_dense:]) + + tmp = torch.cat([tmp, tmp_dense], dim=1) + assert reference_points.shape[-1] == 4 + new_reference_points = tmp + inverse_sigmoid( + reference_points, eps=1e-3) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + if layer_index < (len(self.layers) - 1): + self_attn_mask = self.select_distinct_queries( + reference_points, query, self_attn_mask, layer_index) + + if self.return_intermediate: + intermediate.append(self.norm(query)) + intermediate_reference_points.append(new_reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return query, reference_points diff --git a/mmdetection/mmdet/models/layers/transformer/deformable_detr_layers.py b/mmdetection/mmdet/models/layers/transformer/deformable_detr_layers.py new file mode 100644 index 0000000..da6325d --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/deformable_detr_layers.py @@ -0,0 +1,265 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Tuple, Union + +import torch +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmcv.ops import MultiScaleDeformableAttention +from mmengine.model import ModuleList +from torch import Tensor, nn + +from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DetrTransformerEncoderLayer) +from .utils import inverse_sigmoid + +try: + from fairscale.nn.checkpoint import checkpoint_wrapper +except Exception: + checkpoint_wrapper = None + + +class DeformableDetrTransformerEncoder(DetrTransformerEncoder): + """Transformer encoder of Deformable DETR.""" + + def _init_layers(self) -> None: + """Initialize encoder layers.""" + self.layers = ModuleList([ + DeformableDetrTransformerEncoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + + if self.num_cp > 0: + if checkpoint_wrapper is None: + raise NotImplementedError( + 'If you want to reduce GPU memory usage, \ + please install fairscale by executing the \ + following command: pip install fairscale.') + for i in range(self.num_cp): + self.layers[i] = checkpoint_wrapper(self.layers[i]) + + self.embed_dims = self.layers[0].embed_dims + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios: Tensor, + **kwargs) -> Tensor: + """Forward function of Transformer encoder. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + query_pos (Tensor): The positional encoding for query, has shape + (bs, num_queries, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (bs, num_queries). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + + Returns: + Tensor: Output queries of Transformer encoder, which is also + called 'encoder output embeddings' or 'memory', has shape + (bs, num_queries, dim) + """ + reference_points = self.get_encoder_reference_points( + spatial_shapes, valid_ratios, device=query.device) + for layer in self.layers: + query = layer( + query=query, + query_pos=query_pos, + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points, + **kwargs) + return query + + @staticmethod + def get_encoder_reference_points( + spatial_shapes: Tensor, valid_ratios: Tensor, + device: Union[torch.device, str]) -> Tensor: + """Get the reference points used in encoder. + + Args: + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + device (obj:`device` or str): The device acquired by the + `reference_points`. + + Returns: + Tensor: Reference points used in decoder, has shape (bs, length, + num_levels, 2). 
+ """ + + reference_points_list = [] + for lvl, (H, W) in enumerate(spatial_shapes): + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=torch.float32, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 1] * H) + ref_x = ref_x.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 0] * W) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + # [bs, sum(hw), num_level, 2] + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + +class DeformableDetrTransformerDecoder(DetrTransformerDecoder): + """Transformer Decoder of Deformable DETR.""" + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + self.layers = ModuleList([ + DeformableDetrTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + if self.post_norm_cfg is not None: + raise ValueError('There is not post_norm in ' + f'{self._get_name()}') + + def forward(self, + query: Tensor, + query_pos: Tensor, + value: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + reg_branches: Optional[nn.Module] = None, + **kwargs) -> Tuple[Tensor]: + """Forward function of Transformer decoder. + + Args: + query (Tensor): The input queries, has shape (bs, num_queries, + dim). + query_pos (Tensor): The input positional query, has shape + (bs, num_queries, dim). It will be added to `query` before + forward function. + value (Tensor): The input values, has shape (bs, num_value, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn` + input. ByteTensor, has shape (bs, num_value). + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h) when `as_two_stage` is `True`, otherwise has + shape (bs, num_queries, 2) with the last dimension arranged + as (cx, cy). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + reg_branches: (obj:`nn.ModuleList`, optional): Used for refining + the regression results. Only would be passed when + `with_box_refine` is `True`, otherwise would be `None`. + + Returns: + tuple[Tensor]: Outputs of Deformable Transformer Decoder. + + - output (Tensor): Output embeddings of the last decoder, has + shape (num_queries, bs, embed_dims) when `return_intermediate` + is `False`. Otherwise, Intermediate output embeddings of all + decoder layers, has shape (num_decoder_layers, num_queries, bs, + embed_dims). + - reference_points (Tensor): The reference of the last decoder + layer, has shape (bs, num_queries, 4) when `return_intermediate` + is `False`. Otherwise, Intermediate references of all decoder + layers, has shape (num_decoder_layers, bs, num_queries, 4). 
The + coordinates are arranged as (cx, cy, w, h) + """ + output = query + intermediate = [] + intermediate_reference_points = [] + for layer_id, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = \ + reference_points[:, :, None] * \ + torch.cat([valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = \ + reference_points[:, :, None] * \ + valid_ratios[:, None] + output = layer( + output, + query_pos=query_pos, + value=value, + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points_input, + **kwargs) + + if reg_branches is not None: + tmp_reg_preds = reg_branches[layer_id](output) + if reference_points.shape[-1] == 4: + new_reference_points = tmp_reg_preds + inverse_sigmoid( + reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + assert reference_points.shape[-1] == 2 + new_reference_points = tmp_reg_preds + new_reference_points[..., :2] = tmp_reg_preds[ + ..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +class DeformableDetrTransformerEncoderLayer(DetrTransformerEncoderLayer): + """Encoder layer of Deformable DETR.""" + + def _init_layers(self) -> None: + """Initialize self_attn, ffn, and norms.""" + self.self_attn = MultiScaleDeformableAttention(**self.self_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(2) + ] + self.norms = ModuleList(norms_list) + + +class DeformableDetrTransformerDecoderLayer(DetrTransformerDecoderLayer): + """Decoder layer of Deformable DETR.""" + + def _init_layers(self) -> None: + """Initialize self_attn, cross-attn, ffn, and norms.""" + self.self_attn = MultiheadAttention(**self.self_attn_cfg) + self.cross_attn = MultiScaleDeformableAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(3) + ] + self.norms = ModuleList(norms_list) diff --git a/mmdetection/mmdet/models/layers/transformer/detr_layers.py b/mmdetection/mmdet/models/layers/transformer/detr_layers.py new file mode 100644 index 0000000..6a83dd2 --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/detr_layers.py @@ -0,0 +1,374 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmengine import ConfigDict +from mmengine.model import BaseModule, ModuleList +from torch import Tensor + +from mmdet.utils import ConfigType, OptConfigType + +try: + from fairscale.nn.checkpoint import checkpoint_wrapper +except Exception: + checkpoint_wrapper = None + + +class DetrTransformerEncoder(BaseModule): + """Encoder of DETR. + + Args: + num_layers (int): Number of encoder layers. + layer_cfg (:obj:`ConfigDict` or dict): the config of each encoder + layer. 
All the layers will share the same config. + num_cp (int): Number of checkpointing blocks in encoder layer. + Default to -1. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + num_layers: int, + layer_cfg: ConfigType, + num_cp: int = -1, + init_cfg: OptConfigType = None) -> None: + + super().__init__(init_cfg=init_cfg) + self.num_layers = num_layers + self.layer_cfg = layer_cfg + self.num_cp = num_cp + assert self.num_cp <= self.num_layers + self._init_layers() + + def _init_layers(self) -> None: + """Initialize encoder layers.""" + self.layers = ModuleList([ + DetrTransformerEncoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + + if self.num_cp > 0: + if checkpoint_wrapper is None: + raise NotImplementedError( + 'If you want to reduce GPU memory usage, \ + please install fairscale by executing the \ + following command: pip install fairscale.') + for i in range(self.num_cp): + self.layers[i] = checkpoint_wrapper(self.layers[i]) + + self.embed_dims = self.layers[0].embed_dims + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, **kwargs) -> Tensor: + """Forward function of encoder. + + Args: + query (Tensor): Input queries of encoder, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional embeddings of the queries, has + shape (bs, num_queries, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (bs, num_queries). + + Returns: + Tensor: Has shape (bs, num_queries, dim) if `batch_first` is + `True`, otherwise (num_queries, bs, dim). + """ + for layer in self.layers: + query = layer(query, query_pos, key_padding_mask, **kwargs) + return query + + +class DetrTransformerDecoder(BaseModule): + """Decoder of DETR. + + Args: + num_layers (int): Number of decoder layers. + layer_cfg (:obj:`ConfigDict` or dict): the config of each encoder + layer. All the layers will share the same config. + post_norm_cfg (:obj:`ConfigDict` or dict, optional): Config of the + post normalization layer. Defaults to `LN`. + return_intermediate (bool, optional): Whether to return outputs of + intermediate layers. Defaults to `True`, + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + num_layers: int, + layer_cfg: ConfigType, + post_norm_cfg: OptConfigType = dict(type='LN'), + return_intermediate: bool = True, + init_cfg: Union[dict, ConfigDict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.layer_cfg = layer_cfg + self.num_layers = num_layers + self.post_norm_cfg = post_norm_cfg + self.return_intermediate = return_intermediate + self._init_layers() + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + self.layers = ModuleList([ + DetrTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + self.post_norm = build_norm_layer(self.post_norm_cfg, + self.embed_dims)[1] + + def forward(self, query: Tensor, key: Tensor, value: Tensor, + query_pos: Tensor, key_pos: Tensor, key_padding_mask: Tensor, + **kwargs) -> Tensor: + """Forward function of decoder + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + key (Tensor): The input key, has shape (bs, num_keys, dim). + value (Tensor): The input value with the same shape as `key`. 
+ query_pos (Tensor): The positional encoding for `query`, with the + same shape as `query`. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. + key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn` + input. ByteTensor, has shape (bs, num_value). + + Returns: + Tensor: The forwarded results will have shape + (num_decoder_layers, bs, num_queries, dim) if + `return_intermediate` is `True` else (1, bs, num_queries, dim). + """ + intermediate = [] + for layer in self.layers: + query = layer( + query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + key_padding_mask=key_padding_mask, + **kwargs) + if self.return_intermediate: + intermediate.append(self.post_norm(query)) + query = self.post_norm(query) + + if self.return_intermediate: + return torch.stack(intermediate) + + return query.unsqueeze(0) + + +class DetrTransformerEncoderLayer(BaseModule): + """Implements encoder layer in DETR transformer. + + Args: + self_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for self + attention. + ffn_cfg (:obj:`ConfigDict` or dict, optional): Config for FFN. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config for + normalization layers. All the layers will share the same + config. Defaults to `LN`. + init_cfg (:obj:`ConfigDict` or dict, optional): Config to control + the initialization. Defaults to None. + """ + + def __init__(self, + self_attn_cfg: OptConfigType = dict( + embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg: OptConfigType = dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True)), + norm_cfg: OptConfigType = dict(type='LN'), + init_cfg: OptConfigType = None) -> None: + + super().__init__(init_cfg=init_cfg) + + self.self_attn_cfg = self_attn_cfg + if 'batch_first' not in self.self_attn_cfg: + self.self_attn_cfg['batch_first'] = True + else: + assert self.self_attn_cfg['batch_first'] is True, 'First \ + dimension of all DETRs in mmdet is `batch`, \ + please set `batch_first` flag.' + + self.ffn_cfg = ffn_cfg + self.norm_cfg = norm_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize self-attention, FFN, and normalization.""" + self.self_attn = MultiheadAttention(**self.self_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(2) + ] + self.norms = ModuleList(norms_list) + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, **kwargs) -> Tensor: + """Forward function of an encoder layer. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + query_pos (Tensor): The positional encoding for query, with + the same shape as `query`. + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor. has shape (bs, num_queries). + Returns: + Tensor: forwarded results, has shape (bs, num_queries, dim). + """ + query = self.self_attn( + query=query, + key=query, + value=query, + query_pos=query_pos, + key_pos=query_pos, + key_padding_mask=key_padding_mask, + **kwargs) + query = self.norms[0](query) + query = self.ffn(query) + query = self.norms[1](query) + + return query + + +class DetrTransformerDecoderLayer(BaseModule): + """Implements decoder layer in DETR transformer. + + Args: + self_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for self + attention. 
+ cross_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for cross + attention. + ffn_cfg (:obj:`ConfigDict` or dict, optional): Config for FFN. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config for + normalization layers. All the layers will share the same + config. Defaults to `LN`. + init_cfg (:obj:`ConfigDict` or dict, optional): Config to control + the initialization. Defaults to None. + """ + + def __init__(self, + self_attn_cfg: OptConfigType = dict( + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + cross_attn_cfg: OptConfigType = dict( + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + ffn_cfg: OptConfigType = dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + norm_cfg: OptConfigType = dict(type='LN'), + init_cfg: OptConfigType = None) -> None: + + super().__init__(init_cfg=init_cfg) + + self.self_attn_cfg = self_attn_cfg + self.cross_attn_cfg = cross_attn_cfg + if 'batch_first' not in self.self_attn_cfg: + self.self_attn_cfg['batch_first'] = True + else: + assert self.self_attn_cfg['batch_first'] is True, 'First \ + dimension of all DETRs in mmdet is `batch`, \ + please set `batch_first` flag.' + + if 'batch_first' not in self.cross_attn_cfg: + self.cross_attn_cfg['batch_first'] = True + else: + assert self.cross_attn_cfg['batch_first'] is True, 'First \ + dimension of all DETRs in mmdet is `batch`, \ + please set `batch_first` flag.' + + self.ffn_cfg = ffn_cfg + self.norm_cfg = norm_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize self-attention, FFN, and normalization.""" + self.self_attn = MultiheadAttention(**self.self_attn_cfg) + self.cross_attn = MultiheadAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(3) + ] + self.norms = ModuleList(norms_list) + + def forward(self, + query: Tensor, + key: Tensor = None, + value: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + self_attn_mask: Tensor = None, + cross_attn_mask: Tensor = None, + key_padding_mask: Tensor = None, + **kwargs) -> Tensor: + """ + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + key (Tensor, optional): The input key, has shape (bs, num_keys, + dim). If `None`, the `query` will be used. Defaults to `None`. + value (Tensor, optional): The input value, has the same shape as + `key`, as in `nn.MultiheadAttention.forward`. If `None`, the + `key` will be used. Defaults to `None`. + query_pos (Tensor, optional): The positional encoding for `query`, + has the same shape as `query`. If not `None`, it will be added + to `query` before forward function. Defaults to `None`. + key_pos (Tensor, optional): The positional encoding for `key`, has + the same shape as `key`. If not `None`, it will be added to + `key` before forward function. If None, and `query_pos` has the + same shape as `key`, then `query_pos` will be used for + `key_pos`. Defaults to None. + self_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + cross_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor, optional): The `key_padding_mask` of + `self_attn` input. 
ByteTensor, has shape (bs, num_value). + Defaults to None. + + Returns: + Tensor: forwarded results, has shape (bs, num_queries, dim). + """ + + query = self.self_attn( + query=query, + key=query, + value=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_mask, + **kwargs) + query = self.norms[0](query) + query = self.cross_attn( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=cross_attn_mask, + key_padding_mask=key_padding_mask, + **kwargs) + query = self.norms[1](query) + query = self.ffn(query) + query = self.norms[2](query) + + return query diff --git a/mmdetection/mmdet/models/layers/transformer/dino_layers.py b/mmdetection/mmdet/models/layers/transformer/dino_layers.py new file mode 100644 index 0000000..64610d0 --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/dino_layers.py @@ -0,0 +1,562 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Tuple, Union + +import torch +from mmengine.model import BaseModule +from torch import Tensor, nn + +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import OptConfigType +from .deformable_detr_layers import DeformableDetrTransformerDecoder +from .utils import MLP, coordinate_to_encoding, inverse_sigmoid + + +class DinoTransformerDecoder(DeformableDetrTransformerDecoder): + """Transformer decoder of DINO.""" + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + super()._init_layers() + self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims, + self.embed_dims, 2) + self.norm = nn.LayerNorm(self.embed_dims) + + def forward(self, query: Tensor, value: Tensor, key_padding_mask: Tensor, + self_attn_mask: Tensor, reference_points: Tensor, + spatial_shapes: Tensor, level_start_index: Tensor, + valid_ratios: Tensor, reg_branches: nn.ModuleList, + **kwargs) -> Tuple[Tensor]: + """Forward function of Transformer decoder. + + Args: + query (Tensor): The input query, has shape (num_queries, bs, dim). + value (Tensor): The input values, has shape (num_value, bs, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (num_queries, bs). + self_attn_mask (Tensor): The attention mask to prevent information + leakage from different denoising groups and matching parts, has + shape (num_queries_total, num_queries_total). It is `None` when + `self.training` is `False`. + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + reg_branches: (obj:`nn.ModuleList`): Used for refining the + regression results. + + Returns: + tuple[Tensor]: Output queries and references of Transformer + decoder + + - query (Tensor): Output embeddings of the last decoder, has + shape (num_queries, bs, embed_dims) when `return_intermediate` + is `False`. Otherwise, Intermediate output embeddings of all + decoder layers, has shape (num_decoder_layers, num_queries, bs, + embed_dims). 
+ - reference_points (Tensor): The reference of the last decoder + layer, has shape (bs, num_queries, 4) when `return_intermediate` + is `False`. Otherwise, Intermediate references of all decoder + layers, has shape (num_decoder_layers, bs, num_queries, 4). The + coordinates are arranged as (cx, cy, w, h) + """ + intermediate = [] + intermediate_reference_points = [reference_points] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = \ + reference_points[:, :, None] * torch.cat( + [valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = \ + reference_points[:, :, None] * valid_ratios[:, None] + + query_sine_embed = coordinate_to_encoding( + reference_points_input[:, :, 0, :]) + query_pos = self.ref_point_head(query_sine_embed) + + query = layer( + query, + query_pos=query_pos, + value=value, + key_padding_mask=key_padding_mask, + self_attn_mask=self_attn_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points_input, + **kwargs) + + if reg_branches is not None: + tmp = reg_branches[lid](query) + assert reference_points.shape[-1] == 4 + new_reference_points = tmp + inverse_sigmoid( + reference_points, eps=1e-3) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + if self.return_intermediate: + intermediate.append(self.norm(query)) + intermediate_reference_points.append(new_reference_points) + # NOTE this is for the "Look Forward Twice" module, + # in the DeformDETR, reference_points was appended. + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return query, reference_points + + +class CdnQueryGenerator(BaseModule): + """Implement query generator of the Contrastive denoising (CDN) proposed in + `DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object + Detection `_ + + Code is modified from the `official github repo + `_. + + Args: + num_classes (int): Number of object classes. + embed_dims (int): The embedding dimensions of the generated queries. + num_matching_queries (int): The queries number of the matching part. + Used for generating dn_mask. + label_noise_scale (float): The scale of label noise, defaults to 0.5. + box_noise_scale (float): The scale of box noise, defaults to 1.0. + group_cfg (:obj:`ConfigDict` or dict, optional): The config of the + denoising queries grouping, includes `dynamic`, `num_dn_queries`, + and `num_groups`. Two grouping strategies, 'static dn groups' and + 'dynamic dn groups', are supported. When `dynamic` is `False`, + the `num_groups` should be set, and the number of denoising query + groups will always be `num_groups`. When `dynamic` is `True`, the + `num_dn_queries` should be set, and the group number will be + dynamic to ensure that the denoising queries number will not exceed + `num_dn_queries` to prevent large fluctuations of memory. Defaults + to `None`. 
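+
+    Example:
+        A minimal sketch of the two grouping strategies; the values below
+        are illustrative, not defaults.
+
+        >>> # dynamic grouping: the group number is derived per batch
+        >>> dn_generator = CdnQueryGenerator(
+        ...     num_classes=80,
+        ...     embed_dims=256,
+        ...     num_matching_queries=900,
+        ...     group_cfg=dict(dynamic=True, num_dn_queries=100))
+        >>> # static grouping: always use the given number of groups
+        >>> static_group_cfg = dict(dynamic=False, num_groups=5)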
+ """ + + def __init__(self, + num_classes: int, + embed_dims: int, + num_matching_queries: int, + label_noise_scale: float = 0.5, + box_noise_scale: float = 1.0, + group_cfg: OptConfigType = None) -> None: + super().__init__() + self.num_classes = num_classes + self.embed_dims = embed_dims + self.num_matching_queries = num_matching_queries + self.label_noise_scale = label_noise_scale + self.box_noise_scale = box_noise_scale + + # prepare grouping strategy + group_cfg = {} if group_cfg is None else group_cfg + self.dynamic_dn_groups = group_cfg.get('dynamic', True) + if self.dynamic_dn_groups: + if 'num_dn_queries' not in group_cfg: + warnings.warn("'num_dn_queries' should be set when using " + 'dynamic dn groups, use 100 as default.') + self.num_dn_queries = group_cfg.get('num_dn_queries', 100) + assert isinstance(self.num_dn_queries, int), \ + f'Expected the num_dn_queries to have type int, but got ' \ + f'{self.num_dn_queries}({type(self.num_dn_queries)}). ' + else: + assert 'num_groups' in group_cfg, \ + 'num_groups should be set when using static dn groups' + self.num_groups = group_cfg['num_groups'] + assert isinstance(self.num_groups, int), \ + f'Expected the num_groups to have type int, but got ' \ + f'{self.num_groups}({type(self.num_groups)}). ' + + # NOTE The original repo of DINO set the num_embeddings 92 for coco, + # 91 (0~90) of which represents target classes and the 92 (91) + # indicates `Unknown` class. However, the embedding of `unknown` class + # is not used in the original DINO. + # TODO: num_classes + 1 or num_classes ? + self.label_embedding = nn.Embedding(self.num_classes, self.embed_dims) + + def __call__(self, batch_data_samples: SampleList) -> tuple: + """Generate contrastive denoising (cdn) queries with ground truth. + + Descriptions of the Number Values in code and comments: + - num_target_total: the total target number of the input batch + samples. + - max_num_target: the max target number of the input batch samples. + - num_noisy_targets: the total targets number after adding noise, + i.e., num_target_total * num_groups * 2. + - num_denoising_queries: the length of the output batched queries, + i.e., max_num_target * num_groups * 2. + + NOTE The format of input bboxes in batch_data_samples is unnormalized + (x, y, x, y), and the output bbox queries are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid. + + Args: + batch_data_samples (list[:obj:`DetDataSample`]): List of the batch + data samples, each includes `gt_instance` which has attributes + `bboxes` and `labels`. The `bboxes` has unnormalized coordinate + format (x, y, x, y). + + Returns: + tuple: The outputs of the dn query generator. + + - dn_label_query (Tensor): The output content queries for denoising + part, has shape (bs, num_denoising_queries, dim), where + `num_denoising_queries = max_num_target * num_groups * 2`. + - dn_bbox_query (Tensor): The output reference bboxes as positions + of queries for denoising part, which are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid, has + shape (bs, num_denoising_queries, 4) with the last dimension + arranged as (cx, cy, w, h). + - attn_mask (Tensor): The attention mask to prevent information + leakage from different denoising groups and matching parts, + will be used as `self_attn_mask` of the `decoder`, has shape + (num_queries_total, num_queries_total), where `num_queries_total` + is the sum of `num_denoising_queries` and `num_matching_queries`. 
+ - dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + """ + # normalize bbox and collate ground truth (gt) + gt_labels_list = [] + gt_bboxes_list = [] + for sample in batch_data_samples: + img_h, img_w = sample.img_shape + bboxes = sample.gt_instances.bboxes + factor = bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + bboxes_normalized = bboxes / factor + gt_bboxes_list.append(bboxes_normalized) + gt_labels_list.append(sample.gt_instances.labels) + gt_labels = torch.cat(gt_labels_list) # (num_target_total, 4) + gt_bboxes = torch.cat(gt_bboxes_list) + + num_target_list = [len(bboxes) for bboxes in gt_bboxes_list] + max_num_target = max(num_target_list) + num_groups = self.get_num_groups(max_num_target) + + dn_label_query = self.generate_dn_label_query(gt_labels, num_groups) + dn_bbox_query = self.generate_dn_bbox_query(gt_bboxes, num_groups) + + # The `batch_idx` saves the batch index of the corresponding sample + # for each target, has shape (num_target_total). + batch_idx = torch.cat([ + torch.full_like(t.long(), i) for i, t in enumerate(gt_labels_list) + ]) + dn_label_query, dn_bbox_query = self.collate_dn_queries( + dn_label_query, dn_bbox_query, batch_idx, len(batch_data_samples), + num_groups) + + attn_mask = self.generate_dn_mask( + max_num_target, num_groups, device=dn_label_query.device) + + dn_meta = dict( + num_denoising_queries=int(max_num_target * 2 * num_groups), + num_denoising_groups=num_groups) + + return dn_label_query, dn_bbox_query, attn_mask, dn_meta + + def get_num_groups(self, max_num_target: int = None) -> int: + """Calculate denoising query groups number. + + Two grouping strategies, 'static dn groups' and 'dynamic dn groups', + are supported. When `self.dynamic_dn_groups` is `False`, the number + of denoising query groups will always be `self.num_groups`. When + `self.dynamic_dn_groups` is `True`, the group number will be dynamic, + ensuring the denoising queries number will not exceed + `self.num_dn_queries` to prevent large fluctuations of memory. + + NOTE The `num_group` is shared for different samples in a batch. When + the target numbers in the samples varies, the denoising queries of the + samples containing fewer targets are padded to the max length. + + Args: + max_num_target (int, optional): The max target number of the batch + samples. It will only be used when `self.dynamic_dn_groups` is + `True`. Defaults to `None`. + + Returns: + int: The denoising group number of the current batch. + """ + if self.dynamic_dn_groups: + assert max_num_target is not None, \ + 'group_queries should be provided when using ' \ + 'dynamic dn groups' + if max_num_target == 0: + num_groups = 1 + else: + num_groups = self.num_dn_queries // max_num_target + else: + num_groups = self.num_groups + if num_groups < 1: + num_groups = 1 + return int(num_groups) + + def generate_dn_label_query(self, gt_labels: Tensor, + num_groups: int) -> Tensor: + """Generate noisy labels and their query embeddings. + + The strategy for generating noisy labels is: Randomly choose labels of + `self.label_noise_scale * 0.5` proportion and override each of them + with a random object category label. + + NOTE Not add noise to all labels. 
Besides, the `self.label_noise_scale + * 0.5` arg is the ratio of the chosen positions, which is higher than + the actual proportion of noisy labels, because the labels to override + may be correct. And the gap becomes larger as the number of target + categories decreases. The users should notice this and modify the scale + arg or the corresponding logic according to specific dataset. + + Args: + gt_labels (Tensor): The concatenated gt labels of all samples + in the batch, has shape (num_target_total, ) where + `num_target_total = sum(num_target_list)`. + num_groups (int): The number of denoising query groups. + + Returns: + Tensor: The query embeddings of noisy labels, has shape + (num_noisy_targets, embed_dims), where `num_noisy_targets = + num_target_total * num_groups * 2`. + """ + assert self.label_noise_scale > 0 + gt_labels_expand = gt_labels.repeat(2 * num_groups, + 1).view(-1) # Note `* 2` # noqa + p = torch.rand_like(gt_labels_expand.float()) + chosen_indice = torch.nonzero(p < (self.label_noise_scale * 0.5)).view( + -1) # Note `* 0.5` + new_labels = torch.randint_like(chosen_indice, 0, self.num_classes) + noisy_labels_expand = gt_labels_expand.scatter(0, chosen_indice, + new_labels) + dn_label_query = self.label_embedding(noisy_labels_expand) + return dn_label_query + + def generate_dn_bbox_query(self, gt_bboxes: Tensor, + num_groups: int) -> Tensor: + """Generate noisy bboxes and their query embeddings. + + The strategy for generating noisy bboxes is as follow: + + .. code:: text + + +--------------------+ + | negative | + | +----------+ | + | | positive | | + | | +-----|----+------------+ + | | | | | | + | +----+-----+ | | + | | | | + +---------+----------+ | + | | + | gt bbox | + | | + | +---------+----------+ + | | | | + | | +----+-----+ | + | | | | | | + +-------------|--- +----+ | | + | | positive | | + | +----------+ | + | negative | + +--------------------+ + + The random noise is added to the top-left and down-right point + positions, hence, normalized (x, y, x, y) format of bboxes are + required. The noisy bboxes of positive queries have the points + both within the inner square, while those of negative queries + have the points both between the inner and outer squares. + + Besides, the length of outer square is twice as long as that of + the inner square, i.e., self.box_noise_scale * w_or_h / 2. + NOTE The noise is added to all the bboxes. Moreover, there is still + unconsidered case when one point is within the positive square and + the others is between the inner and outer squares. + + Args: + gt_bboxes (Tensor): The concatenated gt bboxes of all samples + in the batch, has shape (num_target_total, 4) with the last + dimension arranged as (cx, cy, w, h) where + `num_target_total = sum(num_target_list)`. + num_groups (int): The number of denoising query groups. + + Returns: + Tensor: The output noisy bboxes, which are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid, has + shape (num_noisy_targets, 4) with the last dimension arranged as + (cx, cy, w, h), where + `num_noisy_targets = num_target_total * num_groups * 2`. 
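+
+        Example:
+            A shape-only sketch with illustrative values; the added noise
+            itself is random.
+
+            >>> generator = CdnQueryGenerator(
+            ...     num_classes=80, embed_dims=256, num_matching_queries=900,
+            ...     group_cfg=dict(dynamic=True, num_dn_queries=100))
+            >>> gt_bboxes = torch.tensor([[0.1, 0.2, 0.5, 0.6]])  # one box
+            >>> dn_bbox_query = generator.generate_dn_bbox_query(
+            ...     gt_bboxes, num_groups=2)
+            >>> # 1 target * 2 groups * 2 (positive and negative) = 4 queries
+            >>> assert dn_bbox_query.shape == (4, 4)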
+ """ + assert self.box_noise_scale > 0 + device = gt_bboxes.device + + # expand gt_bboxes as groups + gt_bboxes_expand = gt_bboxes.repeat(2 * num_groups, 1) # xyxy + + # obtain index of negative queries in gt_bboxes_expand + positive_idx = torch.arange( + len(gt_bboxes), dtype=torch.long, device=device) + positive_idx = positive_idx.unsqueeze(0).repeat(num_groups, 1) + positive_idx += 2 * len(gt_bboxes) * torch.arange( + num_groups, dtype=torch.long, device=device)[:, None] + positive_idx = positive_idx.flatten() + negative_idx = positive_idx + len(gt_bboxes) + + # determine the sign of each element in the random part of the added + # noise to be positive or negative randomly. + rand_sign = torch.randint_like( + gt_bboxes_expand, low=0, high=2, + dtype=torch.float32) * 2.0 - 1.0 # [low, high), 1 or -1, randomly + + # calculate the random part of the added noise + rand_part = torch.rand_like(gt_bboxes_expand) # [0, 1) + rand_part[negative_idx] += 1.0 # pos: [0, 1); neg: [1, 2) + rand_part *= rand_sign # pos: (-1, 1); neg: (-2, -1] U [1, 2) + + # add noise to the bboxes + bboxes_whwh = bbox_xyxy_to_cxcywh(gt_bboxes_expand)[:, 2:].repeat(1, 2) + noisy_bboxes_expand = gt_bboxes_expand + torch.mul( + rand_part, bboxes_whwh) * self.box_noise_scale / 2 # xyxy + noisy_bboxes_expand = noisy_bboxes_expand.clamp(min=0.0, max=1.0) + noisy_bboxes_expand = bbox_xyxy_to_cxcywh(noisy_bboxes_expand) + + dn_bbox_query = inverse_sigmoid(noisy_bboxes_expand, eps=1e-3) + return dn_bbox_query + + def collate_dn_queries(self, input_label_query: Tensor, + input_bbox_query: Tensor, batch_idx: Tensor, + batch_size: int, num_groups: int) -> Tuple[Tensor]: + """Collate generated queries to obtain batched dn queries. + + The strategy for query collation is as follow: + + .. code:: text + + input_queries (num_target_total, query_dim) + P_A1 P_B1 P_B2 N_A1 N_B1 N_B2 P'A1 P'B1 P'B2 N'A1 N'B1 N'B2 + |________ group1 ________| |________ group2 ________| + | + V + P_A1 Pad0 N_A1 Pad0 P'A1 Pad0 N'A1 Pad0 + P_B1 P_B2 N_B1 N_B2 P'B1 P'B2 N'B1 N'B2 + |____ group1 ____| |____ group2 ____| + batched_queries (batch_size, max_num_target, query_dim) + + where query_dim is 4 for bbox and self.embed_dims for label. + Notation: _-group 1; '-group 2; + A-Sample1(has 1 target); B-sample2(has 2 targets) + + Args: + input_label_query (Tensor): The generated label queries of all + targets, has shape (num_target_total, embed_dims) where + `num_target_total = sum(num_target_list)`. + input_bbox_query (Tensor): The generated bbox queries of all + targets, has shape (num_target_total, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_idx (Tensor): The batch index of the corresponding sample + for each target, has shape (num_target_total). + batch_size (int): The size of the input batch. + num_groups (int): The number of denoising query groups. + + Returns: + tuple[Tensor]: Output batched label and bbox queries. + - batched_label_query (Tensor): The output batched label queries, + has shape (batch_size, max_num_target, embed_dims). + - batched_bbox_query (Tensor): The output batched bbox queries, + has shape (batch_size, max_num_target, 4) with the last dimension + arranged as (cx, cy, w, h). 
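+
+        Example:
+            A shape-only sketch matching the diagram above (sample A has one
+            target, sample B has two, and there are two groups); the values
+            are illustrative.
+
+            >>> generator = CdnQueryGenerator(
+            ...     num_classes=80, embed_dims=256, num_matching_queries=900,
+            ...     group_cfg=dict(dynamic=True, num_dn_queries=100))
+            >>> label_query = torch.rand(12, 256)  # 3 targets * 2 groups * 2
+            >>> bbox_query = torch.rand(12, 4)
+            >>> batch_idx = torch.tensor([0, 1, 1])
+            >>> batched_label, batched_bbox = generator.collate_dn_queries(
+            ...     label_query, bbox_query, batch_idx, batch_size=2,
+            ...     num_groups=2)
+            >>> # max_num_target = 2, so num_denoising_queries = 2 * 2 * 2 = 8
+            >>> assert batched_label.shape == (2, 8, 256)
+            >>> assert batched_bbox.shape == (2, 8, 4)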
+ """ + device = input_label_query.device + num_target_list = [ + torch.sum(batch_idx == idx) for idx in range(batch_size) + ] + max_num_target = max(num_target_list) + num_denoising_queries = int(max_num_target * 2 * num_groups) + + map_query_index = torch.cat([ + torch.arange(num_target, device=device) + for num_target in num_target_list + ]) + map_query_index = torch.cat([ + map_query_index + max_num_target * i for i in range(2 * num_groups) + ]).long() + batch_idx_expand = batch_idx.repeat(2 * num_groups, 1).view(-1) + mapper = (batch_idx_expand, map_query_index) + + batched_label_query = torch.zeros( + batch_size, num_denoising_queries, self.embed_dims, device=device) + batched_bbox_query = torch.zeros( + batch_size, num_denoising_queries, 4, device=device) + + batched_label_query[mapper] = input_label_query + batched_bbox_query[mapper] = input_bbox_query + return batched_label_query, batched_bbox_query + + def generate_dn_mask(self, max_num_target: int, num_groups: int, + device: Union[torch.device, str]) -> Tensor: + """Generate attention mask to prevent information leakage from + different denoising groups and matching parts. + + .. code:: text + + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + max_num_target |_| |_________| num_matching_queries + |_____________| num_denoising_queries + + 1 -> True (Masked), means 'can not see'. + 0 -> False (UnMasked), means 'can see'. + + Args: + max_num_target (int): The max target number of the input batch + samples. + num_groups (int): The number of denoising query groups. + device (obj:`device` or str): The device of generated mask. + + Returns: + Tensor: The attention mask to prevent information leakage from + different denoising groups and matching parts, will be used as + `self_attn_mask` of the `decoder`, has shape (num_queries_total, + num_queries_total), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries`. + """ + num_denoising_queries = int(max_num_target * 2 * num_groups) + num_queries_total = num_denoising_queries + self.num_matching_queries + attn_mask = torch.zeros( + num_queries_total, + num_queries_total, + device=device, + dtype=torch.bool) + # Make the matching part cannot see the denoising groups + attn_mask[num_denoising_queries:, :num_denoising_queries] = True + # Make the denoising groups cannot see each other + for i in range(num_groups): + # Mask rows of one group per step. + row_scope = slice(max_num_target * 2 * i, + max_num_target * 2 * (i + 1)) + left_scope = slice(max_num_target * 2 * i) + right_scope = slice(max_num_target * 2 * (i + 1), + num_denoising_queries) + attn_mask[row_scope, right_scope] = True + attn_mask[row_scope, left_scope] = True + return attn_mask diff --git a/mmdetection/mmdet/models/layers/transformer/grounding_dino_layers.py b/mmdetection/mmdet/models/layers/transformer/grounding_dino_layers.py new file mode 100644 index 0000000..3c28576 --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/grounding_dino_layers.py @@ -0,0 +1,270 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
+from mmcv.ops import MultiScaleDeformableAttention
+from mmengine.model import ModuleList
+from torch import Tensor
+
+from mmdet.models.utils.vlfuse_helper import SingleScaleBiAttentionBlock
+from mmdet.utils import ConfigType, OptConfigType
+from .deformable_detr_layers import (DeformableDetrTransformerDecoderLayer,
+                                     DeformableDetrTransformerEncoder,
+                                     DeformableDetrTransformerEncoderLayer)
+from .detr_layers import DetrTransformerEncoderLayer
+from .dino_layers import DinoTransformerDecoder
+from .utils import MLP, get_text_sine_pos_embed
+
+try:
+    from fairscale.nn.checkpoint import checkpoint_wrapper
+except Exception:
+    checkpoint_wrapper = None
+
+
+class GroundingDinoTransformerDecoderLayer(
+        DeformableDetrTransformerDecoderLayer):
+
+    def __init__(self,
+                 cross_attn_text_cfg: OptConfigType = dict(
+                     embed_dims=256,
+                     num_heads=8,
+                     dropout=0.0,
+                     batch_first=True),
+                 **kwargs) -> None:
+        """Decoder layer of Grounding DINO."""
+        self.cross_attn_text_cfg = cross_attn_text_cfg
+        if 'batch_first' not in self.cross_attn_text_cfg:
+            self.cross_attn_text_cfg['batch_first'] = True
+        super().__init__(**kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize self_attn, cross-attn, ffn, and norms."""
+        self.self_attn = MultiheadAttention(**self.self_attn_cfg)
+        self.cross_attn_text = MultiheadAttention(**self.cross_attn_text_cfg)
+        self.cross_attn = MultiScaleDeformableAttention(**self.cross_attn_cfg)
+        self.embed_dims = self.self_attn.embed_dims
+        self.ffn = FFN(**self.ffn_cfg)
+        norms_list = [
+            build_norm_layer(self.norm_cfg, self.embed_dims)[1]
+            for _ in range(4)
+        ]
+        self.norms = ModuleList(norms_list)
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor = None,
+                value: Tensor = None,
+                query_pos: Tensor = None,
+                key_pos: Tensor = None,
+                self_attn_mask: Tensor = None,
+                cross_attn_mask: Tensor = None,
+                key_padding_mask: Tensor = None,
+                memory_text: Tensor = None,
+                text_attention_mask: Tensor = None,
+                **kwargs) -> Tensor:
+        """Implements decoder layer in Grounding DINO transformer.
+
+        Args:
+            query (Tensor): The input query, has shape (bs, num_queries, dim).
+            key (Tensor, optional): The input key, has shape (bs, num_keys,
+                dim). If `None`, the `query` will be used. Defaults to `None`.
+            value (Tensor, optional): The input value, has the same shape as
+                `key`, as in `nn.MultiheadAttention.forward`. If `None`, the
+                `key` will be used. Defaults to `None`.
+            query_pos (Tensor, optional): The positional encoding for `query`,
+                has the same shape as `query`. If not `None`, it will be added
+                to `query` before forward function. Defaults to `None`.
+            key_pos (Tensor, optional): The positional encoding for `key`, has
+                the same shape as `key`. If not `None`, it will be added to
+                `key` before forward function. If None, and `query_pos` has the
+                same shape as `key`, then `query_pos` will be used for
+                `key_pos`. Defaults to None.
+            self_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            cross_attn_mask (Tensor, optional): ByteTensor mask, has shape
+                (num_queries, num_keys), as in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor, optional): The `key_padding_mask` of
+                `self_attn` input. ByteTensor, has shape (bs, num_value).
+                Defaults to None.
+            memory_text (Tensor): Memory text.
It has shape (bs, len_text, + text_embed_dims). + text_attention_mask (Tensor): Text token mask. It has shape (bs, + len_text). + + Returns: + Tensor: forwarded results, has shape (bs, num_queries, dim). + """ + # self attention + query = self.self_attn( + query=query, + key=query, + value=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_mask, + **kwargs) + query = self.norms[0](query) + # cross attention between query and text + query = self.cross_attn_text( + query=query, + query_pos=query_pos, + key=memory_text, + value=memory_text, + key_padding_mask=text_attention_mask) + query = self.norms[1](query) + # cross attention between query and image + query = self.cross_attn( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=cross_attn_mask, + key_padding_mask=key_padding_mask, + **kwargs) + query = self.norms[2](query) + query = self.ffn(query) + query = self.norms[3](query) + + return query + + +class GroundingDinoTransformerEncoder(DeformableDetrTransformerEncoder): + + def __init__(self, text_layer_cfg: ConfigType, + fusion_layer_cfg: ConfigType, **kwargs) -> None: + self.text_layer_cfg = text_layer_cfg + self.fusion_layer_cfg = fusion_layer_cfg + super().__init__(**kwargs) + + def _init_layers(self) -> None: + """Initialize encoder layers.""" + self.layers = ModuleList([ + DeformableDetrTransformerEncoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.text_layers = ModuleList([ + DetrTransformerEncoderLayer(**self.text_layer_cfg) + for _ in range(self.num_layers) + ]) + self.fusion_layers = ModuleList([ + SingleScaleBiAttentionBlock(**self.fusion_layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + if self.num_cp > 0: + if checkpoint_wrapper is None: + raise NotImplementedError( + 'If you want to reduce GPU memory usage, \ + please install fairscale by executing the \ + following command: pip install fairscale.') + for i in range(self.num_cp): + self.layers[i] = checkpoint_wrapper(self.layers[i]) + self.fusion_layers[i] = checkpoint_wrapper( + self.fusion_layers[i]) + + def forward(self, + query: Tensor, + query_pos: Tensor, + key_padding_mask: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + memory_text: Tensor = None, + text_attention_mask: Tensor = None, + pos_text: Tensor = None, + text_self_attention_masks: Tensor = None, + position_ids: Tensor = None): + """Forward function of Transformer encoder. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + query_pos (Tensor): The positional encoding for query, has shape + (bs, num_queries, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (bs, num_queries). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + memory_text (Tensor, optional): Memory text. It has shape (bs, + len_text, text_embed_dims). + text_attention_mask (Tensor, optional): Text token mask. It has + shape (bs,len_text). + pos_text (Tensor, optional): The positional encoding for text. + Defaults to None. 
+ text_self_attention_masks (Tensor, optional): Text self attention + mask. Defaults to None. + position_ids (Tensor, optional): Text position ids. + Defaults to None. + """ + output = query + reference_points = self.get_encoder_reference_points( + spatial_shapes, valid_ratios, device=query.device) + if self.text_layers: + # generate pos_text + bs, n_text, _ = memory_text.shape + if pos_text is None and position_ids is None: + pos_text = ( + torch.arange(n_text, + device=memory_text.device).float().unsqueeze( + 0).unsqueeze(-1).repeat(bs, 1, 1)) + pos_text = get_text_sine_pos_embed( + pos_text, num_pos_feats=256, exchange_xy=False) + if position_ids is not None: + pos_text = get_text_sine_pos_embed( + position_ids[..., None], + num_pos_feats=256, + exchange_xy=False) + + # main process + for layer_id, layer in enumerate(self.layers): + if self.fusion_layers: + output, memory_text = self.fusion_layers[layer_id]( + visual_feature=output, + lang_feature=memory_text, + attention_mask_v=key_padding_mask, + attention_mask_l=text_attention_mask, + ) + if self.text_layers: + text_num_heads = self.text_layers[ + layer_id].self_attn_cfg.num_heads + memory_text = self.text_layers[layer_id]( + query=memory_text, + query_pos=(pos_text if pos_text is not None else None), + attn_mask=~text_self_attention_masks.repeat( + text_num_heads, 1, 1), # note we use ~ for mask here + key_padding_mask=None, + ) + output = layer( + query=output, + query_pos=query_pos, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask) + return output, memory_text + + +class GroundingDinoTransformerDecoder(DinoTransformerDecoder): + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + self.layers = ModuleList([ + GroundingDinoTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + if self.post_norm_cfg is not None: + raise ValueError('There is not post_norm in ' + f'{self._get_name()}') + self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims, + self.embed_dims, 2) + self.norm = nn.LayerNorm(self.embed_dims) diff --git a/mmdetection/mmdet/models/layers/transformer/mask2former_layers.py b/mmdetection/mmdet/models/layers/transformer/mask2former_layers.py new file mode 100644 index 0000000..dcc604e --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/mask2former_layers.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_norm_layer +from mmengine.model import ModuleList +from torch import Tensor + +from .deformable_detr_layers import DeformableDetrTransformerEncoder +from .detr_layers import DetrTransformerDecoder, DetrTransformerDecoderLayer + + +class Mask2FormerTransformerEncoder(DeformableDetrTransformerEncoder): + """Encoder in PixelDecoder of Mask2Former.""" + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios: Tensor, + reference_points: Tensor, **kwargs) -> Tensor: + """Forward function of Transformer encoder. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + query_pos (Tensor): The positional encoding for query, has shape + (bs, num_queries, dim). If not None, it will be added to the + `query` before forward function. Defaults to None. + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (bs, num_queries). 
+ spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 2) with the last dimension arranged + as (cx, cy). + + Returns: + Tensor: Output queries of Transformer encoder, which is also + called 'encoder output embeddings' or 'memory', has shape + (bs, num_queries, dim) + """ + for layer in self.layers: + query = layer( + query=query, + query_pos=query_pos, + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points, + **kwargs) + return query + + +class Mask2FormerTransformerDecoder(DetrTransformerDecoder): + """Decoder of Mask2Former.""" + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + self.layers = ModuleList([ + Mask2FormerTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + self.post_norm = build_norm_layer(self.post_norm_cfg, + self.embed_dims)[1] + + +class Mask2FormerTransformerDecoderLayer(DetrTransformerDecoderLayer): + """Implements decoder layer in Mask2Former transformer.""" + + def forward(self, + query: Tensor, + key: Tensor = None, + value: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + self_attn_mask: Tensor = None, + cross_attn_mask: Tensor = None, + key_padding_mask: Tensor = None, + **kwargs) -> Tensor: + """ + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + key (Tensor, optional): The input key, has shape (bs, num_keys, + dim). If `None`, the `query` will be used. Defaults to `None`. + value (Tensor, optional): The input value, has the same shape as + `key`, as in `nn.MultiheadAttention.forward`. If `None`, the + `key` will be used. Defaults to `None`. + query_pos (Tensor, optional): The positional encoding for `query`, + has the same shape as `query`. If not `None`, it will be added + to `query` before forward function. Defaults to `None`. + key_pos (Tensor, optional): The positional encoding for `key`, has + the same shape as `key`. If not `None`, it will be added to + `key` before forward function. If None, and `query_pos` has the + same shape as `key`, then `query_pos` will be used for + `key_pos`. Defaults to None. + self_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + cross_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor, optional): The `key_padding_mask` of + `self_attn` input. ByteTensor, has shape (bs, num_value). + Defaults to None. + + Returns: + Tensor: forwarded results, has shape (bs, num_queries, dim). 
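+
+        Example:
+            A minimal sketch with illustrative shapes. Note that, unlike
+            `DetrTransformerDecoderLayer`, cross attention is applied before
+            self attention here.
+
+            >>> import torch
+            >>> layer = Mask2FormerTransformerDecoderLayer()
+            >>> query = torch.rand(2, 100, 256)
+            >>> memory = torch.rand(2, 1024, 256)
+            >>> out = layer(query, key=memory, value=memory)
+            >>> assert out.shape == (2, 100, 256)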
+ """ + + query = self.cross_attn( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=cross_attn_mask, + key_padding_mask=key_padding_mask, + **kwargs) + query = self.norms[0](query) + query = self.self_attn( + query=query, + key=query, + value=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_mask, + **kwargs) + query = self.norms[1](query) + query = self.ffn(query) + query = self.norms[2](query) + + return query diff --git a/mmdetection/mmdet/models/layers/transformer/utils.py b/mmdetection/mmdet/models/layers/transformer/utils.py new file mode 100644 index 0000000..6e43a17 --- /dev/null +++ b/mmdetection/mmdet/models/layers/transformer/utils.py @@ -0,0 +1,915 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings +from typing import Optional, Sequence, Tuple, Union + +import torch +import torch.nn.functional as F +from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, + build_norm_layer) +from mmcv.cnn.bricks.drop import Dropout +from mmengine.model import BaseModule, ModuleList +from mmengine.utils import to_2tuple +from torch import Tensor, nn + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig + + +def nlc_to_nchw(x: Tensor, hw_shape: Sequence[int]) -> Tensor: + """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor. + + Args: + x (Tensor): The input tensor of shape [N, L, C] before conversion. + hw_shape (Sequence[int]): The height and width of output feature map. + + Returns: + Tensor: The output tensor of shape [N, C, H, W] after conversion. + """ + H, W = hw_shape + assert len(x.shape) == 3 + B, L, C = x.shape + assert L == H * W, 'The seq_len does not match H, W' + return x.transpose(1, 2).reshape(B, C, H, W).contiguous() + + +def nchw_to_nlc(x): + """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor. + + Args: + x (Tensor): The input tensor of shape [N, C, H, W] before conversion. + + Returns: + Tensor: The output tensor of shape [N, L, C] after conversion. + """ + assert len(x.shape) == 4 + return x.flatten(2).transpose(1, 2).contiguous() + + +def coordinate_to_encoding(coord_tensor: Tensor, + num_feats: int = 128, + temperature: int = 10000, + scale: float = 2 * math.pi): + """Convert coordinate tensor to positional encoding. + + Args: + coord_tensor (Tensor): Coordinate tensor to be converted to + positional encoding. With the last dimension as 2 or 4. + num_feats (int, optional): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. Defaults to 128. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + Returns: + Tensor: Returned encoded positional tensor. 
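+
+    Example:
+        A shape-only sketch; with the default `num_feats=128` each
+        coordinate is mapped to 128 features, so 2-d points yield 256-d
+        embeddings and 4-d boxes yield 512-d embeddings.
+
+        >>> points = torch.rand(2, 5, 2)
+        >>> assert coordinate_to_encoding(points).shape == (2, 5, 256)
+        >>> boxes = torch.rand(2, 5, 4)
+        >>> assert coordinate_to_encoding(boxes).shape == (2, 5, 512)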
+ """ + dim_t = torch.arange( + num_feats, dtype=torch.float32, device=coord_tensor.device) + dim_t = temperature**(2 * (dim_t // 2) / num_feats) + x_embed = coord_tensor[..., 0] * scale + y_embed = coord_tensor[..., 1] * scale + pos_x = x_embed[..., None] / dim_t + pos_y = y_embed[..., None] / dim_t + pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), + dim=-1).flatten(2) + pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), + dim=-1).flatten(2) + if coord_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=-1) + elif coord_tensor.size(-1) == 4: + w_embed = coord_tensor[..., 2] * scale + pos_w = w_embed[..., None] / dim_t + pos_w = torch.stack((pos_w[..., 0::2].sin(), pos_w[..., 1::2].cos()), + dim=-1).flatten(2) + + h_embed = coord_tensor[..., 3] * scale + pos_h = h_embed[..., None] / dim_t + pos_h = torch.stack((pos_h[..., 0::2].sin(), pos_h[..., 1::2].cos()), + dim=-1).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=-1) + else: + raise ValueError('Unknown pos_tensor shape(-1):{}'.format( + coord_tensor.size(-1))) + return pos + + +def inverse_sigmoid(x: Tensor, eps: float = 1e-5) -> Tensor: + """Inverse function of sigmoid. + + Args: + x (Tensor): The tensor to do the inverse. + eps (float): EPS avoid numerical overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse function of sigmoid, has the same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +class AdaptivePadding(nn.Module): + """Applies padding to input (if needed) so that input can get fully covered + by filter you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around + input. The "corner" mode would pad zero to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel: + stride (int | tuple): Stride of the filter. Default: 1: + dilation (int | tuple): Spacing between kernel elements. + Default: 1 + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". 
+ Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + + super(AdaptivePadding, self).__init__() + + assert padding in ('same', 'corner') + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + padding = to_2tuple(padding) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ]) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The config dict for embedding + conv layer type selection. Default: "Conv2d. + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: None (Would be set as `kernel_size`). + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only work when `dynamic_size` + is False. Default: None. + init_cfg (`mmengine.ConfigDict`, optional): The Config for + initialization. Default: None. 
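+
+    Example:
+        A minimal sketch; the shapes assume the default 16x16 patches.
+
+        >>> patch_embed = PatchEmbed(in_channels=3, embed_dims=768)
+        >>> x = torch.rand(1, 3, 224, 224)
+        >>> out, out_size = patch_embed(x)
+        >>> assert out.shape == (1, 14 * 14, 768)
+        >>> assert out_size == (14, 14)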
+ """ + + def __init__(self, + in_channels: int = 3, + embed_dims: int = 768, + conv_type: str = 'Conv2d', + kernel_size: int = 16, + stride: int = 16, + padding: Union[int, tuple, str] = 'corner', + dilation: int = 1, + bias: bool = True, + norm_cfg: OptConfigType = None, + input_size: Union[int, tuple] = None, + init_cfg: OptConfigType = None) -> None: + super(PatchEmbed, self).__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adap_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adap_padding = None + padding = to_2tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_2tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adap_padding: + pad_h, pad_w = self.adap_padding.get_pad_shape(input_size) + input_h, input_w = input_size + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + h_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + w_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + self.init_out_size = (h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x: Tensor) -> Tuple[Tensor, Tuple[int]]: + """ + Args: + x (Tensor): Has shape (B, C, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_h, out_w). + """ + + if self.adap_padding: + x = self.adap_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map. Our implementation uses `nn.Unfold` to + merge patch, which is about 25% faster than original implementation. + Instead, we need to modify pretrained models for compatibility. + + Args: + in_channels (int): The num of input channels. + to gets fully covered by filter and stride you specified.. + Default: True. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". 
+ dilation (int | tuple, optional): dilation parameter in the unfold + layer. Default: 1. + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Optional[Union[int, tuple]] = 2, + stride: Optional[Union[int, tuple]] = None, + padding: Union[int, tuple, str] = 'corner', + dilation: Optional[Union[int, tuple]] = 1, + bias: Optional[bool] = False, + norm_cfg: OptConfigType = dict(type='LN'), + init_cfg: OptConfigType = None) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if stride: + stride = stride + else: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adap_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of unfold + padding = 0 + else: + self.adap_padding = None + + padding = to_2tuple(padding) + self.sampler = nn.Unfold( + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + stride=stride) + + sample_dim = kernel_size[0] * kernel_size[1] * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x: Tensor, + input_size: Tuple[int]) -> Tuple[Tensor, Tuple[int]]: + """ + Args: + x (Tensor): Has shape (B, H*W, C_in). + input_size (tuple[int]): The spatial shape of x, arrange as (H, W). + Default: None. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) + - out_size (tuple[int]): Spatial shape of x, arrange as + (Merged_H, Merged_W). + """ + B, L, C = x.shape + assert isinstance(input_size, Sequence), f'Expect ' \ + f'input_size is ' \ + f'`Sequence` ' \ + f'but get {input_size}' + + H, W = input_size + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + + if self.adap_padding: + x = self.adap_padding(x) + H, W = x.shape[-2:] + + x = self.sampler(x) + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + + out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * + (self.sampler.kernel_size[0] - 1) - + 1) // self.sampler.stride[0] + 1 + out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * + (self.sampler.kernel_size[1] - 1) - + 1) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + x = self.norm(x) if self.norm else x + x = self.reduction(x) + return x, output_size + + +class ConditionalAttention(BaseModule): + """A wrapper of conditional attention, dropout and residual connection. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop: A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. 
+ cross_attn (bool): Whether the attention module is for cross attention. + Default: False + keep_query_pos (bool): Whether to transform query_pos before cross + attention. + Default: False. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default: True. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims: int, + num_heads: int, + attn_drop: float = 0., + proj_drop: float = 0., + cross_attn: bool = False, + keep_query_pos: bool = False, + batch_first: bool = True, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + assert batch_first is True, 'Set `batch_first`\ + to False is NOT supported in ConditionalAttention. \ + First dimension of all DETRs in mmdet is `batch`, \ + please set `batch_first` to True.' + + self.cross_attn = cross_attn + self.keep_query_pos = keep_query_pos + self.embed_dims = embed_dims + self.num_heads = num_heads + self.attn_drop = Dropout(attn_drop) + self.proj_drop = Dropout(proj_drop) + + self._init_layers() + + def _init_layers(self): + """Initialize layers for qkv projection.""" + embed_dims = self.embed_dims + self.qcontent_proj = Linear(embed_dims, embed_dims) + self.qpos_proj = Linear(embed_dims, embed_dims) + self.kcontent_proj = Linear(embed_dims, embed_dims) + self.kpos_proj = Linear(embed_dims, embed_dims) + self.v_proj = Linear(embed_dims, embed_dims) + if self.cross_attn: + self.qpos_sine_proj = Linear(embed_dims, embed_dims) + self.out_proj = Linear(embed_dims, embed_dims) + + nn.init.constant_(self.out_proj.bias, 0.) + + def forward_attn(self, + query: Tensor, + key: Tensor, + value: Tensor, + attn_mask: Tensor = None, + key_padding_mask: Tensor = None) -> Tuple[Tensor]: + """Forward process for `ConditionalAttention`. + + Args: + query (Tensor): The input query with shape [bs, num_queries, + embed_dims]. + key (Tensor): The key tensor with shape [bs, num_keys, + embed_dims]. + If None, the `query` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + Returns: + Tuple[Tensor]: Attention outputs of shape :math:`(N, L, E)`, + where :math:`N` is the batch size, :math:`L` is the target + sequence length , and :math:`E` is the embedding dimension + `embed_dim`. Attention weights per head of shape :math:` + (num_heads, L, S)`. where :math:`N` is batch size, :math:`L` + is target sequence length, and :math:`S` is the source sequence + length. 
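+
+        Example:
+            A shape-only sketch of the self-attention case with illustrative
+            sizes.
+
+            >>> attn = ConditionalAttention(embed_dims=256, num_heads=8)
+            >>> x = torch.rand(2, 100, 256)
+            >>> out, weights = attn.forward_attn(query=x, key=x, value=x)
+            >>> assert out.shape == (2, 100, 256)
+            >>> assert weights.shape == (2, 100, 100)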
+ """ + assert key.size(1) == value.size(1), \ + f'{"key, value must have the same sequence length"}' + assert query.size(0) == key.size(0) == value.size(0), \ + f'{"batch size must be equal for query, key, value"}' + assert query.size(2) == key.size(2), \ + f'{"q_dims, k_dims must be equal"}' + assert value.size(2) == self.embed_dims, \ + f'{"v_dims must be equal to embed_dims"}' + + bs, tgt_len, hidden_dims = query.size() + _, src_len, _ = key.size() + head_dims = hidden_dims // self.num_heads + v_head_dims = self.embed_dims // self.num_heads + assert head_dims * self.num_heads == hidden_dims, \ + f'{"hidden_dims must be divisible by num_heads"}' + scaling = float(head_dims)**-0.5 + + q = query * scaling + k = key + v = value + + if attn_mask is not None: + assert attn_mask.dtype == torch.float32 or \ + attn_mask.dtype == torch.float64 or \ + attn_mask.dtype == torch.float16 or \ + attn_mask.dtype == torch.uint8 or \ + attn_mask.dtype == torch.bool, \ + 'Only float, byte, and bool types are supported for \ + attn_mask' + + if attn_mask.dtype == torch.uint8: + warnings.warn('Byte tensor for attn_mask is deprecated.\ + Use bool tensor instead.') + attn_mask = attn_mask.to(torch.bool) + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(1), key.size(1)]: + raise RuntimeError( + 'The size of the 2D attn_mask is not correct.') + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bs * self.num_heads, + query.size(1), + key.size(1) + ]: + raise RuntimeError( + 'The size of the 3D attn_mask is not correct.') + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format( + attn_mask.dim())) + # attn_mask's dim is 3 now. + + if key_padding_mask is not None and key_padding_mask.dtype == int: + key_padding_mask = key_padding_mask.to(torch.bool) + + q = q.contiguous().view(bs, tgt_len, self.num_heads, + head_dims).permute(0, 2, 1, 3).flatten(0, 1) + if k is not None: + k = k.contiguous().view(bs, src_len, self.num_heads, + head_dims).permute(0, 2, 1, + 3).flatten(0, 1) + if v is not None: + v = v.contiguous().view(bs, src_len, self.num_heads, + v_head_dims).permute(0, 2, 1, + 3).flatten(0, 1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bs + assert key_padding_mask.size(1) == src_len + + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_output_weights.size()) == [ + bs * self.num_heads, tgt_len, src_len + ] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float('-inf')) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view( + bs, self.num_heads, tgt_len, src_len) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float('-inf'), + ) + attn_output_weights = attn_output_weights.view( + bs * self.num_heads, tgt_len, src_len) + + attn_output_weights = F.softmax( + attn_output_weights - + attn_output_weights.max(dim=-1, keepdim=True)[0], + dim=-1) + attn_output_weights = self.attn_drop(attn_output_weights) + + attn_output = torch.bmm(attn_output_weights, v) + assert list( + attn_output.size()) == [bs * self.num_heads, tgt_len, v_head_dims] + attn_output = attn_output.view(bs, self.num_heads, tgt_len, + v_head_dims).permute(0, 2, 1, + 3).flatten(2) + attn_output = self.out_proj(attn_output) + + # average attention weights over heads + attn_output_weights = 
attn_output_weights.view(bs, self.num_heads, + tgt_len, src_len) + return attn_output, attn_output_weights.sum(dim=1) / self.num_heads + + def forward(self, + query: Tensor, + key: Tensor, + query_pos: Tensor = None, + ref_sine_embed: Tensor = None, + key_pos: Tensor = None, + attn_mask: Tensor = None, + key_padding_mask: Tensor = None, + is_first: bool = False) -> Tensor: + """Forward function for `ConditionalAttention`. + Args: + query (Tensor): The input query with shape [bs, num_queries, + embed_dims]. + key (Tensor): The key tensor with shape [bs, num_keys, + embed_dims]. + If None, the `query` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query in self + attention, with the same shape as `x`. If not None, it will + be added to `x` before forward function. + Defaults to None. + query_sine_embed (Tensor): The positional encoding for query in + cross attention, with the same shape as `x`. If not None, it + will be added to `x` before forward function. + Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + is_first (bool): A indicator to tell whether the current layer + is the first layer of the decoder. + Defaults to False. + Returns: + Tensor: forwarded results with shape + [bs, num_queries, embed_dims]. + """ + + if self.cross_attn: + q_content = self.qcontent_proj(query) + k_content = self.kcontent_proj(key) + v = self.v_proj(key) + + bs, nq, c = q_content.size() + _, hw, _ = k_content.size() + + k_pos = self.kpos_proj(key_pos) + if is_first or self.keep_query_pos: + q_pos = self.qpos_proj(query_pos) + q = q_content + q_pos + k = k_content + k_pos + else: + q = q_content + k = k_content + q = q.view(bs, nq, self.num_heads, c // self.num_heads) + query_sine_embed = self.qpos_sine_proj(ref_sine_embed) + query_sine_embed = query_sine_embed.view(bs, nq, self.num_heads, + c // self.num_heads) + q = torch.cat([q, query_sine_embed], dim=3).view(bs, nq, 2 * c) + k = k.view(bs, hw, self.num_heads, c // self.num_heads) + k_pos = k_pos.view(bs, hw, self.num_heads, c // self.num_heads) + k = torch.cat([k, k_pos], dim=3).view(bs, hw, 2 * c) + ca_output = self.forward_attn( + query=q, + key=k, + value=v, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + query = query + self.proj_drop(ca_output) + else: + q_content = self.qcontent_proj(query) + q_pos = self.qpos_proj(query_pos) + k_content = self.kcontent_proj(query) + k_pos = self.kpos_proj(query_pos) + v = self.v_proj(query) + q = q_content if q_pos is None else q_content + q_pos + k = k_content if k_pos is None else k_content + k_pos + sa_output = self.forward_attn( + query=q, + key=k, + value=v, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + query = query + self.proj_drop(sa_output) + + return query + + +class MLP(BaseModule): + """Very simple multi-layer perceptron (also called FFN) with relu. Mostly + used in DETR series detectors. + + Args: + input_dim (int): Feature dim of the input tensor. + hidden_dim (int): Feature dim of the hidden layer. + output_dim (int): Feature dim of the output tensor. 
+        num_layers (int): Number of FFN layers. The last layer of the
+            MLP only contains a Linear layer (no activation).
+    """
+
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int,
+                 num_layers: int) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = ModuleList(
+            Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function of MLP.
+
+        Args:
+            x (Tensor): The input feature, has shape
+                (num_queries, bs, input_dim).
+        Returns:
+            Tensor: The output feature, has shape
+                (num_queries, bs, output_dim).
+        """
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+@MODELS.register_module()
+class DynamicConv(BaseModule):
+    """Implements Dynamic Convolution.
+
+    This module generates parameters for each sample and
+    uses bmm to implement 1x1 convolution. Code is modified
+    from the `official github repo `_ .
+
+    Args:
+        in_channels (int): The input feature channel.
+            Defaults to 256.
+        feat_channels (int): The inner feature channel.
+            Defaults to 64.
+        out_channels (int, optional): The output feature channel.
+            When not specified, it will be set to `in_channels`
+            by default.
+        input_feat_shape (int): The shape of input feature.
+            Defaults to 7.
+        with_proj (bool): Whether to project the two-dimensional
+            feature to a one-dimensional feature. Defaults to True.
+        act_cfg (dict): The activation config for DynamicConv.
+        norm_cfg (dict): Config dict for normalization layer. Default
+            layer normalization.
+        init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int = 256,
+                 feat_channels: int = 64,
+                 out_channels: Optional[int] = None,
+                 input_feat_shape: int = 7,
+                 with_proj: bool = True,
+                 act_cfg: OptConfigType = dict(type='ReLU', inplace=True),
+                 norm_cfg: OptConfigType = dict(type='LN'),
+                 init_cfg: OptConfigType = None) -> None:
+        super(DynamicConv, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.out_channels_raw = out_channels
+        self.input_feat_shape = input_feat_shape
+        self.with_proj = with_proj
+        self.act_cfg = act_cfg
+        self.norm_cfg = norm_cfg
+        self.out_channels = out_channels if out_channels else in_channels
+
+        self.num_params_in = self.in_channels * self.feat_channels
+        self.num_params_out = self.out_channels * self.feat_channels
+        self.dynamic_layer = nn.Linear(
+            self.in_channels, self.num_params_in + self.num_params_out)
+
+        self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1]
+        self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1]
+
+        self.activation = build_activation_layer(act_cfg)
+
+        num_output = self.out_channels * input_feat_shape**2
+        if self.with_proj:
+            self.fc_layer = nn.Linear(num_output, self.out_channels)
+            self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1]
+
+    def forward(self, param_feature: Tensor, input_feature: Tensor) -> Tensor:
+        """Forward function for `DynamicConv`.
+
+        Args:
+            param_feature (Tensor): The feature used to generate the
+                parameters, has shape (num_all_proposals, in_channels).
+            input_feature (Tensor): Feature that interacts with the
+                parameters, has shape (num_all_proposals, in_channels, H, W).
+
+        Returns:
+            Tensor: The output feature has shape
+                (num_all_proposals, out_channels).
+ """ + input_feature = input_feature.flatten(2).permute(2, 0, 1) + + input_feature = input_feature.permute(1, 0, 2) + parameters = self.dynamic_layer(param_feature) + + param_in = parameters[:, :self.num_params_in].view( + -1, self.in_channels, self.feat_channels) + param_out = parameters[:, -self.num_params_out:].view( + -1, self.feat_channels, self.out_channels) + + # input_feature has shape (num_all_proposals, H*W, in_channels) + # param_in has shape (num_all_proposals, in_channels, feat_channels) + # feature has shape (num_all_proposals, H*W, feat_channels) + features = torch.bmm(input_feature, param_in) + features = self.norm_in(features) + features = self.activation(features) + + # param_out has shape (batch_size, feat_channels, out_channels) + features = torch.bmm(features, param_out) + features = self.norm_out(features) + features = self.activation(features) + + if self.with_proj: + features = features.flatten(1) + features = self.fc_layer(features) + features = self.fc_norm(features) + features = self.activation(features) + + return features + + +def get_text_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. For example, + input tensor is [x,y], the results will be [pos(y), pos(x)]. + Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. + """ + scale = 2 * math.pi + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature**(2 * torch.div(dim_t, 2, rounding_mode='floor') / + num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), + dim=3).flatten(2) + return sin_x + + pos_res = [ + sine_func(x) + for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1) + ] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res diff --git a/mmdetection/mmdet/models/losses/__init__.py b/mmdetection/mmdet/models/losses/__init__.py new file mode 100644 index 0000000..7c57a3a --- /dev/null +++ b/mmdetection/mmdet/models/losses/__init__.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .accuracy import Accuracy, accuracy +from .ae_loss import AssociativeEmbeddingLoss +from .balanced_l1_loss import BalancedL1Loss, balanced_l1_loss +from .cross_entropy_loss import (CrossEntropyCustomLoss, CrossEntropyLoss, + binary_cross_entropy, cross_entropy, + mask_cross_entropy) +from .ddq_detr_aux_loss import DDQAuxLoss +from .dice_loss import DiceLoss +from .eqlv2_loss import EQLV2Loss +from .focal_loss import FocalCustomLoss, FocalLoss, sigmoid_focal_loss +from .gaussian_focal_loss import GaussianFocalLoss +from .gfocal_loss import DistributionFocalLoss, QualityFocalLoss +from .ghm_loss import GHMC, GHMR +from .iou_loss import (BoundedIoULoss, CIoULoss, DIoULoss, EIoULoss, GIoULoss, + IoULoss, SIoULoss, bounded_iou_loss, iou_loss) +from .kd_loss import KnowledgeDistillationKLDivLoss +from .l2_loss import L2Loss +from .margin_loss import MarginL2Loss +from .mse_loss import MSELoss, mse_loss +from .multipos_cross_entropy_loss import MultiPosCrossEntropyLoss +from .pisa_loss import carl_loss, isr_p +from .seesaw_loss import SeesawLoss +from .smooth_l1_loss import L1Loss, SmoothL1Loss, l1_loss, smooth_l1_loss +from .triplet_loss import TripletLoss +from .utils import reduce_loss, weight_reduce_loss, weighted_loss +from .varifocal_loss import VarifocalLoss + +__all__ = [ + 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', + 'mask_cross_entropy', 'CrossEntropyLoss', 'sigmoid_focal_loss', + 'FocalLoss', 'smooth_l1_loss', 'SmoothL1Loss', 'balanced_l1_loss', + 'BalancedL1Loss', 'mse_loss', 'MSELoss', 'iou_loss', 'bounded_iou_loss', + 'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'DIoULoss', 'CIoULoss', + 'EIoULoss', 'SIoULoss', 'GHMC', 'GHMR', 'reduce_loss', + 'weight_reduce_loss', 'weighted_loss', 'L1Loss', 'l1_loss', 'isr_p', + 'carl_loss', 'AssociativeEmbeddingLoss', 'GaussianFocalLoss', + 'QualityFocalLoss', 'DistributionFocalLoss', 'VarifocalLoss', + 'KnowledgeDistillationKLDivLoss', 'SeesawLoss', 'DiceLoss', 'EQLV2Loss', + 'MarginL2Loss', 'MultiPosCrossEntropyLoss', 'L2Loss', 'TripletLoss', + 'DDQAuxLoss', 'CrossEntropyCustomLoss', 'FocalCustomLoss' +] diff --git a/mmdetection/mmdet/models/losses/accuracy.py b/mmdetection/mmdet/models/losses/accuracy.py new file mode 100644 index 0000000..d68484e --- /dev/null +++ b/mmdetection/mmdet/models/losses/accuracy.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + + +def accuracy(pred, target, topk=1, thresh=None): + """Calculate accuracy according to the prediction and target. + + Args: + pred (torch.Tensor): The model prediction, shape (N, num_class) + target (torch.Tensor): The target of each prediction, shape (N, ) + topk (int | tuple[int], optional): If the predictions in ``topk`` + matches the target, the predictions will be regarded as + correct ones. Defaults to 1. + thresh (float, optional): If not None, predictions with scores under + this threshold are considered incorrect. Default to None. + + Returns: + float | tuple[float]: If the input ``topk`` is a single integer, + the function will return a single float as accuracy. If + ``topk`` is a tuple containing multiple integers, the + function will return a tuple containing accuracies of + each ``topk`` number. + """ + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + return_single = True + else: + return_single = False + + maxk = max(topk) + if pred.size(0) == 0: + accu = [pred.new_tensor(0.) 
for i in range(len(topk))]
+        return accu[0] if return_single else accu
+    assert pred.ndim == 2 and target.ndim == 1
+    assert pred.size(0) == target.size(0)
+    assert maxk <= pred.size(1), \
+        f'maxk {maxk} exceeds pred dimension {pred.size(1)}'
+    pred_value, pred_label = pred.topk(maxk, dim=1)
+    pred_label = pred_label.t()  # transpose to shape (maxk, N)
+    correct = pred_label.eq(target.view(1, -1).expand_as(pred_label))
+    if thresh is not None:
+        # Only prediction values larger than thresh are counted as correct
+        correct = correct & (pred_value > thresh).t()
+    res = []
+    for k in topk:
+        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+        res.append(correct_k.mul_(100.0 / pred.size(0)))
+    return res[0] if return_single else res
+
+
+class Accuracy(nn.Module):
+
+    def __init__(self, topk=(1, ), thresh=None):
+        """Module to calculate the accuracy.
+
+        Args:
+            topk (tuple, optional): The criterion used to calculate the
+                accuracy. Defaults to (1,).
+            thresh (float, optional): If not None, predictions with scores
+                under this threshold are considered incorrect.
+                Defaults to None.
+        """
+        super().__init__()
+        self.topk = topk
+        self.thresh = thresh
+
+    def forward(self, pred, target):
+        """Forward function to calculate accuracy.
+
+        Args:
+            pred (torch.Tensor): Prediction of models.
+            target (torch.Tensor): Target for each prediction.
+
+        Returns:
+            tuple[float]: The accuracies under different topk criteria.
+        """
+        return accuracy(pred, target, self.topk, self.thresh)
diff --git a/mmdetection/mmdet/models/losses/ae_loss.py b/mmdetection/mmdet/models/losses/ae_loss.py
new file mode 100644
index 0000000..2aa7d69
--- /dev/null
+++ b/mmdetection/mmdet/models/losses/ae_loss.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.registry import MODELS
+
+
+def ae_loss_per_image(tl_preds, br_preds, match):
+    """Associative Embedding Loss in one image.
+
+    Associative Embedding Loss includes two parts: pull loss and push loss.
+    Pull loss makes embedding vectors from the same object closer to each
+    other. Push loss distinguishes embedding vectors from different objects
+    and makes the gap between them large enough.
+
+    During computing, usually there are 3 cases:
+        - no object in image: both pull loss and push loss will be 0.
+        - one object in image: push loss will be 0 and pull loss is computed
+            by the two corners of the only object.
+        - more than one object in image: pull loss is computed by corner pairs
+            from each object, push loss is computed by each object with all
+            other objects. We use a confusion matrix with 0 on the diagonal to
+            compute the push loss.
+
+    Args:
+        tl_preds (tensor): Embedding feature map of the top-left corner.
+        br_preds (tensor): Embedding feature map of the bottom-right corner.
+        match (list): Downsampled coordinate pairs of each ground truth box.
+    """
+
+    tl_list, br_list, me_list = [], [], []
+    if len(match) == 0:  # no object in image
+        pull_loss = tl_preds.sum() * 0.
+        push_loss = tl_preds.sum() * 0.
+ else: + for m in match: + [tl_y, tl_x], [br_y, br_x] = m + tl_e = tl_preds[:, tl_y, tl_x].view(-1, 1) + br_e = br_preds[:, br_y, br_x].view(-1, 1) + tl_list.append(tl_e) + br_list.append(br_e) + me_list.append((tl_e + br_e) / 2.0) + + tl_list = torch.cat(tl_list) + br_list = torch.cat(br_list) + me_list = torch.cat(me_list) + + assert tl_list.size() == br_list.size() + + # N is object number in image, M is dimension of embedding vector + N, M = tl_list.size() + + pull_loss = (tl_list - me_list).pow(2) + (br_list - me_list).pow(2) + pull_loss = pull_loss.sum() / N + + margin = 1 # exp setting of CornerNet, details in section 3.3 of paper + + # confusion matrix of push loss + conf_mat = me_list.expand((N, N, M)).permute(1, 0, 2) - me_list + conf_weight = 1 - torch.eye(N).type_as(me_list) + conf_mat = conf_weight * (margin - conf_mat.sum(-1).abs()) + + if N > 1: # more than one object in current image + push_loss = F.relu(conf_mat).sum() / (N * (N - 1)) + else: + push_loss = tl_preds.sum() * 0. + + return pull_loss, push_loss + + +@MODELS.register_module() +class AssociativeEmbeddingLoss(nn.Module): + """Associative Embedding Loss. + + More details can be found in + `Associative Embedding `_ and + `CornerNet `_ . + Code is modified from `kp_utils.py `_ # noqa: E501 + + Args: + pull_weight (float): Loss weight for corners from same object. + push_weight (float): Loss weight for corners from different object. + """ + + def __init__(self, pull_weight=0.25, push_weight=0.25): + super(AssociativeEmbeddingLoss, self).__init__() + self.pull_weight = pull_weight + self.push_weight = push_weight + + def forward(self, pred, target, match): + """Forward function.""" + batch = pred.size(0) + pull_all, push_all = 0.0, 0.0 + for i in range(batch): + pull, push = ae_loss_per_image(pred[i], target[i], match[i]) + + pull_all += self.pull_weight * pull + push_all += self.push_weight * push + + return pull_all, push_all diff --git a/mmdetection/mmdet/models/losses/balanced_l1_loss.py b/mmdetection/mmdet/models/losses/balanced_l1_loss.py new file mode 100644 index 0000000..25adaab --- /dev/null +++ b/mmdetection/mmdet/models/losses/balanced_l1_loss.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import torch.nn as nn + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def balanced_l1_loss(pred, + target, + beta=1.0, + alpha=0.5, + gamma=1.5, + reduction='mean'): + """Calculate balanced L1 loss. + + Please see the `Libra R-CNN `_ + + Args: + pred (torch.Tensor): The prediction with shape (N, 4). + target (torch.Tensor): The learning target of the prediction with + shape (N, 4). + beta (float): The loss is a piecewise function of prediction and target + and ``beta`` serves as a threshold for the difference between the + prediction and target. Defaults to 1.0. + alpha (float): The denominator ``alpha`` in the balanced L1 loss. + Defaults to 0.5. + gamma (float): The ``gamma`` in the balanced L1 loss. + Defaults to 1.5. + reduction (str, optional): The method that reduces the loss to a + scalar. Options are "none", "mean" and "sum". 
+ + Returns: + torch.Tensor: The calculated loss + """ + assert beta > 0 + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + + diff = torch.abs(pred - target) + b = np.e**(gamma / alpha) - 1 + loss = torch.where( + diff < beta, alpha / b * + (b * diff + 1) * torch.log(b * diff / beta + 1) - alpha * diff, + gamma * diff + gamma / b - alpha * beta) + + return loss + + +@MODELS.register_module() +class BalancedL1Loss(nn.Module): + """Balanced L1 Loss. + + arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019) + + Args: + alpha (float): The denominator ``alpha`` in the balanced L1 loss. + Defaults to 0.5. + gamma (float): The ``gamma`` in the balanced L1 loss. Defaults to 1.5. + beta (float, optional): The loss is a piecewise function of prediction + and target. ``beta`` serves as a threshold for the difference + between the prediction and target. Defaults to 1.0. + reduction (str, optional): The method that reduces the loss to a + scalar. Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of the loss. Defaults to 1.0 + """ + + def __init__(self, + alpha=0.5, + gamma=1.5, + beta=1.0, + reduction='mean', + loss_weight=1.0): + super(BalancedL1Loss, self).__init__() + self.alpha = alpha + self.gamma = gamma + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function of loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, 4). + target (torch.Tensor): The learning target of the prediction with + shape (N, 4). + weight (torch.Tensor, optional): Sample-wise loss weight with + shape (N, ). + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * balanced_l1_loss( + pred, + target, + weight, + alpha=self.alpha, + gamma=self.gamma, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox diff --git a/mmdetection/mmdet/models/losses/cross_entropy_loss.py b/mmdetection/mmdet/models/losses/cross_entropy_loss.py new file mode 100644 index 0000000..49fac77 --- /dev/null +++ b/mmdetection/mmdet/models/losses/cross_entropy_loss.py @@ -0,0 +1,401 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.registry import MODELS +from .accuracy import accuracy +from .utils import weight_reduce_loss + + +def cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=-100, + avg_non_ignore=False): + """Calculate the CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + reduction (str, optional): The method used to reduce the loss. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
+ class_weight (list[float], optional): The weight for each class. + ignore_index (int | None): The label index to be ignored. + If None, it will be set to default value. Default: -100. + avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + + Returns: + torch.Tensor: The calculated loss + """ + # The default value of ignore_index is the same as F.cross_entropy + ignore_index = -100 if ignore_index is None else ignore_index + # element-wise losses + loss = F.cross_entropy( + pred, + label, + weight=class_weight, + reduction='none', + ignore_index=ignore_index) + + # average loss over non-ignored elements + # pytorch's official cross_entropy average loss over non-ignored elements + # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660 # noqa + if (avg_factor is None) and avg_non_ignore and reduction == 'mean': + avg_factor = label.numel() - (label == ignore_index).sum().item() + + # apply weights and do the reduction + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss( + loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def _expand_onehot_labels(labels, label_weights, label_channels, ignore_index): + """Expand onehot labels to match the size of prediction.""" + bin_labels = labels.new_full((labels.size(0), label_channels), 0) + valid_mask = (labels >= 0) & (labels != ignore_index) + inds = torch.nonzero( + valid_mask & (labels < label_channels), as_tuple=False) + + if inds.numel() > 0: + bin_labels[inds, labels[inds]] = 1 + + valid_mask = valid_mask.view(-1, 1).expand(labels.size(0), + label_channels).float() + if label_weights is None: + bin_label_weights = valid_mask + else: + bin_label_weights = label_weights.view(-1, 1).repeat(1, label_channels) + bin_label_weights *= valid_mask + + return bin_labels, bin_label_weights, valid_mask + + +def binary_cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=-100, + avg_non_ignore=False): + """Calculate the binary CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, 1) or (N, ). + When the shape of pred is (N, 1), label will be expanded to + one-hot format, and when the shape of pred is (N, ), label + will not be expanded to one-hot format. + label (torch.Tensor): The learning label of the prediction, + with shape (N, ). + weight (torch.Tensor, optional): Sample-wise loss weight. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (int | None): The label index to be ignored. + If None, it will be set to default value. Default: -100. + avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + + Returns: + torch.Tensor: The calculated loss. 
+ """ + # The default value of ignore_index is the same as F.cross_entropy + ignore_index = -100 if ignore_index is None else ignore_index + + if pred.dim() != label.dim(): + label, weight, valid_mask = _expand_onehot_labels( + label, weight, pred.size(-1), ignore_index) + else: + # should mask out the ignored elements + valid_mask = ((label >= 0) & (label != ignore_index)).float() + if weight is not None: + # The inplace writing method will have a mismatched broadcast + # shape error if the weight and valid_mask dimensions + # are inconsistent such as (B,N,1) and (B,N,C). + weight = weight * valid_mask + else: + weight = valid_mask + + # average loss over non-ignored elements + if (avg_factor is None) and avg_non_ignore and reduction == 'mean': + avg_factor = valid_mask.sum().item() + + # weighted element-wise losses + weight = weight.float() + loss = F.binary_cross_entropy_with_logits( + pred, label.float(), pos_weight=class_weight, reduction='none') + # do the reduction for the weighted loss + loss = weight_reduce_loss( + loss, weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def mask_cross_entropy(pred, + target, + label, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=None, + **kwargs): + """Calculate the CrossEntropy loss for masks. + + Args: + pred (torch.Tensor): The prediction with shape (N, C, *), C is the + number of classes. The trailing * indicates arbitrary shape. + target (torch.Tensor): The learning label of the prediction. + label (torch.Tensor): ``label`` indicates the class label of the mask + corresponding object. This will be used to select the mask in the + of the class which the object belongs to when the mask prediction + if not class-agnostic. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (None): Placeholder, to be consistent with other loss. + Default: None. + + Returns: + torch.Tensor: The calculated loss + + Example: + >>> N, C = 3, 11 + >>> H, W = 2, 2 + >>> pred = torch.randn(N, C, H, W) * 1000 + >>> target = torch.rand(N, H, W) + >>> label = torch.randint(0, C, size=(N,)) + >>> reduction = 'mean' + >>> avg_factor = None + >>> class_weights = None + >>> loss = mask_cross_entropy(pred, target, label, reduction, + >>> avg_factor, class_weights) + >>> assert loss.shape == (1,) + """ + assert ignore_index is None, 'BCE loss does not support ignore_index' + # TODO: handle these two reserved arguments + assert reduction == 'mean' and avg_factor is None + num_rois = pred.size()[0] + inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) + pred_slice = pred[inds, label].squeeze(1) + return F.binary_cross_entropy_with_logits( + pred_slice, target, weight=class_weight, reduction='mean')[None] + + +@MODELS.register_module() +class CrossEntropyLoss(nn.Module): + + def __init__(self, + use_sigmoid=False, + use_mask=False, + reduction='mean', + class_weight=None, + ignore_index=None, + loss_weight=1.0, + avg_non_ignore=False): + """CrossEntropyLoss. + + Args: + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to False. + use_mask (bool, optional): Whether to use mask cross entropy loss. + Defaults to False. + reduction (str, optional): . Defaults to 'mean'. + Options are "none", "mean" and "sum". 
+ class_weight (list[float], optional): Weight of each class. + Defaults to None. + ignore_index (int | None): The label index to be ignored. + Defaults to None. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. + avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + """ + super(CrossEntropyLoss, self).__init__() + assert (use_sigmoid is False) or (use_mask is False) + self.use_sigmoid = use_sigmoid + self.use_mask = use_mask + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = class_weight + self.ignore_index = ignore_index + self.avg_non_ignore = avg_non_ignore + if ((ignore_index is not None) and not self.avg_non_ignore + and self.reduction == 'mean'): + warnings.warn( + 'Default ``avg_non_ignore`` is False, if you would like to ' + 'ignore the certain label and average loss over non-ignore ' + 'labels, which is the same with PyTorch official ' + 'cross_entropy, set ``avg_non_ignore=True``.') + + if self.use_sigmoid: + self.cls_criterion = binary_cross_entropy + elif self.use_mask: + self.cls_criterion = mask_cross_entropy + else: + self.cls_criterion = cross_entropy + + def extra_repr(self): + """Extra repr.""" + s = f'avg_non_ignore={self.avg_non_ignore}' + return s + + def forward(self, + cls_score, + label, + weight=None, + avg_factor=None, + reduction_override=None, + ignore_index=None, + **kwargs): + """Forward function. + + Args: + cls_score (torch.Tensor): The prediction. + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The method used to reduce the + loss. Options are "none", "mean" and "sum". + ignore_index (int | None): The label index to be ignored. + If not None, it will override the default value. Default: None. + Returns: + torch.Tensor: The calculated loss. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if ignore_index is None: + ignore_index = self.ignore_index + + if self.class_weight is not None: + class_weight = cls_score.new_tensor( + self.class_weight, device=cls_score.device) + else: + class_weight = None + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + weight, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + ignore_index=ignore_index, + avg_non_ignore=self.avg_non_ignore, + **kwargs) + return loss_cls + + +@MODELS.register_module() +class CrossEntropyCustomLoss(CrossEntropyLoss): + + def __init__(self, + use_sigmoid=False, + use_mask=False, + reduction='mean', + num_classes=-1, + class_weight=None, + ignore_index=None, + loss_weight=1.0, + avg_non_ignore=False): + """CrossEntropyCustomLoss. + + Args: + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to False. + use_mask (bool, optional): Whether to use mask cross entropy loss. + Defaults to False. + reduction (str, optional): . Defaults to 'mean'. + Options are "none", "mean" and "sum". + num_classes (int): Number of classes to classify. + class_weight (list[float], optional): Weight of each class. + Defaults to None. + ignore_index (int | None): The label index to be ignored. + Defaults to None. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. 
+ avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + """ + super(CrossEntropyCustomLoss, self).__init__() + assert (use_sigmoid is False) or (use_mask is False) + self.use_sigmoid = use_sigmoid + self.use_mask = use_mask + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = class_weight + self.ignore_index = ignore_index + self.avg_non_ignore = avg_non_ignore + if ((ignore_index is not None) and not self.avg_non_ignore + and self.reduction == 'mean'): + warnings.warn( + 'Default ``avg_non_ignore`` is False, if you would like to ' + 'ignore the certain label and average loss over non-ignore ' + 'labels, which is the same with PyTorch official ' + 'cross_entropy, set ``avg_non_ignore=True``.') + + if self.use_sigmoid: + self.cls_criterion = binary_cross_entropy + elif self.use_mask: + self.cls_criterion = mask_cross_entropy + else: + self.cls_criterion = cross_entropy + + self.num_classes = num_classes + + assert self.num_classes != -1 + + # custom output channels of the classifier + self.custom_cls_channels = True + # custom activation of cls_score + self.custom_activation = True + # custom accuracy of the classsifier + self.custom_accuracy = True + + def get_cls_channels(self, num_classes): + assert num_classes == self.num_classes + if not self.use_sigmoid: + return num_classes + 1 + else: + return num_classes + + def get_activation(self, cls_score): + + fine_cls_score = cls_score[:, :self.num_classes] + + if not self.use_sigmoid: + bg_score = cls_score[:, [-1]] + new_score = torch.cat([fine_cls_score, bg_score], dim=-1) + scores = F.softmax(new_score, dim=-1) + else: + score_classes = fine_cls_score.sigmoid() + score_neg = 1 - score_classes.sum(dim=1, keepdim=True) + score_neg = score_neg.clamp(min=0, max=1) + scores = torch.cat([score_classes, score_neg], dim=1) + + return scores + + def get_accuracy(self, cls_score, labels): + + fine_cls_score = cls_score[:, :self.num_classes] + + pos_inds = labels < self.num_classes + acc_classes = accuracy(fine_cls_score[pos_inds], labels[pos_inds]) + acc = dict() + acc['acc_classes'] = acc_classes + return acc diff --git a/mmdetection/mmdet/models/losses/ddq_detr_aux_loss.py b/mmdetection/mmdet/models/losses/ddq_detr_aux_loss.py new file mode 100644 index 0000000..41f1c71 --- /dev/null +++ b/mmdetection/mmdet/models/losses/ddq_detr_aux_loss.py @@ -0,0 +1,303 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.structures import BaseDataElement + +from mmdet.models.utils import multi_apply +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import reduce_mean + + +class DDQAuxLoss(nn.Module): + """DDQ auxiliary branches loss for dense queries. + + Args: + loss_cls (dict): + Configuration of classification loss function. + loss_bbox (dict): + Configuration of bbox regression loss function. + train_cfg (dict): + Configuration of gt targets assigner for each predicted bbox. 
+ """ + + def __init__( + self, + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + activated=True, # use probability instead of logit as input + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + train_cfg=dict( + assigner=dict(type='TopkHungarianAssigner', topk=8), + alpha=1, + beta=6), + ): + super(DDQAuxLoss, self).__init__() + self.train_cfg = train_cfg + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + + sampler_cfg = dict(type='PseudoSampler') + self.sampler = TASK_UTILS.build(sampler_cfg) + + def loss_single(self, cls_score, bbox_pred, labels, label_weights, + bbox_targets, alignment_metrics): + """Calculate auxiliary branches loss for dense queries for one image. + + Args: + cls_score (Tensor): Predicted normalized classification + scores for one image, has shape (num_dense_queries, + cls_out_channels). + bbox_pred (Tensor): Predicted unnormalized bbox coordinates + for one image, has shape (num_dense_queries, 4) with the + last dimension arranged as (x1, y1, x2, y2). + labels (Tensor): Labels for one image. + label_weights (Tensor): Label weights for one image. + bbox_targets (Tensor): Bbox targets for one image. + alignment_metrics (Tensor): Normalized alignment metrics for one + image. + + Returns: + tuple: A tuple of loss components and loss weights. + """ + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + alignment_metrics = alignment_metrics.reshape(-1) + label_weights = label_weights.reshape(-1) + targets = (labels, alignment_metrics) + cls_loss_func = self.loss_cls + + loss_cls = cls_loss_func( + cls_score, targets, label_weights, avg_factor=1.0) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = cls_score.size(-1) + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + + pos_decode_bbox_pred = pos_bbox_pred + pos_decode_bbox_targets = pos_bbox_targets + + # regression loss + pos_bbox_weight = alignment_metrics[pos_inds] + + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=pos_bbox_weight, + avg_factor=1.0) + else: + loss_bbox = bbox_pred.sum() * 0 + pos_bbox_weight = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, alignment_metrics.sum( + ), pos_bbox_weight.sum() + + def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, + **kwargs): + """Calculate auxiliary branches loss for dense queries. + + Args: + cls_scores (Tensor): Predicted normalized classification + scores, has shape (bs, num_dense_queries, + cls_out_channels). + bbox_preds (Tensor): Predicted unnormalized bbox coordinates, + has shape (bs, num_dense_queries, 4) with the last + dimension arranged as (x1, y1, x2, y2). + gt_bboxes (list[Tensor]): List of unnormalized ground truth + bboxes for each image, each has shape (num_gt, 4) with the + last dimension arranged as (x1, y1, x2, y2). + NOTE: num_gt is dynamic for each image. + gt_labels (list[Tensor]): List of ground truth classification + index for each image, each has shape (num_gt,). + NOTE: num_gt is dynamic for each image. + img_metas (list[dict]): Meta information for one image, + e.g., image size, scaling factor, etc. + + Returns: + dict: A dictionary of loss components. 
+ """ + flatten_cls_scores = cls_scores + flatten_bbox_preds = bbox_preds + + cls_reg_targets = self.get_targets( + flatten_cls_scores, + flatten_bbox_preds, + gt_bboxes, + img_metas, + gt_labels_list=gt_labels, + ) + (labels_list, label_weights_list, bbox_targets_list, + alignment_metrics_list) = cls_reg_targets + + losses_cls, losses_bbox, \ + cls_avg_factors, bbox_avg_factors = multi_apply( + self.loss_single, + flatten_cls_scores, + flatten_bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + alignment_metrics_list, + ) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls)) + + bbox_avg_factor = reduce_mean( + sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict(aux_loss_cls=losses_cls, aux_loss_bbox=losses_bbox) + + def get_targets(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + img_metas, + gt_labels_list=None, + **kwargs): + """Compute regression and classification targets for a batch images. + + Args: + cls_scores (Tensor): Predicted normalized classification + scores, has shape (bs, num_dense_queries, + cls_out_channels). + bbox_preds (Tensor): Predicted unnormalized bbox coordinates, + has shape (bs, num_dense_queries, 4) with the last + dimension arranged as (x1, y1, x2, y2). + gt_bboxes_list (List[Tensor]): List of unnormalized ground truth + bboxes for each image, each has shape (num_gt, 4) with the + last dimension arranged as (x1, y1, x2, y2). + NOTE: num_gt is dynamic for each image. + img_metas (list[dict]): Meta information for one image, + e.g., image size, scaling factor, etc. + gt_labels_list (list[Tensor]): List of ground truth classification + index for each image, each has shape (num_gt,). + NOTE: num_gt is dynamic for each image. + Default: None. + + Returns: + tuple: a tuple containing the following targets. + + - all_labels (list[Tensor]): Labels for all images. + - all_label_weights (list[Tensor]): Label weights for all images. + - all_bbox_targets (list[Tensor]): Bbox targets for all images. + - all_assign_metrics (list[Tensor]): Normalized alignment metrics + for all images. + """ + (all_labels, all_label_weights, all_bbox_targets, + all_assign_metrics) = multi_apply(self._get_target_single, cls_scores, + bbox_preds, gt_bboxes_list, + gt_labels_list, img_metas) + + return (all_labels, all_label_weights, all_bbox_targets, + all_assign_metrics) + + def _get_target_single(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, + img_meta, **kwargs): + """Compute regression and classification targets for one image. + + Args: + cls_scores (Tensor): Predicted normalized classification + scores for one image, has shape (num_dense_queries, + cls_out_channels). + bbox_preds (Tensor): Predicted unnormalized bbox coordinates + for one image, has shape (num_dense_queries, 4) with the + last dimension arranged as (x1, y1, x2, y2). + gt_bboxes (Tensor): Unnormalized ground truth + bboxes for one image, has shape (num_gt, 4) with the + last dimension arranged as (x1, y1, x2, y2). + NOTE: num_gt is dynamic for each image. + gt_labels (Tensor): Ground truth classification + index for the image, has shape (num_gt,). + NOTE: num_gt is dynamic for each image. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels for one image. + - label_weights (Tensor): Label weights for one image. 
+ - bbox_targets (Tensor): Bbox targets for one image. + - norm_alignment_metrics (Tensor): Normalized alignment + metrics for one image. + """ + if len(gt_labels) == 0: + num_valid_anchors = len(cls_scores) + bbox_targets = torch.zeros_like(bbox_preds) + labels = bbox_preds.new_full((num_valid_anchors, ), + cls_scores.size(-1), + dtype=torch.long) + label_weights = bbox_preds.new_zeros( + num_valid_anchors, dtype=torch.float) + norm_alignment_metrics = bbox_preds.new_zeros( + num_valid_anchors, dtype=torch.float) + return (labels, label_weights, bbox_targets, + norm_alignment_metrics) + + assign_result = self.assigner.assign(cls_scores, bbox_preds, gt_bboxes, + gt_labels, img_meta) + assign_ious = assign_result.max_overlaps + assign_metrics = assign_result.assign_metrics + + pred_instances = BaseDataElement() + gt_instances = BaseDataElement() + + pred_instances.bboxes = bbox_preds + gt_instances.bboxes = gt_bboxes + + pred_instances.priors = cls_scores + gt_instances.labels = gt_labels + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = len(cls_scores) + bbox_targets = torch.zeros_like(bbox_preds) + labels = bbox_preds.new_full((num_valid_anchors, ), + cls_scores.size(-1), + dtype=torch.long) + label_weights = bbox_preds.new_zeros( + num_valid_anchors, dtype=torch.float) + norm_alignment_metrics = bbox_preds.new_zeros( + num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + # point-based + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + + if gt_labels is None: + # Only dense_heads gives gt_labels as None + # Foreground is the first class since v2.5.0 + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + + label_weights[pos_inds] = 1.0 + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + class_assigned_gt_inds = torch.unique( + sampling_result.pos_assigned_gt_inds) + for gt_inds in class_assigned_gt_inds: + gt_class_inds = sampling_result.pos_assigned_gt_inds == gt_inds + pos_alignment_metrics = assign_metrics[gt_class_inds] + pos_ious = assign_ious[gt_class_inds] + pos_norm_alignment_metrics = pos_alignment_metrics / ( + pos_alignment_metrics.max() + 10e-8) * pos_ious.max() + norm_alignment_metrics[ + pos_inds[gt_class_inds]] = pos_norm_alignment_metrics + + return (labels, label_weights, bbox_targets, norm_alignment_metrics) diff --git a/mmdetection/mmdet/models/losses/dice_loss.py b/mmdetection/mmdet/models/losses/dice_loss.py new file mode 100644 index 0000000..1d5cac1 --- /dev/null +++ b/mmdetection/mmdet/models/losses/dice_loss.py @@ -0,0 +1,146 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss + + +def dice_loss(pred, + target, + weight=None, + eps=1e-3, + reduction='mean', + naive_dice=False, + avg_factor=None): + """Calculate dice loss, there are two forms of dice loss is supported: + + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. + - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. 
+ weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + + input = pred.flatten(1) + target = target.flatten(1).float() + + a = torch.sum(input * target, 1) + if naive_dice: + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + else: + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@MODELS.register_module() +class DiceLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=False, + loss_weight=1.0, + eps=1e-3): + """Compute dice loss. + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + activate (bool): Whether to activate the predictions inside, + this will disable the inside sigmoid operation. + Defaults to True. + reduction (str, optional): The method used + to reduce the loss. Options are "none", + "mean" and "sum". Defaults to 'mean'. + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power. Defaults to False. + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + eps (float): Avoid dividing by zero. Defaults to 1e-3. + """ + + super(DiceLoss, self).__init__() + self.use_sigmoid = use_sigmoid + self.reduction = reduction + self.naive_dice = naive_dice + self.loss_weight = loss_weight + self.eps = eps + self.activate = activate + + def forward(self, + pred, + target, + weight=None, + reduction_override=None, + avg_factor=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *). + target (torch.Tensor): The label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". 
+ + Returns: + torch.Tensor: The calculated loss + """ + + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if self.activate: + if self.use_sigmoid: + pred = pred.sigmoid() + else: + raise NotImplementedError + + loss = self.loss_weight * dice_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + naive_dice=self.naive_dice, + avg_factor=avg_factor) + + return loss diff --git a/mmdetection/mmdet/models/losses/eqlv2_loss.py b/mmdetection/mmdet/models/losses/eqlv2_loss.py new file mode 100644 index 0000000..ea1f4a9 --- /dev/null +++ b/mmdetection/mmdet/models/losses/eqlv2_loss.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from functools import partial +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from mmengine.logging import print_log +from torch import Tensor + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class EQLV2Loss(nn.Module): + + def __init__(self, + use_sigmoid: bool = True, + reduction: str = 'mean', + class_weight: Optional[Tensor] = None, + loss_weight: float = 1.0, + num_classes: int = 1203, + use_distributed: bool = False, + mu: float = 0.8, + alpha: float = 4.0, + gamma: int = 12, + vis_grad: bool = False, + test_with_obj: bool = True) -> None: + """`Equalization Loss v2 `_ + + Args: + use_sigmoid (bool): EQLv2 uses the sigmoid function to transform + the predicted logits to an estimated probability distribution. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + class_weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + loss_weight (float, optional): The weight of the total EQLv2 loss. + Defaults to 1.0. + num_classes (int): 1203 for lvis v1.0, 1230 for lvis v0.5. + use_distributed (bool, float): EQLv2 will calculate the gradients + on all GPUs if there is any. Change to True if you are using + distributed training. Default to False. + mu (float, optional): Defaults to 0.8 + alpha (float, optional): A balance factor for the negative part of + EQLV2 Loss. Defaults to 4.0. + gamma (int, optional): The gamma for calculating the modulating + factor. Defaults to 12. + vis_grad (bool, optional): Default to False. + test_with_obj (bool, optional): Default to True. + + Returns: + None. + """ + super().__init__() + self.use_sigmoid = True + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = class_weight + self.num_classes = num_classes + self.group = True + + # cfg for eqlv2 + self.vis_grad = vis_grad + self.mu = mu + self.alpha = alpha + self.gamma = gamma + self.use_distributed = use_distributed + + # initial variables + self.register_buffer('pos_grad', torch.zeros(self.num_classes)) + self.register_buffer('neg_grad', torch.zeros(self.num_classes)) + # At the beginning of training, we set a high value (eg. 100) + # for the initial gradient ratio so that the weight for pos + # gradients and neg gradients are 1. 
+ self.register_buffer('pos_neg', torch.ones(self.num_classes) * 100) + + self.test_with_obj = test_with_obj + + def _func(x, gamma, mu): + return 1 / (1 + torch.exp(-gamma * (x - mu))) + + self.map_func = partial(_func, gamma=self.gamma, mu=self.mu) + + print_log( + f'build EQL v2, gamma: {gamma}, mu: {mu}, alpha: {alpha}', + logger='current', + level=logging.DEBUG) + + def forward(self, + cls_score: Tensor, + label: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[Tensor] = None) -> Tensor: + """`Equalization Loss v2 `_ + + Args: + cls_score (Tensor): The prediction with shape (N, C), C is the + number of classes. + label (Tensor): The ground truth label of the predicted target with + shape (N, C), C is the number of classes. + weight (Tensor, optional): The weight of loss for each prediction. + Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + Tensor: The calculated loss + """ + self.n_i, self.n_c = cls_score.size() + self.gt_classes = label + self.pred_class_logits = cls_score + + def expand_label(pred, gt_classes): + target = pred.new_zeros(self.n_i, self.n_c) + target[torch.arange(self.n_i), gt_classes] = 1 + return target + + target = expand_label(cls_score, label) + + pos_w, neg_w = self.get_weight(cls_score) + + weight = pos_w * target + neg_w * (1 - target) + + cls_loss = F.binary_cross_entropy_with_logits( + cls_score, target, reduction='none') + cls_loss = torch.sum(cls_loss * weight) / self.n_i + + self.collect_grad(cls_score.detach(), target.detach(), weight.detach()) + + return self.loss_weight * cls_loss + + def get_channel_num(self, num_classes): + num_channel = num_classes + 1 + return num_channel + + def get_activation(self, pred): + pred = torch.sigmoid(pred) + n_i, n_c = pred.size() + bg_score = pred[:, -1].view(n_i, 1) + if self.test_with_obj: + pred[:, :-1] *= (1 - bg_score) + return pred + + def collect_grad(self, pred, target, weight): + prob = torch.sigmoid(pred) + grad = target * (prob - 1) + (1 - target) * prob + grad = torch.abs(grad) + + # do not collect grad for objectiveness branch [:-1] + pos_grad = torch.sum(grad * target * weight, dim=0)[:-1] + neg_grad = torch.sum(grad * (1 - target) * weight, dim=0)[:-1] + + if self.use_distributed: + dist.all_reduce(pos_grad) + dist.all_reduce(neg_grad) + + self.pos_grad += pos_grad + self.neg_grad += neg_grad + self.pos_neg = self.pos_grad / (self.neg_grad + 1e-10) + + def get_weight(self, pred): + neg_w = torch.cat([self.map_func(self.pos_neg), pred.new_ones(1)]) + pos_w = 1 + self.alpha * (1 - neg_w) + neg_w = neg_w.view(1, -1).expand(self.n_i, self.n_c) + pos_w = pos_w.view(1, -1).expand(self.n_i, self.n_c) + return pos_w, neg_w diff --git a/mmdetection/mmdet/models/losses/focal_loss.py b/mmdetection/mmdet/models/losses/focal_loss.py new file mode 100644 index 0000000..15bef29 --- /dev/null +++ b/mmdetection/mmdet/models/losses/focal_loss.py @@ -0,0 +1,371 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
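Note: `py_sigmoid_focal_loss` defined below in this file is the pure-PyTorch
reference path kept for debugging, while `sigmoid_focal_loss` wraps the CUDA
op from `mmcv.ops`. A minimal sketch of the debug path follows, assuming the
vendored `mmdet` package is importable; the tensor values are made up purely
for illustration.

    import torch
    from mmdet.models.losses.focal_loss import py_sigmoid_focal_loss

    # Two samples, three classes; targets are binary and share pred's shape.
    pred = torch.tensor([[2.0, -1.0, 0.3],
                         [0.2, 0.5, -0.7]])
    target = torch.tensor([[1., 0., 0.],
                           [0., 1., 0.]])
    # BCE-with-logits scaled by the focal weight
    # (alpha * t + (1 - alpha) * (1 - t)) * pt ** gamma, then averaged.
    loss = py_sigmoid_focal_loss(pred, target, gamma=2.0, alpha=0.25,
                                 reduction='mean')
    print(loss)  # 0-dim tensor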
+import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss + +from mmdet.registry import MODELS +from .accuracy import accuracy +from .utils import weight_reduce_loss + + +# This method is only for debugging +def py_sigmoid_focal_loss(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + """PyTorch version of `Focal Loss `_. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the + number of classes + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + # Actually, pt here denotes (1 - pt) in the Focal Loss paper + pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) + # Thus it's pt.pow(gamma) rather than (1 - pt).pow(gamma) + focal_weight = (alpha * target + (1 - alpha) * + (1 - target)) * pt.pow(gamma) + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + if weight is not None: + if weight.shape != loss.shape: + if weight.size(0) == loss.size(0): + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). + assert weight.numel() == loss.numel() + weight = weight.view(loss.size(0), -1) + assert weight.ndim == loss.ndim + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +def py_focal_loss_with_prob(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + """PyTorch version of `Focal Loss `_. + Different from `py_sigmoid_focal_loss`, this function accepts probability + as input. + + Args: + pred (torch.Tensor): The prediction probability with shape (N, C), + C is the number of classes. + target (torch.Tensor): The learning label of the prediction. + The target shape support (N,C) or (N,), (N,C) means one-hot form. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
+ """ + if pred.dim() != target.dim(): + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + + target = target.type_as(pred) + pt = (1 - pred) * target + pred * (1 - target) + focal_weight = (alpha * target + (1 - alpha) * + (1 - target)) * pt.pow(gamma) + loss = F.binary_cross_entropy( + pred, target, reduction='none') * focal_weight + if weight is not None: + if weight.shape != loss.shape: + if weight.size(0) == loss.size(0): + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). + assert weight.numel() == loss.numel() + weight = weight.view(loss.size(0), -1) + assert weight.ndim == loss.ndim + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +def sigmoid_focal_loss(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + r"""A wrapper of cuda version `Focal Loss + `_. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + # Function.apply does not accept keyword arguments, so the decorator + # "weighted_loss" is not applicable + loss = _sigmoid_focal_loss(pred.contiguous(), target.contiguous(), gamma, + alpha, None, 'none') + if weight is not None: + if weight.shape != loss.shape: + if weight.size(0) == loss.size(0): + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). + assert weight.numel() == loss.numel() + weight = weight.view(loss.size(0), -1) + assert weight.ndim == loss.ndim + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@MODELS.register_module() +class FocalLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0, + activated=False): + """`Focal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + activated (bool, optional): Whether the input is activated. 
+ If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + super(FocalLoss, self).__init__() + assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' + self.use_sigmoid = use_sigmoid + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + self.activated = activated + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning label of the prediction. + The target shape support (N,C) or (N,), (N,C) means + one-hot form. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + if self.activated: + calculate_loss_func = py_focal_loss_with_prob + else: + if pred.dim() == target.dim(): + # this means that target is already in One-Hot form. + calculate_loss_func = py_sigmoid_focal_loss + elif torch.cuda.is_available() and pred.is_cuda: + calculate_loss_func = sigmoid_focal_loss + else: + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + calculate_loss_func = py_sigmoid_focal_loss + + loss_cls = self.loss_weight * calculate_loss_func( + pred, + target, + weight, + gamma=self.gamma, + alpha=self.alpha, + reduction=reduction, + avg_factor=avg_factor) + + else: + raise NotImplementedError + return loss_cls + + +@MODELS.register_module() +class FocalCustomLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + num_classes=-1, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0, + activated=False): + """`Focal Loss for V3Det `_ + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + num_classes (int): Number of classes to classify. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + activated (bool, optional): Whether the input is activated. + If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + super(FocalCustomLoss, self).__init__() + assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' 
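+        # This V3Det variant customizes the classifier interface: the head
+        # outputs num_classes sigmoid channels and accuracy is computed on
+        # foreground labels only (see the custom_* flags and helpers below).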
+ self.use_sigmoid = use_sigmoid + self.num_classes = num_classes + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + self.activated = activated + + assert self.num_classes != -1 + + # custom output channels of the classifier + self.custom_cls_channels = True + # custom activation of cls_score + self.custom_activation = True + # custom accuracy of the classsifier + self.custom_accuracy = True + + def get_cls_channels(self, num_classes): + assert num_classes == self.num_classes + return num_classes + + def get_activation(self, cls_score): + + fine_cls_score = cls_score[:, :self.num_classes] + + score_classes = fine_cls_score.sigmoid() + + return score_classes + + def get_accuracy(self, cls_score, labels): + + fine_cls_score = cls_score[:, :self.num_classes] + + pos_inds = labels < self.num_classes + acc_classes = accuracy(fine_cls_score[pos_inds], labels[pos_inds]) + acc = dict() + acc['acc_classes'] = acc_classes + return acc + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + calculate_loss_func = py_sigmoid_focal_loss + + loss_cls = self.loss_weight * calculate_loss_func( + pred, + target, + weight, + gamma=self.gamma, + alpha=self.alpha, + reduction=reduction, + avg_factor=avg_factor) + + else: + raise NotImplementedError + return loss_cls diff --git a/mmdetection/mmdet/models/losses/gaussian_focal_loss.py b/mmdetection/mmdet/models/losses/gaussian_focal_loss.py new file mode 100644 index 0000000..14fa8da --- /dev/null +++ b/mmdetection/mmdet/models/losses/gaussian_focal_loss.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss, weighted_loss + + +@weighted_loss +def gaussian_focal_loss(pred: Tensor, + gaussian_target: Tensor, + alpha: float = 2.0, + gamma: float = 4.0, + pos_weight: float = 1.0, + neg_weight: float = 1.0) -> Tensor: + """`Focal Loss `_ for targets in gaussian + distribution. + + Args: + pred (torch.Tensor): The prediction. + gaussian_target (torch.Tensor): The learning target of the prediction + in gaussian distribution. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 2.0. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 4.0. + pos_weight(float): Positive sample loss weight. Defaults to 1.0. + neg_weight(float): Negative sample loss weight. Defaults to 1.0. 
+ """ + eps = 1e-12 + pos_weights = gaussian_target.eq(1) + neg_weights = (1 - gaussian_target).pow(gamma) + pos_loss = -(pred + eps).log() * (1 - pred).pow(alpha) * pos_weights + neg_loss = -(1 - pred + eps).log() * pred.pow(alpha) * neg_weights + return pos_weight * pos_loss + neg_weight * neg_loss + + +def gaussian_focal_loss_with_pos_inds( + pred: Tensor, + gaussian_target: Tensor, + pos_inds: Tensor, + pos_labels: Tensor, + alpha: float = 2.0, + gamma: float = 4.0, + pos_weight: float = 1.0, + neg_weight: float = 1.0, + reduction: str = 'mean', + avg_factor: Optional[Union[int, float]] = None) -> Tensor: + """`Focal Loss `_ for targets in gaussian + distribution. + + Note: The index with a value of 1 in ``gaussian_target`` in the + ``gaussian_focal_loss`` function is a positive sample, but in + ``gaussian_focal_loss_with_pos_inds`` the positive sample is passed + in through the ``pos_inds`` parameter. + + Args: + pred (torch.Tensor): The prediction. The shape is (N, num_classes). + gaussian_target (torch.Tensor): The learning target of the prediction + in gaussian distribution. The shape is (N, num_classes). + pos_inds (torch.Tensor): The positive sample index. + The shape is (M, ). + pos_labels (torch.Tensor): The label corresponding to the positive + sample index. The shape is (M, ). + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 2.0. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 4.0. + pos_weight(float): Positive sample loss weight. Defaults to 1.0. + neg_weight(float): Negative sample loss weight. Defaults to 1.0. + reduction (str): Options are "none", "mean" and "sum". + Defaults to 'mean`. + avg_factor (int, float, optional): Average factor that is used to + average the loss. Defaults to None. + """ + eps = 1e-12 + neg_weights = (1 - gaussian_target).pow(gamma) + + pos_pred_pix = pred[pos_inds] + pos_pred = pos_pred_pix.gather(1, pos_labels.unsqueeze(1)) + pos_loss = -(pos_pred + eps).log() * (1 - pos_pred).pow(alpha) + pos_loss = weight_reduce_loss(pos_loss, None, reduction, avg_factor) + + neg_loss = -(1 - pred + eps).log() * pred.pow(alpha) * neg_weights + neg_loss = weight_reduce_loss(neg_loss, None, reduction, avg_factor) + + return pos_weight * pos_loss + neg_weight * neg_loss + + +@MODELS.register_module() +class GaussianFocalLoss(nn.Module): + """GaussianFocalLoss is a variant of focal loss. + + More details can be found in the `paper + `_ + Code is modified from `kp_utils.py + `_ # noqa: E501 + Please notice that the target in GaussianFocalLoss is a gaussian heatmap, + not 0/1 binary target. + + Args: + alpha (float): Power of prediction. + gamma (float): Power of target for negative samples. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Loss weight of current loss. + pos_weight(float): Positive sample loss weight. Defaults to 1.0. + neg_weight(float): Negative sample loss weight. Defaults to 1.0. 
+ """ + + def __init__(self, + alpha: float = 2.0, + gamma: float = 4.0, + reduction: str = 'mean', + loss_weight: float = 1.0, + pos_weight: float = 1.0, + neg_weight: float = 1.0) -> None: + super().__init__() + self.alpha = alpha + self.gamma = gamma + self.reduction = reduction + self.loss_weight = loss_weight + self.pos_weight = pos_weight + self.neg_weight = neg_weight + + def forward(self, + pred: Tensor, + target: Tensor, + pos_inds: Optional[Tensor] = None, + pos_labels: Optional[Tensor] = None, + weight: Optional[Tensor] = None, + avg_factor: Optional[Union[int, float]] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + If you want to manually determine which positions are + positive samples, you can set the pos_index and pos_label + parameter. Currently, only the CenterNet update version uses + the parameter. + + Args: + pred (torch.Tensor): The prediction. The shape is (N, num_classes). + target (torch.Tensor): The learning target of the prediction + in gaussian distribution. The shape is (N, num_classes). + pos_inds (torch.Tensor): The positive sample index. + Defaults to None. + pos_labels (torch.Tensor): The label corresponding to the positive + sample index. Defaults to None. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, float, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if pos_inds is not None: + assert pos_labels is not None + # Only used by centernet update version + loss_reg = self.loss_weight * gaussian_focal_loss_with_pos_inds( + pred, + target, + pos_inds, + pos_labels, + alpha=self.alpha, + gamma=self.gamma, + pos_weight=self.pos_weight, + neg_weight=self.neg_weight, + reduction=reduction, + avg_factor=avg_factor) + else: + loss_reg = self.loss_weight * gaussian_focal_loss( + pred, + target, + weight, + alpha=self.alpha, + gamma=self.gamma, + pos_weight=self.pos_weight, + neg_weight=self.neg_weight, + reduction=reduction, + avg_factor=avg_factor) + return loss_reg diff --git a/mmdetection/mmdet/models/losses/gfocal_loss.py b/mmdetection/mmdet/models/losses/gfocal_loss.py new file mode 100644 index 0000000..b3a1172 --- /dev/null +++ b/mmdetection/mmdet/models/losses/gfocal_loss.py @@ -0,0 +1,295 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.models.losses.utils import weighted_loss +from mmdet.registry import MODELS + + +@weighted_loss +def quality_focal_loss(pred, target, beta=2.0): + r"""Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + + Args: + pred (torch.Tensor): Predicted joint representation of classification + and quality (IoU) estimation with shape (N, C), C is the number of + classes. + target (tuple([torch.Tensor])): Target category label with shape (N,) + and target quality label with shape (N,). + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + + Returns: + torch.Tensor: Loss tensor with shape (N,). 
+ """ + assert len(target) == 2, """target for QFL must be a tuple of two elements, + including category label and quality label, respectively""" + # label denotes the category id, score denotes the quality score + label, score = target + + # negatives are supervised by 0 quality score + pred_sigmoid = pred.sigmoid() + scale_factor = pred_sigmoid + zerolabel = scale_factor.new_zeros(pred.shape) + loss = F.binary_cross_entropy_with_logits( + pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = pred.size(1) + pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1) + pos_label = label[pos].long() + # positives are supervised by bbox quality (IoU) score + scale_factor = score[pos] - pred_sigmoid[pos, pos_label] + loss[pos, pos_label] = F.binary_cross_entropy_with_logits( + pred[pos, pos_label], score[pos], + reduction='none') * scale_factor.abs().pow(beta) + + loss = loss.sum(dim=1, keepdim=False) + return loss + + +@weighted_loss +def quality_focal_loss_tensor_target(pred, target, beta=2.0, activated=False): + """`QualityFocal Loss `_ + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the + number of classes + target (torch.Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is the number of classes. + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + activated (bool): Whether the input is activated. + If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + # pred and target should be of the same size + assert pred.size() == target.size() + if activated: + pred_sigmoid = pred + loss_function = F.binary_cross_entropy + else: + pred_sigmoid = pred.sigmoid() + loss_function = F.binary_cross_entropy_with_logits + + scale_factor = pred_sigmoid + target = target.type_as(pred) + + zerolabel = scale_factor.new_zeros(pred.shape) + loss = loss_function( + pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + pos = (target != 0) + scale_factor = target[pos] - pred_sigmoid[pos] + loss[pos] = loss_function( + pred[pos], target[pos], + reduction='none') * scale_factor.abs().pow(beta) + + loss = loss.sum(dim=1, keepdim=False) + return loss + + +@weighted_loss +def quality_focal_loss_with_prob(pred, target, beta=2.0): + r"""Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + Different from `quality_focal_loss`, this function accepts probability + as input. + + Args: + pred (torch.Tensor): Predicted joint representation of classification + and quality (IoU) estimation with shape (N, C), C is the number of + classes. + target (tuple([torch.Tensor])): Target category label with shape (N,) + and target quality label with shape (N,). + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + + Returns: + torch.Tensor: Loss tensor with shape (N,). 
+ """ + assert len(target) == 2, """target for QFL must be a tuple of two elements, + including category label and quality label, respectively""" + # label denotes the category id, score denotes the quality score + label, score = target + + # negatives are supervised by 0 quality score + pred_sigmoid = pred + scale_factor = pred_sigmoid + zerolabel = scale_factor.new_zeros(pred.shape) + loss = F.binary_cross_entropy( + pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = pred.size(1) + pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1) + pos_label = label[pos].long() + # positives are supervised by bbox quality (IoU) score + scale_factor = score[pos] - pred_sigmoid[pos, pos_label] + loss[pos, pos_label] = F.binary_cross_entropy( + pred[pos, pos_label], score[pos], + reduction='none') * scale_factor.abs().pow(beta) + + loss = loss.sum(dim=1, keepdim=False) + return loss + + +@weighted_loss +def distribution_focal_loss(pred, label): + r"""Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + + Args: + pred (torch.Tensor): Predicted general distribution of bounding boxes + (before softmax) with shape (N, n+1), n is the max value of the + integral set `{0, ..., n}` in paper. + label (torch.Tensor): Target distance label for bounding boxes with + shape (N,). + + Returns: + torch.Tensor: Loss tensor with shape (N,). + """ + dis_left = label.long() + dis_right = dis_left + 1 + weight_left = dis_right.float() - label + weight_right = label - dis_left.float() + loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \ + + F.cross_entropy(pred, dis_right, reduction='none') * weight_right + return loss + + +@MODELS.register_module() +class QualityFocalLoss(nn.Module): + r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss: + Learning Qualified and Distributed Bounding Boxes for Dense Object + Detection `_. + + Args: + use_sigmoid (bool): Whether sigmoid operation is conducted in QFL. + Defaults to True. + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Loss weight of current loss. + activated (bool, optional): Whether the input is activated. + If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + + def __init__(self, + use_sigmoid=True, + beta=2.0, + reduction='mean', + loss_weight=1.0, + activated=False): + super(QualityFocalLoss, self).__init__() + assert use_sigmoid is True, 'Only sigmoid in QFL supported now.' + self.use_sigmoid = use_sigmoid + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + self.activated = activated + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): Predicted joint representation of + classification and quality (IoU) estimation with shape (N, C), + C is the number of classes. + target (Union(tuple([torch.Tensor]),Torch.Tensor)): The type is + tuple, it should be included Target category label with + shape (N,) and target quality label with shape (N,).The type + is torch.Tensor, the target should be one-hot form with + soft weights. 
+ weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + if self.activated: + calculate_loss_func = quality_focal_loss_with_prob + else: + calculate_loss_func = quality_focal_loss + if isinstance(target, torch.Tensor): + # the target shape with (N,C) or (N,C,...), which means + # the target is one-hot form with soft weights. + calculate_loss_func = partial( + quality_focal_loss_tensor_target, activated=self.activated) + + loss_cls = self.loss_weight * calculate_loss_func( + pred, + target, + weight, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor) + else: + raise NotImplementedError + return loss_cls + + +@MODELS.register_module() +class DistributionFocalLoss(nn.Module): + r"""Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss: + Learning Qualified and Distributed Bounding Boxes for Dense Object + Detection `_. + + Args: + reduction (str): Options are `'none'`, `'mean'` and `'sum'`. + loss_weight (float): Loss weight of current loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(DistributionFocalLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): Predicted general distribution of bounding + boxes (before softmax) with shape (N, n+1), n is the max value + of the integral set `{0, ..., n}` in paper. + target (torch.Tensor): Target distance label for bounding boxes + with shape (N,). + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_cls = self.loss_weight * distribution_focal_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_cls diff --git a/mmdetection/mmdet/models/losses/ghm_loss.py b/mmdetection/mmdet/models/losses/ghm_loss.py new file mode 100644 index 0000000..a874c00 --- /dev/null +++ b/mmdetection/mmdet/models/losses/ghm_loss.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
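Before the GHM losses that follow, a short numeric sketch of distribution_focal_loss defined above: a continuous regression target is supervised by cross-entropy on the two integer bins that bracket it, weighted by its distance to each bin. The logits and the target value below are made up for illustration.

import torch
import torch.nn.functional as F

# One box edge predicted as a distribution over bins {0, ..., 4} (n = 4),
# so the logits have n + 1 = 5 entries; the continuous target is 2.3.
pred_logits = torch.randn(1, 5)
label = torch.tensor([2.3])

dis_left = label.long()                  # bin 2
dis_right = dis_left + 1                 # bin 3
weight_left = dis_right.float() - label  # 0.7 -> the closer bin gets more weight
weight_right = label - dis_left.float()  # 0.3

loss = (F.cross_entropy(pred_logits, dis_left, reduction='none') * weight_left +
        F.cross_entropy(pred_logits, dis_right, reduction='none') * weight_right)
print(loss)  # shape (1,); matches the per-sample loss inside distribution_focal_loss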
+import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss + + +def _expand_onehot_labels(labels, label_weights, label_channels): + bin_labels = labels.new_full((labels.size(0), label_channels), 0) + inds = torch.nonzero( + (labels >= 0) & (labels < label_channels), as_tuple=False).squeeze() + if inds.numel() > 0: + bin_labels[inds, labels[inds]] = 1 + bin_label_weights = label_weights.view(-1, 1).expand( + label_weights.size(0), label_channels) + return bin_labels, bin_label_weights + + +# TODO: code refactoring to make it consistent with other losses +@MODELS.register_module() +class GHMC(nn.Module): + """GHM Classification Loss. + + Details of the theorem can be viewed in the paper + `Gradient Harmonized Single-stage Detector + `_. + + Args: + bins (int): Number of the unit regions for distribution calculation. + momentum (float): The parameter for moving average. + use_sigmoid (bool): Can only be true for BCE based loss now. + loss_weight (float): The weight of the total GHM-C loss. + reduction (str): Options are "none", "mean" and "sum". + Defaults to "mean" + """ + + def __init__(self, + bins=10, + momentum=0, + use_sigmoid=True, + loss_weight=1.0, + reduction='mean'): + super(GHMC, self).__init__() + self.bins = bins + self.momentum = momentum + edges = torch.arange(bins + 1).float() / bins + self.register_buffer('edges', edges) + self.edges[-1] += 1e-6 + if momentum > 0: + acc_sum = torch.zeros(bins) + self.register_buffer('acc_sum', acc_sum) + self.use_sigmoid = use_sigmoid + if not self.use_sigmoid: + raise NotImplementedError + self.loss_weight = loss_weight + self.reduction = reduction + + def forward(self, + pred, + target, + label_weight, + reduction_override=None, + **kwargs): + """Calculate the GHM-C loss. + + Args: + pred (float tensor of size [batch_num, class_num]): + The direct prediction of classification fc layer. + target (float tensor of size [batch_num, class_num]): + Binary class target for each sample. + label_weight (float tensor of size [batch_num, class_num]): + the value is 1 if the sample is valid and 0 if ignored. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + Returns: + The gradient harmonized loss. 
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # the target should be binary class label + if pred.dim() != target.dim(): + target, label_weight = _expand_onehot_labels( + target, label_weight, pred.size(-1)) + target, label_weight = target.float(), label_weight.float() + edges = self.edges + mmt = self.momentum + weights = torch.zeros_like(pred) + + # gradient length + g = torch.abs(pred.sigmoid().detach() - target) + + valid = label_weight > 0 + tot = max(valid.float().sum().item(), 1.0) + n = 0 # n valid bins + for i in range(self.bins): + inds = (g >= edges[i]) & (g < edges[i + 1]) & valid + num_in_bin = inds.sum().item() + if num_in_bin > 0: + if mmt > 0: + self.acc_sum[i] = mmt * self.acc_sum[i] \ + + (1 - mmt) * num_in_bin + weights[inds] = tot / self.acc_sum[i] + else: + weights[inds] = tot / num_in_bin + n += 1 + if n > 0: + weights = weights / n + + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') + loss = weight_reduce_loss( + loss, weights, reduction=reduction, avg_factor=tot) + return loss * self.loss_weight + + +# TODO: code refactoring to make it consistent with other losses +@MODELS.register_module() +class GHMR(nn.Module): + """GHM Regression Loss. + + Details of the theorem can be viewed in the paper + `Gradient Harmonized Single-stage Detector + `_. + + Args: + mu (float): The parameter for the Authentic Smooth L1 loss. + bins (int): Number of the unit regions for distribution calculation. + momentum (float): The parameter for moving average. + loss_weight (float): The weight of the total GHM-R loss. + reduction (str): Options are "none", "mean" and "sum". + Defaults to "mean" + """ + + def __init__(self, + mu=0.02, + bins=10, + momentum=0, + loss_weight=1.0, + reduction='mean'): + super(GHMR, self).__init__() + self.mu = mu + self.bins = bins + edges = torch.arange(bins + 1).float() / bins + self.register_buffer('edges', edges) + self.edges[-1] = 1e3 + self.momentum = momentum + if momentum > 0: + acc_sum = torch.zeros(bins) + self.register_buffer('acc_sum', acc_sum) + self.loss_weight = loss_weight + self.reduction = reduction + + # TODO: support reduction parameter + def forward(self, + pred, + target, + label_weight, + avg_factor=None, + reduction_override=None): + """Calculate the GHM-R loss. + + Args: + pred (float tensor of size [batch_num, 4 (* class_num)]): + The prediction of box regression layer. Channel number can be 4 + or 4 * class_num depending on whether it is class-agnostic. + target (float tensor of size [batch_num, 4 (* class_num)]): + The target regression values with the same size of pred. + label_weight (float tensor of size [batch_num, 4 (* class_num)]): + The weight of each sample, 0 if ignored. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + Returns: + The gradient harmonized loss. 
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + mu = self.mu + edges = self.edges + mmt = self.momentum + + # ASL1 loss + diff = pred - target + loss = torch.sqrt(diff * diff + mu * mu) - mu + + # gradient length + g = torch.abs(diff / torch.sqrt(mu * mu + diff * diff)).detach() + weights = torch.zeros_like(g) + + valid = label_weight > 0 + tot = max(label_weight.float().sum().item(), 1.0) + n = 0 # n: valid bins + for i in range(self.bins): + inds = (g >= edges[i]) & (g < edges[i + 1]) & valid + num_in_bin = inds.sum().item() + if num_in_bin > 0: + n += 1 + if mmt > 0: + self.acc_sum[i] = mmt * self.acc_sum[i] \ + + (1 - mmt) * num_in_bin + weights[inds] = tot / self.acc_sum[i] + else: + weights[inds] = tot / num_in_bin + if n > 0: + weights /= n + loss = weight_reduce_loss( + loss, weights, reduction=reduction, avg_factor=tot) + return loss * self.loss_weight diff --git a/mmdetection/mmdet/models/losses/iou_loss.py b/mmdetection/mmdet/models/losses/iou_loss.py new file mode 100644 index 0000000..c8a2b97 --- /dev/null +++ b/mmdetection/mmdet/models/losses/iou_loss.py @@ -0,0 +1,926 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings +from typing import Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps +from .utils import weighted_loss + + +@weighted_loss +def iou_loss(pred: Tensor, + target: Tensor, + linear: bool = False, + mode: str = 'log', + eps: float = 1e-6) -> Tensor: + """IoU loss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + The loss is calculated as negative log of IoU. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + linear (bool, optional): If True, use linear scale of loss instead of + log scale. Default: False. + mode (str): Loss scaling mode, including "linear", "square", and "log". + Default: 'log' + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + assert mode in ['linear', 'square', 'log'] + if linear: + mode = 'linear' + warnings.warn('DeprecationWarning: Setting "linear=True" in ' + 'iou_loss is deprecated, please use "mode=`linear`" ' + 'instead.') + # avoid fp16 overflow + if pred.dtype == torch.float16: + fp16 = True + pred = pred.to(torch.float32) + else: + fp16 = False + + ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps) + + if fp16: + ious = ious.to(torch.float16) + + if mode == 'linear': + loss = 1 - ious + elif mode == 'square': + loss = 1 - ious**2 + elif mode == 'log': + loss = -ious.log() + else: + raise NotImplementedError + return loss + + +@weighted_loss +def bounded_iou_loss(pred: Tensor, + target: Tensor, + beta: float = 0.2, + eps: float = 1e-3) -> Tensor: + """BIoULoss. + + This is an implementation of paper + `Improving Object Localization with Fitness NMS and Bounded IoU Loss. + `_. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + beta (float, optional): Beta parameter in smoothl1. + eps (float, optional): Epsilon to avoid NaN values. + + Return: + Tensor: Loss tensor. 
+ """ + pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5 + pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5 + pred_w = pred[:, 2] - pred[:, 0] + pred_h = pred[:, 3] - pred[:, 1] + with torch.no_grad(): + target_ctrx = (target[:, 0] + target[:, 2]) * 0.5 + target_ctry = (target[:, 1] + target[:, 3]) * 0.5 + target_w = target[:, 2] - target[:, 0] + target_h = target[:, 3] - target[:, 1] + + dx = target_ctrx - pred_ctrx + dy = target_ctry - pred_ctry + + loss_dx = 1 - torch.max( + (target_w - 2 * dx.abs()) / + (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx)) + loss_dy = 1 - torch.max( + (target_h - 2 * dy.abs()) / + (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy)) + loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w / + (target_w + eps)) + loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h / + (target_h + eps)) + # view(..., -1) does not work for empty tensor + loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh], + dim=-1).flatten(1) + + loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta, + loss_comb - 0.5 * beta) + return loss + + +@weighted_loss +def giou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor: + r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding + Box Regression `_. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + # avoid fp16 overflow + if pred.dtype == torch.float16: + fp16 = True + pred = pred.to(torch.float32) + else: + fp16 = False + + gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps) + + if fp16: + gious = gious.to(torch.float16) + + loss = 1 - gious + return loss + + +@weighted_loss +def diou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor: + r"""Implementation of `Distance-IoU Loss: Faster and Better + Learning for Bounding Box Regression https://arxiv.org/abs/1911.08287`_. + + Code is modified from https://github.com/Zzh-tju/DIoU. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. 
+ """ + # overlap + lt = torch.max(pred[:, :2], target[:, :2]) + rb = torch.min(pred[:, 2:], target[:, 2:]) + wh = (rb - lt).clamp(min=0) + overlap = wh[:, 0] * wh[:, 1] + + # union + ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) + ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) + union = ap + ag - overlap + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + cw = enclose_wh[:, 0] + ch = enclose_wh[:, 1] + + c2 = cw**2 + ch**2 + eps + + b1_x1, b1_y1 = pred[:, 0], pred[:, 1] + b1_x2, b1_y2 = pred[:, 2], pred[:, 3] + b2_x1, b2_y1 = target[:, 0], target[:, 1] + b2_x2, b2_y2 = target[:, 2], target[:, 3] + + left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 + right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 + rho2 = left + right + + # DIoU + dious = ious - rho2 / c2 + loss = 1 - dious + return loss + + +@weighted_loss +def ciou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor: + r"""`Implementation of paper `Enhancing Geometric Factors into + Model Learning and Inference for Object Detection and Instance + Segmentation `_. + + Code is modified from https://github.com/Zzh-tju/CIoU. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + # overlap + lt = torch.max(pred[:, :2], target[:, :2]) + rb = torch.min(pred[:, 2:], target[:, 2:]) + wh = (rb - lt).clamp(min=0) + overlap = wh[:, 0] * wh[:, 1] + + # union + ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) + ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) + union = ap + ag - overlap + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + cw = enclose_wh[:, 0] + ch = enclose_wh[:, 1] + + c2 = cw**2 + ch**2 + eps + + b1_x1, b1_y1 = pred[:, 0], pred[:, 1] + b1_x2, b1_y2 = pred[:, 2], pred[:, 3] + b2_x1, b2_y1 = target[:, 0], target[:, 1] + b2_x2, b2_y2 = target[:, 2], target[:, 3] + + w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps + w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps + + left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 + right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 + rho2 = left + right + + factor = 4 / math.pi**2 + v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + + with torch.no_grad(): + alpha = (ious > 0.5).float() * v / (1 - ious + v) + + # CIoU + cious = ious - (rho2 / c2 + alpha * v) + loss = 1 - cious.clamp(min=-1.0, max=1.0) + return loss + + +@weighted_loss +def eiou_loss(pred: Tensor, + target: Tensor, + smooth_point: float = 0.1, + eps: float = 1e-7) -> Tensor: + r"""Implementation of paper `Extended-IoU Loss: A Systematic + IoU-Related Method: Beyond Simplified Regression for Better + Localization `_ + + Code is modified from https://github.com//ShiqiYu/libfacedetection.train. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + smooth_point (float): hyperparameter, default is 0.1. + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. 
+ """ + px1, py1, px2, py2 = pred[:, 0], pred[:, 1], pred[:, 2], pred[:, 3] + tx1, ty1, tx2, ty2 = target[:, 0], target[:, 1], target[:, 2], target[:, 3] + + # extent top left + ex1 = torch.min(px1, tx1) + ey1 = torch.min(py1, ty1) + + # intersection coordinates + ix1 = torch.max(px1, tx1) + iy1 = torch.max(py1, ty1) + ix2 = torch.min(px2, tx2) + iy2 = torch.min(py2, ty2) + + # extra + xmin = torch.min(ix1, ix2) + ymin = torch.min(iy1, iy2) + xmax = torch.max(ix1, ix2) + ymax = torch.max(iy1, iy2) + + # Intersection + intersection = (ix2 - ex1) * (iy2 - ey1) + (xmin - ex1) * (ymin - ey1) - ( + ix1 - ex1) * (ymax - ey1) - (xmax - ex1) * ( + iy1 - ey1) + # Union + union = (px2 - px1) * (py2 - py1) + (tx2 - tx1) * ( + ty2 - ty1) - intersection + eps + # IoU + ious = 1 - (intersection / union) + + # Smooth-EIoU + smooth_sign = (ious < smooth_point).detach().float() + loss = 0.5 * smooth_sign * (ious**2) / smooth_point + (1 - smooth_sign) * ( + ious - 0.5 * smooth_point) + return loss + + +@weighted_loss +def siou_loss(pred, target, eps=1e-7, neg_gamma=False): + r"""`Implementation of paper `SIoU Loss: More Powerful Learning + for Bounding Box Regression `_. + + Code is modified from https://github.com/meituan/YOLOv6. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + neg_gamma (bool): `True` follows original implementation in paper. + + Return: + Tensor: Loss tensor. + """ + # overlap + lt = torch.max(pred[:, :2], target[:, :2]) + rb = torch.min(pred[:, 2:], target[:, 2:]) + wh = (rb - lt).clamp(min=0) + overlap = wh[:, 0] * wh[:, 1] + + # union + ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) + ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) + union = ap + ag - overlap + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + # modified clamp threshold zero to eps to avoid NaN + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=eps) + + cw = enclose_wh[:, 0] + ch = enclose_wh[:, 1] + + b1_x1, b1_y1 = pred[:, 0], pred[:, 1] + b1_x2, b1_y2 = pred[:, 2], pred[:, 3] + b2_x1, b2_y1 = target[:, 0], target[:, 1] + b2_x2, b2_y2 = target[:, 2], target[:, 3] + + w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps + w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps + + # angle cost + s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps + s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps + + sigma = torch.pow(s_cw**2 + s_ch**2, 0.5) + + sin_alpha_1 = torch.abs(s_cw) / sigma + sin_alpha_2 = torch.abs(s_ch) / sigma + threshold = pow(2, 0.5) / 2 + sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1) + angle_cost = torch.cos(torch.asin(sin_alpha) * 2 - math.pi / 2) + + # distance cost + rho_x = (s_cw / cw)**2 + rho_y = (s_ch / ch)**2 + + # `neg_gamma=True` follows original implementation in paper + # but setting `neg_gamma=False` makes training more stable. 
+ gamma = angle_cost - 2 if neg_gamma else 2 - angle_cost + distance_cost = 2 - torch.exp(gamma * rho_x) - torch.exp(gamma * rho_y) + + # shape cost + omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2) + omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2) + shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow( + 1 - torch.exp(-1 * omiga_h), 4) + + # SIoU + sious = ious - 0.5 * (distance_cost + shape_cost) + loss = 1 - sious.clamp(min=-1.0, max=1.0) + return loss + + +@MODELS.register_module() +class IoULoss(nn.Module): + """IoULoss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + + Args: + linear (bool): If True, use linear scale of loss else determined + by mode. Default: False. + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + mode (str): Loss scaling mode, including "linear", "square", and "log". + Default: 'log' + """ + + def __init__(self, + linear: bool = False, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0, + mode: str = 'log') -> None: + super().__init__() + assert mode in ['linear', 'square', 'log'] + if linear: + mode = 'linear' + warnings.warn('DeprecationWarning: Setting "linear=True" in ' + 'IOULoss is deprecated, please use "mode=`linear`" ' + 'instead.') + self.mode = mode + self.linear = linear + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Return: + Tensor: Loss tensor. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if (weight is not None) and (not torch.any(weight > 0)) and ( + reduction != 'none'): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # iou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * iou_loss( + pred, + target, + weight, + mode=self.mode, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class BoundedIoULoss(nn.Module): + """BIoULoss. + + This is an implementation of paper + `Improving Object Localization with Fitness NMS and Bounded IoU Loss. + `_. + + Args: + beta (float, optional): Beta parameter in smoothl1. + eps (float, optional): Epsilon to avoid NaN values. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. 
+ """ + + def __init__(self, + beta: float = 0.2, + eps: float = 1e-3, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.beta = beta + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss = self.loss_weight * bounded_iou_loss( + pred, + target, + weight, + beta=self.beta, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class GIoULoss(nn.Module): + r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding + Box Regression `_. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. 
+ """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * giou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class DIoULoss(nn.Module): + r"""Implementation of `Distance-IoU Loss: Faster and Better + Learning for Bounding Box Regression https://arxiv.org/abs/1911.08287`_. + + Code is modified from https://github.com/Zzh-tju/DIoU. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * diou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class CIoULoss(nn.Module): + r"""`Implementation of paper `Enhancing Geometric Factors into + Model Learning and Inference for Object Detection and Instance + Segmentation `_. + + Code is modified from https://github.com/Zzh-tju/CIoU. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. 
+ """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * ciou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class EIoULoss(nn.Module): + r"""Implementation of paper `Extended-IoU Loss: A Systematic + IoU-Related Method: Beyond Simplified Regression for Better + Localization `_ + + Code is modified from https://github.com//ShiqiYu/libfacedetection.train. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + smooth_point (float): hyperparameter, default is 0.1. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0, + smooth_point: float = 0.1) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.smooth_point = smooth_point + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. 
+ """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * eiou_loss( + pred, + target, + weight, + smooth_point=self.smooth_point, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class SIoULoss(nn.Module): + r"""`Implementation of paper `SIoU Loss: More Powerful Learning + for Bounding Box Regression `_. + + Code is modified from https://github.com/meituan/YOLOv6. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + neg_gamma (bool): `True` follows original implementation in paper. + + Return: + Tensor: Loss tensor. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0, + neg_gamma: bool = False) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.neg_gamma = neg_gamma + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * siou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + neg_gamma=self.neg_gamma, + **kwargs) + return loss diff --git a/mmdetection/mmdet/models/losses/kd_loss.py b/mmdetection/mmdet/models/losses/kd_loss.py new file mode 100644 index 0000000..0a7d5ef --- /dev/null +++ b/mmdetection/mmdet/models/losses/kd_loss.py @@ -0,0 +1,95 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def knowledge_distillation_kl_div_loss(pred: Tensor, + soft_label: Tensor, + T: int, + detach_target: bool = True) -> Tensor: + r"""Loss function for knowledge distilling using KL divergence. + + Args: + pred (Tensor): Predicted logits with shape (N, n + 1). + soft_label (Tensor): Target logits with shape (N, N + 1). + T (int): Temperature for distillation. + detach_target (bool): Remove soft_label from automatic differentiation + + Returns: + Tensor: Loss tensor with shape (N,). + """ + assert pred.size() == soft_label.size() + target = F.softmax(soft_label / T, dim=1) + if detach_target: + target = target.detach() + + kd_loss = F.kl_div( + F.log_softmax(pred / T, dim=1), target, reduction='none').mean(1) * ( + T * T) + + return kd_loss + + +@MODELS.register_module() +class KnowledgeDistillationKLDivLoss(nn.Module): + """Loss function for knowledge distilling using KL divergence. + + Args: + reduction (str): Options are `'none'`, `'mean'` and `'sum'`. + loss_weight (float): Loss weight of current loss. + T (int): Temperature for distillation. + """ + + def __init__(self, + reduction: str = 'mean', + loss_weight: float = 1.0, + T: int = 10) -> None: + super().__init__() + assert T >= 1 + self.reduction = reduction + self.loss_weight = loss_weight + self.T = T + + def forward(self, + pred: Tensor, + soft_label: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted logits with shape (N, n + 1). + soft_label (Tensor): Target logits with shape (N, N + 1). + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + + Returns: + Tensor: Loss tensor. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + + reduction = ( + reduction_override if reduction_override else self.reduction) + + loss_kd = self.loss_weight * knowledge_distillation_kl_div_loss( + pred, + soft_label, + weight, + reduction=reduction, + avg_factor=avg_factor, + T=self.T) + + return loss_kd diff --git a/mmdetection/mmdet/models/losses/l2_loss.py b/mmdetection/mmdet/models/losses/l2_loss.py new file mode 100644 index 0000000..6210a30 --- /dev/null +++ b/mmdetection/mmdet/models/losses/l2_loss.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def l2_loss(pred: Tensor, target: Tensor) -> Tensor: + """L2 loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + + Returns: + torch.Tensor: Calculated loss + """ + assert pred.size() == target.size() + loss = torch.abs(pred - target)**2 + return loss + + +@MODELS.register_module() +class L2Loss(BaseModule): + """L2 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". 
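A minimal sketch of the knowledge-distillation loss added in kd_loss.py above (illustrative only; the export path and the random logits are assumptions). Both student and teacher logits are softened with the same temperature T, and the per-sample KL divergence is rescaled by T * T so its magnitude stays comparable across temperatures.

    import torch
    from mmdet.models.losses import KnowledgeDistillationKLDivLoss  # assumed export path

    student_logits = torch.randn(8, 21)  # (N, n + 1) classification logits
    teacher_logits = torch.randn(8, 21)  # soft labels from the teacher

    kd = KnowledgeDistillationKLDivLoss(T=10, loss_weight=1.0)
    loss = kd(student_logits, teacher_logits)  # scalar, averaged over N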
+ loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, + neg_pos_ub: int = -1, + pos_margin: float = -1, + neg_margin: float = -1, + hard_mining: bool = False, + reduction: str = 'mean', + loss_weight: float = 1.0): + super(L2Loss, self).__init__() + self.neg_pos_ub = neg_pos_ub + self.pos_margin = pos_margin + self.neg_margin = neg_margin + self.hard_mining = hard_mining + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (float, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + pred, weight, avg_factor = self.update_weight(pred, target, weight, + avg_factor) + loss_bbox = self.loss_weight * l2_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + def update_weight(self, pred: Tensor, target: Tensor, weight: Tensor, + avg_factor: float) -> Tuple[Tensor, Tensor, float]: + """Update the weight according to targets.""" + if weight is None: + weight = target.new_ones(target.size()) + + invalid_inds = weight <= 0 + target[invalid_inds] = -1 + pos_inds = target == 1 + neg_inds = target == 0 + + if self.pos_margin > 0: + pred[pos_inds] -= self.pos_margin + if self.neg_margin > 0: + pred[neg_inds] -= self.neg_margin + pred = torch.clamp(pred, min=0, max=1) + + num_pos = int((target == 1).sum()) + num_neg = int((target == 0).sum()) + if self.neg_pos_ub > 0 and num_neg / (num_pos + + 1e-6) > self.neg_pos_ub: + num_neg = num_pos * self.neg_pos_ub + neg_idx = torch.nonzero(target == 0, as_tuple=False) + + if self.hard_mining: + costs = l2_loss( + pred, target, reduction='none')[neg_idx[:, 0], + neg_idx[:, 1]].detach() + neg_idx = neg_idx[costs.topk(num_neg)[1], :] + else: + neg_idx = self.random_choice(neg_idx, num_neg) + + new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool() + new_neg_inds[neg_idx[:, 0], neg_idx[:, 1]] = True + + invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds) + weight[invalid_neg_inds] = 0 + + avg_factor = (weight > 0).sum() + return pred, weight, avg_factor + + @staticmethod + def random_choice(gallery: Union[list, np.ndarray, Tensor], + num: int) -> np.ndarray: + """Random select some elements from the gallery. + + It seems that Pytorch's implementation is slower than numpy so we use + numpy to randperm the indices. 
+ """ + assert len(gallery) >= num + if isinstance(gallery, list): + gallery = np.array(gallery) + cands = np.arange(len(gallery)) + np.random.shuffle(cands) + rand_inds = cands[:num] + if not isinstance(gallery, np.ndarray): + rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device) + return gallery[rand_inds] diff --git a/mmdetection/mmdet/models/losses/margin_loss.py b/mmdetection/mmdet/models/losses/margin_loss.py new file mode 100644 index 0000000..0609e1d --- /dev/null +++ b/mmdetection/mmdet/models/losses/margin_loss.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from .mse_loss import mse_loss + + +@MODELS.register_module() +class MarginL2Loss(BaseModule): + """L2 loss with margin. + + Args: + neg_pos_ub (int, optional): The upper bound of negative to positive + samples in hard mining. Defaults to -1. + pos_margin (float, optional): The similarity margin for positive + samples in hard mining. Defaults to -1. + neg_margin (float, optional): The similarity margin for negative + samples in hard mining. Defaults to -1. + hard_mining (bool, optional): Whether to use hard mining. Defaults to + False. + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. Defaults to 1.0. + """ + + def __init__(self, + neg_pos_ub: int = -1, + pos_margin: float = -1, + neg_margin: float = -1, + hard_mining: bool = False, + reduction: str = 'mean', + loss_weight: float = 1.0): + super(MarginL2Loss, self).__init__() + self.neg_pos_ub = neg_pos_ub + self.pos_margin = pos_margin + self.neg_margin = neg_margin + self.hard_mining = hard_mining + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (float, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + pred, weight, avg_factor = self.update_weight(pred, target, weight, + avg_factor) + loss_bbox = self.loss_weight * mse_loss( + pred, + target.float(), + weight.float(), + reduction=reduction, + avg_factor=avg_factor) + return loss_bbox + + def update_weight(self, pred: Tensor, target: Tensor, weight: Tensor, + avg_factor: float) -> Tuple[Tensor, Tensor, float]: + """Update the weight according to targets. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor): The weight of loss for each prediction. + avg_factor (float): Average factor that is used to average the + loss. + + Returns: + tuple[torch.Tensor]: The updated prediction, weight and average + factor. 
+ """ + if weight is None: + weight = target.new_ones(target.size()) + + invalid_inds = weight <= 0 + target[invalid_inds] = -1 + pos_inds = target == 1 + neg_inds = target == 0 + + if self.pos_margin > 0: + pred[pos_inds] -= self.pos_margin + if self.neg_margin > 0: + pred[neg_inds] -= self.neg_margin + pred = torch.clamp(pred, min=0, max=1) + + num_pos = int((target == 1).sum()) + num_neg = int((target == 0).sum()) + if self.neg_pos_ub > 0 and num_neg / (num_pos + + 1e-6) > self.neg_pos_ub: + num_neg = num_pos * self.neg_pos_ub + neg_idx = torch.nonzero(target == 0, as_tuple=False) + + if self.hard_mining: + costs = mse_loss( + pred, target.float(), + reduction='none')[neg_idx[:, 0], neg_idx[:, 1]].detach() + neg_idx = neg_idx[costs.topk(num_neg)[1], :] + else: + neg_idx = self.random_choice(neg_idx, num_neg) + + new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool() + new_neg_inds[neg_idx[:, 0], neg_idx[:, 1]] = True + + invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds) + weight[invalid_neg_inds] = 0 + + avg_factor = (weight > 0).sum() + return pred, weight, avg_factor + + @staticmethod + def random_choice(gallery: Union[list, np.ndarray, Tensor], + num: int) -> np.ndarray: + """Random select some elements from the gallery. + + It seems that Pytorch's implementation is slower than numpy so we use + numpy to randperm the indices. + + Args: + gallery (list | np.ndarray | torch.Tensor): The gallery from + which to sample. + num (int): The number of elements to sample. + """ + assert len(gallery) >= num + if isinstance(gallery, list): + gallery = np.array(gallery) + cands = np.arange(len(gallery)) + np.random.shuffle(cands) + rand_inds = cands[:num] + if not isinstance(gallery, np.ndarray): + rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device) + return gallery[rand_inds] diff --git a/mmdetection/mmdet/models/losses/mse_loss.py b/mmdetection/mmdet/models/losses/mse_loss.py new file mode 100644 index 0000000..6048218 --- /dev/null +++ b/mmdetection/mmdet/models/losses/mse_loss.py @@ -0,0 +1,69 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def mse_loss(pred: Tensor, target: Tensor) -> Tensor: + """A Wrapper of MSE loss. + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + + Returns: + Tensor: loss Tensor + """ + return F.mse_loss(pred, target, reduction='none') + + +@MODELS.register_module() +class MSELoss(nn.Module): + """MSELoss. + + Args: + reduction (str, optional): The method that reduces the loss to a + scalar. Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of the loss. Defaults to 1.0 + """ + + def __init__(self, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function of loss. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + weight (Tensor, optional): Weight of the loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
+ reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + + Returns: + Tensor: The calculated loss. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss = self.loss_weight * mse_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss diff --git a/mmdetection/mmdet/models/losses/multipos_cross_entropy_loss.py b/mmdetection/mmdet/models/losses/multipos_cross_entropy_loss.py new file mode 100644 index 0000000..a7d1561 --- /dev/null +++ b/mmdetection/mmdet/models/losses/multipos_cross_entropy_loss.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss + + +@MODELS.register_module() +class MultiPosCrossEntropyLoss(BaseModule): + """multi-positive targets cross entropy loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. Defaults to 1.0. + """ + + def __init__(self, reduction: str = 'mean', loss_weight: float = 1.0): + super(MultiPosCrossEntropyLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def multi_pos_cross_entropy(self, + pred: Tensor, + label: Tensor, + weight: Optional[Tensor] = None, + reduction: str = 'mean', + avg_factor: Optional[float] = None) -> Tensor: + """Multi-positive targets cross entropy loss. + + Args: + pred (torch.Tensor): The prediction. + label (torch.Tensor): The assigned label of the prediction. + weight (torch.Tensor): The element-wise weight. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing + the mean of losses. + + Returns: + torch.Tensor: Calculated loss + """ + + pos_inds = (label >= 1) + neg_inds = (label == 0) + pred_pos = pred * pos_inds.float() + pred_neg = pred * neg_inds.float() + # use -inf to mask out unwanted elements. + pred_pos[neg_inds] = pred_pos[neg_inds] + float('inf') + pred_neg[pos_inds] = pred_neg[pos_inds] + float('-inf') + + _pos_expand = torch.repeat_interleave(pred_pos, pred.shape[1], dim=1) + _neg_expand = pred_neg.repeat(1, pred.shape[1]) + + x = torch.nn.functional.pad((_neg_expand - _pos_expand), (0, 1), + 'constant', 0) + loss = torch.logsumexp(x, dim=1) + + # apply weights and do the reduction + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss( + loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + def forward(self, + cls_score: Tensor, + label: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + cls_score (torch.Tensor): The classification score. + label (torch.Tensor): The assigned label of the prediction. + weight (torch.Tensor): The element-wise weight. + avg_factor (float): Average factor when computing + the mean of losses. + reduction_override (str): Same as built-in losses of PyTorch. 
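The multi_pos_cross_entropy method above is the usual logsumexp-over-pairs form of a contrastive loss with several positives: for each row it accumulates exp(s_neg - s_pos) over every (positive, negative) pair, and the zero padding contributes the +1 inside the log. A tiny hedged check of that equivalence (import path and scores assumed):

    import torch
    from mmdet.models.losses import MultiPosCrossEntropyLoss  # assumed export path

    scores = torch.tensor([[5.0, 1.0, 0.5, 4.0]])  # similarities for one query
    labels = torch.tensor([[1, 0, 0, 1]])          # two positives, two negatives

    loss_fn = MultiPosCrossEntropyLoss()
    loss = loss_fn.multi_pos_cross_entropy(scores, labels, reduction='mean')

    # Closed form for this single row: log(1 + sum over (p, n) of exp(s_n - s_p))
    pos, neg = scores[0, [0, 3]], scores[0, [1, 2]]
    expected = torch.log(1 + torch.exp(neg[None, :] - pos[:, None]).sum())
    assert torch.allclose(loss, expected)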
+ + Returns: + torch.Tensor: Calculated loss + """ + assert cls_score.size() == label.size() + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_cls = self.loss_weight * self.multi_pos_cross_entropy( + cls_score, + label, + weight, + reduction=reduction, + avg_factor=avg_factor) + return loss_cls diff --git a/mmdetection/mmdet/models/losses/pisa_loss.py b/mmdetection/mmdet/models/losses/pisa_loss.py new file mode 100644 index 0000000..b192aa0 --- /dev/null +++ b/mmdetection/mmdet/models/losses/pisa_loss.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.structures.bbox import bbox_overlaps +from ..task_modules.coders import BaseBBoxCoder +from ..task_modules.samplers import SamplingResult + + +def isr_p(cls_score: Tensor, + bbox_pred: Tensor, + bbox_targets: Tuple[Tensor], + rois: Tensor, + sampling_results: List[SamplingResult], + loss_cls: nn.Module, + bbox_coder: BaseBBoxCoder, + k: float = 2, + bias: float = 0, + num_class: int = 80) -> tuple: + """Importance-based Sample Reweighting (ISR_P), positive part. + + Args: + cls_score (Tensor): Predicted classification scores. + bbox_pred (Tensor): Predicted bbox deltas. + bbox_targets (tuple[Tensor]): A tuple of bbox targets, the are + labels, label_weights, bbox_targets, bbox_weights, respectively. + rois (Tensor): Anchors (single_stage) in shape (n, 4) or RoIs + (two_stage) in shape (n, 5). + sampling_results (:obj:`SamplingResult`): Sampling results. + loss_cls (:obj:`nn.Module`): Classification loss func of the head. + bbox_coder (:obj:`BaseBBoxCoder`): BBox coder of the head. + k (float): Power of the non-linear mapping. Defaults to 2. + bias (float): Shift of the non-linear mapping. Defaults to 0. + num_class (int): Number of classes, defaults to 80. 
+ + Return: + tuple([Tensor]): labels, imp_based_label_weights, bbox_targets, + bbox_target_weights + """ + + labels, label_weights, bbox_targets, bbox_weights = bbox_targets + pos_label_inds = ((labels >= 0) & + (labels < num_class)).nonzero().reshape(-1) + pos_labels = labels[pos_label_inds] + + # if no positive samples, return the original targets + num_pos = float(pos_label_inds.size(0)) + if num_pos == 0: + return labels, label_weights, bbox_targets, bbox_weights + + # merge pos_assigned_gt_inds of per image to a single tensor + gts = list() + last_max_gt = 0 + for i in range(len(sampling_results)): + gt_i = sampling_results[i].pos_assigned_gt_inds + gts.append(gt_i + last_max_gt) + if len(gt_i) != 0: + last_max_gt = gt_i.max() + 1 + gts = torch.cat(gts) + assert len(gts) == num_pos + + cls_score = cls_score.detach() + bbox_pred = bbox_pred.detach() + + # For single stage detectors, rois here indicate anchors, in shape (N, 4) + # For two stage detectors, rois are in shape (N, 5) + if rois.size(-1) == 5: + pos_rois = rois[pos_label_inds][:, 1:] + else: + pos_rois = rois[pos_label_inds] + + if bbox_pred.size(-1) > 4: + bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4) + pos_delta_pred = bbox_pred[pos_label_inds, pos_labels].view(-1, 4) + else: + pos_delta_pred = bbox_pred[pos_label_inds].view(-1, 4) + + # compute iou of the predicted bbox and the corresponding GT + pos_delta_target = bbox_targets[pos_label_inds].view(-1, 4) + pos_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_pred) + target_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_target) + ious = bbox_overlaps(pos_bbox_pred, target_bbox_pred, is_aligned=True) + + pos_imp_weights = label_weights[pos_label_inds] + # Two steps to compute IoU-HLR. Samples are first sorted by IoU locally, + # then sorted again within the same-rank group + max_l_num = pos_labels.bincount().max() + for label in pos_labels.unique(): + l_inds = (pos_labels == label).nonzero().view(-1) + l_gts = gts[l_inds] + for t in l_gts.unique(): + t_inds = l_inds[l_gts == t] + t_ious = ious[t_inds] + _, t_iou_rank_idx = t_ious.sort(descending=True) + _, t_iou_rank = t_iou_rank_idx.sort() + ious[t_inds] += max_l_num - t_iou_rank.float() + l_ious = ious[l_inds] + _, l_iou_rank_idx = l_ious.sort(descending=True) + _, l_iou_rank = l_iou_rank_idx.sort() # IoU-HLR + # linearly map HLR to label weights + pos_imp_weights[l_inds] *= (max_l_num - l_iou_rank.float()) / max_l_num + + pos_imp_weights = (bias + pos_imp_weights * (1 - bias)).pow(k) + + # normalize to make the new weighted loss value equal to the original loss + pos_loss_cls = loss_cls( + cls_score[pos_label_inds], pos_labels, reduction_override='none') + if pos_loss_cls.dim() > 1: + ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds][:, + None] + new_pos_loss_cls = pos_loss_cls * pos_imp_weights[:, None] + else: + ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds] + new_pos_loss_cls = pos_loss_cls * pos_imp_weights + pos_loss_cls_ratio = ori_pos_loss_cls.sum() / new_pos_loss_cls.sum() + pos_imp_weights = pos_imp_weights * pos_loss_cls_ratio + label_weights[pos_label_inds] = pos_imp_weights + + bbox_targets = labels, label_weights, bbox_targets, bbox_weights + return bbox_targets + + +def carl_loss(cls_score: Tensor, + labels: Tensor, + bbox_pred: Tensor, + bbox_targets: Tensor, + loss_bbox: nn.Module, + k: float = 1, + bias: float = 0.2, + avg_factor: Optional[int] = None, + sigmoid: bool = False, + num_class: int = 80) -> dict: + """Classification-Aware Regression Loss (CARL). 
+ + Args: + cls_score (Tensor): Predicted classification scores. + labels (Tensor): Targets of classification. + bbox_pred (Tensor): Predicted bbox deltas. + bbox_targets (Tensor): Target of bbox regression. + loss_bbox (func): Regression loss func of the head. + bbox_coder (obj): BBox coder of the head. + k (float): Power of the non-linear mapping. Defaults to 1. + bias (float): Shift of the non-linear mapping. Defaults to 0.2. + avg_factor (int, optional): Average factor used in regression loss. + sigmoid (bool): Activation of the classification score. + num_class (int): Number of classes, defaults to 80. + + Return: + dict: CARL loss dict. + """ + pos_label_inds = ((labels >= 0) & + (labels < num_class)).nonzero().reshape(-1) + if pos_label_inds.numel() == 0: + return dict(loss_carl=cls_score.sum()[None] * 0.) + pos_labels = labels[pos_label_inds] + + # multiply pos_cls_score with the corresponding bbox weight + # and remain gradient + if sigmoid: + pos_cls_score = cls_score.sigmoid()[pos_label_inds, pos_labels] + else: + pos_cls_score = cls_score.softmax(-1)[pos_label_inds, pos_labels] + carl_loss_weights = (bias + (1 - bias) * pos_cls_score).pow(k) + + # normalize carl_loss_weight to make its sum equal to num positive + num_pos = float(pos_cls_score.size(0)) + weight_ratio = num_pos / carl_loss_weights.sum() + carl_loss_weights *= weight_ratio + + if avg_factor is None: + avg_factor = bbox_targets.size(0) + # if is class agnostic, bbox pred is in shape (N, 4) + # otherwise, bbox pred is in shape (N, #classes, 4) + if bbox_pred.size(-1) > 4: + bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4) + pos_bbox_preds = bbox_pred[pos_label_inds, pos_labels] + else: + pos_bbox_preds = bbox_pred[pos_label_inds] + ori_loss_reg = loss_bbox( + pos_bbox_preds, + bbox_targets[pos_label_inds], + reduction_override='none') / avg_factor + loss_carl = (ori_loss_reg * carl_loss_weights[:, None]).sum() + return dict(loss_carl=loss_carl[None]) diff --git a/mmdetection/mmdet/models/losses/seesaw_loss.py b/mmdetection/mmdet/models/losses/seesaw_loss.py new file mode 100644 index 0000000..4dec62b --- /dev/null +++ b/mmdetection/mmdet/models/losses/seesaw_loss.py @@ -0,0 +1,278 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .accuracy import accuracy +from .cross_entropy_loss import cross_entropy +from .utils import weight_reduce_loss + + +def seesaw_ce_loss(cls_score: Tensor, + labels: Tensor, + label_weights: Tensor, + cum_samples: Tensor, + num_classes: int, + p: float, + q: float, + eps: float, + reduction: str = 'mean', + avg_factor: Optional[int] = None) -> Tensor: + """Calculate the Seesaw CrossEntropy loss. + + Args: + cls_score (Tensor): The prediction with shape (N, C), + C is the number of classes. + labels (Tensor): The learning label of the prediction. + label_weights (Tensor): Sample-wise loss weight. + cum_samples (Tensor): Cumulative samples for each category. + num_classes (int): The number of classes. + p (float): The ``p`` in the mitigation factor. + q (float): The ``q`` in the compenstation factor. + eps (float): The minimal value of divisor to smooth + the computation of compensation factor + reduction (str, optional): The method used to reduce the loss. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
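Both ISR-P and CARL above turn a value in [0, 1] (a normalized IoU rank for ISR-P, a classification score for CARL) into a loss weight through the same non-linear mapping (bias + (1 - bias) * x) ** k, followed by a renormalization step so the overall loss scale is preserved. A small standalone illustration of how k and bias shape that mapping (plain Python, not tied to the functions above):

    import torch

    def pisa_weight(x, k, bias):
        # x: normalized rank (ISR-P) or classification score (CARL) in [0, 1]
        return (bias + (1 - bias) * x) ** k

    x = torch.linspace(0, 1, 5)             # 0.00, 0.25, 0.50, 0.75, 1.00
    print(pisa_weight(x, k=2.0, bias=0.0))  # ISR-P defaults: quadratic, favours top-ranked positives
    print(pisa_weight(x, k=1.0, bias=0.2))  # CARL defaults: linear with a floor of 0.2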
+ + Returns: + Tensor: The calculated loss + """ + assert cls_score.size(-1) == num_classes + assert len(cum_samples) == num_classes + + onehot_labels = F.one_hot(labels, num_classes) + seesaw_weights = cls_score.new_ones(onehot_labels.size()) + + # mitigation factor + if p > 0: + sample_ratio_matrix = cum_samples[None, :].clamp( + min=1) / cum_samples[:, None].clamp(min=1) + index = (sample_ratio_matrix < 1.0).float() + sample_weights = sample_ratio_matrix.pow(p) * index + (1 - index) + mitigation_factor = sample_weights[labels.long(), :] + seesaw_weights = seesaw_weights * mitigation_factor + + # compensation factor + if q > 0: + scores = F.softmax(cls_score.detach(), dim=1) + self_scores = scores[ + torch.arange(0, len(scores)).to(scores.device).long(), + labels.long()] + score_matrix = scores / self_scores[:, None].clamp(min=eps) + index = (score_matrix > 1.0).float() + compensation_factor = score_matrix.pow(q) * index + (1 - index) + seesaw_weights = seesaw_weights * compensation_factor + + cls_score = cls_score + (seesaw_weights.log() * (1 - onehot_labels)) + + loss = F.cross_entropy(cls_score, labels, weight=None, reduction='none') + + if label_weights is not None: + label_weights = label_weights.float() + loss = weight_reduce_loss( + loss, weight=label_weights, reduction=reduction, avg_factor=avg_factor) + return loss + + +@MODELS.register_module() +class SeesawLoss(nn.Module): + """ + Seesaw Loss for Long-Tailed Instance Segmentation (CVPR 2021) + arXiv: https://arxiv.org/abs/2008.10032 + + Args: + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Only False is supported. + p (float, optional): The ``p`` in the mitigation factor. + Defaults to 0.8. + q (float, optional): The ``q`` in the compenstation factor. + Defaults to 2.0. + num_classes (int, optional): The number of classes. + Default to 1203 for LVIS v1 dataset. + eps (float, optional): The minimal value of divisor to smooth + the computation of compensation factor + reduction (str, optional): The method that reduces the loss to a + scalar. Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of the loss. Defaults to 1.0 + return_dict (bool, optional): Whether return the losses as a dict. + Default to True. + """ + + def __init__(self, + use_sigmoid: bool = False, + p: float = 0.8, + q: float = 2.0, + num_classes: int = 1203, + eps: float = 1e-2, + reduction: str = 'mean', + loss_weight: float = 1.0, + return_dict: bool = True) -> None: + super().__init__() + assert not use_sigmoid + self.use_sigmoid = False + self.p = p + self.q = q + self.num_classes = num_classes + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.return_dict = return_dict + + # 0 for pos, 1 for neg + self.cls_criterion = seesaw_ce_loss + + # cumulative samples for each category + self.register_buffer( + 'cum_samples', + torch.zeros(self.num_classes + 1, dtype=torch.float)) + + # custom output channels of the classifier + self.custom_cls_channels = True + # custom activation of cls_score + self.custom_activation = True + # custom accuracy of the classsifier + self.custom_accuracy = True + + def _split_cls_score(self, cls_score: Tensor) -> Tuple[Tensor, Tensor]: + """split cls_score. + + Args: + cls_score (Tensor): The prediction with shape (N, C + 2). 
+ + Returns: + Tuple[Tensor, Tensor]: The score for classes and objectness, + respectively + """ + # split cls_score to cls_score_classes and cls_score_objectness + assert cls_score.size(-1) == self.num_classes + 2 + cls_score_classes = cls_score[..., :-2] + cls_score_objectness = cls_score[..., -2:] + return cls_score_classes, cls_score_objectness + + def get_cls_channels(self, num_classes: int) -> int: + """Get custom classification channels. + + Args: + num_classes (int): The number of classes. + + Returns: + int: The custom classification channels. + """ + assert num_classes == self.num_classes + return num_classes + 2 + + def get_activation(self, cls_score: Tensor) -> Tensor: + """Get custom activation of cls_score. + + Args: + cls_score (Tensor): The prediction with shape (N, C + 2). + + Returns: + Tensor: The custom activation of cls_score with shape + (N, C + 1). + """ + cls_score_classes, cls_score_objectness = self._split_cls_score( + cls_score) + score_classes = F.softmax(cls_score_classes, dim=-1) + score_objectness = F.softmax(cls_score_objectness, dim=-1) + score_pos = score_objectness[..., [0]] + score_neg = score_objectness[..., [1]] + score_classes = score_classes * score_pos + scores = torch.cat([score_classes, score_neg], dim=-1) + return scores + + def get_accuracy(self, cls_score: Tensor, + labels: Tensor) -> Dict[str, Tensor]: + """Get custom accuracy w.r.t. cls_score and labels. + + Args: + cls_score (Tensor): The prediction with shape (N, C + 2). + labels (Tensor): The learning label of the prediction. + + Returns: + Dict [str, Tensor]: The accuracy for objectness and classes, + respectively. + """ + pos_inds = labels < self.num_classes + obj_labels = (labels == self.num_classes).long() + cls_score_classes, cls_score_objectness = self._split_cls_score( + cls_score) + acc_objectness = accuracy(cls_score_objectness, obj_labels) + acc_classes = accuracy(cls_score_classes[pos_inds], labels[pos_inds]) + acc = dict() + acc['acc_objectness'] = acc_objectness + acc['acc_classes'] = acc_classes + return acc + + def forward( + self, + cls_score: Tensor, + labels: Tensor, + label_weights: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None + ) -> Union[Tensor, Dict[str, Tensor]]: + """Forward function. + + Args: + cls_score (Tensor): The prediction with shape (N, C + 2). + labels (Tensor): The learning label of the prediction. + label_weights (Tensor, optional): Sample-wise loss weight. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + + Returns: + Tensor | Dict [str, Tensor]: + if return_dict == False: The calculated loss | + if return_dict == True: The dict of calculated losses + for objectness and classes, respectively. 
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + assert cls_score.size(-1) == self.num_classes + 2 + pos_inds = labels < self.num_classes + # 0 for pos, 1 for neg + obj_labels = (labels == self.num_classes).long() + + # accumulate the samples for each category + unique_labels = labels.unique() + for u_l in unique_labels: + inds_ = labels == u_l.item() + self.cum_samples[u_l] += inds_.sum() + + if label_weights is not None: + label_weights = label_weights.float() + else: + label_weights = labels.new_ones(labels.size(), dtype=torch.float) + + cls_score_classes, cls_score_objectness = self._split_cls_score( + cls_score) + # calculate loss_cls_classes (only need pos samples) + if pos_inds.sum() > 0: + loss_cls_classes = self.loss_weight * self.cls_criterion( + cls_score_classes[pos_inds], labels[pos_inds], + label_weights[pos_inds], self.cum_samples[:self.num_classes], + self.num_classes, self.p, self.q, self.eps, reduction, + avg_factor) + else: + loss_cls_classes = cls_score_classes[pos_inds].sum() + # calculate loss_cls_objectness + loss_cls_objectness = self.loss_weight * cross_entropy( + cls_score_objectness, obj_labels, label_weights, reduction, + avg_factor) + + if self.return_dict: + loss_cls = dict() + loss_cls['loss_cls_objectness'] = loss_cls_objectness + loss_cls['loss_cls_classes'] = loss_cls_classes + else: + loss_cls = loss_cls_classes + loss_cls_objectness + return loss_cls diff --git a/mmdetection/mmdet/models/losses/smooth_l1_loss.py b/mmdetection/mmdet/models/losses/smooth_l1_loss.py new file mode 100644 index 0000000..102f978 --- /dev/null +++ b/mmdetection/mmdet/models/losses/smooth_l1_loss.py @@ -0,0 +1,165 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def smooth_l1_loss(pred: Tensor, target: Tensor, beta: float = 1.0) -> Tensor: + """Smooth L1 loss. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + + Returns: + Tensor: Calculated loss + """ + assert beta > 0 + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + diff = torch.abs(pred - target) + loss = torch.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + return loss + + +@weighted_loss +def l1_loss(pred: Tensor, target: Tensor) -> Tensor: + """L1 loss. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + + Returns: + Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + + +@MODELS.register_module() +class SmoothL1Loss(nn.Module): + """Smooth L1 loss. + + Args: + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. 
+ """ + + def __init__(self, + beta: float = 1.0, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + + Returns: + Tensor: Calculated loss + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * smooth_l1_loss( + pred, + target, + weight, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox + + +@MODELS.register_module() +class L1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + + Returns: + Tensor: Calculated loss + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox diff --git a/mmdetection/mmdet/models/losses/triplet_loss.py b/mmdetection/mmdet/models/losses/triplet_loss.py new file mode 100644 index 0000000..d9c9604 --- /dev/null +++ b/mmdetection/mmdet/models/losses/triplet_loss.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class TripletLoss(BaseModule): + """Triplet loss with hard positive/negative mining. + + Reference: + Hermans et al. In Defense of the Triplet Loss for + Person Re-Identification. arXiv:1703.07737. + Imported from ``_. 
+ Args: + margin (float, optional): Margin for triplet loss. Defaults to 0.3. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. + hard_mining (bool, optional): Whether to perform hard mining. + Defaults to True. + """ + + def __init__(self, + margin: float = 0.3, + loss_weight: float = 1.0, + hard_mining=True): + super(TripletLoss, self).__init__() + self.margin = margin + self.ranking_loss = nn.MarginRankingLoss(margin=margin) + self.loss_weight = loss_weight + self.hard_mining = hard_mining + + def hard_mining_triplet_loss_forward( + self, inputs: torch.Tensor, + targets: torch.LongTensor) -> torch.Tensor: + """ + Args: + inputs (torch.Tensor): feature matrix with shape + (batch_size, feat_dim). + targets (torch.LongTensor): ground truth labels with shape + (num_classes). + + Returns: + torch.Tensor: triplet loss with hard mining. + """ + + batch_size = inputs.size(0) + + # Compute Euclidean distance + dist = torch.pow(inputs, 2).sum( + dim=1, keepdim=True).expand(batch_size, batch_size) + dist = dist + dist.t() + dist.addmm_(inputs, inputs.t(), beta=1, alpha=-2) + dist = dist.clamp(min=1e-12).sqrt() # for numerical stability + + # For each anchor, find the furthest positive sample + # and nearest negative sample in the embedding space + mask = targets.expand(batch_size, batch_size).eq( + targets.expand(batch_size, batch_size).t()) + dist_ap, dist_an = [], [] + for i in range(batch_size): + dist_ap.append(dist[i][mask[i]].max().unsqueeze(0)) + dist_an.append(dist[i][mask[i] == 0].min().unsqueeze(0)) + dist_ap = torch.cat(dist_ap) + dist_an = torch.cat(dist_an) + + # Compute ranking hinge loss + y = torch.ones_like(dist_an) + return self.loss_weight * self.ranking_loss(dist_an, dist_ap, y) + + def forward(self, inputs: torch.Tensor, + targets: torch.LongTensor) -> torch.Tensor: + """ + Args: + inputs (torch.Tensor): feature matrix with shape + (batch_size, feat_dim). + targets (torch.LongTensor): ground truth labels with shape + (num_classes). + + Returns: + torch.Tensor: triplet loss. + """ + if self.hard_mining: + return self.hard_mining_triplet_loss_forward(inputs, targets) + else: + raise NotImplementedError() diff --git a/mmdetection/mmdet/models/losses/utils.py b/mmdetection/mmdet/models/losses/utils.py new file mode 100644 index 0000000..5e6e785 --- /dev/null +++ b/mmdetection/mmdet/models/losses/utils.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +from torch import Tensor + + +def reduce_loss(loss: Tensor, reduction: str) -> Tensor: + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + + +def weight_reduce_loss(loss: Tensor, + weight: Optional[Tensor] = None, + reduction: str = 'mean', + avg_factor: Optional[float] = None) -> Tensor: + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Optional[Tensor], optional): Element-wise weights. + Defaults to None. + reduction (str, optional): Same as built-in losses of PyTorch. + Defaults to 'mean'. 
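A minimal sketch of the TripletLoss above for re-identification embeddings (import path and the toy batch are assumptions). With hard mining, each sample is compared against its hardest positive (largest distance, same identity) and hardest negative (smallest distance, different identity) inside the batch.

    import torch
    from mmdet.models.losses import TripletLoss  # assumed export path

    feats = torch.randn(8, 128)                      # (batch_size, feat_dim) embeddings
    labels = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])  # identity labels, 2 samples each

    loss_fn = TripletLoss(margin=0.3, hard_mining=True)
    loss = loss_fn(feats, labels)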
+ avg_factor (Optional[float], optional): Average factor when + computing the mean of losses. Defaults to None. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func: Callable) -> Callable: + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + reduction: str = 'mean', + avg_factor: Optional[int] = None, + **kwargs) -> Tensor: + """ + Args: + pred (Tensor): The prediction. + target (Tensor): Target bboxes. + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + reduction (str, optional): Options are "none", "mean" and "sum". + Defaults to 'mean'. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + + Returns: + Tensor: Loss tensor. + """ + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/mmdetection/mmdet/models/losses/varifocal_loss.py b/mmdetection/mmdet/models/losses/varifocal_loss.py new file mode 100644 index 0000000..58ab167 --- /dev/null +++ b/mmdetection/mmdet/models/losses/varifocal_loss.py @@ -0,0 +1,141 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss + + +def varifocal_loss(pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + alpha: float = 0.75, + gamma: float = 2.0, + iou_weighted: bool = True, + reduction: str = 'mean', + avg_factor: Optional[int] = None) -> Tensor: + """`Varifocal Loss `_ + + Args: + pred (Tensor): The prediction with shape (N, C), C is the + number of classes. 
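A quick illustration of the avg_factor semantics in weight_reduce_loss above: avg_factor only takes effect with reduction='mean' (the summed loss is divided by avg_factor plus a small eps instead of the element count), and combining it with reduction='sum' raises a ValueError. Illustrative only; the module path mirrors the utils.py file added in this patch.

    import torch
    from mmdet.models.losses.utils import weight_reduce_loss

    loss = torch.tensor([1.0, 2.0, 3.0])
    print(weight_reduce_loss(loss, reduction='mean'))                # 2.0
    print(weight_reduce_loss(loss, reduction='mean', avg_factor=2))  # ~3.0 (6 / 2)
    try:
        weight_reduce_loss(loss, reduction='sum', avg_factor=2)
    except ValueError:
        print('avg_factor cannot be combined with reduction="sum"')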
+ target (Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is the number of classes. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal Loss. + Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive example with the iou target. Defaults to True. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + + Returns: + Tensor: Loss tensor. + """ + # pred and target should be of the same size + assert pred.size() == target.size() + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + if iou_weighted: + focal_weight = target * (target > 0.0).float() + \ + alpha * (pred_sigmoid - target).abs().pow(gamma) * \ + (target <= 0.0).float() + else: + focal_weight = (target > 0.0).float() + \ + alpha * (pred_sigmoid - target).abs().pow(gamma) * \ + (target <= 0.0).float() + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@MODELS.register_module() +class VarifocalLoss(nn.Module): + + def __init__(self, + use_sigmoid: bool = True, + alpha: float = 0.75, + gamma: float = 2.0, + iou_weighted: bool = True, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + """`Varifocal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether the prediction is + used for sigmoid or softmax. Defaults to True. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal + Loss. Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive examples with the iou target. Defaults to True. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + """ + super().__init__() + assert use_sigmoid is True, \ + 'Only sigmoid varifocal loss supported now.' + assert alpha >= 0.0 + self.use_sigmoid = use_sigmoid + self.alpha = alpha + self.gamma = gamma + self.iou_weighted = iou_weighted + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (Tensor): The prediction with shape (N, C), C is the + number of classes. + target (Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is + the number of classes. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". 
+ + Returns: + Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + loss_cls = self.loss_weight * varifocal_loss( + pred, + target, + weight, + alpha=self.alpha, + gamma=self.gamma, + iou_weighted=self.iou_weighted, + reduction=reduction, + avg_factor=avg_factor) + else: + raise NotImplementedError + return loss_cls diff --git a/mmdetection/mmdet/models/mot/__init__.py b/mmdetection/mmdet/models/mot/__init__.py new file mode 100644 index 0000000..1bd3c8d --- /dev/null +++ b/mmdetection/mmdet/models/mot/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseMOTModel +from .bytetrack import ByteTrack +from .deep_sort import DeepSORT +from .ocsort import OCSORT +from .qdtrack import QDTrack +from .strongsort import StrongSORT + +__all__ = [ + 'BaseMOTModel', 'ByteTrack', 'QDTrack', 'DeepSORT', 'StrongSORT', 'OCSORT' +] diff --git a/mmdetection/mmdet/models/mot/base.py b/mmdetection/mmdet/models/mot/base.py new file mode 100644 index 0000000..9981417 --- /dev/null +++ b/mmdetection/mmdet/models/mot/base.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple, Union + +from mmengine.model import BaseModel +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptTrackSampleList, TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class BaseMOTModel(BaseModel, metaclass=ABCMeta): + """Base class for multiple object tracking. + + Args: + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Initialization config dict. 
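A hedged sketch of the VarifocalLoss above (export path and tensors assumed). The targets are IoU-aware classification scores: zero for negatives and the IoU with the matched ground truth for positives, so positives are weighted by their target value while negatives get the alpha * p^gamma focal down-weighting.

    import torch
    from mmdet.models.losses import VarifocalLoss  # assumed export path

    logits = torch.randn(3, 5)   # (N, C) classification logits
    targets = torch.zeros(3, 5)  # IoU-aware targets, 0 for negatives
    targets[0, 2] = 0.8          # IoU with the matched GT for a positive
    targets[1, 4] = 0.6

    loss_fn = VarifocalLoss(alpha=0.75, gamma=2.0, iou_weighted=True)
    loss = loss_fn(logits, targets)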
+ """ + + def __init__(self, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + def freeze_module(self, module: Union[List[str], Tuple[str], str]) -> None: + """Freeze module during training.""" + if isinstance(module, str): + modules = [module] + else: + if not (isinstance(module, list) or isinstance(module, tuple)): + raise TypeError('module must be a str or a list.') + else: + modules = module + for module in modules: + m = getattr(self, module) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + @property + def with_detector(self) -> bool: + """bool: whether the framework has a detector.""" + return hasattr(self, 'detector') and self.detector is not None + + @property + def with_reid(self) -> bool: + """bool: whether the framework has a reid model.""" + return hasattr(self, 'reid') and self.reid is not None + + @property + def with_motion(self) -> bool: + """bool: whether the framework has a motion model.""" + return hasattr(self, 'motion') and self.motion is not None + + @property + def with_track_head(self) -> bool: + """bool: whether the framework has a track_head.""" + return hasattr(self, 'track_head') and self.track_head is not None + + @property + def with_tracker(self) -> bool: + """bool: whether the framework has a tracker.""" + return hasattr(self, 'tracker') and self.tracker is not None + + def forward(self, + inputs: Dict[str, Tensor], + data_samples: OptTrackSampleList = None, + mode: str = 'predict', + **kwargs): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`TrackDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to 'predict'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`TrackDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + elif mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'tensor': + return self._forward(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". 
' + 'Only supports loss, predict and tensor mode') + + @abstractmethod + def loss(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList, + **kwargs) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples.""" + pass + + @abstractmethod + def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList, + **kwargs) -> TrackSampleList: + """Predict results from a batch of inputs and data samples with post- + processing.""" + pass + + def _forward(self, + inputs: Dict[str, Tensor], + data_samples: OptTrackSampleList = None, + **kwargs): + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W). + data_samples (List[:obj:`TrackDataSample`], optional): The + Data Samples. It usually includes information such as + `gt_instance`. + + Returns: + tuple[list]: A tuple of features from ``head`` forward. + """ + raise NotImplementedError( + "_forward function (namely 'tensor' mode) is not supported now") diff --git a/mmdetection/mmdet/models/mot/bytetrack.py b/mmdetection/mmdet/models/mot/bytetrack.py new file mode 100644 index 0000000..8a3bb86 --- /dev/null +++ b/mmdetection/mmdet/models/mot/bytetrack.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList, TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig +from .base import BaseMOTModel + + +@MODELS.register_module() +class ByteTrack(BaseMOTModel): + """ByteTrack: Multi-Object Tracking by Associating Every Detection Box. + + This multi object tracker is the implementation of `ByteTrack + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + def loss(self, inputs: Tensor, data_samples: SampleList, **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Tensor): of shape (N, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size + data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + return self.detector.loss(inputs, data_samples, **kwargs) + + def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post-processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. 
It usually includes information such + as `video_data_samples`. + Returns: + TrackSampleList: Tracking results of the inputs. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'Bytetrack inference only support ' \ + '1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'Bytetrack inference only support 1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + # det_results List[DetDataSample] + det_results = self.detector.predict(single_img, [img_data_sample]) + assert len(det_results) == 1, 'Batch inference is not supported.' + + pred_track_instances = self.tracker.track( + data_sample=det_results[0], **kwargs) + img_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/mmdetection/mmdet/models/mot/deep_sort.py b/mmdetection/mmdet/models/mot/deep_sort.py new file mode 100644 index 0000000..70b30c7 --- /dev/null +++ b/mmdetection/mmdet/models/mot/deep_sort.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType +from .base import BaseMOTModel + + +@MODELS.register_module() +class DeepSORT(BaseMOTModel): + """Simple online and realtime tracking with a deep association metric. + + Details can be found at `DeepSORT`_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + reid (dict): Configuration of reid. Defaults to None + tracker (dict): Configuration of tracker. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + reid: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + + if reid is not None: + self.reid = MODELS.build(reid) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + self.preprocess_cfg = data_preprocessor + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + raise NotImplementedError( + 'Please train `detector` and `reid` models firstly, then \ + inference with SORT/DeepSORT.') + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post- processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of key frames + and reference frames. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. 
+ rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: List[TrackDataSample] + Tracking results of the input videos. + Each DetDataSample usually contains ``pred_track_instances``. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + if track_data_sample[0].frame_id == 0: + self.tracker.reset() + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + # det_results List[DetDataSample] + det_results = self.detector.predict(single_img, [img_data_sample]) + assert len(det_results) == 1, 'Batch inference is not supported.' + + pred_track_instances = self.tracker.track( + model=self, + img=single_img, + feats=None, + data_sample=det_results[0], + data_preprocessor=self.preprocess_cfg, + rescale=rescale, + **kwargs) + img_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/mmdetection/mmdet/models/mot/ocsort.py b/mmdetection/mmdet/models/mot/ocsort.py new file mode 100644 index 0000000..abf4eb3 --- /dev/null +++ b/mmdetection/mmdet/models/mot/ocsort.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Dict, Optional + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig +from .base import BaseMOTModel + + +@MODELS.register_module() +class OCSORT(BaseMOTModel): + """OCOSRT: Observation-Centric SORT: Rethinking SORT for Robust + Multi-Object Tracking + + This multi object tracker is the implementation of `OC-SORT + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + motion (dict): Configuration of motion. Defaults to None. + init_cfg (dict): Configuration of initialization. Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + return self.detector.loss(inputs, data_samples, **kwargs) + + def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post-processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + Returns: + TrackSampleList: Tracking results of the inputs. 
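+
+        A shape sketch (illustrative sizes only; building the full detector
+        and tracker is omitted here):
+
+        Example:
+            >>> import torch
+            >>> inputs = torch.rand(1, 3, 3, 256, 256)  # (N=1, T=3, C, H, W)
+            >>> inputs.dim(), inputs.size(0)
+            (5, 1)
+            >>> # with a fully built model:
+            >>> # results = model.predict(inputs, [track_data_sample])
+            >>> # len(results) == 1, and every frame of results[0] gains a
+            >>> # ``pred_track_instances`` field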
+ """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'OCSORT inference only support ' \ + '1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'OCSORT inference only support 1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + # det_results List[DetDataSample] + det_results = self.detector.predict(single_img, [img_data_sample]) + assert len(det_results) == 1, 'Batch inference is not supported.' + + pred_track_instances = self.tracker.track( + data_sample=det_results[0], **kwargs) + img_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/mmdetection/mmdet/models/mot/qdtrack.py b/mmdetection/mmdet/models/mot/qdtrack.py new file mode 100644 index 0000000..43d5dd6 --- /dev/null +++ b/mmdetection/mmdet/models/mot/qdtrack.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig +from .base import BaseMOTModel + + +@MODELS.register_module() +class QDTrack(BaseMOTModel): + """Quasi-Dense Similarity Learning for Multiple Object Tracking. + + This multi object tracker is the implementation of `QDTrack + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + track_head (dict): Configuration of track head. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + freeze_detector (bool): If True, freeze the detector weights. + Defaults to False. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + track_head: Optional[dict] = None, + tracker: Optional[dict] = None, + freeze_detector: bool = False, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + if detector is not None: + self.detector = MODELS.build(detector) + + if track_head is not None: + self.track_head = MODELS.build(track_head) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + self.freeze_detector = freeze_detector + if self.freeze_detector: + self.freeze_module('detector') + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post- processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: Tracking results of the inputs. 
+ """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'QDTrack inference only support 1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'QDTrack only support 1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + if track_data_sample[0].frame_id == 0: + self.tracker.reset() + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + x = self.detector.extract_feat(single_img) + rpn_results_list = self.detector.rpn_head.predict( + x, [img_data_sample]) + # det_results List[InstanceData] + det_results = self.detector.roi_head.predict( + x, rpn_results_list, [img_data_sample], rescale=rescale) + assert len(det_results) == 1, 'Batch inference is not supported.' + img_data_sample.pred_instances = det_results[0] + frame_pred_track_instances = self.tracker.track( + model=self, + img=single_img, + feats=x, + data_sample=img_data_sample, + **kwargs) + img_data_sample.pred_track_instances = frame_pred_track_instances + + return [track_data_sample] + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size. The T denotes the number of + frames. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + + Returns: + dict: A dictionary of loss components. + """ + # modify the inputs shape to fit mmdet + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(1) == 2, \ + 'QDTrack can only have 1 key frame and 1 reference frame.' + + # split the data_samples into two aspects: key frames and reference + # frames + ref_data_samples, key_data_samples = [], [] + key_frame_inds, ref_frame_inds = [], [] + # set cat_id of gt_labels to 0 in RPN + for track_data_sample in data_samples: + key_frame_inds.append(track_data_sample.key_frames_inds[0]) + ref_frame_inds.append(track_data_sample.ref_frames_inds[0]) + key_data_sample = track_data_sample.get_key_frames()[0] + key_data_sample.gt_instances.labels = \ + torch.zeros_like(key_data_sample.gt_instances.labels) + key_data_samples.append(key_data_sample) + ref_data_sample = track_data_sample.get_ref_frames()[0] + ref_data_samples.append(ref_data_sample) + + key_frame_inds = torch.tensor(key_frame_inds, dtype=torch.int64) + ref_frame_inds = torch.tensor(ref_frame_inds, dtype=torch.int64) + batch_inds = torch.arange(len(inputs)) + key_imgs = inputs[batch_inds, key_frame_inds].contiguous() + ref_imgs = inputs[batch_inds, ref_frame_inds].contiguous() + + x = self.detector.extract_feat(key_imgs) + ref_x = self.detector.extract_feat(ref_imgs) + + losses = dict() + # RPN head forward and loss + assert self.detector.with_rpn, \ + 'QDTrack only support detector with RPN.' + + proposal_cfg = self.detector.train_cfg.get('rpn_proposal', + self.detector.test_cfg.rpn) + rpn_losses, rpn_results_list = self.detector.rpn_head. 
\ + loss_and_predict(x, + key_data_samples, + proposal_cfg=proposal_cfg, + **kwargs) + ref_rpn_results_list = self.detector.rpn_head.predict( + ref_x, ref_data_samples, **kwargs) + + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in keys: + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + + # roi_head loss + losses_detect = self.detector.roi_head.loss(x, rpn_results_list, + key_data_samples, **kwargs) + losses.update(losses_detect) + + # tracking head loss + losses_track = self.track_head.loss(x, ref_x, rpn_results_list, + ref_rpn_results_list, data_samples, + **kwargs) + losses.update(losses_track) + + return losses diff --git a/mmdetection/mmdet/models/mot/strongsort.py b/mmdetection/mmdet/models/mot/strongsort.py new file mode 100644 index 0000000..6129bf4 --- /dev/null +++ b/mmdetection/mmdet/models/mot/strongsort.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import numpy as np +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType +from .deep_sort import DeepSORT + + +@MODELS.register_module() +class StrongSORT(DeepSORT): + """StrongSORT: Make DeepSORT Great Again. + + Details can be found at `StrongSORT`_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + reid (dict): Configuration of reid. Defaults to None + tracker (dict): Configuration of tracker. Defaults to None. + kalman (dict): Configuration of Kalman filter. Defaults to None. + cmc (dict): Configuration of camera model compensation. + Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + reid: Optional[dict] = None, + cmc: Optional[dict] = None, + tracker: Optional[dict] = None, + postprocess_model: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(detector, reid, tracker, data_preprocessor, init_cfg) + + if cmc is not None: + self.cmc = TASK_UTILS.build(cmc) + + if postprocess_model is not None: + self.postprocess_model = TASK_UTILS.build(postprocess_model) + + @property + def with_cmc(self): + """bool: whether the framework has a camera model compensation + model. + """ + return hasattr(self, 'cmc') and self.cmc is not None + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post- processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of key frames + and reference frames. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: List[TrackDataSample] + Tracking results of the input videos. 
+ Each DetDataSample usually contains ``pred_track_instances``. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + + video_track_instances = [] + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + # det_results List[DetDataSample] + det_results = self.detector.predict(single_img, [img_data_sample]) + assert len(det_results) == 1, 'Batch inference is not supported.' + + pred_track_instances = self.tracker.track( + model=self, + img=single_img, + data_sample=det_results[0], + data_preprocessor=self.preprocess_cfg, + rescale=rescale, + **kwargs) + for i in range(len(pred_track_instances.instances_id)): + video_track_instances.append( + np.array([ + frame_id + 1, + pred_track_instances.instances_id[i].cpu(), + pred_track_instances.bboxes[i][0].cpu(), + pred_track_instances.bboxes[i][1].cpu(), + (pred_track_instances.bboxes[i][2] - + pred_track_instances.bboxes[i][0]).cpu(), + (pred_track_instances.bboxes[i][3] - + pred_track_instances.bboxes[i][1]).cpu(), + pred_track_instances.scores[i].cpu() + ])) + video_track_instances = np.array(video_track_instances).reshape(-1, 7) + video_track_instances = self.postprocess_model.forward( + video_track_instances) + for frame_id in range(video_len): + track_data_sample[frame_id].pred_track_instances = \ + InstanceData(bboxes=video_track_instances[ + video_track_instances[:, 0] == frame_id + 1, :]) + + return [track_data_sample] diff --git a/mmdetection/mmdet/models/necks/__init__.py b/mmdetection/mmdet/models/necks/__init__.py new file mode 100644 index 0000000..343fbfe --- /dev/null +++ b/mmdetection/mmdet/models/necks/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bfp import BFP +from .channel_mapper import ChannelMapper +from .cspnext_pafpn import CSPNeXtPAFPN +from .ct_resnet_neck import CTResNetNeck +from .dilated_encoder import DilatedEncoder +from .dyhead import DyHead +from .fpg import FPG +from .fpn import FPN +from .fpn_carafe import FPN_CARAFE +from .fpn_dropblock import FPN_DropBlock +from .hrfpn import HRFPN +from .nas_fpn import NASFPN +from .nasfcos_fpn import NASFCOS_FPN +from .pafpn import PAFPN +from .rfp import RFP +from .ssd_neck import SSDNeck +from .ssh import SSH +from .yolo_neck import YOLOV3Neck +from .yolox_pafpn import YOLOXPAFPN + +__all__ = [ + 'FPN', 'BFP', 'ChannelMapper', 'HRFPN', 'NASFPN', 'FPN_CARAFE', 'PAFPN', + 'NASFCOS_FPN', 'RFP', 'YOLOV3Neck', 'FPG', 'DilatedEncoder', + 'CTResNetNeck', 'SSDNeck', 'YOLOXPAFPN', 'DyHead', 'CSPNeXtPAFPN', 'SSH', + 'FPN_DropBlock' +] diff --git a/mmdetection/mmdet/models/necks/bfp.py b/mmdetection/mmdet/models/necks/bfp.py new file mode 100644 index 0000000..401cdb0 --- /dev/null +++ b/mmdetection/mmdet/models/necks/bfp.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
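+# A usage sketch for the BFP neck defined below (illustrative channel count
+# and feature sizes, not taken from a real config):
+#
+#   feats = tuple(torch.rand(1, 256, s, s) for s in (64, 32, 16, 8))
+#   neck = BFP(in_channels=256, num_levels=4, refine_type='conv')
+#   outs = neck(feats)  # same shapes as the inputs, rebalanced across levels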
+from typing import Tuple + +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import NonLocal2d +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class BFP(BaseModule): + """BFP (Balanced Feature Pyramids) + + BFP takes multi-level features as inputs and gather them into a single one, + then refine the gathered feature and scatter the refined results to + multi-level features. This module is used in Libra R-CNN (CVPR 2019), see + the paper `Libra R-CNN: Towards Balanced Learning for Object Detection + `_ for details. + + Args: + in_channels (int): Number of input channels (feature maps of all levels + should have the same channels). + num_levels (int): Number of input feature levels. + refine_level (int): Index of integration and refine level of BSF in + multi-level features from bottom to top. + refine_type (str): Type of the refine op, currently support + [None, 'conv', 'non_local']. + conv_cfg (:obj:`ConfigDict` or dict, optional): The config dict for + convolution layers. + norm_cfg (:obj:`ConfigDict` or dict, optional): The config dict for + normalization layers. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or + dict], optional): Initialization config dict. + """ + + def __init__( + self, + in_channels: int, + num_levels: int, + refine_level: int = 2, + refine_type: str = None, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert refine_type in [None, 'conv', 'non_local'] + + self.in_channels = in_channels + self.num_levels = num_levels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.refine_level = refine_level + self.refine_type = refine_type + assert 0 <= self.refine_level < self.num_levels + + if self.refine_type == 'conv': + self.refine = ConvModule( + self.in_channels, + self.in_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + elif self.refine_type == 'non_local': + self.refine = NonLocal2d( + self.in_channels, + reduction=1, + use_scale=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function.""" + assert len(inputs) == self.num_levels + + # step 1: gather multi-level features by resize and average + feats = [] + gather_size = inputs[self.refine_level].size()[2:] + for i in range(self.num_levels): + if i < self.refine_level: + gathered = F.adaptive_max_pool2d( + inputs[i], output_size=gather_size) + else: + gathered = F.interpolate( + inputs[i], size=gather_size, mode='nearest') + feats.append(gathered) + + bsf = sum(feats) / len(feats) + + # step 2: refine gathered features + if self.refine_type is not None: + bsf = self.refine(bsf) + + # step 3: scatter refined features to multi-levels by a residual path + outs = [] + for i in range(self.num_levels): + out_size = inputs[i].size()[2:] + if i < self.refine_level: + residual = F.interpolate(bsf, size=out_size, mode='nearest') + else: + residual = F.adaptive_max_pool2d(bsf, output_size=out_size) + outs.append(residual + inputs[i]) + + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/channel_mapper.py b/mmdetection/mmdet/models/necks/channel_mapper.py new file mode 100644 index 0000000..7429361 --- /dev/null +++ 
b/mmdetection/mmdet/models/necks/channel_mapper.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class ChannelMapper(BaseModule): + """Channel Mapper to reduce/increase channels of backbone features. + + This is used to reduce/increase channels of backbone features. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + kernel_size (int, optional): kernel_size for reducing channels (used + at each scale). Default: 3. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Default: None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Default: None. + act_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + activation layer in ConvModule. Default: dict(type='ReLU'). + bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + num_outs (int, optional): Number of output feature maps. There would + be extra_convs when num_outs larger than the length of in_channels. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or dict], + optional): Initialization config dict. + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = ChannelMapper(in_channels, 11, 3).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + kernel_size: int = 3, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + act_cfg: OptConfigType = dict(type='ReLU'), + bias: Union[bool, str] = 'auto', + num_outs: int = None, + init_cfg: OptMultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, list) + self.extra_convs = None + if num_outs is None: + num_outs = len(in_channels) + self.convs = nn.ModuleList() + for in_channel in in_channels: + self.convs.append( + ConvModule( + in_channel, + out_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=bias)) + if num_outs > len(in_channels): + self.extra_convs = nn.ModuleList() + for i in range(len(in_channels), num_outs): + if i == len(in_channels): + in_channel = in_channels[-1] + else: + in_channel = out_channels + self.extra_convs.append( + ConvModule( + in_channel, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=bias)) + + def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function.""" + assert len(inputs) == len(self.convs) + outs = [self.convs[i](inputs[i]) for i in range(len(inputs))] + if self.extra_convs: + for i in range(len(self.extra_convs)): + if i == 0: + outs.append(self.extra_convs[0](inputs[-1])) + else: + outs.append(self.extra_convs[i](outs[-1])) + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/cspnext_pafpn.py b/mmdetection/mmdet/models/necks/cspnext_pafpn.py new file mode 100644 index 0000000..a52ba72 --- /dev/null +++ b/mmdetection/mmdet/models/necks/cspnext_pafpn.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from ..layers import CSPLayer + + +@MODELS.register_module() +class CSPNeXtPAFPN(BaseModule): + """Path Aggregation Network with CSPNeXt blocks. + + Args: + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_csp_blocks (int): Number of bottlenecks in CSPLayer. + Defaults to 3. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Default: 0.5 + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(scale_factor=2, mode='nearest')` + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish') + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
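+
+    Example (a small sketch with made-up channel counts and feature sizes,
+    not taken from a real config):
+        >>> import torch
+        >>> in_channels = [128, 256, 512]
+        >>> sizes = [64, 32, 16]
+        >>> feats = tuple(torch.rand(1, c, s, s)
+        ...               for c, s in zip(in_channels, sizes))
+        >>> neck = CSPNeXtPAFPN(in_channels, out_channels=128).eval()
+        >>> outs = neck(feats)
+        >>> [tuple(o.shape) for o in outs]
+        [(1, 128, 64, 64), (1, 128, 32, 32), (1, 128, 16, 16)]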
+ """ + + def __init__( + self, + in_channels: Sequence[int], + out_channels: int, + num_csp_blocks: int = 3, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'), + conv_cfg: bool = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + super().__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + # build top-down blocks + self.upsample = nn.Upsample(**upsample_cfg) + self.reduce_layers = nn.ModuleList() + self.top_down_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.reduce_layers.append( + ConvModule( + in_channels[idx], + in_channels[idx - 1], + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.top_down_blocks.append( + CSPLayer( + in_channels[idx - 1] * 2, + in_channels[idx - 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + use_cspnext_block=True, + expand_ratio=expand_ratio, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # build bottom-up blocks + self.downsamples = nn.ModuleList() + self.bottom_up_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv( + in_channels[idx], + in_channels[idx], + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottom_up_blocks.append( + CSPLayer( + in_channels[idx] * 2, + in_channels[idx + 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + use_cspnext_block=True, + expand_ratio=expand_ratio, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.out_convs = nn.ModuleList() + for i in range(len(in_channels)): + self.out_convs.append( + conv( + in_channels[i], + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]: + """ + Args: + inputs (tuple[Tensor]): input features. + + Returns: + tuple[Tensor]: YOLOXPAFPN features. 
+ """ + assert len(inputs) == len(self.in_channels) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + feat_heigh = self.reduce_layers[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = self.upsample(feat_heigh) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + torch.cat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx]( + torch.cat([downsample_feat, feat_height], 1)) + outs.append(out) + + # out convs + for idx, conv in enumerate(self.out_convs): + outs[idx] = conv(outs[idx]) + + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/ct_resnet_neck.py b/mmdetection/mmdet/models/necks/ct_resnet_neck.py new file mode 100644 index 0000000..9109fe7 --- /dev/null +++ b/mmdetection/mmdet/models/necks/ct_resnet_neck.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import OptMultiConfig + + +@MODELS.register_module() +class CTResNetNeck(BaseModule): + """The neck used in `CenterNet `_ for + object classification and box regression. + + Args: + in_channels (int): Number of input channels. + num_deconv_filters (tuple[int]): Number of filters per stage. + num_deconv_kernels (tuple[int]): Number of kernels per stage. + use_dcn (bool): If True, use DCNv2. Defaults to True. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization + config dict. 
+ """ + + def __init__(self, + in_channels: int, + num_deconv_filters: Tuple[int, ...], + num_deconv_kernels: Tuple[int, ...], + use_dcn: bool = True, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + assert len(num_deconv_filters) == len(num_deconv_kernels) + self.fp16_enabled = False + self.use_dcn = use_dcn + self.in_channels = in_channels + self.deconv_layers = self._make_deconv_layer(num_deconv_filters, + num_deconv_kernels) + + def _make_deconv_layer( + self, num_deconv_filters: Tuple[int, ...], + num_deconv_kernels: Tuple[int, ...]) -> nn.Sequential: + """use deconv layers to upsample backbone's output.""" + layers = [] + for i in range(len(num_deconv_filters)): + feat_channels = num_deconv_filters[i] + conv_module = ConvModule( + self.in_channels, + feat_channels, + 3, + padding=1, + conv_cfg=dict(type='DCNv2') if self.use_dcn else None, + norm_cfg=dict(type='BN')) + layers.append(conv_module) + upsample_module = ConvModule( + feat_channels, + feat_channels, + num_deconv_kernels[i], + stride=2, + padding=1, + conv_cfg=dict(type='deconv'), + norm_cfg=dict(type='BN')) + layers.append(upsample_module) + self.in_channels = feat_channels + + return nn.Sequential(*layers) + + def init_weights(self) -> None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, nn.ConvTranspose2d): + # In order to be consistent with the source code, + # reset the ConvTranspose2d initialization parameters + m.reset_parameters() + # Simulated bilinear upsampling kernel + w = m.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. * f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * ( + 1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + # self.use_dcn is False + elif not self.use_dcn and isinstance(m, nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + + def forward(self, x: Sequence[torch.Tensor]) -> Tuple[torch.Tensor]: + """model forward.""" + assert isinstance(x, (list, tuple)) + outs = self.deconv_layers(x[-1]) + return outs, diff --git a/mmdetection/mmdet/models/necks/dilated_encoder.py b/mmdetection/mmdet/models/necks/dilated_encoder.py new file mode 100644 index 0000000..e9beb3e --- /dev/null +++ b/mmdetection/mmdet/models/necks/dilated_encoder.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, is_norm +from mmengine.model import caffe2_xavier_init, constant_init, normal_init +from torch.nn import BatchNorm2d + +from mmdet.registry import MODELS + + +class Bottleneck(nn.Module): + """Bottleneck block for DilatedEncoder used in `YOLOF. + + `. + + The Bottleneck contains three ConvLayers and one residual connection. + + Args: + in_channels (int): The number of input channels. + mid_channels (int): The number of middle output channels. + dilation (int): Dilation rate. + norm_cfg (dict): Dictionary to construct and config norm layer. 
+ """ + + def __init__(self, + in_channels, + mid_channels, + dilation, + norm_cfg=dict(type='BN', requires_grad=True)): + super(Bottleneck, self).__init__() + self.conv1 = ConvModule( + in_channels, mid_channels, 1, norm_cfg=norm_cfg) + self.conv2 = ConvModule( + mid_channels, + mid_channels, + 3, + padding=dilation, + dilation=dilation, + norm_cfg=norm_cfg) + self.conv3 = ConvModule( + mid_channels, in_channels, 1, norm_cfg=norm_cfg) + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + out = out + identity + return out + + +@MODELS.register_module() +class DilatedEncoder(nn.Module): + """Dilated Encoder for YOLOF `. + + This module contains two types of components: + - the original FPN lateral convolution layer and fpn convolution layer, + which are 1x1 conv + 3x3 conv + - the dilated residual block + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + block_mid_channels (int): The number of middle block output channels + num_residual_blocks (int): The number of residual blocks. + block_dilations (list): The list of residual blocks dilation. + """ + + def __init__(self, in_channels, out_channels, block_mid_channels, + num_residual_blocks, block_dilations): + super(DilatedEncoder, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.block_mid_channels = block_mid_channels + self.num_residual_blocks = num_residual_blocks + self.block_dilations = block_dilations + self._init_layers() + + def _init_layers(self): + self.lateral_conv = nn.Conv2d( + self.in_channels, self.out_channels, kernel_size=1) + self.lateral_norm = BatchNorm2d(self.out_channels) + self.fpn_conv = nn.Conv2d( + self.out_channels, self.out_channels, kernel_size=3, padding=1) + self.fpn_norm = BatchNorm2d(self.out_channels) + encoder_blocks = [] + for i in range(self.num_residual_blocks): + dilation = self.block_dilations[i] + encoder_blocks.append( + Bottleneck( + self.out_channels, + self.block_mid_channels, + dilation=dilation)) + self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks) + + def init_weights(self): + caffe2_xavier_init(self.lateral_conv) + caffe2_xavier_init(self.fpn_conv) + for m in [self.lateral_norm, self.fpn_norm]: + constant_init(m, 1) + for m in self.dilated_encoder_blocks.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + + def forward(self, feature): + out = self.lateral_norm(self.lateral_conv(feature[-1])) + out = self.fpn_norm(self.fpn_conv(out)) + return self.dilated_encoder_blocks(out), diff --git a/mmdetection/mmdet/models/necks/dyhead.py b/mmdetection/mmdet/models/necks/dyhead.py new file mode 100644 index 0000000..5f5ae0b --- /dev/null +++ b/mmdetection/mmdet/models/necks/dyhead.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d +from mmengine.model import BaseModule, constant_init, normal_init + +from mmdet.registry import MODELS +from ..layers import DyReLU + +# Reference: +# https://github.com/microsoft/DynamicHead +# https://github.com/jshilong/SEPC + + +class DyDCNv2(nn.Module): + """ModulatedDeformConv2d with normalization layer used in DyHead. 
+ + This module cannot be configured with `conv_cfg=dict(type='DCNv2')` + because DyHead calculates offset and mask from middle-level feature. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int | tuple[int], optional): Stride of the convolution. + Default: 1. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='GN', num_groups=16, requires_grad=True). + """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True)): + super().__init__() + self.with_norm = norm_cfg is not None + bias = not self.with_norm + self.conv = ModulatedDeformConv2d( + in_channels, out_channels, 3, stride=stride, padding=1, bias=bias) + if self.with_norm: + self.norm = build_norm_layer(norm_cfg, out_channels)[1] + + def forward(self, x, offset, mask): + """Forward function.""" + x = self.conv(x.contiguous(), offset, mask) + if self.with_norm: + x = self.norm(x) + return x + + +class DyHeadBlock(nn.Module): + """DyHead Block with three types of attention. + + HSigmoid arguments in default act_cfg follow official code, not paper. + https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + zero_init_offset (bool, optional): Whether to use zero init for + `spatial_conv_offset`. Default: True. + act_cfg (dict, optional): Config dict for the last activation layer of + scale-aware attention. Default: dict(type='HSigmoid', bias=3.0, + divisor=6.0). + """ + + def __init__(self, + in_channels, + out_channels, + zero_init_offset=True, + act_cfg=dict(type='HSigmoid', bias=3.0, divisor=6.0)): + super().__init__() + self.zero_init_offset = zero_init_offset + # (offset_x, offset_y, mask) * kernel_size_y * kernel_size_x + self.offset_and_mask_dim = 3 * 3 * 3 + self.offset_dim = 2 * 3 * 3 + + self.spatial_conv_high = DyDCNv2(in_channels, out_channels) + self.spatial_conv_mid = DyDCNv2(in_channels, out_channels) + self.spatial_conv_low = DyDCNv2(in_channels, out_channels, stride=2) + self.spatial_conv_offset = nn.Conv2d( + in_channels, self.offset_and_mask_dim, 3, padding=1) + self.scale_attn_module = nn.Sequential( + nn.AdaptiveAvgPool2d(1), nn.Conv2d(out_channels, 1, 1), + nn.ReLU(inplace=True), build_activation_layer(act_cfg)) + self.task_attn_module = DyReLU(out_channels) + self._init_weights() + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, 0, 0.01) + if self.zero_init_offset: + constant_init(self.spatial_conv_offset, 0) + + def forward(self, x): + """Forward function.""" + outs = [] + for level in range(len(x)): + # calculate offset and mask of DCNv2 from middle-level feature + offset_and_mask = self.spatial_conv_offset(x[level]) + offset = offset_and_mask[:, :self.offset_dim, :, :] + mask = offset_and_mask[:, self.offset_dim:, :, :].sigmoid() + + mid_feat = self.spatial_conv_mid(x[level], offset, mask) + sum_feat = mid_feat * self.scale_attn_module(mid_feat) + summed_levels = 1 + if level > 0: + low_feat = self.spatial_conv_low(x[level - 1], offset, mask) + sum_feat += low_feat * self.scale_attn_module(low_feat) + summed_levels += 1 + if level < len(x) - 1: + # this upsample order is weird, but faster than natural order + # https://github.com/microsoft/DynamicHead/issues/25 + high_feat = F.interpolate( + self.spatial_conv_high(x[level + 1], offset, mask), + size=x[level].shape[-2:], + 
mode='bilinear', + align_corners=True) + sum_feat += high_feat * self.scale_attn_module(high_feat) + summed_levels += 1 + outs.append(self.task_attn_module(sum_feat / summed_levels)) + + return outs + + +@MODELS.register_module() +class DyHead(BaseModule): + """DyHead neck consisting of multiple DyHead Blocks. + + See `Dynamic Head: Unifying Object Detection Heads with Attentions + `_ for details. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_blocks (int, optional): Number of DyHead Blocks. Default: 6. + zero_init_offset (bool, optional): Whether to use zero init for + `spatial_conv_offset`. Default: True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + num_blocks=6, + zero_init_offset=True, + init_cfg=None): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_blocks = num_blocks + self.zero_init_offset = zero_init_offset + + dyhead_blocks = [] + for i in range(num_blocks): + in_channels = self.in_channels if i == 0 else self.out_channels + dyhead_blocks.append( + DyHeadBlock( + in_channels, + self.out_channels, + zero_init_offset=zero_init_offset)) + self.dyhead_blocks = nn.Sequential(*dyhead_blocks) + + def forward(self, inputs): + """Forward function.""" + assert isinstance(inputs, (tuple, list)) + outs = self.dyhead_blocks(inputs) + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/fpg.py b/mmdetection/mmdet/models/necks/fpg.py new file mode 100644 index 0000000..73ee799 --- /dev/null +++ b/mmdetection/mmdet/models/necks/fpg.py @@ -0,0 +1,406 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +class Transition(BaseModule): + """Base class for transition. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + """ + + def __init__(self, in_channels, out_channels, init_cfg=None): + super().__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + + def forward(x): + pass + + +class UpInterpolationConv(Transition): + """A transition used for up-sampling. + + Up-sample the input by interpolation then refines the feature by + a convolution layer. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scale_factor (int): Up-sampling factor. Default: 2. + mode (int): Interpolation mode. Default: nearest. + align_corners (bool): Whether align corners when interpolation. + Default: None. + kernel_size (int): Kernel size for the conv. Default: 3. 
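+
+    Example (illustrative numbers; interpolation doubles the resolution
+    before the 3x3 refinement conv):
+        >>> import torch
+        >>> trans = UpInterpolationConv(256, 256, scale_factor=2)
+        >>> trans(torch.rand(1, 256, 16, 16)).shape
+        torch.Size([1, 256, 32, 32])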
+ """ + + def __init__(self, + in_channels, + out_channels, + scale_factor=2, + mode='nearest', + align_corners=None, + kernel_size=3, + init_cfg=None, + **kwargs): + super().__init__(in_channels, out_channels, init_cfg) + self.mode = mode + self.scale_factor = scale_factor + self.align_corners = align_corners + self.conv = ConvModule( + in_channels, + out_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + **kwargs) + + def forward(self, x): + x = F.interpolate( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners) + x = self.conv(x) + return x + + +class LastConv(Transition): + """A transition used for refining the output of the last stage. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_inputs (int): Number of inputs of the FPN features. + kernel_size (int): Kernel size for the conv. Default: 3. + """ + + def __init__(self, + in_channels, + out_channels, + num_inputs, + kernel_size=3, + init_cfg=None, + **kwargs): + super().__init__(in_channels, out_channels, init_cfg) + self.num_inputs = num_inputs + self.conv_out = ConvModule( + in_channels, + out_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + **kwargs) + + def forward(self, inputs): + assert len(inputs) == self.num_inputs + return self.conv_out(inputs[-1]) + + +@MODELS.register_module() +class FPG(BaseModule): + """FPG. + + Implementation of `Feature Pyramid Grids (FPG) + `_. + This implementation only gives the basic structure stated in the paper. + But users can implement different type of transitions to fully explore the + the potential power of the structure of FPG. + + Args: + in_channels (int): Number of input channels (feature maps of all levels + should have the same channels). + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + stack_times (int): The number of times the pyramid architecture will + be stacked. + paths (list[str]): Specify the path order of each stack level. + Each element in the list should be either 'bu' (bottom-up) or + 'td' (top-down). + inter_channels (int): Number of inter channels. + same_up_trans (dict): Transition that goes down at the same stage. + same_down_trans (dict): Transition that goes up at the same stage. + across_lateral_trans (dict): Across-pathway same-stage + across_down_trans (dict): Across-pathway bottom-up connection. + across_up_trans (dict): Across-pathway top-down connection. + across_skip_trans (dict): Across-pathway skip connection. + output_trans (dict): Transition that trans the output of the + last stage. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool): It decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + norm_cfg (dict): Config dict for normalization layer. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ + + transition_types = { + 'conv': ConvModule, + 'interpolation_conv': UpInterpolationConv, + 'last_conv': LastConv, + } + + def __init__(self, + in_channels, + out_channels, + num_outs, + stack_times, + paths, + inter_channels=None, + same_down_trans=None, + same_up_trans=dict( + type='conv', kernel_size=3, stride=2, padding=1), + across_lateral_trans=dict(type='conv', kernel_size=1), + across_down_trans=dict(type='conv', kernel_size=3), + across_up_trans=None, + across_skip_trans=dict(type='identity'), + output_trans=dict(type='last_conv', kernel_size=3), + start_level=0, + end_level=-1, + add_extra_convs=False, + norm_cfg=None, + skip_inds=None, + init_cfg=[ + dict(type='Caffe2Xavier', layer='Conv2d'), + dict( + type='Constant', + layer=[ + '_BatchNorm', '_InstanceNorm', 'GroupNorm', + 'LayerNorm' + ], + val=1.0) + ]): + super(FPG, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + if inter_channels is None: + self.inter_channels = [out_channels for _ in range(num_outs)] + elif isinstance(inter_channels, int): + self.inter_channels = [inter_channels for _ in range(num_outs)] + else: + assert isinstance(inter_channels, list) + assert len(inter_channels) == num_outs + self.inter_channels = inter_channels + self.stack_times = stack_times + self.paths = paths + assert isinstance(paths, list) and len(paths) == stack_times + for d in paths: + assert d in ('bu', 'td') + + self.same_down_trans = same_down_trans + self.same_up_trans = same_up_trans + self.across_lateral_trans = across_lateral_trans + self.across_down_trans = across_down_trans + self.across_up_trans = across_up_trans + self.output_trans = output_trans + self.across_skip_trans = across_skip_trans + + self.with_bias = norm_cfg is None + # skip inds must be specified if across skip trans is not None + if self.across_skip_trans is not None: + skip_inds is not None + self.skip_inds = skip_inds + assert len(self.skip_inds[0]) <= self.stack_times + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + + # build lateral 1x1 convs to reduce channels + self.lateral_convs = nn.ModuleList() + for i in range(self.start_level, self.backbone_end_level): + l_conv = nn.Conv2d(self.in_channels[i], + self.inter_channels[i - self.start_level], 1) + self.lateral_convs.append(l_conv) + + extra_levels = num_outs - self.backbone_end_level + self.start_level + self.extra_downsamples = nn.ModuleList() + for i in range(extra_levels): + if self.add_extra_convs: + fpn_idx = self.backbone_end_level - self.start_level + i + extra_conv = nn.Conv2d( + self.inter_channels[fpn_idx - 1], + self.inter_channels[fpn_idx], + 3, + stride=2, + padding=1) + self.extra_downsamples.append(extra_conv) + else: + self.extra_downsamples.append(nn.MaxPool2d(1, stride=2)) + + self.fpn_transitions = nn.ModuleList() # stack times + for s in range(self.stack_times): + stage_trans = nn.ModuleList() # num of feature levels + for i in range(self.num_outs): + # same, across_lateral, across_down, across_up + trans = nn.ModuleDict() + if s in 
self.skip_inds[i]: + stage_trans.append(trans) + continue + # build same-stage down trans (used in bottom-up paths) + if i == 0 or self.same_up_trans is None: + same_up_trans = None + else: + same_up_trans = self.build_trans( + self.same_up_trans, self.inter_channels[i - 1], + self.inter_channels[i]) + trans['same_up'] = same_up_trans + # build same-stage up trans (used in top-down paths) + if i == self.num_outs - 1 or self.same_down_trans is None: + same_down_trans = None + else: + same_down_trans = self.build_trans( + self.same_down_trans, self.inter_channels[i + 1], + self.inter_channels[i]) + trans['same_down'] = same_down_trans + # build across lateral trans + across_lateral_trans = self.build_trans( + self.across_lateral_trans, self.inter_channels[i], + self.inter_channels[i]) + trans['across_lateral'] = across_lateral_trans + # build across down trans + if i == self.num_outs - 1 or self.across_down_trans is None: + across_down_trans = None + else: + across_down_trans = self.build_trans( + self.across_down_trans, self.inter_channels[i + 1], + self.inter_channels[i]) + trans['across_down'] = across_down_trans + # build across up trans + if i == 0 or self.across_up_trans is None: + across_up_trans = None + else: + across_up_trans = self.build_trans( + self.across_up_trans, self.inter_channels[i - 1], + self.inter_channels[i]) + trans['across_up'] = across_up_trans + if self.across_skip_trans is None: + across_skip_trans = None + else: + across_skip_trans = self.build_trans( + self.across_skip_trans, self.inter_channels[i - 1], + self.inter_channels[i]) + trans['across_skip'] = across_skip_trans + # build across_skip trans + stage_trans.append(trans) + self.fpn_transitions.append(stage_trans) + + self.output_transition = nn.ModuleList() # output levels + for i in range(self.num_outs): + trans = self.build_trans( + self.output_trans, + self.inter_channels[i], + self.out_channels, + num_inputs=self.stack_times + 1) + self.output_transition.append(trans) + + self.relu = nn.ReLU(inplace=True) + + def build_trans(self, cfg, in_channels, out_channels, **extra_args): + cfg_ = cfg.copy() + trans_type = cfg_.pop('type') + trans_cls = self.transition_types[trans_type] + return trans_cls(in_channels, out_channels, **cfg_, **extra_args) + + def fuse(self, fuse_dict): + out = None + for item in fuse_dict.values(): + if item is not None: + if out is None: + out = item + else: + out = out + item + return out + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + + # build all levels from original feature maps + feats = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + for downsample in self.extra_downsamples: + feats.append(downsample(feats[-1])) + + outs = [feats] + + for i in range(self.stack_times): + current_outs = outs[-1] + next_outs = [] + direction = self.paths[i] + for j in range(self.num_outs): + if i in self.skip_inds[j]: + next_outs.append(outs[-1][j]) + continue + # feature level + if direction == 'td': + lvl = self.num_outs - j - 1 + else: + lvl = j + # get transitions + if direction == 'td': + same_trans = self.fpn_transitions[i][lvl]['same_down'] + else: + same_trans = self.fpn_transitions[i][lvl]['same_up'] + across_lateral_trans = self.fpn_transitions[i][lvl][ + 'across_lateral'] + across_down_trans = self.fpn_transitions[i][lvl]['across_down'] + across_up_trans = self.fpn_transitions[i][lvl]['across_up'] + across_skip_trans = self.fpn_transitions[i][lvl]['across_skip'] + # init output + to_fuse = dict( 
+ same=None, lateral=None, across_up=None, across_down=None) + # same downsample/upsample + if same_trans is not None: + to_fuse['same'] = same_trans(next_outs[-1]) + # across lateral + if across_lateral_trans is not None: + to_fuse['lateral'] = across_lateral_trans( + current_outs[lvl]) + # across downsample + if lvl > 0 and across_up_trans is not None: + to_fuse['across_up'] = across_up_trans(current_outs[lvl - + 1]) + # across upsample + if (lvl < self.num_outs - 1 and across_down_trans is not None): + to_fuse['across_down'] = across_down_trans( + current_outs[lvl + 1]) + if across_skip_trans is not None: + to_fuse['across_skip'] = across_skip_trans(outs[0][lvl]) + x = self.fuse(to_fuse) + next_outs.append(x) + + if direction == 'td': + outs.append(next_outs[::-1]) + else: + outs.append(next_outs) + + # output trans + final_outs = [] + for i in range(self.num_outs): + lvl_out_list = [] + for s in range(len(outs)): + lvl_out_list.append(outs[s][i]) + lvl_out = self.output_transition[i](lvl_out_list) + final_outs.append(lvl_out) + + return final_outs diff --git a/mmdetection/mmdet/models/necks/fpn.py b/mmdetection/mmdet/models/necks/fpn.py new file mode 100644 index 0000000..67bd887 --- /dev/null +++ b/mmdetection/mmdet/models/necks/fpn.py @@ -0,0 +1,221 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, MultiConfig, OptConfigType + + +@MODELS.register_module() +class FPN(BaseModule): + r"""Feature Pyramid Network. + + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (list[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Defaults to 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Defaults to -1, which means the + last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Defaults to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Defaults to False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Defaults to False. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Defaults to None. + act_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + activation layer in ConvModule. Defaults to None. + upsample_cfg (:obj:`ConfigDict` or dict, optional): Config dict + for interpolate layer. Defaults to dict(mode='nearest'). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. 
+ + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + num_outs: int, + start_level: int = 0, + end_level: int = -1, + add_extra_convs: Union[bool, str] = False, + relu_before_extra_convs: bool = False, + no_norm_on_lateral: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + act_cfg: OptConfigType = None, + upsample_cfg: ConfigType = dict(mode='nearest'), + init_cfg: MultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + self.add_extra_convs = 'on_input' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + def forward(self, inputs: Tuple[Tensor]) -> tuple: + """Forward function. + + Args: + inputs (tuple[Tensor]): Features from the upstream network, each + is a 4D-tensor. + + Returns: + tuple: Feature maps, each is a 4D-tensor. 
+ """ + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/fpn_carafe.py b/mmdetection/mmdet/models/necks/fpn_carafe.py new file mode 100644 index 0000000..b393ff7 --- /dev/null +++ b/mmdetection/mmdet/models/necks/fpn_carafe.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, build_upsample_layer +from mmcv.ops.carafe import CARAFEPack +from mmengine.model import BaseModule, ModuleList, xavier_init + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class FPN_CARAFE(BaseModule): + """FPN_CARAFE is a more flexible implementation of FPN. It allows more + choice for upsample methods during the top-down pathway. + + It can reproduce the performance of ICCV 2019 paper + CARAFE: Content-Aware ReAssembly of FEatures + Please refer to https://arxiv.org/abs/1905.02188 for more details. + + Args: + in_channels (list[int]): Number of channels for each input feature map. + out_channels (int): Output channels of feature pyramids. + num_outs (int): Number of output stages. + start_level (int): Start level of feature pyramids. + (Default: 0) + end_level (int): End level of feature pyramids. + (Default: -1 indicates the last level). + norm_cfg (dict): Dictionary to construct and config norm layer. + activate (str): Type of activation function in ConvModule + (Default: None indicates w/o activation). + order (dict): Order of components in ConvModule. + upsample (str): Type of upsample layer. + upsample_cfg (dict): Dictionary to construct and config upsample layer. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + norm_cfg=None, + act_cfg=None, + order=('conv', 'norm', 'act'), + upsample_cfg=dict( + type='carafe', + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1), + init_cfg=None): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super(FPN_CARAFE, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.with_bias = norm_cfg is None + self.upsample_cfg = upsample_cfg.copy() + self.upsample = self.upsample_cfg.get('type') + self.relu = nn.ReLU(inplace=False) + + self.order = order + assert order in [('conv', 'norm', 'act'), ('act', 'conv', 'norm')] + + assert self.upsample in [ + 'nearest', 'bilinear', 'deconv', 'pixel_shuffle', 'carafe', None + ] + if self.upsample in ['deconv', 'pixel_shuffle']: + assert hasattr( + self.upsample_cfg, + 'upsample_kernel') and self.upsample_cfg.upsample_kernel > 0 + self.upsample_kernel = self.upsample_cfg.pop('upsample_kernel') + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + + self.lateral_convs = ModuleList() + self.fpn_convs = ModuleList() + self.upsample_modules = ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + norm_cfg=norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + if i != self.backbone_end_level - 1: + upsample_cfg_ = self.upsample_cfg.copy() + if self.upsample == 'deconv': + upsample_cfg_.update( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=self.upsample_kernel, + stride=2, + padding=(self.upsample_kernel - 1) // 2, + output_padding=(self.upsample_kernel - 1) // 2) + elif self.upsample == 'pixel_shuffle': + upsample_cfg_.update( + in_channels=out_channels, + out_channels=out_channels, + scale_factor=2, + upsample_kernel=self.upsample_kernel) + elif self.upsample == 'carafe': + upsample_cfg_.update(channels=out_channels, scale_factor=2) + else: + # suppress warnings + align_corners = (None + if self.upsample == 'nearest' else False) + upsample_cfg_.update( + scale_factor=2, + mode=self.upsample, + align_corners=align_corners) + upsample_module = build_upsample_layer(upsample_cfg_) + self.upsample_modules.append(upsample_module) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_out_levels = ( + num_outs - self.backbone_end_level + self.start_level) + if extra_out_levels >= 1: + for i in range(extra_out_levels): + in_channels = ( + self.in_channels[self.backbone_end_level - + 1] if i == 0 else out_channels) + extra_l_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + 
norm_cfg=norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + if self.upsample == 'deconv': + upsampler_cfg_ = dict( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=self.upsample_kernel, + stride=2, + padding=(self.upsample_kernel - 1) // 2, + output_padding=(self.upsample_kernel - 1) // 2) + elif self.upsample == 'pixel_shuffle': + upsampler_cfg_ = dict( + in_channels=out_channels, + out_channels=out_channels, + scale_factor=2, + upsample_kernel=self.upsample_kernel) + elif self.upsample == 'carafe': + upsampler_cfg_ = dict( + channels=out_channels, + scale_factor=2, + **self.upsample_cfg) + else: + # suppress warnings + align_corners = (None + if self.upsample == 'nearest' else False) + upsampler_cfg_ = dict( + scale_factor=2, + mode=self.upsample, + align_corners=align_corners) + upsampler_cfg_['type'] = self.upsample + upsample_module = build_upsample_layer(upsampler_cfg_) + extra_fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + self.upsample_modules.append(upsample_module) + self.fpn_convs.append(extra_fpn_conv) + self.lateral_convs.append(extra_l_conv) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + """Initialize the weights of module.""" + super(FPN_CARAFE, self).init_weights() + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)): + xavier_init(m, distribution='uniform') + for m in self.modules(): + if isinstance(m, CARAFEPack): + m.init_weights() + + def slice_as(self, src, dst): + """Slice ``src`` as ``dst`` + + Note: + ``src`` should have the same or larger size than ``dst``. + + Args: + src (torch.Tensor): Tensors to be sliced. + dst (torch.Tensor): ``src`` will be sliced to have the same + size as ``dst``. + + Returns: + torch.Tensor: Sliced tensor. + """ + assert (src.size(2) >= dst.size(2)) and (src.size(3) >= dst.size(3)) + if src.size(2) == dst.size(2) and src.size(3) == dst.size(3): + return src + else: + return src[:, :, :dst.size(2), :dst.size(3)] + + def tensor_add(self, a, b): + """Add tensors ``a`` and ``b`` that might have different sizes.""" + if a.size() == b.size(): + c = a + b + else: + c = a + self.slice_as(b, a) + return c + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [] + for i, lateral_conv in enumerate(self.lateral_convs): + if i <= self.backbone_end_level - self.start_level: + input = inputs[min(i + self.start_level, len(inputs) - 1)] + else: + input = laterals[-1] + lateral = lateral_conv(input) + laterals.append(lateral) + + # build top-down path + for i in range(len(laterals) - 1, 0, -1): + if self.upsample is not None: + upsample_feat = self.upsample_modules[i - 1](laterals[i]) + else: + upsample_feat = laterals[i] + laterals[i - 1] = self.tensor_add(laterals[i - 1], upsample_feat) + + # build outputs + num_conv_outs = len(self.fpn_convs) + outs = [] + for i in range(num_conv_outs): + out = self.fpn_convs[i](laterals[i]) + outs.append(out) + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/fpn_dropblock.py b/mmdetection/mmdet/models/necks/fpn_dropblock.py new file mode 100644 index 0000000..473af92 --- /dev/null +++ b/mmdetection/mmdet/models/necks/fpn_dropblock.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
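+# FPN_DropBlock below extends FPN: when ``plugin`` is not None it is built
+# from the model registry and applied to each lateral feature right after the
+# top-down summation, before the 3x3 fpn convs. A minimal neck-config sketch
+# (the backbone channels are only illustrative, ResNet-50 style):
+#
+#     neck=dict(
+#         type='FPN_DropBlock',
+#         in_channels=[256, 512, 1024, 2048],
+#         out_channels=256,
+#         num_outs=5,
+#         plugin=dict(
+#             type='DropBlock', drop_prob=0.3, block_size=3, warmup_iters=0))
+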
+from typing import Optional, Tuple + +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .fpn import FPN + + +@MODELS.register_module() +class FPN_DropBlock(FPN): + + def __init__(self, + *args, + plugin: Optional[dict] = dict( + type='DropBlock', + drop_prob=0.3, + block_size=3, + warmup_iters=0), + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.plugin = None + if plugin is not None: + self.plugin = MODELS.build(plugin) + + def forward(self, inputs: Tuple[Tensor]) -> tuple: + """Forward function. + + Args: + inputs (tuple[Tensor]): Features from the upstream network, each + is a 4D-tensor. + + Returns: + tuple: Feature maps, each is a 4D-tensor. + """ + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + if self.plugin is not None: + laterals[i - 1] = self.plugin(laterals[i - 1]) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/hrfpn.py b/mmdetection/mmdet/models/necks/hrfpn.py new file mode 100644 index 0000000..d262754 --- /dev/null +++ b/mmdetection/mmdet/models/necks/hrfpn.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch.utils.checkpoint import checkpoint + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class HRFPN(BaseModule): + """HRFPN (High Resolution Feature Pyramids) + + paper: `High-Resolution Representations for Labeling Pixels and Regions + `_. + + Args: + in_channels (list): number of channels for each branch. + out_channels (int): output channels of feature pyramids. + num_outs (int): number of output stages. 
+ pooling_type (str): pooling for generating feature pyramids + from {MAX, AVG}. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + stride (int): stride of 3x3 convolutional layers + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + out_channels, + num_outs=5, + pooling_type='AVG', + conv_cfg=None, + norm_cfg=None, + with_cp=False, + stride=1, + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')): + super(HRFPN, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.reduction_conv = ConvModule( + sum(in_channels), + out_channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + act_cfg=None) + + self.fpn_convs = nn.ModuleList() + for i in range(self.num_outs): + self.fpn_convs.append( + ConvModule( + out_channels, + out_channels, + kernel_size=3, + padding=1, + stride=stride, + conv_cfg=self.conv_cfg, + act_cfg=None)) + + if pooling_type == 'MAX': + self.pooling = F.max_pool2d + else: + self.pooling = F.avg_pool2d + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == self.num_ins + outs = [inputs[0]] + for i in range(1, self.num_ins): + outs.append( + F.interpolate(inputs[i], scale_factor=2**i, mode='bilinear')) + out = torch.cat(outs, dim=1) + if out.requires_grad and self.with_cp: + out = checkpoint(self.reduction_conv, out) + else: + out = self.reduction_conv(out) + outs = [out] + for i in range(1, self.num_outs): + outs.append(self.pooling(out, kernel_size=2**i, stride=2**i)) + outputs = [] + + for i in range(self.num_outs): + if outs[i].requires_grad and self.with_cp: + tmp_out = checkpoint(self.fpn_convs[i], outs[i]) + else: + tmp_out = self.fpn_convs[i](outs[i]) + outputs.append(tmp_out) + return tuple(outputs) diff --git a/mmdetection/mmdet/models/necks/nas_fpn.py b/mmdetection/mmdet/models/necks/nas_fpn.py new file mode 100644 index 0000000..8ec90cd --- /dev/null +++ b/mmdetection/mmdet/models/necks/nas_fpn.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops.merge_cells import GlobalPoolingCell, SumCell +from mmengine.model import BaseModule, ModuleList +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig, OptConfigType + + +@MODELS.register_module() +class NASFPN(BaseModule): + """NAS-FPN. + + Implementation of `NAS-FPN: Learning Scalable Feature Pyramid Architecture + for Object Detection `_ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + stack_times (int): The number of times the pyramid architecture will + be stacked. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Defaults to 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Defaults to -1, which means the + last level. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. 
Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. + """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + num_outs: int, + stack_times: int, + start_level: int = 0, + end_level: int = -1, + norm_cfg: OptConfigType = None, + init_cfg: MultiConfig = dict(type='Caffe2Xavier', layer='Conv2d') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) # num of input feature levels + self.num_outs = num_outs # num of output feature levels + self.stack_times = stack_times + self.norm_cfg = norm_cfg + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + + # add lateral connections + self.lateral_convs = nn.ModuleList() + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) + self.lateral_convs.append(l_conv) + + # add extra downsample layers (stride-2 pooling or conv) + extra_levels = num_outs - self.backbone_end_level + self.start_level + self.extra_downsamples = nn.ModuleList() + for i in range(extra_levels): + extra_conv = ConvModule( + out_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.extra_downsamples.append( + nn.Sequential(extra_conv, nn.MaxPool2d(2, 2))) + + # add NAS FPN connections + self.fpn_stages = ModuleList() + for _ in range(self.stack_times): + stage = nn.ModuleDict() + # gp(p6, p4) -> p4_1 + stage['gp_64_4'] = GlobalPoolingCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p4_1, p4) -> p4_2 + stage['sum_44_4'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p4_2, p3) -> p3_out + stage['sum_43_3'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p3_out, p4_2) -> p4_out + stage['sum_34_4'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p5, gp(p4_out, p3_out)) -> p5_out + stage['gp_43_5'] = GlobalPoolingCell(with_out_conv=False) + stage['sum_55_5'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p7, gp(p5_out, p4_2)) -> p7_out + stage['gp_54_7'] = GlobalPoolingCell(with_out_conv=False) + stage['sum_77_7'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # gp(p7_out, p5_out) -> p6_out + stage['gp_75_6'] = GlobalPoolingCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + self.fpn_stages.append(stage) + + def forward(self, inputs: Tuple[Tensor]) -> tuple: + """Forward function. + + Args: + inputs (tuple[Tensor]): Features from the upstream network, each + is a 4D-tensor. + + Returns: + tuple: Feature maps, each is a 4D-tensor. 
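+
+        Example:
+            A minimal sketch assuming three backbone levels whose sizes halve
+            from one level to the next, so the two extra downsample branches
+            complete the 5-level P3-P7 layout this forward pass unpacks.
+
+            >>> import torch
+            >>> in_channels = [2, 3, 5]
+            >>> scales = [64, 32, 16]
+            >>> inputs = [torch.rand(1, c, s, s)
+            ...           for c, s in zip(in_channels, scales)]
+            >>> self = NASFPN(in_channels, 8, num_outs=5,
+            ...               stack_times=1).eval()
+            >>> outputs = self.forward(inputs)
+            >>> for i in range(len(outputs)):
+            ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+            outputs[0].shape = torch.Size([1, 8, 64, 64])
+            outputs[1].shape = torch.Size([1, 8, 32, 32])
+            outputs[2].shape = torch.Size([1, 8, 16, 16])
+            outputs[3].shape = torch.Size([1, 8, 8, 8])
+            outputs[4].shape = torch.Size([1, 8, 4, 4])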
+ """ + # build P3-P5 + feats = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + # build P6-P7 on top of P5 + for downsample in self.extra_downsamples: + feats.append(downsample(feats[-1])) + + p3, p4, p5, p6, p7 = feats + + for stage in self.fpn_stages: + # gp(p6, p4) -> p4_1 + p4_1 = stage['gp_64_4'](p6, p4, out_size=p4.shape[-2:]) + # sum(p4_1, p4) -> p4_2 + p4_2 = stage['sum_44_4'](p4_1, p4, out_size=p4.shape[-2:]) + # sum(p4_2, p3) -> p3_out + p3 = stage['sum_43_3'](p4_2, p3, out_size=p3.shape[-2:]) + # sum(p3_out, p4_2) -> p4_out + p4 = stage['sum_34_4'](p3, p4_2, out_size=p4.shape[-2:]) + # sum(p5, gp(p4_out, p3_out)) -> p5_out + p5_tmp = stage['gp_43_5'](p4, p3, out_size=p5.shape[-2:]) + p5 = stage['sum_55_5'](p5, p5_tmp, out_size=p5.shape[-2:]) + # sum(p7, gp(p5_out, p4_2)) -> p7_out + p7_tmp = stage['gp_54_7'](p5, p4_2, out_size=p7.shape[-2:]) + p7 = stage['sum_77_7'](p7, p7_tmp, out_size=p7.shape[-2:]) + # gp(p7_out, p5_out) -> p6_out + p6 = stage['gp_75_6'](p7, p5, out_size=p6.shape[-2:]) + + return p3, p4, p5, p6, p7 diff --git a/mmdetection/mmdet/models/necks/nasfcos_fpn.py b/mmdetection/mmdet/models/necks/nasfcos_fpn.py new file mode 100644 index 0000000..12d0848 --- /dev/null +++ b/mmdetection/mmdet/models/necks/nasfcos_fpn.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.ops.merge_cells import ConcatCell +from mmengine.model import BaseModule, caffe2_xavier_init + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class NASFCOS_FPN(BaseModule): + """FPN structure in NASFPN. + + Implementation of paper `NAS-FCOS: Fast Neural Architecture Search for + Object Detection `_ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool): It decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=1, + end_level=-1, + add_extra_convs=False, + conv_cfg=None, + norm_cfg=None, + init_cfg=None): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super(NASFCOS_FPN, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + + self.adapt_convs = nn.ModuleList() + for i in range(self.start_level, self.backbone_end_level): + adapt_conv = ConvModule( + in_channels[i], + out_channels, + 1, + stride=1, + padding=0, + bias=False, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU', inplace=False)) + self.adapt_convs.append(adapt_conv) + + # C2 is omitted according to the paper + extra_levels = num_outs - self.backbone_end_level + self.start_level + + def build_concat_cell(with_input1_conv, with_input2_conv): + cell_conv_cfg = dict( + kernel_size=1, padding=0, bias=False, groups=out_channels) + return ConcatCell( + in_channels=out_channels, + out_channels=out_channels, + with_out_conv=True, + out_conv_cfg=cell_conv_cfg, + out_norm_cfg=dict(type='BN'), + out_conv_order=('norm', 'act', 'conv'), + with_input1_conv=with_input1_conv, + with_input2_conv=with_input2_conv, + input_conv_cfg=conv_cfg, + input_norm_cfg=norm_cfg, + upsample_mode='nearest') + + # Denote c3=f0, c4=f1, c5=f2 for convince + self.fpn = nn.ModuleDict() + self.fpn['c22_1'] = build_concat_cell(True, True) + self.fpn['c22_2'] = build_concat_cell(True, True) + self.fpn['c32'] = build_concat_cell(True, False) + self.fpn['c02'] = build_concat_cell(True, False) + self.fpn['c42'] = build_concat_cell(True, True) + self.fpn['c36'] = build_concat_cell(True, True) + self.fpn['c61'] = build_concat_cell(True, True) # f9 + self.extra_downsamples = nn.ModuleList() + for i in range(extra_levels): + extra_act_cfg = None if i == 0 \ + else dict(type='ReLU', inplace=False) + self.extra_downsamples.append( + ConvModule( + out_channels, + out_channels, + 3, + stride=2, + padding=1, + act_cfg=extra_act_cfg, + order=('act', 'norm', 'conv'))) + + def forward(self, inputs): + """Forward function.""" + feats = [ + adapt_conv(inputs[i + self.start_level]) + for i, adapt_conv in enumerate(self.adapt_convs) + ] + + for (i, module_name) in enumerate(self.fpn): + idx_1, idx_2 = int(module_name[1]), int(module_name[2]) + res = self.fpn[module_name](feats[idx_1], feats[idx_2]) + feats.append(res) + + ret = [] + for (idx, input_idx) in zip([9, 8, 7], [1, 2, 3]): # add P3, P4, P5 + feats1, feats2 = feats[idx], feats[5] + feats2_resize = F.interpolate( + feats2, + size=feats1.size()[2:], + mode='bilinear', + align_corners=False) + + feats_sum = feats1 + feats2_resize + ret.append( + F.interpolate( + feats_sum, + size=inputs[input_idx].size()[2:], + mode='bilinear', + align_corners=False)) + + for submodule in self.extra_downsamples: + ret.append(submodule(ret[-1])) + + 
return tuple(ret) + + def init_weights(self): + """Initialize the weights of module.""" + super(NASFCOS_FPN, self).init_weights() + for module in self.fpn.values(): + if hasattr(module, 'conv_out'): + caffe2_xavier_init(module.out_conv.conv) + + for modules in [ + self.adapt_convs.modules(), + self.extra_downsamples.modules() + ]: + for module in modules: + if isinstance(module, nn.Conv2d): + caffe2_xavier_init(module) diff --git a/mmdetection/mmdet/models/necks/pafpn.py b/mmdetection/mmdet/models/necks/pafpn.py new file mode 100644 index 0000000..557638f --- /dev/null +++ b/mmdetection/mmdet/models/necks/pafpn.py @@ -0,0 +1,157 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from mmdet.registry import MODELS +from .fpn import FPN + + +@MODELS.register_module() +class PAFPN(FPN): + """Path Aggregation Network for Instance Segmentation. + + This is an implementation of the `PAFPN in Path Aggregation Network + `_. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (str): Config dict for activation layer in ConvModule. + Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. 
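+
+    Example:
+        A minimal sketch; the input sizes are exact powers of two so that the
+        stride-2 convs of the extra bottom-up path produce maps matching the
+        next level, which the element-wise additions in ``forward`` require.
+
+        >>> import torch
+        >>> in_channels = [2, 3, 5, 7]
+        >>> scales = [64, 32, 16, 8]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> self = PAFPN(in_channels, 11, len(in_channels)).eval()
+        >>> outputs = self.forward(inputs)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 11, 64, 64])
+        outputs[1].shape = torch.Size([1, 11, 32, 32])
+        outputs[2].shape = torch.Size([1, 11, 16, 16])
+        outputs[3].shape = torch.Size([1, 11, 8, 8])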
+ """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + init_cfg=dict( + type='Xavier', layer='Conv2d', distribution='uniform')): + super(PAFPN, self).__init__( + in_channels, + out_channels, + num_outs, + start_level, + end_level, + add_extra_convs, + relu_before_extra_convs, + no_norm_on_lateral, + conv_cfg, + norm_cfg, + act_cfg, + init_cfg=init_cfg) + # add extra bottom up pathway + self.downsample_convs = nn.ModuleList() + self.pafpn_convs = nn.ModuleList() + for i in range(self.start_level + 1, self.backbone_end_level): + d_conv = ConvModule( + out_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + pafpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.downsample_convs.append(d_conv) + self.pafpn_convs.append(pafpn_conv) + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], size=prev_shape, mode='nearest') + + # build outputs + # part 1: from original levels + inter_outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + + # part 2: add bottom-up path + for i in range(0, used_backbone_levels - 1): + inter_outs[i + 1] = inter_outs[i + 1] + \ + self.downsample_convs[i](inter_outs[i]) + + outs = [] + outs.append(inter_outs[0]) + outs.extend([ + self.pafpn_convs[i - 1](inter_outs[i]) + for i in range(1, used_backbone_levels) + ]) + + # part 3: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + orig = inputs[self.backbone_end_level - 1] + outs.append(self.fpn_convs[used_backbone_levels](orig)) + elif self.add_extra_convs == 'on_lateral': + outs.append(self.fpn_convs[used_backbone_levels]( + laterals[-1])) + elif self.add_extra_convs == 'on_output': + outs.append(self.fpn_convs[used_backbone_levels](outs[-1])) + else: + raise NotImplementedError + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/rfp.py b/mmdetection/mmdet/models/necks/rfp.py new file mode 100644 index 0000000..7ec9b37 --- /dev/null +++ b/mmdetection/mmdet/models/necks/rfp.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule, ModuleList, constant_init, xavier_init + +from mmdet.registry import MODELS +from .fpn import FPN + + +class ASPP(BaseModule): + """ASPP (Atrous Spatial Pyramid Pooling) + + This is an implementation of the ASPP module used in DetectoRS + (https://arxiv.org/pdf/2006.02334.pdf) + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of channels produced by this module + dilations (tuple[int]): Dilations of the four branches. + Default: (1, 3, 6, 1) + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + out_channels, + dilations=(1, 3, 6, 1), + init_cfg=dict(type='Kaiming', layer='Conv2d')): + super().__init__(init_cfg) + assert dilations[-1] == 1 + self.aspp = nn.ModuleList() + for dilation in dilations: + kernel_size = 3 if dilation > 1 else 1 + padding = dilation if dilation > 1 else 0 + conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + dilation=dilation, + padding=padding, + bias=True) + self.aspp.append(conv) + self.gap = nn.AdaptiveAvgPool2d(1) + + def forward(self, x): + avg_x = self.gap(x) + out = [] + for aspp_idx in range(len(self.aspp)): + inp = avg_x if (aspp_idx == len(self.aspp) - 1) else x + out.append(F.relu_(self.aspp[aspp_idx](inp))) + out[-1] = out[-1].expand_as(out[-2]) + out = torch.cat(out, dim=1) + return out + + +@MODELS.register_module() +class RFP(FPN): + """RFP (Recursive Feature Pyramid) + + This is an implementation of RFP in `DetectoRS + `_. Different from standard FPN, the + input of RFP should be multi level features along with origin input image + of backbone. + + Args: + rfp_steps (int): Number of unrolled steps of RFP. + rfp_backbone (dict): Configuration of the backbone for RFP. + aspp_out_channels (int): Number of output channels of ASPP module. + aspp_dilations (tuple[int]): Dilation rates of four branches. + Default: (1, 3, 6, 1) + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + rfp_steps, + rfp_backbone, + aspp_out_channels, + aspp_dilations=(1, 3, 6, 1), + init_cfg=None, + **kwargs): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg, **kwargs) + self.rfp_steps = rfp_steps + # Be careful! Pretrained weights cannot be loaded when use + # nn.ModuleList + self.rfp_modules = ModuleList() + for rfp_idx in range(1, rfp_steps): + rfp_module = MODELS.build(rfp_backbone) + self.rfp_modules.append(rfp_module) + self.rfp_aspp = ASPP(self.out_channels, aspp_out_channels, + aspp_dilations) + self.rfp_weight = nn.Conv2d( + self.out_channels, + 1, + kernel_size=1, + stride=1, + padding=0, + bias=True) + + def init_weights(self): + # Avoid using super().init_weights(), which may alter the default + # initialization of the modules in self.rfp_modules that have missing + # keys in the pretrained checkpoint. 
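+        # Sketch of the resulting scheme: the lateral/fpn convs below get
+        # Xavier-uniform init, each recursive backbone keeps the init from its
+        # own config (possibly pretrained weights), and ``rfp_weight`` starts
+        # at zero so the sigmoid gate in ``forward`` begins as an even
+        # 0.5/0.5 blend of the refined and the original FPN features.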
+ for convs in [self.lateral_convs, self.fpn_convs]: + for m in convs.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + for rfp_idx in range(self.rfp_steps - 1): + self.rfp_modules[rfp_idx].init_weights() + constant_init(self.rfp_weight, 0) + + def forward(self, inputs): + inputs = list(inputs) + assert len(inputs) == len(self.in_channels) + 1 # +1 for input image + img = inputs.pop(0) + # FPN forward + x = super().forward(tuple(inputs)) + for rfp_idx in range(self.rfp_steps - 1): + rfp_feats = [x[0]] + list( + self.rfp_aspp(x[i]) for i in range(1, len(x))) + x_idx = self.rfp_modules[rfp_idx].rfp_forward(img, rfp_feats) + # FPN forward + x_idx = super().forward(x_idx) + x_new = [] + for ft_idx in range(len(x_idx)): + add_weight = torch.sigmoid(self.rfp_weight(x_idx[ft_idx])) + x_new.append(add_weight * x_idx[ft_idx] + + (1 - add_weight) * x[ft_idx]) + x = x_new + return x diff --git a/mmdetection/mmdet/models/necks/ssd_neck.py b/mmdetection/mmdet/models/necks/ssd_neck.py new file mode 100644 index 0000000..17ba319 --- /dev/null +++ b/mmdetection/mmdet/models/necks/ssd_neck.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class SSDNeck(BaseModule): + """Extra layers of SSD backbone to generate multi-scale feature maps. + + Args: + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (Sequence[int]): Number of output channels per scale. + level_strides (Sequence[int]): Stride of 3x3 conv per level. + level_paddings (Sequence[int]): Padding size of 3x3 conv per level. + l2_norm_scale (float|None): L2 normalization layer init scale. + If None, not use L2 normalization on the first input feature. + last_kernel_size (int): Kernel size of the last conv layer. + Default: 3. + use_depthwise (bool): Whether to use DepthwiseSeparableConv. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: None. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. 
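+
+    Example:
+        A minimal sketch with toy channel numbers (a real SSD config passes
+        the backbone's actual channels); each extra layer halves the spatial
+        size through its stride-2 conv.
+
+        >>> import torch
+        >>> feats = [torch.rand(1, 4, 32, 32), torch.rand(1, 8, 16, 16)]
+        >>> self = SSDNeck(in_channels=[4, 8],
+        ...                out_channels=[4, 8, 6, 6],
+        ...                level_strides=[2, 2],
+        ...                level_paddings=[1, 1],
+        ...                l2_norm_scale=None).eval()
+        >>> outputs = self.forward(feats)
+        >>> for i in range(len(outputs)):
+        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
+        outputs[0].shape = torch.Size([1, 4, 32, 32])
+        outputs[1].shape = torch.Size([1, 8, 16, 16])
+        outputs[2].shape = torch.Size([1, 6, 8, 8])
+        outputs[3].shape = torch.Size([1, 6, 4, 4])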
+ """ + + def __init__(self, + in_channels, + out_channels, + level_strides, + level_paddings, + l2_norm_scale=20., + last_kernel_size=3, + use_depthwise=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict( + type='Xavier', distribution='uniform', + layer='Conv2d'), + dict(type='Constant', val=1, layer='BatchNorm2d'), + ]): + super(SSDNeck, self).__init__(init_cfg) + assert len(out_channels) > len(in_channels) + assert len(out_channels) - len(in_channels) == len(level_strides) + assert len(level_strides) == len(level_paddings) + assert in_channels == out_channels[:len(in_channels)] + + if l2_norm_scale: + self.l2_norm = L2Norm(in_channels[0], l2_norm_scale) + self.init_cfg += [ + dict( + type='Constant', + val=self.l2_norm.scale, + override=dict(name='l2_norm')) + ] + + self.extra_layers = nn.ModuleList() + extra_layer_channels = out_channels[len(in_channels):] + second_conv = DepthwiseSeparableConvModule if \ + use_depthwise else ConvModule + + for i, (out_channel, stride, padding) in enumerate( + zip(extra_layer_channels, level_strides, level_paddings)): + kernel_size = last_kernel_size \ + if i == len(extra_layer_channels) - 1 else 3 + per_lvl_convs = nn.Sequential( + ConvModule( + out_channels[len(in_channels) - 1 + i], + out_channel // 2, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + second_conv( + out_channel // 2, + out_channel, + kernel_size, + stride=stride, + padding=padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.extra_layers.append(per_lvl_convs) + + def forward(self, inputs): + """Forward function.""" + outs = [feat for feat in inputs] + if hasattr(self, 'l2_norm'): + outs[0] = self.l2_norm(outs[0]) + + feat = outs[-1] + for layer in self.extra_layers: + feat = layer(feat) + outs.append(feat) + return tuple(outs) + + +class L2Norm(nn.Module): + + def __init__(self, n_dims, scale=20., eps=1e-10): + """L2 normalization layer. + + Args: + n_dims (int): Number of dimensions to be normalized + scale (float, optional): Defaults to 20.. + eps (float, optional): Used to avoid division by zero. + Defaults to 1e-10. + """ + super(L2Norm, self).__init__() + self.n_dims = n_dims + self.weight = nn.Parameter(torch.Tensor(self.n_dims)) + self.eps = eps + self.scale = scale + + def forward(self, x): + """Forward function.""" + # normalization layer convert to FP32 in FP16 training + x_float = x.float() + norm = x_float.pow(2).sum(1, keepdim=True).sqrt() + self.eps + return (self.weight[None, :, None, None].float().expand_as(x_float) * + x_float / norm).type_as(x) diff --git a/mmdetection/mmdet/models/necks/ssh.py b/mmdetection/mmdet/models/necks/ssh.py new file mode 100644 index 0000000..75a6561 --- /dev/null +++ b/mmdetection/mmdet/models/necks/ssh.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + + +class SSHContextModule(BaseModule): + """This is an implementation of `SSH context module` described in `SSH: + Single Stage Headless Face Detector. + + `_. + + Args: + in_channels (int): Number of input channels used at each scale. + out_channels (int): Number of output channels used at each scale. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. 
+ norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN'). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + assert out_channels % 4 == 0 + + self.in_channels = in_channels + self.out_channels = out_channels + + self.conv5x5_1 = ConvModule( + self.in_channels, + self.out_channels // 4, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + ) + + self.conv5x5_2 = ConvModule( + self.out_channels // 4, + self.out_channels // 4, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.conv7x7_2 = ConvModule( + self.out_channels // 4, + self.out_channels // 4, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + ) + + self.conv7x7_3 = ConvModule( + self.out_channels // 4, + self.out_channels // 4, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + ) + + def forward(self, x: torch.Tensor) -> tuple: + conv5x5_1 = self.conv5x5_1(x) + conv5x5 = self.conv5x5_2(conv5x5_1) + conv7x7_2 = self.conv7x7_2(conv5x5_1) + conv7x7 = self.conv7x7_3(conv7x7_2) + + return (conv5x5, conv7x7) + + +class SSHDetModule(BaseModule): + """This is an implementation of `SSH detection module` described in `SSH: + Single Stage Headless Face Detector. + + `_. + + Args: + in_channels (int): Number of input channels used at each scale. + out_channels (int): Number of output channels used at each scale. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN'). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + assert out_channels % 4 == 0 + + self.in_channels = in_channels + self.out_channels = out_channels + + self.conv3x3 = ConvModule( + self.in_channels, + self.out_channels // 2, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.context_module = SSHContextModule( + in_channels=self.in_channels, + out_channels=self.out_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + conv3x3 = self.conv3x3(x) + conv5x5, conv7x7 = self.context_module(x) + out = torch.cat([conv3x3, conv5x5, conv7x7], dim=1) + out = F.relu(out) + + return out + + +@MODELS.register_module() +class SSH(BaseModule): + """`SSH Neck` used in `SSH: Single Stage Headless Face Detector. + + `_. + + Args: + num_scales (int): The number of scales / stages. + in_channels (list[int]): The number of input channels per scale. + out_channels (list[int]): The number of output channels per scale. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN'). 
+ init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + + Example: + >>> import torch + >>> in_channels = [8, 16, 32, 64] + >>> out_channels = [16, 32, 64, 128] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = SSH(num_scales=4, in_channels=in_channels, + ... out_channels=out_channels) + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 16, 340, 340]) + outputs[1].shape = torch.Size([1, 32, 170, 170]) + outputs[2].shape = torch.Size([1, 64, 84, 84]) + outputs[3].shape = torch.Size([1, 128, 43, 43]) + """ + + def __init__(self, + num_scales: int, + in_channels: List[int], + out_channels: List[int], + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: OptMultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform')): + super().__init__(init_cfg=init_cfg) + assert (num_scales == len(in_channels) == len(out_channels)) + self.num_scales = num_scales + self.in_channels = in_channels + self.out_channels = out_channels + + for idx in range(self.num_scales): + in_c, out_c = self.in_channels[idx], self.out_channels[idx] + self.add_module( + f'ssh_module{idx}', + SSHDetModule( + in_channels=in_c, + out_channels=out_c, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + + def forward(self, inputs: Tuple[torch.Tensor]) -> tuple: + assert len(inputs) == self.num_scales + + outs = [] + for idx, x in enumerate(inputs): + ssh_module = getattr(self, f'ssh_module{idx}') + out = ssh_module(x) + outs.append(out) + + return tuple(outs) diff --git a/mmdetection/mmdet/models/necks/yolo_neck.py b/mmdetection/mmdet/models/necks/yolo_neck.py new file mode 100644 index 0000000..48a6b1a --- /dev/null +++ b/mmdetection/mmdet/models/necks/yolo_neck.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) 2019 Western Digital Corporation or its affiliates. +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + + +class DetectionBlock(BaseModule): + """Detection block in YOLO neck. + + Let out_channels = n, the DetectionBlock contains: + Six ConvLayers, 1 Conv2D Layer and 1 YoloLayer. + The first 6 ConvLayers are formed the following way: + 1x1xn, 3x3x2n, 1x1xn, 3x3x2n, 1x1xn, 3x3x2n. + The Conv2D layer is 1x1x255. + Some block will have branch after the fifth ConvLayer. + The input channel is arbitrary (in_channels) + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels: int, + out_channels: int, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict( + type='LeakyReLU', negative_slope=0.1), + init_cfg: OptMultiConfig = None) -> None: + super(DetectionBlock, self).__init__(init_cfg) + double_out_channels = out_channels * 2 + + # shortcut + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + self.conv1 = ConvModule(in_channels, out_channels, 1, **cfg) + self.conv2 = ConvModule( + out_channels, double_out_channels, 3, padding=1, **cfg) + self.conv3 = ConvModule(double_out_channels, out_channels, 1, **cfg) + self.conv4 = ConvModule( + out_channels, double_out_channels, 3, padding=1, **cfg) + self.conv5 = ConvModule(double_out_channels, out_channels, 1, **cfg) + + def forward(self, x: Tensor) -> Tensor: + tmp = self.conv1(x) + tmp = self.conv2(tmp) + tmp = self.conv3(tmp) + tmp = self.conv4(tmp) + out = self.conv5(tmp) + return out + + +@MODELS.register_module() +class YOLOV3Neck(BaseModule): + """The neck of YOLOV3. + + It can be treated as a simplified version of FPN. It + will take the result from Darknet backbone and do some upsampling and + concatenation. It will finally output the detection result. + + Note: + The input feats should be from top to bottom. + i.e., from high-lvl to low-lvl + But YOLOV3Neck will process them in reversed order. + i.e., from bottom (high-lvl) to top (low-lvl) + + Args: + num_scales (int): The number of scales / stages. + in_channels (List[int]): The number of input channels per scale. + out_channels (List[int]): The number of output channels per scale. + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None. + norm_cfg (dict, optional): Dictionary to construct and config norm + layer. Default: dict(type='BN', requires_grad=True) + act_cfg (dict, optional): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + num_scales: int, + in_channels: List[int], + out_channels: List[int], + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict( + type='LeakyReLU', negative_slope=0.1), + init_cfg: OptMultiConfig = None) -> None: + super(YOLOV3Neck, self).__init__(init_cfg) + assert (num_scales == len(in_channels) == len(out_channels)) + self.num_scales = num_scales + self.in_channels = in_channels + self.out_channels = out_channels + + # shortcut + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + # To support arbitrary scales, the code looks awful, but it works. + # Better solution is welcomed. 
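+        # Illustration (channel numbers assumed, typical Darknet-53 neck):
+        # with num_scales=3, in_channels=[1024, 512, 256] and
+        # out_channels=[512, 256, 128], the lines below register
+        #   detect1: DetectionBlock(1024 -> 512)
+        #   conv1:   ConvModule(512 -> 256, 1x1)
+        #   detect2: DetectionBlock(512 + 256 -> 256)
+        #   conv2:   ConvModule(256 -> 128, 1x1)
+        #   detect3: DetectionBlock(256 + 128 -> 128)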
+        self.detect1 = DetectionBlock(in_channels[0], out_channels[0], **cfg)
+        for i in range(1, self.num_scales):
+            in_c, out_c = self.in_channels[i], self.out_channels[i]
+            inter_c = out_channels[i - 1]
+            self.add_module(f'conv{i}', ConvModule(inter_c, out_c, 1, **cfg))
+            # in_c + out_c : High-lvl feats will be cat with low-lvl feats
+            self.add_module(f'detect{i+1}',
+                            DetectionBlock(in_c + out_c, out_c, **cfg))
+
+    def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]:
+        assert len(feats) == self.num_scales
+
+        # processed from bottom (high-lvl) to top (low-lvl)
+        outs = []
+        out = self.detect1(feats[-1])
+        outs.append(out)
+
+        for i, x in enumerate(reversed(feats[:-1])):
+            conv = getattr(self, f'conv{i+1}')
+            tmp = conv(out)
+
+            # Cat with low-lvl feats
+            tmp = F.interpolate(tmp, scale_factor=2)
+            tmp = torch.cat((tmp, x), 1)
+
+            detect = getattr(self, f'detect{i+2}')
+            out = detect(tmp)
+            outs.append(out)
+
+        return tuple(outs)
diff --git a/mmdetection/mmdet/models/necks/yolox_pafpn.py b/mmdetection/mmdet/models/necks/yolox_pafpn.py
new file mode 100644
index 0000000..8ec3d12
--- /dev/null
+++ b/mmdetection/mmdet/models/necks/yolox_pafpn.py
@@ -0,0 +1,156 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from ..layers import CSPLayer
+
+
+@MODELS.register_module()
+class YOLOXPAFPN(BaseModule):
+    """Path Aggregation Network used in YOLOX.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3.
+        use_depthwise (bool): Whether to use depthwise separable convolution
+            in blocks. Default: False.
+        upsample_cfg (dict): Config dict for interpolate layer.
+            Default: `dict(scale_factor=2, mode='nearest')`
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN')
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='Swish')
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+ """ + + def __init__(self, + in_channels, + out_channels, + num_csp_blocks=3, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu')): + super(YOLOXPAFPN, self).__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + # build top-down blocks + self.upsample = nn.Upsample(**upsample_cfg) + self.reduce_layers = nn.ModuleList() + self.top_down_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.reduce_layers.append( + ConvModule( + in_channels[idx], + in_channels[idx - 1], + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.top_down_blocks.append( + CSPLayer( + in_channels[idx - 1] * 2, + in_channels[idx - 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # build bottom-up blocks + self.downsamples = nn.ModuleList() + self.bottom_up_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv( + in_channels[idx], + in_channels[idx], + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottom_up_blocks.append( + CSPLayer( + in_channels[idx] * 2, + in_channels[idx + 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.out_convs = nn.ModuleList() + for i in range(len(in_channels)): + self.out_convs.append( + ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + + Returns: + tuple[Tensor]: YOLOXPAFPN features. + """ + assert len(inputs) == len(self.in_channels) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + feat_heigh = self.reduce_layers[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = self.upsample(feat_heigh) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + torch.cat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx]( + torch.cat([downsample_feat, feat_height], 1)) + outs.append(out) + + # out convs + for idx, conv in enumerate(self.out_convs): + outs[idx] = conv(outs[idx]) + + return tuple(outs) diff --git a/mmdetection/mmdet/models/reid/__init__.py b/mmdetection/mmdet/models/reid/__init__.py new file mode 100644 index 0000000..aca617f --- /dev/null +++ b/mmdetection/mmdet/models/reid/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base_reid import BaseReID +from .fc_module import FcModule +from .gap import GlobalAveragePooling +from .linear_reid_head import LinearReIDHead + +__all__ = ['BaseReID', 'GlobalAveragePooling', 'LinearReIDHead', 'FcModule'] diff --git a/mmdetection/mmdet/models/reid/base_reid.py b/mmdetection/mmdet/models/reid/base_reid.py new file mode 100644 index 0000000..4c45964 --- /dev/null +++ b/mmdetection/mmdet/models/reid/base_reid.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +import torch + +try: + import mmpretrain + from mmpretrain.models.classifiers import ImageClassifier +except ImportError: + mmpretrain = None + ImageClassifier = object + +from mmdet.registry import MODELS +from mmdet.structures import ReIDDataSample + + +@MODELS.register_module() +class BaseReID(ImageClassifier): + """Base model for re-identification.""" + + def __init__(self, *args, **kwargs): + if mmpretrain is None: + raise RuntimeError('Please run "pip install openmim" and ' + 'run "mim install mmpretrain" to ' + 'install mmpretrain first.') + super().__init__(*args, **kwargs) + + def forward(self, + inputs: torch.Tensor, + data_samples: Optional[List[ReIDDataSample]] = None, + mode: str = 'tensor'): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`ReIDDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, H, W) or (N, T, C, H, W). + data_samples (List[ReIDDataSample], optional): The annotation + data of every sample. It's required if ``mode="loss"``. + Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of + :obj:`ReIDDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if len(inputs.size()) == 5: + assert inputs.size(0) == 1 + inputs = inputs[0] + return super().forward(inputs, data_samples, mode) diff --git a/mmdetection/mmdet/models/reid/fc_module.py b/mmdetection/mmdet/models/reid/fc_module.py new file mode 100644 index 0000000..76e7efd --- /dev/null +++ b/mmdetection/mmdet/models/reid/fc_module.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class FcModule(BaseModule): + """Fully-connected layer module. + + Args: + in_channels (int): Input channels. + out_channels (int): Ourput channels. + norm_cfg (dict, optional): Configuration of normlization method + after fc. Defaults to None. + act_cfg (dict, optional): Configuration of activation method after fc. + Defaults to dict(type='ReLU'). + inplace (bool, optional): Whether inplace the activatation module. + Defaults to True. + init_cfg (dict, optional): Initialization config dict. 
+ Defaults to dict(type='Kaiming', layer='Linear'). + """ + + def __init__(self, + in_channels: int, + out_channels: int, + norm_cfg: dict = None, + act_cfg: dict = dict(type='ReLU'), + inplace: bool = True, + init_cfg=dict(type='Kaiming', layer='Linear')): + super(FcModule, self).__init__(init_cfg) + assert norm_cfg is None or isinstance(norm_cfg, dict) + assert act_cfg is None or isinstance(act_cfg, dict) + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.inplace = inplace + + self.with_norm = norm_cfg is not None + self.with_activation = act_cfg is not None + + self.fc = nn.Linear(in_channels, out_channels) + # build normalization layers + if self.with_norm: + self.norm_name, norm = build_norm_layer(norm_cfg, out_channels) + self.add_module(self.norm_name, norm) + + # build activation layer + if self.with_activation: + act_cfg_ = act_cfg.copy() + # nn.Tanh has no 'inplace' argument + if act_cfg_['type'] not in [ + 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' + ]: + act_cfg_.setdefault('inplace', inplace) + self.activate = build_activation_layer(act_cfg_) + + @property + def norm(self): + """Normalization.""" + return getattr(self, self.norm_name) + + def forward(self, x, activate=True, norm=True): + """Model forward.""" + x = self.fc(x) + if norm and self.with_norm: + x = self.norm(x) + if activate and self.with_activation: + x = self.activate(x) + return x diff --git a/mmdetection/mmdet/models/reid/gap.py b/mmdetection/mmdet/models/reid/gap.py new file mode 100644 index 0000000..aadc25e --- /dev/null +++ b/mmdetection/mmdet/models/reid/gap.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class GlobalAveragePooling(BaseModule): + """Global Average Pooling neck. + + Note that we use `view` to remove extra channel after pooling. We do not + use `squeeze` as it will also remove the batch dimension when the tensor + has a batch dimension of size 1, which can lead to unexpected errors. + """ + + def __init__(self, kernel_size=None, stride=None): + super(GlobalAveragePooling, self).__init__() + if kernel_size is None and stride is None: + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + else: + self.gap = nn.AvgPool2d(kernel_size, stride) + + def forward(self, inputs): + if isinstance(inputs, tuple): + outs = tuple([self.gap(x) for x in inputs]) + outs = tuple([ + out.view(x.size(0), + torch.tensor(out.size()[1:]).prod()) + for out, x in zip(outs, inputs) + ]) + elif isinstance(inputs, torch.Tensor): + outs = self.gap(inputs) + outs = outs.view( + inputs.size(0), + torch.tensor(outs.size()[1:]).prod()) + else: + raise TypeError('neck inputs should be tuple or torch.tensor') + return outs diff --git a/mmdetection/mmdet/models/reid/linear_reid_head.py b/mmdetection/mmdet/models/reid/linear_reid_head.py new file mode 100644 index 0000000..f35aaf6 --- /dev/null +++ b/mmdetection/mmdet/models/reid/linear_reid_head.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
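A minimal sketch of how the GlobalAveragePooling and FcModule blocks above are typically chained in a ReID model. The feature shape and the BN1d/ReLU configs are assumptions for illustration, not values taken from this series.

import torch
from mmdet.models.reid import FcModule, GlobalAveragePooling

# Assumed backbone output: batch of 4 feature maps with 2048 channels.
feat = torch.rand(4, 2048, 7, 7)

gap = GlobalAveragePooling()   # adaptive 1x1 pooling, flattened via view()
fc = FcModule(2048, 256, norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'))

pooled = gap(feat)       # (4, 2048); view() keeps the batch dim even for N == 1
embedding = fc(pooled)   # (4, 256)
print(pooled.shape, embedding.shape)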
+import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn + +try: + import mmpretrain + from mmpretrain.evaluation.metrics import Accuracy +except ImportError: + mmpretrain = None + +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.structures import ReIDDataSample +from .fc_module import FcModule + + +@MODELS.register_module() +class LinearReIDHead(BaseModule): + """Linear head for re-identification. + + Args: + num_fcs (int): Number of fcs. + in_channels (int): Number of channels in the input. + fc_channels (int): Number of channels in the fcs. + out_channels (int): Number of channels in the output. + norm_cfg (dict, optional): Configuration of normlization method + after fc. Defaults to None. + act_cfg (dict, optional): Configuration of activation method after fc. + Defaults to None. + num_classes (int, optional): Number of the identities. Default to None. + loss_cls (dict, optional): Cross entropy loss to train the ReID module. + Defaults to None. + loss_triplet (dict, optional): Triplet loss to train the ReID module. + Defaults to None. + topk (int | Tuple[int]): Top-k accuracy. Defaults to ``(1, )``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to dict(type='Normal',layer='Linear', mean=0, std=0.01, + bias=0). + """ + + def __init__(self, + num_fcs: int, + in_channels: int, + fc_channels: int, + out_channels: int, + norm_cfg: Optional[dict] = None, + act_cfg: Optional[dict] = None, + num_classes: Optional[int] = None, + loss_cls: Optional[dict] = None, + loss_triplet: Optional[dict] = None, + topk: Union[int, Tuple[int]] = (1, ), + init_cfg: Union[dict, List[dict]] = dict( + type='Normal', layer='Linear', mean=0, std=0.01, bias=0)): + if mmpretrain is None: + raise RuntimeError('Please run "pip install openmim" and ' + 'run "mim install mmpretrain" to ' + 'install mmpretrain first.') + super(LinearReIDHead, self).__init__(init_cfg=init_cfg) + + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + for _topk in topk: + assert _topk > 0, 'Top-k should be larger than 0' + self.topk = topk + + if loss_cls is None: + if isinstance(num_classes, int): + warnings.warn('Since cross entropy is not set, ' + 'the num_classes will be ignored.') + if loss_triplet is None: + raise ValueError('Please choose at least one loss in ' + 'triplet loss and cross entropy loss.') + elif not isinstance(num_classes, int): + raise TypeError('The num_classes must be a current number, ' + 'if there is cross entropy loss.') + self.loss_cls = MODELS.build(loss_cls) if loss_cls else None + self.loss_triplet = MODELS.build(loss_triplet) \ + if loss_triplet else None + + self.num_fcs = num_fcs + self.in_channels = in_channels + self.fc_channels = fc_channels + self.out_channels = out_channels + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.num_classes = num_classes + + self._init_layers() + + def _init_layers(self): + """Initialize fc layers.""" + self.fcs = nn.ModuleList() + for i in range(self.num_fcs): + in_channels = self.in_channels if i == 0 else self.fc_channels + self.fcs.append( + FcModule(in_channels, self.fc_channels, self.norm_cfg, + self.act_cfg)) + in_channels = self.in_channels if self.num_fcs == 0 else \ + self.fc_channels + self.fc_out = nn.Linear(in_channels, self.out_channels) + if self.loss_cls: + self.bn = nn.BatchNorm1d(self.out_channels) + self.classifier = nn.Linear(self.out_channels, self.num_classes) + + def forward(self, feats: 
Tuple[torch.Tensor]) -> torch.Tensor: + """The forward process.""" + # Multiple stage inputs are acceptable + # but only the last stage will be used. + feats = feats[-1] + + for m in self.fcs: + feats = m(feats) + feats = self.fc_out(feats) + return feats + + def loss(self, feats: Tuple[torch.Tensor], + data_samples: List[ReIDDataSample]) -> dict: + """Calculate losses. + + Args: + feats (tuple[Tensor]): The features extracted from the backbone. + data_samples (List[ReIDDataSample]): The annotation data of + every samples. + + Returns: + dict: a dictionary of loss components + """ + # The part can be traced by torch.fx + feats = self(feats) + + # The part can not be traced by torch.fx + losses = self.loss_by_feat(feats, data_samples) + return losses + + def loss_by_feat(self, feats: torch.Tensor, + data_samples: List[ReIDDataSample]) -> dict: + """Unpack data samples and compute loss.""" + losses = dict() + gt_label = torch.cat([i.gt_label.label for i in data_samples]) + gt_label = gt_label.to(feats.device) + + if self.loss_triplet: + losses['triplet_loss'] = self.loss_triplet(feats, gt_label) + + if self.loss_cls: + feats_bn = self.bn(feats) + cls_score = self.classifier(feats_bn) + losses['ce_loss'] = self.loss_cls(cls_score, gt_label) + acc = Accuracy.calculate(cls_score, gt_label, topk=self.topk) + losses.update( + {f'accuracy_top-{k}': a + for k, a in zip(self.topk, acc)}) + + return losses + + def predict( + self, + feats: Tuple[torch.Tensor], + data_samples: List[ReIDDataSample] = None) -> List[ReIDDataSample]: + """Inference without augmentation. + + Args: + feats (Tuple[Tensor]): The features extracted from the backbone. + Multiple stage inputs are acceptable but only the last stage + will be used. + data_samples (List[ReIDDataSample], optional): The annotation + data of every samples. If not None, set ``pred_label`` of + the input data samples. Defaults to None. + + Returns: + List[ReIDDataSample]: A list of data samples which contains the + predicted results. + """ + # The part can be traced by torch.fx + feats = self(feats) + + # The part can not be traced by torch.fx + data_samples = self.predict_by_feat(feats, data_samples) + + return data_samples + + def predict_by_feat( + self, + feats: torch.Tensor, + data_samples: List[ReIDDataSample] = None) -> List[ReIDDataSample]: + """Add prediction features to data samples.""" + if data_samples is not None: + for data_sample, feat in zip(data_samples, feats): + data_sample.pred_feature = feat + else: + data_samples = [] + for feat in feats: + data_sample = ReIDDataSample() + data_sample.pred_feature = feat + data_samples.append(data_sample) + + return data_samples diff --git a/mmdetection/mmdet/models/roi_heads/__init__.py b/mmdetection/mmdet/models/roi_heads/__init__.py new file mode 100644 index 0000000..bba5664 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
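A hedged construction sketch for the LinearReIDHead above. It assumes mmpretrain is installed (the head refuses to build otherwise) and that a TripletLoss is registered in MODELS, as in mmdet's tracking setups; all numbers are illustrative.

import torch
from mmdet.models.reid import LinearReIDHead

head = LinearReIDHead(
    num_fcs=1,
    in_channels=2048,
    fc_channels=1024,
    out_channels=128,
    norm_cfg=dict(type='BN1d'),
    act_cfg=dict(type='ReLU'),
    # Triplet loss only; add loss_cls and num_classes to also build the
    # BatchNorm + classifier branch used for the cross-entropy term.
    loss_triplet=dict(type='TripletLoss', margin=0.3, loss_weight=1.0))

feats = (torch.rand(8, 2048),)   # only the last stage is used by forward()
embeddings = head(feats)         # (8, 128) re-identification embeddings
print(embeddings.shape)

When loss_cls is configured as well, loss_by_feat above returns 'ce_loss' and top-k accuracy entries alongside 'triplet_loss'.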
+from .base_roi_head import BaseRoIHead +from .bbox_heads import (BBoxHead, ConvFCBBoxHead, DIIHead, + DoubleConvFCBBoxHead, SABLHead, SCNetBBoxHead, + Shared2FCBBoxHead, Shared4Conv1FCBBoxHead) +from .cascade_roi_head import CascadeRoIHead +from .double_roi_head import DoubleHeadRoIHead +from .dynamic_roi_head import DynamicRoIHead +from .grid_roi_head import GridRoIHead +from .htc_roi_head import HybridTaskCascadeRoIHead +from .mask_heads import (CoarseMaskHead, FCNMaskHead, FeatureRelayHead, + FusedSemanticHead, GlobalContextHead, GridHead, + HTCMaskHead, MaskIoUHead, MaskPointHead, + SCNetMaskHead, SCNetSemanticHead) +from .mask_scoring_roi_head import MaskScoringRoIHead +from .multi_instance_roi_head import MultiInstanceRoIHead +from .pisa_roi_head import PISARoIHead +from .point_rend_roi_head import PointRendRoIHead +from .roi_extractors import (BaseRoIExtractor, GenericRoIExtractor, + SingleRoIExtractor) +from .scnet_roi_head import SCNetRoIHead +from .shared_heads import ResLayer +from .sparse_roi_head import SparseRoIHead +from .standard_roi_head import StandardRoIHead +from .trident_roi_head import TridentRoIHead + +__all__ = [ + 'BaseRoIHead', 'CascadeRoIHead', 'DoubleHeadRoIHead', 'MaskScoringRoIHead', + 'HybridTaskCascadeRoIHead', 'GridRoIHead', 'ResLayer', 'BBoxHead', + 'ConvFCBBoxHead', 'DIIHead', 'SABLHead', 'Shared2FCBBoxHead', + 'StandardRoIHead', 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', + 'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead', + 'MaskIoUHead', 'BaseRoIExtractor', 'GenericRoIExtractor', + 'SingleRoIExtractor', 'PISARoIHead', 'PointRendRoIHead', 'MaskPointHead', + 'CoarseMaskHead', 'DynamicRoIHead', 'SparseRoIHead', 'TridentRoIHead', + 'SCNetRoIHead', 'SCNetMaskHead', 'SCNetSemanticHead', 'SCNetBBoxHead', + 'FeatureRelayHead', 'GlobalContextHead', 'MultiInstanceRoIHead' +] diff --git a/mmdetection/mmdet/models/roi_heads/base_roi_head.py b/mmdetection/mmdet/models/roi_heads/base_roi_head.py new file mode 100644 index 0000000..405f80a --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/base_roi_head.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
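Before the abstract BaseRoIHead below, a config-style sketch of how the classes exported above are usually wired together through the MODELS registry. The values mirror a common Faster R-CNN R-50 FPN setup and are illustrative rather than taken from this series.

from mmdet.registry import MODELS

roi_head_cfg = dict(
    type='StandardRoIHead',
    bbox_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    bbox_head=dict(
        type='Shared2FCBBoxHead',
        in_channels=256,
        fc_out_channels=1024,
        roi_feat_size=7,
        num_classes=80,
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0., 0., 0., 0.],
            target_stds=[0.1, 0.1, 0.2, 0.2]),
        reg_class_agnostic=False,
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)))

# train_cfg / test_cfg are normally injected by the detector; they default to
# None here, so the assigner and sampler are simply not built.
roi_head = MODELS.build(roi_head_cfg)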
+from abc import ABCMeta, abstractmethod +from typing import Tuple + +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig + + +class BaseRoIHead(BaseModule, metaclass=ABCMeta): + """Base class for RoIHeads.""" + + def __init__(self, + bbox_roi_extractor: OptMultiConfig = None, + bbox_head: OptMultiConfig = None, + mask_roi_extractor: OptMultiConfig = None, + mask_head: OptMultiConfig = None, + shared_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if shared_head is not None: + self.shared_head = MODELS.build(shared_head) + + if bbox_head is not None: + self.init_bbox_head(bbox_roi_extractor, bbox_head) + + if mask_head is not None: + self.init_mask_head(mask_roi_extractor, mask_head) + + self.init_assigner_sampler() + + @property + def with_bbox(self) -> bool: + """bool: whether the RoI head contains a `bbox_head`""" + return hasattr(self, 'bbox_head') and self.bbox_head is not None + + @property + def with_mask(self) -> bool: + """bool: whether the RoI head contains a `mask_head`""" + return hasattr(self, 'mask_head') and self.mask_head is not None + + @property + def with_shared_head(self) -> bool: + """bool: whether the RoI head contains a `shared_head`""" + return hasattr(self, 'shared_head') and self.shared_head is not None + + @abstractmethod + def init_bbox_head(self, *args, **kwargs): + """Initialize ``bbox_head``""" + pass + + @abstractmethod + def init_mask_head(self, *args, **kwargs): + """Initialize ``mask_head``""" + pass + + @abstractmethod + def init_assigner_sampler(self, *args, **kwargs): + """Initialize assigner and sampler.""" + pass + + @abstractmethod + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList): + """Perform forward propagation and loss calculation of the roi head on + the features of the upstream network.""" + + def predict(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from upstream network. Each + has shape (N, C, H, W). + rpn_results_list (list[:obj:`InstanceData`]): list of region + proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results to + the original image. Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + assert self.with_bbox, 'Bbox head must be implemented.' 
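+        # Collect per-image meta info (img_shape, scale_factor, ...) that the
+        # bbox/mask heads need to decode and optionally rescale predictions.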
+ batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + # TODO: nms_op in mmcv need be enhanced, the bbox result may get + # difference when not rescale in bbox_head + + # If it has the mask branch, the bbox branch does not need + # to be scaled to the original image scale, because the mask + # branch will scale both bbox and mask at the same time. + bbox_rescale = rescale if not self.with_mask else False + results_list = self.predict_bbox( + x, + batch_img_metas, + rpn_results_list, + rcnn_test_cfg=self.test_cfg, + rescale=bbox_rescale) + + if self.with_mask: + results_list = self.predict_mask( + x, batch_img_metas, results_list, rescale=rescale) + + return results_list diff --git a/mmdetection/mmdet/models/roi_heads/bbox_heads/__init__.py b/mmdetection/mmdet/models/roi_heads/bbox_heads/__init__.py new file mode 100644 index 0000000..d9e742a --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/bbox_heads/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_head import BBoxHead +from .convfc_bbox_head import (ConvFCBBoxHead, Shared2FCBBoxHead, + Shared4Conv1FCBBoxHead) +from .dii_head import DIIHead +from .double_bbox_head import DoubleConvFCBBoxHead +from .multi_instance_bbox_head import MultiInstanceBBoxHead +from .sabl_head import SABLHead +from .scnet_bbox_head import SCNetBBoxHead + +__all__ = [ + 'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead', + 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'SABLHead', 'DIIHead', + 'SCNetBBoxHead', 'MultiInstanceBBoxHead' +] diff --git a/mmdetection/mmdet/models/roi_heads/bbox_heads/bbox_head.py b/mmdetection/mmdet/models/roi_heads/bbox_heads/bbox_head.py new file mode 100644 index 0000000..3b2e8aa --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/bbox_heads/bbox_head.py @@ -0,0 +1,708 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
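A small forward-pass sketch for the BBoxHead family implemented below (the Shared2FCBBoxHead variant appears later in this series); the RoI feature shape and arguments are assumed for illustration.

import torch
from mmdet.models.roi_heads.bbox_heads import Shared2FCBBoxHead

# Assumed RoI-Align output: 8 RoIs, 256 channels, 7x7 spatial size.
roi_feats = torch.rand(8, 256, 7, 7)

bbox_head = Shared2FCBBoxHead(
    in_channels=256,
    fc_out_channels=1024,
    roi_feat_size=7,
    num_classes=80)

cls_score, bbox_pred = bbox_head(roi_feats)
print(cls_score.shape)   # torch.Size([8, 81]): 80 classes + background
print(bbox_pred.shape)   # torch.Size([8, 320]): 4 deltas per class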
+from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.models.layers import multiclass_nms +from mmdet.models.losses import accuracy +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.utils import empty_instances, multi_apply +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import get_box_tensor, scale_boxes +from mmdet.utils import ConfigType, InstanceList, OptMultiConfig + + +@MODELS.register_module() +class BBoxHead(BaseModule): + """Simplest RoI head, with only two fc layers for classification and + regression respectively.""" + + def __init__(self, + with_avg_pool: bool = False, + with_cls: bool = True, + with_reg: bool = True, + roi_feat_size: int = 7, + in_channels: int = 256, + num_classes: int = 80, + bbox_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + predict_box_type: str = 'hbox', + reg_class_agnostic: bool = False, + reg_decoded_bbox: bool = False, + reg_predictor_cfg: ConfigType = dict(type='Linear'), + cls_predictor_cfg: ConfigType = dict(type='Linear'), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + assert with_cls or with_reg + self.with_avg_pool = with_avg_pool + self.with_cls = with_cls + self.with_reg = with_reg + self.roi_feat_size = _pair(roi_feat_size) + self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] + self.in_channels = in_channels + self.num_classes = num_classes + self.predict_box_type = predict_box_type + self.reg_class_agnostic = reg_class_agnostic + self.reg_decoded_bbox = reg_decoded_bbox + self.reg_predictor_cfg = reg_predictor_cfg + self.cls_predictor_cfg = cls_predictor_cfg + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + + in_channels = self.in_channels + if self.with_avg_pool: + self.avg_pool = nn.AvgPool2d(self.roi_feat_size) + else: + in_channels *= self.roi_feat_area + if self.with_cls: + # need to add background class + if self.custom_cls_channels: + cls_channels = self.loss_cls.get_cls_channels(self.num_classes) + else: + cls_channels = num_classes + 1 + cls_predictor_cfg_ = self.cls_predictor_cfg.copy() + cls_predictor_cfg_.update( + in_features=in_channels, out_features=cls_channels) + self.fc_cls = MODELS.build(cls_predictor_cfg_) + if self.with_reg: + box_dim = self.bbox_coder.encode_size + out_dim_reg = box_dim if reg_class_agnostic else \ + box_dim * num_classes + reg_predictor_cfg_ = self.reg_predictor_cfg.copy() + if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): + reg_predictor_cfg_.update( + in_features=in_channels, out_features=out_dim_reg) + self.fc_reg = MODELS.build(reg_predictor_cfg_) + self.debug_imgs = None + if init_cfg is None: + self.init_cfg = [] + if self.with_cls: + self.init_cfg += [ + dict( + type='Normal', std=0.01, override=dict(name='fc_cls')) + ] + if self.with_reg: + self.init_cfg += [ + dict( + type='Normal', std=0.001, override=dict(name='fc_reg')) + ] + + # TODO: Create 
a SeasawBBoxHead to simplified logic in BBoxHead + @property + def custom_cls_channels(self) -> bool: + """get custom_cls_channels from loss_cls.""" + return getattr(self.loss_cls, 'custom_cls_channels', False) + + # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead + @property + def custom_activation(self) -> bool: + """get custom_activation from loss_cls.""" + return getattr(self.loss_cls, 'custom_activation', False) + + # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead + @property + def custom_accuracy(self) -> bool: + """get custom_accuracy from loss_cls.""" + return getattr(self.loss_cls, 'custom_accuracy', False) + + def forward(self, x: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification scores for all + scale levels, each is a 4D-tensor, the channels number + is num_base_priors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for all + scale levels, each is a 4D-tensor, the channels number + is num_base_priors * 4. + """ + if self.with_avg_pool: + if x.numel() > 0: + x = self.avg_pool(x) + x = x.view(x.size(0), -1) + else: + # avg_pool does not support empty tensor, + # so use torch.mean instead it + x = torch.mean(x, dim=(-1, -2)) + cls_score = self.fc_cls(x) if self.with_cls else None + bbox_pred = self.fc_reg(x) if self.with_reg else None + return cls_score, bbox_pred + + def _get_targets_single(self, pos_priors: Tensor, neg_priors: Tensor, + pos_gt_bboxes: Tensor, pos_gt_labels: Tensor, + cfg: ConfigDict) -> tuple: + """Calculate the ground truth for proposals in the single image + according to the sampling results. + + Args: + pos_priors (Tensor): Contains all the positive boxes, + has shape (num_pos, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + neg_priors (Tensor): Contains all the negative boxes, + has shape (num_neg, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + pos_gt_bboxes (Tensor): Contains gt_boxes for + all positive samples, has shape (num_pos, 4), + the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + pos_gt_labels (Tensor): Contains gt_labels for + all positive samples, has shape (num_pos, ). + cfg (obj:`ConfigDict`): `train_cfg` of R-CNN. + + Returns: + Tuple[Tensor]: Ground truth for proposals + in a single image. Containing the following Tensors: + + - labels(Tensor): Gt_labels for all proposals, has + shape (num_proposals,). + - label_weights(Tensor): Labels_weights for all + proposals, has shape (num_proposals,). + - bbox_targets(Tensor):Regression target for all + proposals, has shape (num_proposals, 4), the + last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + - bbox_weights(Tensor):Regression weights for all + proposals, has shape (num_proposals, 4). 
+ """ + num_pos = pos_priors.size(0) + num_neg = neg_priors.size(0) + num_samples = num_pos + num_neg + + # original implementation uses new_zeros since BG are set to be 0 + # now use empty & fill because BG cat_id = num_classes, + # FG cat_id = [0, num_classes-1] + labels = pos_priors.new_full((num_samples, ), + self.num_classes, + dtype=torch.long) + reg_dim = pos_gt_bboxes.size(-1) if self.reg_decoded_bbox \ + else self.bbox_coder.encode_size + label_weights = pos_priors.new_zeros(num_samples) + bbox_targets = pos_priors.new_zeros(num_samples, reg_dim) + bbox_weights = pos_priors.new_zeros(num_samples, reg_dim) + if num_pos > 0: + labels[:num_pos] = pos_gt_labels + pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight + label_weights[:num_pos] = pos_weight + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + pos_priors, pos_gt_bboxes) + else: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, both + # the predicted boxes and regression targets should be with + # absolute coordinate format. + pos_bbox_targets = get_box_tensor(pos_gt_bboxes) + bbox_targets[:num_pos, :] = pos_bbox_targets + bbox_weights[:num_pos, :] = 1 + if num_neg > 0: + label_weights[-num_neg:] = 1.0 + + return labels, label_weights, bbox_targets, bbox_weights + + def get_targets(self, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Almost the same as the implementation in bbox_head, we passed + additional parameters pos_inds_list and neg_inds_list to + `_get_targets_single` function. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + concat (bool): Whether to concatenate the results of all + the images in a single batch. + + Returns: + Tuple[Tensor]: Ground truth for proposals in a single image. + Containing the following list of Tensors: + + - labels (list[Tensor],Tensor): Gt_labels for all + proposals in a batch, each tensor in list has + shape (num_proposals,) when `concat=False`, otherwise + just a single tensor has shape (num_all_proposals,). + - label_weights (list[Tensor]): Labels_weights for + all proposals in a batch, each tensor in list has + shape (num_proposals,) when `concat=False`, otherwise + just a single tensor has shape (num_all_proposals,). + - bbox_targets (list[Tensor],Tensor): Regression target + for all proposals in a batch, each tensor in list + has shape (num_proposals, 4) when `concat=False`, + otherwise just a single tensor has shape + (num_all_proposals, 4), the last dimension 4 represents + [tl_x, tl_y, br_x, br_y]. + - bbox_weights (list[tensor],Tensor): Regression weights for + all proposals in a batch, each tensor in list has shape + (num_proposals, 4) when `concat=False`, otherwise just a + single tensor has shape (num_all_proposals, 4). 
+ """ + pos_priors_list = [res.pos_priors for res in sampling_results] + neg_priors_list = [res.neg_priors for res in sampling_results] + pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] + pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results] + labels, label_weights, bbox_targets, bbox_weights = multi_apply( + self._get_targets_single, + pos_priors_list, + neg_priors_list, + pos_gt_bboxes_list, + pos_gt_labels_list, + cfg=rcnn_train_cfg) + + if concat: + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + bbox_targets = torch.cat(bbox_targets, 0) + bbox_weights = torch.cat(bbox_weights, 0) + return labels, label_weights, bbox_targets, bbox_weights + + def loss_and_target(self, + cls_score: Tensor, + bbox_pred: Tensor, + rois: Tensor, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True, + reduction_override: Optional[str] = None) -> dict: + """Calculate the loss based on the features extracted by the bbox head. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): Regression prediction results, + has shape + (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + concat (bool): Whether to concatenate the results of all + the images in a single batch. Defaults to True. + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None, + + Returns: + dict: A dictionary of loss and targets components. + The targets are only used for cascade rcnn. + """ + + cls_reg_targets = self.get_targets( + sampling_results, rcnn_train_cfg, concat=concat) + losses = self.loss( + cls_score, + bbox_pred, + rois, + *cls_reg_targets, + reduction_override=reduction_override) + + # cls_reg_targets is only for cascade rcnn + return dict(loss_bbox=losses, bbox_targets=cls_reg_targets) + + def loss(self, + cls_score: Tensor, + bbox_pred: Tensor, + rois: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + bbox_weights: Tensor, + reduction_override: Optional[str] = None) -> dict: + """Calculate the loss based on the network predictions and targets. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): Regression prediction results, + has shape + (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, ). + label_weights (Tensor): Labels_weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, ). 
+ bbox_targets (Tensor): Regression target for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, 4), + the last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + bbox_weights (Tensor): Regression weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, 4). + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None, + + Returns: + dict: A dictionary of loss. + """ + + losses = dict() + + if cls_score is not None: + avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.) + if cls_score.numel() > 0: + loss_cls_ = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=avg_factor, + reduction_override=reduction_override) + if isinstance(loss_cls_, dict): + losses.update(loss_cls_) + else: + losses['loss_cls'] = loss_cls_ + if self.custom_activation: + acc_ = self.loss_cls.get_accuracy(cls_score, labels) + losses.update(acc_) + else: + losses['acc'] = accuracy(cls_score, labels) + if bbox_pred is not None: + bg_class_ind = self.num_classes + # 0~self.num_classes-1 are FG, self.num_classes is BG + pos_inds = (labels >= 0) & (labels < bg_class_ind) + # do not perform bounding box regression for BG anymore. + if pos_inds.any(): + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, + # `GIouLoss`, `DIouLoss`) is applied directly on + # the decoded bounding boxes, it decodes the + # already encoded coordinates to absolute format. + bbox_pred = self.bbox_coder.decode(rois[:, 1:], bbox_pred) + bbox_pred = get_box_tensor(bbox_pred) + if self.reg_class_agnostic: + pos_bbox_pred = bbox_pred.view( + bbox_pred.size(0), -1)[pos_inds.type(torch.bool)] + else: + pos_bbox_pred = bbox_pred.view( + bbox_pred.size(0), self.num_classes, + -1)[pos_inds.type(torch.bool), + labels[pos_inds.type(torch.bool)]] + losses['loss_bbox'] = self.loss_bbox( + pos_bbox_pred, + bbox_targets[pos_inds.type(torch.bool)], + bbox_weights[pos_inds.type(torch.bool)], + avg_factor=bbox_targets.size(0), + reduction_override=reduction_override) + else: + losses['loss_bbox'] = bbox_pred[pos_inds].sum() + + return losses + + def predict_by_feat(self, + rois: Tuple[Tensor], + cls_scores: Tuple[Tensor], + bbox_preds: Tuple[Tensor], + batch_img_metas: List[dict], + rcnn_test_cfg: Optional[ConfigDict] = None, + rescale: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + rois (tuple[Tensor]): Tuple of boxes to be transformed. + Each has shape (num_boxes, 5). last dimension 5 arrange as + (batch_index, x1, y1, x2, y2). + cls_scores (tuple[Tensor]): Tuple of box scores, each has shape + (num_boxes, num_classes + 1). + bbox_preds (tuple[Tensor]): Tuple of box energies / deltas, each + has shape (num_boxes, num_classes * 4). + batch_img_metas (list[dict]): List of image information. + rcnn_test_cfg (obj:`ConfigDict`, optional): `test_cfg` of R-CNN. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Instance segmentation + results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). 
+ - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + result_list = [] + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single( + roi=rois[img_id], + cls_score=cls_scores[img_id], + bbox_pred=bbox_preds[img_id], + img_meta=img_meta, + rescale=rescale, + rcnn_test_cfg=rcnn_test_cfg) + result_list.append(results) + + return result_list + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. + has shape (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image\ + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results = InstanceData() + if roi.shape[0] == 0: + return empty_instances([img_meta], + roi.device, + task_type='bbox', + instance_results=[results], + box_type=self.predict_box_type, + use_box_type=False, + num_classes=self.num_classes, + score_per_cls=rcnn_test_cfg is None)[0] + + # some loss (Seesaw loss..) may have custom activation + if self.custom_cls_channels: + scores = self.loss_cls.get_activation(cls_score) + else: + scores = F.softmax( + cls_score, dim=-1) if cls_score is not None else None + + img_shape = img_meta['img_shape'] + num_rois = roi.size(0) + # bbox_pred would be None in some detector when with_reg is False, + # e.g. Grid R-CNN. + if bbox_pred is not None: + num_classes = 1 if self.reg_class_agnostic else self.num_classes + roi = roi.repeat_interleave(num_classes, dim=0) + bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size) + bboxes = self.bbox_coder.decode( + roi[..., 1:], bbox_pred, max_shape=img_shape) + else: + bboxes = roi[:, 1:].clone() + if img_shape is not None and bboxes.size(-1) == 4: + bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1]) + bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0]) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + bboxes = scale_boxes(bboxes, scale_factor) + + # Get the inside tensor when `bboxes` is a box type + bboxes = get_box_tensor(bboxes) + box_dim = bboxes.size(-1) + bboxes = bboxes.view(num_rois, -1) + + if rcnn_test_cfg is None: + # This means that it is aug test. + # It needs to return the raw results without nms. 
+ results.bboxes = bboxes + results.scores = scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, + scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img, + box_dim=box_dim) + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results + + def refine_bboxes(self, sampling_results: Union[List[SamplingResult], + InstanceList], + bbox_results: dict, + batch_img_metas: List[dict]) -> InstanceList: + """Refine bboxes during training. + + Args: + sampling_results (List[:obj:`SamplingResult`] or + List[:obj:`InstanceData`]): Sampling results. + :obj:`SamplingResult` is the real sampling results + calculate from bbox_head, while :obj:`InstanceData` is + fake sampling results, e.g., in Sparse R-CNN or QueryInst, etc. + bbox_results (dict): Usually is a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + batch_img_metas (List[dict]): List of image information. + + Returns: + list[:obj:`InstanceData`]: Refined bboxes of each image. + + Example: + >>> # xdoctest: +REQUIRES(module:kwarray) + >>> import numpy as np + >>> from mmdet.models.task_modules.samplers. + ... sampling_result import random_boxes + >>> from mmdet.models.task_modules.samplers import SamplingResult + >>> self = BBoxHead(reg_class_agnostic=True) + >>> n_roi = 2 + >>> n_img = 4 + >>> scale = 512 + >>> rng = np.random.RandomState(0) + ... batch_img_metas = [{'img_shape': (scale, scale)} + >>> for _ in range(n_img)] + >>> sampling_results = [SamplingResult.random(rng=10) + ... for _ in range(n_img)] + >>> # Create rois in the expected format + >>> roi_boxes = random_boxes(n_roi, scale=scale, rng=rng) + >>> img_ids = torch.randint(0, n_img, (n_roi,)) + >>> img_ids = img_ids.float() + >>> rois = torch.cat([img_ids[:, None], roi_boxes], dim=1) + >>> # Create other args + >>> labels = torch.randint(0, 81, (scale,)).long() + >>> bbox_preds = random_boxes(n_roi, scale=scale, rng=rng) + >>> cls_score = torch.randn((scale, 81)) + ... # For each image, pretend random positive boxes are gts + >>> bbox_targets = (labels, None, None, None) + ... bbox_results = dict(rois=rois, bbox_pred=bbox_preds, + ... cls_score=cls_score, + ... bbox_targets=bbox_targets) + >>> bboxes_list = self.refine_bboxes(sampling_results, + ... bbox_results, + ... 
batch_img_metas) + >>> print(bboxes_list) + """ + pos_is_gts = [res.pos_is_gt for res in sampling_results] + # bbox_targets is a tuple + labels = bbox_results['bbox_targets'][0] + cls_scores = bbox_results['cls_score'] + rois = bbox_results['rois'] + bbox_preds = bbox_results['bbox_pred'] + if self.custom_activation: + # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead + cls_scores = self.loss_cls.get_activation(cls_scores) + if cls_scores.numel() == 0: + return None + if cls_scores.shape[-1] == self.num_classes + 1: + # remove background class + cls_scores = cls_scores[:, :-1] + elif cls_scores.shape[-1] != self.num_classes: + raise ValueError('The last dim of `cls_scores` should equal to ' + '`num_classes` or `num_classes + 1`,' + f'but got {cls_scores.shape[-1]}.') + labels = torch.where(labels == self.num_classes, cls_scores.argmax(1), + labels) + + img_ids = rois[:, 0].long().unique(sorted=True) + assert img_ids.numel() <= len(batch_img_metas) + + results_list = [] + for i in range(len(batch_img_metas)): + inds = torch.nonzero( + rois[:, 0] == i, as_tuple=False).squeeze(dim=1) + num_rois = inds.numel() + + bboxes_ = rois[inds, 1:] + label_ = labels[inds] + bbox_pred_ = bbox_preds[inds] + img_meta_ = batch_img_metas[i] + pos_is_gts_ = pos_is_gts[i] + + bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_, + img_meta_) + # filter gt bboxes + pos_keep = 1 - pos_is_gts_ + keep_inds = pos_is_gts_.new_ones(num_rois) + keep_inds[:len(pos_is_gts_)] = pos_keep + results = InstanceData(bboxes=bboxes[keep_inds.type(torch.bool)]) + results_list.append(results) + + return results_list + + def regress_by_class(self, priors: Tensor, label: Tensor, + bbox_pred: Tensor, img_meta: dict) -> Tensor: + """Regress the bbox for the predicted class. Used in Cascade R-CNN. + + Args: + priors (Tensor): Priors from `rpn_head` or last stage + `bbox_head`, has shape (num_proposals, 4). + label (Tensor): Only used when `self.reg_class_agnostic` + is False, has shape (num_proposals, ). + bbox_pred (Tensor): Regression prediction of + current stage `bbox_head`. When `self.reg_class_agnostic` + is False, it has shape (n, num_classes * 4), otherwise + it has shape (n, 4). + img_meta (dict): Image meta info. + + Returns: + Tensor: Regressed bboxes, the same shape as input rois. + """ + reg_dim = self.bbox_coder.encode_size + if not self.reg_class_agnostic: + label = label * reg_dim + inds = torch.stack([label + i for i in range(reg_dim)], 1) + bbox_pred = torch.gather(bbox_pred, 1, inds) + assert bbox_pred.size()[1] == reg_dim + + max_shape = img_meta['img_shape'] + regressed_bboxes = self.bbox_coder.decode( + priors, bbox_pred, max_shape=max_shape) + return regressed_bboxes diff --git a/mmdetection/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/mmdetection/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py new file mode 100644 index 0000000..cb6aadd --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from torch import Tensor + +from mmdet.registry import MODELS +from .bbox_head import BBoxHead + + +@MODELS.register_module() +class ConvFCBBoxHead(BBoxHead): + r"""More general bbox head, with shared conv and fc layers and two optional + separated branches. + + .. 
code-block:: none + + /-> cls convs -> cls fcs -> cls + shared convs -> shared fcs + \-> reg convs -> reg fcs -> reg + """ # noqa: W605 + + def __init__(self, + num_shared_convs: int = 0, + num_shared_fcs: int = 0, + num_cls_convs: int = 0, + num_cls_fcs: int = 0, + num_reg_convs: int = 0, + num_reg_fcs: int = 0, + conv_out_channels: int = 256, + fc_out_channels: int = 1024, + conv_cfg: Optional[Union[dict, ConfigDict]] = None, + norm_cfg: Optional[Union[dict, ConfigDict]] = None, + init_cfg: Optional[Union[dict, ConfigDict]] = None, + *args, + **kwargs) -> None: + super().__init__(*args, init_cfg=init_cfg, **kwargs) + assert (num_shared_convs + num_shared_fcs + num_cls_convs + + num_cls_fcs + num_reg_convs + num_reg_fcs > 0) + if num_cls_convs > 0 or num_reg_convs > 0: + assert num_shared_fcs == 0 + if not self.with_cls: + assert num_cls_convs == 0 and num_cls_fcs == 0 + if not self.with_reg: + assert num_reg_convs == 0 and num_reg_fcs == 0 + self.num_shared_convs = num_shared_convs + self.num_shared_fcs = num_shared_fcs + self.num_cls_convs = num_cls_convs + self.num_cls_fcs = num_cls_fcs + self.num_reg_convs = num_reg_convs + self.num_reg_fcs = num_reg_fcs + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + # add shared convs and fcs + self.shared_convs, self.shared_fcs, last_layer_dim = \ + self._add_conv_fc_branch( + self.num_shared_convs, self.num_shared_fcs, self.in_channels, + True) + self.shared_out_channels = last_layer_dim + + # add cls specific branch + self.cls_convs, self.cls_fcs, self.cls_last_dim = \ + self._add_conv_fc_branch( + self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels) + + # add reg specific branch + self.reg_convs, self.reg_fcs, self.reg_last_dim = \ + self._add_conv_fc_branch( + self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels) + + if self.num_shared_fcs == 0 and not self.with_avg_pool: + if self.num_cls_fcs == 0: + self.cls_last_dim *= self.roi_feat_area + if self.num_reg_fcs == 0: + self.reg_last_dim *= self.roi_feat_area + + self.relu = nn.ReLU(inplace=True) + # reconstruct fc_cls and fc_reg since input channels are changed + if self.with_cls: + if self.custom_cls_channels: + cls_channels = self.loss_cls.get_cls_channels(self.num_classes) + else: + cls_channels = self.num_classes + 1 + cls_predictor_cfg_ = self.cls_predictor_cfg.copy() + cls_predictor_cfg_.update( + in_features=self.cls_last_dim, out_features=cls_channels) + self.fc_cls = MODELS.build(cls_predictor_cfg_) + if self.with_reg: + box_dim = self.bbox_coder.encode_size + out_dim_reg = box_dim if self.reg_class_agnostic else \ + box_dim * self.num_classes + reg_predictor_cfg_ = self.reg_predictor_cfg.copy() + if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): + reg_predictor_cfg_.update( + in_features=self.reg_last_dim, out_features=out_dim_reg) + self.fc_reg = MODELS.build(reg_predictor_cfg_) + + if init_cfg is None: + # when init_cfg is None, + # It has been set to + # [[dict(type='Normal', std=0.01, override=dict(name='fc_cls'))], + # [dict(type='Normal', std=0.001, override=dict(name='fc_reg'))] + # after `super(ConvFCBBoxHead, self).__init__()` + # we only need to append additional configuration + # for `shared_fcs`, `cls_fcs` and `reg_fcs` + self.init_cfg += [ + dict( + type='Xavier', + distribution='uniform', + override=[ + dict(name='shared_fcs'), + dict(name='cls_fcs'), + dict(name='reg_fcs') + ]) + ] + + def _add_conv_fc_branch(self, + num_branch_convs: int, + 
num_branch_fcs: int, + in_channels: int, + is_shared: bool = False) -> tuple: + """Add shared or separable branch. + + convs -> avg pool (optional) -> fcs + """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, + self.conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + last_layer_dim = self.conv_out_channels + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + # for shared branch, only consider self.with_avg_pool + # for separated branches, also consider self.num_shared_fcs + if (is_shared + or self.num_shared_fcs == 0) and not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + return branch_convs, branch_fcs, last_layer_dim + + def forward(self, x: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification scores for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_base_priors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_base_priors * 4. + """ + # shared part + if self.num_shared_convs > 0: + for conv in self.shared_convs: + x = conv(x) + + if self.num_shared_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + + x = x.flatten(1) + + for fc in self.shared_fcs: + x = self.relu(fc(x)) + # separate branches + x_cls = x + x_reg = x + + for conv in self.cls_convs: + x_cls = conv(x_cls) + if x_cls.dim() > 2: + if self.with_avg_pool: + x_cls = self.avg_pool(x_cls) + x_cls = x_cls.flatten(1) + for fc in self.cls_fcs: + x_cls = self.relu(fc(x_cls)) + + for conv in self.reg_convs: + x_reg = conv(x_reg) + if x_reg.dim() > 2: + if self.with_avg_pool: + x_reg = self.avg_pool(x_reg) + x_reg = x_reg.flatten(1) + for fc in self.reg_fcs: + x_reg = self.relu(fc(x_reg)) + + cls_score = self.fc_cls(x_cls) if self.with_cls else None + bbox_pred = self.fc_reg(x_reg) if self.with_reg else None + return cls_score, bbox_pred + + +@MODELS.register_module() +class Shared2FCBBoxHead(ConvFCBBoxHead): + + def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: + super().__init__( + num_shared_convs=0, + num_shared_fcs=2, + num_cls_convs=0, + num_cls_fcs=0, + num_reg_convs=0, + num_reg_fcs=0, + fc_out_channels=fc_out_channels, + *args, + **kwargs) + + +@MODELS.register_module() +class Shared4Conv1FCBBoxHead(ConvFCBBoxHead): + + def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: + super().__init__( + num_shared_convs=4, + num_shared_fcs=1, + num_cls_convs=0, + num_cls_fcs=0, + num_reg_convs=0, + num_reg_fcs=0, + fc_out_channels=fc_out_channels, + *args, + **kwargs) diff --git a/mmdetection/mmdet/models/roi_heads/bbox_heads/dii_head.py b/mmdetection/mmdet/models/roi_heads/bbox_heads/dii_head.py new file mode 100644 index 0000000..ae9a31b --- /dev/null +++ 
b/mmdetection/mmdet/models/roi_heads/bbox_heads/dii_head.py @@ -0,0 +1,422 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmengine.config import ConfigDict +from mmengine.model import bias_init_with_prob +from torch import Tensor + +from mmdet.models.losses import accuracy +from mmdet.models.task_modules import SamplingResult +from mmdet.models.utils import multi_apply +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, reduce_mean +from .bbox_head import BBoxHead + + +@MODELS.register_module() +class DIIHead(BBoxHead): + r"""Dynamic Instance Interactive Head for `Sparse R-CNN: End-to-End Object + Detection with Learnable Proposals `_ + + Args: + num_classes (int): Number of class in dataset. + Defaults to 80. + num_ffn_fcs (int): The number of fully-connected + layers in FFNs. Defaults to 2. + num_heads (int): The hidden dimension of FFNs. + Defaults to 8. + num_cls_fcs (int): The number of fully-connected + layers in classification subnet. Defaults to 1. + num_reg_fcs (int): The number of fully-connected + layers in regression subnet. Defaults to 3. + feedforward_channels (int): The hidden dimension + of FFNs. Defaults to 2048 + in_channels (int): Hidden_channels of MultiheadAttention. + Defaults to 256. + dropout (float): Probability of drop the channel. + Defaults to 0.0 + ffn_act_cfg (:obj:`ConfigDict` or dict): The activation config + for FFNs. + dynamic_conv_cfg (:obj:`ConfigDict` or dict): The convolution + config for DynamicConv. + loss_iou (:obj:`ConfigDict` or dict): The config for iou or + giou loss. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. Defaults to None. 
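A minimal config sketch for this head, simply restating the defaults documented above (illustrative only; the enclosing Sparse R-CNN ``roi_head`` settings, the bbox coder and the remaining loss configs are omitted here and would come from the actual model config):

    bbox_head = dict(
        type='DIIHead',
        num_classes=80,
        num_ffn_fcs=2,
        num_heads=8,
        num_cls_fcs=1,
        num_reg_fcs=3,
        feedforward_channels=2048,
        in_channels=256,
        dropout=0.0,
        ffn_act_cfg=dict(type='ReLU', inplace=True),
        dynamic_conv_cfg=dict(
            type='DynamicConv',
            in_channels=256,
            feat_channels=64,
            out_channels=256,
            input_feat_shape=7,
            act_cfg=dict(type='ReLU', inplace=True),
            norm_cfg=dict(type='LN')),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0))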
+ """ + + def __init__(self, + num_classes: int = 80, + num_ffn_fcs: int = 2, + num_heads: int = 8, + num_cls_fcs: int = 1, + num_reg_fcs: int = 3, + feedforward_channels: int = 2048, + in_channels: int = 256, + dropout: float = 0.0, + ffn_act_cfg: ConfigType = dict(type='ReLU', inplace=True), + dynamic_conv_cfg: ConfigType = dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=7, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + loss_iou: ConfigType = dict(type='GIoULoss', loss_weight=2.0), + init_cfg: OptConfigType = None, + **kwargs) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__( + num_classes=num_classes, + reg_decoded_bbox=True, + reg_class_agnostic=True, + init_cfg=init_cfg, + **kwargs) + self.loss_iou = MODELS.build(loss_iou) + self.in_channels = in_channels + self.fp16_enabled = False + self.attention = MultiheadAttention(in_channels, num_heads, dropout) + self.attention_norm = build_norm_layer(dict(type='LN'), in_channels)[1] + + self.instance_interactive_conv = MODELS.build(dynamic_conv_cfg) + self.instance_interactive_conv_dropout = nn.Dropout(dropout) + self.instance_interactive_conv_norm = build_norm_layer( + dict(type='LN'), in_channels)[1] + + self.ffn = FFN( + in_channels, + feedforward_channels, + num_ffn_fcs, + act_cfg=ffn_act_cfg, + dropout=dropout) + self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1] + + self.cls_fcs = nn.ModuleList() + for _ in range(num_cls_fcs): + self.cls_fcs.append( + nn.Linear(in_channels, in_channels, bias=False)) + self.cls_fcs.append( + build_norm_layer(dict(type='LN'), in_channels)[1]) + self.cls_fcs.append( + build_activation_layer(dict(type='ReLU', inplace=True))) + + # over load the self.fc_cls in BBoxHead + if self.loss_cls.use_sigmoid: + self.fc_cls = nn.Linear(in_channels, self.num_classes) + else: + self.fc_cls = nn.Linear(in_channels, self.num_classes + 1) + + self.reg_fcs = nn.ModuleList() + for _ in range(num_reg_fcs): + self.reg_fcs.append( + nn.Linear(in_channels, in_channels, bias=False)) + self.reg_fcs.append( + build_norm_layer(dict(type='LN'), in_channels)[1]) + self.reg_fcs.append( + build_activation_layer(dict(type='ReLU', inplace=True))) + # over load the self.fc_cls in BBoxHead + self.fc_reg = nn.Linear(in_channels, 4) + + assert self.reg_class_agnostic, 'DIIHead only ' \ + 'suppport `reg_class_agnostic=True` ' + assert self.reg_decoded_bbox, 'DIIHead only ' \ + 'suppport `reg_decoded_bbox=True`' + + def init_weights(self) -> None: + """Use xavier initialization for all weight parameter and set + classification head bias as a specific value when use focal loss.""" + super().init_weights() + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + else: + # adopt the default initialization for + # the weight and bias of the layer norm + pass + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + nn.init.constant_(self.fc_cls.bias, bias_init) + + def forward(self, roi_feat: Tensor, proposal_feat: Tensor) -> tuple: + """Forward function of Dynamic Instance Interactive Head. + + Args: + roi_feat (Tensor): Roi-pooling features with shape + (batch_size*num_proposals, feature_dimensions, + pooling_h , pooling_w). 
+ proposal_feat (Tensor): Intermediate feature get from + diihead in last stage, has shape + (batch_size, num_proposals, feature_dimensions) + + Returns: + tuple[Tensor]: Usually a tuple of classification scores + and bbox prediction and a intermediate feature. + + - cls_scores (Tensor): Classification scores for + all proposals, has shape + (batch_size, num_proposals, num_classes). + - bbox_preds (Tensor): Box energies / deltas for + all proposals, has shape + (batch_size, num_proposals, 4). + - obj_feat (Tensor): Object feature before classification + and regression subnet, has shape + (batch_size, num_proposal, feature_dimensions). + - attn_feats (Tensor): Intermediate feature. + """ + N, num_proposals = proposal_feat.shape[:2] + + # Self attention + proposal_feat = proposal_feat.permute(1, 0, 2) + proposal_feat = self.attention_norm(self.attention(proposal_feat)) + attn_feats = proposal_feat.permute(1, 0, 2) + + # instance interactive + proposal_feat = attn_feats.reshape(-1, self.in_channels) + proposal_feat_iic = self.instance_interactive_conv( + proposal_feat, roi_feat) + proposal_feat = proposal_feat + self.instance_interactive_conv_dropout( + proposal_feat_iic) + obj_feat = self.instance_interactive_conv_norm(proposal_feat) + + # FFN + obj_feat = self.ffn_norm(self.ffn(obj_feat)) + + cls_feat = obj_feat + reg_feat = obj_feat + + for cls_layer in self.cls_fcs: + cls_feat = cls_layer(cls_feat) + for reg_layer in self.reg_fcs: + reg_feat = reg_layer(reg_feat) + + cls_score = self.fc_cls(cls_feat).view( + N, num_proposals, self.num_classes + if self.loss_cls.use_sigmoid else self.num_classes + 1) + bbox_delta = self.fc_reg(reg_feat).view(N, num_proposals, 4) + + return cls_score, bbox_delta, obj_feat.view( + N, num_proposals, self.in_channels), attn_feats + + def loss_and_target(self, + cls_score: Tensor, + bbox_pred: Tensor, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigType, + imgs_whwh: Tensor, + concat: bool = True, + reduction_override: str = None) -> dict: + """Calculate the loss based on the features extracted by the DIIHead. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): Regression prediction results, has shape + (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + imgs_whwh (Tensor): imgs_whwh (Tensor): Tensor with\ + shape (batch_size, num_proposals, 4), the last + dimension means + [img_width,img_height, img_width, img_height]. + concat (bool): Whether to concatenate the results of all + the images in a single batch. Defaults to True. + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None. + + Returns: + dict: A dictionary of loss and targets components. + The targets are only used for cascade rcnn. 
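The sketch below is illustrative only (plain PyTorch, ``num_classes`` assumed to be 80, the distributed ``reduce_mean`` replaced by a single-device clamp, and random boxes standing in for real predictions): it mirrors how positive proposals are selected and how box coordinates are normalized by the image size before the L1 term in the implementation that follows.

    import torch

    num_classes = 80                      # background id == num_classes
    labels = torch.tensor([3, 80, 15, 80])
    pos_inds = (labels >= 0) & (labels < num_classes)    # foreground mask
    avg_factor = pos_inds.sum().float().clamp(min=1.0)   # stand-in for reduce_mean(num_pos)

    bbox_pred = torch.rand(4, 4) * 800    # decoded boxes in image coordinates
    bbox_targets = torch.rand(4, 4) * 800
    imgs_whwh = torch.tensor([[800., 600., 800., 600.]]).expand(4, 4)

    # L1 on boxes normalized by image width/height before averaging
    l1 = torch.abs(bbox_pred[pos_inds] / imgs_whwh[pos_inds]
                   - bbox_targets[pos_inds] / imgs_whwh[pos_inds]).sum(dim=-1)
    loss_bbox = l1.sum() / avg_factor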
+ """ + cls_reg_targets = self.get_targets( + sampling_results=sampling_results, + rcnn_train_cfg=rcnn_train_cfg, + concat=concat) + (labels, label_weights, bbox_targets, bbox_weights) = cls_reg_targets + + losses = dict() + bg_class_ind = self.num_classes + # note in spare rcnn num_gt == num_pos + pos_inds = (labels >= 0) & (labels < bg_class_ind) + num_pos = pos_inds.sum().float() + avg_factor = reduce_mean(num_pos) + if cls_score is not None: + if cls_score.numel() > 0: + losses['loss_cls'] = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=avg_factor, + reduction_override=reduction_override) + losses['pos_acc'] = accuracy(cls_score[pos_inds], + labels[pos_inds]) + if bbox_pred is not None: + # 0~self.num_classes-1 are FG, self.num_classes is BG + # do not perform bounding box regression for BG anymore. + if pos_inds.any(): + pos_bbox_pred = bbox_pred.reshape(bbox_pred.size(0), + 4)[pos_inds.type(torch.bool)] + imgs_whwh = imgs_whwh.reshape(bbox_pred.size(0), + 4)[pos_inds.type(torch.bool)] + losses['loss_bbox'] = self.loss_bbox( + pos_bbox_pred / imgs_whwh, + bbox_targets[pos_inds.type(torch.bool)] / imgs_whwh, + bbox_weights[pos_inds.type(torch.bool)], + avg_factor=avg_factor) + losses['loss_iou'] = self.loss_iou( + pos_bbox_pred, + bbox_targets[pos_inds.type(torch.bool)], + bbox_weights[pos_inds.type(torch.bool)], + avg_factor=avg_factor) + else: + losses['loss_bbox'] = bbox_pred.sum() * 0 + losses['loss_iou'] = bbox_pred.sum() * 0 + return dict(loss_bbox=losses, bbox_targets=cls_reg_targets) + + def _get_targets_single(self, pos_inds: Tensor, neg_inds: Tensor, + pos_priors: Tensor, neg_priors: Tensor, + pos_gt_bboxes: Tensor, pos_gt_labels: Tensor, + cfg: ConfigDict) -> tuple: + """Calculate the ground truth for proposals in the single image + according to the sampling results. + + Almost the same as the implementation in `bbox_head`, + we add pos_inds and neg_inds to select positive and + negative samples instead of selecting the first num_pos + as positive samples. + + Args: + pos_inds (Tensor): The length is equal to the + positive sample numbers contain all index + of the positive sample in the origin proposal set. + neg_inds (Tensor): The length is equal to the + negative sample numbers contain all index + of the negative sample in the origin proposal set. + pos_priors (Tensor): Contains all the positive boxes, + has shape (num_pos, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + neg_priors (Tensor): Contains all the negative boxes, + has shape (num_neg, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + pos_gt_bboxes (Tensor): Contains gt_boxes for + all positive samples, has shape (num_pos, 4), + the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + pos_gt_labels (Tensor): Contains gt_labels for + all positive samples, has shape (num_pos, ). + cfg (obj:`ConfigDict`): `train_cfg` of R-CNN. + + Returns: + Tuple[Tensor]: Ground truth for proposals in a single image. + Containing the following Tensors: + + - labels(Tensor): Gt_labels for all proposals, has + shape (num_proposals,). + - label_weights(Tensor): Labels_weights for all proposals, has + shape (num_proposals,). + - bbox_targets(Tensor):Regression target for all proposals, has + shape (num_proposals, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + - bbox_weights(Tensor):Regression weights for all proposals, + has shape (num_proposals, 4). 
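A plain-PyTorch sketch of the target layout built by this method, assuming 3 positives and 2 negatives out of 5 proposals, ``num_classes=80`` (background encoded as class index 80) and ``reg_decoded_bbox=True``; the index and gt tensors are made up for illustration:

    import torch

    num_classes, num_samples = 80, 5
    pos_inds = torch.tensor([0, 2, 3])
    neg_inds = torch.tensor([1, 4])
    pos_gt_labels = torch.tensor([12, 0, 7])
    pos_gt_bboxes = torch.rand(3, 4)

    labels = torch.full((num_samples, ), num_classes, dtype=torch.long)  # all background
    label_weights = torch.zeros(num_samples)
    bbox_targets = torch.zeros(num_samples, 4)
    bbox_weights = torch.zeros(num_samples, 4)

    labels[pos_inds] = pos_gt_labels          # foreground classes
    label_weights[pos_inds] = 1.0             # cfg.pos_weight <= 0 -> 1.0
    bbox_targets[pos_inds] = pos_gt_bboxes    # reg_decoded_bbox=True: raw gt boxes
    bbox_weights[pos_inds] = 1.0
    label_weights[neg_inds] = 1.0             # negatives contribute to cls loss only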
+ """ + num_pos = pos_priors.size(0) + num_neg = neg_priors.size(0) + num_samples = num_pos + num_neg + + # original implementation uses new_zeros since BG are set to be 0 + # now use empty & fill because BG cat_id = num_classes, + # FG cat_id = [0, num_classes-1] + labels = pos_priors.new_full((num_samples, ), + self.num_classes, + dtype=torch.long) + label_weights = pos_priors.new_zeros(num_samples) + bbox_targets = pos_priors.new_zeros(num_samples, 4) + bbox_weights = pos_priors.new_zeros(num_samples, 4) + if num_pos > 0: + labels[pos_inds] = pos_gt_labels + pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight + label_weights[pos_inds] = pos_weight + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + pos_priors, pos_gt_bboxes) + else: + pos_bbox_targets = pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1 + if num_neg > 0: + label_weights[neg_inds] = 1.0 + + return labels, label_weights, bbox_targets, bbox_weights + + def get_targets(self, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Almost the same as the implementation in bbox_head, we passed + additional parameters pos_inds_list and neg_inds_list to + `_get_targets_single` function. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + concat (bool): Whether to concatenate the results of all + the images in a single batch. + + Returns: + Tuple[Tensor]: Ground truth for proposals in a single image. + Containing the following list of Tensors: + + - labels (list[Tensor],Tensor): Gt_labels for all + proposals in a batch, each tensor in list has + shape (num_proposals,) when `concat=False`, otherwise just + a single tensor has shape (num_all_proposals,). + - label_weights (list[Tensor]): Labels_weights for + all proposals in a batch, each tensor in list has shape + (num_proposals,) when `concat=False`, otherwise just a + single tensor has shape (num_all_proposals,). + - bbox_targets (list[Tensor],Tensor): Regression target + for all proposals in a batch, each tensor in list has + shape (num_proposals, 4) when `concat=False`, otherwise + just a single tensor has shape (num_all_proposals, 4), + the last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + - bbox_weights (list[tensor],Tensor): Regression weights for + all proposals in a batch, each tensor in list has shape + (num_proposals, 4) when `concat=False`, otherwise just a + single tensor has shape (num_all_proposals, 4). 
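For illustration, the per-image/concat pattern can be mimicked with plain Python and ``torch.cat``; the helper below is a made-up stand-in for ``_get_targets_single`` and ``multi_apply``, not part of this head:

    import torch

    def per_image_targets(num_pos, num_neg):
        # stand-in for ``_get_targets_single``: one (labels, weights) pair per image
        labels = torch.cat([torch.zeros(num_pos, dtype=torch.long),
                            torch.full((num_neg, ), 80, dtype=torch.long)])
        weights = torch.ones(num_pos + num_neg)
        return labels, weights

    # mimic ``multi_apply``: apply per image, then transpose the list of tuples
    results = [per_image_targets(p, n) for p, n in [(2, 6), (3, 5)]]
    labels_list, weights_list = map(list, zip(*results))

    # concat=True merges the per-image tensors into batch-level tensors
    labels = torch.cat(labels_list, 0)     # shape: (16,)
    weights = torch.cat(weights_list, 0)   # shape: (16,)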
+ """ + pos_inds_list = [res.pos_inds for res in sampling_results] + neg_inds_list = [res.neg_inds for res in sampling_results] + pos_priors_list = [res.pos_priors for res in sampling_results] + neg_priors_list = [res.neg_priors for res in sampling_results] + pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] + pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results] + labels, label_weights, bbox_targets, bbox_weights = multi_apply( + self._get_targets_single, + pos_inds_list, + neg_inds_list, + pos_priors_list, + neg_priors_list, + pos_gt_bboxes_list, + pos_gt_labels_list, + cfg=rcnn_train_cfg) + if concat: + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + bbox_targets = torch.cat(bbox_targets, 0) + bbox_weights = torch.cat(bbox_weights, 0) + return labels, label_weights, bbox_targets, bbox_weights diff --git a/mmdetection/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py b/mmdetection/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py new file mode 100644 index 0000000..076c358 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList +from torch import Tensor + +from mmdet.models.backbones.resnet import Bottleneck +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, MultiConfig, OptConfigType, OptMultiConfig +from .bbox_head import BBoxHead + + +class BasicResBlock(BaseModule): + """Basic residual block. + + This block is a little different from the block in the ResNet backbone. + The kernel size of conv1 is 1 in this block while 3 in ResNet BasicBlock. + + Args: + in_channels (int): Channels of the input feature map. + out_channels (int): Channels of the output feature map. + conv_cfg (:obj:`ConfigDict` or dict, optional): The config dict + for convolution layers. + norm_cfg (:obj:`ConfigDict` or dict): The config dict for + normalization layers. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None + """ + + def __init__(self, + in_channels: int, + out_channels: int, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + + # main path + self.conv1 = ConvModule( + in_channels, + in_channels, + kernel_size=3, + padding=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + self.conv2 = ConvModule( + in_channels, + out_channels, + kernel_size=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + # identity path + self.conv_identity = ConvModule( + in_channels, + out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + identity = x + + x = self.conv1(x) + x = self.conv2(x) + + identity = self.conv_identity(identity) + out = x + identity + + out = self.relu(out) + return out + + +@MODELS.register_module() +class DoubleConvFCBBoxHead(BBoxHead): + r"""Bbox head used in Double-Head R-CNN + + .. 
code-block:: none + + /-> cls + /-> shared convs -> + \-> reg + roi features + /-> cls + \-> shared fc -> + \-> reg + """ # noqa: W605 + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 0, + conv_out_channels: int = 1024, + fc_out_channels: int = 1024, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: MultiConfig = dict( + type='Normal', + override=[ + dict(type='Normal', name='fc_cls', std=0.01), + dict(type='Normal', name='fc_reg', std=0.001), + dict( + type='Xavier', + name='fc_branch', + distribution='uniform') + ]), + **kwargs) -> None: + kwargs.setdefault('with_avg_pool', True) + super().__init__(init_cfg=init_cfg, **kwargs) + assert self.with_avg_pool + assert num_convs > 0 + assert num_fcs > 0 + self.num_convs = num_convs + self.num_fcs = num_fcs + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + # increase the channel of input features + self.res_block = BasicResBlock(self.in_channels, + self.conv_out_channels) + + # add conv heads + self.conv_branch = self._add_conv_branch() + # add fc heads + self.fc_branch = self._add_fc_branch() + + out_dim_reg = 4 if self.reg_class_agnostic else 4 * self.num_classes + self.fc_reg = nn.Linear(self.conv_out_channels, out_dim_reg) + + self.fc_cls = nn.Linear(self.fc_out_channels, self.num_classes + 1) + self.relu = nn.ReLU() + + def _add_conv_branch(self) -> None: + """Add the fc branch which consists of a sequential of conv layers.""" + branch_convs = ModuleList() + for i in range(self.num_convs): + branch_convs.append( + Bottleneck( + inplanes=self.conv_out_channels, + planes=self.conv_out_channels // 4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + return branch_convs + + def _add_fc_branch(self) -> None: + """Add the fc branch which consists of a sequential of fc layers.""" + branch_fcs = ModuleList() + for i in range(self.num_fcs): + fc_in_channels = ( + self.in_channels * + self.roi_feat_area if i == 0 else self.fc_out_channels) + branch_fcs.append(nn.Linear(fc_in_channels, self.fc_out_channels)) + return branch_fcs + + def forward(self, x_cls: Tensor, x_reg: Tensor) -> Tuple[Tensor]: + """Forward features from the upstream network. + + Args: + x_cls (Tensor): Classification features of rois + x_reg (Tensor): Regression features from the upstream network. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification score predictions of rois. + each roi predicts num_classes + 1 channels. + - bbox_pred (Tensor): BBox deltas predictions of rois. each roi + predicts 4 * num_classes channels. + """ + # conv head + x_conv = self.res_block(x_reg) + + for conv in self.conv_branch: + x_conv = conv(x_conv) + + if self.with_avg_pool: + x_conv = self.avg_pool(x_conv) + + x_conv = x_conv.view(x_conv.size(0), -1) + bbox_pred = self.fc_reg(x_conv) + + # fc head + x_fc = x_cls.view(x_cls.size(0), -1) + for fc in self.fc_branch: + x_fc = self.relu(fc(x_fc)) + + cls_score = self.fc_cls(x_fc) + + return cls_score, bbox_pred diff --git a/mmdetection/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py b/mmdetection/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py new file mode 100644 index 0000000..38e57d2 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py @@ -0,0 +1,626 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
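# Before the multi-instance head below, a minimal plain-PyTorch sketch of the
# Double-Head data flow described above (illustrative only: a pooled conv
# branch feeds the regressor, a flattened fc branch feeds the classifier;
# channel sizes follow the DoubleConvFCBBoxHead defaults, num_classes is
# assumed to be 80, and the residual/bottleneck blocks are collapsed into a
# single stand-in conv):
import torch
import torch.nn as nn

num_classes, in_ch, conv_out, fc_out, roi = 80, 256, 1024, 1024, 7
x_cls = torch.rand(8, in_ch, roi, roi)   # RoI features for the fc (cls) branch
x_reg = torch.rand(8, in_ch, roi, roi)   # RoI features for the conv (reg) branch

conv_branch = nn.Sequential(             # stand-in for BasicResBlock + Bottlenecks
    nn.Conv2d(in_ch, conv_out, 1), nn.ReLU(), nn.AdaptiveAvgPool2d(1))
fc_branch = nn.Sequential(
    nn.Flatten(), nn.Linear(in_ch * roi * roi, fc_out), nn.ReLU())

bbox_pred = nn.Linear(conv_out, 4 * num_classes)(conv_branch(x_reg).flatten(1))
cls_score = nn.Linear(fc_out, num_classes + 1)(fc_branch(x_cls))
assert bbox_pred.shape == (8, 4 * num_classes)
assert cls_score.shape == (8, num_classes + 1)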
+from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor, nn + +from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.utils import empty_instances +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps + + +@MODELS.register_module() +class MultiInstanceBBoxHead(BBoxHead): + r"""Bbox head used in CrowdDet. + + .. code-block:: none + + /-> cls convs_1 -> cls fcs_1 -> cls_1 + |-- + | \-> reg convs_1 -> reg fcs_1 -> reg_1 + | + | /-> cls convs_2 -> cls fcs_2 -> cls_2 + shared convs -> shared fcs |-- + | \-> reg convs_2 -> reg fcs_2 -> reg_2 + | + | ... + | + | /-> cls convs_k -> cls fcs_k -> cls_k + |-- + \-> reg convs_k -> reg fcs_k -> reg_k + + + Args: + num_instance (int): The number of branches after shared fcs. + Defaults to 2. + with_refine (bool): Whether to use refine module. Defaults to False. + num_shared_convs (int): The number of shared convs. Defaults to 0. + num_shared_fcs (int): The number of shared fcs. Defaults to 2. + num_cls_convs (int): The number of cls convs. Defaults to 0. + num_cls_fcs (int): The number of cls fcs. Defaults to 0. + num_reg_convs (int): The number of reg convs. Defaults to 0. + num_reg_fcs (int): The number of reg fcs. Defaults to 0. + conv_out_channels (int): The number of conv out channels. + Defaults to 256. + fc_out_channels (int): The number of fc out channels. Defaults to 1024. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ # noqa: W605 + + def __init__(self, + num_instance: int = 2, + with_refine: bool = False, + num_shared_convs: int = 0, + num_shared_fcs: int = 2, + num_cls_convs: int = 0, + num_cls_fcs: int = 0, + num_reg_convs: int = 0, + num_reg_fcs: int = 0, + conv_out_channels: int = 256, + fc_out_channels: int = 1024, + init_cfg: Optional[Union[dict, ConfigDict]] = None, + *args, + **kwargs) -> None: + super().__init__(*args, init_cfg=init_cfg, **kwargs) + assert (num_shared_convs + num_shared_fcs + num_cls_convs + + num_cls_fcs + num_reg_convs + num_reg_fcs > 0) + assert num_instance == 2, 'Currently only 2 instances are supported' + if num_cls_convs > 0 or num_reg_convs > 0: + assert num_shared_fcs == 0 + if not self.with_cls: + assert num_cls_convs == 0 and num_cls_fcs == 0 + if not self.with_reg: + assert num_reg_convs == 0 and num_reg_fcs == 0 + self.num_instance = num_instance + self.num_shared_convs = num_shared_convs + self.num_shared_fcs = num_shared_fcs + self.num_cls_convs = num_cls_convs + self.num_cls_fcs = num_cls_fcs + self.num_reg_convs = num_reg_convs + self.num_reg_fcs = num_reg_fcs + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.with_refine = with_refine + + # add shared convs and fcs + self.shared_convs, self.shared_fcs, last_layer_dim = \ + self._add_conv_fc_branch( + self.num_shared_convs, self.num_shared_fcs, self.in_channels, + True) + self.shared_out_channels = last_layer_dim + self.relu = nn.ReLU(inplace=True) + + if self.with_refine: + refine_model_cfg = { + 'type': 'Linear', + 'in_features': self.shared_out_channels + 20, + 'out_features': self.shared_out_channels + } + self.shared_fcs_ref = MODELS.build(refine_model_cfg) + self.fc_cls_ref = nn.ModuleList() + self.fc_reg_ref = nn.ModuleList() + + 
self.cls_convs = nn.ModuleList() + self.cls_fcs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.reg_fcs = nn.ModuleList() + self.cls_last_dim = list() + self.reg_last_dim = list() + self.fc_cls = nn.ModuleList() + self.fc_reg = nn.ModuleList() + for k in range(self.num_instance): + # add cls specific branch + cls_convs, cls_fcs, cls_last_dim = self._add_conv_fc_branch( + self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels) + self.cls_convs.append(cls_convs) + self.cls_fcs.append(cls_fcs) + self.cls_last_dim.append(cls_last_dim) + + # add reg specific branch + reg_convs, reg_fcs, reg_last_dim = self._add_conv_fc_branch( + self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels) + self.reg_convs.append(reg_convs) + self.reg_fcs.append(reg_fcs) + self.reg_last_dim.append(reg_last_dim) + + if self.num_shared_fcs == 0 and not self.with_avg_pool: + if self.num_cls_fcs == 0: + self.cls_last_dim *= self.roi_feat_area + if self.num_reg_fcs == 0: + self.reg_last_dim *= self.roi_feat_area + + if self.with_cls: + if self.custom_cls_channels: + cls_channels = self.loss_cls.get_cls_channels( + self.num_classes) + else: + cls_channels = self.num_classes + 1 + cls_predictor_cfg_ = self.cls_predictor_cfg.copy() # deepcopy + cls_predictor_cfg_.update( + in_features=self.cls_last_dim[k], + out_features=cls_channels) + self.fc_cls.append(MODELS.build(cls_predictor_cfg_)) + if self.with_refine: + self.fc_cls_ref.append(MODELS.build(cls_predictor_cfg_)) + + if self.with_reg: + out_dim_reg = (4 if self.reg_class_agnostic else 4 * + self.num_classes) + reg_predictor_cfg_ = self.reg_predictor_cfg.copy() + reg_predictor_cfg_.update( + in_features=self.reg_last_dim[k], out_features=out_dim_reg) + self.fc_reg.append(MODELS.build(reg_predictor_cfg_)) + if self.with_refine: + self.fc_reg_ref.append(MODELS.build(reg_predictor_cfg_)) + + if init_cfg is None: + # when init_cfg is None, + # It has been set to + # [[dict(type='Normal', std=0.01, override=dict(name='fc_cls'))], + # [dict(type='Normal', std=0.001, override=dict(name='fc_reg'))] + # after `super(ConvFCBBoxHead, self).__init__()` + # we only need to append additional configuration + # for `shared_fcs`, `cls_fcs` and `reg_fcs` + self.init_cfg += [ + dict( + type='Xavier', + distribution='uniform', + override=[ + dict(name='shared_fcs'), + dict(name='cls_fcs'), + dict(name='reg_fcs') + ]) + ] + + def _add_conv_fc_branch(self, + num_branch_convs: int, + num_branch_fcs: int, + in_channels: int, + is_shared: bool = False) -> tuple: + """Add shared or separable branch. 
+ + convs -> avg pool (optional) -> fcs + """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, self.conv_out_channels, 3, + padding=1)) + last_layer_dim = self.conv_out_channels + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + # for shared branch, only consider self.with_avg_pool + # for separated branches, also consider self.num_shared_fcs + if (is_shared + or self.num_shared_fcs == 0) and not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + return branch_convs, branch_fcs, last_layer_dim + + def forward(self, x: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - cls_score_ref (Tensor): The cls_score after refine model. + - bbox_pred_ref (Tensor): The bbox_pred after refine model. + """ + # shared part + if self.num_shared_convs > 0: + for conv in self.shared_convs: + x = conv(x) + + if self.num_shared_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + + x = x.flatten(1) + for fc in self.shared_fcs: + x = self.relu(fc(x)) + + x_cls = x + x_reg = x + # separate branches + cls_score = list() + bbox_pred = list() + for k in range(self.num_instance): + for conv in self.cls_convs[k]: + x_cls = conv(x_cls) + if x_cls.dim() > 2: + if self.with_avg_pool: + x_cls = self.avg_pool(x_cls) + x_cls = x_cls.flatten(1) + for fc in self.cls_fcs[k]: + x_cls = self.relu(fc(x_cls)) + + for conv in self.reg_convs[k]: + x_reg = conv(x_reg) + if x_reg.dim() > 2: + if self.with_avg_pool: + x_reg = self.avg_pool(x_reg) + x_reg = x_reg.flatten(1) + for fc in self.reg_fcs[k]: + x_reg = self.relu(fc(x_reg)) + + cls_score.append(self.fc_cls[k](x_cls) if self.with_cls else None) + bbox_pred.append(self.fc_reg[k](x_reg) if self.with_reg else None) + + if self.with_refine: + x_ref = x + cls_score_ref = list() + bbox_pred_ref = list() + for k in range(self.num_instance): + feat_ref = cls_score[k].softmax(dim=-1) + feat_ref = torch.cat((bbox_pred[k], feat_ref[:, 1][:, None]), + dim=1).repeat(1, 4) + feat_ref = torch.cat((x_ref, feat_ref), dim=1) + feat_ref = F.relu_(self.shared_fcs_ref(feat_ref)) + + cls_score_ref.append(self.fc_cls_ref[k](feat_ref)) + bbox_pred_ref.append(self.fc_reg_ref[k](feat_ref)) + + cls_score = torch.cat(cls_score, dim=1) + bbox_pred = torch.cat(bbox_pred, dim=1) + cls_score_ref = torch.cat(cls_score_ref, dim=1) + bbox_pred_ref = torch.cat(bbox_pred_ref, dim=1) + return cls_score, bbox_pred, cls_score_ref, bbox_pred_ref + + cls_score = torch.cat(cls_score, dim=1) + bbox_pred = torch.cat(bbox_pred, dim=1) + + return cls_score, bbox_pred + + def get_targets(self, + sampling_results: List[SamplingResult], + 
rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Almost the same as the implementation in bbox_head, we passed + additional parameters pos_inds_list and neg_inds_list to + `_get_targets_single` function. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + concat (bool): Whether to concatenate the results of all + the images in a single batch. + + Returns: + Tuple[Tensor]: Ground truth for proposals in a single image. + Containing the following list of Tensors: + + - labels (list[Tensor],Tensor): Gt_labels for all proposals in a + batch, each tensor in list has shape (num_proposals,) when + `concat=False`, otherwise just a single tensor has shape + (num_all_proposals,). + - label_weights (list[Tensor]): Labels_weights for + all proposals in a batch, each tensor in list has shape + (num_proposals,) when `concat=False`, otherwise just a single + tensor has shape (num_all_proposals,). + - bbox_targets (list[Tensor],Tensor): Regression target for all + proposals in a batch, each tensor in list has shape + (num_proposals, 4) when `concat=False`, otherwise just a single + tensor has shape (num_all_proposals, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + - bbox_weights (list[tensor],Tensor): Regression weights for + all proposals in a batch, each tensor in list has shape + (num_proposals, 4) when `concat=False`, otherwise just a + single tensor has shape (num_all_proposals, 4). + """ + labels = [] + bbox_targets = [] + bbox_weights = [] + label_weights = [] + for i in range(len(sampling_results)): + sample_bboxes = torch.cat([ + sampling_results[i].pos_gt_bboxes, + sampling_results[i].neg_gt_bboxes + ]) + sample_priors = sampling_results[i].priors + sample_priors = sample_priors.repeat(1, self.num_instance).reshape( + -1, 4) + sample_bboxes = sample_bboxes.reshape(-1, 4) + + if not self.reg_decoded_bbox: + _bbox_targets = self.bbox_coder.encode(sample_priors, + sample_bboxes) + else: + _bbox_targets = sample_priors + _bbox_targets = _bbox_targets.reshape(-1, self.num_instance * 4) + _bbox_weights = torch.ones(_bbox_targets.shape) + _labels = torch.cat([ + sampling_results[i].pos_gt_labels, + sampling_results[i].neg_gt_labels + ]) + _labels_weights = torch.ones(_labels.shape) + + bbox_targets.append(_bbox_targets) + bbox_weights.append(_bbox_weights) + labels.append(_labels) + label_weights.append(_labels_weights) + + if concat: + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + bbox_targets = torch.cat(bbox_targets, 0) + bbox_weights = torch.cat(bbox_weights, 0) + return labels, label_weights, bbox_targets, bbox_weights + + def loss(self, cls_score: Tensor, bbox_pred: Tensor, rois: Tensor, + labels: Tensor, label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, **kwargs) -> dict: + """Calculate the loss based on the network predictions and targets. + + Args: + cls_score (Tensor): Classification prediction results of all class, + has shape (batch_size * num_proposals_single_image, + (num_classes + 1) * k), k represents the number of prediction + boxes generated by each proposal box. + bbox_pred (Tensor): Regression prediction results, has shape + (batch_size * num_proposals_single_image, 4 * k), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. 
+ rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, k). + label_weights (Tensor): Labels_weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, k). + bbox_targets (Tensor): Regression target for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, + 4 * k), the last dimension 4 represents [tl_x, tl_y, br_x, + br_y]. + bbox_weights (Tensor): Regression weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, + 4 * k). + + Returns: + dict: A dictionary of loss. + """ + losses = dict() + if bbox_pred.numel(): + loss_0 = self.emd_loss(bbox_pred[:, 0:4], cls_score[:, 0:2], + bbox_pred[:, 4:8], cls_score[:, 2:4], + bbox_targets, labels) + loss_1 = self.emd_loss(bbox_pred[:, 4:8], cls_score[:, 2:4], + bbox_pred[:, 0:4], cls_score[:, 0:2], + bbox_targets, labels) + loss = torch.cat([loss_0, loss_1], dim=1) + _, min_indices = loss.min(dim=1) + loss_emd = loss[torch.arange(loss.shape[0]), min_indices] + loss_emd = loss_emd.mean() + else: + loss_emd = bbox_pred.sum() + losses['loss_rcnn_emd'] = loss_emd + return losses + + def emd_loss(self, bbox_pred_0: Tensor, cls_score_0: Tensor, + bbox_pred_1: Tensor, cls_score_1: Tensor, targets: Tensor, + labels: Tensor) -> Tensor: + """Calculate the emd loss. + + Note: + This implementation is modified from https://github.com/Purkialo/ + CrowdDet/blob/master/lib/det_oprs/loss_opr.py + + Args: + bbox_pred_0 (Tensor): Part of regression prediction results, has + shape (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + cls_score_0 (Tensor): Part of classification prediction results, + has shape (batch_size * num_proposals_single_image, + (num_classes + 1)), where 1 represents the background. + bbox_pred_1 (Tensor): The other part of regression prediction + results, has shape (batch_size*num_proposals_single_image, 4). + cls_score_1 (Tensor):The other part of classification prediction + results, has shape (batch_size * num_proposals_single_image, + (num_classes + 1)). + targets (Tensor):Regression target for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, + 4 * k), the last dimension 4 represents [tl_x, tl_y, br_x, + br_y], k represents the number of prediction boxes generated + by each proposal box. + labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, k). + + Returns: + torch.Tensor: The calculated loss. 
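The heart of the EMD criterion is to score both possible matchings between the two predicted instances and the two targets of a proposal, then keep the cheaper matching per proposal. A standalone plain-PyTorch sketch with a dummy L1 box term only (the real loss also adds a per-instance classification term):

    import torch

    def pair_loss(pred, target):
        # per-proposal L1 over the 4 box coordinates
        return torch.abs(pred - target).sum(dim=-1, keepdim=True)

    pred_0, pred_1 = torch.rand(6, 4), torch.rand(6, 4)    # two boxes per proposal
    tgt_0, tgt_1 = torch.rand(6, 4), torch.rand(6, 4)      # two gt boxes per proposal

    loss_a = pair_loss(pred_0, tgt_0) + pair_loss(pred_1, tgt_1)  # matching order A
    loss_b = pair_loss(pred_0, tgt_1) + pair_loss(pred_1, tgt_0)  # matching order B
    loss = torch.cat([loss_a, loss_b], dim=1).min(dim=1).values   # keep cheaper matching
    loss_emd = loss.mean()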
+ """ + + bbox_pred = torch.cat([bbox_pred_0, bbox_pred_1], + dim=1).reshape(-1, bbox_pred_0.shape[-1]) + cls_score = torch.cat([cls_score_0, cls_score_1], + dim=1).reshape(-1, cls_score_0.shape[-1]) + targets = targets.reshape(-1, 4) + labels = labels.long().flatten() + + # masks + valid_masks = labels >= 0 + fg_masks = labels > 0 + + # multiple class + bbox_pred = bbox_pred.reshape(-1, self.num_classes, 4) + fg_gt_classes = labels[fg_masks] + bbox_pred = bbox_pred[fg_masks, fg_gt_classes - 1, :] + + # loss for regression + loss_bbox = self.loss_bbox(bbox_pred, targets[fg_masks]) + loss_bbox = loss_bbox.sum(dim=1) + + # loss for classification + labels = labels * valid_masks + loss_cls = self.loss_cls(cls_score, labels) + + loss_cls[fg_masks] = loss_cls[fg_masks] + loss_bbox + loss = loss_cls.reshape(-1, 2).sum(dim=1) + return loss.reshape(-1, 1) + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. has shape + (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + + cls_score = cls_score.reshape(-1, self.num_classes + 1) + bbox_pred = bbox_pred.reshape(-1, 4) + roi = roi.repeat_interleave(self.num_instance, dim=0) + + results = InstanceData() + if roi.shape[0] == 0: + return empty_instances([img_meta], + roi.device, + task_type='bbox', + instance_results=[results])[0] + + scores = cls_score.softmax(dim=-1) if cls_score is not None else None + img_shape = img_meta['img_shape'] + bboxes = self.bbox_coder.decode( + roi[..., 1:], bbox_pred, max_shape=img_shape) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + bboxes = (bboxes.view(bboxes.size(0), -1, 4) / scale_factor).view( + bboxes.size()[0], -1) + + if rcnn_test_cfg is None: + # This means that it is aug test. + # It needs to return the raw results without nms. 
+ results.bboxes = bboxes + results.scores = scores + else: + roi_idx = np.tile( + np.arange(bboxes.shape[0] / self.num_instance)[:, None], + (1, self.num_instance)).reshape(-1, 1)[:, 0] + roi_idx = torch.from_numpy(roi_idx).to(bboxes.device).reshape( + -1, 1) + bboxes = torch.cat([bboxes, roi_idx], dim=1) + det_bboxes, det_scores = self.set_nms( + bboxes, scores[:, 1], rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms['iou_threshold'], rcnn_test_cfg.max_per_img) + + results.bboxes = det_bboxes[:, :-1] + results.scores = det_scores + results.labels = torch.zeros_like(det_scores) + + return results + + @staticmethod + def set_nms(bboxes: Tensor, + scores: Tensor, + score_thr: float, + iou_threshold: float, + max_num: int = -1) -> Tuple[Tensor, Tensor]: + """NMS for multi-instance prediction. Please refer to + https://github.com/Purkialo/CrowdDet for more details. + + Args: + bboxes (Tensor): predict bboxes. + scores (Tensor): The score of each predict bbox. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + iou_threshold (float): IoU threshold to be considered as + conflicted. + max_num (int, optional): if there are more than max_num bboxes + after NMS, only top max_num will be kept. Default to -1. + + Returns: + Tuple[Tensor, Tensor]: (bboxes, scores). + """ + + bboxes = bboxes[scores > score_thr] + scores = scores[scores > score_thr] + + ordered_scores, order = scores.sort(descending=True) + ordered_bboxes = bboxes[order] + roi_idx = ordered_bboxes[:, -1] + + keep = torch.ones(len(ordered_bboxes)) == 1 + ruler = torch.arange(len(ordered_bboxes)) + + keep = keep.to(bboxes.device) + ruler = ruler.to(bboxes.device) + + while ruler.shape[0] > 0: + basement = ruler[0] + ruler = ruler[1:] + idx = roi_idx[basement] + # calculate the body overlap + basement_bbox = ordered_bboxes[:, :4][basement].reshape(-1, 4) + ruler_bbox = ordered_bboxes[:, :4][ruler].reshape(-1, 4) + overlap = bbox_overlaps(basement_bbox, ruler_bbox) + indices = torch.where(overlap > iou_threshold)[1] + loc = torch.where(roi_idx[ruler][indices] == idx) + # the mask won't change in the step + mask = keep[ruler[indices][loc]] + keep[ruler[indices]] = False + keep[ruler[indices][loc][mask]] = True + ruler[~keep[ruler]] = -1 + ruler = ruler[ruler > 0] + + keep = keep[order.sort()[1]] + return bboxes[keep][:max_num, :], scores[keep][:max_num] diff --git a/mmdetection/mmdet/models/roi_heads/bbox_heads/sabl_head.py b/mmdetection/mmdet/models/roi_heads/bbox_heads/sabl_head.py new file mode 100644 index 0000000..9a9ee6a --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/bbox_heads/sabl_head.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.layers import multiclass_nms +from mmdet.models.losses import accuracy +from mmdet.models.task_modules import SamplingResult +from mmdet.models.utils import multi_apply +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig +from .bbox_head import BBoxHead + + +@MODELS.register_module() +class SABLHead(BBoxHead): + """Side-Aware Boundary Localization (SABL) for RoI-Head. + + Side-Aware features are extracted by conv layers + with an attention mechanism. 
+ Boundary Localization with Bucketing and Bucketing Guided Rescoring + are implemented in BucketingBBoxCoder. + + Please refer to https://arxiv.org/abs/1912.04260 for more details. + + Args: + cls_in_channels (int): Input channels of cls RoI feature. \ + Defaults to 256. + reg_in_channels (int): Input channels of reg RoI feature. \ + Defaults to 256. + roi_feat_size (int): Size of RoI features. Defaults to 7. + reg_feat_up_ratio (int): Upsample ratio of reg features. \ + Defaults to 2. + reg_pre_kernel (int): Kernel of 2D conv layers before \ + attention pooling. Defaults to 3. + reg_post_kernel (int): Kernel of 1D conv layers after \ + attention pooling. Defaults to 3. + reg_pre_num (int): Number of pre convs. Defaults to 2. + reg_post_num (int): Number of post convs. Defaults to 1. + num_classes (int): Number of classes in dataset. Defaults to 80. + cls_out_channels (int): Hidden channels in cls fcs. Defaults to 1024. + reg_offset_out_channels (int): Hidden and output channel \ + of reg offset branch. Defaults to 256. + reg_cls_out_channels (int): Hidden and output channel \ + of reg cls branch. Defaults to 256. + num_cls_fcs (int): Number of fcs for cls branch. Defaults to 1. + num_reg_fcs (int): Number of fcs for reg branch.. Defaults to 0. + reg_class_agnostic (bool): Class agnostic regression or not. \ + Defaults to True. + norm_cfg (dict): Config of norm layers. Defaults to None. + bbox_coder (dict): Config of bbox coder. Defaults 'BucketingBBoxCoder'. + loss_cls (dict): Config of classification loss. + loss_bbox_cls (dict): Config of classification loss for bbox branch. + loss_bbox_reg (dict): Config of regression loss for bbox branch. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + cls_in_channels: int = 256, + reg_in_channels: int = 256, + roi_feat_size: int = 7, + reg_feat_up_ratio: int = 2, + reg_pre_kernel: int = 3, + reg_post_kernel: int = 3, + reg_pre_num: int = 2, + reg_post_num: int = 1, + cls_out_channels: int = 1024, + reg_offset_out_channels: int = 256, + reg_cls_out_channels: int = 256, + num_cls_fcs: int = 1, + num_reg_fcs: int = 0, + reg_class_agnostic: bool = True, + norm_cfg: OptConfigType = None, + bbox_coder: ConfigType = dict( + type='BucketingBBoxCoder', + num_buckets=14, + scale_factor=1.7), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox_reg: ConfigType = dict( + type='SmoothL1Loss', beta=0.1, loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + super(BBoxHead, self).__init__(init_cfg=init_cfg) + self.cls_in_channels = cls_in_channels + self.reg_in_channels = reg_in_channels + self.roi_feat_size = roi_feat_size + self.reg_feat_up_ratio = int(reg_feat_up_ratio) + self.num_buckets = bbox_coder['num_buckets'] + assert self.reg_feat_up_ratio // 2 >= 1 + self.up_reg_feat_size = roi_feat_size * self.reg_feat_up_ratio + assert self.up_reg_feat_size == bbox_coder['num_buckets'] + self.reg_pre_kernel = reg_pre_kernel + self.reg_post_kernel = reg_post_kernel + self.reg_pre_num = reg_pre_num + self.reg_post_num = reg_post_num + self.num_classes = num_classes + self.cls_out_channels = cls_out_channels + self.reg_offset_out_channels = reg_offset_out_channels + self.reg_cls_out_channels = reg_cls_out_channels + self.num_cls_fcs = num_cls_fcs + self.num_reg_fcs = num_reg_fcs + self.reg_class_agnostic = 
reg_class_agnostic + assert self.reg_class_agnostic + self.norm_cfg = norm_cfg + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox_cls = MODELS.build(loss_bbox_cls) + self.loss_bbox_reg = MODELS.build(loss_bbox_reg) + + self.cls_fcs = self._add_fc_branch(self.num_cls_fcs, + self.cls_in_channels, + self.roi_feat_size, + self.cls_out_channels) + + self.side_num = int(np.ceil(self.num_buckets / 2)) + + if self.reg_feat_up_ratio > 1: + self.upsample_x = nn.ConvTranspose1d( + reg_in_channels, + reg_in_channels, + self.reg_feat_up_ratio, + stride=self.reg_feat_up_ratio) + self.upsample_y = nn.ConvTranspose1d( + reg_in_channels, + reg_in_channels, + self.reg_feat_up_ratio, + stride=self.reg_feat_up_ratio) + + self.reg_pre_convs = nn.ModuleList() + for i in range(self.reg_pre_num): + reg_pre_conv = ConvModule( + reg_in_channels, + reg_in_channels, + kernel_size=reg_pre_kernel, + padding=reg_pre_kernel // 2, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.reg_pre_convs.append(reg_pre_conv) + + self.reg_post_conv_xs = nn.ModuleList() + for i in range(self.reg_post_num): + reg_post_conv_x = ConvModule( + reg_in_channels, + reg_in_channels, + kernel_size=(1, reg_post_kernel), + padding=(0, reg_post_kernel // 2), + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.reg_post_conv_xs.append(reg_post_conv_x) + self.reg_post_conv_ys = nn.ModuleList() + for i in range(self.reg_post_num): + reg_post_conv_y = ConvModule( + reg_in_channels, + reg_in_channels, + kernel_size=(reg_post_kernel, 1), + padding=(reg_post_kernel // 2, 0), + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.reg_post_conv_ys.append(reg_post_conv_y) + + self.reg_conv_att_x = nn.Conv2d(reg_in_channels, 1, 1) + self.reg_conv_att_y = nn.Conv2d(reg_in_channels, 1, 1) + + self.fc_cls = nn.Linear(self.cls_out_channels, self.num_classes + 1) + self.relu = nn.ReLU(inplace=True) + + self.reg_cls_fcs = self._add_fc_branch(self.num_reg_fcs, + self.reg_in_channels, 1, + self.reg_cls_out_channels) + self.reg_offset_fcs = self._add_fc_branch(self.num_reg_fcs, + self.reg_in_channels, 1, + self.reg_offset_out_channels) + self.fc_reg_cls = nn.Linear(self.reg_cls_out_channels, 1) + self.fc_reg_offset = nn.Linear(self.reg_offset_out_channels, 1) + + if init_cfg is None: + self.init_cfg = [ + dict( + type='Xavier', + layer='Linear', + distribution='uniform', + override=[ + dict(type='Normal', name='reg_conv_att_x', std=0.01), + dict(type='Normal', name='reg_conv_att_y', std=0.01), + dict(type='Normal', name='fc_reg_cls', std=0.01), + dict(type='Normal', name='fc_cls', std=0.01), + dict(type='Normal', name='fc_reg_offset', std=0.001) + ]) + ] + if self.reg_feat_up_ratio > 1: + self.init_cfg += [ + dict( + type='Kaiming', + distribution='normal', + override=[ + dict(name='upsample_x'), + dict(name='upsample_y') + ]) + ] + + def _add_fc_branch(self, num_branch_fcs: int, in_channels: int, + roi_feat_size: int, + fc_out_channels: int) -> nn.ModuleList: + """build fc layers.""" + in_channels = in_channels * roi_feat_size * roi_feat_size + branch_fcs = nn.ModuleList() + for i in range(num_branch_fcs): + fc_in_channels = (in_channels if i == 0 else fc_out_channels) + branch_fcs.append(nn.Linear(fc_in_channels, fc_out_channels)) + return branch_fcs + + def cls_forward(self, cls_x: Tensor) -> Tensor: + """forward of classification fc layers.""" + cls_x = cls_x.view(cls_x.size(0), -1) + for fc in self.cls_fcs: + cls_x = self.relu(fc(cls_x)) + cls_score = self.fc_cls(cls_x) + return cls_score + 
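# A plain-PyTorch sketch of the attention pooling implemented just below
# (illustrative only: the real head uses two separate 1x1 convs, one per
# direction; here a single conv stands in for both):
import torch
import torch.nn as nn

reg_x = torch.rand(2, 256, 7, 7)              # RoI feature map (N, C, H, W)
att_conv = nn.Conv2d(256, 1, 1)               # stand-in for reg_conv_att_x / _y

att = att_conv(reg_x).sigmoid()               # (N, 1, H, W) attention map
att_x = att / att.sum(dim=2, keepdim=True)    # normalize over H
att_y = att / att.sum(dim=3, keepdim=True)    # normalize over W

reg_fx = (reg_x * att_x).sum(dim=2)           # (N, C, W): x-direction feature
reg_fy = (reg_x * att_y).sum(dim=3)           # (N, C, H): y-direction feature
assert reg_fx.shape == (2, 256, 7) and reg_fy.shape == (2, 256, 7)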
+ def attention_pool(self, reg_x: Tensor) -> tuple: + """Extract direction-specific features fx and fy with attention + methanism.""" + reg_fx = reg_x + reg_fy = reg_x + reg_fx_att = self.reg_conv_att_x(reg_fx).sigmoid() + reg_fy_att = self.reg_conv_att_y(reg_fy).sigmoid() + reg_fx_att = reg_fx_att / reg_fx_att.sum(dim=2).unsqueeze(2) + reg_fy_att = reg_fy_att / reg_fy_att.sum(dim=3).unsqueeze(3) + reg_fx = (reg_fx * reg_fx_att).sum(dim=2) + reg_fy = (reg_fy * reg_fy_att).sum(dim=3) + return reg_fx, reg_fy + + def side_aware_feature_extractor(self, reg_x: Tensor) -> tuple: + """Refine and extract side-aware features without split them.""" + for reg_pre_conv in self.reg_pre_convs: + reg_x = reg_pre_conv(reg_x) + reg_fx, reg_fy = self.attention_pool(reg_x) + + if self.reg_post_num > 0: + reg_fx = reg_fx.unsqueeze(2) + reg_fy = reg_fy.unsqueeze(3) + for i in range(self.reg_post_num): + reg_fx = self.reg_post_conv_xs[i](reg_fx) + reg_fy = self.reg_post_conv_ys[i](reg_fy) + reg_fx = reg_fx.squeeze(2) + reg_fy = reg_fy.squeeze(3) + if self.reg_feat_up_ratio > 1: + reg_fx = self.relu(self.upsample_x(reg_fx)) + reg_fy = self.relu(self.upsample_y(reg_fy)) + reg_fx = torch.transpose(reg_fx, 1, 2) + reg_fy = torch.transpose(reg_fy, 1, 2) + return reg_fx.contiguous(), reg_fy.contiguous() + + def reg_pred(self, x: Tensor, offset_fcs: nn.ModuleList, + cls_fcs: nn.ModuleList) -> tuple: + """Predict bucketing estimation (cls_pred) and fine regression (offset + pred) with side-aware features.""" + x_offset = x.view(-1, self.reg_in_channels) + x_cls = x.view(-1, self.reg_in_channels) + + for fc in offset_fcs: + x_offset = self.relu(fc(x_offset)) + for fc in cls_fcs: + x_cls = self.relu(fc(x_cls)) + offset_pred = self.fc_reg_offset(x_offset) + cls_pred = self.fc_reg_cls(x_cls) + + offset_pred = offset_pred.view(x.size(0), -1) + cls_pred = cls_pred.view(x.size(0), -1) + + return offset_pred, cls_pred + + def side_aware_split(self, feat: Tensor) -> Tensor: + """Split side-aware features aligned with orders of bucketing + targets.""" + l_end = int(np.ceil(self.up_reg_feat_size / 2)) + r_start = int(np.floor(self.up_reg_feat_size / 2)) + feat_fl = feat[:, :l_end] + feat_fr = feat[:, r_start:].flip(dims=(1, )) + feat_fl = feat_fl.contiguous() + feat_fr = feat_fr.contiguous() + feat = torch.cat([feat_fl, feat_fr], dim=-1) + return feat + + def bbox_pred_split(self, bbox_pred: tuple, + num_proposals_per_img: Sequence[int]) -> tuple: + """Split batch bbox prediction back to each image.""" + bucket_cls_preds, bucket_offset_preds = bbox_pred + bucket_cls_preds = bucket_cls_preds.split(num_proposals_per_img, 0) + bucket_offset_preds = bucket_offset_preds.split( + num_proposals_per_img, 0) + bbox_pred = tuple(zip(bucket_cls_preds, bucket_offset_preds)) + return bbox_pred + + def reg_forward(self, reg_x: Tensor) -> tuple: + """forward of regression branch.""" + outs = self.side_aware_feature_extractor(reg_x) + edge_offset_preds = [] + edge_cls_preds = [] + reg_fx = outs[0] + reg_fy = outs[1] + offset_pred_x, cls_pred_x = self.reg_pred(reg_fx, self.reg_offset_fcs, + self.reg_cls_fcs) + offset_pred_y, cls_pred_y = self.reg_pred(reg_fy, self.reg_offset_fcs, + self.reg_cls_fcs) + offset_pred_x = self.side_aware_split(offset_pred_x) + offset_pred_y = self.side_aware_split(offset_pred_y) + cls_pred_x = self.side_aware_split(cls_pred_x) + cls_pred_y = self.side_aware_split(cls_pred_y) + edge_offset_preds = torch.cat([offset_pred_x, offset_pred_y], dim=-1) + edge_cls_preds = torch.cat([cls_pred_x, cls_pred_y], dim=-1) + + return 
edge_cls_preds, edge_offset_preds + + def forward(self, x: Tensor) -> tuple: + """Forward features from the upstream network.""" + bbox_pred = self.reg_forward(x) + cls_score = self.cls_forward(x) + + return cls_score, bbox_pred + + def get_targets(self, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Calculate the ground truth for all samples in a batch according to + the sampling_results.""" + pos_proposals = [res.pos_bboxes for res in sampling_results] + neg_proposals = [res.neg_bboxes for res in sampling_results] + pos_gt_bboxes = [res.pos_gt_bboxes for res in sampling_results] + pos_gt_labels = [res.pos_gt_labels for res in sampling_results] + cls_reg_targets = self.bucket_target( + pos_proposals, + neg_proposals, + pos_gt_bboxes, + pos_gt_labels, + rcnn_train_cfg, + concat=concat) + (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) = cls_reg_targets + return (labels, label_weights, (bucket_cls_targets, + bucket_offset_targets), + (bucket_cls_weights, bucket_offset_weights)) + + def bucket_target(self, + pos_proposals_list: list, + neg_proposals_list: list, + pos_gt_bboxes_list: list, + pos_gt_labels_list: list, + rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Compute bucketing estimation targets and fine regression targets for + a batch of images.""" + (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) = multi_apply( + self._bucket_target_single, + pos_proposals_list, + neg_proposals_list, + pos_gt_bboxes_list, + pos_gt_labels_list, + cfg=rcnn_train_cfg) + + if concat: + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + bucket_cls_targets = torch.cat(bucket_cls_targets, 0) + bucket_cls_weights = torch.cat(bucket_cls_weights, 0) + bucket_offset_targets = torch.cat(bucket_offset_targets, 0) + bucket_offset_weights = torch.cat(bucket_offset_weights, 0) + return (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) + + def _bucket_target_single(self, pos_proposals: Tensor, + neg_proposals: Tensor, pos_gt_bboxes: Tensor, + pos_gt_labels: Tensor, cfg: ConfigDict) -> tuple: + """Compute bucketing estimation targets and fine regression targets for + a single image. + + Args: + pos_proposals (Tensor): positive proposals of a single image, + Shape (n_pos, 4) + neg_proposals (Tensor): negative proposals of a single image, + Shape (n_neg, 4). + pos_gt_bboxes (Tensor): gt bboxes assigned to positive proposals + of a single image, Shape (n_pos, 4). + pos_gt_labels (Tensor): gt labels assigned to positive proposals + of a single image, Shape (n_pos, ). + cfg (dict): Config of calculating targets + + Returns: + tuple: + + - labels (Tensor): Labels in a single image. Shape (n,). + - label_weights (Tensor): Label weights in a single image. + Shape (n,) + - bucket_cls_targets (Tensor): Bucket cls targets in + a single image. Shape (n, num_buckets*2). + - bucket_cls_weights (Tensor): Bucket cls weights in + a single image. Shape (n, num_buckets*2). + - bucket_offset_targets (Tensor): Bucket offset targets + in a single image. Shape (n, num_buckets*2). + - bucket_offset_targets (Tensor): Bucket offset weights + in a single image. Shape (n, num_buckets*2). 
+ """ + num_pos = pos_proposals.size(0) + num_neg = neg_proposals.size(0) + num_samples = num_pos + num_neg + labels = pos_gt_bboxes.new_full((num_samples, ), + self.num_classes, + dtype=torch.long) + label_weights = pos_proposals.new_zeros(num_samples) + bucket_cls_targets = pos_proposals.new_zeros(num_samples, + 4 * self.side_num) + bucket_cls_weights = pos_proposals.new_zeros(num_samples, + 4 * self.side_num) + bucket_offset_targets = pos_proposals.new_zeros( + num_samples, 4 * self.side_num) + bucket_offset_weights = pos_proposals.new_zeros( + num_samples, 4 * self.side_num) + if num_pos > 0: + labels[:num_pos] = pos_gt_labels + label_weights[:num_pos] = 1.0 + (pos_bucket_offset_targets, pos_bucket_offset_weights, + pos_bucket_cls_targets, + pos_bucket_cls_weights) = self.bbox_coder.encode( + pos_proposals, pos_gt_bboxes) + bucket_cls_targets[:num_pos, :] = pos_bucket_cls_targets + bucket_cls_weights[:num_pos, :] = pos_bucket_cls_weights + bucket_offset_targets[:num_pos, :] = pos_bucket_offset_targets + bucket_offset_weights[:num_pos, :] = pos_bucket_offset_weights + if num_neg > 0: + label_weights[-num_neg:] = 1.0 + return (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) + + def loss(self, + cls_score: Tensor, + bbox_pred: Tuple[Tensor, Tensor], + rois: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tuple[Tensor, Tensor], + bbox_weights: Tuple[Tensor, Tensor], + reduction_override: Optional[str] = None) -> dict: + """Calculate the loss based on the network predictions and targets. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): A tuple of regression prediction results + containing `bucket_cls_preds and` `bucket_offset_preds`. + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, ). + label_weights (Tensor): Labels_weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, ). + bbox_targets (Tuple[Tensor, Tensor]): A tuple of regression target + containing `bucket_cls_targets` and `bucket_offset_targets`. + the last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + bbox_weights (Tuple[Tensor, Tensor]): A tuple of regression + weights containing `bucket_cls_weights` and + `bucket_offset_weights`. + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None, + + Returns: + dict: A dictionary of loss. + """ + losses = dict() + if cls_score is not None: + avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.) 
+ losses['loss_cls'] = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=avg_factor, + reduction_override=reduction_override) + losses['acc'] = accuracy(cls_score, labels) + + if bbox_pred is not None: + bucket_cls_preds, bucket_offset_preds = bbox_pred + bucket_cls_targets, bucket_offset_targets = bbox_targets + bucket_cls_weights, bucket_offset_weights = bbox_weights + # edge cls + bucket_cls_preds = bucket_cls_preds.view(-1, self.side_num) + bucket_cls_targets = bucket_cls_targets.view(-1, self.side_num) + bucket_cls_weights = bucket_cls_weights.view(-1, self.side_num) + losses['loss_bbox_cls'] = self.loss_bbox_cls( + bucket_cls_preds, + bucket_cls_targets, + bucket_cls_weights, + avg_factor=bucket_cls_targets.size(0), + reduction_override=reduction_override) + + losses['loss_bbox_reg'] = self.loss_bbox_reg( + bucket_offset_preds, + bucket_offset_targets, + bucket_offset_weights, + avg_factor=bucket_offset_targets.size(0), + reduction_override=reduction_override) + + return losses + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tuple[Tensor, Tensor], + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tuple[Tensor, Tensor]): Box cls preds and offset preds. + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results = InstanceData() + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + scores = F.softmax(cls_score, dim=1) if cls_score is not None else None + img_shape = img_meta['img_shape'] + if bbox_pred is not None: + bboxes, confidences = self.bbox_coder.decode( + roi[:, 1:], bbox_pred, img_shape) + else: + bboxes = roi[:, 1:].clone() + confidences = None + if img_shape is not None: + bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1] - 1) + bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0] - 1) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + bboxes = (bboxes.view(bboxes.size(0), -1, 4) / scale_factor).view( + bboxes.size()[0], -1) + + if rcnn_test_cfg is None: + results.bboxes = bboxes + results.scores = scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, + scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img, + score_factors=confidences) + results.bboxes = det_bboxes[:, :4] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results + + def refine_bboxes(self, sampling_results: List[SamplingResult], + bbox_results: dict, + batch_img_metas: List[dict]) -> InstanceList: + """Refine bboxes during training. 
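+
+        The box predictions of the current stage are decoded into refined
+        boxes, and proposals that correspond to ground-truth boxes are
+        filtered out, so that the remaining boxes can be used as proposals
+        for the next stage.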
+ + Args: + sampling_results (List[:obj:`SamplingResult`]): Sampling results. + bbox_results (dict): Usually is a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + batch_img_metas (List[dict]): List of image information. + + Returns: + list[:obj:`InstanceData`]: Refined bboxes of each image. + """ + pos_is_gts = [res.pos_is_gt for res in sampling_results] + # bbox_targets is a tuple + labels = bbox_results['bbox_targets'][0] + cls_scores = bbox_results['cls_score'] + rois = bbox_results['rois'] + bbox_preds = bbox_results['bbox_pred'] + + if cls_scores.numel() == 0: + return None + + labels = torch.where(labels == self.num_classes, + cls_scores[:, :-1].argmax(1), labels) + + img_ids = rois[:, 0].long().unique(sorted=True) + assert img_ids.numel() <= len(batch_img_metas) + + results_list = [] + for i in range(len(batch_img_metas)): + inds = torch.nonzero( + rois[:, 0] == i, as_tuple=False).squeeze(dim=1) + num_rois = inds.numel() + + bboxes_ = rois[inds, 1:] + label_ = labels[inds] + edge_cls_preds, edge_offset_preds = bbox_preds + edge_cls_preds_ = edge_cls_preds[inds] + edge_offset_preds_ = edge_offset_preds[inds] + bbox_pred_ = (edge_cls_preds_, edge_offset_preds_) + img_meta_ = batch_img_metas[i] + pos_is_gts_ = pos_is_gts[i] + + bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_, + img_meta_) + # filter gt bboxes + pos_keep = 1 - pos_is_gts_ + keep_inds = pos_is_gts_.new_ones(num_rois) + keep_inds[:len(pos_is_gts_)] = pos_keep + results = InstanceData(bboxes=bboxes[keep_inds.type(torch.bool)]) + results_list.append(results) + + return results_list + + def regress_by_class(self, rois: Tensor, label: Tensor, bbox_pred: tuple, + img_meta: dict) -> Tensor: + """Regress the bbox for the predicted class. Used in Cascade R-CNN. + + Args: + rois (Tensor): shape (n, 4) or (n, 5) + label (Tensor): shape (n, ) + bbox_pred (Tuple[Tensor]): shape [(n, num_buckets *2), \ + (n, num_buckets *2)] + img_meta (dict): Image meta info. + + Returns: + Tensor: Regressed bboxes, the same shape as input rois. + """ + assert rois.size(1) == 4 or rois.size(1) == 5 + + if rois.size(1) == 4: + new_rois, _ = self.bbox_coder.decode(rois, bbox_pred, + img_meta['img_shape']) + else: + bboxes, _ = self.bbox_coder.decode(rois[:, 1:], bbox_pred, + img_meta['img_shape']) + new_rois = torch.cat((rois[:, [0]], bboxes), dim=1) + + return new_rois diff --git a/mmdetection/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py b/mmdetection/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py new file mode 100644 index 0000000..790b08f --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +from torch import Tensor + +from mmdet.registry import MODELS +from .convfc_bbox_head import ConvFCBBoxHead + + +@MODELS.register_module() +class SCNetBBoxHead(ConvFCBBoxHead): + """BBox head for `SCNet `_. + + This inherits ``ConvFCBBoxHead`` with modified forward() function, allow us + to get intermediate shared feature. + """ + + def _forward_shared(self, x: Tensor) -> Tensor: + """Forward function for shared part. + + Args: + x (Tensor): Input feature. 
+ + Returns: + Tensor: Shared feature. + """ + if self.num_shared_convs > 0: + for conv in self.shared_convs: + x = conv(x) + + if self.num_shared_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + + x = x.flatten(1) + + for fc in self.shared_fcs: + x = self.relu(fc(x)) + + return x + + def _forward_cls_reg(self, x: Tensor) -> Tuple[Tensor]: + """Forward function for classification and regression parts. + + Args: + x (Tensor): Input feature. + + Returns: + tuple[Tensor]: + + - cls_score (Tensor): classification prediction. + - bbox_pred (Tensor): bbox prediction. + """ + x_cls = x + x_reg = x + + for conv in self.cls_convs: + x_cls = conv(x_cls) + if x_cls.dim() > 2: + if self.with_avg_pool: + x_cls = self.avg_pool(x_cls) + x_cls = x_cls.flatten(1) + for fc in self.cls_fcs: + x_cls = self.relu(fc(x_cls)) + + for conv in self.reg_convs: + x_reg = conv(x_reg) + if x_reg.dim() > 2: + if self.with_avg_pool: + x_reg = self.avg_pool(x_reg) + x_reg = x_reg.flatten(1) + for fc in self.reg_fcs: + x_reg = self.relu(fc(x_reg)) + + cls_score = self.fc_cls(x_cls) if self.with_cls else None + bbox_pred = self.fc_reg(x_reg) if self.with_reg else None + + return cls_score, bbox_pred + + def forward( + self, + x: Tensor, + return_shared_feat: bool = False) -> Union[Tensor, Tuple[Tensor]]: + """Forward function. + + Args: + x (Tensor): input features + return_shared_feat (bool): If True, return cls-reg-shared feature. + + Return: + out (tuple[Tensor]): contain ``cls_score`` and ``bbox_pred``, + if ``return_shared_feat`` is True, append ``x_shared`` to the + returned tuple. + """ + x_shared = self._forward_shared(x) + out = self._forward_cls_reg(x_shared) + + if return_shared_feat: + out += (x_shared, ) + + return out diff --git a/mmdetection/mmdet/models/roi_heads/cascade_roi_head.py b/mmdetection/mmdet/models/roi_heads/cascade_roi_head.py new file mode 100644 index 0000000..81db671 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/cascade_roi_head.py @@ -0,0 +1,568 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.model import ModuleList +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.test_time_augs import merge_aug_masks +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi, get_box_tensor +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptMultiConfig) +from ..utils.misc import empty_instances, unpack_gt_instances +from .base_roi_head import BaseRoIHead + + +@MODELS.register_module() +class CascadeRoIHead(BaseRoIHead): + """Cascade roi head including one bbox head and one mask head. 
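+
+    Each stage has its own bbox head, RoI extractor, assigner and sampler;
+    the boxes refined by one stage serve as the proposals of the next stage,
+    and the classification scores of all stages are averaged at test time.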
+ + https://arxiv.org/abs/1712.00726 + """ + + def __init__(self, + num_stages: int, + stage_loss_weights: Union[List[float], Tuple[float]], + bbox_roi_extractor: OptMultiConfig = None, + bbox_head: OptMultiConfig = None, + mask_roi_extractor: OptMultiConfig = None, + mask_head: OptMultiConfig = None, + shared_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + assert bbox_roi_extractor is not None + assert bbox_head is not None + assert shared_head is None, \ + 'Shared head is not supported in Cascade RCNN anymore' + + self.num_stages = num_stages + self.stage_loss_weights = stage_loss_weights + super().__init__( + bbox_roi_extractor=bbox_roi_extractor, + bbox_head=bbox_head, + mask_roi_extractor=mask_roi_extractor, + mask_head=mask_head, + shared_head=shared_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + def init_bbox_head(self, bbox_roi_extractor: MultiConfig, + bbox_head: MultiConfig) -> None: + """Initialize box head and box roi extractor. + + Args: + bbox_roi_extractor (:obj:`ConfigDict`, dict or list): + Config of box roi extractor. + bbox_head (:obj:`ConfigDict`, dict or list): Config + of box in box head. + """ + self.bbox_roi_extractor = ModuleList() + self.bbox_head = ModuleList() + if not isinstance(bbox_roi_extractor, list): + bbox_roi_extractor = [ + bbox_roi_extractor for _ in range(self.num_stages) + ] + if not isinstance(bbox_head, list): + bbox_head = [bbox_head for _ in range(self.num_stages)] + assert len(bbox_roi_extractor) == len(bbox_head) == self.num_stages + for roi_extractor, head in zip(bbox_roi_extractor, bbox_head): + self.bbox_roi_extractor.append(MODELS.build(roi_extractor)) + self.bbox_head.append(MODELS.build(head)) + + def init_mask_head(self, mask_roi_extractor: MultiConfig, + mask_head: MultiConfig) -> None: + """Initialize mask head and mask roi extractor. + + Args: + mask_head (dict): Config of mask in mask head. + mask_roi_extractor (:obj:`ConfigDict`, dict or list): + Config of mask roi extractor. + """ + self.mask_head = nn.ModuleList() + if not isinstance(mask_head, list): + mask_head = [mask_head for _ in range(self.num_stages)] + assert len(mask_head) == self.num_stages + for head in mask_head: + self.mask_head.append(MODELS.build(head)) + if mask_roi_extractor is not None: + self.share_roi_extractor = False + self.mask_roi_extractor = ModuleList() + if not isinstance(mask_roi_extractor, list): + mask_roi_extractor = [ + mask_roi_extractor for _ in range(self.num_stages) + ] + assert len(mask_roi_extractor) == self.num_stages + for roi_extractor in mask_roi_extractor: + self.mask_roi_extractor.append(MODELS.build(roi_extractor)) + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + + def init_assigner_sampler(self) -> None: + """Initialize assigner and sampler for each stage.""" + self.bbox_assigner = [] + self.bbox_sampler = [] + if self.train_cfg is not None: + for idx, rcnn_train_cfg in enumerate(self.train_cfg): + self.bbox_assigner.append( + TASK_UTILS.build(rcnn_train_cfg.assigner)) + self.current_stage = idx + self.bbox_sampler.append( + TASK_UTILS.build( + rcnn_train_cfg.sampler, + default_args=dict(context=self))) + + def _bbox_forward(self, stage: int, x: Tuple[Tensor], + rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. 
+ rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def bbox_loss(self, stage: int, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Run forward function and calculate loss for box head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + """ + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(stage, x, rois) + bbox_results.update(rois=rois) + + bbox_loss_and_target = bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg[stage]) + bbox_results.update(bbox_loss_and_target) + + return bbox_results + + def _mask_forward(self, stage: int, x: Tuple[Tensor], + rois: Tensor) -> dict: + """Mask head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + """ + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + mask_preds = mask_head(mask_feats) + + mask_results = dict(mask_preds=mask_preds) + return mask_results + + def mask_loss(self, stage: int, x: Tuple[Tensor], + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. 
+ - `loss_mask` (dict): A dictionary of mask loss components. + """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(stage, x, pos_rois) + + mask_head = self.mask_head[stage] + + mask_loss_and_target = mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[stage]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + # TODO: May add a new function in baseroihead + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + num_imgs = len(batch_data_samples) + losses = dict() + results_list = rpn_results_list + for stage in range(self.num_stages): + self.current_stage = stage + + stage_loss_weight = self.stage_loss_weights[stage] + + # assign gts and sample proposals + sampling_results = [] + if self.with_bbox or self.with_mask: + bbox_assigner = self.bbox_assigner[stage] + bbox_sampler = self.bbox_sampler[stage] + + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + results.priors = results.pop('bboxes') + + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self.bbox_loss(stage, x, sampling_results) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(stage, x, sampling_results, + batch_gt_instances) + for name, value in mask_results['loss_mask'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # refine bboxes + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results, bbox_results, batch_img_metas) + # Empty proposal + if results_list is None: + break + return losses + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False, + **kwargs) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. 
+ rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + proposals = [res.bboxes for res in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, + rois.device, + task_type='bbox', + box_type=self.bbox_head[-1].predict_box_type, + num_classes=self.bbox_head[-1].num_classes, + score_per_cls=rcnn_test_cfg is None) + + rois, cls_scores, bbox_preds = self._refine_roi( + x=x, + rois=rois, + batch_img_metas=batch_img_metas, + num_proposals_per_img=num_proposals_per_img, + **kwargs) + + results_list = self.bbox_head[-1].predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rescale=rescale, + rcnn_test_cfg=rcnn_test_cfg) + return results_list + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: List[InstanceData], + rescale: bool = False) -> List[InstanceData]: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). 
+ """ + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + num_mask_rois_per_img = [len(res) for res in results_list] + aug_masks = [] + for stage in range(self.num_stages): + mask_results = self._mask_forward(stage, x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + aug_masks.append([m.sigmoid().detach() for m in mask_preds]) + + merged_masks = [] + for i in range(len(batch_img_metas)): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results_list = self.mask_head[-1].predict_by_feat( + mask_preds=merged_masks, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale, + activate_map=True) + return results_list + + def _refine_roi(self, x: Tuple[Tensor], rois: Tensor, + batch_img_metas: List[dict], + num_proposals_per_img: Sequence[int], **kwargs) -> tuple: + """Multi-stage refinement of RoI. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2] + batch_img_metas (list[dict]): List of image information. + num_proposals_per_img (sequence[int]): number of proposals + in each image. + + Returns: + tuple: + + - rois (Tensor): Refined RoI. + - cls_scores (list[Tensor]): Average predicted + cls score per image. + - bbox_preds (list[Tensor]): Bbox branch predictions + for the last stage of per image. + """ + # "ms" in variable names means multi-stage + ms_scores = [] + for stage in range(self.num_stages): + bbox_results = self._bbox_forward( + stage=stage, x=x, rois=rois, **kwargs) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'] + bbox_preds = bbox_results['bbox_pred'] + + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + ms_scores.append(cls_scores) + + # some detector with_reg is False, bbox_preds will be None + if bbox_preds is not None: + # TODO move this to a sabl_roi_head + # the bbox prediction of some detectors like SABL is not Tensor + if isinstance(bbox_preds, torch.Tensor): + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + else: + bbox_preds = self.bbox_head[stage].bbox_pred_split( + bbox_preds, num_proposals_per_img) + else: + bbox_preds = (None, ) * len(batch_img_metas) + + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + if bbox_head.custom_activation: + cls_scores = [ + bbox_head.loss_cls.get_activation(s) + for s in cls_scores + ] + refine_rois_list = [] + for i in range(len(batch_img_metas)): + if rois[i].shape[0] > 0: + bbox_label = cls_scores[i][:, :-1].argmax(dim=1) + # Refactor `bbox_head.regress_by_class` to only accept + # box tensor without img_idx concatenated. 
+ refined_bboxes = bbox_head.regress_by_class( + rois[i][:, 1:], bbox_label, bbox_preds[i], + batch_img_metas[i]) + refined_bboxes = get_box_tensor(refined_bboxes) + refined_rois = torch.cat( + [rois[i][:, [0]], refined_bboxes], dim=1) + refine_rois_list.append(refined_rois) + rois = torch.cat(refine_rois_list) + + # average scores of each image by stages + cls_scores = [ + sum([score[i] for score in ms_scores]) / float(len(ms_scores)) + for i in range(len(batch_img_metas)) + ] + return rois, cls_scores, bbox_preds + + def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + results = () + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + rois, cls_scores, bbox_preds = self._refine_roi( + x, rois, batch_img_metas, num_proposals_per_img) + results = results + (cls_scores, bbox_preds) + # mask head + if self.with_mask: + aug_masks = [] + rois = torch.cat(rois) + for stage in range(self.num_stages): + mask_results = self._mask_forward(stage, x, rois) + mask_preds = mask_results['mask_preds'] + mask_preds = mask_preds.split(num_proposals_per_img, 0) + aug_masks.append([m.sigmoid().detach() for m in mask_preds]) + + merged_masks = [] + for i in range(len(batch_img_metas)): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results = results + (merged_masks, ) + return results diff --git a/mmdetection/mmdet/models/roi_heads/double_roi_head.py b/mmdetection/mmdet/models/roi_heads/double_roi_head.py new file mode 100644 index 0000000..f9464ff --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/double_roi_head.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class DoubleHeadRoIHead(StandardRoIHead): + """RoI head for `Double Head RCNN `_. + + Args: + reg_roi_scale_factor (float): The scale factor to extend the rois + used to extract the regression features. + """ + + def __init__(self, reg_roi_scale_factor: float, **kwargs): + super().__init__(**kwargs) + self.reg_roi_scale_factor = reg_roi_scale_factor + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. 
+ """ + bbox_cls_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + bbox_reg_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], + rois, + roi_scale_factor=self.reg_roi_scale_factor) + if self.with_shared_head: + bbox_cls_feats = self.shared_head(bbox_cls_feats) + bbox_reg_feats = self.shared_head(bbox_reg_feats) + cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats) + + bbox_results = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + bbox_feats=bbox_cls_feats) + return bbox_results diff --git a/mmdetection/mmdet/models/roi_heads/dynamic_roi_head.py b/mmdetection/mmdet/models/roi_heads/dynamic_roi_head.py new file mode 100644 index 0000000..3c7f7bd --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/dynamic_roi_head.py @@ -0,0 +1,163 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import numpy as np +import torch +from torch import Tensor + +from mmdet.models.losses import SmoothL1Loss +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList +from ..utils.misc import unpack_gt_instances +from .standard_roi_head import StandardRoIHead + +EPS = 1e-15 + + +@MODELS.register_module() +class DynamicRoIHead(StandardRoIHead): + """RoI head for `Dynamic R-CNN `_.""" + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + assert isinstance(self.bbox_head.loss_bbox, SmoothL1Loss) + # the IoU history of the past `update_iter_interval` iterations + self.iou_history = [] + # the beta history of the past `update_iter_interval` iterations + self.beta_history = [] + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Forward function for training. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + cur_iou = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + # record the `iou_topk`-th largest IoU in an image + iou_topk = min(self.train_cfg.dynamic_rcnn.iou_topk, + len(assign_result.max_overlaps)) + ious, _ = torch.topk(assign_result.max_overlaps, iou_topk) + cur_iou.append(ious[-1].item()) + sampling_results.append(sampling_result) + # average the current IoUs over images + cur_iou = np.mean(cur_iou) + self.iou_history.append(cur_iou) + + losses = dict() + # bbox head forward and loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) + + # update IoU threshold and SmoothL1 beta + update_iter_interval = self.train_cfg.dynamic_rcnn.update_iter_interval + if len(self.iou_history) % update_iter_interval == 0: + new_iou_thr, new_beta = self.update_hyperparameters() + + return losses + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. 
+ """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + + # record the `beta_topk`-th smallest target + # `bbox_targets[2]` and `bbox_targets[3]` stand for bbox_targets + # and bbox_weights, respectively + bbox_targets = bbox_loss_and_target['bbox_targets'] + pos_inds = bbox_targets[3][:, 0].nonzero().squeeze(1) + num_pos = len(pos_inds) + num_imgs = len(sampling_results) + if num_pos > 0: + cur_target = bbox_targets[2][pos_inds, :2].abs().mean(dim=1) + beta_topk = min(self.train_cfg.dynamic_rcnn.beta_topk * num_imgs, + num_pos) + cur_target = torch.kthvalue(cur_target, beta_topk)[0].item() + self.beta_history.append(cur_target) + + return bbox_results + + def update_hyperparameters(self): + """Update hyperparameters like IoU thresholds for assigner and beta for + SmoothL1 loss based on the training statistics. + + Returns: + tuple[float]: the updated ``iou_thr`` and ``beta``. + """ + new_iou_thr = max(self.train_cfg.dynamic_rcnn.initial_iou, + np.mean(self.iou_history)) + self.iou_history = [] + self.bbox_assigner.pos_iou_thr = new_iou_thr + self.bbox_assigner.neg_iou_thr = new_iou_thr + self.bbox_assigner.min_pos_iou = new_iou_thr + if (not self.beta_history) or (np.median(self.beta_history) < EPS): + # avoid 0 or too small value for new_beta + new_beta = self.bbox_head.loss_bbox.beta + else: + new_beta = min(self.train_cfg.dynamic_rcnn.initial_beta, + np.median(self.beta_history)) + self.beta_history = [] + self.bbox_head.loss_bbox.beta = new_beta + return new_iou_thr, new_beta diff --git a/mmdetection/mmdet/models/roi_heads/grid_roi_head.py b/mmdetection/mmdet/models/roi_heads/grid_roi_head.py new file mode 100644 index 0000000..9eda7f0 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/grid_roi_head.py @@ -0,0 +1,280 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils.misc import unpack_gt_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class GridRoIHead(StandardRoIHead): + """Implementation of `Grid RoI Head `_ + + Args: + grid_roi_extractor (:obj:`ConfigDict` or dict): Config of + roi extractor. + grid_head (:obj:`ConfigDict` or dict): Config of grid head + """ + + def __init__(self, grid_roi_extractor: ConfigType, grid_head: ConfigType, + **kwargs) -> None: + assert grid_head is not None + super().__init__(**kwargs) + if grid_roi_extractor is not None: + self.grid_roi_extractor = MODELS.build(grid_roi_extractor) + self.share_roi_extractor = False + else: + self.share_roi_extractor = True + self.grid_roi_extractor = self.bbox_roi_extractor + self.grid_head = MODELS.build(grid_head) + + def _random_jitter(self, + sampling_results: List[SamplingResult], + batch_img_metas: List[dict], + amplitude: float = 0.15) -> List[SamplingResult]: + """Ramdom jitter positive proposals for training. 
+ + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + batch_img_metas (list[dict]): List of image information. + amplitude (float): Amplitude of random offset. Defaults to 0.15. + + Returns: + list[obj:SamplingResult]: SamplingResults after random jittering. + """ + for sampling_result, img_meta in zip(sampling_results, + batch_img_metas): + bboxes = sampling_result.pos_priors + random_offsets = bboxes.new_empty(bboxes.shape[0], 4).uniform_( + -amplitude, amplitude) + # before jittering + cxcy = (bboxes[:, 2:4] + bboxes[:, :2]) / 2 + wh = (bboxes[:, 2:4] - bboxes[:, :2]).abs() + # after jittering + new_cxcy = cxcy + wh * random_offsets[:, :2] + new_wh = wh * (1 + random_offsets[:, 2:]) + # xywh to xyxy + new_x1y1 = (new_cxcy - new_wh / 2) + new_x2y2 = (new_cxcy + new_wh / 2) + new_bboxes = torch.cat([new_x1y1, new_x2y2], dim=1) + # clip bboxes + max_shape = img_meta['img_shape'] + if max_shape is not None: + new_bboxes[:, 0::2].clamp_(min=0, max=max_shape[1] - 1) + new_bboxes[:, 1::2].clamp_(min=0, max=max_shape[0] - 1) + + sampling_result.pos_priors = new_bboxes + return sampling_results + + # TODO: Forward is incorrect and need to refactor. + def forward(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList = None) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (Tuple[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + results = () + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + bbox_results = self._bbox_forward(x, rois) + results = results + (bbox_results['cls_score'], ) + if self.bbox_head.with_reg: + results = results + (bbox_results['bbox_pred'], ) + + # grid head + grid_rois = rois[:100] + grid_feats = self.grid_roi_extractor( + x[:len(self.grid_roi_extractor.featmap_strides)], grid_rois) + if self.with_shared_head: + grid_feats = self.shared_head(grid_feats) + self.grid_head.test_mode = True + grid_preds = self.grid_head(grid_feats) + results = results + (grid_preds, ) + + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_results = self._mask_forward(x, mask_rois) + results = results + (mask_results['mask_preds'], ) + return results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList, **kwargs) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + losses = dict() + # bbox head loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results, batch_img_metas) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) + + return losses + + def bbox_loss(self, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + batch_img_metas: Optional[List[dict]] = None) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list[:obj:`SamplingResult`]): Sampling results. + batch_img_metas (list[dict], optional): Meta information of each + image, e.g., image size, scaling factor, etc. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + assert batch_img_metas is not None + bbox_results = super().bbox_loss(x, sampling_results) + + # Grid head forward and loss + sampling_results = self._random_jitter(sampling_results, + batch_img_metas) + pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + + # GN in head does not support zero shape input + if pos_rois.shape[0] == 0: + return bbox_results + + grid_feats = self.grid_roi_extractor( + x[:self.grid_roi_extractor.num_inputs], pos_rois) + if self.with_shared_head: + grid_feats = self.shared_head(grid_feats) + # Accelerate training + max_sample_num_grid = self.train_cfg.get('max_num_grid', 192) + sample_idx = torch.randperm( + grid_feats.shape[0])[:min(grid_feats.shape[0], max_sample_num_grid + )] + grid_feats = grid_feats[sample_idx] + grid_pred = self.grid_head(grid_feats) + + loss_grid = self.grid_head.loss(grid_pred, sample_idx, + sampling_results, self.train_cfg) + + bbox_results['loss_bbox'].update(loss_grid) + return bbox_results + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. 
+ rcnn_test_cfg (:obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape \ + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last \ + dimension 4 arrange as (x1, y1, x2, y2). + """ + results_list = super().predict_bbox( + x, + batch_img_metas=batch_img_metas, + rpn_results_list=rpn_results_list, + rcnn_test_cfg=rcnn_test_cfg, + rescale=False) + + grid_rois = bbox2roi([res.bboxes for res in results_list]) + if grid_rois.shape[0] != 0: + grid_feats = self.grid_roi_extractor( + x[:len(self.grid_roi_extractor.featmap_strides)], grid_rois) + if self.with_shared_head: + grid_feats = self.shared_head(grid_feats) + self.grid_head.test_mode = True + grid_preds = self.grid_head(grid_feats) + results_list = self.grid_head.predict_by_feat( + grid_preds=grid_preds, + results_list=results_list, + batch_img_metas=batch_img_metas, + rescale=rescale) + + return results_list diff --git a/mmdetection/mmdet/models/roi_heads/htc_roi_head.py b/mmdetection/mmdet/models/roi_heads/htc_roi_head.py new file mode 100644 index 0000000..0fdd99d --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/htc_roi_head.py @@ -0,0 +1,581 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor + +from mmdet.models.test_time_augs import merge_aug_masks +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList, OptConfigType +from ..layers import adaptive_avg_pool2d +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .cascade_roi_head import CascadeRoIHead + + +@MODELS.register_module() +class HybridTaskCascadeRoIHead(CascadeRoIHead): + """Hybrid task cascade roi head including one bbox head and one mask head. + + https://arxiv.org/abs/1901.07518 + + Args: + num_stages (int): Number of cascade stages. + stage_loss_weights (list[float]): Loss weight for every stage. + semantic_roi_extractor (:obj:`ConfigDict` or dict, optional): + Config of semantic roi extractor. Defaults to None. + Semantic_head (:obj:`ConfigDict` or dict, optional): + Config of semantic head. Defaults to None. + interleaved (bool): Whether to interleaves the box branch and mask + branch. If True, the mask branch can take the refined bounding + box predictions. Defaults to True. + mask_info_flow (bool): Whether to turn on the mask information flow, + which means that feeding the mask features of the preceding stage + to the current stage. Defaults to True. 
+ """ + + def __init__(self, + num_stages: int, + stage_loss_weights: List[float], + semantic_roi_extractor: OptConfigType = None, + semantic_head: OptConfigType = None, + semantic_fusion: Tuple[str] = ('bbox', 'mask'), + interleaved: bool = True, + mask_info_flow: bool = True, + **kwargs) -> None: + super().__init__( + num_stages=num_stages, + stage_loss_weights=stage_loss_weights, + **kwargs) + assert self.with_bbox + assert not self.with_shared_head # shared head is not supported + + if semantic_head is not None: + self.semantic_roi_extractor = MODELS.build(semantic_roi_extractor) + self.semantic_head = MODELS.build(semantic_head) + + self.semantic_fusion = semantic_fusion + self.interleaved = interleaved + self.mask_info_flow = mask_info_flow + + # TODO move to base_roi_head later + @property + def with_semantic(self) -> bool: + """bool: whether the head has semantic head""" + return hasattr(self, + 'semantic_head') and self.semantic_head is not None + + def _bbox_forward( + self, + stage: int, + x: Tuple[Tensor], + rois: Tensor, + semantic_feat: Optional[Tensor] = None) -> Dict[str, Tensor]: + """Box head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + semantic_feat (Tensor, optional): Semantic feature. Defaults to + None. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + if self.with_semantic and 'bbox' in self.semantic_fusion: + bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: + bbox_semantic_feat = adaptive_avg_pool2d( + bbox_semantic_feat, bbox_feats.shape[-2:]) + bbox_feats += bbox_semantic_feat + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred) + return bbox_results + + def bbox_loss(self, + stage: int, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + semantic_feat: Optional[Tensor] = None) -> dict: + """Run forward function and calculate loss for box head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + semantic_feat (Tensor, optional): Semantic feature. Defaults to + None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. 
Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + """ + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward( + stage, x, rois, semantic_feat=semantic_feat) + bbox_results.update(rois=rois) + + bbox_loss_and_target = bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg[stage]) + bbox_results.update(bbox_loss_and_target) + return bbox_results + + def _mask_forward(self, + stage: int, + x: Tuple[Tensor], + rois: Tensor, + semantic_feat: Optional[Tensor] = None, + training: bool = True) -> Dict[str, Tensor]: + """Mask head forward function used only in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + semantic_feat (Tensor, optional): Semantic feature. Defaults to + None. + training (bool): Mask Forward is different between training and + testing. If True, use the mask forward in training. + Defaults to True. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + """ + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], + rois) + + # semantic feature fusion + # element-wise sum for original features and pooled semantic features + if self.with_semantic and 'mask' in self.semantic_fusion: + mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: + mask_semantic_feat = F.adaptive_avg_pool2d( + mask_semantic_feat, mask_feats.shape[-2:]) + mask_feats = mask_feats + mask_semantic_feat + + # mask information flow + # forward all previous mask heads to obtain last_feat, and fuse it + # with the normal mask feature + if training: + if self.mask_info_flow: + last_feat = None + for i in range(stage): + last_feat = self.mask_head[i]( + mask_feats, last_feat, return_logits=False) + mask_preds = mask_head( + mask_feats, last_feat, return_feat=False) + else: + mask_preds = mask_head(mask_feats, return_feat=False) + + mask_results = dict(mask_preds=mask_preds) + else: + aug_masks = [] + last_feat = None + for i in range(self.num_stages): + mask_head = self.mask_head[i] + if self.mask_info_flow: + mask_preds, last_feat = mask_head(mask_feats, last_feat) + else: + mask_preds = mask_head(mask_feats) + aug_masks.append(mask_preds) + + mask_results = dict(mask_preds=aug_masks) + + return mask_results + + def mask_loss(self, + stage: int, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + semantic_feat: Optional[Tensor] = None) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + semantic_feat (Tensor, optional): Semantic feature. Defaults to + None. 
+ + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `loss_mask` (dict): A dictionary of mask loss components. + """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward( + stage=stage, + x=x, + rois=pos_rois, + semantic_feat=semantic_feat, + training=True) + + mask_head = self.mask_head[stage] + mask_loss_and_target = mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[stage]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + # semantic segmentation part + # 2 outputs: segmentation prediction and embedded features + losses = dict() + if self.with_semantic: + gt_semantic_segs = [ + data_sample.gt_sem_seg.sem_seg + for data_sample in batch_data_samples + ] + gt_semantic_segs = torch.stack(gt_semantic_segs) + semantic_pred, semantic_feat = self.semantic_head(x) + loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_segs) + losses['loss_semantic_seg'] = loss_seg + else: + semantic_feat = None + + results_list = rpn_results_list + num_imgs = len(batch_img_metas) + for stage in range(self.num_stages): + self.current_stage = stage + + stage_loss_weight = self.stage_loss_weights[stage] + + # assign gts and sample proposals + sampling_results = [] + bbox_assigner = self.bbox_assigner[stage] + bbox_sampler = self.bbox_sampler[stage] + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + if 'bboxes' in results: + results.priors = results.pop('bboxes') + + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self.bbox_loss( + stage=stage, + x=x, + sampling_results=sampling_results, + semantic_feat=semantic_feat) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # mask head forward and loss + if self.with_mask: + # interleaved execution: use regressed bboxes by the box branch + # to train the mask branch + if self.interleaved: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results, bbox_results, batch_img_metas) + # re-assign and sample 512 RoIs from 512 RoIs + sampling_results = [] + for i in range(num_imgs): + results = results_list[i] + # rename 
rpn_results.bboxes to rpn_results.priors + results.priors = results.pop('bboxes') + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + mask_results = self.mask_loss( + stage=stage, + x=x, + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + semantic_feat=semantic_feat) + for name, value in mask_results['loss_mask'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # refine bboxes (same as Cascade R-CNN) + if stage < self.num_stages - 1 and not self.interleaved: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results=sampling_results, + bbox_results=bbox_results, + batch_img_metas=batch_img_metas) + + return losses + + def predict(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from upstream network. Each + has shape (N, C, H, W). + rpn_results_list (list[:obj:`InstanceData`]): list of region + proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results to + the original image. Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + assert self.with_bbox, 'Bbox head must be implemented.' + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + if self.with_semantic: + _, semantic_feat = self.semantic_head(x) + else: + semantic_feat = None + + # TODO: nms_op in mmcv need be enhanced, the bbox result may get + # difference when not rescale in bbox_head + + # If it has the mask branch, the bbox branch does not need + # to be scaled to the original image scale, because the mask + # branch will scale both bbox and mask at the same time. + bbox_rescale = rescale if not self.with_mask else False + results_list = self.predict_bbox( + x=x, + semantic_feat=semantic_feat, + batch_img_metas=batch_img_metas, + rpn_results_list=rpn_results_list, + rcnn_test_cfg=self.test_cfg, + rescale=bbox_rescale) + + if self.with_mask: + results_list = self.predict_mask( + x=x, + semantic_heat=semantic_feat, + batch_img_metas=batch_img_metas, + results_list=results_list, + rescale=rescale) + + return results_list + + def predict_mask(self, + x: Tuple[Tensor], + semantic_heat: Tensor, + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. 
+ semantic_feat (Tensor): Semantic feature. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + num_imgs = len(batch_img_metas) + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas=batch_img_metas, + device=mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + num_mask_rois_per_img = [len(res) for res in results_list] + mask_results = self._mask_forward( + stage=-1, + x=x, + rois=mask_rois, + semantic_feat=semantic_heat, + training=False) + # split batch mask prediction back to each image + aug_masks = [[ + mask.sigmoid().detach() + for mask in mask_preds.split(num_mask_rois_per_img, 0) + ] for mask_preds in mask_results['mask_preds']] + + merged_masks = [] + for i in range(num_imgs): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + + results_list = self.mask_head[-1].predict_by_feat( + mask_preds=merged_masks, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale, + activate_map=True) + + return results_list + + def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. 
+ """ + results = () + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + num_imgs = len(batch_img_metas) + + if self.with_semantic: + _, semantic_feat = self.semantic_head(x) + else: + semantic_feat = None + + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + rois, cls_scores, bbox_preds = self._refine_roi( + x=x, + rois=rois, + semantic_feat=semantic_feat, + batch_img_metas=batch_img_metas, + num_proposals_per_img=num_proposals_per_img) + results = results + (cls_scores, bbox_preds) + # mask head + if self.with_mask: + rois = torch.cat(rois) + mask_results = self._mask_forward( + stage=-1, + x=x, + rois=rois, + semantic_feat=semantic_feat, + training=False) + aug_masks = [[ + mask.sigmoid().detach() + for mask in mask_preds.split(num_proposals_per_img, 0) + ] for mask_preds in mask_results['mask_preds']] + + merged_masks = [] + for i in range(num_imgs): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results = results + (merged_masks, ) + return results diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/__init__.py b/mmdetection/mmdet/models/roi_heads/mask_heads/__init__.py new file mode 100644 index 0000000..48a5d42 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .coarse_mask_head import CoarseMaskHead +from .dynamic_mask_head import DynamicMaskHead +from .fcn_mask_head import FCNMaskHead +from .feature_relay_head import FeatureRelayHead +from .fused_semantic_head import FusedSemanticHead +from .global_context_head import GlobalContextHead +from .grid_head import GridHead +from .htc_mask_head import HTCMaskHead +from .mask_point_head import MaskPointHead +from .maskiou_head import MaskIoUHead +from .scnet_mask_head import SCNetMaskHead +from .scnet_semantic_head import SCNetSemanticHead + +__all__ = [ + 'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead', + 'MaskIoUHead', 'CoarseMaskHead', 'MaskPointHead', 'SCNetMaskHead', + 'SCNetSemanticHead', 'GlobalContextHead', 'FeatureRelayHead', + 'DynamicMaskHead' +] diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py new file mode 100644 index 0000000..1caa901 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule, Linear +from mmengine.model import ModuleList +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig +from .fcn_mask_head import FCNMaskHead + + +@MODELS.register_module() +class CoarseMaskHead(FCNMaskHead): + """Coarse mask head used in PointRend. + + Compared with standard ``FCNMaskHead``, ``CoarseMaskHead`` will downsample + the input feature map instead of upsample it. + + Args: + num_convs (int): Number of conv layers in the head. Defaults to 0. + num_fcs (int): Number of fc layers in the head. Defaults to 2. + fc_out_channels (int): Number of output channels of fc layer. + Defaults to 1024. + downsample_factor (int): The factor that feature map is downsampled by. + Defaults to 2. + init_cfg (dict or list[dict], optional): Initialization config dict. 
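+
+    Example:
+        An illustrative forward pass; the shapes assume the ``FCNMaskHead``
+        defaults of ``in_channels=256``, ``roi_feat_size=14`` and
+        ``num_classes=80``::
+
+            >>> import torch
+            >>> self = CoarseMaskHead(num_convs=0, downsample_factor=2)
+            >>> feats = torch.rand(2, 256, 14, 14)
+            >>> self(feats).shape
+            torch.Size([2, 80, 7, 7])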
+ """ + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 2, + fc_out_channels: int = 1024, + downsample_factor: int = 2, + init_cfg: MultiConfig = dict( + type='Xavier', + override=[ + dict(name='fcs'), + dict(type='Constant', val=0.001, name='fc_logits') + ]), + *arg, + **kwarg) -> None: + super().__init__( + *arg, + num_convs=num_convs, + upsample_cfg=dict(type=None), + init_cfg=None, + **kwarg) + self.init_cfg = init_cfg + self.num_fcs = num_fcs + assert self.num_fcs > 0 + self.fc_out_channels = fc_out_channels + self.downsample_factor = downsample_factor + assert self.downsample_factor >= 1 + # remove conv_logit + delattr(self, 'conv_logits') + + if downsample_factor > 1: + downsample_in_channels = ( + self.conv_out_channels + if self.num_convs > 0 else self.in_channels) + self.downsample_conv = ConvModule( + downsample_in_channels, + self.conv_out_channels, + kernel_size=downsample_factor, + stride=downsample_factor, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + else: + self.downsample_conv = None + + self.output_size = (self.roi_feat_size[0] // downsample_factor, + self.roi_feat_size[1] // downsample_factor) + self.output_area = self.output_size[0] * self.output_size[1] + + last_layer_dim = self.conv_out_channels * self.output_area + + self.fcs = ModuleList() + for i in range(num_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + self.fcs.append(Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + output_channels = self.num_classes * self.output_area + self.fc_logits = Linear(last_layer_dim, output_channels) + + def init_weights(self) -> None: + """Initialize weights.""" + super(FCNMaskHead, self).init_weights() + + def forward(self, x: Tensor) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tensor): Extract mask RoI features. + + Returns: + Tensor: Predicted foreground masks. + """ + for conv in self.convs: + x = conv(x) + + if self.downsample_conv is not None: + x = self.downsample_conv(x) + + x = x.flatten(1) + for fc in self.fcs: + x = self.relu(fc(x)) + mask_preds = self.fc_logits(x).view( + x.size(0), self.num_classes, *self.output_size) + return mask_preds diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py new file mode 100644 index 0000000..f33612b --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py @@ -0,0 +1,166 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmengine.config import ConfigDict +from torch import Tensor + +from mmdet.models.task_modules import SamplingResult +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, OptConfigType, reduce_mean +from .fcn_mask_head import FCNMaskHead + + +@MODELS.register_module() +class DynamicMaskHead(FCNMaskHead): + r"""Dynamic Mask Head for + `Instances as Queries `_ + + Args: + num_convs (int): Number of convolution layer. + Defaults to 4. + roi_feat_size (int): The output size of RoI extractor, + Defaults to 14. + in_channels (int): Input feature channels. + Defaults to 256. + conv_kernel_size (int): Kernel size of convolution layers. + Defaults to 3. + conv_out_channels (int): Output channels of convolution layers. + Defaults to 256. + num_classes (int): Number of classes. + Defaults to 80 + class_agnostic (int): Whether generate class agnostic prediction. 
+ Defaults to False. + dropout (float): Probability of drop the channel. + Defaults to 0.0 + upsample_cfg (:obj:`ConfigDict` or dict): The config for + upsample layer. + conv_cfg (:obj:`ConfigDict` or dict, optional): The convolution + layer config. + norm_cfg (:obj:`ConfigDict` or dict, optional): The norm layer config. + dynamic_conv_cfg (:obj:`ConfigDict` or dict): The dynamic convolution + layer config. + loss_mask (:obj:`ConfigDict` or dict): The config for mask loss. + """ + + def __init__(self, + num_convs: int = 4, + roi_feat_size: int = 14, + in_channels: int = 256, + conv_kernel_size: int = 3, + conv_out_channels: int = 256, + num_classes: int = 80, + class_agnostic: bool = False, + upsample_cfg: ConfigType = dict( + type='deconv', scale_factor=2), + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + dynamic_conv_cfg: ConfigType = dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=14, + with_proj=False, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + loss_mask: ConfigType = dict( + type='DiceLoss', loss_weight=8.0), + **kwargs) -> None: + super().__init__( + num_convs=num_convs, + roi_feat_size=roi_feat_size, + in_channels=in_channels, + conv_kernel_size=conv_kernel_size, + conv_out_channels=conv_out_channels, + num_classes=num_classes, + class_agnostic=class_agnostic, + upsample_cfg=upsample_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + loss_mask=loss_mask, + **kwargs) + assert class_agnostic is False, \ + 'DynamicMaskHead only support class_agnostic=False' + self.fp16_enabled = False + + self.instance_interactive_conv = MODELS.build(dynamic_conv_cfg) + + def init_weights(self) -> None: + """Use xavier initialization for all weight parameter and set + classification head bias as a specific value when use focal loss.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + nn.init.constant_(self.conv_logits.bias, 0.) + + def forward(self, roi_feat: Tensor, proposal_feat: Tensor) -> Tensor: + """Forward function of DynamicMaskHead. + + Args: + roi_feat (Tensor): Roi-pooling features with shape + (batch_size*num_proposals, feature_dimensions, + pooling_h , pooling_w). + proposal_feat (Tensor): Intermediate feature get from + diihead in last stage, has shape + (batch_size*num_proposals, feature_dimensions) + + Returns: + mask_preds (Tensor): Predicted foreground masks with shape + (batch_size*num_proposals, num_classes, pooling_h*2, pooling_w*2). + """ + + proposal_feat = proposal_feat.reshape(-1, self.in_channels) + proposal_feat_iic = self.instance_interactive_conv( + proposal_feat, roi_feat) + + x = proposal_feat_iic.permute(0, 2, 1).reshape(roi_feat.size()) + + for conv in self.convs: + x = conv(x) + if self.upsample is not None: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_preds = self.conv_logits(x) + return mask_preds + + def loss_and_target(self, mask_preds: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (Tensor): Predicted foreground masks, has shape + (num_pos, num_classes, h, w). + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. 
+ rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + + Returns: + dict: A dictionary of loss and targets components. + """ + mask_targets = self.get_targets( + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=rcnn_train_cfg) + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + + num_pos = pos_labels.new_ones(pos_labels.size()).float().sum() + avg_factor = torch.clamp(reduce_mean(num_pos), min=1.).item() + loss = dict() + if mask_preds.size(0) == 0: + loss_mask = mask_preds.sum() + else: + loss_mask = self.loss_mask( + mask_preds[torch.arange(num_pos).long(), pos_labels, + ...].sigmoid(), + mask_targets, + avg_factor=avg_factor) + loss['loss_mask'] = loss_mask + return dict(loss_mask=loss, mask_targets=mask_targets) diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py new file mode 100644 index 0000000..3a089df --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py @@ -0,0 +1,474 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, build_conv_layer, build_upsample_layer +from mmcv.ops.carafe import CARAFEPack +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, ModuleList +from mmengine.structures import InstanceData +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.utils import empty_instances +from mmdet.registry import MODELS +from mmdet.structures.mask import mask_target +from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig + +BYTES_PER_FLOAT = 4 +# TODO: This memory limit may be too much or too little. It would be better to +# determine it based on available resources. 
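+# The limit is consumed in `_predict_by_feat_single`: masks are pasted in
+# chunks whose estimated output size (num_rois * img_h * img_w *
+# BYTES_PER_FLOAT) stays below GPU_MEM_LIMIT, so lowering the limit trades
+# peak GPU memory for more paste iterations.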
+GPU_MEM_LIMIT = 1024**3 # 1 GB memory limit + + +@MODELS.register_module() +class FCNMaskHead(BaseModule): + + def __init__(self, + num_convs: int = 4, + roi_feat_size: int = 14, + in_channels: int = 256, + conv_kernel_size: int = 3, + conv_out_channels: int = 256, + num_classes: int = 80, + class_agnostic: int = False, + upsample_cfg: ConfigType = dict( + type='deconv', scale_factor=2), + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + predictor_cfg: ConfigType = dict(type='Conv'), + loss_mask: ConfigType = dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.upsample_cfg = upsample_cfg.copy() + if self.upsample_cfg['type'] not in [ + None, 'deconv', 'nearest', 'bilinear', 'carafe' + ]: + raise ValueError( + f'Invalid upsample method {self.upsample_cfg["type"]}, ' + 'accepted methods are "deconv", "nearest", "bilinear", ' + '"carafe"') + self.num_convs = num_convs + # WARN: roi_feat_size is reserved and not used + self.roi_feat_size = _pair(roi_feat_size) + self.in_channels = in_channels + self.conv_kernel_size = conv_kernel_size + self.conv_out_channels = conv_out_channels + self.upsample_method = self.upsample_cfg.get('type') + self.scale_factor = self.upsample_cfg.pop('scale_factor', None) + self.num_classes = num_classes + self.class_agnostic = class_agnostic + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.predictor_cfg = predictor_cfg + self.loss_mask = MODELS.build(loss_mask) + + self.convs = ModuleList() + for i in range(self.num_convs): + in_channels = ( + self.in_channels if i == 0 else self.conv_out_channels) + padding = (self.conv_kernel_size - 1) // 2 + self.convs.append( + ConvModule( + in_channels, + self.conv_out_channels, + self.conv_kernel_size, + padding=padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + upsample_in_channels = ( + self.conv_out_channels if self.num_convs > 0 else in_channels) + upsample_cfg_ = self.upsample_cfg.copy() + if self.upsample_method is None: + self.upsample = None + elif self.upsample_method == 'deconv': + upsample_cfg_.update( + in_channels=upsample_in_channels, + out_channels=self.conv_out_channels, + kernel_size=self.scale_factor, + stride=self.scale_factor) + self.upsample = build_upsample_layer(upsample_cfg_) + elif self.upsample_method == 'carafe': + upsample_cfg_.update( + channels=upsample_in_channels, scale_factor=self.scale_factor) + self.upsample = build_upsample_layer(upsample_cfg_) + else: + # suppress warnings + align_corners = (None + if self.upsample_method == 'nearest' else False) + upsample_cfg_.update( + scale_factor=self.scale_factor, + mode=self.upsample_method, + align_corners=align_corners) + self.upsample = build_upsample_layer(upsample_cfg_) + + out_channels = 1 if self.class_agnostic else self.num_classes + logits_in_channel = ( + self.conv_out_channels + if self.upsample_method == 'deconv' else upsample_in_channels) + self.conv_logits = build_conv_layer(self.predictor_cfg, + logits_in_channel, out_channels, 1) + self.relu = nn.ReLU(inplace=True) + self.debug_imgs = None + + def init_weights(self) -> None: + """Initialize the weights.""" + super().init_weights() + for m in [self.upsample, self.conv_logits]: + if m is None: + continue + elif isinstance(m, CARAFEPack): + m.init_weights() + elif hasattr(m, 'weight') and hasattr(m, 'bias'): + nn.init.kaiming_normal_( + 
m.weight, mode='fan_out', nonlinearity='relu') + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tensor): Extract mask RoI features. + + Returns: + Tensor: Predicted foreground masks. + """ + for conv in self.convs: + x = conv(x) + if self.upsample is not None: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_preds = self.conv_logits(x) + return mask_preds + + def get_targets(self, sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> Tensor: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + + Returns: + Tensor: Mask target of each positive proposals in the image. + """ + pos_proposals = [res.pos_priors for res in sampling_results] + pos_assigned_gt_inds = [ + res.pos_assigned_gt_inds for res in sampling_results + ] + gt_masks = [res.masks for res in batch_gt_instances] + mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds, + gt_masks, rcnn_train_cfg) + return mask_targets + + def loss_and_target(self, mask_preds: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (Tensor): Predicted foreground masks, has shape + (num_pos, num_classes, h, w). + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + + Returns: + dict: A dictionary of loss and targets components. + """ + mask_targets = self.get_targets( + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=rcnn_train_cfg) + + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + + loss = dict() + if mask_preds.size(0) == 0: + loss_mask = mask_preds.sum() + else: + if self.class_agnostic: + loss_mask = self.loss_mask(mask_preds, mask_targets, + torch.zeros_like(pos_labels)) + else: + loss_mask = self.loss_mask(mask_preds, mask_targets, + pos_labels) + loss['loss_mask'] = loss_mask + # TODO: which algorithm requires mask_targets? + return dict(loss_mask=loss, mask_targets=mask_targets) + + def predict_by_feat(self, + mask_preds: Tuple[Tensor], + results_list: List[InstanceData], + batch_img_metas: List[dict], + rcnn_test_cfg: ConfigDict, + rescale: bool = False, + activate_map: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mask_preds (tuple[Tensor]): Tuple of predicted foreground masks, + each has shape (n, num_classes, h, w). + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + batch_img_metas (list[dict]): List of image information. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + rescale (bool): If True, return boxes in original image space. + Defaults to False. 
+ activate_map (book): Whether get results with augmentations test. + If True, the `mask_preds` will not process with sigmoid. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + assert len(mask_preds) == len(results_list) == len(batch_img_metas) + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + results = results_list[img_id] + bboxes = results.bboxes + if bboxes.shape[0] == 0: + results_list[img_id] = empty_instances( + [img_meta], + bboxes.device, + task_type='mask', + instance_results=[results], + mask_thr_binary=rcnn_test_cfg.mask_thr_binary)[0] + else: + im_mask = self._predict_by_feat_single( + mask_preds=mask_preds[img_id], + bboxes=bboxes, + labels=results.labels, + img_meta=img_meta, + rcnn_test_cfg=rcnn_test_cfg, + rescale=rescale, + activate_map=activate_map) + results.masks = im_mask + return results_list + + def _predict_by_feat_single(self, + mask_preds: Tensor, + bboxes: Tensor, + labels: Tensor, + img_meta: dict, + rcnn_test_cfg: ConfigDict, + rescale: bool = False, + activate_map: bool = False) -> Tensor: + """Get segmentation masks from mask_preds and bboxes. + + Args: + mask_preds (Tensor): Predicted foreground masks, has shape + (n, num_classes, h, w). + bboxes (Tensor): Predicted bboxes, has shape (n, 4) + labels (Tensor): Labels of bboxes, has shape (n, ) + img_meta (dict): image information. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + activate_map (book): Whether get results with augmentations test. + If True, the `mask_preds` will not process with sigmoid. + Defaults to False. + + Returns: + Tensor: Encoded masks, has shape (n, img_w, img_h) + + Example: + >>> from mmengine.config import Config + >>> from mmdet.models.roi_heads.mask_heads.fcn_mask_head import * # NOQA + >>> N = 7 # N = number of extracted ROIs + >>> C, H, W = 11, 32, 32 + >>> # Create example instance of FCN Mask Head. + >>> self = FCNMaskHead(num_classes=C, num_convs=0) + >>> inputs = torch.rand(N, self.in_channels, H, W) + >>> mask_preds = self.forward(inputs) + >>> # Each input is associated with some bounding box + >>> bboxes = torch.Tensor([[1, 1, 42, 42 ]] * N) + >>> labels = torch.randint(0, C, size=(N,)) + >>> rcnn_test_cfg = Config({'mask_thr_binary': 0, }) + >>> ori_shape = (H * 4, W * 4) + >>> scale_factor = (1, 1) + >>> rescale = False + >>> img_meta = {'scale_factor': scale_factor, + ... 'ori_shape': ori_shape} + >>> # Encoded masks are a list for each category. + >>> encoded_masks = self._get_seg_masks_single( + ... mask_preds, bboxes, labels, + ... 
img_meta, rcnn_test_cfg, rescale) + >>> assert encoded_masks.size()[0] == N + >>> assert encoded_masks.size()[1:] == ori_shape + """ + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + img_h, img_w = img_meta['ori_shape'][:2] + device = bboxes.device + + if not activate_map: + mask_preds = mask_preds.sigmoid() + else: + # In AugTest, has been activated before + mask_preds = bboxes.new_tensor(mask_preds) + + if rescale: # in-placed rescale the bboxes + bboxes /= scale_factor + else: + w_scale, h_scale = scale_factor[0, 0], scale_factor[0, 1] + img_h = np.round(img_h * h_scale.item()).astype(np.int32) + img_w = np.round(img_w * w_scale.item()).astype(np.int32) + + N = len(mask_preds) + # The actual implementation split the input into chunks, + # and paste them chunk by chunk. + if device.type == 'cpu': + # CPU is most efficient when they are pasted one by one with + # skip_empty=True, so that it performs minimal number of + # operations. + num_chunks = N + else: + # GPU benefits from parallelism for larger chunks, + # but may have memory issue + # the types of img_w and img_h are np.int32, + # when the image resolution is large, + # the calculation of num_chunks will overflow. + # so we need to change the types of img_w and img_h to int. + # See https://github.com/open-mmlab/mmdetection/pull/5191 + num_chunks = int( + np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / + GPU_MEM_LIMIT)) + assert (num_chunks <= + N), 'Default GPU_MEM_LIMIT is too small; try increasing it' + chunks = torch.chunk(torch.arange(N, device=device), num_chunks) + + threshold = rcnn_test_cfg.mask_thr_binary + im_mask = torch.zeros( + N, + img_h, + img_w, + device=device, + dtype=torch.bool if threshold >= 0 else torch.uint8) + + if not self.class_agnostic: + mask_preds = mask_preds[range(N), labels][:, None] + + for inds in chunks: + masks_chunk, spatial_inds = _do_paste_mask( + mask_preds[inds], + bboxes[inds], + img_h, + img_w, + skip_empty=device.type == 'cpu') + + if threshold >= 0: + masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool) + else: + # for visualization and debugging + masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8) + + im_mask[(inds, ) + spatial_inds] = masks_chunk + return im_mask + + +def _do_paste_mask(masks: Tensor, + boxes: Tensor, + img_h: int, + img_w: int, + skip_empty: bool = True) -> tuple: + """Paste instance masks according to boxes. + + This implementation is modified from + https://github.com/facebookresearch/detectron2/ + + Args: + masks (Tensor): N, 1, H, W + boxes (Tensor): N, 4 + img_h (int): Height of the image to be pasted. + img_w (int): Width of the image to be pasted. + skip_empty (bool): Only paste masks within the region that + tightly bound all boxes, and returns the results this region only. + An important optimization for CPU. + + Returns: + tuple: (Tensor, tuple). The first item is mask tensor, the second one + is the slice object. + + If skip_empty == False, the whole image will be pasted. It will + return a mask of shape (N, img_h, img_w) and an empty tuple. + + If skip_empty == True, only area around the mask will be pasted. + A mask of shape (N, h', w') and its start and end coordinates + in the original image will be returned. + """ + # On GPU, paste all masks together (up to chunk size) + # by using the entire image to sample the masks + # Compared to pasting them one by one, + # this has more operations but is faster on COCO-scale dataset. 
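+    # The paste itself builds, for every box, a sampling grid in normalized
+    # [-1, 1] coordinates over the target pixel range and applies
+    # F.grid_sample to the (N, 1, H, W) mask logits; with skip_empty=True
+    # only the tight region around the union of all boxes is rasterized.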
+ device = masks.device + if skip_empty: + x0_int, y0_int = torch.clamp( + boxes.min(dim=0).values.floor()[:2] - 1, + min=0).to(dtype=torch.int32) + x1_int = torch.clamp( + boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32) + y1_int = torch.clamp( + boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32) + else: + x0_int, y0_int = 0, 0 + x1_int, y1_int = img_w, img_h + x0, y0, x1, y1 = torch.split(boxes, 1, dim=1) # each is Nx1 + + N = masks.shape[0] + + img_y = torch.arange(y0_int, y1_int, device=device).to(torch.float32) + 0.5 + img_x = torch.arange(x0_int, x1_int, device=device).to(torch.float32) + 0.5 + img_y = (img_y - y0) / (y1 - y0) * 2 - 1 + img_x = (img_x - x0) / (x1 - x0) * 2 - 1 + # img_x, img_y have shapes (N, w), (N, h) + # IsInf op is not supported with ONNX<=1.7.0 + if not torch.onnx.is_in_onnx_export(): + if torch.isinf(img_x).any(): + inds = torch.where(torch.isinf(img_x)) + img_x[inds] = 0 + if torch.isinf(img_y).any(): + inds = torch.where(torch.isinf(img_y)) + img_y[inds] = 0 + + gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1)) + gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1)) + grid = torch.stack([gx, gy], dim=3) + + img_masks = F.grid_sample( + masks.to(dtype=torch.float32), grid, align_corners=False) + + if skip_empty: + return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int)) + else: + return img_masks[:, 0], () diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/feature_relay_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/feature_relay_head.py new file mode 100644 index 0000000..0c34561 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/feature_relay_head.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig + + +@MODELS.register_module() +class FeatureRelayHead(BaseModule): + """Feature Relay Head used in `SCNet `_. + + Args: + in_channels (int): number of input channels. Defaults to 256. + conv_out_channels (int): number of output channels before + classification layer. Defaults to 256. + roi_feat_size (int): roi feat size at box head. Default: 7. + scale_factor (int): scale factor to match roi feat size + at mask head. Defaults to 2. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. Defaults to + dict(type='Kaiming', layer='Linear'). + """ + + def __init__( + self, + in_channels: int = 1024, + out_conv_channels: int = 256, + roi_feat_size: int = 7, + scale_factor: int = 2, + init_cfg: MultiConfig = dict(type='Kaiming', layer='Linear') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(roi_feat_size, int) + + self.in_channels = in_channels + self.out_conv_channels = out_conv_channels + self.roi_feat_size = roi_feat_size + self.out_channels = (roi_feat_size**2) * out_conv_channels + self.scale_factor = scale_factor + self.fp16_enabled = False + + self.fc = nn.Linear(self.in_channels, self.out_channels) + self.upsample = nn.Upsample( + scale_factor=scale_factor, mode='bilinear', align_corners=True) + + def forward(self, x: Tensor) -> Optional[Tensor]: + """Forward function. + + Args: + x (Tensor): Input feature. + + Returns: + Optional[Tensor]: Output feature. When the first dim of input is + 0, None is returned. 
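+
+        Example:
+            An illustrative call; the shapes assume the defaults
+            ``in_channels=1024``, ``out_conv_channels=256``,
+            ``roi_feat_size=7`` and ``scale_factor=2``::
+
+                >>> import torch
+                >>> self = FeatureRelayHead()
+                >>> self(torch.rand(3, 1024)).shape
+                torch.Size([3, 256, 14, 14])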
+ """ + N, _ = x.shape + if N > 0: + out_C = self.out_conv_channels + out_HW = self.roi_feat_size + x = self.fc(x) + x = x.reshape(N, out_C, out_HW, out_HW) + x = self.upsample(x) + return x + return None diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py new file mode 100644 index 0000000..d20beb2 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Tuple + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig, OptConfigType + + +@MODELS.register_module() +class FusedSemanticHead(BaseModule): + r"""Multi-level fused semantic segmentation head. + + .. code-block:: none + + in_1 -> 1x1 conv --- + | + in_2 -> 1x1 conv -- | + || + in_3 -> 1x1 conv - || + ||| /-> 1x1 conv (mask prediction) + in_4 -> 1x1 conv -----> 3x3 convs (*4) + | \-> 1x1 conv (feature) + in_5 -> 1x1 conv --- + """ # noqa: W605 + + def __init__( + self, + num_ins: int, + fusion_level: int, + seg_scale_factor=1 / 8, + num_convs: int = 4, + in_channels: int = 256, + conv_out_channels: int = 256, + num_classes: int = 183, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + ignore_label: int = None, + loss_weight: float = None, + loss_seg: ConfigDict = dict( + type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2), + init_cfg: MultiConfig = dict( + type='Kaiming', override=dict(name='conv_logits')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_ins = num_ins + self.fusion_level = fusion_level + self.seg_scale_factor = seg_scale_factor + self.num_convs = num_convs + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.num_classes = num_classes + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + + self.lateral_convs = nn.ModuleList() + for i in range(self.num_ins): + self.lateral_convs.append( + ConvModule( + self.in_channels, + self.in_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False)) + + self.convs = nn.ModuleList() + for i in range(self.num_convs): + in_channels = self.in_channels if i == 0 else conv_out_channels + self.convs.append( + ConvModule( + in_channels, + conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.conv_embedding = ConvModule( + conv_out_channels, + conv_out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + self.conv_logits = nn.Conv2d(conv_out_channels, self.num_classes, 1) + if ignore_label: + loss_seg['ignore_index'] = ignore_label + if loss_weight: + loss_seg['loss_weight'] = loss_weight + if ignore_label or loss_weight: + warnings.warn('``ignore_label`` and ``loss_weight`` would be ' + 'deprecated soon. Please set ``ingore_index`` and ' + '``loss_weight`` in ``loss_seg`` instead.') + self.criterion = MODELS.build(loss_seg) + + def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function. + + Args: + feats (tuple[Tensor]): Multi scale feature maps. + + Returns: + tuple[Tensor]: + + - mask_preds (Tensor): Predicted mask logits. + - x (Tensor): Fused feature. 
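+
+        Example:
+            An illustrative call; the shapes assume the defaults
+            ``in_channels=256``, ``conv_out_channels=256`` and
+            ``num_classes=183``, and building the head assumes the default
+            ``loss_seg`` is resolvable through the ``MODELS`` registry::
+
+                >>> import torch
+                >>> self = FusedSemanticHead(num_ins=5, fusion_level=1)
+                >>> feats = [torch.rand(1, 256, s, s) for s in (64, 32, 16, 8, 4)]
+                >>> mask_preds, x = self(feats)
+                >>> mask_preds.shape, x.shape
+                (torch.Size([1, 183, 32, 32]), torch.Size([1, 256, 32, 32]))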
+ """ + x = self.lateral_convs[self.fusion_level](feats[self.fusion_level]) + fused_size = tuple(x.shape[-2:]) + for i, feat in enumerate(feats): + if i != self.fusion_level: + feat = F.interpolate( + feat, size=fused_size, mode='bilinear', align_corners=True) + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + x = x + self.lateral_convs[i](feat) + + for i in range(self.num_convs): + x = self.convs[i](x) + + mask_preds = self.conv_logits(x) + x = self.conv_embedding(x) + return mask_preds, x + + def loss(self, mask_preds: Tensor, labels: Tensor) -> Tensor: + """Loss function. + + Args: + mask_preds (Tensor): Predicted mask logits. + labels (Tensor): Ground truth. + + Returns: + Tensor: Semantic segmentation loss. + """ + labels = F.interpolate( + labels.float(), scale_factor=self.seg_scale_factor, mode='nearest') + labels = labels.squeeze(1).long() + loss_semantic_seg = self.criterion(mask_preds, labels) + return loss_semantic_seg diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/global_context_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/global_context_head.py new file mode 100644 index 0000000..cb947ea --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/global_context_head.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.models.layers import ResLayer, SimplifiedBasicBlock +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig, OptConfigType + + +@MODELS.register_module() +class GlobalContextHead(BaseModule): + """Global context head used in `SCNet `_. + + Args: + num_convs (int, optional): number of convolutional layer in GlbCtxHead. + Defaults to 4. + in_channels (int, optional): number of input channels. Defaults to 256. + conv_out_channels (int, optional): number of output channels before + classification layer. Defaults to 256. + num_classes (int, optional): number of classes. Defaults to 80. + loss_weight (float, optional): global context loss weight. + Defaults to 1. + conv_cfg (dict, optional): config to init conv layer. Defaults to None. + norm_cfg (dict, optional): config to init norm layer. Defaults to None. + conv_to_res (bool, optional): if True, 2 convs will be grouped into + 1 `SimplifiedBasicBlock` using a skip connection. + Defaults to False. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. Defaults to + dict(type='Normal', std=0.01, override=dict(name='fc')). 
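+
+    Example:
+        An illustrative call; the shapes assume the defaults
+        ``in_channels=256``, ``conv_out_channels=256`` and ``num_classes=80``::
+
+            >>> import torch
+            >>> self = GlobalContextHead()
+            >>> feats = [torch.rand(2, 256, s, s) for s in (64, 32, 16, 8)]
+            >>> mc_pred, x = self(feats)
+            >>> mc_pred.shape, x.shape
+            (torch.Size([2, 80]), torch.Size([2, 256, 1, 1]))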
+ """ + + def __init__( + self, + num_convs: int = 4, + in_channels: int = 256, + conv_out_channels: int = 256, + num_classes: int = 80, + loss_weight: float = 1.0, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + conv_to_res: bool = False, + init_cfg: MultiConfig = dict( + type='Normal', std=0.01, override=dict(name='fc')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_convs = num_convs + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.num_classes = num_classes + self.loss_weight = loss_weight + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.conv_to_res = conv_to_res + self.fp16_enabled = False + + if self.conv_to_res: + num_res_blocks = num_convs // 2 + self.convs = ResLayer( + SimplifiedBasicBlock, + in_channels, + self.conv_out_channels, + num_res_blocks, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + self.num_convs = num_res_blocks + else: + self.convs = nn.ModuleList() + for i in range(self.num_convs): + in_channels = self.in_channels if i == 0 else conv_out_channels + self.convs.append( + ConvModule( + in_channels, + conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Linear(conv_out_channels, num_classes) + + self.criterion = nn.BCEWithLogitsLoss() + + def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function. + + Args: + feats (Tuple[Tensor]): Multi-scale feature maps. + + Returns: + Tuple[Tensor]: + + - mc_pred (Tensor): Multi-class prediction. + - x (Tensor): Global context feature. + """ + x = feats[-1] + for i in range(self.num_convs): + x = self.convs[i](x) + x = self.pool(x) + + # multi-class prediction + mc_pred = x.reshape(x.size(0), -1) + mc_pred = self.fc(mc_pred) + + return mc_pred, x + + def loss(self, pred: Tensor, labels: List[Tensor]) -> Tensor: + """Loss function. + + Args: + pred (Tensor): Logits. + labels (list[Tensor]): Grouth truths. + + Returns: + Tensor: Loss. + """ + labels = [lbl.unique() for lbl in labels] + targets = pred.new_zeros(pred.size()) + for i, label in enumerate(labels): + targets[i, label] = 1.0 + loss = self.loss_weight * self.criterion(pred, targets) + return loss diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/grid_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/grid_head.py new file mode 100644 index 0000000..d9514ae --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/grid_head.py @@ -0,0 +1,490 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType + + +@MODELS.register_module() +class GridHead(BaseModule): + """Implementation of `Grid Head `_ + + Args: + grid_points (int): The number of grid points. Defaults to 9. + num_convs (int): The number of convolution layers. Defaults to 8. + roi_feat_size (int): RoI feature size. Default to 14. + in_channels (int): The channel number of inputs features. + Defaults to 256. + conv_kernel_size (int): The kernel size of convolution layers. + Defaults to 3. 
+ point_feat_channels (int): The number of channels of each point + features. Defaults to 64. + class_agnostic (bool): Whether use class agnostic classification. + If so, the output channels of logits will be 1. Defaults to False. + loss_grid (:obj:`ConfigDict` or dict): Config of grid loss. + conv_cfg (:obj:`ConfigDict` or dict, optional) dictionary to + construct and config conv layer. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. + """ + + def __init__( + self, + grid_points: int = 9, + num_convs: int = 8, + roi_feat_size: int = 14, + in_channels: int = 256, + conv_kernel_size: int = 3, + point_feat_channels: int = 64, + deconv_kernel_size: int = 4, + class_agnostic: bool = False, + loss_grid: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=15), + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='GN', num_groups=36), + init_cfg: MultiConfig = [ + dict(type='Kaiming', layer=['Conv2d', 'Linear']), + dict( + type='Normal', + layer='ConvTranspose2d', + std=0.001, + override=dict( + type='Normal', + name='deconv2', + std=0.001, + bias=-np.log(0.99 / 0.01))) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + self.grid_points = grid_points + self.num_convs = num_convs + self.roi_feat_size = roi_feat_size + self.in_channels = in_channels + self.conv_kernel_size = conv_kernel_size + self.point_feat_channels = point_feat_channels + self.conv_out_channels = self.point_feat_channels * self.grid_points + self.class_agnostic = class_agnostic + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + if isinstance(norm_cfg, dict) and norm_cfg['type'] == 'GN': + assert self.conv_out_channels % norm_cfg['num_groups'] == 0 + + assert self.grid_points >= 4 + self.grid_size = int(np.sqrt(self.grid_points)) + if self.grid_size * self.grid_size != self.grid_points: + raise ValueError('grid_points must be a square number') + + # the predicted heatmap is half of whole_map_size + if not isinstance(self.roi_feat_size, int): + raise ValueError('Only square RoIs are supporeted in Grid R-CNN') + self.whole_map_size = self.roi_feat_size * 4 + + # compute point-wise sub-regions + self.sub_regions = self.calc_sub_regions() + + self.convs = [] + for i in range(self.num_convs): + in_channels = ( + self.in_channels if i == 0 else self.conv_out_channels) + stride = 2 if i == 0 else 1 + padding = (self.conv_kernel_size - 1) // 2 + self.convs.append( + ConvModule( + in_channels, + self.conv_out_channels, + self.conv_kernel_size, + stride=stride, + padding=padding, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=True)) + self.convs = nn.Sequential(*self.convs) + + self.deconv1 = nn.ConvTranspose2d( + self.conv_out_channels, + self.conv_out_channels, + kernel_size=deconv_kernel_size, + stride=2, + padding=(deconv_kernel_size - 2) // 2, + groups=grid_points) + self.norm1 = nn.GroupNorm(grid_points, self.conv_out_channels) + self.deconv2 = nn.ConvTranspose2d( + self.conv_out_channels, + grid_points, + kernel_size=deconv_kernel_size, + stride=2, + padding=(deconv_kernel_size - 2) // 2, + groups=grid_points) + + # find the 4-neighbor of each grid point + self.neighbor_points = [] + grid_size = self.grid_size + for i in range(grid_size): # i-th column + for j in range(grid_size): # j-th row + neighbors = [] + if i > 0: # left: (i - 1, j) + neighbors.append((i - 1) * grid_size + j) + if j > 0: # up: (i, j - 1) + 
neighbors.append(i * grid_size + j - 1) + if j < grid_size - 1: # down: (i, j + 1) + neighbors.append(i * grid_size + j + 1) + if i < grid_size - 1: # right: (i + 1, j) + neighbors.append((i + 1) * grid_size + j) + self.neighbor_points.append(tuple(neighbors)) + # total edges in the grid + self.num_edges = sum([len(p) for p in self.neighbor_points]) + + self.forder_trans = nn.ModuleList() # first-order feature transition + self.sorder_trans = nn.ModuleList() # second-order feature transition + for neighbors in self.neighbor_points: + fo_trans = nn.ModuleList() + so_trans = nn.ModuleList() + for _ in range(len(neighbors)): + # each transition module consists of a 5x5 depth-wise conv and + # 1x1 conv. + fo_trans.append( + nn.Sequential( + nn.Conv2d( + self.point_feat_channels, + self.point_feat_channels, + 5, + stride=1, + padding=2, + groups=self.point_feat_channels), + nn.Conv2d(self.point_feat_channels, + self.point_feat_channels, 1))) + so_trans.append( + nn.Sequential( + nn.Conv2d( + self.point_feat_channels, + self.point_feat_channels, + 5, + 1, + 2, + groups=self.point_feat_channels), + nn.Conv2d(self.point_feat_channels, + self.point_feat_channels, 1))) + self.forder_trans.append(fo_trans) + self.sorder_trans.append(so_trans) + + self.loss_grid = MODELS.build(loss_grid) + + def forward(self, x: Tensor) -> Dict[str, Tensor]: + """forward function of ``GridHead``. + + Args: + x (Tensor): RoI features, has shape + (num_rois, num_channels, roi_feat_size, roi_feat_size). + + Returns: + Dict[str, Tensor]: Return a dict including fused and unfused + heatmap. + """ + assert x.shape[-1] == x.shape[-2] == self.roi_feat_size + # RoI feature transformation, downsample 2x + x = self.convs(x) + + c = self.point_feat_channels + # first-order fusion + x_fo = [None for _ in range(self.grid_points)] + for i, points in enumerate(self.neighbor_points): + x_fo[i] = x[:, i * c:(i + 1) * c] + for j, point_idx in enumerate(points): + x_fo[i] = x_fo[i] + self.forder_trans[i][j]( + x[:, point_idx * c:(point_idx + 1) * c]) + + # second-order fusion + x_so = [None for _ in range(self.grid_points)] + for i, points in enumerate(self.neighbor_points): + x_so[i] = x[:, i * c:(i + 1) * c] + for j, point_idx in enumerate(points): + x_so[i] = x_so[i] + self.sorder_trans[i][j](x_fo[point_idx]) + + # predicted heatmap with fused features + x2 = torch.cat(x_so, dim=1) + x2 = self.deconv1(x2) + x2 = F.relu(self.norm1(x2), inplace=True) + heatmap = self.deconv2(x2) + + # predicted heatmap with original features (applicable during training) + if self.training: + x1 = x + x1 = self.deconv1(x1) + x1 = F.relu(self.norm1(x1), inplace=True) + heatmap_unfused = self.deconv2(x1) + else: + heatmap_unfused = heatmap + + return dict(fused=heatmap, unfused=heatmap_unfused) + + def calc_sub_regions(self) -> List[Tuple[float]]: + """Compute point specific representation regions. + + See `Grid R-CNN Plus `_ for details. 
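+
+        For the default setting (``grid_points=9`` and ``roi_feat_size=14``,
+        hence ``whole_map_size=56``), every sub-region is a 28x28 window
+        whose top-left corner lies at 0, 14 or 28 along each axis.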
+ """ + # to make it consistent with the original implementation, half_size + # is computed as 2 * quarter_size, which is smaller + half_size = self.whole_map_size // 4 * 2 + sub_regions = [] + for i in range(self.grid_points): + x_idx = i // self.grid_size + y_idx = i % self.grid_size + if x_idx == 0: + sub_x1 = 0 + elif x_idx == self.grid_size - 1: + sub_x1 = half_size + else: + ratio = x_idx / (self.grid_size - 1) - 0.25 + sub_x1 = max(int(ratio * self.whole_map_size), 0) + + if y_idx == 0: + sub_y1 = 0 + elif y_idx == self.grid_size - 1: + sub_y1 = half_size + else: + ratio = y_idx / (self.grid_size - 1) - 0.25 + sub_y1 = max(int(ratio * self.whole_map_size), 0) + sub_regions.append( + (sub_x1, sub_y1, sub_x1 + half_size, sub_y1 + half_size)) + return sub_regions + + def get_targets(self, sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict) -> Tensor: + """Calculate the ground truth for all samples in a batch according to + the sampling_results.". + + Args: + sampling_results (List[:obj:`SamplingResult`]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (:obj:`ConfigDict`): `train_cfg` of RCNN. + + Returns: + Tensor: Grid heatmap targets. + """ + # mix all samples (across images) together. + pos_bboxes = torch.cat([res.pos_bboxes for res in sampling_results], + dim=0).cpu() + pos_gt_bboxes = torch.cat( + [res.pos_gt_bboxes for res in sampling_results], dim=0).cpu() + assert pos_bboxes.shape == pos_gt_bboxes.shape + + # expand pos_bboxes to 2x of original size + x1 = pos_bboxes[:, 0] - (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2 + y1 = pos_bboxes[:, 1] - (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2 + x2 = pos_bboxes[:, 2] + (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2 + y2 = pos_bboxes[:, 3] + (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2 + pos_bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + pos_bbox_ws = (pos_bboxes[:, 2] - pos_bboxes[:, 0]).unsqueeze(-1) + pos_bbox_hs = (pos_bboxes[:, 3] - pos_bboxes[:, 1]).unsqueeze(-1) + + num_rois = pos_bboxes.shape[0] + map_size = self.whole_map_size + # this is not the final target shape + targets = torch.zeros((num_rois, self.grid_points, map_size, map_size), + dtype=torch.float) + + # pre-compute interpolation factors for all grid points. + # the first item is the factor of x-dim, and the second is y-dim. 
+ # for a 9-point grid, factors are like (1, 0), (0.5, 0.5), (0, 1) + factors = [] + for j in range(self.grid_points): + x_idx = j // self.grid_size + y_idx = j % self.grid_size + factors.append((1 - x_idx / (self.grid_size - 1), + 1 - y_idx / (self.grid_size - 1))) + + radius = rcnn_train_cfg.pos_radius + radius2 = radius**2 + for i in range(num_rois): + # ignore small bboxes + if (pos_bbox_ws[i] <= self.grid_size + or pos_bbox_hs[i] <= self.grid_size): + continue + # for each grid point, mark a small circle as positive + for j in range(self.grid_points): + factor_x, factor_y = factors[j] + gridpoint_x = factor_x * pos_gt_bboxes[i, 0] + ( + 1 - factor_x) * pos_gt_bboxes[i, 2] + gridpoint_y = factor_y * pos_gt_bboxes[i, 1] + ( + 1 - factor_y) * pos_gt_bboxes[i, 3] + + cx = int((gridpoint_x - pos_bboxes[i, 0]) / pos_bbox_ws[i] * + map_size) + cy = int((gridpoint_y - pos_bboxes[i, 1]) / pos_bbox_hs[i] * + map_size) + + for x in range(cx - radius, cx + radius + 1): + for y in range(cy - radius, cy + radius + 1): + if x >= 0 and x < map_size and y >= 0 and y < map_size: + if (x - cx)**2 + (y - cy)**2 <= radius2: + targets[i, j, y, x] = 1 + # reduce the target heatmap size by a half + # proposed in Grid R-CNN Plus (https://arxiv.org/abs/1906.05688). + sub_targets = [] + for i in range(self.grid_points): + sub_x1, sub_y1, sub_x2, sub_y2 = self.sub_regions[i] + sub_targets.append(targets[:, [i], sub_y1:sub_y2, sub_x1:sub_x2]) + sub_targets = torch.cat(sub_targets, dim=1) + sub_targets = sub_targets.to(sampling_results[0].pos_bboxes.device) + return sub_targets + + def loss(self, grid_pred: Tensor, sample_idx: Tensor, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict) -> dict: + """Calculate the loss based on the features extracted by the grid head. + + Args: + grid_pred (dict[str, Tensor]): Outputs of grid_head forward. + sample_idx (Tensor): The sampling index of ``grid_pred``. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:`ConfigDict`): `train_cfg` of RCNN. + + Returns: + dict: A dictionary of loss and targets components. + """ + grid_targets = self.get_targets(sampling_results, rcnn_train_cfg) + grid_targets = grid_targets[sample_idx] + + loss_fused = self.loss_grid(grid_pred['fused'], grid_targets) + loss_unfused = self.loss_grid(grid_pred['unfused'], grid_targets) + loss_grid = loss_fused + loss_unfused + return dict(loss_grid=loss_grid) + + def predict_by_feat(self, + grid_preds: Dict[str, Tensor], + results_list: List[InstanceData], + batch_img_metas: List[dict], + rescale: bool = False) -> InstanceList: + """Adjust the predicted bboxes from bbox head. + + Args: + grid_preds (dict[str, Tensor]): dictionary outputted by forward + function. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + batch_img_metas (list[dict]): List of image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape \ + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last \ + dimension 4 arrange as (x1, y1, x2, y2). 
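# ---- Illustrative sketch (not from the mmdetection source) ----------------
# A minimal, framework-free walk-through of how `GridHead.get_targets` above
# maps a ground-truth box onto the grid heatmap. It assumes a 3x3 grid
# (grid_points=9) and whole_map_size=56, matching the defaults
# (roi_feat_size=14 -> whole_map_size = 14 * 4); all tensor values are toy
# numbers chosen for illustration only.
import torch

grid_size, map_size = 3, 56
pos_bbox = torch.tensor([10., 10., 50., 50.])   # RoI already expanded 2x
gt_bbox = torch.tensor([20., 20., 40., 40.])    # ground-truth box
w, h = pos_bbox[2] - pos_bbox[0], pos_bbox[3] - pos_bbox[1]

for j in range(grid_size * grid_size):
    x_idx, y_idx = j // grid_size, j % grid_size
    factor_x = 1 - x_idx / (grid_size - 1)      # 1, 0.5, 0 along x
    factor_y = 1 - y_idx / (grid_size - 1)
    # interpolate between the two gt corners to get the j-th grid point
    gx = factor_x * gt_bbox[0] + (1 - factor_x) * gt_bbox[2]
    gy = factor_y * gt_bbox[1] + (1 - factor_y) * gt_bbox[3]
    # project the point into heatmap pixel coordinates of the expanded RoI
    cx = int((gx - pos_bbox[0]) / w * map_size)
    cy = int((gy - pos_bbox[1]) / h * map_size)
    # get_targets then marks a small circle of radius `pos_radius`
    # around (cx, cy) in channel j of the target heatmap
    print(j, (cx, cy))
# ---------------------------------------------------------------------------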
+ """ + num_roi_per_img = tuple(res.bboxes.size(0) for res in results_list) + grid_preds = { + k: v.split(num_roi_per_img, 0) + for k, v in grid_preds.items() + } + + for i, results in enumerate(results_list): + if len(results) != 0: + bboxes = self._predict_by_feat_single( + grid_pred=grid_preds['fused'][i], + bboxes=results.bboxes, + img_meta=batch_img_metas[i], + rescale=rescale) + results.bboxes = bboxes + return results_list + + def _predict_by_feat_single(self, + grid_pred: Tensor, + bboxes: Tensor, + img_meta: dict, + rescale: bool = False) -> Tensor: + """Adjust ``bboxes`` according to ``grid_pred``. + + Args: + grid_pred (Tensor): Grid fused heatmap. + bboxes (Tensor): Predicted bboxes, has shape (n, 4) + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + Tensor: adjusted bboxes. + """ + assert bboxes.size(0) == grid_pred.size(0) + grid_pred = grid_pred.sigmoid() + + R, c, h, w = grid_pred.shape + half_size = self.whole_map_size // 4 * 2 + assert h == w == half_size + assert c == self.grid_points + + # find the point with max scores in the half-sized heatmap + grid_pred = grid_pred.view(R * c, h * w) + pred_scores, pred_position = grid_pred.max(dim=1) + xs = pred_position % w + ys = pred_position // w + + # get the position in the whole heatmap instead of half-sized heatmap + for i in range(self.grid_points): + xs[i::self.grid_points] += self.sub_regions[i][0] + ys[i::self.grid_points] += self.sub_regions[i][1] + + # reshape to (num_rois, grid_points) + pred_scores, xs, ys = tuple( + map(lambda x: x.view(R, c), [pred_scores, xs, ys])) + + # get expanded pos_bboxes + widths = (bboxes[:, 2] - bboxes[:, 0]).unsqueeze(-1) + heights = (bboxes[:, 3] - bboxes[:, 1]).unsqueeze(-1) + x1 = (bboxes[:, 0, None] - widths / 2) + y1 = (bboxes[:, 1, None] - heights / 2) + # map the grid point to the absolute coordinates + abs_xs = (xs.float() + 0.5) / w * widths + x1 + abs_ys = (ys.float() + 0.5) / h * heights + y1 + + # get the grid points indices that fall on the bbox boundaries + x1_inds = [i for i in range(self.grid_size)] + y1_inds = [i * self.grid_size for i in range(self.grid_size)] + x2_inds = [ + self.grid_points - self.grid_size + i + for i in range(self.grid_size) + ] + y2_inds = [(i + 1) * self.grid_size - 1 for i in range(self.grid_size)] + + # voting of all grid points on some boundary + bboxes_x1 = (abs_xs[:, x1_inds] * pred_scores[:, x1_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, x1_inds].sum(dim=1, keepdim=True)) + bboxes_y1 = (abs_ys[:, y1_inds] * pred_scores[:, y1_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, y1_inds].sum(dim=1, keepdim=True)) + bboxes_x2 = (abs_xs[:, x2_inds] * pred_scores[:, x2_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, x2_inds].sum(dim=1, keepdim=True)) + bboxes_y2 = (abs_ys[:, y2_inds] * pred_scores[:, y2_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, y2_inds].sum(dim=1, keepdim=True)) + + bboxes = torch.cat([bboxes_x1, bboxes_y1, bboxes_x2, bboxes_y2], dim=1) + bboxes[:, [0, 2]].clamp_(min=0, max=img_meta['img_shape'][1]) + bboxes[:, [1, 3]].clamp_(min=0, max=img_meta['img_shape'][0]) + + if rescale: + assert img_meta.get('scale_factor') is not None + bboxes /= bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + + return bboxes diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/htc_mask_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/htc_mask_head.py new file mode 100644 index 0000000..73ac1e6 --- 
/dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/htc_mask_head.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +from mmcv.cnn import ConvModule +from torch import Tensor + +from mmdet.registry import MODELS +from .fcn_mask_head import FCNMaskHead + + +@MODELS.register_module() +class HTCMaskHead(FCNMaskHead): + """Mask head for HTC. + + Args: + with_conv_res (bool): Whether add conv layer for ``res_feat``. + Defaults to True. + """ + + def __init__(self, with_conv_res: bool = True, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.with_conv_res = with_conv_res + if self.with_conv_res: + self.conv_res = ConvModule( + self.conv_out_channels, + self.conv_out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + def forward(self, + x: Tensor, + res_feat: Optional[Tensor] = None, + return_logits: bool = True, + return_feat: bool = True) -> Union[Tensor, List[Tensor]]: + """ + Args: + x (Tensor): Feature map. + res_feat (Tensor, optional): Feature for residual connection. + Defaults to None. + return_logits (bool): Whether return mask logits. Defaults to True. + return_feat (bool): Whether return feature map. Defaults to True. + + Returns: + Union[Tensor, List[Tensor]]: The return result is one of three + results: res_feat, logits, or [logits, res_feat]. + """ + assert not (not return_logits and not return_feat) + if res_feat is not None: + assert self.with_conv_res + res_feat = self.conv_res(res_feat) + x = x + res_feat + for conv in self.convs: + x = conv(x) + res_feat = x + outs = [] + if return_logits: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_preds = self.conv_logits(x) + outs.append(mask_preds) + if return_feat: + outs.append(res_feat) + return outs if len(outs) > 1 else outs[0] diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/mask_point_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/mask_point_head.py new file mode 100644 index 0000000..2084f59 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/mask_point_head.py @@ -0,0 +1,284 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py # noqa + +from typing import List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.utils import (get_uncertain_point_coords_with_randomness, + get_uncertainty) +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType + + +@MODELS.register_module() +class MaskPointHead(BaseModule): + """A mask point head use in PointRend. + + ``MaskPointHead`` use shared multi-layer perceptron (equivalent to + nn.Conv1d) to predict the logit of input points. The fine-grained feature + and coarse feature will be concatenate together for predication. + + Args: + num_fcs (int): Number of fc layers in the head. Defaults to 3. + in_channels (int): Number of input channels. Defaults to 256. + fc_channels (int): Number of fc channels. Defaults to 256. + num_classes (int): Number of classes for logits. Defaults to 80. 
+ class_agnostic (bool): Whether use class agnostic classification. + If so, the output channels of logits will be 1. Defaults to False. + coarse_pred_each_layer (bool): Whether concatenate coarse feature with + the output of each fc layer. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict): Dictionary to construct + and config conv layer. Defaults to dict(type='Conv1d')). + norm_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to construct + and config norm layer. Defaults to None. + loss_point (:obj:`ConfigDict` or dict): Dictionary to construct and + config loss layer of point head. Defaults to + dict(type='CrossEntropyLoss', use_mask=True, loss_weight=1.0). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. + """ + + def __init__( + self, + num_classes: int, + num_fcs: int = 3, + in_channels: int = 256, + fc_channels: int = 256, + class_agnostic: bool = False, + coarse_pred_each_layer: bool = True, + conv_cfg: ConfigType = dict(type='Conv1d'), + norm_cfg: OptConfigType = None, + act_cfg: ConfigType = dict(type='ReLU'), + loss_point: ConfigType = dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0), + init_cfg: MultiConfig = dict( + type='Normal', std=0.001, override=dict(name='fc_logits')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_fcs = num_fcs + self.in_channels = in_channels + self.fc_channels = fc_channels + self.num_classes = num_classes + self.class_agnostic = class_agnostic + self.coarse_pred_each_layer = coarse_pred_each_layer + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.loss_point = MODELS.build(loss_point) + + fc_in_channels = in_channels + num_classes + self.fcs = nn.ModuleList() + for _ in range(num_fcs): + fc = ConvModule( + fc_in_channels, + fc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.fcs.append(fc) + fc_in_channels = fc_channels + fc_in_channels += num_classes if self.coarse_pred_each_layer else 0 + + out_channels = 1 if self.class_agnostic else self.num_classes + self.fc_logits = nn.Conv1d( + fc_in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, fine_grained_feats: Tensor, + coarse_feats: Tensor) -> Tensor: + """Classify each point base on fine grained and coarse feats. + + Args: + fine_grained_feats (Tensor): Fine grained feature sampled from FPN, + shape (num_rois, in_channels, num_points). + coarse_feats (Tensor): Coarse feature sampled from CoarseMaskHead, + shape (num_rois, num_classes, num_points). + + Returns: + Tensor: Point classification results, + shape (num_rois, num_class, num_points). + """ + + x = torch.cat([fine_grained_feats, coarse_feats], dim=1) + for fc in self.fcs: + x = fc(x) + if self.coarse_pred_each_layer: + x = torch.cat((x, coarse_feats), dim=1) + return self.fc_logits(x) + + def get_targets(self, rois: Tensor, rel_roi_points: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + cfg: ConfigType) -> Tensor: + """Get training targets of MaskPointHead for all images. + + Args: + rois (Tensor): Region of Interest, shape (num_rois, 5). + rel_roi_points (Tensor): Points coordinates relative to RoI, shape + (num_rois, num_points, 2). + sampling_results (:obj:`SamplingResult`): Sampling result after + sampling and assignment. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. 
+ cfg (obj:`ConfigDict` or dict): Training cfg. + + Returns: + Tensor: Point target, shape (num_rois, num_points). + """ + + num_imgs = len(sampling_results) + rois_list = [] + rel_roi_points_list = [] + for batch_ind in range(num_imgs): + inds = (rois[:, 0] == batch_ind) + rois_list.append(rois[inds]) + rel_roi_points_list.append(rel_roi_points[inds]) + pos_assigned_gt_inds_list = [ + res.pos_assigned_gt_inds for res in sampling_results + ] + cfg_list = [cfg for _ in range(num_imgs)] + + point_targets = map(self._get_targets_single, rois_list, + rel_roi_points_list, pos_assigned_gt_inds_list, + batch_gt_instances, cfg_list) + point_targets = list(point_targets) + + if len(point_targets) > 0: + point_targets = torch.cat(point_targets) + + return point_targets + + def _get_targets_single(self, rois: Tensor, rel_roi_points: Tensor, + pos_assigned_gt_inds: Tensor, + gt_instances: InstanceData, + cfg: ConfigType) -> Tensor: + """Get training target of MaskPointHead for each image.""" + num_pos = rois.size(0) + num_points = cfg.num_points + if num_pos > 0: + gt_masks_th = ( + gt_instances.masks.to_tensor(rois.dtype, + rois.device).index_select( + 0, pos_assigned_gt_inds)) + gt_masks_th = gt_masks_th.unsqueeze(1) + rel_img_points = rel_roi_point_to_rel_img_point( + rois, rel_roi_points, gt_masks_th) + point_targets = point_sample(gt_masks_th, + rel_img_points).squeeze(1) + else: + point_targets = rois.new_zeros((0, num_points)) + return point_targets + + def loss_and_target(self, point_pred: Tensor, rel_roi_points: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + cfg: ConfigType) -> dict: + """Calculate loss for MaskPointHead. + + Args: + point_pred (Tensor): Point predication result, shape + (num_rois, num_classes, num_points). + rel_roi_points (Tensor): Points coordinates relative to RoI, shape + (num_rois, num_points, 2). + sampling_results (:obj:`SamplingResult`): Sampling result after + sampling and assignment. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + cfg (obj:`ConfigDict` or dict): Training cfg. + + Returns: + dict: a dictionary of point loss and point target. + """ + rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + + point_target = self.get_targets(rois, rel_roi_points, sampling_results, + batch_gt_instances, cfg) + if self.class_agnostic: + loss_point = self.loss_point(point_pred, point_target, + torch.zeros_like(pos_labels)) + else: + loss_point = self.loss_point(point_pred, point_target, pos_labels) + + return dict(loss_point=loss_point, point_target=point_target) + + def get_roi_rel_points_train(self, mask_preds: Tensor, labels: Tensor, + cfg: ConfigType) -> Tensor: + """Get ``num_points`` most uncertain points with random points during + train. + + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + '_get_uncertainty()' function that takes point's logit prediction as + input. + + Args: + mask_preds (Tensor): A tensor of shape (num_rois, num_classes, + mask_height, mask_width) for class-specific or class-agnostic + prediction. + labels (Tensor): The ground truth class for each instance. + cfg (:obj:`ConfigDict` or dict): Training config of point head. 
+ + Returns: + point_coords (Tensor): A tensor of shape (num_rois, num_points, 2) + that contains the coordinates sampled points. + """ + point_coords = get_uncertain_point_coords_with_randomness( + mask_preds, labels, cfg.num_points, cfg.oversample_ratio, + cfg.importance_sample_ratio) + return point_coords + + def get_roi_rel_points_test(self, mask_preds: Tensor, label_preds: Tensor, + cfg: ConfigType) -> Tuple[Tensor, Tensor]: + """Get ``num_points`` most uncertain points during test. + + Args: + mask_preds (Tensor): A tensor of shape (num_rois, num_classes, + mask_height, mask_width) for class-specific or class-agnostic + prediction. + label_preds (Tensor): The predication class for each instance. + cfg (:obj:`ConfigDict` or dict): Testing config of point head. + + Returns: + tuple: + + - point_indices (Tensor): A tensor of shape (num_rois, num_points) + that contains indices from [0, mask_height x mask_width) of the + most uncertain points. + - point_coords (Tensor): A tensor of shape (num_rois, num_points, + 2) that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the [mask_height, mask_width] grid. + """ + num_points = cfg.subdivision_num_points + uncertainty_map = get_uncertainty(mask_preds, label_preds) + num_rois, _, mask_height, mask_width = uncertainty_map.shape + + # During ONNX exporting, the type of each elements of 'shape' is + # `Tensor(float)`, while it is `float` during PyTorch inference. + if isinstance(mask_height, torch.Tensor): + h_step = 1.0 / mask_height.float() + w_step = 1.0 / mask_width.float() + else: + h_step = 1.0 / mask_height + w_step = 1.0 / mask_width + # cast to int to avoid dynamic K for TopK op in ONNX + mask_size = int(mask_height * mask_width) + uncertainty_map = uncertainty_map.view(num_rois, mask_size) + num_points = min(mask_size, num_points) + point_indices = uncertainty_map.topk(num_points, dim=1)[1] + xs = w_step / 2.0 + (point_indices % mask_width).float() * w_step + ys = h_step / 2.0 + (point_indices // mask_width).float() * h_step + point_coords = torch.stack([xs, ys], dim=2) + return point_indices, point_coords diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/maskiou_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/maskiou_head.py new file mode 100644 index 0000000..8901871 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/maskiou_head.py @@ -0,0 +1,277 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import Conv2d, Linear, MaxPool2d +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, OptMultiConfig + + +@MODELS.register_module() +class MaskIoUHead(BaseModule): + """Mask IoU Head. + + This head predicts the IoU of predicted masks and corresponding gt masks. + + Args: + num_convs (int): The number of convolution layers. Defaults to 4. + num_fcs (int): The number of fully connected layers. Defaults to 2. + roi_feat_size (int): RoI feature size. Default to 14. + in_channels (int): The channel number of inputs features. + Defaults to 256. + conv_out_channels (int): The feature channels of convolution layers. + Defaults to 256. 
+ fc_out_channels (int): The feature channels of fully connected layers. + Defaults to 1024. + num_classes (int): Number of categories excluding the background + category. Defaults to 80. + loss_iou (:obj:`ConfigDict` or dict): IoU loss. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. + """ + + def __init__( + self, + num_convs: int = 4, + num_fcs: int = 2, + roi_feat_size: int = 14, + in_channels: int = 256, + conv_out_channels: int = 256, + fc_out_channels: int = 1024, + num_classes: int = 80, + loss_iou: ConfigType = dict(type='MSELoss', loss_weight=0.5), + init_cfg: OptMultiConfig = [ + dict(type='Kaiming', override=dict(name='convs')), + dict(type='Caffe2Xavier', override=dict(name='fcs')), + dict(type='Normal', std=0.01, override=dict(name='fc_mask_iou')) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.num_classes = num_classes + + self.convs = nn.ModuleList() + for i in range(num_convs): + if i == 0: + # concatenation of mask feature and mask prediction + in_channels = self.in_channels + 1 + else: + in_channels = self.conv_out_channels + stride = 2 if i == num_convs - 1 else 1 + self.convs.append( + Conv2d( + in_channels, + self.conv_out_channels, + 3, + stride=stride, + padding=1)) + + roi_feat_size = _pair(roi_feat_size) + pooled_area = (roi_feat_size[0] // 2) * (roi_feat_size[1] // 2) + self.fcs = nn.ModuleList() + for i in range(num_fcs): + in_channels = ( + self.conv_out_channels * + pooled_area if i == 0 else self.fc_out_channels) + self.fcs.append(Linear(in_channels, self.fc_out_channels)) + + self.fc_mask_iou = Linear(self.fc_out_channels, self.num_classes) + self.relu = nn.ReLU() + self.max_pool = MaxPool2d(2, 2) + self.loss_iou = MODELS.build(loss_iou) + + def forward(self, mask_feat: Tensor, mask_preds: Tensor) -> Tensor: + """Forward function. + + Args: + mask_feat (Tensor): Mask features from upstream models. + mask_preds (Tensor): Mask predictions from mask head. + + Returns: + Tensor: Mask IoU predictions. + """ + mask_preds = mask_preds.sigmoid() + mask_pred_pooled = self.max_pool(mask_preds.unsqueeze(1)) + + x = torch.cat((mask_feat, mask_pred_pooled), 1) + + for conv in self.convs: + x = self.relu(conv(x)) + x = x.flatten(1) + for fc in self.fcs: + x = self.relu(fc(x)) + mask_iou = self.fc_mask_iou(x) + return mask_iou + + def loss_and_target(self, mask_iou_pred: Tensor, mask_preds: Tensor, + mask_targets: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> dict: + """Calculate the loss and targets of MaskIoUHead. + + Args: + mask_iou_pred (Tensor): Mask IoU predictions results, has shape + (num_pos, num_classes) + mask_preds (Tensor): Mask predictions from mask head, has shape + (num_pos, mask_size, mask_size). + mask_targets (Tensor): The ground truth masks assigned with + predictions, has shape + (num_pos, mask_size, mask_size). + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It includes ``masks`` inside. + rcnn_train_cfg (obj:`ConfigDict`): `train_cfg` of RCNN. + + Returns: + dict: A dictionary of loss and targets components. + The targets are only used for cascade rcnn. 
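# ---- Illustrative sketch (not from the mmdetection source) ----------------
# Rough shape trace of `MaskIoUHead.forward` above, using toy tensors only.
# Shapes assume the defaults (in_channels=256, roi_feat_size=14) and a
# 28x28 mask prediction, which is what FCNMaskHead produces by default.
import torch
import torch.nn as nn

num_pos = 4
mask_feat = torch.randn(num_pos, 256, 14, 14)    # RoI mask features
mask_preds = torch.randn(num_pos, 28, 28)        # per-RoI mask logits

pooled = nn.MaxPool2d(2, 2)(mask_preds.sigmoid().unsqueeze(1))
print(pooled.shape)                              # torch.Size([4, 1, 14, 14])

x = torch.cat((mask_feat, pooled), dim=1)
print(x.shape)                                   # torch.Size([4, 257, 14, 14])
# The head then applies `num_convs` 3x3 convs (the last with stride 2,
# giving 7x7), flattens, runs the fc layers, and predicts one IoU value
# per class with `fc_mask_iou`.
# ---------------------------------------------------------------------------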
+ """ + mask_iou_targets = self.get_targets( + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + mask_preds=mask_preds, + mask_targets=mask_targets, + rcnn_train_cfg=rcnn_train_cfg) + + pos_inds = mask_iou_targets > 0 + if pos_inds.sum() > 0: + loss_mask_iou = self.loss_iou(mask_iou_pred[pos_inds], + mask_iou_targets[pos_inds]) + else: + loss_mask_iou = mask_iou_pred.sum() * 0 + return dict(loss_mask_iou=loss_mask_iou) + + def get_targets(self, sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, mask_preds: Tensor, + mask_targets: Tensor, + rcnn_train_cfg: ConfigDict) -> Tensor: + """Compute target of mask IoU. + + Mask IoU target is the IoU of the predicted mask (inside a bbox) and + the gt mask of corresponding gt mask (the whole instance). + The intersection area is computed inside the bbox, and the gt mask area + is computed with two steps, firstly we compute the gt area inside the + bbox, then divide it by the area ratio of gt area inside the bbox and + the gt area of the whole instance. + + Args: + sampling_results (list[:obj:`SamplingResult`]): sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It includes ``masks`` inside. + mask_preds (Tensor): Predicted masks of each positive proposal, + shape (num_pos, h, w). + mask_targets (Tensor): Gt mask of each positive proposal, + binary map of the shape (num_pos, h, w). + rcnn_train_cfg (obj:`ConfigDict`): Training config for R-CNN part. + + Returns: + Tensor: mask iou target (length == num positive). + """ + pos_proposals = [res.pos_priors for res in sampling_results] + pos_assigned_gt_inds = [ + res.pos_assigned_gt_inds for res in sampling_results + ] + gt_masks = [res.masks for res in batch_gt_instances] + + # compute the area ratio of gt areas inside the proposals and + # the whole instance + area_ratios = map(self._get_area_ratio, pos_proposals, + pos_assigned_gt_inds, gt_masks) + area_ratios = torch.cat(list(area_ratios)) + assert mask_targets.size(0) == area_ratios.size(0) + + mask_preds = (mask_preds > rcnn_train_cfg.mask_thr_binary).float() + mask_pred_areas = mask_preds.sum((-1, -2)) + + # mask_preds and mask_targets are binary maps + overlap_areas = (mask_preds * mask_targets).sum((-1, -2)) + + # compute the mask area of the whole instance + gt_full_areas = mask_targets.sum((-1, -2)) / (area_ratios + 1e-7) + + mask_iou_targets = overlap_areas / ( + mask_pred_areas + gt_full_areas - overlap_areas) + return mask_iou_targets + + def _get_area_ratio(self, pos_proposals: Tensor, + pos_assigned_gt_inds: Tensor, + gt_masks: InstanceData) -> Tensor: + """Compute area ratio of the gt mask inside the proposal and the gt + mask of the corresponding instance. + + Args: + pos_proposals (Tensor): Positive proposals, has shape (num_pos, 4). + pos_assigned_gt_inds (Tensor): positive proposals assigned ground + truth index. + gt_masks (BitmapMask or PolygonMask): Gt masks (the whole instance) + of each image, with the same shape of the input image. + + Returns: + Tensor: The area ratio of the gt mask inside the proposal and the + gt mask of the corresponding instance. 
+ """ + num_pos = pos_proposals.size(0) + if num_pos > 0: + area_ratios = [] + proposals_np = pos_proposals.cpu().numpy() + pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() + # compute mask areas of gt instances (batch processing for speedup) + gt_instance_mask_area = gt_masks.areas + for i in range(num_pos): + gt_mask = gt_masks[pos_assigned_gt_inds[i]] + + # crop the gt mask inside the proposal + bbox = proposals_np[i, :].astype(np.int32) + gt_mask_in_proposal = gt_mask.crop(bbox) + + ratio = gt_mask_in_proposal.areas[0] / ( + gt_instance_mask_area[pos_assigned_gt_inds[i]] + 1e-7) + area_ratios.append(ratio) + area_ratios = torch.from_numpy(np.stack(area_ratios)).float().to( + pos_proposals.device) + else: + area_ratios = pos_proposals.new_zeros((0, )) + return area_ratios + + def predict_by_feat(self, mask_iou_preds: Tuple[Tensor], + results_list: InstanceList) -> InstanceList: + """Predict the mask iou and calculate it into ``results.scores``. + + Args: + mask_iou_preds (Tensor): Mask IoU predictions results, has shape + (num_proposals, num_classes) + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + assert len(mask_iou_preds) == len(results_list) + for results, mask_iou_pred in zip(results_list, mask_iou_preds): + labels = results.labels + scores = results.scores + results.scores = scores * mask_iou_pred[range(labels.size(0)), + labels] + return results_list diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py new file mode 100644 index 0000000..ffd30c3 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.models.layers import ResLayer, SimplifiedBasicBlock +from mmdet.registry import MODELS +from .fcn_mask_head import FCNMaskHead + + +@MODELS.register_module() +class SCNetMaskHead(FCNMaskHead): + """Mask head for `SCNet `_. + + Args: + conv_to_res (bool, optional): if True, change the conv layers to + ``SimplifiedBasicBlock``. + """ + + def __init__(self, conv_to_res: bool = True, **kwargs) -> None: + super().__init__(**kwargs) + self.conv_to_res = conv_to_res + if conv_to_res: + assert self.conv_kernel_size == 3 + self.num_res_blocks = self.num_convs // 2 + self.convs = ResLayer( + SimplifiedBasicBlock, + self.in_channels, + self.conv_out_channels, + self.num_res_blocks, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) diff --git a/mmdetection/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py b/mmdetection/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py new file mode 100644 index 0000000..55c5c8e --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.models.layers import ResLayer, SimplifiedBasicBlock +from mmdet.registry import MODELS +from .fused_semantic_head import FusedSemanticHead + + +@MODELS.register_module() +class SCNetSemanticHead(FusedSemanticHead): + """Mask head for `SCNet `_. + + Args: + conv_to_res (bool, optional): if True, change the conv layers to + ``SimplifiedBasicBlock``. + """ + + def __init__(self, conv_to_res: bool = True, **kwargs) -> None: + super().__init__(**kwargs) + self.conv_to_res = conv_to_res + if self.conv_to_res: + num_res_blocks = self.num_convs // 2 + self.convs = ResLayer( + SimplifiedBasicBlock, + self.in_channels, + self.conv_out_channels, + num_res_blocks, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + self.num_convs = num_res_blocks diff --git a/mmdetection/mmdet/models/roi_heads/mask_scoring_roi_head.py b/mmdetection/mmdet/models/roi_heads/mask_scoring_roi_head.py new file mode 100644 index 0000000..6545c0e --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/mask_scoring_roi_head.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils.misc import empty_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class MaskScoringRoIHead(StandardRoIHead): + """Mask Scoring RoIHead for `Mask Scoring RCNN. + + `_. + + Args: + mask_iou_head (:obj`ConfigDict`, dict): The config of mask_iou_head. + """ + + def __init__(self, mask_iou_head: ConfigType, **kwargs): + assert mask_iou_head is not None + super().__init__(**kwargs) + self.mask_iou_head = MODELS.build(mask_iou_head) + + def forward(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList = None) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. 
+ """ + results = () + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + bbox_results = self._bbox_forward(x, rois) + results = results + (bbox_results['cls_score'], + bbox_results['bbox_pred']) + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_results = self._mask_forward(x, mask_rois) + results = results + (mask_results['mask_preds'], ) + + # mask iou head + cls_score = bbox_results['cls_score'][:100] + mask_preds = mask_results['mask_preds'] + mask_feats = mask_results['mask_feats'] + _, labels = cls_score[:, :self.bbox_head.num_classes].max(dim=1) + mask_iou_preds = self.mask_iou_head( + mask_feats, mask_preds[range(labels.size(0)), labels]) + results = results + (mask_iou_preds, ) + + return results + + def mask_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult], bbox_feats, + batch_gt_instances: InstanceList) -> dict: + """Perform forward propagation and loss calculation of the mask head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + bbox_feats (Tensor): Extract bbox RoI features. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `mask_feats` (Tensor): Extract mask RoI features. + - `mask_targets` (Tensor): Mask target of each positive\ + proposals in the image. + - `loss_mask` (dict): A dictionary of mask loss components. + - `loss_mask_iou` (Tensor): mask iou loss. + """ + if not self.share_roi_extractor: + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + else: + pos_inds = [] + device = bbox_feats.device + for res in sampling_results: + pos_inds.append( + torch.ones( + res.pos_priors.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds.append( + torch.zeros( + res.neg_priors.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds = torch.cat(pos_inds) + + mask_results = self._mask_forward( + x, pos_inds=pos_inds, bbox_feats=bbox_feats) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg) + mask_targets = mask_loss_and_target['mask_targets'] + mask_results.update(loss_mask=mask_loss_and_target['loss_mask']) + if mask_results['loss_mask'] is None: + return mask_results + + # mask iou head forward and loss + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + pos_mask_pred = mask_results['mask_preds'][ + range(mask_results['mask_preds'].size(0)), pos_labels] + mask_iou_pred = self.mask_iou_head(mask_results['mask_feats'], + pos_mask_pred) + pos_mask_iou_pred = mask_iou_pred[range(mask_iou_pred.size(0)), + pos_labels] + + loss_mask_iou = self.mask_iou_head.loss_and_target( + pos_mask_iou_pred, pos_mask_pred, mask_targets, sampling_results, + batch_gt_instances, self.train_cfg) + mask_results['loss_mask'].update(loss_mask_iou) + return mask_results + + def predict_mask(self, + x: Tensor, + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the mask head and predict detection + results on the 
features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + mask_feats = mask_results['mask_feats'] + # get mask scores with mask iou head + labels = torch.cat([res.labels for res in results_list]) + mask_iou_preds = self.mask_iou_head( + mask_feats, mask_preds[range(labels.size(0)), labels]) + # split batch mask prediction back to each image + num_mask_rois_per_img = [len(res) for res in results_list] + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + mask_iou_preds = mask_iou_preds.split(num_mask_rois_per_img, 0) + + # TODO: Handle the case where rescale is false + results_list = self.mask_head.predict_by_feat( + mask_preds=mask_preds, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + results_list = self.mask_iou_head.predict_by_feat( + mask_iou_preds=mask_iou_preds, results_list=results_list) + return results_list diff --git a/mmdetection/mmdet/models/roi_heads/multi_instance_roi_head.py b/mmdetection/mmdet/models/roi_heads/multi_instance_roi_head.py new file mode 100644 index 0000000..fee55b0 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/multi_instance_roi_head.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class MultiInstanceRoIHead(StandardRoIHead): + """The roi head for Multi-instance prediction.""" + + def __init__(self, num_instance: int = 2, *args, **kwargs) -> None: + self.num_instance = num_instance + super().__init__(*args, **kwargs) + + def init_bbox_head(self, bbox_roi_extractor: ConfigType, + bbox_head: ConfigType) -> None: + """Initialize box head and box roi extractor. + + Args: + bbox_roi_extractor (dict or ConfigDict): Config of box + roi extractor. + bbox_head (dict or ConfigDict): Config of box in box head. 
+ """ + self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor) + self.bbox_head = MODELS.build(bbox_head) + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `cls_score_ref` (Tensor): The cls_score after refine model. + - `bbox_pred_ref` (Tensor): The bbox_pred after refine model. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + bbox_results = self.bbox_head(bbox_feats) + + if self.bbox_head.with_refine: + bbox_results = dict( + cls_score=bbox_results[0], + bbox_pred=bbox_results[1], + cls_score_ref=bbox_results[2], + bbox_pred_ref=bbox_results[3], + bbox_feats=bbox_feats) + else: + bbox_results = dict( + cls_score=bbox_results[0], + bbox_pred=bbox_results[1], + bbox_feats=bbox_feats) + + return bbox_results + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + # If there is a refining process, add refine loss. + if 'cls_score_ref' in bbox_results: + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + bbox_loss_and_target_ref = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score_ref'], + bbox_pred=bbox_results['bbox_pred_ref'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + bbox_results['loss_bbox']['loss_rcnn_emd_ref'] = \ + bbox_loss_and_target_ref['loss_bbox']['loss_rcnn_emd'] + else: + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + + return bbox_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. 
It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + sampling_results = [] + for i in range(len(batch_data_samples)): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + batch_gt_instances_ignore=batch_gt_instances_ignore[i]) + sampling_results.append(sampling_result) + + losses = dict() + # bbox head loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results) + losses.update(bbox_results['loss_bbox']) + + return losses + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + proposals = [res.bboxes for res in rpn_results_list] + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, rois.device, task_type='bbox') + + bbox_results = self._bbox_forward(x, rois) + + # split batch bbox prediction back to each image + if 'cls_score_ref' in bbox_results: + cls_scores = bbox_results['cls_score_ref'] + bbox_preds = bbox_results['bbox_pred_ref'] + else: + cls_scores = bbox_results['cls_score'] + bbox_preds = bbox_results['bbox_pred'] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + + if bbox_preds is not None: + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + else: + bbox_preds = (None, ) * len(proposals) + + result_list = self.bbox_head.predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=rcnn_test_cfg, + rescale=rescale) + return result_list diff --git a/mmdetection/mmdet/models/roi_heads/pisa_roi_head.py b/mmdetection/mmdet/models/roi_heads/pisa_roi_head.py new file mode 100644 index 0000000..45d5987 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/pisa_roi_head.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
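# ---- Illustrative sketch (not from the mmdetection source) ----------------
# The RoI heads above run every proposal of the batch through a single
# forward pass and then split the flat predictions back per image, as in
# `MultiInstanceRoIHead.predict_bbox`. A minimal demonstration with two
# images and toy sizes:
import torch

num_proposals_per_img = (3, 2)        # image 0 has 3 RoIs, image 1 has 2
cls_scores = torch.randn(5, 3)        # flat (num_rois, num_classes) output

per_image = cls_scores.split(num_proposals_per_img, 0)
print([t.shape for t in per_image])   # [torch.Size([3, 3]), torch.Size([2, 3])]
# ---------------------------------------------------------------------------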
+from typing import List, Tuple + +from torch import Tensor + +from mmdet.models.task_modules import SamplingResult +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList +from ..losses.pisa_loss import carl_loss, isr_p +from ..utils import unpack_gt_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class PISARoIHead(StandardRoIHead): + r"""The RoI head for `Prime Sample Attention in Object Detection + `_.""" + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + neg_label_weights = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + if isinstance(sampling_result, tuple): + sampling_result, neg_label_weight = sampling_result + sampling_results.append(sampling_result) + neg_label_weights.append(neg_label_weight) + + losses = dict() + # bbox head forward and loss + if self.with_bbox: + bbox_results = self.bbox_loss( + x, sampling_results, neg_label_weights=neg_label_weights) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) + + return losses + + def bbox_loss(self, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + neg_label_weights: List[Tensor] = None) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. 
+ """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + bbox_targets = self.bbox_head.get_targets(sampling_results, + self.train_cfg) + + # neg_label_weights obtained by sampler is image-wise, mapping back to + # the corresponding location in label weights + if neg_label_weights[0] is not None: + label_weights = bbox_targets[1] + cur_num_rois = 0 + for i in range(len(sampling_results)): + num_pos = sampling_results[i].pos_inds.size(0) + num_neg = sampling_results[i].neg_inds.size(0) + label_weights[cur_num_rois + num_pos:cur_num_rois + num_pos + + num_neg] = neg_label_weights[i] + cur_num_rois += num_pos + num_neg + + cls_score = bbox_results['cls_score'] + bbox_pred = bbox_results['bbox_pred'] + + # Apply ISR-P + isr_cfg = self.train_cfg.get('isr', None) + if isr_cfg is not None: + bbox_targets = isr_p( + cls_score, + bbox_pred, + bbox_targets, + rois, + sampling_results, + self.bbox_head.loss_cls, + self.bbox_head.bbox_coder, + **isr_cfg, + num_class=self.bbox_head.num_classes) + loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, rois, + *bbox_targets) + + # Add CARL Loss + carl_cfg = self.train_cfg.get('carl', None) + if carl_cfg is not None: + loss_carl = carl_loss( + cls_score, + bbox_targets[0], + bbox_pred, + bbox_targets[2], + self.bbox_head.loss_bbox, + **carl_cfg, + num_class=self.bbox_head.num_classes) + loss_bbox.update(loss_carl) + + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results diff --git a/mmdetection/mmdet/models/roi_heads/point_rend_roi_head.py b/mmdetection/mmdet/models/roi_heads/point_rend_roi_head.py new file mode 100644 index 0000000..6a06415 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/point_rend_roi_head.py @@ -0,0 +1,236 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend # noqa +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class PointRendRoIHead(StandardRoIHead): + """`PointRend `_.""" + + def __init__(self, point_head: ConfigType, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + assert self.with_bbox and self.with_mask + self.init_point_head(point_head) + + def init_point_head(self, point_head: ConfigType) -> None: + """Initialize ``point_head``""" + self.point_head = MODELS.build(point_head) + + def mask_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult], bbox_feats: Tensor, + batch_gt_instances: InstanceList) -> dict: + """Run forward function and calculate loss for mask head and point head + in training.""" + mask_results = super().mask_loss( + x=x, + sampling_results=sampling_results, + bbox_feats=bbox_feats, + batch_gt_instances=batch_gt_instances) + + mask_point_results = self._mask_point_loss( + x=x, + sampling_results=sampling_results, + mask_preds=mask_results['mask_preds'], + batch_gt_instances=batch_gt_instances) + mask_results['loss_mask'].update( + loss_point=mask_point_results['loss_point']) + + return mask_results + + def _mask_point_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult], + mask_preds: Tensor, + batch_gt_instances: InstanceList) -> dict: + """Run forward function and calculate loss for point head in + training.""" + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + rel_roi_points = self.point_head.get_roi_rel_points_train( + mask_preds, pos_labels, cfg=self.train_cfg) + rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, rois, rel_roi_points) + coarse_point_feats = point_sample(mask_preds, rel_roi_points) + mask_point_pred = self.point_head(fine_grained_point_feats, + coarse_point_feats) + + loss_and_target = self.point_head.loss_and_target( + point_pred=mask_point_pred, + rel_roi_points=rel_roi_points, + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + cfg=self.train_cfg) + + return loss_and_target + + def _mask_point_forward_test(self, x: Tuple[Tensor], rois: Tensor, + label_preds: Tensor, + mask_preds: Tensor) -> Tensor: + """Mask refining process with point head in testing. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + rois (Tensor): shape (num_rois, 5). + label_preds (Tensor): The predication class for each rois. + mask_preds (Tensor): The predication coarse masks of + shape (num_rois, num_classes, small_size, small_size). + + Returns: + Tensor: The refined masks of shape (num_rois, num_classes, + large_size, large_size). 
+ """ + refined_mask_pred = mask_preds.clone() + for subdivision_step in range(self.test_cfg.subdivision_steps): + refined_mask_pred = F.interpolate( + refined_mask_pred, + scale_factor=self.test_cfg.scale_factor, + mode='bilinear', + align_corners=False) + # If `subdivision_num_points` is larger or equal to the + # resolution of the next step, then we can skip this step + num_rois, channels, mask_height, mask_width = \ + refined_mask_pred.shape + if (self.test_cfg.subdivision_num_points >= + self.test_cfg.scale_factor**2 * mask_height * mask_width + and + subdivision_step < self.test_cfg.subdivision_steps - 1): + continue + point_indices, rel_roi_points = \ + self.point_head.get_roi_rel_points_test( + refined_mask_pred, label_preds, cfg=self.test_cfg) + + fine_grained_point_feats = self._get_fine_grained_point_feats( + x=x, rois=rois, rel_roi_points=rel_roi_points) + coarse_point_feats = point_sample(mask_preds, rel_roi_points) + mask_point_pred = self.point_head(fine_grained_point_feats, + coarse_point_feats) + + point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1) + refined_mask_pred = refined_mask_pred.reshape( + num_rois, channels, mask_height * mask_width) + refined_mask_pred = refined_mask_pred.scatter_( + 2, point_indices, mask_point_pred) + refined_mask_pred = refined_mask_pred.view(num_rois, channels, + mask_height, mask_width) + + return refined_mask_pred + + def _get_fine_grained_point_feats(self, x: Tuple[Tensor], rois: Tensor, + rel_roi_points: Tensor) -> Tensor: + """Sample fine grained feats from each level feature map and + concatenate them together. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + rois (Tensor): shape (num_rois, 5). + rel_roi_points (Tensor): A tensor of shape (num_rois, num_points, + 2) that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the [mask_height, mask_width] grid. + + Returns: + Tensor: The fine grained features for each points, + has shape (num_rois, feats_channels, num_points). + """ + assert rois.shape[0] > 0, 'RoI is a empty tensor.' + num_imgs = x[0].shape[0] + fine_grained_feats = [] + for idx in range(self.mask_roi_extractor.num_inputs): + feats = x[idx] + spatial_scale = 1. / float( + self.mask_roi_extractor.featmap_strides[idx]) + point_feats = [] + for batch_ind in range(num_imgs): + # unravel batch dim + feat = feats[batch_ind].unsqueeze(0) + inds = (rois[:, 0].long() == batch_ind) + if inds.any(): + rel_img_points = rel_roi_point_to_rel_img_point( + rois=rois[inds], + rel_roi_points=rel_roi_points[inds], + img=feat.shape[2:], + spatial_scale=spatial_scale).unsqueeze(0) + point_feat = point_sample(feat, rel_img_points) + point_feat = point_feat.squeeze(0).transpose(0, 1) + point_feats.append(point_feat) + fine_grained_feats.append(torch.cat(point_feats, dim=0)) + return torch.cat(fine_grained_feats, dim=1) + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. 
+ Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + # don't need to consider aug_test. + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + num_mask_rois_per_img = [len(res) for res in results_list] + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + + # refine mask_preds + mask_rois = mask_rois.split(num_mask_rois_per_img, 0) + mask_preds_refined = [] + for i in range(len(batch_img_metas)): + labels = results_list[i].labels + x_i = [xx[[i]] for xx in x] + mask_rois_i = mask_rois[i] + mask_rois_i[:, 0] = 0 + mask_pred_i = self._mask_point_forward_test( + x_i, mask_rois_i, labels, mask_preds[i]) + mask_preds_refined.append(mask_pred_i) + + # TODO: Handle the case where rescale is false + results_list = self.mask_head.predict_by_feat( + mask_preds=mask_preds_refined, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + return results_list diff --git a/mmdetection/mmdet/models/roi_heads/roi_extractors/__init__.py b/mmdetection/mmdet/models/roi_heads/roi_extractors/__init__.py new file mode 100644 index 0000000..0f60214 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/roi_extractors/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_roi_extractor import BaseRoIExtractor +from .generic_roi_extractor import GenericRoIExtractor +from .single_level_roi_extractor import SingleRoIExtractor + +__all__ = ['BaseRoIExtractor', 'SingleRoIExtractor', 'GenericRoIExtractor'] diff --git a/mmdetection/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py b/mmdetection/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py new file mode 100644 index 0000000..a8de051 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv import ops +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.utils import ConfigType, OptMultiConfig + + +class BaseRoIExtractor(BaseModule, metaclass=ABCMeta): + """Base class for RoI extractor. + + Args: + roi_layer (:obj:`ConfigDict` or dict): Specify RoI layer type and + arguments. + out_channels (int): Output channels of RoI layers. + featmap_strides (list[int]): Strides of input feature maps. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
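Example:
    A typical configuration for the ``SingleRoIExtractor`` subclass added
    alongside this base class (the values here are illustrative, mirroring
    the defaults used by ``SparseRoIHead`` below)::

        extractor = SingleRoIExtractor(
            roi_layer=dict(type='RoIAlign', output_size=7,
                           sampling_ratio=2),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32])
        # extractor.num_inputs == 4 and roi_layers[i] is built with
        # spatial_scale = 1 / featmap_strides[i]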
+ """ + + def __init__(self, + roi_layer: ConfigType, + out_channels: int, + featmap_strides: List[int], + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides) + self.out_channels = out_channels + self.featmap_strides = featmap_strides + + @property + def num_inputs(self) -> int: + """int: Number of input feature maps.""" + return len(self.featmap_strides) + + def build_roi_layers(self, layer_cfg: ConfigType, + featmap_strides: List[int]) -> nn.ModuleList: + """Build RoI operator to extract feature from each level feature map. + + Args: + layer_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config RoI layer operation. Options are modules under + ``mmcv/ops`` such as ``RoIAlign``. + featmap_strides (list[int]): The stride of input feature map w.r.t + to the original image size, which would be used to scale RoI + coordinate (original image coordinate system) to feature + coordinate system. + + Returns: + :obj:`nn.ModuleList`: The RoI extractor modules for each level + feature map. + """ + + cfg = layer_cfg.copy() + layer_type = cfg.pop('type') + if isinstance(layer_type, str): + assert hasattr(ops, layer_type) + layer_cls = getattr(ops, layer_type) + else: + layer_cls = layer_type + roi_layers = nn.ModuleList( + [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides]) + return roi_layers + + def roi_rescale(self, rois: Tensor, scale_factor: float) -> Tensor: + """Scale RoI coordinates by scale factor. + + Args: + rois (Tensor): RoI (Region of Interest), shape (n, 5) + scale_factor (float): Scale factor that RoI will be multiplied by. + + Returns: + Tensor: Scaled RoI. + """ + + cx = (rois[:, 1] + rois[:, 3]) * 0.5 + cy = (rois[:, 2] + rois[:, 4]) * 0.5 + w = rois[:, 3] - rois[:, 1] + h = rois[:, 4] - rois[:, 2] + new_w = w * scale_factor + new_h = h * scale_factor + x1 = cx - new_w * 0.5 + x2 = cx + new_w * 0.5 + y1 = cy - new_h * 0.5 + y2 = cy + new_h * 0.5 + new_rois = torch.stack((rois[:, 0], x1, y1, x2, y2), dim=-1) + return new_rois + + @abstractmethod + def forward(self, + feats: Tuple[Tensor], + rois: Tensor, + roi_scale_factor: Optional[float] = None) -> Tensor: + """Extractor ROI feats. + + Args: + feats (Tuple[Tensor]): Multi-scale features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + roi_scale_factor (Optional[float]): RoI scale factor. + Defaults to None. + + Returns: + Tensor: RoI feature. + """ + pass diff --git a/mmdetection/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py b/mmdetection/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py new file mode 100644 index 0000000..39d4c90 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +from mmcv.cnn.bricks import build_plugin_layer +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType +from .base_roi_extractor import BaseRoIExtractor + + +@MODELS.register_module() +class GenericRoIExtractor(BaseRoIExtractor): + """Extract RoI features from all level feature maps levels. + + This is the implementation of `A novel Region of Interest Extraction Layer + for Instance Segmentation `_. + + Args: + aggregation (str): The method to aggregate multiple feature maps. + Options are 'sum', 'concat'. Defaults to 'sum'. 
+ pre_cfg (:obj:`ConfigDict` or dict): Specify pre-processing modules. + Defaults to None. + post_cfg (:obj:`ConfigDict` or dict): Specify post-processing modules. + Defaults to None. + kwargs (keyword arguments): Arguments that are the same + as :class:`BaseRoIExtractor`. + """ + + def __init__(self, + aggregation: str = 'sum', + pre_cfg: OptConfigType = None, + post_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__(**kwargs) + + assert aggregation in ['sum', 'concat'] + + self.aggregation = aggregation + self.with_post = post_cfg is not None + self.with_pre = pre_cfg is not None + # build pre/post processing modules + if self.with_post: + self.post_module = build_plugin_layer(post_cfg, '_post_module')[1] + if self.with_pre: + self.pre_module = build_plugin_layer(pre_cfg, '_pre_module')[1] + + def forward(self, + feats: Tuple[Tensor], + rois: Tensor, + roi_scale_factor: Optional[float] = None) -> Tensor: + """Extractor ROI feats. + + Args: + feats (Tuple[Tensor]): Multi-scale features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + roi_scale_factor (Optional[float]): RoI scale factor. + Defaults to None. + + Returns: + Tensor: RoI feature. + """ + out_size = self.roi_layers[0].output_size + num_levels = len(feats) + roi_feats = feats[0].new_zeros( + rois.size(0), self.out_channels, *out_size) + + # some times rois is an empty tensor + if roi_feats.shape[0] == 0: + return roi_feats + + if num_levels == 1: + return self.roi_layers[0](feats[0], rois) + + if roi_scale_factor is not None: + rois = self.roi_rescale(rois, roi_scale_factor) + + # mark the starting channels for concat mode + start_channels = 0 + for i in range(num_levels): + roi_feats_t = self.roi_layers[i](feats[i], rois) + end_channels = start_channels + roi_feats_t.size(1) + if self.with_pre: + # apply pre-processing to a RoI extracted from each layer + roi_feats_t = self.pre_module(roi_feats_t) + if self.aggregation == 'sum': + # and sum them all + roi_feats += roi_feats_t + else: + # and concat them along channel dimension + roi_feats[:, start_channels:end_channels] = roi_feats_t + # update channels starting position + start_channels = end_channels + # check if concat channels match at the end + if self.aggregation == 'concat': + assert start_channels == self.out_channels + + if self.with_post: + # apply post-processing before return the result + roi_feats = self.post_module(roi_feats) + return roi_feats diff --git a/mmdetection/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/mmdetection/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py new file mode 100644 index 0000000..59229e0 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from .base_roi_extractor import BaseRoIExtractor + + +@MODELS.register_module() +class SingleRoIExtractor(BaseRoIExtractor): + """Extract RoI features from a single level feature map. + + If there are multiple input feature levels, each RoI is mapped to a level + according to its scale. The mapping rule is proposed in + `FPN `_. + + Args: + roi_layer (:obj:`ConfigDict` or dict): Specify RoI layer type and + arguments. + out_channels (int): Output channels of RoI layers. 
+ featmap_strides (List[int]): Strides of input feature maps. + finest_scale (int): Scale threshold of mapping to level 0. + Defaults to 56. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. + """ + + def __init__(self, + roi_layer: ConfigType, + out_channels: int, + featmap_strides: List[int], + finest_scale: int = 56, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + roi_layer=roi_layer, + out_channels=out_channels, + featmap_strides=featmap_strides, + init_cfg=init_cfg) + self.finest_scale = finest_scale + + def map_roi_levels(self, rois: Tensor, num_levels: int) -> Tensor: + """Map rois to corresponding feature levels by scales. + + - scale < finest_scale * 2: level 0 + - finest_scale * 2 <= scale < finest_scale * 4: level 1 + - finest_scale * 4 <= scale < finest_scale * 8: level 2 + - scale >= finest_scale * 8: level 3 + + Args: + rois (Tensor): Input RoIs, shape (k, 5). + num_levels (int): Total level number. + + Returns: + Tensor: Level index (0-based) of each RoI, shape (k, ) + """ + scale = torch.sqrt( + (rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2])) + target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6)) + target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long() + return target_lvls + + def forward(self, + feats: Tuple[Tensor], + rois: Tensor, + roi_scale_factor: Optional[float] = None): + """Extractor ROI feats. + + Args: + feats (Tuple[Tensor]): Multi-scale features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + roi_scale_factor (Optional[float]): RoI scale factor. + Defaults to None. + + Returns: + Tensor: RoI feature. + """ + # convert fp32 to fp16 when amp is on + rois = rois.type_as(feats[0]) + out_size = self.roi_layers[0].output_size + num_levels = len(feats) + roi_feats = feats[0].new_zeros( + rois.size(0), self.out_channels, *out_size) + + # TODO: remove this when parrots supports + if torch.__version__ == 'parrots': + roi_feats.requires_grad = True + + if num_levels == 1: + if len(rois) == 0: + return roi_feats + return self.roi_layers[0](feats[0], rois) + + target_lvls = self.map_roi_levels(rois, num_levels) + + if roi_scale_factor is not None: + rois = self.roi_rescale(rois, roi_scale_factor) + + for i in range(num_levels): + mask = target_lvls == i + inds = mask.nonzero(as_tuple=False).squeeze(1) + if inds.numel() > 0: + rois_ = rois[inds] + roi_feats_t = self.roi_layers[i](feats[i], rois_) + roi_feats[inds] = roi_feats_t + else: + # Sometimes some pyramid levels will not be used for RoI + # feature extraction and this will cause an incomplete + # computation graph in one GPU, which is different from those + # in other GPUs and will cause a hanging error. + # Therefore, we add it to ensure each feature pyramid is + # included in the computation graph to avoid runtime bugs. + roi_feats += sum( + x.view(-1)[0] + for x in self.parameters()) * 0. + feats[i].sum() * 0. + return roi_feats diff --git a/mmdetection/mmdet/models/roi_heads/scnet_roi_head.py b/mmdetection/mmdet/models/roi_heads/scnet_roi_head.py new file mode 100644 index 0000000..e6d2bc1 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/scnet_roi_head.py @@ -0,0 +1,677 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
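# The FPN level mapping implemented by SingleRoIExtractor.map_roi_levels
# above can be checked in isolation. A minimal sketch with the documented
# default finest_scale=56 and three hand-picked boxes (values are
# illustrative):
import torch

def map_roi_levels(rois, num_levels, finest_scale=56):
    scale = torch.sqrt((rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2]))
    target_lvls = torch.floor(torch.log2(scale / finest_scale + 1e-6))
    return target_lvls.clamp(min=0, max=num_levels - 1).long()

# rois are (batch_idx, x1, y1, x2, y2): a 56x56 box lands on level 0,
# a 224x224 box on level 2, and a 1000x1000 box is clamped to level 3.
rois = torch.tensor([[0., 0., 0., 56., 56.],
                     [0., 0., 0., 224., 224.],
                     [0., 0., 0., 1000., 1000.]])
print(map_roi_levels(rois, num_levels=4))  # tensor([0, 2, 3])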
+from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList, OptConfigType +from ..layers import adaptive_avg_pool2d +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .cascade_roi_head import CascadeRoIHead + + +@MODELS.register_module() +class SCNetRoIHead(CascadeRoIHead): + """RoIHead for `SCNet `_. + + Args: + num_stages (int): number of cascade stages. + stage_loss_weights (list): loss weight of cascade stages. + semantic_roi_extractor (dict): config to init semantic roi extractor. + semantic_head (dict): config to init semantic head. + feat_relay_head (dict): config to init feature_relay_head. + glbctx_head (dict): config to init global context head. + """ + + def __init__(self, + num_stages: int, + stage_loss_weights: List[float], + semantic_roi_extractor: OptConfigType = None, + semantic_head: OptConfigType = None, + feat_relay_head: OptConfigType = None, + glbctx_head: OptConfigType = None, + **kwargs) -> None: + super().__init__( + num_stages=num_stages, + stage_loss_weights=stage_loss_weights, + **kwargs) + assert self.with_bbox and self.with_mask + assert not self.with_shared_head # shared head is not supported + + if semantic_head is not None: + self.semantic_roi_extractor = MODELS.build(semantic_roi_extractor) + self.semantic_head = MODELS.build(semantic_head) + + if feat_relay_head is not None: + self.feat_relay_head = MODELS.build(feat_relay_head) + + if glbctx_head is not None: + self.glbctx_head = MODELS.build(glbctx_head) + + def init_mask_head(self, mask_roi_extractor: ConfigType, + mask_head: ConfigType) -> None: + """Initialize ``mask_head``""" + if mask_roi_extractor is not None: + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) + self.mask_head = MODELS.build(mask_head) + + # TODO move to base_roi_head later + @property + def with_semantic(self) -> bool: + """bool: whether the head has semantic head""" + return hasattr(self, + 'semantic_head') and self.semantic_head is not None + + @property + def with_feat_relay(self) -> bool: + """bool: whether the head has feature relay head""" + return (hasattr(self, 'feat_relay_head') + and self.feat_relay_head is not None) + + @property + def with_glbctx(self) -> bool: + """bool: whether the head has global context head""" + return hasattr(self, 'glbctx_head') and self.glbctx_head is not None + + def _fuse_glbctx(self, roi_feats: Tensor, glbctx_feat: Tensor, + rois: Tensor) -> Tensor: + """Fuse global context feats with roi feats. + + Args: + roi_feats (Tensor): RoI features. + glbctx_feat (Tensor): Global context feature.. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + Tensor: Fused feature. + """ + assert roi_feats.size(0) == rois.size(0) + # RuntimeError: isDifferentiableType(variable.scalar_type()) + # INTERNAL ASSERT FAILED if detach() is not used when calling + # roi_head.predict(). 
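# rois[:, 0] stores the batch index of each RoI, so the loop below adds
# each image's global context vector to every RoI feature from that image.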
+ img_inds = torch.unique(rois[:, 0].detach().cpu(), sorted=True).long() + fused_feats = torch.zeros_like(roi_feats) + for img_id in img_inds: + inds = (rois[:, 0] == img_id.item()) + fused_feats[inds] = roi_feats[inds] + glbctx_feat[img_id] + return fused_feats + + def _slice_pos_feats(self, feats: Tensor, + sampling_results: List[SamplingResult]) -> Tensor: + """Get features from pos rois. + + Args: + feats (Tensor): Input features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + Tensor: Sliced features. + """ + num_rois = [res.priors.size(0) for res in sampling_results] + num_pos_rois = [res.pos_priors.size(0) for res in sampling_results] + inds = torch.zeros(sum(num_rois), dtype=torch.bool) + start = 0 + for i in range(len(num_rois)): + start = 0 if i == 0 else start + num_rois[i - 1] + stop = start + num_pos_rois[i] + inds[start:stop] = 1 + sliced_feats = feats[inds] + return sliced_feats + + def _bbox_forward(self, + stage: int, + x: Tuple[Tensor], + rois: Tensor, + semantic_feat: Optional[Tensor] = None, + glbctx_feat: Optional[Tensor] = None) -> dict: + """Box head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + semantic_feat (Tensor): Semantic feature. Defaults to None. + glbctx_feat (Tensor): Global context feature. Defaults to None. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + if self.with_semantic and semantic_feat is not None: + bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: + bbox_semantic_feat = adaptive_avg_pool2d( + bbox_semantic_feat, bbox_feats.shape[-2:]) + bbox_feats += bbox_semantic_feat + if self.with_glbctx and glbctx_feat is not None: + bbox_feats = self._fuse_glbctx(bbox_feats, glbctx_feat, rois) + cls_score, bbox_pred, relayed_feat = bbox_head( + bbox_feats, return_shared_feat=True) + + bbox_results = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + relayed_feat=relayed_feat) + return bbox_results + + def _mask_forward(self, + x: Tuple[Tensor], + rois: Tensor, + semantic_feat: Optional[Tensor] = None, + glbctx_feat: Optional[Tensor] = None, + relayed_feat: Optional[Tensor] = None) -> dict: + """Mask head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + semantic_feat (Tensor): Semantic feature. Defaults to None. + glbctx_feat (Tensor): Global context feature. Defaults to None. + relayed_feat (Tensor): Relayed feature. Defaults to None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. 
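Note:
    Schematically, the fusion below is
    ``mask_feats = roi_feats [+ pooled semantic feat]
    [+ per-image global context] [+ relayed_feat]``; each optional term
    is added only when the corresponding branch is configured and its
    feature is passed in.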
+ """ + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + if self.with_semantic and semantic_feat is not None: + mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: + mask_semantic_feat = F.adaptive_avg_pool2d( + mask_semantic_feat, mask_feats.shape[-2:]) + mask_feats += mask_semantic_feat + if self.with_glbctx and glbctx_feat is not None: + mask_feats = self._fuse_glbctx(mask_feats, glbctx_feat, rois) + if self.with_feat_relay and relayed_feat is not None: + mask_feats = mask_feats + relayed_feat + mask_preds = self.mask_head(mask_feats) + mask_results = dict(mask_preds=mask_preds) + + return mask_results + + def bbox_loss(self, + stage: int, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + semantic_feat: Optional[Tensor] = None, + glbctx_feat: Optional[Tensor] = None) -> dict: + """Run forward function and calculate loss for box head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + semantic_feat (Tensor): Semantic feature. Defaults to None. + glbctx_feat (Tensor): Global context feature. Defaults to None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + """ + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward( + stage, + x, + rois, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat) + bbox_results.update(rois=rois) + + bbox_loss_and_target = bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg[stage]) + + bbox_results.update(bbox_loss_and_target) + return bbox_results + + def mask_loss(self, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + semantic_feat: Optional[Tensor] = None, + glbctx_feat: Optional[Tensor] = None, + relayed_feat: Optional[Tensor] = None) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + semantic_feat (Tensor): Semantic feature. Defaults to None. + glbctx_feat (Tensor): Global context feature. Defaults to None. + relayed_feat (Tensor): Relayed feature. Defaults to None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `loss_mask` (dict): A dictionary of mask loss components. 
+ """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward( + x, + pos_rois, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat, + relayed_feat=relayed_feat) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[-1]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def semantic_loss(self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + """Semantic segmentation loss. + + Args: + x (Tuple[Tensor]): Tuple of multi-level img features. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: Usually returns a dictionary with keys: + + - `semantic_feat` (Tensor): Semantic feature. + - `loss_seg` (dict): Semantic segmentation loss. + """ + gt_semantic_segs = [ + data_sample.gt_sem_seg.sem_seg + for data_sample in batch_data_samples + ] + gt_semantic_segs = torch.stack(gt_semantic_segs) + semantic_pred, semantic_feat = self.semantic_head(x) + loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_segs) + + semantic_results = dict(loss_seg=loss_seg, semantic_feat=semantic_feat) + + return semantic_results + + def global_context_loss(self, x: Tuple[Tensor], + batch_gt_instances: InstanceList) -> dict: + """Global context loss. + + Args: + x (Tuple[Tensor]): Tuple of multi-level img features. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `glbctx_feat` (Tensor): Global context feature. + - `loss_glbctx` (dict): Global context loss. + """ + gt_labels = [ + gt_instances.labels for gt_instances in batch_gt_instances + ] + mc_pred, glbctx_feat = self.glbctx_head(x) + loss_glbctx = self.glbctx_head.loss(mc_pred, gt_labels) + global_context_results = dict( + loss_glbctx=loss_glbctx, glbctx_feat=glbctx_feat) + + return global_context_results + + def loss(self, x: Tensor, rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + losses = dict() + + # semantic segmentation branch + if self.with_semantic: + semantic_results = self.semantic_loss( + x=x, batch_data_samples=batch_data_samples) + losses['loss_semantic_seg'] = semantic_results['loss_seg'] + semantic_feat = semantic_results['semantic_feat'] + else: + semantic_feat = None + + # global context branch + if self.with_glbctx: + global_context_results = self.global_context_loss( + x=x, batch_gt_instances=batch_gt_instances) + losses['loss_glbctx'] = global_context_results['loss_glbctx'] + glbctx_feat = global_context_results['glbctx_feat'] + else: + glbctx_feat = None + + results_list = rpn_results_list + num_imgs = len(batch_img_metas) + for stage in range(self.num_stages): + stage_loss_weight = self.stage_loss_weights[stage] + + # assign gts and sample proposals + sampling_results = [] + bbox_assigner = self.bbox_assigner[stage] + bbox_sampler = self.bbox_sampler[stage] + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + results.priors = results.pop('bboxes') + + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self.bbox_loss( + stage=stage, + x=x, + sampling_results=sampling_results, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # refine bboxes + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results=sampling_results, + bbox_results=bbox_results, + batch_img_metas=batch_img_metas) + + if self.with_feat_relay: + relayed_feat = self._slice_pos_feats(bbox_results['relayed_feat'], + sampling_results) + relayed_feat = self.feat_relay_head(relayed_feat) + else: + relayed_feat = None + + # mask head forward and loss + mask_results = self.mask_loss( + x=x, + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat, + relayed_feat=relayed_feat) + mask_stage_loss_weight = sum(self.stage_loss_weights) + losses['loss_mask'] = mask_stage_loss_weight * mask_results[ + 'loss_mask']['loss_mask'] + + return losses + + def predict(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from upstream network. Each + has shape (N, C, H, W). + rpn_results_list (list[:obj:`InstanceData`]): list of region + proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results to + the original image. Defaults to False. 
+ + Returns: + list[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + assert self.with_bbox, 'Bbox head must be implemented.' + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + if self.with_semantic: + _, semantic_feat = self.semantic_head(x) + else: + semantic_feat = None + + if self.with_glbctx: + _, glbctx_feat = self.glbctx_head(x) + else: + glbctx_feat = None + + # TODO: nms_op in mmcv need be enhanced, the bbox result may get + # difference when not rescale in bbox_head + + # If it has the mask branch, the bbox branch does not need + # to be scaled to the original image scale, because the mask + # branch will scale both bbox and mask at the same time. + bbox_rescale = rescale if not self.with_mask else False + results_list = self.predict_bbox( + x=x, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat, + batch_img_metas=batch_img_metas, + rpn_results_list=rpn_results_list, + rcnn_test_cfg=self.test_cfg, + rescale=bbox_rescale) + + if self.with_mask: + results_list = self.predict_mask( + x=x, + semantic_heat=semantic_feat, + glbctx_feat=glbctx_feat, + batch_img_metas=batch_img_metas, + results_list=results_list, + rescale=rescale) + + return results_list + + def predict_mask(self, + x: Tuple[Tensor], + semantic_heat: Tensor, + glbctx_feat: Tensor, + batch_img_metas: List[dict], + results_list: List[InstanceData], + rescale: bool = False) -> List[InstanceData]: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + semantic_feat (Tensor): Semantic feature. + glbctx_feat (Tensor): Global context feature. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). 
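Note:
    Before the mask head runs, the last-stage bbox head is invoked once
    more on ``mask_rois`` (``stage=-1``) solely to produce
    ``relayed_feat``, which is passed through ``feat_relay_head`` and
    fused into the mask features below.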
+ """ + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas=batch_img_metas, + device=mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + bboxes_results = self._bbox_forward( + stage=-1, + x=x, + rois=mask_rois, + semantic_feat=semantic_heat, + glbctx_feat=glbctx_feat) + relayed_feat = bboxes_results['relayed_feat'] + relayed_feat = self.feat_relay_head(relayed_feat) + + mask_results = self._mask_forward( + x=x, + rois=mask_rois, + semantic_feat=semantic_heat, + glbctx_feat=glbctx_feat, + relayed_feat=relayed_feat) + mask_preds = mask_results['mask_preds'] + + # split batch mask prediction back to each image + num_bbox_per_img = tuple(len(_bbox) for _bbox in bboxes) + mask_preds = mask_preds.split(num_bbox_per_img, 0) + + results_list = self.mask_head.predict_by_feat( + mask_preds=mask_preds, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + + return results_list + + def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + results = () + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + if self.with_semantic: + _, semantic_feat = self.semantic_head(x) + else: + semantic_feat = None + + if self.with_glbctx: + _, glbctx_feat = self.glbctx_head(x) + else: + glbctx_feat = None + + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + rois, cls_scores, bbox_preds = self._refine_roi( + x=x, + rois=rois, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat, + batch_img_metas=batch_img_metas, + num_proposals_per_img=num_proposals_per_img) + results = results + (cls_scores, bbox_preds) + # mask head + if self.with_mask: + rois = torch.cat(rois) + bboxes_results = self._bbox_forward( + stage=-1, + x=x, + rois=rois, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat) + relayed_feat = bboxes_results['relayed_feat'] + relayed_feat = self.feat_relay_head(relayed_feat) + mask_results = self._mask_forward( + x=x, + rois=rois, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat, + relayed_feat=relayed_feat) + mask_preds = mask_results['mask_preds'] + mask_preds = mask_preds.split(num_proposals_per_img, 0) + results = results + (mask_preds, ) + return results diff --git a/mmdetection/mmdet/models/roi_heads/shared_heads/__init__.py b/mmdetection/mmdet/models/roi_heads/shared_heads/__init__.py new file mode 100644 index 0000000..d56636a --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/shared_heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
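# Both predict paths above split the batched mask predictions back into
# per-image chunks with `Tensor.split` and the per-image box counts. A
# minimal sketch with made-up shapes:
import torch

mask_preds = torch.randn(5, 80, 28, 28)   # 5 RoIs pooled over the batch
num_bbox_per_img = (2, 0, 3)              # boxes kept for each of 3 images
per_image_preds = mask_preds.split(num_bbox_per_img, 0)
print([p.shape[0] for p in per_image_preds])  # [2, 0, 3]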
+from .res_layer import ResLayer + +__all__ = ['ResLayer'] diff --git a/mmdetection/mmdet/models/roi_heads/shared_heads/res_layer.py b/mmdetection/mmdet/models/roi_heads/shared_heads/res_layer.py new file mode 100644 index 0000000..d9210cb --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/shared_heads/res_layer.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +from mmengine.model import BaseModule + +from mmdet.models.backbones import ResNet +from mmdet.models.layers import ResLayer as _ResLayer +from mmdet.registry import MODELS + + +@MODELS.register_module() +class ResLayer(BaseModule): + + def __init__(self, + depth, + stage=3, + stride=2, + dilation=1, + style='pytorch', + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + with_cp=False, + dcn=None, + pretrained=None, + init_cfg=None): + super(ResLayer, self).__init__(init_cfg) + + self.norm_eval = norm_eval + self.norm_cfg = norm_cfg + self.stage = stage + self.fp16_enabled = False + block, stage_blocks = ResNet.arch_settings[depth] + stage_block = stage_blocks[stage] + planes = 64 * 2**stage + inplanes = 64 * 2**(stage - 1) * block.expansion + + res_layer = _ResLayer( + block, + inplanes, + planes, + stage_block, + stride=stride, + dilation=dilation, + style=style, + with_cp=with_cp, + norm_cfg=self.norm_cfg, + dcn=dcn) + self.add_module(f'layer{stage + 1}', res_layer) + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + res_layer = getattr(self, f'layer{self.stage + 1}') + out = res_layer(x) + return out + + def train(self, mode=True): + super(ResLayer, self).train(mode) + if self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/mmdetection/mmdet/models/roi_heads/sparse_roi_head.py b/mmdetection/mmdet/models/roi_heads/sparse_roi_head.py new file mode 100644 index 0000000..19c3e1e --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/sparse_roi_head.py @@ -0,0 +1,601 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList, OptConfigType +from ..utils.misc import empty_instances, unpack_gt_instances +from .cascade_roi_head import CascadeRoIHead + + +@MODELS.register_module() +class SparseRoIHead(CascadeRoIHead): + r"""The RoIHead for `Sparse R-CNN: End-to-End Object Detection with + Learnable Proposals `_ + and `Instances as Queries `_ + + Args: + num_stages (int): Number of stage whole iterative process. + Defaults to 6. + stage_loss_weights (Tuple[float]): The loss + weight of each stage. By default all stages have + the same weight 1. 
+ bbox_roi_extractor (:obj:`ConfigDict` or dict): Config of box + roi extractor. + mask_roi_extractor (:obj:`ConfigDict` or dict): Config of mask + roi extractor. + bbox_head (:obj:`ConfigDict` or dict): Config of box head. + mask_head (:obj:`ConfigDict` or dict): Config of mask head. + train_cfg (:obj:`ConfigDict` or dict, Optional): Configuration + information in train stage. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, Optional): Configuration + information in test stage. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. Defaults to None. + """ + + def __init__(self, + num_stages: int = 6, + stage_loss_weights: Tuple[float] = (1, 1, 1, 1, 1, 1), + proposal_feature_channel: int = 256, + bbox_roi_extractor: ConfigType = dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_roi_extractor: OptConfigType = None, + bbox_head: ConfigType = dict( + type='DIIHead', + num_classes=80, + num_fcs=2, + num_heads=8, + num_cls_fcs=1, + num_reg_fcs=3, + feedforward_channels=2048, + hidden_channels=256, + dropout=0.0, + roi_feat_size=7, + ffn_act_cfg=dict(type='ReLU', inplace=True)), + mask_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptConfigType = None) -> None: + assert bbox_roi_extractor is not None + assert bbox_head is not None + assert len(stage_loss_weights) == num_stages + self.num_stages = num_stages + self.stage_loss_weights = stage_loss_weights + self.proposal_feature_channel = proposal_feature_channel + super().__init__( + num_stages=num_stages, + stage_loss_weights=stage_loss_weights, + bbox_roi_extractor=bbox_roi_extractor, + mask_roi_extractor=mask_roi_extractor, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + # train_cfg would be None when run the test.py + if train_cfg is not None: + for stage in range(num_stages): + assert isinstance(self.bbox_sampler[stage], PseudoSampler), \ + 'Sparse R-CNN and QueryInst only support `PseudoSampler`' + + def bbox_loss(self, stage: int, x: Tuple[Tensor], + results_list: InstanceList, object_feats: Tensor, + batch_img_metas: List[dict], + batch_gt_instances: InstanceList) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + stage (int): The current stage in iterative process. + x (tuple[Tensor]): List of multi-level img features. + results_list (List[:obj:`InstanceData`]) : List of region + proposals. + object_feats (Tensor): The object feature extracted from + the previous stage. + batch_img_metas (list[dict]): Meta information of each image. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + proposal_list = [res.bboxes for res in results_list] + rois = bbox2roi(proposal_list) + bbox_results = self._bbox_forward(stage, x, rois, object_feats, + batch_img_metas) + imgs_whwh = torch.cat( + [res.imgs_whwh[None, ...] 
for res in results_list]) + cls_pred_list = bbox_results['detached_cls_scores'] + proposal_list = bbox_results['detached_proposals'] + + sampling_results = [] + bbox_head = self.bbox_head[stage] + for i in range(len(batch_img_metas)): + pred_instances = InstanceData() + # TODO: Enhance the logic + pred_instances.bboxes = proposal_list[i] # for assinger + pred_instances.scores = cls_pred_list[i] + pred_instances.priors = proposal_list[i] # for sampler + + assign_result = self.bbox_assigner[stage].assign( + pred_instances=pred_instances, + gt_instances=batch_gt_instances[i], + gt_instances_ignore=None, + img_meta=batch_img_metas[i]) + + sampling_result = self.bbox_sampler[stage].sample( + assign_result, pred_instances, batch_gt_instances[i]) + sampling_results.append(sampling_result) + + bbox_results.update(sampling_results=sampling_results) + + cls_score = bbox_results['cls_score'] + decoded_bboxes = bbox_results['decoded_bboxes'] + cls_score = cls_score.view(-1, cls_score.size(-1)) + decoded_bboxes = decoded_bboxes.view(-1, 4) + bbox_loss_and_target = bbox_head.loss_and_target( + cls_score, + decoded_bboxes, + sampling_results, + self.train_cfg[stage], + imgs_whwh=imgs_whwh, + concat=True) + bbox_results.update(bbox_loss_and_target) + + # propose for the new proposal_list + proposal_list = [] + for idx in range(len(batch_img_metas)): + results = InstanceData() + results.imgs_whwh = results_list[idx].imgs_whwh + results.bboxes = bbox_results['detached_proposals'][idx] + proposal_list.append(results) + bbox_results.update(results_list=proposal_list) + return bbox_results + + def _bbox_forward(self, stage: int, x: Tuple[Tensor], rois: Tensor, + object_feats: Tensor, + batch_img_metas: List[dict]) -> dict: + """Box head forward function used in both training and testing. Returns + all regression, classification results and a intermediate feature. + + Args: + stage (int): The current stage in iterative process. + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + Each dimension means (img_index, x1, y1, x2, y2). + object_feats (Tensor): The object feature extracted from + the previous stage. + batch_img_metas (list[dict]): Meta information of each image. + + Returns: + dict[str, Tensor]: a dictionary of bbox head outputs, + Containing the following results: + + - cls_score (Tensor): The score of each class, has + shape (batch_size, num_proposals, num_classes) + when use focal loss or + (batch_size, num_proposals, num_classes+1) + otherwise. + - decoded_bboxes (Tensor): The regression results + with shape (batch_size, num_proposal, 4). + The last dimension 4 represents + [tl_x, tl_y, br_x, br_y]. + - object_feats (Tensor): The object feature extracted + from current stage + - detached_cls_scores (list[Tensor]): The detached + classification results, length is batch_size, and + each tensor has shape (num_proposal, num_classes). + - detached_proposals (list[tensor]): The detached + regression results, length is batch_size, and each + tensor has shape (num_proposal, 4). The last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. 
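Note:
    The regression output is decoded into absolute boxes by reusing
    ``refine_bboxes`` with placeholder targets (all-zero labels and
    ``pos_is_gt``), so every input RoI is kept and simply decoded; the
    detached copies are what the next stage's label assignment consumes.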
+ """ + num_imgs = len(batch_img_metas) + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + cls_score, bbox_pred, object_feats, attn_feats = bbox_head( + bbox_feats, object_feats) + + fake_bbox_results = dict( + rois=rois, + bbox_targets=(rois.new_zeros(len(rois), dtype=torch.long), None), + bbox_pred=bbox_pred.view(-1, bbox_pred.size(-1)), + cls_score=cls_score.view(-1, cls_score.size(-1))) + fake_sampling_results = [ + InstanceData(pos_is_gt=rois.new_zeros(object_feats.size(1))) + for _ in range(len(batch_img_metas)) + ] + + results_list = bbox_head.refine_bboxes( + sampling_results=fake_sampling_results, + bbox_results=fake_bbox_results, + batch_img_metas=batch_img_metas) + proposal_list = [res.bboxes for res in results_list] + bbox_results = dict( + cls_score=cls_score, + decoded_bboxes=torch.cat(proposal_list), + object_feats=object_feats, + attn_feats=attn_feats, + # detach then use it in label assign + detached_cls_scores=[ + cls_score[i].detach() for i in range(num_imgs) + ], + detached_proposals=[item.detach() for item in proposal_list]) + + return bbox_results + + def _mask_forward(self, stage: int, x: Tuple[Tensor], rois: Tensor, + attn_feats) -> dict: + """Mask head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + attn_feats (Tensot): Intermediate feature get from the last + diihead, has shape + (batch_size*num_proposals, feature_dimensions) + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + """ + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + mask_preds = mask_head(mask_feats, attn_feats) + + mask_results = dict(mask_preds=mask_preds) + return mask_results + + def mask_loss(self, stage: int, x: Tuple[Tensor], bbox_results: dict, + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + bbox_results (dict): Results obtained from `bbox_loss`. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `loss_mask` (dict): A dictionary of mask loss components. 
+ """ + attn_feats = bbox_results['attn_feats'] + sampling_results = bbox_results['sampling_results'] + + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + + attn_feats = torch.cat([ + feats[res.pos_inds] + for (feats, res) in zip(attn_feats, sampling_results) + ]) + mask_results = self._mask_forward(stage, x, pos_rois, attn_feats) + + mask_loss_and_target = self.mask_head[stage].loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=rcnn_train_cfg) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (List[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: a dictionary of loss components of all stage. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + object_feats = torch.cat( + [res.pop('features')[None, ...] for res in rpn_results_list]) + results_list = rpn_results_list + losses = {} + for stage in range(self.num_stages): + stage_loss_weight = self.stage_loss_weights[stage] + + # bbox head forward and loss + bbox_results = self.bbox_loss( + stage=stage, + x=x, + object_feats=object_feats, + results_list=results_list, + batch_img_metas=batch_img_metas, + batch_gt_instances=batch_gt_instances) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + if self.with_mask: + mask_results = self.mask_loss( + stage=stage, + x=x, + bbox_results=bbox_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[stage]) + + for name, value in mask_results['loss_mask'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + object_feats = bbox_results['object_feats'] + results_list = bbox_results['results_list'] + return losses + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x(tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
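Note:
    The per-image scores of shape ``(num_proposals, num_classes)`` are
    flattened and a single top-k is taken over proposal-class pairs, so
    a flat index ``i`` decodes as ``label = i % num_classes`` and
    ``proposal = i // num_classes``; e.g. with ``num_classes = 80``,
    index 163 selects proposal 2, class 3.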
+ """ + proposal_list = [res.bboxes for res in rpn_results_list] + object_feats = torch.cat( + [res.pop('features')[None, ...] for res in rpn_results_list]) + if all([proposal.shape[0] == 0 for proposal in proposal_list]): + # There is no proposal in the whole batch + return empty_instances( + batch_img_metas, x[0].device, task_type='bbox') + + for stage in range(self.num_stages): + rois = bbox2roi(proposal_list) + bbox_results = self._bbox_forward(stage, x, rois, object_feats, + batch_img_metas) + object_feats = bbox_results['object_feats'] + cls_score = bbox_results['cls_score'] + proposal_list = bbox_results['detached_proposals'] + + num_classes = self.bbox_head[-1].num_classes + + if self.bbox_head[-1].loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + else: + cls_score = cls_score.softmax(-1)[..., :-1] + + topk_inds_list = [] + results_list = [] + for img_id in range(len(batch_img_metas)): + cls_score_per_img = cls_score[img_id] + scores_per_img, topk_inds = cls_score_per_img.flatten(0, 1).topk( + self.test_cfg.max_per_img, sorted=False) + labels_per_img = topk_inds % num_classes + bboxes_per_img = proposal_list[img_id][topk_inds // num_classes] + topk_inds_list.append(topk_inds) + if rescale and bboxes_per_img.size(0) > 0: + assert batch_img_metas[img_id].get('scale_factor') is not None + scale_factor = bboxes_per_img.new_tensor( + batch_img_metas[img_id]['scale_factor']).repeat((1, 2)) + bboxes_per_img = ( + bboxes_per_img.view(bboxes_per_img.size(0), -1, 4) / + scale_factor).view(bboxes_per_img.size()[0], -1) + + results = InstanceData() + results.bboxes = bboxes_per_img + results.scores = scores_per_img + results.labels = labels_per_img + results_list.append(results) + if self.with_mask: + for img_id in range(len(batch_img_metas)): + # add positive information in InstanceData to predict + # mask results in `mask_head`. + proposals = bbox_results['detached_proposals'][img_id] + topk_inds = topk_inds_list[img_id] + attn_feats = bbox_results['attn_feats'][img_id] + + results_list[img_id].proposals = proposals + results_list[img_id].topk_inds = topk_inds + results_list[img_id].attn_feats = attn_feats + return results_list + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. Each item usually contains following keys: + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - proposal (Tensor): Bboxes predicted from bbox_head, + has a shape (num_instances, 4). + - topk_inds (Tensor): Topk indices of each image, has + shape (num_instances, ) + - attn_feats (Tensor): Intermediate feature get from the last + diihead, has shape (num_instances, feature_dimensions) + + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. 
+ + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + proposal_list = [res.pop('proposals') for res in results_list] + topk_inds_list = [res.pop('topk_inds') for res in results_list] + attn_feats = torch.cat( + [res.pop('attn_feats')[None, ...] for res in results_list]) + + rois = bbox2roi(proposal_list) + + if rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + last_stage = self.num_stages - 1 + mask_results = self._mask_forward(last_stage, x, rois, attn_feats) + + num_imgs = len(batch_img_metas) + mask_results['mask_preds'] = mask_results['mask_preds'].reshape( + num_imgs, -1, *mask_results['mask_preds'].size()[1:]) + num_classes = self.bbox_head[-1].num_classes + + mask_preds = [] + for img_id in range(num_imgs): + topk_inds = topk_inds_list[img_id] + masks_per_img = mask_results['mask_preds'][img_id].flatten( + 0, 1)[topk_inds] + masks_per_img = masks_per_img[:, None, + ...].repeat(1, num_classes, 1, 1) + mask_preds.append(masks_per_img) + results_list = self.mask_head[-1].predict_by_feat( + mask_preds, + results_list, + batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + + return results_list + + # TODO: Need to refactor later + def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (List[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + all_stage_bbox_results = [] + object_feats = torch.cat( + [res.pop('features')[None, ...] 
for res in rpn_results_list]) + results_list = rpn_results_list + if self.with_bbox: + for stage in range(self.num_stages): + bbox_results = self.bbox_loss( + stage=stage, + x=x, + results_list=results_list, + object_feats=object_feats, + batch_img_metas=batch_img_metas, + batch_gt_instances=batch_gt_instances) + bbox_results.pop('loss_bbox') + # torch.jit does not support obj:SamplingResult + bbox_results.pop('results_list') + bbox_res = bbox_results.copy() + bbox_res.pop('sampling_results') + all_stage_bbox_results.append((bbox_res, )) + + if self.with_mask: + attn_feats = bbox_results['attn_feats'] + sampling_results = bbox_results['sampling_results'] + + pos_rois = bbox2roi( + [res.pos_priors for res in sampling_results]) + + attn_feats = torch.cat([ + feats[res.pos_inds] + for (feats, res) in zip(attn_feats, sampling_results) + ]) + mask_results = self._mask_forward(stage, x, pos_rois, + attn_feats) + all_stage_bbox_results[-1] += (mask_results, ) + return tuple(all_stage_bbox_results) diff --git a/mmdetection/mmdet/models/roi_heads/standard_roi_head.py b/mmdetection/mmdet/models/roi_heads/standard_roi_head.py new file mode 100644 index 0000000..8d168eb --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/standard_roi_head.py @@ -0,0 +1,419 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import DetDataSample, SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .base_roi_head import BaseRoIHead + + +@MODELS.register_module() +class StandardRoIHead(BaseRoIHead): + """Simplest base roi head including one bbox head and one mask head.""" + + def init_assigner_sampler(self) -> None: + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.bbox_sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + + def init_bbox_head(self, bbox_roi_extractor: ConfigType, + bbox_head: ConfigType) -> None: + """Initialize box head and box roi extractor. + + Args: + bbox_roi_extractor (dict or ConfigDict): Config of box + roi extractor. + bbox_head (dict or ConfigDict): Config of box in box head. + """ + self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor) + self.bbox_head = MODELS.build(bbox_head) + + def init_mask_head(self, mask_roi_extractor: ConfigType, + mask_head: ConfigType) -> None: + """Initialize mask head and mask roi extractor. + + Args: + mask_roi_extractor (dict or ConfigDict): Config of mask roi + extractor. + mask_head (dict or ConfigDict): Config of mask in mask head. + """ + if mask_roi_extractor is not None: + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) + self.share_roi_extractor = False + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + self.mask_head = MODELS.build(mask_head) + + # TODO: Need to refactor later + def forward(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList = None) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. 
+ rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + results = () + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + bbox_results = self._bbox_forward(x, rois) + results = results + (bbox_results['cls_score'], + bbox_results['bbox_pred']) + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_results = self._mask_forward(x, mask_rois) + results = results + (mask_results['mask_preds'], ) + return results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + losses = dict() + # bbox head loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) + + return losses + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. 
+ """ + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + return bbox_results + + def mask_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult], bbox_feats: Tensor, + batch_gt_instances: InstanceList) -> dict: + """Perform forward propagation and loss calculation of the mask head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + bbox_feats (Tensor): Extract bbox RoI features. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `mask_feats` (Tensor): Extract mask RoI features. + - `mask_targets` (Tensor): Mask target of each positive\ + proposals in the image. + - `loss_mask` (dict): A dictionary of mask loss components. + """ + if not self.share_roi_extractor: + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + else: + pos_inds = [] + device = bbox_feats.device + for res in sampling_results: + pos_inds.append( + torch.ones( + res.pos_priors.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds.append( + torch.zeros( + res.neg_priors.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds = torch.cat(pos_inds) + + mask_results = self._mask_forward( + x, pos_inds=pos_inds, bbox_feats=bbox_feats) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg) + + mask_results.update(loss_mask=mask_loss_and_target['loss_mask']) + return mask_results + + def _mask_forward(self, + x: Tuple[Tensor], + rois: Tensor = None, + pos_inds: Optional[Tensor] = None, + bbox_feats: Optional[Tensor] = None) -> dict: + """Mask head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. 
+ rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + pos_inds (Tensor, optional): Indices of positive samples. + Defaults to None. + bbox_feats (Tensor): Extract bbox RoI features. Defaults to None. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `mask_feats` (Tensor): Extract mask RoI features. + """ + assert ((rois is not None) ^ + (pos_inds is not None and bbox_feats is not None)) + if rois is not None: + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + else: + assert bbox_feats is not None + mask_feats = bbox_feats[pos_inds] + + mask_preds = self.mask_head(mask_feats) + mask_results = dict(mask_preds=mask_preds, mask_feats=mask_feats) + return mask_results + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + proposals = [res.bboxes for res in rpn_results_list] + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, + rois.device, + task_type='bbox', + box_type=self.bbox_head.predict_box_type, + num_classes=self.bbox_head.num_classes, + score_per_cls=rcnn_test_cfg is None) + + bbox_results = self._bbox_forward(x, rois) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'] + bbox_preds = bbox_results['bbox_pred'] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + + # some detector with_reg is False, bbox_preds will be None + if bbox_preds is not None: + # TODO move this to a sabl_roi_head + # the bbox prediction of some detectors like SABL is not Tensor + if isinstance(bbox_preds, torch.Tensor): + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + else: + bbox_preds = self.bbox_head.bbox_pred_split( + bbox_preds, num_proposals_per_img) + else: + bbox_preds = (None, ) * len(proposals) + + result_list = self.bbox_head.predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=rcnn_test_cfg, + rescale=rescale) + return result_list + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. 
+ + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + # don't need to consider aug_test. + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + num_mask_rois_per_img = [len(res) for res in results_list] + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + + # TODO: Handle the case where rescale is false + results_list = self.mask_head.predict_by_feat( + mask_preds=mask_preds, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + return results_list diff --git a/mmdetection/mmdet/models/roi_heads/test_mixins.py b/mmdetection/mmdet/models/roi_heads/test_mixins.py new file mode 100644 index 0000000..9404904 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/test_mixins.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# TODO: delete this file after refactor +import sys + +import torch + +from mmdet.models.layers import multiclass_nms +from mmdet.models.test_time_augs import merge_aug_bboxes, merge_aug_masks +from mmdet.structures.bbox import bbox2roi, bbox_mapping + +if sys.version_info >= (3, 7): + from mmdet.utils.contextmanagers import completed + + +class BBoxTestMixin: + + if sys.version_info >= (3, 7): + # TODO: Currently not supported + async def async_test_bboxes(self, + x, + img_metas, + proposals, + rcnn_test_cfg, + rescale=False, + **kwargs): + """Asynchronized test for box head without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + sleep_interval = rcnn_test_cfg.get('async_sleep_interval', 0.017) + + async with completed( + __name__, 'bbox_head_forward', + sleep_interval=sleep_interval): + cls_score, bbox_pred = self.bbox_head(roi_feats) + + img_shape = img_metas[0]['img_shape'] + scale_factor = img_metas[0]['scale_factor'] + det_bboxes, det_labels = self.bbox_head.get_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + # TODO: Currently not supported + def aug_test_bboxes(self, feats, img_metas, rpn_results_list, + rcnn_test_cfg): + """Test det bboxes with test time augmentation.""" + aug_bboxes = [] + aug_scores = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = img_meta[0]['flip_direction'] + # TODO more flexible + proposals = bbox_mapping(rpn_results_list[0][:, :4], img_shape, + scale_factor, flip, flip_direction) + rois = bbox2roi([proposals]) + bbox_results = self.bbox_forward(x, rois) + bboxes, scores = self.bbox_head.get_bboxes( + rois, + bbox_results['cls_score'], + bbox_results['bbox_pred'], + img_shape, + scale_factor, + rescale=False, + cfg=None) + aug_bboxes.append(bboxes) + aug_scores.append(scores) + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) + if merged_bboxes.shape[0] == 0: + # There is no proposal in the single image + det_bboxes = merged_bboxes.new_zeros(0, 5) + det_labels = merged_bboxes.new_zeros((0, ), dtype=torch.long) + else: + det_bboxes, det_labels = multiclass_nms(merged_bboxes, + merged_scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img) + return det_bboxes, det_labels + + +class MaskTestMixin: + + if sys.version_info >= (3, 7): + # TODO: Currently not supported + async def async_test_mask(self, + x, + img_metas, + det_bboxes, + det_labels, + rescale=False, + mask_test_cfg=None): + """Asynchronized test for mask head without augmentation.""" + # image shape of the first image in the batch (only one) + ori_shape = img_metas[0]['ori_shape'] + scale_factor = img_metas[0]['scale_factor'] + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + if rescale and not isinstance(scale_factor, + (float, torch.Tensor)): + scale_factor = det_bboxes.new_tensor(scale_factor) + _bboxes = ( + det_bboxes[:, :4] * + scale_factor if rescale else det_bboxes) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + 
x[:len(self.mask_roi_extractor.featmap_strides)], + mask_rois) + + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + if mask_test_cfg and \ + mask_test_cfg.get('async_sleep_interval'): + sleep_interval = mask_test_cfg['async_sleep_interval'] + else: + sleep_interval = 0.035 + async with completed( + __name__, + 'mask_head_forward', + sleep_interval=sleep_interval): + mask_pred = self.mask_head(mask_feats) + segm_result = self.mask_head.get_results( + mask_pred, _bboxes, det_labels, self.test_cfg, ori_shape, + scale_factor, rescale) + return segm_result + + # TODO: Currently not supported + def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels): + """Test for mask head with test time augmentation.""" + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + aug_masks = [] + for x, img_meta in zip(feats, img_metas): + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = img_meta[0]['flip_direction'] + _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, + scale_factor, flip, flip_direction) + mask_rois = bbox2roi([_bboxes]) + mask_results = self._mask_forward(x, mask_rois) + # convert to numpy array to save memory + aug_masks.append( + mask_results['mask_pred'].sigmoid().cpu().numpy()) + merged_masks = merge_aug_masks(aug_masks, img_metas, self.test_cfg) + + ori_shape = img_metas[0][0]['ori_shape'] + scale_factor = det_bboxes.new_ones(4) + segm_result = self.mask_head.get_results( + merged_masks, + det_bboxes, + det_labels, + self.test_cfg, + ori_shape, + scale_factor=scale_factor, + rescale=False) + return segm_result diff --git a/mmdetection/mmdet/models/roi_heads/trident_roi_head.py b/mmdetection/mmdet/models/roi_heads/trident_roi_head.py new file mode 100644 index 0000000..5215327 --- /dev/null +++ b/mmdetection/mmdet/models/roi_heads/trident_roi_head.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from mmcv.ops import batched_nms +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class TridentRoIHead(StandardRoIHead): + """Trident roi head. + + Args: + num_branch (int): Number of branches in TridentNet. + test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + """ + + def __init__(self, num_branch: int, test_branch_idx: int, + **kwargs) -> None: + self.num_branch = num_branch + self.test_branch_idx = test_branch_idx + super().__init__(**kwargs) + + def merge_trident_bboxes(self, + trident_results: InstanceList) -> InstanceData: + """Merge bbox predictions of each branch. + + Args: + trident_results (List[:obj:`InstanceData`]): A list of InstanceData + predicted from every branch. + + Returns: + :obj:`InstanceData`: merged InstanceData. 
+ """ + bboxes = torch.cat([res.bboxes for res in trident_results]) + scores = torch.cat([res.scores for res in trident_results]) + labels = torch.cat([res.labels for res in trident_results]) + + nms_cfg = self.test_cfg['nms'] + results = InstanceData() + if bboxes.numel() == 0: + results.bboxes = bboxes + results.scores = scores + results.labels = labels + else: + det_bboxes, keep = batched_nms(bboxes, scores, labels, nms_cfg) + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = labels[keep] + + if self.test_cfg['max_per_img'] > 0: + results = results[:self.test_cfg['max_per_img']] + return results + + def predict(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + - Compute prediction bbox and label per branch. + - Merge predictions of each branch according to scores of + bboxes, i.e., bboxes with higher score are kept to give + top-k prediction. + + Args: + x (tuple[Tensor]): Features from upstream network. Each + has shape (N, C, H, W). + rpn_results_list (list[:obj:`InstanceData`]): list of region + proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results to + the original image. Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results_list = super().predict( + x=x, + rpn_results_list=rpn_results_list, + batch_data_samples=batch_data_samples, + rescale=rescale) + + num_branch = self.num_branch \ + if self.training or self.test_branch_idx == -1 else 1 + + merged_results_list = [] + for i in range(len(batch_data_samples) // num_branch): + merged_results_list.append( + self.merge_trident_bboxes(results_list[i * num_branch:(i + 1) * + num_branch])) + return merged_results_list diff --git a/mmdetection/mmdet/models/seg_heads/__init__.py b/mmdetection/mmdet/models/seg_heads/__init__.py new file mode 100644 index 0000000..b489a90 --- /dev/null +++ b/mmdetection/mmdet/models/seg_heads/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .panoptic_fpn_head import PanopticFPNHead # noqa: F401,F403 +from .panoptic_fusion_heads import * # noqa: F401,F403 diff --git a/mmdetection/mmdet/models/seg_heads/base_semantic_head.py b/mmdetection/mmdet/models/seg_heads/base_semantic_head.py new file mode 100644 index 0000000..1db7154 --- /dev/null +++ b/mmdetection/mmdet/models/seg_heads/base_semantic_head.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple, Union + +import torch.nn.functional as F +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptMultiConfig + + +@MODELS.register_module() +class BaseSemanticHead(BaseModule, metaclass=ABCMeta): + """Base module of Semantic Head. 
+ + Args: + num_classes (int): the number of classes. + seg_rescale_factor (float): the rescale factor for ``gt_sem_seg``, + which equals to ``1 / output_strides``. The output_strides is + for ``seg_preds``. Defaults to 1 / 4. + init_cfg (Optional[Union[:obj:`ConfigDict`, dict]]): the initialization + config. + loss_seg (Union[:obj:`ConfigDict`, dict]): the loss of the semantic + head. + """ + + def __init__(self, + num_classes: int, + seg_rescale_factor: float = 1 / 4., + loss_seg: ConfigType = dict( + type='CrossEntropyLoss', + ignore_index=255, + loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.loss_seg = MODELS.build(loss_seg) + self.num_classes = num_classes + self.seg_rescale_factor = seg_rescale_factor + + @abstractmethod + def forward(self, x: Union[Tensor, Tuple[Tensor]]) -> Dict[str, Tensor]: + """Placeholder of forward function. + + Args: + x (Tensor): Feature maps. + + Returns: + Dict[str, Tensor]: A dictionary, including features + and predicted scores. Required keys: 'seg_preds' + and 'feats'. + """ + pass + + @abstractmethod + def loss(self, x: Union[Tensor, Tuple[Tensor]], + batch_data_samples: SampleList) -> Dict[str, Tensor]: + """ + Args: + x (Union[Tensor, Tuple[Tensor]]): Feature maps. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Args: + x (Tensor): Feature maps. + + Returns: + Dict[str, Tensor]: The loss of semantic head. + """ + pass + + def predict(self, + x: Union[Tensor, Tuple[Tensor]], + batch_img_metas: List[dict], + rescale: bool = False) -> List[Tensor]: + """Test without Augmentation. + + Args: + x (Union[Tensor, Tuple[Tensor]]): Feature maps. + batch_img_metas (List[dict]): List of image information. + rescale (bool): Whether to rescale the results. + Defaults to False. + + Returns: + list[Tensor]: semantic segmentation logits. + """ + seg_preds = self.forward(x)['seg_preds'] + seg_preds = F.interpolate( + seg_preds, + size=batch_img_metas[0]['batch_input_shape'], + mode='bilinear', + align_corners=False) + seg_preds = [seg_preds[i] for i in range(len(batch_img_metas))] + + if rescale: + seg_pred_list = [] + for i in range(len(batch_img_metas)): + h, w = batch_img_metas[i]['img_shape'] + seg_pred = seg_preds[i][:, :h, :w] + + h, w = batch_img_metas[i]['ori_shape'] + seg_pred = F.interpolate( + seg_pred[None], + size=(h, w), + mode='bilinear', + align_corners=False)[0] + seg_pred_list.append(seg_pred) + else: + seg_pred_list = seg_preds + + return seg_pred_list diff --git a/mmdetection/mmdet/models/seg_heads/panoptic_fpn_head.py b/mmdetection/mmdet/models/seg_heads/panoptic_fpn_head.py new file mode 100644 index 0000000..8d8b901 --- /dev/null +++ b/mmdetection/mmdet/models/seg_heads/panoptic_fpn_head.py @@ -0,0 +1,174 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import ModuleList +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from ..layers import ConvUpsample +from ..utils import interpolate_as +from .base_semantic_head import BaseSemanticHead + + +@MODELS.register_module() +class PanopticFPNHead(BaseSemanticHead): + """PanopticFPNHead used in Panoptic FPN. 
+ + In this head, the number of output channels is ``num_stuff_classes + + 1``, including all stuff classes and one thing class. The stuff + classes will be reset from ``0`` to ``num_stuff_classes - 1``, the + thing classes will be merged to ``num_stuff_classes``-th channel. + + Arg: + num_things_classes (int): Number of thing classes. Default: 80. + num_stuff_classes (int): Number of stuff classes. Default: 53. + in_channels (int): Number of channels in the input feature + map. + inner_channels (int): Number of channels in inner features. + start_level (int): The start level of the input features + used in PanopticFPN. + end_level (int): The end level of the used features, the + ``end_level``-th layer will not be used. + conv_cfg (Optional[Union[ConfigDict, dict]]): Dictionary to construct + and config conv layer. + norm_cfg (Union[ConfigDict, dict]): Dictionary to construct and config + norm layer. Use ``GN`` by default. + init_cfg (Optional[Union[ConfigDict, dict]]): Initialization config + dict. + loss_seg (Union[ConfigDict, dict]): the loss of the semantic head. + """ + + def __init__(self, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + in_channels: int = 256, + inner_channels: int = 128, + start_level: int = 0, + end_level: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + loss_seg: ConfigType = dict( + type='CrossEntropyLoss', ignore_index=-1, + loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + seg_rescale_factor = 1 / 2**(start_level + 2) + super().__init__( + num_classes=num_stuff_classes + 1, + seg_rescale_factor=seg_rescale_factor, + loss_seg=loss_seg, + init_cfg=init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + # Used feature layers are [start_level, end_level) + self.start_level = start_level + self.end_level = end_level + self.num_stages = end_level - start_level + self.inner_channels = inner_channels + + self.conv_upsample_layers = ModuleList() + for i in range(start_level, end_level): + self.conv_upsample_layers.append( + ConvUpsample( + in_channels, + inner_channels, + num_layers=i if i > 0 else 1, + num_upsample=i if i > 0 else 0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + )) + self.conv_logits = nn.Conv2d(inner_channels, self.num_classes, 1) + + def _set_things_to_void(self, gt_semantic_seg: Tensor) -> Tensor: + """Merge thing classes to one class. + + In PanopticFPN, the background labels will be reset from `0` to + `self.num_stuff_classes-1`, the foreground labels will be merged to + `self.num_stuff_classes`-th channel. + """ + gt_semantic_seg = gt_semantic_seg.int() + fg_mask = gt_semantic_seg < self.num_things_classes + bg_mask = (gt_semantic_seg >= self.num_things_classes) * ( + gt_semantic_seg < self.num_things_classes + self.num_stuff_classes) + + new_gt_seg = torch.clone(gt_semantic_seg) + new_gt_seg = torch.where(bg_mask, + gt_semantic_seg - self.num_things_classes, + new_gt_seg) + new_gt_seg = torch.where(fg_mask, + fg_mask.int() * self.num_stuff_classes, + new_gt_seg) + return new_gt_seg + + def loss(self, x: Union[Tensor, Tuple[Tensor]], + batch_data_samples: SampleList) -> Dict[str, Tensor]: + """ + Args: + x (Union[Tensor, Tuple[Tensor]]): Feature maps. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + Dict[str, Tensor]: The loss of semantic head. 
+ """ + seg_preds = self(x)['seg_preds'] + gt_semantic_segs = [ + data_sample.gt_sem_seg.sem_seg + for data_sample in batch_data_samples + ] + + gt_semantic_segs = torch.stack(gt_semantic_segs) + if self.seg_rescale_factor != 1.0: + gt_semantic_segs = F.interpolate( + gt_semantic_segs.float(), + scale_factor=self.seg_rescale_factor, + mode='nearest').squeeze(1) + + # Things classes will be merged to one class in PanopticFPN. + gt_semantic_segs = self._set_things_to_void(gt_semantic_segs) + + if seg_preds.shape[-2:] != gt_semantic_segs.shape[-2:]: + seg_preds = interpolate_as(seg_preds, gt_semantic_segs) + seg_preds = seg_preds.permute((0, 2, 3, 1)) + + loss_seg = self.loss_seg( + seg_preds.reshape(-1, self.num_classes), # => [NxHxW, C] + gt_semantic_segs.reshape(-1).long()) + + return dict(loss_seg=loss_seg) + + def init_weights(self) -> None: + """Initialize weights.""" + super().init_weights() + nn.init.normal_(self.conv_logits.weight.data, 0, 0.01) + self.conv_logits.bias.data.zero_() + + def forward(self, x: Tuple[Tensor]) -> Dict[str, Tensor]: + """Forward. + + Args: + x (Tuple[Tensor]): Multi scale Feature maps. + + Returns: + dict[str, Tensor]: semantic segmentation predictions and + feature maps. + """ + # the number of subnets must be not more than + # the length of features. + assert self.num_stages <= len(x) + + feats = [] + for i, layer in enumerate(self.conv_upsample_layers): + f = layer(x[self.start_level + i]) + feats.append(f) + + seg_feats = torch.sum(torch.stack(feats, dim=0), dim=0) + seg_preds = self.conv_logits(seg_feats) + out = dict(seg_preds=seg_preds, seg_feats=seg_feats) + return out diff --git a/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py b/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py new file mode 100644 index 0000000..41625a6 --- /dev/null +++ b/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_panoptic_fusion_head import \ + BasePanopticFusionHead # noqa: F401,F403 +from .heuristic_fusion_head import HeuristicFusionHead # noqa: F401,F403 +from .maskformer_fusion_head import MaskFormerFusionHead # noqa: F401,F403 diff --git a/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py b/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py new file mode 100644 index 0000000..f6b20e1 --- /dev/null +++ b/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta, abstractmethod + +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta): + """Base class for panoptic heads.""" + + def __init__(self, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + test_cfg: OptConfigType = None, + loss_panoptic: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = num_things_classes + num_stuff_classes + self.test_cfg = test_cfg + + if loss_panoptic: + self.loss_panoptic = MODELS.build(loss_panoptic) + else: + self.loss_panoptic = None + + @property + def with_loss(self) -> bool: + """bool: whether the panoptic head contains loss function.""" + return self.loss_panoptic is not None + + @abstractmethod + def loss(self, **kwargs): + """Loss function.""" + + @abstractmethod + def predict(self, **kwargs): + """Predict function.""" diff --git a/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py b/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py new file mode 100644 index 0000000..7a4a420 --- /dev/null +++ b/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from mmengine.structures import InstanceData, PixelData +from torch import Tensor + +from mmdet.evaluation.functional import INSTANCE_OFFSET +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig, PixelList +from .base_panoptic_fusion_head import BasePanopticFusionHead + + +@MODELS.register_module() +class HeuristicFusionHead(BasePanopticFusionHead): + """Fusion Head with Heuristic method.""" + + def __init__(self, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super().__init__( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + test_cfg=test_cfg, + loss_panoptic=None, + init_cfg=init_cfg, + **kwargs) + + def loss(self, **kwargs) -> dict: + """HeuristicFusionHead has no training loss.""" + return dict() + + def _lay_masks(self, + mask_results: InstanceData, + overlap_thr: float = 0.5) -> Tensor: + """Lay instance masks to a result map. + + Args: + mask_results (:obj:`InstanceData`): Instance segmentation results, + each contains ``bboxes``, ``labels``, ``scores`` and ``masks``. + overlap_thr (float): Threshold to determine whether two masks + overlap. default: 0.5. + + Returns: + Tensor: The result map, (H, W). 
+ """ + bboxes = mask_results.bboxes + scores = mask_results.scores + labels = mask_results.labels + masks = mask_results.masks + + num_insts = bboxes.shape[0] + id_map = torch.zeros( + masks.shape[-2:], device=bboxes.device, dtype=torch.long) + if num_insts == 0: + return id_map, labels + + # Sort by score to use heuristic fusion + order = torch.argsort(-scores) + bboxes = bboxes[order] + labels = labels[order] + segm_masks = masks[order] + + instance_id = 1 + left_labels = [] + for idx in range(bboxes.shape[0]): + _cls = labels[idx] + _mask = segm_masks[idx] + instance_id_map = torch.ones_like( + _mask, dtype=torch.long) * instance_id + area = _mask.sum() + if area == 0: + continue + + pasted = id_map > 0 + intersect = (_mask * pasted).sum() + if (intersect / (area + 1e-5)) > overlap_thr: + continue + + _part = _mask * (~pasted) + id_map = torch.where(_part, instance_id_map, id_map) + left_labels.append(_cls) + instance_id += 1 + + if len(left_labels) > 0: + instance_labels = torch.stack(left_labels) + else: + instance_labels = bboxes.new_zeros((0, ), dtype=torch.long) + assert instance_id == (len(instance_labels) + 1) + return id_map, instance_labels + + def _predict_single(self, mask_results: InstanceData, seg_preds: Tensor, + **kwargs) -> PixelData: + """Fuse the results of instance and semantic segmentations. + + Args: + mask_results (:obj:`InstanceData`): Instance segmentation results, + each contains ``bboxes``, ``labels``, ``scores`` and ``masks``. + seg_preds (Tensor): The semantic segmentation results, + (num_stuff + 1, H, W). + + Returns: + Tensor: The panoptic segmentation result, (H, W). + """ + id_map, labels = self._lay_masks(mask_results, + self.test_cfg.mask_overlap) + + seg_results = seg_preds.argmax(dim=0) + seg_results = seg_results + self.num_things_classes + + pan_results = seg_results + instance_id = 1 + for idx in range(len(mask_results)): + _mask = id_map == (idx + 1) + if _mask.sum() == 0: + continue + _cls = labels[idx] + # simply trust detection + segment_id = _cls + instance_id * INSTANCE_OFFSET + pan_results[_mask] = segment_id + instance_id += 1 + + ids, counts = torch.unique( + pan_results % INSTANCE_OFFSET, return_counts=True) + stuff_ids = ids[ids >= self.num_things_classes] + stuff_counts = counts[ids >= self.num_things_classes] + ignore_stuff_ids = stuff_ids[ + stuff_counts < self.test_cfg.stuff_area_limit] + + assert pan_results.ndim == 2 + pan_results[(pan_results.unsqueeze(2) == ignore_stuff_ids.reshape( + 1, 1, -1)).any(dim=2)] = self.num_classes + + pan_results = PixelData(sem_seg=pan_results[None].int()) + return pan_results + + def predict(self, mask_results_list: InstanceList, + seg_preds_list: List[Tensor], **kwargs) -> PixelList: + """Predict results by fusing the results of instance and semantic + segmentations. + + Args: + mask_results_list (list[:obj:`InstanceData`]): Instance + segmentation results, each contains ``bboxes``, ``labels``, + ``scores`` and ``masks``. + seg_preds_list (Tensor): List of semantic segmentation results. + + Returns: + List[PixelData]: Panoptic segmentation result. 
+        """
+        results_list = [
+            self._predict_single(mask_results_list[i], seg_preds_list[i])
+            for i in range(len(mask_results_list))
+        ]
+
+        return results_list
diff --git a/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py b/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py
new file mode 100644
index 0000000..1b76e6b
--- /dev/null
+++ b/mmdetection/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py
@@ -0,0 +1,266 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData, PixelData
+from torch import Tensor
+
+from mmdet.evaluation.functional import INSTANCE_OFFSET
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.mask import mask2bbox
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .base_panoptic_fusion_head import BasePanopticFusionHead
+
+
+@MODELS.register_module()
+class MaskFormerFusionHead(BasePanopticFusionHead):
+    """MaskFormer fusion head which postprocesses results for panoptic
+    segmentation, instance segmentation and semantic segmentation."""
+
+    def __init__(self,
+                 num_things_classes: int = 80,
+                 num_stuff_classes: int = 53,
+                 test_cfg: OptConfigType = None,
+                 loss_panoptic: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs):
+        super().__init__(
+            num_things_classes=num_things_classes,
+            num_stuff_classes=num_stuff_classes,
+            test_cfg=test_cfg,
+            loss_panoptic=loss_panoptic,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def loss(self, **kwargs):
+        """MaskFormerFusionHead has no training loss."""
+        return dict()
+
+    def panoptic_postprocess(self, mask_cls: Tensor,
+                             mask_pred: Tensor) -> PixelData:
+        """Panoptic segmentation inference.
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            :obj:`PixelData`: Panoptic segment result of shape \
+                (h, w), each element in Tensor means: \
+                ``segment_id = _cls + instance_id * INSTANCE_OFFSET``.
+        """
+        object_mask_thr = self.test_cfg.get('object_mask_thr', 0.8)
+        iou_thr = self.test_cfg.get('iou_thr', 0.8)
+        filter_low_score = self.test_cfg.get('filter_low_score', False)
+
+        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
+        mask_pred = mask_pred.sigmoid()
+
+        keep = labels.ne(self.num_classes) & (scores > object_mask_thr)
+        cur_scores = scores[keep]
+        cur_classes = labels[keep]
+        cur_masks = mask_pred[keep]
+
+        cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
+
+        h, w = cur_masks.shape[-2:]
+        panoptic_seg = torch.full((h, w),
+                                  self.num_classes,
+                                  dtype=torch.int32,
+                                  device=cur_masks.device)
+        if cur_masks.shape[0] == 0:
+            # We didn't detect any mask :(
+            pass
+        else:
+            cur_mask_ids = cur_prob_masks.argmax(0)
+            instance_id = 1
+            for k in range(cur_classes.shape[0]):
+                pred_class = int(cur_classes[k].item())
+                isthing = pred_class < self.num_things_classes
+                mask = cur_mask_ids == k
+                mask_area = mask.sum().item()
+                original_area = (cur_masks[k] >= 0.5).sum().item()
+
+                if filter_low_score:
+                    mask = mask & (cur_masks[k] >= 0.5)
+
+                if mask_area > 0 and original_area > 0:
+                    if mask_area / original_area < iou_thr:
+                        continue
+
+                    if not isthing:
+                        # different stuff regions of same class will be
+                        # merged here, and stuff share the instance_id 0.
+                        panoptic_seg[mask] = pred_class
+                    else:
+                        panoptic_seg[mask] = (
+                            pred_class + instance_id * INSTANCE_OFFSET)
+                        instance_id += 1
+
+        return PixelData(sem_seg=panoptic_seg[None])
+
+    def semantic_postprocess(self, mask_cls: Tensor,
+                             mask_pred: Tensor) -> PixelData:
+        """Semantic segmentation postprocess.
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            :obj:`PixelData`: Semantic segment result.
+        """
+        # TODO add semantic segmentation result
+        raise NotImplementedError
+
+    def instance_postprocess(self, mask_cls: Tensor,
+                             mask_pred: Tensor) -> InstanceData:
+        """Instance segmentation postprocess.
+
+        Args:
+            mask_cls (Tensor): Classification outputs of shape
+                (num_queries, cls_out_channels) for an image.
+                Note `cls_out_channels` should include
+                background.
+            mask_pred (Tensor): Mask outputs of shape
+                (num_queries, h, w) for an image.
+
+        Returns:
+            :obj:`InstanceData`: Instance segmentation results.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        max_per_image = self.test_cfg.get('max_per_image', 100)
+        num_queries = mask_cls.shape[0]
+        # shape (num_queries, num_class)
+        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+        # shape (num_queries * num_class, )
+        labels = torch.arange(self.num_classes, device=mask_cls.device).\
+            unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)
+        scores_per_image, top_indices = scores.flatten(0, 1).topk(
+            max_per_image, sorted=False)
+        labels_per_image = labels[top_indices]
+
+        query_indices = top_indices // self.num_classes
+        mask_pred = mask_pred[query_indices]
+
+        # extract things
+        is_thing = labels_per_image < self.num_things_classes
+        scores_per_image = scores_per_image[is_thing]
+        labels_per_image = labels_per_image[is_thing]
+        mask_pred = mask_pred[is_thing]
+
+        mask_pred_binary = (mask_pred > 0).float()
+        mask_scores_per_image = (mask_pred.sigmoid() *
+                                 mask_pred_binary).flatten(1).sum(1) / (
+                                     mask_pred_binary.flatten(1).sum(1) + 1e-6)
+        det_scores = scores_per_image * mask_scores_per_image
+        mask_pred_binary = mask_pred_binary.bool()
+        bboxes = mask2bbox(mask_pred_binary)
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.labels = labels_per_image
+        results.scores = det_scores
+        results.masks = mask_pred_binary
+        return results
+
+    def predict(self,
+                mask_cls_results: Tensor,
+                mask_pred_results: Tensor,
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                **kwargs) -> List[dict]:
+        """Test segmentation without test-time augmentation.
+
+        Only the output of the last decoder layer is used.
+
+        Args:
+            mask_cls_results (Tensor): Mask classification logits,
+                shape (batch_size, num_queries, cls_out_channels).
+                Note `cls_out_channels` should include background.
+            mask_pred_results (Tensor): Mask logits, shape
+                (batch_size, num_queries, h, w).
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): If True, return boxes in
+                original image space. Defaults to False.
+ + Returns: + list[dict]: Instance segmentation \ + results and panoptic segmentation results for each \ + image. + + .. code-block:: none + + [ + { + 'pan_results': PixelData, + 'ins_results': InstanceData, + # semantic segmentation results are not supported yet + 'sem_results': PixelData + }, + ... + ] + """ + batch_img_metas = [ + data_sample.metainfo for data_sample in batch_data_samples + ] + panoptic_on = self.test_cfg.get('panoptic_on', True) + semantic_on = self.test_cfg.get('semantic_on', False) + instance_on = self.test_cfg.get('instance_on', False) + assert not semantic_on, 'segmantic segmentation '\ + 'results are not supported yet.' + + results = [] + for mask_cls_result, mask_pred_result, meta in zip( + mask_cls_results, mask_pred_results, batch_img_metas): + # remove padding + img_height, img_width = meta['img_shape'][:2] + mask_pred_result = mask_pred_result[:, :img_height, :img_width] + + if rescale: + # return result in original resolution + ori_height, ori_width = meta['ori_shape'][:2] + mask_pred_result = F.interpolate( + mask_pred_result[:, None], + size=(ori_height, ori_width), + mode='bilinear', + align_corners=False)[:, 0] + + result = dict() + if panoptic_on: + pan_results = self.panoptic_postprocess( + mask_cls_result, mask_pred_result) + result['pan_results'] = pan_results + + if instance_on: + ins_results = self.instance_postprocess( + mask_cls_result, mask_pred_result) + result['ins_results'] = ins_results + + if semantic_on: + sem_results = self.semantic_postprocess( + mask_cls_result, mask_pred_result) + result['sem_results'] = sem_results + + results.append(result) + + return results diff --git a/mmdetection/mmdet/models/task_modules/__init__.py b/mmdetection/mmdet/models/task_modules/__init__.py new file mode 100644 index 0000000..7bfd8f0 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigners import * # noqa: F401,F403 +from .builder import (ANCHOR_GENERATORS, BBOX_ASSIGNERS, BBOX_CODERS, + BBOX_SAMPLERS, IOU_CALCULATORS, MATCH_COSTS, + PRIOR_GENERATORS, build_anchor_generator, build_assigner, + build_bbox_coder, build_iou_calculator, build_match_cost, + build_prior_generator, build_sampler) +from .coders import * # noqa: F401,F403 +from .prior_generators import * # noqa: F401,F403 +from .samplers import * # noqa: F401,F403 +from .tracking import * # noqa: F401,F403 + +__all__ = [ + 'ANCHOR_GENERATORS', 'PRIOR_GENERATORS', 'BBOX_ASSIGNERS', 'BBOX_SAMPLERS', + 'MATCH_COSTS', 'BBOX_CODERS', 'IOU_CALCULATORS', 'build_anchor_generator', + 'build_prior_generator', 'build_assigner', 'build_sampler', + 'build_iou_calculator', 'build_match_cost', 'build_bbox_coder' +] diff --git a/mmdetection/mmdet/models/task_modules/assigners/__init__.py b/mmdetection/mmdet/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000..4e564f2 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .approx_max_iou_assigner import ApproxMaxIoUAssigner +from .assign_result import AssignResult +from .atss_assigner import ATSSAssigner +from .base_assigner import BaseAssigner +from .center_region_assigner import CenterRegionAssigner +from .dynamic_soft_label_assigner import DynamicSoftLabelAssigner +from .grid_assigner import GridAssigner +from .hungarian_assigner import HungarianAssigner +from .iou2d_calculator import BboxOverlaps2D, BboxOverlaps2D_GLIP +from .match_cost import (BBoxL1Cost, BinaryFocalLossCost, ClassificationCost, + CrossEntropyLossCost, DiceCost, FocalLossCost, + IoUCost) +from .max_iou_assigner import MaxIoUAssigner +from .multi_instance_assigner import MultiInstanceAssigner +from .point_assigner import PointAssigner +from .region_assigner import RegionAssigner +from .sim_ota_assigner import SimOTAAssigner +from .task_aligned_assigner import TaskAlignedAssigner +from .topk_hungarian_assigner import TopkHungarianAssigner +from .uniform_assigner import UniformAssigner + +__all__ = [ + 'BaseAssigner', 'BinaryFocalLossCost', 'MaxIoUAssigner', + 'ApproxMaxIoUAssigner', 'AssignResult', 'PointAssigner', 'ATSSAssigner', + 'CenterRegionAssigner', 'GridAssigner', 'HungarianAssigner', + 'RegionAssigner', 'UniformAssigner', 'SimOTAAssigner', + 'TaskAlignedAssigner', 'TopkHungarianAssigner', 'BBoxL1Cost', + 'ClassificationCost', 'CrossEntropyLossCost', 'DiceCost', 'FocalLossCost', + 'IoUCost', 'BboxOverlaps2D', 'DynamicSoftLabelAssigner', + 'MultiInstanceAssigner', 'BboxOverlaps2D_GLIP' +] diff --git a/mmdetection/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py new file mode 100644 index 0000000..471d54e --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .max_iou_assigner import MaxIoUAssigner + + +@TASK_UTILS.register_module() +class ApproxMaxIoUAssigner(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with an integer indicating the ground truth + index. (semi-positive index: gt label (0-based), -1: background) + + - -1: negative sample, no assigned gt + - semi-positive integer: positive sample, index (0-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + match_low_quality (bool): Whether to allow quality matches. This is + usually allowed for RPN and single stage detectors, but not allowed + in the second stage. + gpu_assign_thr (int): The upper bound of the number of GT for GPU + assign. 
When the number of gt is above this threshold, will assign + on CPU device. Negative values mean not assign on CPU. + iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps + Calculator. + """ + + def __init__( + self, + pos_iou_thr: float, + neg_iou_thr: Union[float, tuple], + min_pos_iou: float = .0, + gt_max_assign_all: bool = True, + ignore_iof_thr: float = -1, + ignore_wrt_candidates: bool = True, + match_low_quality: bool = True, + gpu_assign_thr: int = -1, + iou_calculator: Union[ConfigDict, dict] = dict(type='BboxOverlaps2D') + ) -> None: + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + self.gpu_assign_thr = gpu_assign_thr + self.match_low_quality = match_low_quality + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to approxs. + + This method assign a gt bbox to each group of approxs (bboxes), + each group of approxs is represent by a base approx (bbox) and + will be assigned with -1, or a semi-positive number. + background_label (-1) means negative sample, + semi-positive number is the index (0-based) of assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to background_label (-1) + 2. use the max IoU of each group of approxs to assign + 2. assign proposals whose iou with all gts < neg_iou_thr to background + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). ``approxs`` means the + group of approxs aligned with ``priors``, has shape + (n, num_approxs, 4). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. 
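A minimal standalone sketch, with toy shapes, of the per-group max-IoU reduction the docstring above describes: overlaps are computed for every flattened approx and then reduced with a max over the approxs that belong to the same base square.

    # Toy illustration of the group-wise max reduction used by the assigner.
    import torch

    num_squares, approxs_per_octave, num_gts = 5, 3, 2
    # Pretend these are IoUs of every flattened approx against every gt,
    # laid out as in the assigner: (approxs_per_octave * num_squares, num_gts).
    all_overlaps = torch.rand(approxs_per_octave * num_squares, num_gts)

    overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares,
                                    num_gts).max(dim=0)  # (num_squares, num_gts)
    print(overlaps.shape)  # torch.Size([5, 2]); each square keeps its best approx IoU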
+ """ + squares = pred_instances.priors + approxs = pred_instances.approxs + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + gt_bboxes_ignore = None if gt_instances_ignore is None else \ + gt_instances_ignore.get('bboxes', None) + approxs_per_octave = approxs.size(1) + + num_squares = squares.size(0) + num_gts = gt_bboxes.size(0) + + if num_squares == 0 or num_gts == 0: + # No predictions and/or truth, return empty assignment + overlaps = approxs.new(num_gts, num_squares) + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result + + # re-organize anchors by approxs_per_octave x num_squares + approxs = torch.transpose(approxs, 0, 1).contiguous().view(-1, 4) + assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( + num_gts > self.gpu_assign_thr) else False + # compute overlap and assign gt on CPU when number of GT is large + if assign_on_cpu: + device = approxs.device + approxs = approxs.cpu() + gt_bboxes = gt_bboxes.cpu() + if gt_bboxes_ignore is not None: + gt_bboxes_ignore = gt_bboxes_ignore.cpu() + if gt_labels is not None: + gt_labels = gt_labels.cpu() + all_overlaps = self.iou_calculator(approxs, gt_bboxes) + + overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares, + num_gts).max(dim=0) + overlaps = torch.transpose(overlaps, 0, 1) + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and squares.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = self.iou_calculator( + squares, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = self.iou_calculator( + gt_bboxes_ignore, squares, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + if assign_on_cpu: + assign_result.gt_inds = assign_result.gt_inds.to(device) + assign_result.max_overlaps = assign_result.max_overlaps.to(device) + if assign_result.labels is not None: + assign_result.labels = assign_result.labels.to(device) + return assign_result diff --git a/mmdetection/mmdet/models/task_modules/assigners/assign_result.py b/mmdetection/mmdet/models/task_modules/assigners/assign_result.py new file mode 100644 index 0000000..56ca2c3 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/assign_result.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +from mmdet.utils import util_mixins + + +class AssignResult(util_mixins.NiceRepr): + """Stores assignments between predicted and truth boxes. + + Attributes: + num_gts (int): the number of truth boxes considered when computing this + assignment + gt_inds (Tensor): for each predicted box indicates the 1-based + index of the assigned truth box. 0 means unassigned and -1 means + ignore. + max_overlaps (Tensor): the iou between the predicted box and its + assigned truth box. + labels (Tensor): If specified, for each predicted box + indicates the category label of the assigned truth box. + + Example: + >>> # An assign result between 4 predicted boxes and 9 true boxes + >>> # where only two boxes were assigned. 
+ >>> num_gts = 9 + >>> max_overlaps = torch.LongTensor([0, .5, .9, 0]) + >>> gt_inds = torch.LongTensor([-1, 1, 2, 0]) + >>> labels = torch.LongTensor([0, 3, 4, 0]) + >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + + >>> # Force addition of gt labels (when adding gt as proposals) + >>> new_labels = torch.LongTensor([3, 4, 5]) + >>> self.add_gt_(new_labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + + """ + + def __init__(self, num_gts: int, gt_inds: Tensor, max_overlaps: Tensor, + labels: Tensor) -> None: + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + # Interface for possible user-defined properties + self._extra_properties = {} + + @property + def num_preds(self): + """int: the number of predictions in this assignment""" + return len(self.gt_inds) + + def set_extra_property(self, key, value): + """Set user-defined new property.""" + assert key not in self.info + self._extra_properties[key] = value + + def get_extra_property(self, key): + """Get user-defined property.""" + return self._extra_properties.get(key, None) + + @property + def info(self): + """dict: a dictionary of info about the object""" + basic_info = { + 'num_gts': self.num_gts, + 'num_preds': self.num_preds, + 'gt_inds': self.gt_inds, + 'max_overlaps': self.max_overlaps, + 'labels': self.labels, + } + basic_info.update(self._extra_properties) + return basic_info + + def __nice__(self): + """str: a "nice" summary string describing this assign result""" + parts = [] + parts.append(f'num_gts={self.num_gts!r}') + if self.gt_inds is None: + parts.append(f'gt_inds={self.gt_inds!r}') + else: + parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}') + if self.max_overlaps is None: + parts.append(f'max_overlaps={self.max_overlaps!r}') + else: + parts.append('max_overlaps.shape=' + f'{tuple(self.max_overlaps.shape)!r}') + if self.labels is None: + parts.append(f'labels={self.labels!r}') + else: + parts.append(f'labels.shape={tuple(self.labels.shape)!r}') + return ', '.join(parts) + + @classmethod + def random(cls, **kwargs): + """Create random AssignResult for tests or debugging. + + Args: + num_preds: number of predicted boxes + num_gts: number of true boxes + p_ignore (float): probability of a predicted box assigned to an + ignored truth + p_assigned (float): probability of a predicted box not being + assigned + p_use_label (float | bool): with labels or not + rng (None | int | numpy.random.RandomState): seed or state + + Returns: + :obj:`AssignResult`: Randomly generated assign results. 
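Because the ``gt_inds`` convention (-1 ignore, 0 background, positive 1-based gt index) is used throughout the assigners below, here is a toy decoding sketch with made-up values:

    # Toy illustration of the gt_inds encoding documented above:
    # -1 = ignore, 0 = background, k > 0 = matched to gt number k (1-based).
    import torch

    gt_inds = torch.tensor([-1, 0, 2, 1, 0])
    pos_mask = gt_inds > 0
    pos_pred_idx = pos_mask.nonzero(as_tuple=True)[0]  # predictions with a match
    matched_gt_idx = gt_inds[pos_mask] - 1             # back to 0-based gt indices
    print(pos_pred_idx.tolist(), matched_gt_idx.tolist())  # [2, 3] [1, 0]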
+ + Example: + >>> from mmdet.models.task_modules.assigners.assign_result import * # NOQA + >>> self = AssignResult.random() + >>> print(self.info) + """ + from ..samplers.sampling_result import ensure_rng + rng = ensure_rng(kwargs.get('rng', None)) + + num_gts = kwargs.get('num_gts', None) + num_preds = kwargs.get('num_preds', None) + p_ignore = kwargs.get('p_ignore', 0.3) + p_assigned = kwargs.get('p_assigned', 0.7) + num_classes = kwargs.get('num_classes', 3) + + if num_gts is None: + num_gts = rng.randint(0, 8) + if num_preds is None: + num_preds = rng.randint(0, 16) + + if num_gts == 0: + max_overlaps = torch.zeros(num_preds, dtype=torch.float32) + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + labels = torch.zeros(num_preds, dtype=torch.int64) + + else: + import numpy as np + + # Create an overlap for each predicted box + max_overlaps = torch.from_numpy(rng.rand(num_preds)) + + # Construct gt_inds for each predicted box + is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned) + # maximum number of assignments constraints + n_assigned = min(num_preds, min(num_gts, is_assigned.sum())) + + assigned_idxs = np.where(is_assigned)[0] + rng.shuffle(assigned_idxs) + assigned_idxs = assigned_idxs[0:n_assigned] + assigned_idxs.sort() + + is_assigned[:] = 0 + is_assigned[assigned_idxs] = True + + is_ignore = torch.from_numpy( + rng.rand(num_preds) < p_ignore) & is_assigned + + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + + true_idxs = np.arange(num_gts) + rng.shuffle(true_idxs) + true_idxs = torch.from_numpy(true_idxs) + gt_inds[is_assigned] = true_idxs[:n_assigned].long() + + gt_inds = torch.from_numpy( + rng.randint(1, num_gts + 1, size=num_preds)) + gt_inds[is_ignore] = -1 + gt_inds[~is_assigned] = 0 + max_overlaps[~is_assigned] = 0 + + if num_classes == 0: + labels = torch.zeros(num_preds, dtype=torch.int64) + else: + labels = torch.from_numpy( + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + rng.randint(0, num_classes, size=num_preds)) + labels[~is_assigned] = 0 + + self = cls(num_gts, gt_inds, max_overlaps, labels) + return self + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. + + Args: + gt_labels (torch.Tensor): Labels of gt boxes + """ + self_inds = torch.arange( + 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) + self.gt_inds = torch.cat([self_inds, self.gt_inds]) + + self.max_overlaps = torch.cat( + [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps]) + + self.labels = torch.cat([gt_labels, self.labels]) diff --git a/mmdetection/mmdet/models/task_modules/assigners/atss_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/atss_assigner.py new file mode 100644 index 0000000..2796b99 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/atss_assigner.py @@ -0,0 +1,254 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import List, Optional + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +def bbox_center_distance(bboxes: Tensor, priors: Tensor) -> Tensor: + """Compute the center distance between bboxes and priors. + + Args: + bboxes (Tensor): Shape (n, 4) for , "xyxy" format. + priors (Tensor): Shape (n, 4) for priors, "xyxy" format. + + Returns: + Tensor: Center distances between bboxes and priors. 
+ """ + bbox_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 + bbox_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 + bbox_points = torch.stack((bbox_cx, bbox_cy), dim=1) + + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + priors_points = torch.stack((priors_cx, priors_cy), dim=1) + + distances = (priors_points[:, None, :] - + bbox_points[None, :, :]).pow(2).sum(-1).sqrt() + + return distances + + +@TASK_UTILS.register_module() +class ATSSAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each prior. + + Each proposals will be assigned with `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + If ``alpha`` is not None, it means that the dynamic cost + ATSSAssigner is adopted, which is currently only used in the DDOD. + + Args: + topk (int): number of priors selected in each level + alpha (float, optional): param of cost rate for each proposal only + in DDOD. Defaults to None. + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. Defaults to ``dict(type='BboxOverlaps2D')`` + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. Defaults to -1. + """ + + def __init__(self, + topk: int, + alpha: Optional[float] = None, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D'), + ignore_iof_thr: float = -1) -> None: + self.topk = topk + self.alpha = alpha + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.ignore_iof_thr = ignore_iof_thr + + # https://github.com/sfzhang15/ATSS/blob/master/atss_core/modeling/rpn/atss/loss.py + def assign( + self, + pred_instances: InstanceData, + num_level_priors: List[int], + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None + ) -> AssignResult: + """Assign gt to priors. + + The assignment is done in following steps + + 1. compute iou between all prior (prior of all pyramid levels) and gt + 2. compute center distance between all prior and gt + 3. on each pyramid level, for each gt, select k prior whose center + are closest to the gt center, so we total select k*l prior as + candidates for each gt + 4. get corresponding iou for the these candidates, and compute the + mean and std, set mean + std as the iou threshold + 5. select these candidates whose iou are greater than or equal to + the threshold as positive + 6. limit the positive sample's center in gt + + If ``alpha`` is not None, and ``cls_scores`` and `bbox_preds` + are not None, the overlaps calculation in the first step + will also include dynamic cost, which is currently only used in + the DDOD. + + Args: + pred_instances (:obj:`InstaceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors, points, or bboxes predicted by the model, + shape(n, 4). + num_level_priors (List): Number of bboxes in each level + gt_instances (:obj:`InstaceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + gt_instances_ignore (:obj:`InstaceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. 
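A compact sketch, on random toy data, of the adaptive-threshold step (steps 4 and 5 above): for each gt, the IoUs of its distance-selected candidates are summarized by mean + std, which becomes that gt's positive threshold.

    # Toy illustration of the ATSS mean + std adaptive threshold.
    import torch

    num_candidates, num_gt = 9, 2
    candidate_overlaps = torch.rand(num_candidates, num_gt)  # IoU of candidates vs gts

    overlaps_thr = candidate_overlaps.mean(0) + candidate_overlaps.std(0)  # (num_gt,)
    is_pos = candidate_overlaps >= overlaps_thr[None, :]  # adaptive per-gt threshold
    print(is_pos.sum(0))  # number of positives selected for each gt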
+ """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + if gt_instances_ignore is not None: + gt_bboxes_ignore = gt_instances_ignore.bboxes + else: + gt_bboxes_ignore = None + + INF = 100000000 + priors = priors[:, :4] + num_gt, num_priors = gt_bboxes.size(0), priors.size(0) + + message = 'Invalid alpha parameter because cls_scores or ' \ + 'bbox_preds are None. If you want to use the ' \ + 'cost-based ATSSAssigner, please set cls_scores, ' \ + 'bbox_preds and self.alpha at the same time. ' + + # compute iou between all bbox and gt + if self.alpha is None: + # ATSSAssigner + overlaps = self.iou_calculator(priors, gt_bboxes) + if ('scores' in pred_instances or 'bboxes' in pred_instances): + warnings.warn(message) + + else: + # Dynamic cost ATSSAssigner in DDOD + assert ('scores' in pred_instances + and 'bboxes' in pred_instances), message + cls_scores = pred_instances.scores + bbox_preds = pred_instances.bboxes + + # compute cls cost for bbox and GT + cls_cost = torch.sigmoid(cls_scores[:, gt_labels]) + + # compute iou between all bbox and gt + overlaps = self.iou_calculator(bbox_preds, gt_bboxes) + + # make sure that we are in element-wise multiplication + assert cls_cost.shape == overlaps.shape + + # overlaps is actually a cost matrix + overlaps = cls_cost**(1 - self.alpha) * overlaps**self.alpha + + # assign 0 by default + assigned_gt_inds = overlaps.new_full((num_priors, ), + 0, + dtype=torch.long) + + if num_gt == 0 or num_priors == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = overlaps.new_zeros((num_priors, )) + if num_gt == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + assigned_labels = overlaps.new_full((num_priors, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + # compute center distance between all bbox and gt + distances = bbox_center_distance(gt_bboxes, priors) + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and priors.numel() > 0): + ignore_overlaps = self.iou_calculator( + priors, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + ignore_idxs = ignore_max_overlaps > self.ignore_iof_thr + distances[ignore_idxs, :] = INF + assigned_gt_inds[ignore_idxs] = -1 + + # Selecting candidates based on the center distance + candidate_idxs = [] + start_idx = 0 + for level, priors_per_level in enumerate(num_level_priors): + # on each pyramid level, for each gt, + # select k bbox whose center are closest to the gt center + end_idx = start_idx + priors_per_level + distances_per_level = distances[start_idx:end_idx, :] + selectable_k = min(self.topk, priors_per_level) + _, topk_idxs_per_level = distances_per_level.topk( + selectable_k, dim=0, largest=False) + candidate_idxs.append(topk_idxs_per_level + start_idx) + start_idx = end_idx + candidate_idxs = torch.cat(candidate_idxs, dim=0) + + # get corresponding iou for the these candidates, and compute the + # mean and std, set mean + std as the iou threshold + candidate_overlaps = overlaps[candidate_idxs, torch.arange(num_gt)] + overlaps_mean_per_gt = candidate_overlaps.mean(0) + overlaps_std_per_gt = candidate_overlaps.std(0) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :] + + # limit the positive sample's center in gt + for gt_idx in range(num_gt): + candidate_idxs[:, gt_idx] += gt_idx * 
num_priors + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + ep_priors_cx = priors_cx.view(1, -1).expand( + num_gt, num_priors).contiguous().view(-1) + ep_priors_cy = priors_cy.view(1, -1).expand( + num_gt, num_priors).contiguous().view(-1) + candidate_idxs = candidate_idxs.view(-1) + + # calculate the left, top, right, bottom distance between positive + # prior center and gt side + l_ = ep_priors_cx[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 0] + t_ = ep_priors_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - ep_priors_cx[candidate_idxs].view(-1, num_gt) + b_ = gt_bboxes[:, 3] - ep_priors_cy[candidate_idxs].view(-1, num_gt) + is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01 + + is_pos = is_pos & is_in_gts + + # if an anchor box is assigned to multiple gts, + # the one with the highest IoU will be selected. + overlaps_inf = torch.full_like(overlaps, + -INF).t().contiguous().view(-1) + index = candidate_idxs.view(-1)[is_pos.view(-1)] + overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index] + overlaps_inf = overlaps_inf.view(num_gt, -1).t() + + max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1) + assigned_gt_inds[ + max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1 + + assigned_labels = assigned_gt_inds.new_full((num_priors, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) diff --git a/mmdetection/mmdet/models/task_modules/assigners/base_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/base_assigner.py new file mode 100644 index 0000000..b12280a --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/base_assigner.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Optional + +from mmengine.structures import InstanceData + + +class BaseAssigner(metaclass=ABCMeta): + """Base assigner that assigns boxes to ground truth boxes.""" + + @abstractmethod + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs): + """Assign boxes to either a ground truth boxes or a negative boxes.""" diff --git a/mmdetection/mmdet/models/task_modules/assigners/center_region_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/center_region_assigner.py new file mode 100644 index 0000000..11c8055 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/center_region_assigner.py @@ -0,0 +1,366 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +def scale_boxes(bboxes: Tensor, scale: float) -> Tensor: + """Expand an array of boxes by a given scale. + + Args: + bboxes (Tensor): Shape (m, 4) + scale (float): The scale factor of bboxes + + Returns: + Tensor: Shape (m, 4). 
Scaled bboxes + """ + assert bboxes.size(1) == 4 + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_scaled = torch.zeros_like(bboxes) + boxes_scaled[:, 0] = x_c - w_half + boxes_scaled[:, 2] = x_c + w_half + boxes_scaled[:, 1] = y_c - h_half + boxes_scaled[:, 3] = y_c + h_half + return boxes_scaled + + +def is_located_in(points: Tensor, bboxes: Tensor) -> Tensor: + """Are points located in bboxes. + + Args: + points (Tensor): Points, shape: (m, 2). + bboxes (Tensor): Bounding boxes, shape: (n, 4). + + Return: + Tensor: Flags indicating if points are located in bboxes, + shape: (m, n). + """ + assert points.size(1) == 2 + assert bboxes.size(1) == 4 + return (points[:, 0].unsqueeze(1) > bboxes[:, 0].unsqueeze(0)) & \ + (points[:, 0].unsqueeze(1) < bboxes[:, 2].unsqueeze(0)) & \ + (points[:, 1].unsqueeze(1) > bboxes[:, 1].unsqueeze(0)) & \ + (points[:, 1].unsqueeze(1) < bboxes[:, 3].unsqueeze(0)) + + +def bboxes_area(bboxes: Tensor) -> Tensor: + """Compute the area of an array of bboxes. + + Args: + bboxes (Tensor): The coordinates ox bboxes. Shape: (m, 4) + + Returns: + Tensor: Area of the bboxes. Shape: (m, ) + """ + assert bboxes.size(1) == 4 + w = (bboxes[:, 2] - bboxes[:, 0]) + h = (bboxes[:, 3] - bboxes[:, 1]) + areas = w * h + return areas + + +@TASK_UTILS.register_module() +class CenterRegionAssigner(BaseAssigner): + """Assign pixels at the center region of a bbox as positive. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + - -1: negative samples + - semi-positive numbers: positive sample, index (0-based) of assigned gt + + Args: + pos_scale (float): Threshold within which pixels are + labelled as positive. + neg_scale (float): Threshold above which pixels are + labelled as positive. + min_pos_iof (float): Minimum iof of a pixel with a gt to be + labelled as positive. Default: 1e-2 + ignore_gt_scale (float): Threshold within which the pixels + are ignored when the gt is labelled as shadowed. Default: 0.5 + foreground_dominate (bool): If True, the bbox will be assigned as + positive when a gt's kernel region overlaps with another's shadowed + (ignored) region, otherwise it is set as ignored. Default to False. + iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps + Calculator. + """ + + def __init__( + self, + pos_scale: float, + neg_scale: float, + min_pos_iof: float = 1e-2, + ignore_gt_scale: float = 0.5, + foreground_dominate: bool = False, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D') + ) -> None: + self.pos_scale = pos_scale + self.neg_scale = neg_scale + self.min_pos_iof = min_pos_iof + self.ignore_gt_scale = ignore_gt_scale + self.foreground_dominate = foreground_dominate + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def get_gt_priorities(self, gt_bboxes: Tensor) -> Tensor: + """Get gt priorities according to their areas. + + Smaller gt has higher priority. + + Args: + gt_bboxes (Tensor): Ground truth boxes, shape (k, 4). + + Returns: + Tensor: The priority of gts so that gts with larger priority is + more likely to be assigned. Shape (k, ) + """ + gt_areas = bboxes_area(gt_bboxes) + # Rank all gt bbox areas. 
Smaller objects has larger priority + _, sort_idx = gt_areas.sort(descending=True) + sort_idx = sort_idx.argsort() + return sort_idx + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. + + This method assigns gts to every prior (proposal/anchor), each prior + will be assigned with -1, or a semi-positive number. -1 means + negative sample, semi-positive number is the index (0-based) of + assigned gt. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assigned result. Note that shadowed_labels + of shape (N, 2) is also added as an `assign_result` attribute. + `shadowed_labels` is a tensor composed of N pairs of anchor_ind, + class_label], where N is the number of anchors that lie in the + outer region of a gt, anchor_ind is the shadowed anchor index + and class_label is the shadowed class label. + + Example: + >>> from mmengine.structures import InstanceData + >>> self = CenterRegionAssigner(0.2, 0.2) + >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10], + ... [10, 10, 20, 20]]) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 10]]) + >>> gt_instances.labels = torch.Tensor([0]) + >>> assign_result = self.assign(pred_instances, gt_instances) + >>> expected_gt_inds = torch.LongTensor([1, 0]) + >>> assert torch.all(assign_result.gt_inds == expected_gt_inds) + """ + # There are in total 5 steps in the pixel assignment + # 1. Find core (the center region, say inner 0.2) + # and shadow (the relatively ourter part, say inner 0.2-0.5) + # regions of every gt. + # 2. Find all prior bboxes that lie in gt_core and gt_shadow regions + # 3. Assign prior bboxes in gt_core with a one-hot id of the gt in + # the image. + # 3.1. For overlapping objects, the prior bboxes in gt_core is + # assigned with the object with smallest area + # 4. Assign prior bboxes with class label according to its gt id. + # 4.1. Assign -1 to prior bboxes lying in shadowed gts + # 4.2. Assign positive prior boxes with the corresponding label + # 5. Find pixels lying in the shadow of an object and assign them with + # background label, but set the loss weight of its corresponding + # gt to zero. + + # TODO not extract bboxes in assign. + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + + assert priors.size(1) == 4, 'priors must have size of 4' + # 1. Find core positive and shadow region of every gt + gt_core = scale_boxes(gt_bboxes, self.pos_scale) + gt_shadow = scale_boxes(gt_bboxes, self.neg_scale) + + # 2. 
Find prior bboxes that lie in gt_core and gt_shadow regions + prior_centers = (priors[:, 2:4] + priors[:, 0:2]) / 2 + # The center points lie within the gt boxes + is_prior_in_gt = is_located_in(prior_centers, gt_bboxes) + # Only calculate prior and gt_core IoF. This enables small prior bboxes + # to match large gts + prior_and_gt_core_overlaps = self.iou_calculator( + priors, gt_core, mode='iof') + # The center point of effective priors should be within the gt box + is_prior_in_gt_core = is_prior_in_gt & ( + prior_and_gt_core_overlaps > self.min_pos_iof) # shape (n, k) + + is_prior_in_gt_shadow = ( + self.iou_calculator(priors, gt_shadow, mode='iof') > + self.min_pos_iof) + # Rule out center effective positive pixels + is_prior_in_gt_shadow &= (~is_prior_in_gt_core) + + num_gts, num_priors = gt_bboxes.size(0), priors.size(0) + if num_gts == 0 or num_priors == 0: + # If no gts exist, assign all pixels to negative + assigned_gt_ids = \ + is_prior_in_gt_core.new_zeros((num_priors,), + dtype=torch.long) + pixels_in_gt_shadow = assigned_gt_ids.new_empty((0, 2)) + else: + # Step 3: assign a one-hot gt id to each pixel, and smaller objects + # have high priority to assign the pixel. + sort_idx = self.get_gt_priorities(gt_bboxes) + assigned_gt_ids, pixels_in_gt_shadow = \ + self.assign_one_hot_gt_indices(is_prior_in_gt_core, + is_prior_in_gt_shadow, + gt_priority=sort_idx) + + if (gt_instances_ignore is not None + and gt_instances_ignore.bboxes.numel() > 0): + # No ground truth or boxes, return empty assignment + gt_bboxes_ignore = gt_instances_ignore.bboxes + gt_bboxes_ignore = scale_boxes( + gt_bboxes_ignore, scale=self.ignore_gt_scale) + is_prior_in_ignored_gts = is_located_in(prior_centers, + gt_bboxes_ignore) + is_prior_in_ignored_gts = is_prior_in_ignored_gts.any(dim=1) + assigned_gt_ids[is_prior_in_ignored_gts] = -1 + + # 4. Assign prior bboxes with class label according to its gt id. + # Default assigned label is the background (-1) + assigned_labels = assigned_gt_ids.new_full((num_priors, ), -1) + pos_inds = torch.nonzero(assigned_gt_ids > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_ids[pos_inds] - + 1] + # 5. Find pixels lying in the shadow of an object + shadowed_pixel_labels = pixels_in_gt_shadow.clone() + if pixels_in_gt_shadow.numel() > 0: + pixel_idx, gt_idx =\ + pixels_in_gt_shadow[:, 0], pixels_in_gt_shadow[:, 1] + assert (assigned_gt_ids[pixel_idx] != gt_idx).all(), \ + 'Some pixels are dually assigned to ignore and gt!' + shadowed_pixel_labels[:, 1] = gt_labels[gt_idx - 1] + override = ( + assigned_labels[pixel_idx] == shadowed_pixel_labels[:, 1]) + if self.foreground_dominate: + # When a pixel is both positive and shadowed, set it as pos + shadowed_pixel_labels = shadowed_pixel_labels[~override] + else: + # When a pixel is both pos and shadowed, set it as shadowed + assigned_labels[pixel_idx[override]] = -1 + assigned_gt_ids[pixel_idx[override]] = 0 + + assign_result = AssignResult( + num_gts, assigned_gt_ids, None, labels=assigned_labels) + # Add shadowed_labels as assign_result property. Shape: (num_shadow, 2) + assign_result.set_extra_property('shadowed_labels', + shadowed_pixel_labels) + return assign_result + + def assign_one_hot_gt_indices( + self, + is_prior_in_gt_core: Tensor, + is_prior_in_gt_shadow: Tensor, + gt_priority: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: + """Assign only one gt index to each prior box. + + Gts with large gt_priority are more likely to be assigned. 
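A toy sketch of the core-region idea used here (hypothetical boxes): each gt is shrunk by ``pos_scale`` and prior centers falling inside the shrunken box become positive candidates; when a center lands in several cores, the assigner lets the smaller gt win via the priorities above.

    # Toy illustration of the shrunken "core" region test.
    import torch

    def shrink(bboxes, scale):
        cx = (bboxes[:, 0] + bboxes[:, 2]) * 0.5
        cy = (bboxes[:, 1] + bboxes[:, 3]) * 0.5
        w_half = (bboxes[:, 2] - bboxes[:, 0]) * 0.5 * scale
        h_half = (bboxes[:, 3] - bboxes[:, 1]) * 0.5 * scale
        return torch.stack([cx - w_half, cy - h_half, cx + w_half, cy + h_half], dim=1)

    gt = torch.tensor([[0., 0., 10., 10.], [4., 4., 6., 6.]])
    centers = torch.tensor([[5., 5.], [1., 1.]])
    core = shrink(gt, 0.2)
    inside = (centers[:, None, 0] > core[:, 0]) & (centers[:, None, 0] < core[:, 2]) & \
             (centers[:, None, 1] > core[:, 1]) & (centers[:, None, 1] < core[:, 3])
    print(inside)  # center 0 lies in both cores (the smaller gt wins); center 1 in neither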
+ + Args: + is_prior_in_gt_core (Tensor): Bool tensor indicating the prior + center is in the core area of a gt (e.g. 0-0.2). + Shape: (num_prior, num_gt). + is_prior_in_gt_shadow (Tensor): Bool tensor indicating the prior + center is in the shadowed area of a gt (e.g. 0.2-0.5). + Shape: (num_prior, num_gt). + gt_priority (Tensor): Priorities of gts. The gt with a higher + priority is more likely to be assigned to the bbox when the + bbox match with multiple gts. Shape: (num_gt, ). + + Returns: + tuple: Returns (assigned_gt_inds, shadowed_gt_inds). + + - assigned_gt_inds: The assigned gt index of each prior bbox \ + (i.e. index from 1 to num_gts). Shape: (num_prior, ). + - shadowed_gt_inds: shadowed gt indices. It is a tensor of \ + shape (num_ignore, 2) with first column being the shadowed prior \ + bbox indices and the second column the shadowed gt \ + indices (1-based). + """ + num_bboxes, num_gts = is_prior_in_gt_core.shape + + if gt_priority is None: + gt_priority = torch.arange( + num_gts, device=is_prior_in_gt_core.device) + assert gt_priority.size(0) == num_gts + # The bigger gt_priority, the more preferable to be assigned + # The assigned inds are by default 0 (background) + assigned_gt_inds = is_prior_in_gt_core.new_zeros((num_bboxes, ), + dtype=torch.long) + # Shadowed bboxes are assigned to be background. But the corresponding + # label is ignored during loss calculation, which is done through + # shadowed_gt_inds + shadowed_gt_inds = torch.nonzero(is_prior_in_gt_shadow, as_tuple=False) + if is_prior_in_gt_core.sum() == 0: # No gt match + shadowed_gt_inds[:, 1] += 1 # 1-based. For consistency issue + return assigned_gt_inds, shadowed_gt_inds + + # The priority of each prior box and gt pair. If one prior box is + # matched bo multiple gts. Only the pair with the highest priority + # is saved + pair_priority = is_prior_in_gt_core.new_full((num_bboxes, num_gts), + -1, + dtype=torch.long) + + # Each bbox could match with multiple gts. + # The following codes deal with this situation + # Matched bboxes (to any gt). Shape: (num_pos_anchor, ) + inds_of_match = torch.any(is_prior_in_gt_core, dim=1) + # The matched gt index of each positive bbox. Length >= num_pos_anchor + # , since one bbox could match multiple gts + matched_bbox_gt_inds = torch.nonzero( + is_prior_in_gt_core, as_tuple=False)[:, 1] + # Assign priority to each bbox-gt pair. + pair_priority[is_prior_in_gt_core] = gt_priority[matched_bbox_gt_inds] + _, argmax_priority = pair_priority[inds_of_match].max(dim=1) + assigned_gt_inds[inds_of_match] = argmax_priority + 1 # 1-based + # Zero-out the assigned anchor box to filter the shadowed gt indices + is_prior_in_gt_core[inds_of_match, argmax_priority] = 0 + # Concat the shadowed indices due to overlapping with that out side of + # effective scale. shape: (total_num_ignore, 2) + shadowed_gt_inds = torch.cat( + (shadowed_gt_inds, + torch.nonzero(is_prior_in_gt_core, as_tuple=False)), + dim=0) + # Change `is_prior_in_gt_core` back to keep arguments intact. 
+ is_prior_in_gt_core[inds_of_match, argmax_priority] = 1 + # 1-based shadowed gt indices, to be consistent with `assigned_gt_inds` + if shadowed_gt_inds.numel() > 0: + shadowed_gt_inds[:, 1] += 1 + return assigned_gt_inds, shadowed_gt_inds diff --git a/mmdetection/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py new file mode 100644 index 0000000..3fc7af3 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py @@ -0,0 +1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import BaseBoxes +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + +INF = 100000000 +EPS = 1.0e-7 + + +def center_of_mass(masks: Tensor, eps: float = 1e-7) -> Tensor: + """Compute the masks center of mass. + + Args: + masks: Mask tensor, has shape (num_masks, H, W). + eps: a small number to avoid normalizer to be zero. + Defaults to 1e-7. + Returns: + Tensor: The masks center of mass. Has shape (num_masks, 2). + """ + n, h, w = masks.shape + grid_h = torch.arange(h, device=masks.device)[:, None] + grid_w = torch.arange(w, device=masks.device) + normalizer = masks.sum(dim=(1, 2)).float().clamp(min=eps) + center_y = (masks * grid_h).sum(dim=(1, 2)) / normalizer + center_x = (masks * grid_w).sum(dim=(1, 2)) / normalizer + center = torch.cat([center_x[:, None], center_y[:, None]], dim=1) + return center + + +@TASK_UTILS.register_module() +class DynamicSoftLabelAssigner(BaseAssigner): + """Computes matching between predictions and ground truth with dynamic soft + label assignment. + + Args: + soft_center_radius (float): Radius of the soft center prior. + Defaults to 3.0. + topk (int): Select top-k predictions to calculate dynamic k + best matches for each gt. Defaults to 13. + iou_weight (float): The scale factor of iou cost. Defaults to 3.0. + iou_calculator (ConfigType): Config of overlaps Calculator. + Defaults to dict(type='BboxOverlaps2D'). + """ + + def __init__( + self, + soft_center_radius: float = 3.0, + topk: int = 13, + iou_weight: float = 3.0, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D') + ) -> None: + self.soft_center_radius = soft_center_radius + self.topk = topk + self.iou_weight = iou_weight + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to priors. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. 
+ Defaults to None. + Returns: + obj:`AssignResult`: The assigned result. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_gt = gt_bboxes.size(0) + + decoded_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + priors = pred_instances.priors + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + if num_gt == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + prior_center = priors[:, :2] + if isinstance(gt_bboxes, BaseBoxes): + is_in_gts = gt_bboxes.find_inside_points(prior_center) + else: + # Tensor boxes will be treated as horizontal boxes by defaults + lt_ = prior_center[:, None] - gt_bboxes[:, :2] + rb_ = gt_bboxes[:, 2:] - prior_center[:, None] + + deltas = torch.cat([lt_, rb_], dim=-1) + is_in_gts = deltas.min(dim=-1).values > 0 + + valid_mask = is_in_gts.sum(dim=1) > 0 + + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + num_valid = valid_decoded_bbox.size(0) + + if num_valid == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + if hasattr(gt_instances, 'masks'): + gt_center = center_of_mass(gt_instances.masks, eps=EPS) + elif isinstance(gt_bboxes, BaseBoxes): + gt_center = gt_bboxes.centers + else: + # Tensor boxes will be treated as horizontal boxes by defaults + gt_center = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2.0 + valid_prior = priors[valid_mask] + strides = valid_prior[:, 2] + distance = (valid_prior[:, None, :2] - gt_center[None, :, :] + ).pow(2).sum(-1).sqrt() / strides[:, None] + soft_center_prior = torch.pow(10, distance - self.soft_center_radius) + + pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) + iou_cost = -torch.log(pairwise_ious + EPS) * self.iou_weight + + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1) + + soft_label = gt_onehot_label * pairwise_ious[..., None] + scale_factor = soft_label - valid_pred_scores.sigmoid() + soft_cls_cost = F.binary_cross_entropy_with_logits( + valid_pred_scores, soft_label, + reduction='none') * scale_factor.abs().pow(2.0) + soft_cls_cost = soft_cls_cost.sum(dim=-1) + + cost_matrix = soft_cls_cost + iou_cost + soft_center_prior + + matched_pred_ious, matched_gt_inds = self.dynamic_k_matching( + cost_matrix, pairwise_ious, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_ious + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + 
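A standalone sketch, with toy tensors, of how the total cost above is put together: a soft classification cost scaled by |soft_label - sigmoid(score)|^2, a log-IoU cost, and the distance-based soft center prior are simply summed before the dynamic-k matching defined below.

    # Toy illustration of the cost matrix used by DynamicSoftLabelAssigner.
    import torch
    import torch.nn.functional as F

    num_valid, num_gt, num_classes = 4, 2, 3
    EPS = 1.0e-7

    pairwise_ious = torch.rand(num_valid, num_gt)
    pred_logits = torch.randn(num_valid, num_gt, num_classes)         # per (prior, gt) scores
    gt_onehot = F.one_hot(torch.tensor([0, 2]), num_classes).float()  # (num_gt, C)
    gt_onehot = gt_onehot.unsqueeze(0).expand(num_valid, -1, -1)

    soft_label = gt_onehot * pairwise_ious[..., None]
    scale = (soft_label - pred_logits.sigmoid()).abs().pow(2.0)
    soft_cls_cost = (F.binary_cross_entropy_with_logits(
        pred_logits, soft_label, reduction='none') * scale).sum(-1)
    iou_cost = -torch.log(pairwise_ious + EPS) * 3.0                  # iou_weight = 3.0
    distance = torch.rand(num_valid, num_gt)                          # normalized center distance
    soft_center_prior = torch.pow(10, distance - 3.0)                 # soft_center_radius = 3.0

    cost = soft_cls_cost + iou_cost + soft_center_prior               # (num_valid, num_gt)
    print(cost.shape)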
def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + num_gt: int, + valid_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets. Same as SimOTA. + + Args: + cost (Tensor): Cost matrix. + pairwise_ious (Tensor): Pairwise iou matrix. + num_gt (int): Number of gt. + valid_mask (Tensor): Mask for valid bboxes. + + Returns: + tuple: matched ious and gt indexes. + """ + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(1)[fg_mask_inboxes] + return matched_pred_ious, matched_gt_inds diff --git a/mmdetection/mmdet/models/task_modules/assigners/grid_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/grid_assigner.py new file mode 100644 index 0000000..d8935d2 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/grid_assigner.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@TASK_UTILS.register_module() +class GridAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple[float, float]): IoU threshold for negative + bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + Defaults to 0. + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps + Calculator. 
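A toy sketch of the "responsible cell" constraint the class docstring describes: IoUs of anchors whose cell is not responsible for a gt are masked out before assignment, so only anchors in the right cell can become positive.

    # Toy illustration of masking non-responsible anchors before assignment.
    import torch

    overlaps = torch.tensor([[0.6, 0.2, 0.7],    # gt 0 vs anchors
                             [0.1, 0.8, 0.3]])   # gt 1 vs anchors
    responsible_flags = torch.tensor([True, True, False])

    masked = overlaps.clone()
    masked[:, ~responsible_flags] = -1.0         # anchor 2 can never be positive
    max_overlaps, argmax_overlaps = masked.max(dim=0)
    pos = (max_overlaps > 0.5) & responsible_flags
    assigned_gt_inds = torch.zeros(3, dtype=torch.long)
    assigned_gt_inds[pos] = argmax_overlaps[pos] + 1  # 1-based gt index
    print(assigned_gt_inds.tolist())                  # [1, 2, 0]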
+ """ + + def __init__( + self, + pos_iou_thr: float, + neg_iou_thr: Union[float, Tuple[float, float]], + min_pos_iou: float = .0, + gt_max_assign_all: bool = True, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D') + ) -> None: + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. The process is very much like the max iou + assigner, except that positive samples are constrained within the cell + that the gt boxes fell in. + + This method assign a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, 0, or a positive number. -1 means don't care, + 0 means negative sample, positive number is the index (1-based) of + assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to -1 + 2. assign proposals whose iou with all gts <= neg_iou_thr to 0 + 3. for each bbox within a cell, if the iou with its nearest gt > + pos_iou_thr and the center of that gt falls inside the cell, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals within the cell the + gt bbox falls in to itself. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + + priors = pred_instances.priors + responsible_flags = pred_instances.responsible_flags + + num_gts, num_priors = gt_bboxes.size(0), priors.size(0) + + # compute iou between all gt and priors + overlaps = self.iou_calculator(gt_bboxes, priors) + + # 1. assign -1 by default + assigned_gt_inds = overlaps.new_full((num_priors, ), + -1, + dtype=torch.long) + + if num_gts == 0 or num_priors == 0: + # No ground truth or priors, return empty assignment + max_overlaps = overlaps.new_zeros((num_priors, )) + if num_gts == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + assigned_labels = overlaps.new_full((num_priors, ), + -1, + dtype=torch.long) + return AssignResult( + num_gts, + assigned_gt_inds, + max_overlaps, + labels=assigned_labels) + + # 2. 
assign negative: below + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + # shape of max_overlaps == argmax_overlaps == num_priors + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + if isinstance(self.neg_iou_thr, float): + assigned_gt_inds[(max_overlaps >= 0) + & (max_overlaps <= self.neg_iou_thr)] = 0 + elif isinstance(self.neg_iou_thr, (tuple, list)): + assert len(self.neg_iou_thr) == 2 + assigned_gt_inds[(max_overlaps > self.neg_iou_thr[0]) + & (max_overlaps <= self.neg_iou_thr[1])] = 0 + + # 3. assign positive: falls into responsible cell and above + # positive IOU threshold, the order matters. + # the prior condition of comparison is to filter out all + # unrelated anchors, i.e. not responsible_flags + overlaps[:, ~responsible_flags.type(torch.bool)] = -1. + + # calculate max_overlaps again, but this time we only consider IOUs + # for anchors responsible for prediction + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + # shape of gt_max_overlaps == gt_argmax_overlaps == num_gts + gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1) + + pos_inds = (max_overlaps > self.pos_iou_thr) & responsible_flags.type( + torch.bool) + assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 + + # 4. assign positive to max overlapped anchors within responsible cell + for i in range(num_gts): + if gt_max_overlaps[i] > self.min_pos_iou: + if self.gt_max_assign_all: + max_iou_inds = (overlaps[i, :] == gt_max_overlaps[i]) & \ + responsible_flags.type(torch.bool) + assigned_gt_inds[max_iou_inds] = i + 1 + elif responsible_flags[gt_argmax_overlaps[i]]: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + # assign labels of positive anchors + assigned_labels = assigned_gt_inds.new_full((num_priors, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + + return AssignResult( + num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels) diff --git a/mmdetection/mmdet/models/task_modules/assigners/hungarian_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/hungarian_assigner.py new file mode 100644 index 0000000..a6745a3 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/hungarian_assigner.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import torch +from mmengine import ConfigDict +from mmengine.structures import InstanceData +from scipy.optimize import linear_sum_assignment +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@TASK_UTILS.register_module() +class HungarianAssigner(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of some components. + For DETR the costs are weighted sum of classification cost, regression L1 + cost and regression iou cost. The targets don't include the no_object, so + generally there are more predictions than targets. After the one-to-one + matching, the un-matched are treated as backgrounds. 
Thus each query + prediction will be assigned with `0` or a positive integer indicating the + ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + match_costs (:obj:`ConfigDict` or dict or \ + List[Union[:obj:`ConfigDict`, dict]]): Match cost configs. + """ + + def __init__( + self, match_costs: Union[List[Union[dict, ConfigDict]], dict, + ConfigDict] + ) -> None: + + if isinstance(match_costs, dict): + match_costs = [match_costs] + elif isinstance(match_costs, list): + assert len(match_costs) > 0, \ + 'match_costs must not be a empty list.' + + self.match_costs = [ + TASK_UTILS.build(match_cost) for match_cost in match_costs + ] + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> AssignResult: + """Computes one-to-one matching based on the weighted costs. + + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. It may includes ``masks``, with shape + (n, h, w) or (n, l). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + ``labels``, with shape (k, ) and ``masks``, with shape + (k, h, w) or (k, l). + img_meta (dict): Image information. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert isinstance(gt_instances.labels, Tensor) + num_gts, num_preds = len(gt_instances), len(pred_instances) + gt_labels = gt_instances.labels + device = gt_labels.device + + # 1. assign -1 by default + assigned_gt_inds = torch.full((num_preds, ), + -1, + dtype=torch.long, + device=device) + assigned_labels = torch.full((num_preds, ), + -1, + dtype=torch.long, + device=device) + + if num_gts == 0 or num_preds == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) + + # 2. compute weighted cost + cost_list = [] + for match_cost in self.match_costs: + cost = match_cost( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta) + cost_list.append(cost) + cost = torch.stack(cost_list).sum(dim=0) + + # 3. 
do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to(device) + matched_col_inds = torch.from_numpy(matched_col_inds).to(device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) diff --git a/mmdetection/mmdet/models/task_modules/assigners/iou2d_calculator.py b/mmdetection/mmdet/models/task_modules/assigners/iou2d_calculator.py new file mode 100644 index 0000000..b6daa94 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/iou2d_calculator.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps, get_box_tensor + + +def cast_tensor_type(x, scale=1., dtype=None): + if dtype == 'fp16': + # scale is for preventing overflows + x = (x / scale).half() + return x + + +@TASK_UTILS.register_module() +class BboxOverlaps2D: + """2D Overlaps (e.g. IoUs, GIoUs) Calculator.""" + + def __init__(self, scale=1., dtype=None): + self.scale = scale + self.dtype = dtype + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. + + Args: + bboxes1 (Tensor or :obj:`BaseBoxes`): bboxes have shape (m, 4) + in format, or shape (m, 5) in format. + bboxes2 (Tensor or :obj:`BaseBoxes`): bboxes have shape (m, 4) + in format, shape (m, 5) in format, or be empty. If ``is_aligned `` is ``True``, + then m and n must be equal. + mode (str): "iou" (intersection over union), "iof" (intersection + over foreground), or "giou" (generalized intersection over + union). + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. 
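A minimal end-to-end sketch of the one-to-one matching performed by the assigner above: build a cost matrix (here just negative pairwise IoU computed the plain xyxy way, standing in for the configured match costs) and solve it with scipy's linear_sum_assignment, then shift to the 1-based gt indices.

    # Toy illustration of Hungarian matching on a negative-IoU cost.
    import torch
    from scipy.optimize import linear_sum_assignment

    def pairwise_iou(a, b):                      # a: (n, 4), b: (m, 4), xyxy format
        lt = torch.max(a[:, None, :2], b[None, :, :2])
        rb = torch.min(a[:, None, 2:], b[None, :, 2:])
        wh = (rb - lt).clamp(min=0)
        inter = wh[..., 0] * wh[..., 1]
        area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
        area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
        return inter / (area_a[:, None] + area_b[None, :] - inter)

    preds = torch.tensor([[0., 0., 4., 4.], [5., 5., 9., 9.], [0., 0., 1., 1.]])
    gts = torch.tensor([[0., 0., 5., 5.], [5., 5., 10., 10.]])
    cost = -pairwise_iou(preds, gts)
    rows, cols = linear_sum_assignment(cost.numpy())
    assigned_gt_inds = torch.zeros(len(preds), dtype=torch.long)         # 0 = background
    assigned_gt_inds[torch.as_tensor(rows)] = torch.as_tensor(cols) + 1  # 1-based gt index
    print(assigned_gt_inds.tolist())                                     # [1, 2, 0]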
+ + Returns: + Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) + """ + bboxes1 = get_box_tensor(bboxes1) + bboxes2 = get_box_tensor(bboxes2) + assert bboxes1.size(-1) in [0, 4, 5] + assert bboxes2.size(-1) in [0, 4, 5] + if bboxes2.size(-1) == 5: + bboxes2 = bboxes2[..., :4] + if bboxes1.size(-1) == 5: + bboxes1 = bboxes1[..., :4] + + if self.dtype == 'fp16': + # change tensor type to save cpu and cuda memory and keep speed + bboxes1 = cast_tensor_type(bboxes1, self.scale, self.dtype) + bboxes2 = cast_tensor_type(bboxes2, self.scale, self.dtype) + overlaps = bbox_overlaps(bboxes1, bboxes2, mode, is_aligned) + if not overlaps.is_cuda and overlaps.dtype == torch.float16: + # resume cpu float32 + overlaps = overlaps.float() + return overlaps + + return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned) + + def __repr__(self): + """str: a string describing the module""" + repr_str = self.__class__.__name__ + f'(' \ + f'scale={self.scale}, dtype={self.dtype})' + return repr_str + + +@TASK_UTILS.register_module() +class BboxOverlaps2D_GLIP(BboxOverlaps2D): + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + TO_REMOVE = 1 + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + TO_REMOVE) * ( + bboxes1[:, 3] - bboxes1[:, 1] + TO_REMOVE) + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + TO_REMOVE) * ( + bboxes2[:, 3] - bboxes2[:, 1] + TO_REMOVE) + + lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [N,M,2] + rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + iou = inter / (area1[:, None] + area2 - inter) + return iou diff --git a/mmdetection/mmdet/models/task_modules/assigners/match_cost.py b/mmdetection/mmdet/models/task_modules/assigners/match_cost.py new file mode 100644 index 0000000..5fc62f0 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/match_cost.py @@ -0,0 +1,525 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcywh + + +class BaseMatchCost: + """Base match cost class. + + Args: + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, weight: Union[float, int] = 1.) -> None: + self.weight = weight + + @abstractmethod + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + img_meta (dict, optional): Image information. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pass + + +@TASK_UTILS.register_module() +class BBoxL1Cost(BaseMatchCost): + """BBoxL1Cost. 
+ + Note: ``bboxes`` in ``InstanceData`` passed in is of format 'xyxy' + and its coordinates are unnormalized. + + Args: + box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN. + Defaults to 'xyxy'. + weight (Union[float, int]): Cost weight. Defaults to 1. + + Examples: + >>> from mmdet.models.task_modules.assigners. + ... match_costs.match_cost import BBoxL1Cost + >>> import torch + >>> self = BBoxL1Cost() + >>> bbox_pred = torch.rand(1, 4) + >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(bbox_pred, gt_bboxes, factor) + tensor([[1.6172, 1.6422]]) + """ + + def __init__(self, + box_format: str = 'xyxy', + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + assert box_format in ['xyxy', 'xywh'] + self.box_format = box_format + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): ``bboxes`` inside is + predicted boxes with unnormalized coordinate + (x, y, x, y). + gt_instances (:obj:`InstanceData`): ``bboxes`` inside is gt + bboxes with unnormalized coordinate (x, y, x, y). + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pred_bboxes = pred_instances.bboxes + gt_bboxes = gt_instances.bboxes + + # convert box format + if self.box_format == 'xywh': + gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes) + pred_bboxes = bbox_xyxy_to_cxcywh(pred_bboxes) + + # normalized + img_h, img_w = img_meta['img_shape'] + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + gt_bboxes = gt_bboxes / factor + pred_bboxes = pred_bboxes / factor + + bbox_cost = torch.cdist(pred_bboxes, gt_bboxes, p=1) + return bbox_cost * self.weight + + +@TASK_UTILS.register_module() +class IoUCost(BaseMatchCost): + """IoUCost. + + Note: ``bboxes`` in ``InstanceData`` passed in is of format 'xyxy' + and its coordinates are unnormalized. + + Args: + iou_mode (str): iou mode such as 'iou', 'giou'. Defaults to 'giou'. + weight (Union[float, int]): Cost weight. Defaults to 1. + + Examples: + >>> from mmdet.models.task_modules.assigners. + ... match_costs.match_cost import IoUCost + >>> import torch + >>> self = IoUCost() + >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) + >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> self(bboxes, gt_bboxes) + tensor([[-0.1250, 0.1667], + [ 0.1667, -0.5000]]) + """ + + def __init__(self, iou_mode: str = 'giou', weight: Union[float, int] = 1.): + super().__init__(weight=weight) + self.iou_mode = iou_mode + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs): + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): ``bboxes`` inside is + predicted boxes with unnormalized coordinate + (x, y, x, y). + gt_instances (:obj:`InstanceData`): ``bboxes`` inside is gt + bboxes with unnormalized coordinate (x, y, x, y). + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). 
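+
+        Example:
+            A minimal sketch with synthetic, unnormalized boxes (the values
+            below are arbitrary and only illustrate the expected
+            (num_preds, num_gts) shape of the returned cost):
+
+            >>> import torch
+            >>> from mmengine.structures import InstanceData
+            >>> self = IoUCost(iou_mode='giou', weight=2.0)
+            >>> pred_instances = InstanceData(
+            ...     bboxes=torch.Tensor([[0, 0, 10, 10], [5, 5, 15, 15]]))
+            >>> gt_instances = InstanceData(
+            ...     bboxes=torch.Tensor([[0, 0, 12, 12]]))
+            >>> self(pred_instances, gt_instances).shape
+            torch.Size([2, 1])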
+ """ + pred_bboxes = pred_instances.bboxes + gt_bboxes = gt_instances.bboxes + + # avoid fp16 overflow + if pred_bboxes.dtype == torch.float16: + fp16 = True + pred_bboxes = pred_bboxes.to(torch.float32) + else: + fp16 = False + + overlaps = bbox_overlaps( + pred_bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False) + + if fp16: + overlaps = overlaps.to(torch.float16) + + # The 1 is a constant that doesn't change the matching, so omitted. + iou_cost = -overlaps + return iou_cost * self.weight + + +@TASK_UTILS.register_module() +class ClassificationCost(BaseMatchCost): + """ClsSoftmaxCost. + + Args: + weight (Union[float, int]): Cost weight. Defaults to 1. + + Examples: + >>> from mmdet.models.task_modules.assigners. + ... match_costs.match_cost import ClassificationCost + >>> import torch + >>> self = ClassificationCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3430, -0.3525, -0.3045], + [-0.3077, -0.2931, -0.3992], + [-0.3664, -0.3455, -0.2881], + [-0.3343, -0.2701, -0.3956]]) + """ + + def __init__(self, weight: Union[float, int] = 1) -> None: + super().__init__(weight=weight) + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): ``scores`` inside is + predicted classification logits, of shape + (num_queries, num_class). + gt_instances (:obj:`InstanceData`): ``labels`` inside should have + shape (num_gt, ). + img_meta (Optional[dict]): _description_. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pred_scores = pred_instances.scores + gt_labels = gt_instances.labels + + pred_scores = pred_scores.softmax(-1) + cls_cost = -pred_scores[:, gt_labels] + + return cls_cost * self.weight + + +@TASK_UTILS.register_module() +class FocalLossCost(BaseMatchCost): + """FocalLossCost. + + Args: + alpha (Union[float, int]): focal_loss alpha. Defaults to 0.25. + gamma (Union[float, int]): focal_loss gamma. Defaults to 2. + eps (float): Defaults to 1e-12. + binary_input (bool): Whether the input is binary. Currently, + binary_input = True is for masks input, binary_input = False + is for label input. Defaults to False. + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, + alpha: Union[float, int] = 0.25, + gamma: Union[float, int] = 2, + eps: float = 1e-12, + binary_input: bool = False, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.alpha = alpha + self.gamma = gamma + self.eps = eps + self.binary_input = binary_input + + def _focal_loss_cost(self, cls_pred: Tensor, gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_queries, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels] + return cls_cost * self.weight + + def _mask_focal_loss_cost(self, cls_pred, gt_labels) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits. 
+ in shape (num_queries, d1, ..., dn), dtype=torch.float32. + gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), + dtype=torch.long. Labels should be binary. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_queries, num_gt). + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \ + torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) + return cls_cost / n * self.weight + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``scores`` or ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``labels`` or ``mask``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + if self.binary_input: + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + return self._mask_focal_loss_cost(pred_masks, gt_masks) + else: + pred_scores = pred_instances.scores + gt_labels = gt_instances.labels + return self._focal_loss_cost(pred_scores, gt_labels) + + +@TASK_UTILS.register_module() +class BinaryFocalLossCost(FocalLossCost): + + def _focal_loss_cost(self, cls_pred: Tensor, gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_queries, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \ + torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) + return cls_cost * self.weight + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``scores`` or ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``labels`` or ``mask``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + # gt_instances.text_token_mask is a repeated tensor of the same length + # of instances. Only gt_instances.text_token_mask[0] is useful + text_token_mask = torch.nonzero( + gt_instances.text_token_mask[0]).squeeze(-1) + pred_scores = pred_instances.scores[:, text_token_mask] + gt_labels = gt_instances.positive_maps[:, text_token_mask] + return self._focal_loss_cost(pred_scores, gt_labels) + + +@TASK_UTILS.register_module() +class DiceCost(BaseMatchCost): + """Cost of mask assignments based on dice losses. + + Args: + pred_act (bool): Whether to apply sigmoid to mask_pred. + Defaults to False. + eps (float): Defaults to 1e-3. 
+ naive_dice (bool): If True, use the naive dice loss + in which the power of the number in the denominator is + the first power. If False, use the second power that + is adopted by K-Net and SOLO. Defaults to True. + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, + pred_act: bool = False, + eps: float = 1e-3, + naive_dice: bool = True, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.pred_act = pred_act + self.eps = eps + self.naive_dice = naive_dice + + def _binary_mask_dice_loss(self, mask_preds: Tensor, + gt_masks: Tensor) -> Tensor: + """ + Args: + mask_preds (Tensor): Mask prediction in shape (num_queries, *). + gt_masks (Tensor): Ground truth in shape (num_gt, *) + store 0 or 1, 0 for negative class and 1 for + positive class. + + Returns: + Tensor: Dice cost matrix in shape (num_queries, num_gt). + """ + mask_preds = mask_preds.flatten(1) + gt_masks = gt_masks.flatten(1).float() + numerator = 2 * torch.einsum('nc,mc->nm', mask_preds, gt_masks) + if self.naive_dice: + denominator = mask_preds.sum(-1)[:, None] + \ + gt_masks.sum(-1)[None, :] + else: + denominator = mask_preds.pow(2).sum(1)[:, None] + \ + gt_masks.pow(2).sum(1)[None, :] + loss = 1 - (numerator + self.eps) / (denominator + self.eps) + return loss + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``mask``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + + if self.pred_act: + pred_masks = pred_masks.sigmoid() + dice_cost = self._binary_mask_dice_loss(pred_masks, gt_masks) + return dice_cost * self.weight + + +@TASK_UTILS.register_module() +class CrossEntropyLossCost(BaseMatchCost): + """CrossEntropyLossCost. + + Args: + use_sigmoid (bool): Whether the prediction uses sigmoid + of softmax. Defaults to True. + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, + use_sigmoid: bool = True, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.use_sigmoid = use_sigmoid + + def _binary_cross_entropy(self, cls_pred: Tensor, + gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): The prediction with shape (num_queries, 1, *) or + (num_queries, *). + gt_labels (Tensor): The learning label of prediction with + shape (num_gt, *). + + Returns: + Tensor: Cross entropy cost matrix in shape (num_queries, num_gt). + """ + cls_pred = cls_pred.flatten(1).float() + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + pos = F.binary_cross_entropy_with_logits( + cls_pred, torch.ones_like(cls_pred), reduction='none') + neg = F.binary_cross_entropy_with_logits( + cls_pred, torch.zeros_like(cls_pred), reduction='none') + cls_cost = torch.einsum('nc,mc->nm', pos, gt_labels) + \ + torch.einsum('nc,mc->nm', neg, 1 - gt_labels) + cls_cost = cls_cost / n + + return cls_cost + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. 
+ + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``scores`` or ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``labels`` or ``masks``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + if self.use_sigmoid: + cls_cost = self._binary_cross_entropy(pred_masks, gt_masks) + else: + raise NotImplementedError + + return cls_cost * self.weight diff --git a/mmdetection/mmdet/models/task_modules/assigners/max_iou_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/max_iou_assigner.py new file mode 100644 index 0000000..71da544 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/max_iou_assigner.py @@ -0,0 +1,325 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Optional, Union + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +def _perm_box(bboxes, + iou_calculator, + iou_thr=0.97, + perm_range=0.01, + counter=0, + max_iter=5): + """Compute the permuted bboxes. + + Args: + bboxes (Tensor): Shape (n, 4) for , "xyxy" format. + iou_calculator (obj): Overlaps Calculator. + iou_thr (float): The permuted bboxes should have IoU > iou_thr. + perm_range (float): The scale of permutation. + counter (int): Counter of permutation iteration. + max_iter (int): The max iterations of permutation. + Returns: + Tensor: The permuted bboxes. + """ + ori_bboxes = copy.deepcopy(bboxes) + is_valid = True + N = bboxes.size(0) + perm_factor = bboxes.new_empty(N, 4).uniform_(1 - perm_range, + 1 + perm_range) + bboxes *= perm_factor + new_wh = bboxes[:, 2:] - bboxes[:, :2] + if (new_wh <= 0).any(): + is_valid = False + iou = iou_calculator(ori_bboxes.unique(dim=0), bboxes) + if (iou < iou_thr).any(): + is_valid = False + if not is_valid and counter < max_iter: + return _perm_box( + ori_bboxes, + iou_calculator, + perm_range=max(perm_range - counter * 0.001, 1e-3), + counter=counter + 1) + return bboxes + + +def perm_repeat_bboxes(bboxes, iou_calculator=None, perm_repeat_cfg=None): + """Permute the repeated bboxes. + + Args: + bboxes (Tensor): Shape (n, 4) for , "xyxy" format. + iou_calculator (obj): Overlaps Calculator. + perm_repeat_cfg (Dict): Config of permutation. + Returns: + Tensor: Bboxes after permuted repeated bboxes. + """ + assert isinstance(bboxes, torch.Tensor) + if iou_calculator is None: + import torchvision + iou_calculator = torchvision.ops.box_iou + bboxes = copy.deepcopy(bboxes) + unique_bboxes = bboxes.unique(dim=0) + iou_thr = perm_repeat_cfg.get('iou_thr', 0.97) + perm_range = perm_repeat_cfg.get('perm_range', 0.01) + for box in unique_bboxes: + inds = (bboxes == box).sum(-1).float() == 4 + if inds.float().sum().item() == 1: + continue + bboxes[inds] = _perm_box( + bboxes[inds], + iou_calculator, + iou_thr=iou_thr, + perm_range=perm_range, + counter=0) + return bboxes + + +@TASK_UTILS.register_module() +class MaxIoUAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, or a semi-positive integer + indicating the ground truth index. 
+ + - -1: negative sample, no assigned gt + - semi-positive integer: positive sample, index (0-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + `min_pos_iou` is set to avoid assigning bboxes that have extremely + small iou with GT as positive samples. It brings about 0.3 mAP + improvements in 1x schedule but does not affect the performance of + 3x schedule. More comparisons can be found in + `PR #7464 `_. + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + match_low_quality (bool): Whether to allow low quality matches. This is + usually allowed for RPN and single stage detectors, but not allowed + in the second stage. Details are demonstrated in Step 4. + gpu_assign_thr (int): The upper bound of the number of GT for GPU + assign. When the number of gt is above this threshold, will assign + on CPU device. Negative values mean not assign on CPU. + iou_calculator (dict): Config of overlaps Calculator. + perm_repeat_gt_cfg (dict): Config of permute repeated gt bboxes. + """ + + def __init__(self, + pos_iou_thr: float, + neg_iou_thr: Union[float, tuple], + min_pos_iou: float = .0, + gt_max_assign_all: bool = True, + ignore_iof_thr: float = -1, + ignore_wrt_candidates: bool = True, + match_low_quality: bool = True, + gpu_assign_thr: float = -1, + iou_calculator: dict = dict(type='BboxOverlaps2D'), + perm_repeat_gt_cfg=None): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + self.gpu_assign_thr = gpu_assign_thr + self.match_low_quality = match_low_quality + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.perm_repeat_gt_cfg = perm_repeat_gt_cfg + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. + + This method assign a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, or a semi-positive number. -1 means negative + sample, semi-positive number is the index (0-based) of assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to the background + 2. assign proposals whose iou with all gts < neg_iou_thr to 0 + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. 
+ gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + + Example: + >>> from mmengine.structures import InstanceData + >>> self = MaxIoUAssigner(0.5, 0.5) + >>> pred_instances = InstanceData() + >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10], + ... [10, 10, 20, 20]]) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 9]]) + >>> gt_instances.labels = torch.Tensor([0]) + >>> assign_result = self.assign(pred_instances, gt_instances) + >>> expected_gt_inds = torch.LongTensor([1, 0]) + >>> assert torch.all(assign_result.gt_inds == expected_gt_inds) + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + if gt_instances_ignore is not None: + gt_bboxes_ignore = gt_instances_ignore.bboxes + else: + gt_bboxes_ignore = None + + assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( + gt_bboxes.shape[0] > self.gpu_assign_thr) else False + # compute overlap and assign gt on CPU when number of GT is large + if assign_on_cpu: + device = priors.device + priors = priors.cpu() + gt_bboxes = gt_bboxes.cpu() + gt_labels = gt_labels.cpu() + if gt_bboxes_ignore is not None: + gt_bboxes_ignore = gt_bboxes_ignore.cpu() + + if self.perm_repeat_gt_cfg is not None and priors.numel() > 0: + gt_bboxes_unique = perm_repeat_bboxes(gt_bboxes, + self.iou_calculator, + self.perm_repeat_gt_cfg) + else: + gt_bboxes_unique = gt_bboxes + overlaps = self.iou_calculator(gt_bboxes_unique, priors) + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and priors.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = self.iou_calculator( + priors, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = self.iou_calculator( + gt_bboxes_ignore, priors, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + if assign_on_cpu: + assign_result.gt_inds = assign_result.gt_inds.to(device) + assign_result.max_overlaps = assign_result.max_overlaps.to(device) + if assign_result.labels is not None: + assign_result.labels = assign_result.labels.to(device) + return assign_result + + def assign_wrt_overlaps(self, overlaps: Tensor, + gt_labels: Tensor) -> AssignResult: + """Assign w.r.t. the overlaps of priors with gts. + + Args: + overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes, + shape(k, n). + gt_labels (Tensor): Labels of k gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + num_gts, num_bboxes = overlaps.size(0), overlaps.size(1) + + # 1. 
assign -1 by default + assigned_gt_inds = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = overlaps.new_zeros((num_bboxes, )) + assigned_labels = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1) + + # 2. assign negative: below + # the negative inds are set to be 0 + if isinstance(self.neg_iou_thr, float): + assigned_gt_inds[(max_overlaps >= 0) + & (max_overlaps < self.neg_iou_thr)] = 0 + elif isinstance(self.neg_iou_thr, tuple): + assert len(self.neg_iou_thr) == 2 + assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0]) + & (max_overlaps < self.neg_iou_thr[1])] = 0 + + # 3. assign positive: above positive IoU threshold + pos_inds = max_overlaps >= self.pos_iou_thr + assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 + + if self.match_low_quality: + # Low-quality matching will overwrite the assigned_gt_inds assigned + # in Step 3. Thus, the assigned gt might not be the best one for + # prediction. + # For example, if bbox A has 0.9 and 0.8 iou with GT bbox 1 & 2, + # bbox 1 will be assigned as the best target for bbox A in step 3. + # However, if GT bbox 2's gt_argmax_overlaps = A, bbox A's + # assigned_gt_inds will be overwritten to be bbox 2. + # This might be the reason that it is not used in ROI Heads. + for i in range(num_gts): + if gt_max_overlaps[i] >= self.min_pos_iou: + if self.gt_max_assign_all: + max_iou_inds = overlaps[i, :] == gt_max_overlaps[i] + assigned_gt_inds[max_iou_inds] = i + 1 + else: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) diff --git a/mmdetection/mmdet/models/task_modules/assigners/multi_instance_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/multi_instance_assigner.py new file mode 100644 index 0000000..1ba32af --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/multi_instance_assigner.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .max_iou_assigner import MaxIoUAssigner + + +@TASK_UTILS.register_module() +class MultiInstanceAssigner(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each proposal bbox. If + we need to use a proposal box to generate multiple predict boxes, + `MultiInstanceAssigner` can assign multiple gt to each proposal box. + + Args: + num_instance (int): How many bboxes are predicted by each proposal box. 
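+
+    Example:
+        A minimal construction sketch (the IoU thresholds below are
+        illustrative values, not recommended settings):
+
+        >>> assigner = MultiInstanceAssigner(
+        ...     num_instance=2,
+        ...     pos_iou_thr=0.7,
+        ...     neg_iou_thr=0.3,
+        ...     min_pos_iou=0.3)
+
+        The resulting ``assign`` call returns ``gt_inds``, ``max_overlaps``
+        and ``labels`` reshaped to ``(-1, num_instance)``.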
+ """ + + def __init__(self, num_instance: int = 2, **kwargs): + super().__init__(**kwargs) + self.num_instance = num_instance + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. + + This method assign gt bboxes to every bbox (proposal/anchor), each bbox + is assigned a set of gts, and the number of gts in this set is defined + by `self.num_instance`. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + # Set the FG label to 1 and add ignored annotations + gt_labels = gt_instances.labels + 1 + if gt_instances_ignore is not None: + gt_bboxes_ignore = gt_instances_ignore.bboxes + if hasattr(gt_instances_ignore, 'labels'): + gt_labels_ignore = gt_instances_ignore.labels + else: + gt_labels_ignore = torch.ones_like(gt_bboxes_ignore)[:, 0] * -1 + else: + gt_bboxes_ignore = None + gt_labels_ignore = None + + assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( + gt_bboxes.shape[0] > self.gpu_assign_thr) else False + # compute overlap and assign gt on CPU when number of GT is large + if assign_on_cpu: + device = priors.device + priors = priors.cpu() + gt_bboxes = gt_bboxes.cpu() + gt_labels = gt_labels.cpu() + if gt_bboxes_ignore is not None: + gt_bboxes_ignore = gt_bboxes_ignore.cpu() + gt_labels_ignore = gt_labels_ignore.cpu() + + if gt_bboxes_ignore is not None: + all_bboxes = torch.cat([gt_bboxes, gt_bboxes_ignore], dim=0) + all_labels = torch.cat([gt_labels, gt_labels_ignore], dim=0) + else: + all_bboxes = gt_bboxes + all_labels = gt_labels + all_priors = torch.cat([priors, all_bboxes], dim=0) + + overlaps_normal = self.iou_calculator( + all_priors, all_bboxes, mode='iou') + overlaps_ignore = self.iou_calculator( + all_priors, all_bboxes, mode='iof') + gt_ignore_mask = all_labels.eq(-1).repeat(all_priors.shape[0], 1) + overlaps_normal = overlaps_normal * ~gt_ignore_mask + overlaps_ignore = overlaps_ignore * gt_ignore_mask + + overlaps_normal, overlaps_normal_indices = overlaps_normal.sort( + descending=True, dim=1) + overlaps_ignore, overlaps_ignore_indices = overlaps_ignore.sort( + descending=True, dim=1) + + # select the roi with the higher score + max_overlaps_normal = overlaps_normal[:, :self.num_instance].flatten() + gt_assignment_normal = overlaps_normal_indices[:, :self. + num_instance].flatten() + max_overlaps_ignore = overlaps_ignore[:, :self.num_instance].flatten() + gt_assignment_ignore = overlaps_ignore_indices[:, :self. 
+ num_instance].flatten() + + # ignore or not + ignore_assign_mask = (max_overlaps_normal < self.pos_iou_thr) * ( + max_overlaps_ignore > max_overlaps_normal) + overlaps = (max_overlaps_normal * ~ignore_assign_mask) + ( + max_overlaps_ignore * ignore_assign_mask) + gt_assignment = (gt_assignment_normal * ~ignore_assign_mask) + ( + gt_assignment_ignore * ignore_assign_mask) + + assigned_labels = all_labels[gt_assignment] + fg_mask = (overlaps >= self.pos_iou_thr) * (assigned_labels != -1) + bg_mask = (overlaps < self.neg_iou_thr) * (overlaps >= 0) + assigned_labels[fg_mask] = 1 + assigned_labels[bg_mask] = 0 + + overlaps = overlaps.reshape(-1, self.num_instance) + gt_assignment = gt_assignment.reshape(-1, self.num_instance) + assigned_labels = assigned_labels.reshape(-1, self.num_instance) + + assign_result = AssignResult( + num_gts=all_bboxes.size(0), + gt_inds=gt_assignment, + max_overlaps=overlaps, + labels=assigned_labels) + + if assign_on_cpu: + assign_result.gt_inds = assign_result.gt_inds.to(device) + assign_result.max_overlaps = assign_result.max_overlaps.to(device) + if assign_result.labels is not None: + assign_result.labels = assign_result.labels.to(device) + return assign_result diff --git a/mmdetection/mmdet/models/task_modules/assigners/point_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/point_assigner.py new file mode 100644 index 0000000..4da60a4 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/point_assigner.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@TASK_UTILS.register_module() +class PointAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each point. + + Each proposals will be assigned with `0`, or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + """ + + def __init__(self, scale: int = 4, pos_num: int = 3) -> None: + self.scale = scale + self.pos_num = pos_num + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to points. + + This method assign a gt bbox to every points set, each points set + will be assigned with the background_label (-1), or a label number. + -1 is background, and semi-positive number is the index (0-based) of + assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every points to the background_label (-1) + 2. A point is assigned to some gt bbox if + (i) the point is within the k closest points to the gt bbox + (ii) the distance between this point and the gt is smaller than + other gt bboxes + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + + + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). 
+ gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + Returns: + :obj:`AssignResult`: The assign result. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + # points to be assigned, shape(n, 3) while last + # dimension stands for (x, y, stride). + points = pred_instances.priors + + num_points = points.shape[0] + num_gts = gt_bboxes.shape[0] + + if num_gts == 0 or num_points == 0: + # If no truth assign everything to the background + assigned_gt_inds = points.new_full((num_points, ), + 0, + dtype=torch.long) + assigned_labels = points.new_full((num_points, ), + -1, + dtype=torch.long) + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) + + points_xy = points[:, :2] + points_stride = points[:, 2] + points_lvl = torch.log2( + points_stride).int() # [3...,4...,5...,6...,7...] + lvl_min, lvl_max = points_lvl.min(), points_lvl.max() + + # assign gt box + gt_bboxes_xy = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2 + gt_bboxes_wh = (gt_bboxes[:, 2:] - gt_bboxes[:, :2]).clamp(min=1e-6) + scale = self.scale + gt_bboxes_lvl = ((torch.log2(gt_bboxes_wh[:, 0] / scale) + + torch.log2(gt_bboxes_wh[:, 1] / scale)) / 2).int() + gt_bboxes_lvl = torch.clamp(gt_bboxes_lvl, min=lvl_min, max=lvl_max) + + # stores the assigned gt index of each point + assigned_gt_inds = points.new_zeros((num_points, ), dtype=torch.long) + # stores the assigned gt dist (to this point) of each point + assigned_gt_dist = points.new_full((num_points, ), float('inf')) + points_range = torch.arange(points.shape[0]) + + for idx in range(num_gts): + gt_lvl = gt_bboxes_lvl[idx] + # get the index of points in this level + lvl_idx = gt_lvl == points_lvl + points_index = points_range[lvl_idx] + # get the points in this level + lvl_points = points_xy[lvl_idx, :] + # get the center point of gt + gt_point = gt_bboxes_xy[[idx], :] + # get width and height of gt + gt_wh = gt_bboxes_wh[[idx], :] + # compute the distance between gt center and + # all points in this level + points_gt_dist = ((lvl_points - gt_point) / gt_wh).norm(dim=1) + # find the nearest k points to gt center in this level + min_dist, min_dist_index = torch.topk( + points_gt_dist, self.pos_num, largest=False) + # the index of nearest k points to gt center in this level + min_dist_points_index = points_index[min_dist_index] + # The less_than_recorded_index stores the index + # of min_dist that is less then the assigned_gt_dist. Where + # assigned_gt_dist stores the dist from previous assigned gt + # (if exist) to each point. + less_than_recorded_index = min_dist < assigned_gt_dist[ + min_dist_points_index] + # The min_dist_points_index stores the index of points satisfy: + # (1) it is k nearest to current gt center in this level. + # (2) it is closer to current gt center than other gt center. 
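+            # Points that fail either check keep their previous
+            # assignment; because the distance is recorded, a later,
+            # closer gt can still claim an already-assigned point.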
+ min_dist_points_index = min_dist_points_index[ + less_than_recorded_index] + # assign the result + assigned_gt_inds[min_dist_points_index] = idx + 1 + assigned_gt_dist[min_dist_points_index] = min_dist[ + less_than_recorded_index] + + assigned_labels = assigned_gt_inds.new_full((num_points, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) diff --git a/mmdetection/mmdet/models/task_modules/assigners/region_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/region_assigner.py new file mode 100644 index 0000000..df54914 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/region_assigner.py @@ -0,0 +1,239 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from ..prior_generators import anchor_inside_flags +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +def calc_region( + bbox: Tensor, + ratio: float, + stride: int, + featmap_size: Optional[Tuple[int, int]] = None) -> Tuple[Tensor]: + """Calculate region of the box defined by the ratio, the ratio is from the + center of the box to every edge.""" + # project bbox on the feature + f_bbox = bbox / stride + x1 = torch.round((1 - ratio) * f_bbox[0] + ratio * f_bbox[2]) + y1 = torch.round((1 - ratio) * f_bbox[1] + ratio * f_bbox[3]) + x2 = torch.round(ratio * f_bbox[0] + (1 - ratio) * f_bbox[2]) + y2 = torch.round(ratio * f_bbox[1] + (1 - ratio) * f_bbox[3]) + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1]) + y1 = y1.clamp(min=0, max=featmap_size[0]) + x2 = x2.clamp(min=0, max=featmap_size[1]) + y2 = y2.clamp(min=0, max=featmap_size[0]) + return (x1, y1, x2, y2) + + +def anchor_ctr_inside_region_flags(anchors: Tensor, stride: int, + region: Tuple[Tensor]) -> Tensor: + """Get the flag indicate whether anchor centers are inside regions.""" + x1, y1, x2, y2 = region + f_anchors = anchors / stride + x = (f_anchors[:, 0] + f_anchors[:, 2]) * 0.5 + y = (f_anchors[:, 1] + f_anchors[:, 3]) * 0.5 + flags = (x >= x1) & (x <= x2) & (y >= y1) & (y <= y2) + return flags + + +@TASK_UTILS.register_module() +class RegionAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + center_ratio (float): ratio of the region in the center of the bbox to + define positive sample. + ignore_ratio (float): ratio of the region to define ignore samples. + """ + + def __init__(self, + center_ratio: float = 0.2, + ignore_ratio: float = 0.5) -> None: + self.center_ratio = center_ratio + self.ignore_ratio = ignore_ratio + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: dict, + featmap_sizes: List[Tuple[int, int]], + num_level_anchors: List[int], + anchor_scale: int, + anchor_strides: List[int], + gt_instances_ignore: Optional[InstanceData] = None, + allowed_border: int = 0) -> AssignResult: + """Assign gt to anchors. 
+ + This method assign a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, 0, or a positive number. -1 means don't care, + 0 means negative sample, positive number is the index (1-based) of + assigned gt. + + The assignment is done in following steps, and the order matters. + + 1. Assign every anchor to 0 (negative) + 2. (For each gt_bboxes) Compute ignore flags based on ignore_region + then assign -1 to anchors w.r.t. ignore flags + 3. (For each gt_bboxes) Compute pos flags based on center_region then + assign gt_bboxes to anchors w.r.t. pos flags + 4. (For each gt_bboxes) Compute ignore flags based on adjacent anchor + level then assign -1 to anchors w.r.t. ignore flags + 5. Assign anchor outside of image to -1 + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + img_meta (dict): Meta info of image. + featmap_sizes (list[tuple[int, int]]): Feature map size each level. + num_level_anchors (list[int]): The number of anchors in each level. + anchor_scale (int): Scale of the anchor. + anchor_strides (list[int]): Stride of the anchor. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + allowed_border (int, optional): The border to allow the valid + anchor. Defaults to 0. + + Returns: + :obj:`AssignResult`: The assign result. + """ + if gt_instances_ignore is not None: + raise NotImplementedError + + num_gts = len(gt_instances) + num_bboxes = len(pred_instances) + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + flat_anchors = pred_instances.priors + flat_valid_flags = pred_instances.valid_flags + mlvl_anchors = torch.split(flat_anchors, num_level_anchors) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = gt_bboxes.new_zeros((num_bboxes, )) + assigned_gt_inds = gt_bboxes.new_zeros((num_bboxes, ), + dtype=torch.long) + assigned_labels = gt_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + num_lvls = len(mlvl_anchors) + r1 = (1 - self.center_ratio) / 2 + r2 = (1 - self.ignore_ratio) / 2 + + scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + min_anchor_size = scale.new_full( + (1, ), float(anchor_scale * anchor_strides[0])) + target_lvls = torch.floor( + torch.log2(scale) - torch.log2(min_anchor_size) + 0.5) + target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long() + + # 1. 
assign 0 (negative) by default + mlvl_assigned_gt_inds = [] + mlvl_ignore_flags = [] + for lvl in range(num_lvls): + assigned_gt_inds = gt_bboxes.new_full((num_level_anchors[lvl], ), + 0, + dtype=torch.long) + ignore_flags = torch.zeros_like(assigned_gt_inds) + mlvl_assigned_gt_inds.append(assigned_gt_inds) + mlvl_ignore_flags.append(ignore_flags) + + for gt_id in range(num_gts): + lvl = target_lvls[gt_id].item() + featmap_size = featmap_sizes[lvl] + stride = anchor_strides[lvl] + anchors = mlvl_anchors[lvl] + gt_bbox = gt_bboxes[gt_id, :4] + + # Compute regions + ignore_region = calc_region(gt_bbox, r2, stride, featmap_size) + ctr_region = calc_region(gt_bbox, r1, stride, featmap_size) + + # 2. Assign -1 to ignore flags + ignore_flags = anchor_ctr_inside_region_flags( + anchors, stride, ignore_region) + mlvl_assigned_gt_inds[lvl][ignore_flags] = -1 + + # 3. Assign gt_bboxes to pos flags + pos_flags = anchor_ctr_inside_region_flags(anchors, stride, + ctr_region) + mlvl_assigned_gt_inds[lvl][pos_flags] = gt_id + 1 + + # 4. Assign -1 to ignore adjacent lvl + if lvl > 0: + d_lvl = lvl - 1 + d_anchors = mlvl_anchors[d_lvl] + d_featmap_size = featmap_sizes[d_lvl] + d_stride = anchor_strides[d_lvl] + d_ignore_region = calc_region(gt_bbox, r2, d_stride, + d_featmap_size) + ignore_flags = anchor_ctr_inside_region_flags( + d_anchors, d_stride, d_ignore_region) + mlvl_ignore_flags[d_lvl][ignore_flags] = 1 + if lvl < num_lvls - 1: + u_lvl = lvl + 1 + u_anchors = mlvl_anchors[u_lvl] + u_featmap_size = featmap_sizes[u_lvl] + u_stride = anchor_strides[u_lvl] + u_ignore_region = calc_region(gt_bbox, r2, u_stride, + u_featmap_size) + ignore_flags = anchor_ctr_inside_region_flags( + u_anchors, u_stride, u_ignore_region) + mlvl_ignore_flags[u_lvl][ignore_flags] = 1 + + # 4. (cont.) Assign -1 to ignore adjacent lvl + for lvl in range(num_lvls): + ignore_flags = mlvl_ignore_flags[lvl] + mlvl_assigned_gt_inds[lvl][ignore_flags == 1] = -1 + + # 5. Assign -1 to anchor outside of image + flat_assigned_gt_inds = torch.cat(mlvl_assigned_gt_inds) + assert (flat_assigned_gt_inds.shape[0] == flat_anchors.shape[0] == + flat_valid_flags.shape[0]) + inside_flags = anchor_inside_flags(flat_anchors, flat_valid_flags, + img_meta['img_shape'], + allowed_border) + outside_flags = ~inside_flags + flat_assigned_gt_inds[outside_flags] = -1 + + assigned_labels = torch.zeros_like(flat_assigned_gt_inds) + pos_flags = flat_assigned_gt_inds > 0 + assigned_labels[pos_flags] = gt_labels[flat_assigned_gt_inds[pos_flags] + - 1] + + return AssignResult( + num_gts=num_gts, + gt_inds=flat_assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) diff --git a/mmdetection/mmdet/models/task_modules/assigners/sim_ota_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/sim_ota_assigner.py new file mode 100644 index 0000000..d54a8b9 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/sim_ota_assigner.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + +INF = 100000.0 +EPS = 1.0e-7 + + +@TASK_UTILS.register_module() +class SimOTAAssigner(BaseAssigner): + """Computes matching between predictions and ground truth. 
+ + Args: + center_radius (float): Ground truth center size + to judge whether a prior is in center. Defaults to 2.5. + candidate_topk (int): The candidate top-k which used to + get top-k ious to calculate dynamic-k. Defaults to 10. + iou_weight (float): The scale factor for regression + iou cost. Defaults to 3.0. + cls_weight (float): The scale factor for classification + cost. Defaults to 1.0. + iou_calculator (ConfigType): Config of overlaps Calculator. + Defaults to dict(type='BboxOverlaps2D'). + """ + + def __init__(self, + center_radius: float = 2.5, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D')): + self.center_radius = center_radius + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to priors using SimOTA. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + Returns: + obj:`AssignResult`: The assigned result. 
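+
+        Example:
+            A minimal sketch with synthetic priors in (x, y, stride_x,
+            stride_y) format and random scores (the values are arbitrary
+            and only illustrate the expected shapes):
+
+            >>> import torch
+            >>> from mmengine.structures import InstanceData
+            >>> self = SimOTAAssigner(center_radius=2.5)
+            >>> pred_instances = InstanceData(
+            ...     priors=torch.Tensor([[30., 30., 8., 8.],
+            ...                          [64., 64., 8., 8.]]),
+            ...     bboxes=torch.Tensor([[23., 23., 43., 43.],
+            ...                          [60., 60., 70., 70.]]),
+            ...     scores=torch.rand(2, 4))
+            >>> gt_instances = InstanceData(
+            ...     bboxes=torch.Tensor([[22., 22., 44., 44.]]),
+            ...     labels=torch.LongTensor([0]))
+            >>> assign_result = self.assign(pred_instances, gt_instances)
+            >>> assign_result.gt_inds.shape
+            torch.Size([2])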
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_gt = gt_bboxes.size(0) + + decoded_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + priors = pred_instances.priors + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( + priors, gt_bboxes) + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + num_valid = valid_decoded_bbox.size(0) + if num_valid == 0: + # No valid bboxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) + iou_cost = -torch.log(pairwise_ious + EPS) + + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1) + # disable AMP autocast and calculate BCE with FP32 to avoid overflow + with torch.cuda.amp.autocast(enabled=False): + cls_cost = ( + F.binary_cross_entropy( + valid_pred_scores.to(dtype=torch.float32), + gt_onehot_label, + reduction='none', + ).sum(-1).to(dtype=valid_pred_scores.dtype)) + + cost_matrix = ( + cls_cost * self.cls_weight + iou_cost * self.iou_weight + + (~is_in_boxes_and_center) * INF) + + matched_pred_ious, matched_gt_inds = \ + self.dynamic_k_matching( + cost_matrix, pairwise_ious, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_ious + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + def get_in_gt_and_in_center_info( + self, priors: Tensor, gt_bboxes: Tensor) -> Tuple[Tensor, Tensor]: + """Get the information of which prior is in gt bboxes and gt center + priors.""" + num_gt = gt_bboxes.size(0) + + repeated_x = priors[:, 0].unsqueeze(1).repeat(1, num_gt) + repeated_y = priors[:, 1].unsqueeze(1).repeat(1, num_gt) + repeated_stride_x = priors[:, 2].unsqueeze(1).repeat(1, num_gt) + repeated_stride_y = priors[:, 3].unsqueeze(1).repeat(1, num_gt) + + # is prior centers in gt bboxes, shape: [n_prior, n_gt] + l_ = repeated_x - gt_bboxes[:, 0] + t_ = repeated_y - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - repeated_x + b_ = gt_bboxes[:, 3] - repeated_y + + deltas = torch.stack([l_, t_, r_, b_], dim=1) + is_in_gts = deltas.min(dim=1).values > 0 + is_in_gts_all = is_in_gts.sum(dim=1) > 0 + + # is prior centers in gt centers + gt_cxs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 + gt_cys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 + ct_box_l = gt_cxs - self.center_radius * repeated_stride_x + ct_box_t = 
gt_cys - self.center_radius * repeated_stride_y + ct_box_r = gt_cxs + self.center_radius * repeated_stride_x + ct_box_b = gt_cys + self.center_radius * repeated_stride_y + + cl_ = repeated_x - ct_box_l + ct_ = repeated_y - ct_box_t + cr_ = ct_box_r - repeated_x + cb_ = ct_box_b - repeated_y + + ct_deltas = torch.stack([cl_, ct_, cr_, cb_], dim=1) + is_in_cts = ct_deltas.min(dim=1).values > 0 + is_in_cts_all = is_in_cts.sum(dim=1) > 0 + + # in boxes or in centers, shape: [num_priors] + is_in_gts_or_centers = is_in_gts_all | is_in_cts_all + + # both in boxes and centers, shape: [num_fg, num_gt] + is_in_boxes_and_centers = ( + is_in_gts[is_in_gts_or_centers, :] + & is_in_cts[is_in_gts_or_centers, :]) + return is_in_gts_or_centers, is_in_boxes_and_centers + + def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + num_gt: int, + valid_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets.""" + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(1)[fg_mask_inboxes] + return matched_pred_ious, matched_gt_inds diff --git a/mmdetection/mmdet/models/task_modules/assigners/task_aligned_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/task_aligned_assigner.py new file mode 100644 index 0000000..220ea84 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/task_aligned_assigner.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + +INF = 100000000 + + +@TASK_UTILS.register_module() +class TaskAlignedAssigner(BaseAssigner): + """Task aligned assigner used in the paper: + `TOOD: Task-aligned One-stage Object Detection. + `_. + + Assign a corresponding gt bbox or background to each predicted bbox. + Each bbox will be assigned with `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + topk (int): number of bbox selected in each level + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. 
Defaults to ``dict(type='BboxOverlaps2D')`` + """ + + def __init__(self, + topk: int, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D')): + assert topk >= 1 + self.topk = topk + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + alpha: int = 1, + beta: int = 6) -> AssignResult: + """Assign gt to bboxes. + + The assignment is done in following steps + + 1. compute alignment metric between all bbox (bbox of all pyramid + levels) and gt + 2. select top-k bbox as candidates for each gt + 3. limit the positive sample's center in gt (because the anchor-free + detector only can predict positive distance) + + + Args: + pred_instances (:obj:`InstaceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors, points, or bboxes predicted by the model, + shape(n, 4). + gt_instances (:obj:`InstaceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + gt_instances_ignore (:obj:`InstaceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + alpha (int): Hyper-parameters related to alignment_metrics. + Defaults to 1. + beta (int): Hyper-parameters related to alignment_metrics. + Defaults to 6. + + Returns: + :obj:`TaskAlignedAssignResult`: The assign result. + """ + priors = pred_instances.priors + decode_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + + priors = priors[:, :4] + num_gt, num_bboxes = gt_bboxes.size(0), priors.size(0) + # compute alignment metric between all bbox and gt + overlaps = self.iou_calculator(decode_bboxes, gt_bboxes).detach() + bbox_scores = pred_scores[:, gt_labels].detach() + # assign 0 by default + assigned_gt_inds = priors.new_full((num_bboxes, ), 0, dtype=torch.long) + assign_metrics = priors.new_zeros((num_bboxes, )) + + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = priors.new_zeros((num_bboxes, )) + if num_gt == 0: + # No gt boxes, assign everything to background + assigned_gt_inds[:] = 0 + assigned_labels = priors.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assign_result = AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + assign_result.assign_metrics = assign_metrics + return assign_result + + # select top-k bboxes as candidates for each gt + alignment_metrics = bbox_scores**alpha * overlaps**beta + topk = min(self.topk, alignment_metrics.size(0)) + _, candidate_idxs = alignment_metrics.topk(topk, dim=0, largest=True) + candidate_metrics = alignment_metrics[candidate_idxs, + torch.arange(num_gt)] + is_pos = candidate_metrics > 0 + + # limit the positive sample's center in gt + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + for gt_idx in range(num_gt): + candidate_idxs[:, gt_idx] += gt_idx * num_bboxes + ep_priors_cx = priors_cx.view(1, -1).expand( + num_gt, num_bboxes).contiguous().view(-1) + ep_priors_cy = priors_cy.view(1, -1).expand( + num_gt, num_bboxes).contiguous().view(-1) + candidate_idxs = candidate_idxs.view(-1) + + # calculate the left, top, right, bottom distance between positive + # bbox center and gt side + l_ = ep_priors_cx[candidate_idxs].view(-1, num_gt) - 
gt_bboxes[:, 0] + t_ = ep_priors_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - ep_priors_cx[candidate_idxs].view(-1, num_gt) + b_ = gt_bboxes[:, 3] - ep_priors_cy[candidate_idxs].view(-1, num_gt) + is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01 + is_pos = is_pos & is_in_gts + + # if an anchor box is assigned to multiple gts, + # the one with the highest iou will be selected. + overlaps_inf = torch.full_like(overlaps, + -INF).t().contiguous().view(-1) + index = candidate_idxs.view(-1)[is_pos.view(-1)] + overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index] + overlaps_inf = overlaps_inf.view(num_gt, -1).t() + + max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1) + assigned_gt_inds[ + max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1 + assign_metrics[max_overlaps != -INF] = alignment_metrics[ + max_overlaps != -INF, argmax_overlaps[max_overlaps != -INF]] + + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + assign_result = AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + assign_result.assign_metrics = assign_metrics + return assign_result diff --git a/mmdetection/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py new file mode 100644 index 0000000..e48f092 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py @@ -0,0 +1,182 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.structures import BaseDataElement +from scipy.optimize import linear_sum_assignment + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .task_aligned_assigner import TaskAlignedAssigner + + +@TASK_UTILS.register_module() +class TopkHungarianAssigner(TaskAlignedAssigner): + """Computes 1-to-k matching between ground truth and predictions. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of some components. + For DETR the costs are weighted sum of classification cost, regression L1 + cost and regression iou cost. The targets don't include the no_object, so + generally there are more predictions than targets. After the 1-to-k + gt-pred matching, the un-matched are treated as backgrounds. Thus each + query prediction will be assigned with `0` or a positive integer + indicating the ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + cls_cost (dict): Classification cost configuration. + reg_cost (dict): Regression L1 cost configuration. + iou_cost (dict): Regression iou cost configuration. 
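The repeat-and-match trick described above is easy to check in isolation. Below is a minimal sketch, not part of the patch, using a toy random cost matrix (the sizes and values are assumptions for illustration only): repeating each gt column `topk` times turns ordinary 1-to-1 Hungarian matching into 1-to-k matching, and integer division recovers the gt index, mirroring the `matched_col_inds // topk` step in the implementation.

    import torch
    from scipy.optimize import linear_sum_assignment

    num_pred, num_gt, topk = 8, 2, 3
    cost = torch.rand(num_pred, num_gt)            # toy (num_pred, num_gt) cost

    # repeat every gt column topk times -> (num_pred, num_gt * topk)
    repeat_cost = cost[..., None].repeat(1, 1, topk).view(num_pred, -1)

    row_inds, col_inds = linear_sum_assignment(repeat_cost.numpy())
    pred_inds = torch.from_numpy(row_inds)         # matched prediction indices
    gt_inds = torch.from_numpy(col_inds) // topk   # original gt index per match

    # every gt ends up with topk distinct predictions
    print(gt_inds.bincount(minlength=num_gt))      # tensor([3, 3])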
+ """ + + def __init__(self, + *args, + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + **kwargs): + super(TopkHungarianAssigner, self).__init__(*args, **kwargs) + + self.cls_cost = TASK_UTILS.build(cls_cost) + self.reg_cost = TASK_UTILS.build(reg_cost) + self.iou_cost = TASK_UTILS.build(iou_cost) + + def assign(self, + pred_scores, + decode_bboxes, + gt_bboxes, + gt_labels, + img_meta, + alpha=1, + beta=6, + **kwargs): + """Computes 1-to-k gt-pred matching based on the weighted costs. + + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. Assign every prediction to -1. + 2. Compute the weighted costs, each cost has shape (num_pred, num_gt). + 3. Update topk to be min(topk, int(num_pred / num_gt)), then repeat + costs topk times to shape: (num_pred, num_gt * topk), so that each + gt will match topk predictions. + 3. Do Hungarian matching on CPU based on the costs. + 4. Assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + 5. Calculate alignment metrics and overlaps of each matched pred-gt + pair. + + Args: + pred_scores (Tensor): Predicted normalized classification + scores for one image, has shape (num_dense_queries, + cls_out_channels). + decode_bboxes (Tensor): Predicted unnormalized bbox coordinates + for one image, has shape (num_dense_queries, 4) with the + last dimension arranged as (x1, y1, x2, y2). + gt_bboxes (Tensor): Unnormalized ground truth + bboxes for one image, has shape (num_gt, 4) with the + last dimension arranged as (x1, y1, x2, y2). + NOTE: num_gt is dynamic for each image. + gt_labels (Tensor): Ground truth classification + index for the image, has shape (num_gt,). + NOTE: num_gt is dynamic for each image. + img_meta (dict): Meta information for one image. + alpha (int): Hyper-parameters related to alignment_metrics. + Defaults to 1. + beta (int): Hyper-parameters related to alignment_metrics. + Defaults to 6. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + pred_scores = pred_scores.detach() + decode_bboxes = decode_bboxes.detach() + temp_overlaps = self.iou_calculator(decode_bboxes, gt_bboxes).detach() + bbox_scores = pred_scores[:, gt_labels].detach() + alignment_metrics = bbox_scores**alpha * temp_overlaps**beta + + pred_instances = BaseDataElement() + gt_instances = BaseDataElement() + + pred_instances.bboxes = decode_bboxes + gt_instances.bboxes = gt_bboxes + + pred_instances.scores = pred_scores + gt_instances.labels = gt_labels + + reg_cost = self.reg_cost(pred_instances, gt_instances, img_meta) + iou_cost = self.iou_cost(pred_instances, gt_instances, img_meta) + cls_cost = self.cls_cost(pred_instances, gt_instances, img_meta) + all_cost = cls_cost + reg_cost + iou_cost + + num_gt, num_bboxes = gt_bboxes.size(0), pred_scores.size(0) + if num_gt > 0: + # assign 0 by default + assigned_gt_inds = pred_scores.new_full((num_bboxes, ), + 0, + dtype=torch.long) + select_cost = all_cost + + topk = min(self.topk, int(len(select_cost) / num_gt)) + + # Repeat the ground truth `topk` times to perform 1-to-k gt-pred + # matching. 
For example, if `num_pred` = 900, `num_gt` = 3, then + # there are only 3 gt-pred pairs in sum for 1-1 matching. + # However, for 1-k gt-pred matching, if `topk` = 4, then each + # gt is assigned 4 unique predictions, so there would be 12 + # gt-pred pairs in sum. + repeat_select_cost = select_cost[..., + None].repeat(1, 1, topk).view( + select_cost.size(0), -1) + # anchor index and gt index + matched_row_inds, matched_col_inds = linear_sum_assignment( + repeat_select_cost.detach().cpu().numpy()) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + pred_scores.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + pred_scores.device) + + match_gt_ids = matched_col_inds // topk + candidate_idxs = matched_row_inds + + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + + if candidate_idxs.numel() > 0: + assigned_labels[candidate_idxs] = gt_labels[match_gt_ids] + else: + assigned_labels = None + + assigned_gt_inds[candidate_idxs] = match_gt_ids + 1 + + overlaps = self.iou_calculator( + decode_bboxes[candidate_idxs], + gt_bboxes[match_gt_ids], + is_aligned=True).detach() + + temp_pos_alignment_metrics = alignment_metrics[candidate_idxs] + pos_alignment_metrics = torch.gather(temp_pos_alignment_metrics, 1, + match_gt_ids[:, + None]).view(-1) + assign_result = AssignResult( + num_gt, assigned_gt_inds, overlaps, labels=assigned_labels) + + assign_result.assign_metrics = pos_alignment_metrics + return assign_result + else: + + assigned_gt_inds = pred_scores.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + assigned_labels = pred_scores.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + assigned_gt_inds[:] = 0 + return AssignResult( + 0, assigned_gt_inds, None, labels=assigned_labels) diff --git a/mmdetection/mmdet/models/task_modules/assigners/uniform_assigner.py b/mmdetection/mmdet/models/task_modules/assigners/uniform_assigner.py new file mode 100644 index 0000000..9a83bfd --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/assigners/uniform_assigner.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@TASK_UTILS.register_module() +class UniformAssigner(BaseAssigner): + """Uniform Matching between the priors and gt boxes, which can achieve + balance in positive priors, and gt_bboxes_ignore was not considered for + now. + + Args: + pos_ignore_thr (float): the threshold to ignore positive priors + neg_ignore_thr (float): the threshold to ignore negative priors + match_times(int): Number of positive priors for each gt box. + Defaults to 4. + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. Defaults to ``dict(type='BboxOverlaps2D')`` + """ + + def __init__(self, + pos_ignore_thr: float, + neg_ignore_thr: float, + match_times: int = 4, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D')): + self.match_times = match_times + self.pos_ignore_thr = pos_ignore_thr + self.neg_ignore_thr = neg_ignore_thr + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign( + self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None + ) -> AssignResult: + """Assign gt to priors. + + The assignment is done in following steps + + 1. 
assign -1 by default + 2. compute the L1 cost between boxes. Note that we use priors and + predict boxes both + 3. compute the ignore indexes use gt_bboxes and predict boxes + 4. compute the ignore indexes of positive sample use priors and + predict boxes + + + Args: + pred_instances (:obj:`InstaceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be priors, points, or bboxes predicted by the model, + shape(n, 4). + gt_instances (:obj:`InstaceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + gt_instances_ignore (:obj:`InstaceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + """ + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + priors = pred_instances.priors + bbox_pred = pred_instances.decoder_priors + + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + 0, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + assign_result = AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + assign_result.set_extra_property( + 'pos_idx', bbox_pred.new_empty(0, dtype=torch.bool)) + assign_result.set_extra_property('pos_predicted_boxes', + bbox_pred.new_empty((0, 4))) + assign_result.set_extra_property('target_boxes', + bbox_pred.new_empty((0, 4))) + return assign_result + + # 2. Compute the L1 cost between boxes + # Note that we use priors and predict boxes both + cost_bbox = torch.cdist( + bbox_xyxy_to_cxcywh(bbox_pred), + bbox_xyxy_to_cxcywh(gt_bboxes), + p=1) + cost_bbox_priors = torch.cdist( + bbox_xyxy_to_cxcywh(priors), bbox_xyxy_to_cxcywh(gt_bboxes), p=1) + + # We found that topk function has different results in cpu and + # cuda mode. In order to ensure consistency with the source code, + # we also use cpu mode. + # TODO: Check whether the performance of cpu and cuda are the same. + C = cost_bbox.cpu() + C1 = cost_bbox_priors.cpu() + + # self.match_times x n + index = torch.topk( + C, # c=b,n,x c[i]=n,x + k=self.match_times, + dim=0, + largest=False)[1] + + # self.match_times x n + index1 = torch.topk(C1, k=self.match_times, dim=0, largest=False)[1] + # (self.match_times*2) x n + indexes = torch.cat((index, index1), + dim=1).reshape(-1).to(bbox_pred.device) + + pred_overlaps = self.iou_calculator(bbox_pred, gt_bboxes) + anchor_overlaps = self.iou_calculator(priors, gt_bboxes) + pred_max_overlaps, _ = pred_overlaps.max(dim=1) + anchor_max_overlaps, _ = anchor_overlaps.max(dim=0) + + # 3. Compute the ignore indexes use gt_bboxes and predict boxes + ignore_idx = pred_max_overlaps > self.neg_ignore_thr + assigned_gt_inds[ignore_idx] = -1 + + # 4. 
Compute the ignore indexes of positive sample use priors + # and predict boxes + pos_gt_index = torch.arange( + 0, C1.size(1), + device=bbox_pred.device).repeat(self.match_times * 2) + pos_ious = anchor_overlaps[indexes, pos_gt_index] + pos_ignore_idx = pos_ious < self.pos_ignore_thr + + pos_gt_index_with_ignore = pos_gt_index + 1 + pos_gt_index_with_ignore[pos_ignore_idx] = -1 + assigned_gt_inds[indexes] = pos_gt_index_with_ignore + + if gt_labels is not None: + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[ + assigned_gt_inds[pos_inds] - 1] + else: + assigned_labels = None + + assign_result = AssignResult( + num_gts, + assigned_gt_inds, + anchor_max_overlaps, + labels=assigned_labels) + assign_result.set_extra_property('pos_idx', ~pos_ignore_idx) + assign_result.set_extra_property('pos_predicted_boxes', + bbox_pred[indexes]) + assign_result.set_extra_property('target_boxes', + gt_bboxes[pos_gt_index]) + return assign_result diff --git a/mmdetection/mmdet/models/task_modules/builder.py b/mmdetection/mmdet/models/task_modules/builder.py new file mode 100644 index 0000000..6736049 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/builder.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmdet.registry import TASK_UTILS + +PRIOR_GENERATORS = TASK_UTILS +ANCHOR_GENERATORS = TASK_UTILS +BBOX_ASSIGNERS = TASK_UTILS +BBOX_SAMPLERS = TASK_UTILS +BBOX_CODERS = TASK_UTILS +MATCH_COSTS = TASK_UTILS +IOU_CALCULATORS = TASK_UTILS + + +def build_bbox_coder(cfg, **default_args): + """Builder of box coder.""" + warnings.warn('``build_sampler`` would be deprecated soon, please use ' + '``mmdet.registry.TASK_UTILS.build()`` ') + return TASK_UTILS.build(cfg, default_args=default_args) + + +def build_iou_calculator(cfg, default_args=None): + """Builder of IoU calculator.""" + warnings.warn( + '``build_iou_calculator`` would be deprecated soon, please use ' + '``mmdet.registry.TASK_UTILS.build()`` ') + return TASK_UTILS.build(cfg, default_args=default_args) + + +def build_match_cost(cfg, default_args=None): + """Builder of IoU calculator.""" + warnings.warn('``build_match_cost`` would be deprecated soon, please use ' + '``mmdet.registry.TASK_UTILS.build()`` ') + return TASK_UTILS.build(cfg, default_args=default_args) + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + warnings.warn('``build_assigner`` would be deprecated soon, please use ' + '``mmdet.registry.TASK_UTILS.build()`` ') + return TASK_UTILS.build(cfg, default_args=default_args) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + warnings.warn('``build_sampler`` would be deprecated soon, please use ' + '``mmdet.registry.TASK_UTILS.build()`` ') + return TASK_UTILS.build(cfg, default_args=default_args) + + +def build_prior_generator(cfg, default_args=None): + warnings.warn( + '``build_prior_generator`` would be deprecated soon, please use ' + '``mmdet.registry.TASK_UTILS.build()`` ') + return TASK_UTILS.build(cfg, default_args=default_args) + + +def build_anchor_generator(cfg, default_args=None): + warnings.warn( + '``build_anchor_generator`` would be deprecated soon, please use ' + '``mmdet.registry.TASK_UTILS.build()`` ') + return TASK_UTILS.build(cfg, default_args=default_args) diff --git a/mmdetection/mmdet/models/task_modules/coders/__init__.py 
b/mmdetection/mmdet/models/task_modules/coders/__init__.py new file mode 100644 index 0000000..97c3982 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_bbox_coder import BaseBBoxCoder +from .bucketing_bbox_coder import BucketingBBoxCoder +from .delta_xywh_bbox_coder import (DeltaXYWHBBoxCoder, + DeltaXYWHBBoxCoderForGLIP) +from .distance_point_bbox_coder import DistancePointBBoxCoder +from .legacy_delta_xywh_bbox_coder import LegacyDeltaXYWHBBoxCoder +from .pseudo_bbox_coder import PseudoBBoxCoder +from .tblr_bbox_coder import TBLRBBoxCoder +from .yolo_bbox_coder import YOLOBBoxCoder + +__all__ = [ + 'BaseBBoxCoder', 'PseudoBBoxCoder', 'DeltaXYWHBBoxCoder', + 'LegacyDeltaXYWHBBoxCoder', 'TBLRBBoxCoder', 'YOLOBBoxCoder', + 'BucketingBBoxCoder', 'DistancePointBBoxCoder', 'DeltaXYWHBBoxCoderForGLIP' +] diff --git a/mmdetection/mmdet/models/task_modules/coders/base_bbox_coder.py b/mmdetection/mmdet/models/task_modules/coders/base_bbox_coder.py new file mode 100644 index 0000000..806d265 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/base_bbox_coder.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + + +class BaseBBoxCoder(metaclass=ABCMeta): + """Base bounding box coder. + + Args: + use_box_type (bool): Whether to warp decoded boxes with the + box type data structure. Defaults to False. + """ + + # The size of the last of dimension of the encoded tensor. + encode_size = 4 + + def __init__(self, use_box_type: bool = False, **kwargs): + self.use_box_type = use_box_type + + @abstractmethod + def encode(self, bboxes, gt_bboxes): + """Encode deltas between bboxes and ground truth boxes.""" + + @abstractmethod + def decode(self, bboxes, bboxes_pred): + """Decode the predicted bboxes according to prediction and base + boxes.""" diff --git a/mmdetection/mmdet/models/task_modules/coders/bucketing_bbox_coder.py b/mmdetection/mmdet/models/task_modules/coders/bucketing_bbox_coder.py new file mode 100644 index 0000000..4044e1c --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/bucketing_bbox_coder.py @@ -0,0 +1,366 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import (BaseBoxes, HorizontalBoxes, bbox_rescale, + get_box_tensor) +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class BucketingBBoxCoder(BaseBBoxCoder): + """Bucketing BBox Coder for Side-Aware Boundary Localization (SABL). + + Boundary Localization with Bucketing and Bucketing Guided Rescoring + are implemented here. + + Please refer to https://arxiv.org/abs/1912.04260 for more details. + + Args: + num_buckets (int): Number of buckets. + scale_factor (int): Scale factor of proposals to generate buckets. + offset_topk (int): Topk buckets are used to generate + bucket fine regression targets. Defaults to 2. + offset_upperbound (float): Offset upperbound to generate + bucket fine regression targets. + To avoid too large offset displacements. Defaults to 1.0. + cls_ignore_neighbor (bool): Ignore second nearest bucket or Not. + Defaults to True. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. 
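A minimal usage sketch may help here. It instantiates the coder through the public class exported by this package (the box values, `num_buckets=8`, and `scale_factor=3.0` are arbitrary example values, not recommended settings) and shows the encode/decode round trip and the `(n, num_buckets * 2)` target shapes documented below.

    import torch
    from mmdet.models.task_modules.coders import BucketingBBoxCoder

    coder = BucketingBBoxCoder(num_buckets=8, scale_factor=3.0)
    proposals = torch.tensor([[10., 10., 60., 80.]])
    gts = torch.tensor([[12., 14., 58., 76.]])

    # encode -> (offsets, offset_weights, bucket_labels, bucket_cls_weights),
    # each of shape (1, num_buckets * 2) == (1, 16)
    offsets, offset_weights, bucket_labels, cls_weights = coder.encode(
        proposals, gts)

    # feeding the targets back through decode recovers the gt boxes (up to
    # floating point error), since the labelled bucket plus its offset
    # exactly locate each gt side
    bboxes, loc_confidence = coder.decode(
        proposals, (bucket_labels, offsets), max_shape=(100, 100))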
+ """ + + def __init__(self, + num_buckets: int, + scale_factor: int, + offset_topk: int = 2, + offset_upperbound: float = 1.0, + cls_ignore_neighbor: bool = True, + clip_border: bool = True, + **kwargs) -> None: + super().__init__(**kwargs) + self.num_buckets = num_buckets + self.scale_factor = scale_factor + self.offset_topk = offset_topk + self.offset_upperbound = offset_upperbound + self.cls_ignore_neighbor = cls_ignore_neighbor + self.clip_border = clip_border + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes]) -> Tuple[Tensor]: + """Get bucketing estimation and fine regression targets during + training. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): target of the + transformation, e.g., ground truth boxes. + + Returns: + encoded_bboxes(tuple[Tensor]): bucketing estimation + and fine regression targets and weights + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bbox2bucket(bboxes, gt_bboxes, self.num_buckets, + self.scale_factor, self.offset_topk, + self.offset_upperbound, + self.cls_ignore_neighbor) + return encoded_bboxes + + def decode( + self, + bboxes: Union[Tensor, BaseBoxes], + pred_bboxes: Tensor, + max_shape: Optional[Tuple[int]] = None + ) -> Tuple[Union[Tensor, BaseBoxes], Tensor]: + """Apply transformation `pred_bboxes` to `boxes`. + Args: + boxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. + pred_bboxes (torch.Tensor): Predictions for bucketing estimation + and fine regression + max_shape (tuple[int], optional): Maximum shape of boxes. + Defaults to None. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. + """ + bboxes = get_box_tensor(bboxes) + assert len(pred_bboxes) == 2 + cls_preds, offset_preds = pred_bboxes + assert cls_preds.size(0) == bboxes.size(0) and offset_preds.size( + 0) == bboxes.size(0) + bboxes, loc_confidence = bucket2bbox(bboxes, cls_preds, offset_preds, + self.num_buckets, + self.scale_factor, max_shape, + self.clip_border) + if self.use_box_type: + bboxes = HorizontalBoxes(bboxes, clone=False) + return bboxes, loc_confidence + + +def generat_buckets(proposals: Tensor, + num_buckets: int, + scale_factor: float = 1.0) -> Tuple[Tensor]: + """Generate buckets w.r.t bucket number and scale factor of proposals. + + Args: + proposals (Tensor): Shape (n, 4) + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + + Returns: + tuple[Tensor]: (bucket_w, bucket_h, l_buckets, r_buckets, + t_buckets, d_buckets) + + - bucket_w: Width of buckets on x-axis. Shape (n, ). + - bucket_h: Height of buckets on y-axis. Shape (n, ). + - l_buckets: Left buckets. Shape (n, ceil(side_num/2)). + - r_buckets: Right buckets. Shape (n, ceil(side_num/2)). + - t_buckets: Top buckets. Shape (n, ceil(side_num/2)). + - d_buckets: Down buckets. Shape (n, ceil(side_num/2)). 
+ """ + proposals = bbox_rescale(proposals, scale_factor) + + # number of buckets in each side + side_num = int(np.ceil(num_buckets / 2.0)) + pw = proposals[..., 2] - proposals[..., 0] + ph = proposals[..., 3] - proposals[..., 1] + px1 = proposals[..., 0] + py1 = proposals[..., 1] + px2 = proposals[..., 2] + py2 = proposals[..., 3] + + bucket_w = pw / num_buckets + bucket_h = ph / num_buckets + + # left buckets + l_buckets = px1[:, None] + (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_w[:, None] + # right buckets + r_buckets = px2[:, None] - (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_w[:, None] + # top buckets + t_buckets = py1[:, None] + (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_h[:, None] + # down buckets + d_buckets = py2[:, None] - (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_h[:, None] + return bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, d_buckets + + +def bbox2bucket(proposals: Tensor, + gt: Tensor, + num_buckets: int, + scale_factor: float, + offset_topk: int = 2, + offset_upperbound: float = 1.0, + cls_ignore_neighbor: bool = True) -> Tuple[Tensor]: + """Generate buckets estimation and fine regression targets. + + Args: + proposals (Tensor): Shape (n, 4) + gt (Tensor): Shape (n, 4) + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + offset_topk (int): Topk buckets are used to generate + bucket fine regression targets. Defaults to 2. + offset_upperbound (float): Offset allowance to generate + bucket fine regression targets. + To avoid too large offset displacements. Defaults to 1.0. + cls_ignore_neighbor (bool): Ignore second nearest bucket or Not. + Defaults to True. + + Returns: + tuple[Tensor]: (offsets, offsets_weights, bucket_labels, cls_weights). + + - offsets: Fine regression targets. \ + Shape (n, num_buckets*2). + - offsets_weights: Fine regression weights. \ + Shape (n, num_buckets*2). + - bucket_labels: Bucketing estimation labels. \ + Shape (n, num_buckets*2). + - cls_weights: Bucketing estimation weights. \ + Shape (n, num_buckets*2). 
+ """ + assert proposals.size() == gt.size() + + # generate buckets + proposals = proposals.float() + gt = gt.float() + (bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, + d_buckets) = generat_buckets(proposals, num_buckets, scale_factor) + + gx1 = gt[..., 0] + gy1 = gt[..., 1] + gx2 = gt[..., 2] + gy2 = gt[..., 3] + + # generate offset targets and weights + # offsets from buckets to gts + l_offsets = (l_buckets - gx1[:, None]) / bucket_w[:, None] + r_offsets = (r_buckets - gx2[:, None]) / bucket_w[:, None] + t_offsets = (t_buckets - gy1[:, None]) / bucket_h[:, None] + d_offsets = (d_buckets - gy2[:, None]) / bucket_h[:, None] + + # select top-k nearest buckets + l_topk, l_label = l_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + r_topk, r_label = r_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + t_topk, t_label = t_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + d_topk, d_label = d_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + + offset_l_weights = l_offsets.new_zeros(l_offsets.size()) + offset_r_weights = r_offsets.new_zeros(r_offsets.size()) + offset_t_weights = t_offsets.new_zeros(t_offsets.size()) + offset_d_weights = d_offsets.new_zeros(d_offsets.size()) + inds = torch.arange(0, proposals.size(0)).to(proposals).long() + + # generate offset weights of top-k nearest buckets + for k in range(offset_topk): + if k >= 1: + offset_l_weights[inds, l_label[:, + k]] = (l_topk[:, k] < + offset_upperbound).float() + offset_r_weights[inds, r_label[:, + k]] = (r_topk[:, k] < + offset_upperbound).float() + offset_t_weights[inds, t_label[:, + k]] = (t_topk[:, k] < + offset_upperbound).float() + offset_d_weights[inds, d_label[:, + k]] = (d_topk[:, k] < + offset_upperbound).float() + else: + offset_l_weights[inds, l_label[:, k]] = 1.0 + offset_r_weights[inds, r_label[:, k]] = 1.0 + offset_t_weights[inds, t_label[:, k]] = 1.0 + offset_d_weights[inds, d_label[:, k]] = 1.0 + + offsets = torch.cat([l_offsets, r_offsets, t_offsets, d_offsets], dim=-1) + offsets_weights = torch.cat([ + offset_l_weights, offset_r_weights, offset_t_weights, offset_d_weights + ], + dim=-1) + + # generate bucket labels and weight + side_num = int(np.ceil(num_buckets / 2.0)) + labels = torch.stack( + [l_label[:, 0], r_label[:, 0], t_label[:, 0], d_label[:, 0]], dim=-1) + + batch_size = labels.size(0) + bucket_labels = F.one_hot(labels.view(-1), side_num).view(batch_size, + -1).float() + bucket_cls_l_weights = (l_offsets.abs() < 1).float() + bucket_cls_r_weights = (r_offsets.abs() < 1).float() + bucket_cls_t_weights = (t_offsets.abs() < 1).float() + bucket_cls_d_weights = (d_offsets.abs() < 1).float() + bucket_cls_weights = torch.cat([ + bucket_cls_l_weights, bucket_cls_r_weights, bucket_cls_t_weights, + bucket_cls_d_weights + ], + dim=-1) + # ignore second nearest buckets for cls if necessary + if cls_ignore_neighbor: + bucket_cls_weights = (~((bucket_cls_weights == 1) & + (bucket_labels == 0))).float() + else: + bucket_cls_weights[:] = 1.0 + return offsets, offsets_weights, bucket_labels, bucket_cls_weights + + +def bucket2bbox(proposals: Tensor, + cls_preds: Tensor, + offset_preds: Tensor, + num_buckets: int, + scale_factor: float = 1.0, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + clip_border: bool = True) -> Tuple[Tensor]: + """Apply bucketing estimation (cls preds) and fine regression (offset + preds) to generate det bboxes. + + Args: + proposals (Tensor): Boxes to be transformed. 
Shape (n, 4) + cls_preds (Tensor): bucketing estimation. Shape (n, num_buckets*2). + offset_preds (Tensor): fine regression. Shape (n, num_buckets*2). + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W) + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + + Returns: + tuple[Tensor]: (bboxes, loc_confidence). + + - bboxes: predicted bboxes. Shape (n, 4) + - loc_confidence: localization confidence of predicted bboxes. + Shape (n,). + """ + + side_num = int(np.ceil(num_buckets / 2.0)) + cls_preds = cls_preds.view(-1, side_num) + offset_preds = offset_preds.view(-1, side_num) + + scores = F.softmax(cls_preds, dim=1) + score_topk, score_label = scores.topk(2, dim=1, largest=True, sorted=True) + + rescaled_proposals = bbox_rescale(proposals, scale_factor) + + pw = rescaled_proposals[..., 2] - rescaled_proposals[..., 0] + ph = rescaled_proposals[..., 3] - rescaled_proposals[..., 1] + px1 = rescaled_proposals[..., 0] + py1 = rescaled_proposals[..., 1] + px2 = rescaled_proposals[..., 2] + py2 = rescaled_proposals[..., 3] + + bucket_w = pw / num_buckets + bucket_h = ph / num_buckets + + score_inds_l = score_label[0::4, 0] + score_inds_r = score_label[1::4, 0] + score_inds_t = score_label[2::4, 0] + score_inds_d = score_label[3::4, 0] + l_buckets = px1 + (0.5 + score_inds_l.float()) * bucket_w + r_buckets = px2 - (0.5 + score_inds_r.float()) * bucket_w + t_buckets = py1 + (0.5 + score_inds_t.float()) * bucket_h + d_buckets = py2 - (0.5 + score_inds_d.float()) * bucket_h + + offsets = offset_preds.view(-1, 4, side_num) + inds = torch.arange(proposals.size(0)).to(proposals).long() + l_offsets = offsets[:, 0, :][inds, score_inds_l] + r_offsets = offsets[:, 1, :][inds, score_inds_r] + t_offsets = offsets[:, 2, :][inds, score_inds_t] + d_offsets = offsets[:, 3, :][inds, score_inds_d] + + x1 = l_buckets - l_offsets * bucket_w + x2 = r_buckets - r_offsets * bucket_w + y1 = t_buckets - t_offsets * bucket_h + y2 = d_buckets - d_offsets * bucket_h + + if clip_border and max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1] - 1) + y1 = y1.clamp(min=0, max=max_shape[0] - 1) + x2 = x2.clamp(min=0, max=max_shape[1] - 1) + y2 = y2.clamp(min=0, max=max_shape[0] - 1) + bboxes = torch.cat([x1[:, None], y1[:, None], x2[:, None], y2[:, None]], + dim=-1) + + # bucketing guided rescoring + loc_confidence = score_topk[:, 0] + top2_neighbor_inds = (score_label[:, 0] - score_label[:, 1]).abs() == 1 + loc_confidence += score_topk[:, 1] * top2_neighbor_inds.float() + loc_confidence = loc_confidence.view(-1, 4).mean(dim=1) + + return bboxes, loc_confidence diff --git a/mmdetection/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py b/mmdetection/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py new file mode 100644 index 0000000..c2b60b5 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py @@ -0,0 +1,579 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class DeltaXYWHBBoxCoder(BaseBBoxCoder): + """Delta XYWH BBox coder. 
+ + Following the practice in `R-CNN `_, + this coder encodes bbox (x1, y1, x2, y2) into delta (dx, dy, dw, dh) and + decodes delta (dx, dy, dw, dh) back to original bbox (x1, y1, x2, y2). + + Args: + target_means (Sequence[float]): Denormalizing means of target for + delta coordinates + target_stds (Sequence[float]): Denormalizing standard deviation of + target for delta coordinates + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + add_ctr_clamp (bool): Whether to add center clamp, when added, the + predicted box is clamped is its center is too far away from + the original anchor's center. Only used by YOLOF. Default False. + ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. + Default 32. + """ + + def __init__(self, + target_means: Sequence[float] = (0., 0., 0., 0.), + target_stds: Sequence[float] = (1., 1., 1., 1.), + clip_border: bool = True, + add_ctr_clamp: bool = False, + ctr_clamp: int = 32, + **kwargs) -> None: + super().__init__(**kwargs) + self.means = target_means + self.stds = target_stds + self.clip_border = clip_border + self.add_ctr_clamp = add_ctr_clamp + self.ctr_clamp = ctr_clamp + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor: + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the + transformation, e.g., ground-truth boxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds) + return encoded_bboxes + + def decode( + self, + bboxes: Union[Tensor, BaseBoxes], + pred_bboxes: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: Optional[float] = 16 / 1000 + ) -> Union[Tensor, BaseBoxes]: + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape + (B, N, 4) or (N, 4) + pred_bboxes (Tensor): Encoded offsets with respect to each roi. + Has shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H + when rois is a grid of anchors.Offset encoding follows [1]_. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + wh_ratio_clip (float, optional): The allowed ratio between + width and height. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. 
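A short round-trip sketch (arbitrary example boxes, default `target_means`/`target_stds`) illustrates that `decode` inverts `encode`:

    import torch
    from mmdet.models.task_modules.coders import DeltaXYWHBBoxCoder

    coder = DeltaXYWHBBoxCoder()
    rois = torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 30.]])
    gts = torch.tensor([[1., 1., 11., 12.], [4., 6., 22., 28.]])

    deltas = coder.encode(rois, gts)        # (dx, dy, dw, dh) per box
    decoded = coder.decode(rois, deltas)    # shifts/scales rois back onto gts
    assert torch.allclose(decoded, gts, atol=1e-4)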
+ """ + bboxes = get_box_tensor(bboxes) + assert pred_bboxes.size(0) == bboxes.size(0) + if pred_bboxes.ndim == 3: + assert pred_bboxes.size(1) == bboxes.size(1) + + if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export(): + # single image decode + decoded_bboxes = delta2bbox(bboxes, pred_bboxes, self.means, + self.stds, max_shape, wh_ratio_clip, + self.clip_border, self.add_ctr_clamp, + self.ctr_clamp) + else: + if pred_bboxes.ndim == 3 and not torch.onnx.is_in_onnx_export(): + warnings.warn( + 'DeprecationWarning: onnx_delta2bbox is deprecated ' + 'in the case of batch decoding and non-ONNX, ' + 'please use “delta2bbox” instead. In order to improve ' + 'the decoding speed, the batch function will no ' + 'longer be supported. ') + decoded_bboxes = onnx_delta2bbox(bboxes, pred_bboxes, self.means, + self.stds, max_shape, + wh_ratio_clip, self.clip_border, + self.add_ctr_clamp, + self.ctr_clamp) + + if self.use_box_type: + assert decoded_bboxes.size(-1) == 4, \ + ('Cannot warp decoded boxes with box type when decoded boxes' + 'have shape of (N, num_classes * 4)') + decoded_bboxes = HorizontalBoxes(decoded_bboxes) + return decoded_bboxes + + +@TASK_UTILS.register_module() +class DeltaXYWHBBoxCoderForGLIP(DeltaXYWHBBoxCoder): + """This is designed specifically for the GLIP algorithm. + + In order to completely match the official performance, we need to perform + special calculations in the encoding and decoding processes, such as + additional +1 and -1 calculations. However, this is not a user-friendly + design. + """ + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor: + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the + transformation, e.g., ground-truth boxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds) + return encoded_bboxes + + def decode( + self, + bboxes: Union[Tensor, BaseBoxes], + pred_bboxes: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: Optional[float] = 16 / 1000 + ) -> Union[Tensor, BaseBoxes]: + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape + (B, N, 4) or (N, 4) + pred_bboxes (Tensor): Encoded offsets with respect to each roi. + Has shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H + when rois is a grid of anchors.Offset encoding follows [1]_. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + wh_ratio_clip (float, optional): The allowed ratio between + width and height. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. 
+ """ + bboxes = get_box_tensor(bboxes) + assert pred_bboxes.size(0) == bboxes.size(0) + if pred_bboxes.ndim == 3: + assert pred_bboxes.size(1) == bboxes.size(1) + + if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export(): + # single image decode + decoded_bboxes = delta2bbox_glip(bboxes, pred_bboxes, self.means, + self.stds, max_shape, + wh_ratio_clip, self.clip_border, + self.add_ctr_clamp, + self.ctr_clamp) + else: + raise NotImplementedError() + + if self.use_box_type: + assert decoded_bboxes.size(-1) == 4, \ + ('Cannot warp decoded boxes with box type when decoded boxes' + 'have shape of (N, num_classes * 4)') + decoded_bboxes = HorizontalBoxes(decoded_bboxes) + return decoded_bboxes + + +def bbox2delta( + proposals: Tensor, + gt: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.) +) -> Tensor: + """Compute deltas of proposals w.r.t. gt. + + We usually compute the deltas of x, y, w, h of proposals w.r.t ground + truth bboxes to get regression target. + This is the inverse function of :func:`delta2bbox`. + + Args: + proposals (Tensor): Boxes to be transformed, shape (N, ..., 4) + gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4) + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + + Returns: + Tensor: deltas with shape (N, 4), where columns represent dx, dy, + dw, dh. + """ + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + px = (proposals[..., 0] + proposals[..., 2]) * 0.5 + py = (proposals[..., 1] + proposals[..., 3]) * 0.5 + pw = proposals[..., 2] - proposals[..., 0] + ph = proposals[..., 3] - proposals[..., 1] + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + gh = gt[..., 3] - gt[..., 1] + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dx, dy, dw, dh], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def delta2bbox(rois: Tensor, + deltas: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.), + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: float = 16 / 1000, + clip_border: bool = True, + add_ctr_clamp: bool = False, + ctr_clamp: int = 32) -> Tensor: + """Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of :func:`bbox2delta`. + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4). + deltas (Tensor): Encoded offsets relative to each roi. + Has shape (N, num_classes * 4) or (N, 4). Note + N = num_base_anchors * W * H, when rois is a grid of + anchors. Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates. + Default (0., 0., 0., 0.). + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates. Default (1., 1., 1., 1.). + max_shape (tuple[int, int]): Maximum bounds for boxes, specifies + (H, W). Default None. + wh_ratio_clip (float): Maximum aspect ratio for boxes. Default + 16 / 1000. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Default True. 
+ add_ctr_clamp (bool): Whether to add center clamp. When set to True, + the center of the prediction bounding box will be clamped to + avoid being too far away from the center of the anchor. + Only used by YOLOF. Default False. + ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. + Default 32. + + Returns: + Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4 + represent tl_x, tl_y, br_x, br_y. + + References: + .. [1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3)) + tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.1409, 0.1409, 2.8591, 2.8591], + [0.0000, 0.3161, 4.1945, 0.6839], + [5.0000, 5.0000, 5.0000, 5.0000]]) + """ + num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4 + if num_bboxes == 0: + return deltas + + deltas = deltas.reshape(-1, 4) + + means = deltas.new_tensor(means).view(1, -1) + stds = deltas.new_tensor(stds).view(1, -1) + denorm_deltas = deltas * stds + means + + dxy = denorm_deltas[:, :2] + dwh = denorm_deltas[:, 2:] + + # Compute width/height of each roi + rois_ = rois.repeat(1, num_classes).reshape(-1, 4) + pxy = ((rois_[:, :2] + rois_[:, 2:]) * 0.5) + pwh = (rois_[:, 2:] - rois_[:, :2]) + + dxy_wh = pwh * dxy + + max_ratio = np.abs(np.log(wh_ratio_clip)) + if add_ctr_clamp: + dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp) + dwh = torch.clamp(dwh, max=max_ratio) + else: + dwh = dwh.clamp(min=-max_ratio, max=max_ratio) + + gxy = pxy + dxy_wh + gwh = pwh * dwh.exp() + x1y1 = gxy - (gwh * 0.5) + x2y2 = gxy + (gwh * 0.5) + bboxes = torch.cat([x1y1, x2y2], dim=-1) + if clip_border and max_shape is not None: + bboxes[..., 0::2].clamp_(min=0, max=max_shape[1]) + bboxes[..., 1::2].clamp_(min=0, max=max_shape[0]) + bboxes = bboxes.reshape(num_bboxes, -1) + return bboxes + + +def onnx_delta2bbox(rois: Tensor, + deltas: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.), + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: float = 16 / 1000, + clip_border: Optional[bool] = True, + add_ctr_clamp: bool = False, + ctr_clamp: int = 32) -> Tensor: + """Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of :func:`bbox2delta`. + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4) or (B, N, 4) + deltas (Tensor): Encoded offsets with respect to each roi. + Has shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H + when rois is a grid of anchors.Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates. + Default (0., 0., 0., 0.). + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates. Default (1., 1., 1., 1.). + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If rois shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. Default None. + wh_ratio_clip (float): Maximum aspect ratio for boxes. 
+ Default 16 / 1000. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Default True. + add_ctr_clamp (bool): Whether to add center clamp, when added, the + predicted box is clamped is its center is too far away from + the original anchor's center. Only used by YOLOF. Default False. + ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. + Default 32. + + Returns: + Tensor: Boxes with shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4), where 4 represent + tl_x, tl_y, br_x, br_y. + + References: + .. [1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3)) + tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.1409, 0.1409, 2.8591, 2.8591], + [0.0000, 0.3161, 4.1945, 0.6839], + [5.0000, 5.0000, 5.0000, 5.0000]]) + """ + means = deltas.new_tensor(means).view(1, + -1).repeat(1, + deltas.size(-1) // 4) + stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(-1) // 4) + denorm_deltas = deltas * stds + means + dx = denorm_deltas[..., 0::4] + dy = denorm_deltas[..., 1::4] + dw = denorm_deltas[..., 2::4] + dh = denorm_deltas[..., 3::4] + + x1, y1 = rois[..., 0], rois[..., 1] + x2, y2 = rois[..., 2], rois[..., 3] + # Compute center of each roi + px = ((x1 + x2) * 0.5).unsqueeze(-1).expand_as(dx) + py = ((y1 + y2) * 0.5).unsqueeze(-1).expand_as(dy) + # Compute width/height of each roi + pw = (x2 - x1).unsqueeze(-1).expand_as(dw) + ph = (y2 - y1).unsqueeze(-1).expand_as(dh) + + dx_width = pw * dx + dy_height = ph * dy + + max_ratio = np.abs(np.log(wh_ratio_clip)) + if add_ctr_clamp: + dx_width = torch.clamp(dx_width, max=ctr_clamp, min=-ctr_clamp) + dy_height = torch.clamp(dy_height, max=ctr_clamp, min=-ctr_clamp) + dw = torch.clamp(dw, max=max_ratio) + dh = torch.clamp(dh, max=max_ratio) + else: + dw = dw.clamp(min=-max_ratio, max=max_ratio) + dh = dh.clamp(min=-max_ratio, max=max_ratio) + # Use exp(network energy) to enlarge/shrink each roi + gw = pw * dw.exp() + gh = ph * dh.exp() + # Use network energy to shift the center of each roi + gx = px + dx_width + gy = py + dy_height + # Convert center-xy/width/height to top-left, bottom-right + x1 = gx - gw * 0.5 + y1 = gy - gh * 0.5 + x2 = gx + gw * 0.5 + y2 = gy + gh * 0.5 + + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size()) + + if clip_border and max_shape is not None: + # clip bboxes with dynamic `min` and `max` for onnx + if torch.onnx.is_in_onnx_export(): + from mmdet.core.export import dynamic_clip_for_onnx + x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size()) + return bboxes + if not isinstance(max_shape, torch.Tensor): + max_shape = x1.new_tensor(max_shape) + max_shape = max_shape[..., :2].type_as(x1) + if max_shape.ndim == 2: + assert bboxes.ndim == 3 + assert max_shape.size(0) == bboxes.size(0) + + min_xy = x1.new_tensor(0) + max_xy = torch.cat( + [max_shape] * (deltas.size(-1) // 2), + dim=-1).flip(-1).unsqueeze(-2) + bboxes = torch.where(bboxes < min_xy, min_xy, bboxes) + bboxes = torch.where(bboxes > max_xy, max_xy, bboxes) + + return bboxes + + +def delta2bbox_glip(rois: Tensor, + deltas: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: 
Sequence[float] = (1., 1., 1., 1.), + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: float = 16 / 1000, + clip_border: bool = True, + add_ctr_clamp: bool = False, + ctr_clamp: int = 32) -> Tensor: + """Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of :func:`bbox2delta`. + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4). + deltas (Tensor): Encoded offsets relative to each roi. + Has shape (N, num_classes * 4) or (N, 4). Note + N = num_base_anchors * W * H, when rois is a grid of + anchors. Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates. + Default (0., 0., 0., 0.). + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates. Default (1., 1., 1., 1.). + max_shape (tuple[int, int]): Maximum bounds for boxes, specifies + (H, W). Default None. + wh_ratio_clip (float): Maximum aspect ratio for boxes. Default + 16 / 1000. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Default True. + add_ctr_clamp (bool): Whether to add center clamp. When set to True, + the center of the prediction bounding box will be clamped to + avoid being too far away from the center of the anchor. + Only used by YOLOF. Default False. + ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. + Default 32. + + Returns: + Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4 + represent tl_x, tl_y, br_x, br_y. + """ + num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4 + if num_bboxes == 0: + return deltas + + deltas = deltas.reshape(-1, 4) + + means = deltas.new_tensor(means).view(1, -1) + stds = deltas.new_tensor(stds).view(1, -1) + denorm_deltas = deltas * stds + means + + dxy = denorm_deltas[:, :2] + dwh = denorm_deltas[:, 2:] + + # Compute width/height of each roi + rois_ = rois.repeat(1, num_classes).reshape(-1, 4) + pxy = ((rois_[:, :2] + rois_[:, 2:] - 1) * 0.5) # note + pwh = (rois_[:, 2:] - rois_[:, :2]) + + dxy_wh = pwh * dxy + + max_ratio = np.abs(np.log(wh_ratio_clip)) + if add_ctr_clamp: + dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp) + dwh = torch.clamp(dwh, max=max_ratio) + else: + dwh = dwh.clamp(min=-max_ratio, max=max_ratio) + + gxy = pxy + dxy_wh + gwh = pwh * dwh.exp() + + x1y1 = gxy - (gwh - 1) * 0.5 # Note + x2y2 = gxy + (gwh - 1) * 0.5 # Note + + bboxes = torch.cat([x1y1, x2y2], dim=-1) + + if clip_border and max_shape is not None: + bboxes[..., 0::2].clamp_(min=0, max=max_shape[1] - 1) # Note + bboxes[..., 1::2].clamp_(min=0, max=max_shape[0] - 1) # Note + bboxes = bboxes.reshape(num_bboxes, -1) + return bboxes diff --git a/mmdetection/mmdet/models/task_modules/coders/distance_point_bbox_coder.py b/mmdetection/mmdet/models/task_modules/coders/distance_point_bbox_coder.py new file mode 100644 index 0000000..ab26bf4 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/distance_point_bbox_coder.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Sequence, Union + +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import (BaseBoxes, HorizontalBoxes, bbox2distance, + distance2bbox, get_box_tensor) +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class DistancePointBBoxCoder(BaseBBoxCoder): + """Distance Point BBox coder. + + This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, + right) and decode it back to the original. + + Args: + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + """ + + def __init__(self, clip_border: Optional[bool] = True, **kwargs) -> None: + super().__init__(**kwargs) + self.clip_border = clip_border + + def encode(self, + points: Tensor, + gt_bboxes: Union[Tensor, BaseBoxes], + max_dis: Optional[float] = None, + eps: float = 0.1) -> Tensor: + """Encode bounding box to distances. + + Args: + points (Tensor): Shape (N, 2), The format is [x, y]. + gt_bboxes (Tensor or :obj:`BaseBoxes`): Shape (N, 4), The format + is "xyxy" + max_dis (float): Upper bound of the distance. Default None. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.1. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 4). + """ + gt_bboxes = get_box_tensor(gt_bboxes) + assert points.size(0) == gt_bboxes.size(0) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 4 + return bbox2distance(points, gt_bboxes, max_dis, eps) + + def decode( + self, + points: Tensor, + pred_bboxes: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None + ) -> Union[Tensor, BaseBoxes]: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) + or (N, 4) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. + Returns: + Union[Tensor, :obj:`BaseBoxes`]: Boxes with shape (N, 4) or + (B, N, 4) + """ + assert points.size(0) == pred_bboxes.size(0) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 4 + if self.clip_border is False: + max_shape = None + bboxes = distance2bbox(points, pred_bboxes, max_shape) + + if self.use_box_type: + bboxes = HorizontalBoxes(bboxes) + return bboxes diff --git a/mmdetection/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py b/mmdetection/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py new file mode 100644 index 0000000..9eb1bed --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py @@ -0,0 +1,235 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class LegacyDeltaXYWHBBoxCoder(BaseBBoxCoder): + """Legacy Delta XYWH BBox coder used in MMDet V1.x. 
+ + Following the practice in R-CNN [1]_, this coder encodes bbox (x1, y1, x2, + y2) into delta (dx, dy, dw, dh) and decodes delta (dx, dy, dw, dh) + back to original bbox (x1, y1, x2, y2). + + Note: + The main difference between :class`LegacyDeltaXYWHBBoxCoder` and + :class:`DeltaXYWHBBoxCoder` is whether ``+ 1`` is used during width and + height calculation. We suggest to only use this coder when testing with + MMDet V1.x models. + + References: + .. [1] https://arxiv.org/abs/1311.2524 + + Args: + target_means (Sequence[float]): denormalizing means of target for + delta coordinates + target_stds (Sequence[float]): denormalizing standard deviation of + target for delta coordinates + """ + + def __init__(self, + target_means: Sequence[float] = (0., 0., 0., 0.), + target_stds: Sequence[float] = (1., 1., 1., 1.), + **kwargs) -> None: + super().__init__(**kwargs) + self.means = target_means + self.stds = target_stds + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor: + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): target of the + transformation, e.g., ground-truth boxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = legacy_bbox2delta(bboxes, gt_bboxes, self.means, + self.stds) + return encoded_bboxes + + def decode( + self, + bboxes: Union[Tensor, BaseBoxes], + pred_bboxes: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: Optional[float] = 16 / 1000 + ) -> Union[Tensor, BaseBoxes]: + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + boxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. + pred_bboxes (torch.Tensor): Encoded boxes with shape + max_shape (tuple[int], optional): Maximum shape of boxes. + Defaults to None. + wh_ratio_clip (float, optional): The allowed ratio between + width and height. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. + """ + bboxes = get_box_tensor(bboxes) + assert pred_bboxes.size(0) == bboxes.size(0) + decoded_bboxes = legacy_delta2bbox(bboxes, pred_bboxes, self.means, + self.stds, max_shape, wh_ratio_clip) + + if self.use_box_type: + assert decoded_bboxes.size(-1) == 4, \ + ('Cannot warp decoded boxes with box type when decoded boxes' + 'have shape of (N, num_classes * 4)') + decoded_bboxes = HorizontalBoxes(decoded_bboxes) + return decoded_bboxes + + +def legacy_bbox2delta( + proposals: Tensor, + gt: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.) +) -> Tensor: + """Compute deltas of proposals w.r.t. gt in the MMDet V1.x manner. + + We usually compute the deltas of x, y, w, h of proposals w.r.t ground + truth bboxes to get regression target. 
+ This is the inverse function of `delta2bbox()` + + Args: + proposals (Tensor): Boxes to be transformed, shape (N, ..., 4) + gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4) + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + + Returns: + Tensor: deltas with shape (N, 4), where columns represent dx, dy, + dw, dh. + """ + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + px = (proposals[..., 0] + proposals[..., 2]) * 0.5 + py = (proposals[..., 1] + proposals[..., 3]) * 0.5 + pw = proposals[..., 2] - proposals[..., 0] + 1.0 + ph = proposals[..., 3] - proposals[..., 1] + 1.0 + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + 1.0 + gh = gt[..., 3] - gt[..., 1] + 1.0 + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dx, dy, dw, dh], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def legacy_delta2bbox(rois: Tensor, + deltas: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.), + max_shape: Optional[ + Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: float = 16 / 1000) -> Tensor: + """Apply deltas to shift/scale base boxes in the MMDet V1.x manner. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of `bbox2delta()` + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4) + deltas (Tensor): Encoded offsets with respect to each roi. + Has shape (N, 4 * num_classes). Note N = num_anchors * W * H when + rois is a grid of anchors. Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W) + wh_ratio_clip (float): Maximum aspect ratio for boxes. + + Returns: + Tensor: Boxes with shape (N, 4), where columns represent + tl_x, tl_y, br_x, br_y. + + References: + .. 
[1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> legacy_delta2bbox(rois, deltas, max_shape=(32, 32)) + tensor([[0.0000, 0.0000, 1.5000, 1.5000], + [0.0000, 0.0000, 5.2183, 5.2183], + [0.0000, 0.1321, 7.8891, 0.8679], + [5.3967, 2.4251, 6.0033, 3.7749]]) + """ + means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4) + stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4) + denorm_deltas = deltas * stds + means + dx = denorm_deltas[:, 0::4] + dy = denorm_deltas[:, 1::4] + dw = denorm_deltas[:, 2::4] + dh = denorm_deltas[:, 3::4] + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = dw.clamp(min=-max_ratio, max=max_ratio) + dh = dh.clamp(min=-max_ratio, max=max_ratio) + # Compute center of each roi + px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx) + py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy) + # Compute width/height of each roi + pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw) + ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh) + # Use exp(network energy) to enlarge/shrink each roi + gw = pw * dw.exp() + gh = ph * dh.exp() + # Use network energy to shift the center of each roi + gx = px + pw * dx + gy = py + ph * dy + # Convert center-xy/width/height to top-left, bottom-right + + # The true legacy box coder should +- 0.5 here. + # However, current implementation improves the performance when testing + # the models trained in MMDetection 1.X (~0.5 bbox AP, 0.2 mask AP) + x1 = gx - gw * 0.5 + y1 = gy - gh * 0.5 + x2 = gx + gw * 0.5 + y2 = gy + gh * 0.5 + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1] - 1) + y1 = y1.clamp(min=0, max=max_shape[0] - 1) + x2 = x2.clamp(min=0, max=max_shape[1] - 1) + y2 = y2.clamp(min=0, max=max_shape[0] - 1) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas) + return bboxes diff --git a/mmdetection/mmdet/models/task_modules/coders/pseudo_bbox_coder.py b/mmdetection/mmdet/models/task_modules/coders/pseudo_bbox_coder.py new file mode 100644 index 0000000..9ee7431 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/pseudo_bbox_coder.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
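The practical difference between the legacy functions above and the V2.0 delta coder is the "+ 1" in the width/height computation (plus the rounded base anchors of the legacy generator). A tiny sketch making the off-by-one explicit, with illustrative values only:

import torch

boxes = torch.tensor([[0., 0., 9., 19.]])

# MMDet V1.x (legacy) convention: both endpoint pixels are counted, hence + 1.
legacy_wh = boxes[:, 2:] - boxes[:, :2] + 1.0   # tensor([[10., 20.]])

# MMDet V2.x convention: plain coordinate difference.
v2_wh = boxes[:, 2:] - boxes[:, :2]             # tensor([[ 9., 19.]])

print(legacy_wh, v2_wh)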
+from typing import Union + +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class PseudoBBoxCoder(BaseBBoxCoder): + """Pseudo bounding box coder.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def encode(self, bboxes: Tensor, gt_bboxes: Union[Tensor, + BaseBoxes]) -> Tensor: + """torch.Tensor: return the given ``bboxes``""" + gt_bboxes = get_box_tensor(gt_bboxes) + return gt_bboxes + + def decode(self, bboxes: Tensor, pred_bboxes: Union[Tensor, + BaseBoxes]) -> Tensor: + """torch.Tensor: return the given ``pred_bboxes``""" + if self.use_box_type: + pred_bboxes = HorizontalBoxes(pred_bboxes) + return pred_bboxes diff --git a/mmdetection/mmdet/models/task_modules/coders/tblr_bbox_coder.py b/mmdetection/mmdet/models/task_modules/coders/tblr_bbox_coder.py new file mode 100644 index 0000000..74b388f --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/tblr_bbox_coder.py @@ -0,0 +1,228 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class TBLRBBoxCoder(BaseBBoxCoder): + """TBLR BBox coder. + + Following the practice in `FSAF `_, + this coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, + right) and decode it back to the original. + + Args: + normalizer (list | float): Normalization factor to be + divided with when coding the coordinates. If it is a list, it should + have length of 4 indicating normalization factor in tblr dims. + Otherwise it is a unified float factor for all dims. Default: 4.0 + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + """ + + def __init__(self, + normalizer: Union[Sequence[float], float] = 4.0, + clip_border: bool = True, + **kwargs) -> None: + super().__init__(**kwargs) + self.normalizer = normalizer + self.clip_border = clip_border + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor: + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes`` in the (top, left, + bottom, right) order. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): target of the + transformation, e.g., ground truth boxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bboxes2tblr( + bboxes, gt_bboxes, normalizer=self.normalizer) + return encoded_bboxes + + def decode( + self, + bboxes: Union[Tensor, BaseBoxes], + pred_bboxes: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None + ) -> Union[Tensor, BaseBoxes]: + """Apply transformation `pred_bboxes` to `boxes`. 
+ + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes.Shape + (B, N, 4) or (N, 4) + pred_bboxes (torch.Tensor): Encoded boxes with shape + (B, N, 4) or (N, 4) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. + """ + bboxes = get_box_tensor(bboxes) + decoded_bboxes = tblr2bboxes( + bboxes, + pred_bboxes, + normalizer=self.normalizer, + max_shape=max_shape, + clip_border=self.clip_border) + + if self.use_box_type: + decoded_bboxes = HorizontalBoxes(decoded_bboxes) + return decoded_bboxes + + +def bboxes2tblr(priors: Tensor, + gts: Tensor, + normalizer: Union[Sequence[float], float] = 4.0, + normalize_by_wh: bool = True) -> Tensor: + """Encode ground truth boxes to tblr coordinate. + + It first convert the gt coordinate to tblr format, + (top, bottom, left, right), relative to prior box centers. + The tblr coordinate may be normalized by the side length of prior bboxes + if `normalize_by_wh` is specified as True, and it is then normalized by + the `normalizer` factor. + + Args: + priors (Tensor): Prior boxes in point form + Shape: (num_proposals,4). + gts (Tensor): Coords of ground truth for each prior in point-form + Shape: (num_proposals, 4). + normalizer (Sequence[float] | float): normalization parameter of + encoded boxes. If it is a list, it has to have length = 4. + Default: 4.0 + normalize_by_wh (bool): Whether to normalize tblr coordinate by the + side length (wh) of prior bboxes. + + Return: + encoded boxes (Tensor), Shape: (num_proposals, 4) + """ + + # dist b/t match center and prior's center + if not isinstance(normalizer, float): + normalizer = torch.tensor(normalizer, device=priors.device) + assert len(normalizer) == 4, 'Normalizer must have length = 4' + assert priors.size(0) == gts.size(0) + prior_centers = (priors[:, 0:2] + priors[:, 2:4]) / 2 + xmin, ymin, xmax, ymax = gts.split(1, dim=1) + top = prior_centers[:, 1].unsqueeze(1) - ymin + bottom = ymax - prior_centers[:, 1].unsqueeze(1) + left = prior_centers[:, 0].unsqueeze(1) - xmin + right = xmax - prior_centers[:, 0].unsqueeze(1) + loc = torch.cat((top, bottom, left, right), dim=1) + if normalize_by_wh: + # Normalize tblr by anchor width and height + wh = priors[:, 2:4] - priors[:, 0:2] + w, h = torch.split(wh, 1, dim=1) + loc[:, :2] /= h # tb is normalized by h + loc[:, 2:] /= w # lr is normalized by w + # Normalize tblr by the given normalization factor + return loc / normalizer + + +def tblr2bboxes(priors: Tensor, + tblr: Tensor, + normalizer: Union[Sequence[float], float] = 4.0, + normalize_by_wh: bool = True, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + clip_border: bool = True) -> Tensor: + """Decode tblr outputs to prediction boxes. + + The process includes 3 steps: 1) De-normalize tblr coordinates by + multiplying it with `normalizer`; 2) De-normalize tblr coordinates by the + prior bbox width and height if `normalize_by_wh` is `True`; 3) Convert + tblr (top, bottom, left, right) pair relative to the center of priors back + to (xmin, ymin, xmax, ymax) coordinate. + + Args: + priors (Tensor): Prior boxes in point form (x0, y0, x1, y1) + Shape: (N,4) or (B, N, 4). + tblr (Tensor): Coords of network output in tblr form + Shape: (N, 4) or (B, N, 4). 
+ normalizer (Sequence[float] | float): Normalization parameter of + encoded boxes. By list, it represents the normalization factors at + tblr dims. By float, it is the unified normalization factor at all + dims. Default: 4.0 + normalize_by_wh (bool): Whether the tblr coordinates have been + normalized by the side length (wh) of prior bboxes. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + + Return: + encoded boxes (Tensor): Boxes with shape (N, 4) or (B, N, 4) + """ + if not isinstance(normalizer, float): + normalizer = torch.tensor(normalizer, device=priors.device) + assert len(normalizer) == 4, 'Normalizer must have length = 4' + assert priors.size(0) == tblr.size(0) + if priors.ndim == 3: + assert priors.size(1) == tblr.size(1) + + loc_decode = tblr * normalizer + prior_centers = (priors[..., 0:2] + priors[..., 2:4]) / 2 + if normalize_by_wh: + wh = priors[..., 2:4] - priors[..., 0:2] + w, h = torch.split(wh, 1, dim=-1) + # Inplace operation with slice would failed for exporting to ONNX + th = h * loc_decode[..., :2] # tb + tw = w * loc_decode[..., 2:] # lr + loc_decode = torch.cat([th, tw], dim=-1) + # Cannot be exported using onnx when loc_decode.split(1, dim=-1) + top, bottom, left, right = loc_decode.split((1, 1, 1, 1), dim=-1) + xmin = prior_centers[..., 0].unsqueeze(-1) - left + xmax = prior_centers[..., 0].unsqueeze(-1) + right + ymin = prior_centers[..., 1].unsqueeze(-1) - top + ymax = prior_centers[..., 1].unsqueeze(-1) + bottom + + bboxes = torch.cat((xmin, ymin, xmax, ymax), dim=-1) + + if clip_border and max_shape is not None: + # clip bboxes with dynamic `min` and `max` for onnx + if torch.onnx.is_in_onnx_export(): + from mmdet.core.export import dynamic_clip_for_onnx + xmin, ymin, xmax, ymax = dynamic_clip_for_onnx( + xmin, ymin, xmax, ymax, max_shape) + bboxes = torch.cat([xmin, ymin, xmax, ymax], dim=-1) + return bboxes + if not isinstance(max_shape, torch.Tensor): + max_shape = priors.new_tensor(max_shape) + max_shape = max_shape[..., :2].type_as(priors) + if max_shape.ndim == 2: + assert bboxes.ndim == 3 + assert max_shape.size(0) == bboxes.size(0) + + min_xy = priors.new_tensor(0) + max_xy = torch.cat([max_shape, max_shape], + dim=-1).flip(-1).unsqueeze(-2) + bboxes = torch.where(bboxes < min_xy, min_xy, bboxes) + bboxes = torch.where(bboxes > max_xy, max_xy, bboxes) + + return bboxes diff --git a/mmdetection/mmdet/models/task_modules/coders/yolo_bbox_coder.py b/mmdetection/mmdet/models/task_modules/coders/yolo_bbox_coder.py new file mode 100644 index 0000000..2e1c766 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/coders/yolo_bbox_coder.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class YOLOBBoxCoder(BaseBBoxCoder): + """YOLO BBox coder. + + Following `YOLO `_, this coder divide + image into grids, and encode bbox (x1, y1, x2, y2) into (cx, cy, dw, dh). 
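As a round-trip check of the TBLR transform implemented by bboxes2tblr and tblr2bboxes above, a compact standalone sketch (hypothetical helpers; default normalizer=4.0 and normalize_by_wh=True are assumed, clipping omitted):

import torch


def tblr_encode(priors, gts, normalizer=4.0):
    centers = (priors[:, :2] + priors[:, 2:]) / 2
    w, h = (priors[:, 2:] - priors[:, :2]).split(1, dim=1)
    # top/bottom are normalized by the prior height, left/right by its width.
    tb = torch.cat([centers[:, 1:2] - gts[:, 1:2],
                    gts[:, 3:4] - centers[:, 1:2]], dim=1) / h
    lr = torch.cat([centers[:, 0:1] - gts[:, 0:1],
                    gts[:, 2:3] - centers[:, 0:1]], dim=1) / w
    return torch.cat([tb, lr], dim=1) / normalizer


def tblr_decode(priors, tblr, normalizer=4.0):
    centers = (priors[:, :2] + priors[:, 2:]) / 2
    w, h = (priors[:, 2:] - priors[:, :2]).split(1, dim=1)
    top, bottom, left, right = (tblr * normalizer).split(1, dim=1)
    return torch.cat([centers[:, 0:1] - left * w, centers[:, 1:2] - top * h,
                      centers[:, 0:1] + right * w, centers[:, 1:2] + bottom * h],
                     dim=1)


priors = torch.tensor([[0., 0., 8., 8.]])
gts = torch.tensor([[1., 2., 7., 6.]])
assert torch.allclose(tblr_decode(priors, tblr_encode(priors, gts)), gts)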
+ cx, cy in [0., 1.], denotes relative center position w.r.t the center of + bboxes. dw, dh are the same as :obj:`DeltaXYWHBBoxCoder`. + + Args: + eps (float): Min value of cx, cy when encoding. + """ + + def __init__(self, eps: float = 1e-6, **kwargs): + super().__init__(**kwargs) + self.eps = eps + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes], + stride: Union[Tensor, int]) -> Tensor: + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes, + e.g., anchors. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the + transformation, e.g., ground-truth boxes. + stride (torch.Tensor | int): Stride of bboxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + x_center_gt = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) * 0.5 + y_center_gt = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) * 0.5 + w_gt = gt_bboxes[..., 2] - gt_bboxes[..., 0] + h_gt = gt_bboxes[..., 3] - gt_bboxes[..., 1] + x_center = (bboxes[..., 0] + bboxes[..., 2]) * 0.5 + y_center = (bboxes[..., 1] + bboxes[..., 3]) * 0.5 + w = bboxes[..., 2] - bboxes[..., 0] + h = bboxes[..., 3] - bboxes[..., 1] + w_target = torch.log((w_gt / w).clamp(min=self.eps)) + h_target = torch.log((h_gt / h).clamp(min=self.eps)) + x_center_target = ((x_center_gt - x_center) / stride + 0.5).clamp( + self.eps, 1 - self.eps) + y_center_target = ((y_center_gt - y_center) / stride + 0.5).clamp( + self.eps, 1 - self.eps) + encoded_bboxes = torch.stack( + [x_center_target, y_center_target, w_target, h_target], dim=-1) + return encoded_bboxes + + def decode(self, bboxes: Union[Tensor, BaseBoxes], pred_bboxes: Tensor, + stride: Union[Tensor, int]) -> Union[Tensor, BaseBoxes]: + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + boxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes, + e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. + """ + bboxes = get_box_tensor(bboxes) + assert pred_bboxes.size(-1) == bboxes.size(-1) == 4 + xy_centers = (bboxes[..., :2] + bboxes[..., 2:]) * 0.5 + ( + pred_bboxes[..., :2] - 0.5) * stride + whs = (bboxes[..., 2:] - + bboxes[..., :2]) * 0.5 * pred_bboxes[..., 2:].exp() + decoded_bboxes = torch.stack( + (xy_centers[..., 0] - whs[..., 0], xy_centers[..., 1] - + whs[..., 1], xy_centers[..., 0] + whs[..., 0], + xy_centers[..., 1] + whs[..., 1]), + dim=-1) + + if self.use_box_type: + decoded_bboxes = HorizontalBoxes(decoded_bboxes) + return decoded_bboxes diff --git a/mmdetection/mmdet/models/task_modules/prior_generators/__init__.py b/mmdetection/mmdet/models/task_modules/prior_generators/__init__.py new file mode 100644 index 0000000..7795e98 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/prior_generators/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
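A standalone sketch of the encode/decode pair implemented by YOLOBBoxCoder above (hypothetical helpers). The round trip is exact as long as the gt center lies within half a stride of the anchor center, so the clamp to [eps, 1 - eps] stays inactive:

import torch


def yolo_encode(anchors, gts, stride, eps=1e-6):
    ax = (anchors[:, 0] + anchors[:, 2]) * 0.5
    ay = (anchors[:, 1] + anchors[:, 3]) * 0.5
    aw = anchors[:, 2] - anchors[:, 0]
    ah = anchors[:, 3] - anchors[:, 1]
    gx = (gts[:, 0] + gts[:, 2]) * 0.5
    gy = (gts[:, 1] + gts[:, 3]) * 0.5
    gw = gts[:, 2] - gts[:, 0]
    gh = gts[:, 3] - gts[:, 1]
    # Center offset is expressed relative to the grid cell (stride), in [0, 1].
    tx = ((gx - ax) / stride + 0.5).clamp(eps, 1 - eps)
    ty = ((gy - ay) / stride + 0.5).clamp(eps, 1 - eps)
    tw = torch.log((gw / aw).clamp(min=eps))
    th = torch.log((gh / ah).clamp(min=eps))
    return torch.stack([tx, ty, tw, th], dim=-1)


def yolo_decode(anchors, preds, stride):
    centers = (anchors[:, :2] + anchors[:, 2:]) * 0.5 + \
        (preds[:, :2] - 0.5) * stride
    half_wh = (anchors[:, 2:] - anchors[:, :2]) * 0.5 * preds[:, 2:].exp()
    return torch.cat([centers - half_wh, centers + half_wh], dim=-1)


anchors = torch.tensor([[0., 0., 32., 32.]])
gts = torch.tensor([[4., 6., 30., 26.]])
t = yolo_encode(anchors, gts, stride=32)
assert torch.allclose(yolo_decode(anchors, t, stride=32), gts)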
+from .anchor_generator import (AnchorGenerator, LegacyAnchorGenerator, + SSDAnchorGenerator, YOLOAnchorGenerator) +from .point_generator import MlvlPointGenerator, PointGenerator +from .utils import anchor_inside_flags, calc_region + +__all__ = [ + 'AnchorGenerator', 'LegacyAnchorGenerator', 'anchor_inside_flags', + 'PointGenerator', 'calc_region', 'YOLOAnchorGenerator', + 'MlvlPointGenerator', 'SSDAnchorGenerator' +] diff --git a/mmdetection/mmdet/models/task_modules/prior_generators/anchor_generator.py b/mmdetection/mmdet/models/task_modules/prior_generators/anchor_generator.py new file mode 100644 index 0000000..2757697 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/prior_generators/anchor_generator.py @@ -0,0 +1,848 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from mmengine.utils import is_tuple_of +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import HorizontalBoxes + +DeviceType = Union[str, torch.device] + + +@TASK_UTILS.register_module() +class AnchorGenerator: + """Standard anchor generator for 2D anchor-based detectors. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int], Optional): Anchor scales for anchors + in a single level. It cannot be set at the same time + if `octave_base_scale` and `scales_per_octave` are set. + base_sizes (list[int], Optional): The basic sizes + of anchors in multiple levels. + If None is given, strides will be used as base_sizes. + (If strides are non square, the shortest stride is taken.) + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int, Optional): The base scale of octave. + scales_per_octave (int, Optional): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float]], Optional): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. If a list of tuple of + float is given, they will be used to shift the centers of anchors. + center_offset (float): The offset of center in proportion to anchors' + width and height. By default it is 0 in V2.0. + use_box_type (bool): Whether to warp anchors with the box type data + structure. Defaults to False. + + Examples: + >>> from mmdet.models.task_modules. + ... 
prior_generators import AnchorGenerator + >>> self = AnchorGenerator([16], [1.], [1.], [9]) + >>> all_anchors = self.grid_priors([(2, 2)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]])] + >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18]) + >>> all_anchors = self.grid_priors([(2, 2), (1, 1)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]]), \ + tensor([[-9., -9., 9., 9.]])] + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + ratios: List[float], + scales: Optional[List[int]] = None, + base_sizes: Optional[List[int]] = None, + scale_major: bool = True, + octave_base_scale: Optional[int] = None, + scales_per_octave: Optional[int] = None, + centers: Optional[List[Tuple[float, float]]] = None, + center_offset: float = 0., + use_box_type: bool = False) -> None: + # check center and center_offset + if center_offset != 0: + assert centers is None, 'center cannot be set when center_offset' \ + f'!=0, {centers} is given.' + if not (0 <= center_offset <= 1): + raise ValueError('center_offset should be in range [0, 1], ' + f'{center_offset} is given.') + if centers is not None: + assert len(centers) == len(strides), \ + 'The number of strides should be the same as centers, got ' \ + f'{strides} and {centers}' + + # calculate base sizes of anchors + self.strides = [_pair(stride) for stride in strides] + self.base_sizes = [min(stride) for stride in self.strides + ] if base_sizes is None else base_sizes + assert len(self.base_sizes) == len(self.strides), \ + 'The number of strides should be the same as base sizes, got ' \ + f'{self.strides} and {self.base_sizes}' + + # calculate scales of anchors + assert ((octave_base_scale is not None + and scales_per_octave is not None) ^ (scales is not None)), \ + 'scales and octave_base_scale with scales_per_octave cannot' \ + ' be set at the same time' + if scales is not None: + self.scales = torch.Tensor(scales) + elif octave_base_scale is not None and scales_per_octave is not None: + octave_scales = np.array( + [2**(i / scales_per_octave) for i in range(scales_per_octave)]) + scales = octave_scales * octave_base_scale + self.scales = torch.Tensor(scales) + else: + raise ValueError('Either scales or octave_base_scale with ' + 'scales_per_octave should be set') + + self.octave_base_scale = octave_base_scale + self.scales_per_octave = scales_per_octave + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.centers = centers + self.center_offset = center_offset + self.base_anchors = self.gen_base_anchors() + self.use_box_type = use_box_type + + @property + def num_base_anchors(self) -> List[int]: + """list[int]: total number of base anchors in a feature grid""" + return self.num_base_priors + + @property + def num_base_priors(self) -> List[int]: + """list[int]: The number of priors (anchors) at a point + on the feature grid""" + return [base_anchors.size(0) for base_anchors in self.base_anchors] + + @property + def num_levels(self) -> int: + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + def gen_base_anchors(self) -> List[Tensor]: + """Generate base anchors. 
+ + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors( + base_size, + scales=self.scales, + ratios=self.ratios, + center=center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, + base_size: Union[int, float], + scales: Tensor, + ratios: Tensor, + center: Optional[Tuple[float]] = None) \ + -> Tensor: + """Generate base anchors of a single level. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between the height + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * w + y_center = self.center_offset * h + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * ws, y_center - 0.5 * hs, x_center + 0.5 * ws, + y_center + 0.5 * hs + ] + base_anchors = torch.stack(base_anchors, dim=-1) + + return base_anchors + + def _meshgrid(self, + x: Tensor, + y: Tensor, + row_major: bool = True) -> Tuple[Tensor]: + """Generate mesh grid of x and y. + + Args: + x (torch.Tensor): Grids of x dimension. + y (torch.Tensor): Grids of y dimension. + row_major (bool): Whether to return y grids first. + Defaults to True. + + Returns: + tuple[torch.Tensor]: The mesh grids of x and y. + """ + # use shape instead of len to keep tracing while exporting to onnx + xx = x.repeat(y.shape[0]) + yy = y.view(-1, 1).repeat(1, x.shape[0]).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_priors(self, + featmap_sizes: List[Tuple], + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + dtype (:obj:`torch.dtype`): Dtype of priors. + Defaults to torch.float32. + device (str | torch.device): The device where the anchors + will be put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. \ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. 
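A standalone sketch of the scale/ratio arithmetic in gen_single_level_base_anchors above (hypothetical helper; center_offset assumed 0). The ratio sets h/w while the scale sets the area, which stays at (base_size * scale)**2:

import torch


def base_anchors_sketch(base_size, scales, ratios, center=(0., 0.)):
    # ratio = h / w, so h is scaled by sqrt(ratio) and w by 1 / sqrt(ratio).
    h_ratios = torch.sqrt(ratios)
    w_ratios = 1 / h_ratios
    ws = (base_size * w_ratios[:, None] * scales[None, :]).view(-1)
    hs = (base_size * h_ratios[:, None] * scales[None, :]).view(-1)
    cx, cy = center
    return torch.stack(
        [cx - 0.5 * ws, cy - 0.5 * hs, cx + 0.5 * ws, cy + 0.5 * hs], dim=-1)


anchors = base_anchors_sketch(16, torch.tensor([8.]),
                              torch.tensor([0.5, 1.0, 2.0]))
hw_ratio = (anchors[:, 3] - anchors[:, 1]) / (anchors[:, 2] - anchors[:, 0])
print(anchors.shape)  # torch.Size([3, 4])
print(hw_ratio)       # approximately [0.5, 1.0, 2.0]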
+ """ + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_priors( + featmap_sizes[i], level_idx=i, dtype=dtype, device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_priors(self, + featmap_size: Tuple[int, int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int, int]): Size of the feature maps. + level_idx (int): The index of corresponding feature map level. + dtype (obj:`torch.dtype`): Date type of points.Defaults to + ``torch.float32``. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. + """ + + base_anchors = self.base_anchors[level_idx].to(device).to(dtype) + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + # First create Range with the default dtype, than convert to + # target `dtype` for onnx exporting. + shift_x = torch.arange(0, feat_w, device=device).to(dtype) * stride_w + shift_y = torch.arange(0, feat_h, device=device).to(dtype) * stride_h + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + if self.use_box_type: + all_anchors = HorizontalBoxes(all_anchors) + return all_anchors + + def sparse_priors(self, + prior_idxs: Tensor, + featmap_size: Tuple[int, int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate sparse anchors according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int, int]): feature map size arrange as (h, w). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points.Defaults to + ``torch.float32``. + device (str | torch.device): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 4), N should be equal to + the length of ``prior_idxs``. + """ + + height, width = featmap_size + num_base_anchors = self.num_base_anchors[level_idx] + base_anchor_id = prior_idxs % num_base_anchors + x = (prior_idxs // + num_base_anchors) % width * self.strides[level_idx][0] + y = (prior_idxs // width // + num_base_anchors) % height * self.strides[level_idx][1] + priors = torch.stack([x, y, x, y], 1).to(dtype).to(device) + \ + self.base_anchors[level_idx][base_anchor_id, :].to(device) + + return priors + + def grid_anchors(self, + featmap_sizes: List[Tuple], + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + device (str | torch.device): Device where the anchors will be + put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. 
\ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. + """ + warnings.warn('``grid_anchors`` would be deprecated soon. ' + 'Please use ``grid_priors`` ') + + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_anchors( + self.base_anchors[i].to(device), + featmap_sizes[i], + self.strides[i], + device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_anchors(self, + base_anchors: Tensor, + featmap_size: Tuple[int, int], + stride: Tuple[int, int] = (16, 16), + device: DeviceType = 'cuda') -> Tensor: + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_anchors``. + + Args: + base_anchors (torch.Tensor): The base anchors of a feature grid. + featmap_size (tuple[int]): Size of the feature maps. + stride (tuple[int, int]): Stride of the feature map in order + (w, h). Defaults to (16, 16). + device (str | torch.device): Device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. + """ + + warnings.warn( + '``single_level_grid_anchors`` would be deprecated soon. ' + 'Please use ``single_level_grid_priors`` ') + + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + feat_h, feat_w = featmap_size + shift_x = torch.arange(0, feat_w, device=device) * stride[0] + shift_y = torch.arange(0, feat_h, device=device) * stride[1] + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + shifts = shifts.type_as(base_anchors) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def valid_flags(self, + featmap_sizes: List[Tuple[int, int]], + pad_shape: Tuple, + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate valid flags of anchors in multiple feature levels. + + Args: + featmap_sizes (list(tuple[int, int])): List of feature map sizes in + multiple feature levels. + pad_shape (tuple): The padded shape of the image. + device (str | torch.device): Device where the anchors will be + put on. + + Return: + list(torch.Tensor): Valid flags of anchors in multiple levels. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + anchor_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / anchor_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / anchor_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + self.num_base_anchors[i], + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size: Tuple[int, int], + valid_size: Tuple[int, int], + num_base_anchors: int, + device: DeviceType = 'cuda') -> Tensor: + """Generate the valid flags of anchor in a single feature map. 
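A small worked example of the valid_flags bookkeeping above (numbers are illustrative): only the part of the feature map that corresponds to the un-padded image is marked valid.

import numpy as np

# A stride-32 level with a 25x38 feature map, but the real image only fills
# 800x1100 of the 800x1216 padded canvas.
feat_h, feat_w = 25, 38
pad_h, pad_w = 800, 1100
valid_feat_h = min(int(np.ceil(pad_h / 32)), feat_h)  # 25 (all rows valid)
valid_feat_w = min(int(np.ceil(pad_w / 32)), feat_w)  # 35 of 38 columns valid
print(valid_feat_h, valid_feat_w)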
+ + Args: + featmap_size (tuple[int]): The size of feature maps, arrange + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + num_base_anchors (int): The number of base anchors. + device (str | torch.device): Device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each anchor in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid[:, None].expand(valid.size(0), + num_base_anchors).contiguous().view(-1) + return valid + + def __repr__(self) -> str: + """str: a string that describes the module""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}strides={self.strides},\n' + repr_str += f'{indent_str}ratios={self.ratios},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}base_sizes={self.base_sizes},\n' + repr_str += f'{indent_str}scale_major={self.scale_major},\n' + repr_str += f'{indent_str}octave_base_scale=' + repr_str += f'{self.octave_base_scale},\n' + repr_str += f'{indent_str}scales_per_octave=' + repr_str += f'{self.scales_per_octave},\n' + repr_str += f'{indent_str}num_levels={self.num_levels}\n' + repr_str += f'{indent_str}centers={self.centers},\n' + repr_str += f'{indent_str}center_offset={self.center_offset})' + return repr_str + + +@TASK_UTILS.register_module() +class SSDAnchorGenerator(AnchorGenerator): + """Anchor generator for SSD. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + min_sizes (list[float]): The list of minimum anchor sizes on each + level. + max_sizes (list[float]): The list of maximum anchor sizes on each + level. + basesize_ratio_range (tuple(float)): Ratio range of anchors. Being + used when not setting min_sizes and max_sizes. + input_size (int): Size of feature map, 300 for SSD300, 512 for + SSD512. Being used when not setting min_sizes and max_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. It is always set to be False in SSD. + use_box_type (bool): Whether to warp anchors with the box type data + structure. Defaults to False. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + ratios: List[float], + min_sizes: Optional[List[float]] = None, + max_sizes: Optional[List[float]] = None, + basesize_ratio_range: Tuple[float] = (0.15, 0.9), + input_size: int = 300, + scale_major: bool = True, + use_box_type: bool = False) -> None: + assert len(strides) == len(ratios) + assert not (min_sizes is None) ^ (max_sizes is None) + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2., stride[1] / 2.) 
+ for stride in self.strides] + + if min_sizes is None and max_sizes is None: + # use hard code to generate SSD anchors + self.input_size = input_size + assert is_tuple_of(basesize_ratio_range, float) + self.basesize_ratio_range = basesize_ratio_range + # calculate anchor ratios and sizes + min_ratio, max_ratio = basesize_ratio_range + min_ratio = int(min_ratio * 100) + max_ratio = int(max_ratio * 100) + step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2)) + min_sizes = [] + max_sizes = [] + for ratio in range(int(min_ratio), int(max_ratio) + 1, step): + min_sizes.append(int(self.input_size * ratio / 100)) + max_sizes.append(int(self.input_size * (ratio + step) / 100)) + if self.input_size == 300: + if basesize_ratio_range[0] == 0.15: # SSD300 COCO + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + elif basesize_ratio_range[0] == 0.2: # SSD300 VOC + min_sizes.insert(0, int(self.input_size * 10 / 100)) + max_sizes.insert(0, int(self.input_size * 20 / 100)) + else: + raise ValueError( + 'basesize_ratio_range[0] should be either 0.15' + 'or 0.2 when input_size is 300, got ' + f'{basesize_ratio_range[0]}.') + elif self.input_size == 512: + if basesize_ratio_range[0] == 0.1: # SSD512 COCO + min_sizes.insert(0, int(self.input_size * 4 / 100)) + max_sizes.insert(0, int(self.input_size * 10 / 100)) + elif basesize_ratio_range[0] == 0.15: # SSD512 VOC + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + else: + raise ValueError( + 'When not setting min_sizes and max_sizes,' + 'basesize_ratio_range[0] should be either 0.1' + 'or 0.15 when input_size is 512, got' + f' {basesize_ratio_range[0]}.') + else: + raise ValueError( + 'Only support 300 or 512 in SSDAnchorGenerator when ' + 'not setting min_sizes and max_sizes, ' + f'got {self.input_size}.') + + assert len(min_sizes) == len(max_sizes) == len(strides) + + anchor_ratios = [] + anchor_scales = [] + for k in range(len(self.strides)): + scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])] + anchor_ratio = [1.] + for r in ratios[k]: + anchor_ratio += [1 / r, r] # 4 or 6 ratio + anchor_ratios.append(torch.Tensor(anchor_ratio)) + anchor_scales.append(torch.Tensor(scales)) + + self.base_sizes = min_sizes + self.scales = anchor_scales + self.ratios = anchor_ratios + self.scale_major = scale_major + self.center_offset = 0 + self.base_anchors = self.gen_base_anchors() + self.use_box_type = use_box_type + + def gen_base_anchors(self) -> List[Tensor]: + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
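To make the hard-coded SSD300/COCO branch above concrete (basesize_ratio_range=(0.15, 0.9), six levels), re-deriving the anchor sizes with the same arithmetic gives the familiar values:

import numpy as np

input_size, num_levels = 300, 6
min_ratio, max_ratio = 15, 90
step = int(np.floor(max_ratio - min_ratio) / (num_levels - 2))  # 18
min_sizes = [int(input_size * r / 100)
             for r in range(min_ratio, max_ratio + 1, step)]
max_sizes = [int(input_size * (r + step) / 100)
             for r in range(min_ratio, max_ratio + 1, step)]
# The extra smallest level is inserted by the 0.15-branch above.
min_sizes.insert(0, int(input_size * 7 / 100))
max_sizes.insert(0, int(input_size * 15 / 100))
print(min_sizes)  # [21, 45, 99, 153, 207, 261]
print(max_sizes)  # [45, 99, 153, 207, 261, 315]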
+ """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + base_anchors = self.gen_single_level_base_anchors( + base_size, + scales=self.scales[i], + ratios=self.ratios[i], + center=self.centers[i]) + indices = list(range(len(self.ratios[i]))) + indices.insert(1, len(indices)) + base_anchors = torch.index_select(base_anchors, 0, + torch.LongTensor(indices)) + multi_level_base_anchors.append(base_anchors) + return multi_level_base_anchors + + def __repr__(self) -> str: + """str: a string that describes the module""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}strides={self.strides},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}scale_major={self.scale_major},\n' + repr_str += f'{indent_str}input_size={self.input_size},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}ratios={self.ratios},\n' + repr_str += f'{indent_str}num_levels={self.num_levels},\n' + repr_str += f'{indent_str}base_sizes={self.base_sizes},\n' + repr_str += f'{indent_str}basesize_ratio_range=' + repr_str += f'{self.basesize_ratio_range})' + return repr_str + + +@TASK_UTILS.register_module() +class LegacyAnchorGenerator(AnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + Note: + Difference to the V2.0 anchor generator: + + 1. The center offset of V1.x anchors are set to be 0.5 rather than 0. + 2. The width/height are minused by 1 when calculating the anchors' \ + centers and corners to meet the V1.x coordinate system. + 3. The anchors' corners are quantized. + + Args: + strides (list[int] | list[tuple[int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int] | None): Anchor scales for anchors in a single level. + It cannot be set at the same time if `octave_base_scale` and + `scales_per_octave` are set. + base_sizes (list[int]): The basic sizes of anchors in multiple levels. + If None is given, strides will be used to generate base_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int): The base scale of octave. + scales_per_octave (int): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float, float]] | None): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. It a list of float + is given, this list will be used to shift the centers of anchors. + center_offset (float): The offset of center in proportion to anchors' + width and height. By default it is 0.5 in V2.0 but it should be 0.5 + in v1.x models. + use_box_type (bool): Whether to warp anchors with the box type data + structure. Defaults to False. + + Examples: + >>> from mmdet.models.task_modules. + ... 
prior_generators import LegacyAnchorGenerator + >>> self = LegacyAnchorGenerator( + >>> [16], [1.], [1.], [9], center_offset=0.5) + >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu') + >>> print(all_anchors) + [tensor([[ 0., 0., 8., 8.], + [16., 0., 24., 8.], + [ 0., 16., 8., 24.], + [16., 16., 24., 24.]])] + """ + + def gen_single_level_base_anchors(self, + base_size: Union[int, float], + scales: Tensor, + ratios: Tensor, + center: Optional[Tuple[float]] = None) \ + -> Tensor: + """Generate base anchors of a single level. + + Note: + The width/height of anchors are minused by 1 when calculating \ + the centers and corners to meet the V1.x coordinate system. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between the height. + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature map. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * (w - 1) + y_center = self.center_offset * (h - 1) + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * (ws - 1), y_center - 0.5 * (hs - 1), + x_center + 0.5 * (ws - 1), y_center + 0.5 * (hs - 1) + ] + base_anchors = torch.stack(base_anchors, dim=-1).round() + + return base_anchors + + +@TASK_UTILS.register_module() +class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator` + can be found in `LegacyAnchorGenerator`. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + ratios: List[float], + basesize_ratio_range: Tuple[float], + input_size: int = 300, + scale_major: bool = True, + use_box_type: bool = False) -> None: + super(LegacySSDAnchorGenerator, self).__init__( + strides=strides, + ratios=ratios, + basesize_ratio_range=basesize_ratio_range, + input_size=input_size, + scale_major=scale_major, + use_box_type=use_box_type) + self.centers = [((stride - 1) / 2., (stride - 1) / 2.) + for stride in strides] + self.base_anchors = self.gen_base_anchors() + + +@TASK_UTILS.register_module() +class YOLOAnchorGenerator(AnchorGenerator): + """Anchor generator for YOLO. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + base_sizes (list[list[tuple[int, int]]]): The basic sizes + of anchors in multiple levels. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + base_sizes: List[List[Tuple[int, int]]], + use_box_type: bool = False) -> None: + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2., stride[1] / 2.) 
+ for stride in self.strides] + self.base_sizes = [] + num_anchor_per_level = len(base_sizes[0]) + for base_sizes_per_level in base_sizes: + assert num_anchor_per_level == len(base_sizes_per_level) + self.base_sizes.append( + [_pair(base_size) for base_size in base_sizes_per_level]) + self.base_anchors = self.gen_base_anchors() + self.use_box_type = use_box_type + + @property + def num_levels(self) -> int: + """int: number of feature levels that the generator will be applied""" + return len(self.base_sizes) + + def gen_base_anchors(self) -> List[Tensor]: + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_sizes_per_level in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors(base_sizes_per_level, + center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, + base_sizes_per_level: List[Tuple[int]], + center: Optional[Tuple[float]] = None) \ + -> Tensor: + """Generate base anchors of a single level. + + Args: + base_sizes_per_level (list[tuple[int]]): Basic sizes of + anchors. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + x_center, y_center = center + base_anchors = [] + for base_size in base_sizes_per_level: + w, h = base_size + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchor = torch.Tensor([ + x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w, + y_center + 0.5 * h + ]) + base_anchors.append(base_anchor) + base_anchors = torch.stack(base_anchors, dim=0) + + return base_anchors diff --git a/mmdetection/mmdet/models/task_modules/prior_generators/point_generator.py b/mmdetection/mmdet/models/task_modules/prior_generators/point_generator.py new file mode 100644 index 0000000..c87ad65 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/prior_generators/point_generator.py @@ -0,0 +1,321 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import numpy as np +import torch +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.registry import TASK_UTILS + +DeviceType = Union[str, torch.device] + + +@TASK_UTILS.register_module() +class PointGenerator: + + def _meshgrid(self, + x: Tensor, + y: Tensor, + row_major: bool = True) -> Tuple[Tensor, Tensor]: + """Generate mesh grid of x and y. + + Args: + x (torch.Tensor): Grids of x dimension. + y (torch.Tensor): Grids of y dimension. + row_major (bool): Whether to return y grids first. + Defaults to True. + + Returns: + tuple[torch.Tensor]: The mesh grids of x and y. + """ + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_points(self, + featmap_size: Tuple[int, int], + stride=16, + device: DeviceType = 'cuda') -> Tensor: + """Generate grid points of a single level. + + Args: + featmap_size (tuple[int, int]): Size of the feature maps. + stride (int): The stride of corresponding feature map. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: grid point in a feature map. 
+ """ + feat_h, feat_w = featmap_size + shift_x = torch.arange(0., feat_w, device=device) * stride + shift_y = torch.arange(0., feat_h, device=device) * stride + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + stride = shift_x.new_full((shift_xx.shape[0], ), stride) + shifts = torch.stack([shift_xx, shift_yy, stride], dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, + featmap_size: Tuple[int, int], + valid_size: Tuple[int, int], + device: DeviceType = 'cuda') -> Tensor: + """Generate valid flags of anchors in a feature map. + + Args: + featmap_sizes (list(tuple[int, int])): List of feature map sizes in + multiple feature levels. + valid_shape (tuple[int, int]): The valid shape of the image. + device (str | torch.device): Device where the anchors will be + put on. + + Return: + torch.Tensor: Valid flags of anchors in a level. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + +@TASK_UTILS.register_module() +class MlvlPointGenerator: + """Standard points generator for multi-level (Mlvl) feature maps in 2D + points-based detectors. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + offset (float): The offset of points, the value is normalized with + corresponding stride. Defaults to 0.5. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + offset: float = 0.5) -> None: + self.strides = [_pair(stride) for stride in strides] + self.offset = offset + + @property + def num_levels(self) -> int: + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + @property + def num_base_priors(self) -> List[int]: + """list[int]: The number of priors (points) at a point + on the feature grid""" + return [1 for _ in range(len(self.strides))] + + def _meshgrid(self, + x: Tensor, + y: Tensor, + row_major: bool = True) -> Tuple[Tensor, Tensor]: + yy, xx = torch.meshgrid(y, x) + if row_major: + # warning .flatten() would cause error in ONNX exporting + # have to use reshape here + return xx.reshape(-1), yy.reshape(-1) + + else: + return yy.reshape(-1), xx.reshape(-1) + + def grid_priors(self, + featmap_sizes: List[Tuple], + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda', + with_stride: bool = False) -> List[Tensor]: + """Generate grid points of multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32. + device (str | torch.device): The device where the anchors will be + put on. + with_stride (bool): Whether to concatenate the stride to + the last dimension of points. + + Return: + list[torch.Tensor]: Points of multiple feature levels. + The sizes of each tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). 
+ """ + + assert self.num_levels == len(featmap_sizes) + multi_level_priors = [] + for i in range(self.num_levels): + priors = self.single_level_grid_priors( + featmap_sizes[i], + level_idx=i, + dtype=dtype, + device=device, + with_stride=with_stride) + multi_level_priors.append(priors) + return multi_level_priors + + def single_level_grid_priors(self, + featmap_size: Tuple[int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda', + with_stride: bool = False) -> Tensor: + """Generate grid Points of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps, arrange as + (h, w). + level_idx (int): The index of corresponding feature map level. + dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. + with_stride (bool): Concatenate the stride to the last dimension + of points. + + Return: + Tensor: Points of single feature levels. + The shape of tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = (torch.arange(0, feat_w, device=device) + + self.offset) * stride_w + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_x = shift_x.to(dtype) + + shift_y = (torch.arange(0, feat_h, device=device) + + self.offset) * stride_h + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_y = shift_y.to(dtype) + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + if not with_stride: + shifts = torch.stack([shift_xx, shift_yy], dim=-1) + else: + # use `shape[0]` instead of `len(shift_xx)` for ONNX export + stride_w = shift_xx.new_full((shift_xx.shape[0], ), + stride_w).to(dtype) + stride_h = shift_xx.new_full((shift_yy.shape[0], ), + stride_h).to(dtype) + shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h], + dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, + featmap_sizes: List[Tuple[int, int]], + pad_shape: Tuple[int], + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate valid flags of points of multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + pad_shape (tuple(int)): The padded shape of the image, + arrange as (h, w). + device (str | torch.device): The device where the anchors will be + put on. + + Return: + list(torch.Tensor): Valid flags of points of multiple levels. 
+ """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + point_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size: Tuple[int, int], + valid_size: Tuple[int, int], + device: DeviceType = 'cuda') -> Tensor: + """Generate the valid flags of points of a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange as + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + The size arrange as as (h, w). + device (str | torch.device): The device where the flags will be + put on. Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each points in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + def sparse_priors(self, + prior_idxs: Tensor, + featmap_size: Tuple[int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate sparse points according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int]): feature map size arrange as (w, h). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points. Defaults to + ``torch.float32``. + device (str | torch.device): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 2), N should be equal to + the length of ``prior_idxs``. And last dimension + 2 represent (coord_x, coord_y). + """ + height, width = featmap_size + x = (prior_idxs % width + self.offset) * self.strides[level_idx][0] + y = ((prior_idxs // width) % height + + self.offset) * self.strides[level_idx][1] + prioris = torch.stack([x, y], 1).to(dtype) + prioris = prioris.to(device) + return prioris diff --git a/mmdetection/mmdet/models/task_modules/prior_generators/utils.py b/mmdetection/mmdet/models/task_modules/prior_generators/utils.py new file mode 100644 index 0000000..3aa2dfd --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/prior_generators/utils.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.structures.bbox import BaseBoxes + + +def anchor_inside_flags(flat_anchors: Tensor, + valid_flags: Tensor, + img_shape: Tuple[int], + allowed_border: int = 0) -> Tensor: + """Check whether the anchors are inside the border. + + Args: + flat_anchors (torch.Tensor): Flatten anchors, shape (n, 4). + valid_flags (torch.Tensor): An existing valid flags of anchors. + img_shape (tuple(int)): Shape of current image. + allowed_border (int): The border to allow the valid anchor. + Defaults to 0. + + Returns: + torch.Tensor: Flags indicating whether the anchors are inside a \ + valid range. 
+ """ + img_h, img_w = img_shape[:2] + if allowed_border >= 0: + if isinstance(flat_anchors, BaseBoxes): + inside_flags = valid_flags & \ + flat_anchors.is_inside([img_h, img_w], + all_inside=True, + allowed_border=allowed_border) + else: + inside_flags = valid_flags & \ + (flat_anchors[:, 0] >= -allowed_border) & \ + (flat_anchors[:, 1] >= -allowed_border) & \ + (flat_anchors[:, 2] < img_w + allowed_border) & \ + (flat_anchors[:, 3] < img_h + allowed_border) + else: + inside_flags = valid_flags + return inside_flags + + +def calc_region(bbox: Tensor, + ratio: float, + featmap_size: Optional[Tuple] = None) -> Tuple[int]: + """Calculate a proportional bbox region. + + The bbox center are fixed and the new h' and w' is h * ratio and w * ratio. + + Args: + bbox (Tensor): Bboxes to calculate regions, shape (n, 4). + ratio (float): Ratio of the output region. + featmap_size (tuple, Optional): Feature map size in (height, width) + order used for clipping the boundary. Defaults to None. + + Returns: + tuple: x1, y1, x2, y2 + """ + x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long() + y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long() + x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long() + y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long() + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1]) + y1 = y1.clamp(min=0, max=featmap_size[0]) + x2 = x2.clamp(min=0, max=featmap_size[1]) + y2 = y2.clamp(min=0, max=featmap_size[0]) + return (x1, y1, x2, y2) diff --git a/mmdetection/mmdet/models/task_modules/samplers/__init__.py b/mmdetection/mmdet/models/task_modules/samplers/__init__.py new file mode 100644 index 0000000..3782eb8 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_sampler import BaseSampler +from .combined_sampler import CombinedSampler +from .instance_balanced_pos_sampler import InstanceBalancedPosSampler +from .iou_balanced_neg_sampler import IoUBalancedNegSampler +from .mask_pseudo_sampler import MaskPseudoSampler +from .mask_sampling_result import MaskSamplingResult +from .multi_instance_random_sampler import MultiInsRandomSampler +from .multi_instance_sampling_result import MultiInstanceSamplingResult +from .ohem_sampler import OHEMSampler +from .pseudo_sampler import PseudoSampler +from .random_sampler import RandomSampler +from .sampling_result import SamplingResult +from .score_hlr_sampler import ScoreHLRSampler + +__all__ = [ + 'BaseSampler', 'PseudoSampler', 'RandomSampler', + 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', + 'OHEMSampler', 'SamplingResult', 'ScoreHLRSampler', 'MaskPseudoSampler', + 'MaskSamplingResult', 'MultiInstanceSamplingResult', + 'MultiInsRandomSampler' +] diff --git a/mmdetection/mmdet/models/task_modules/samplers/base_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/base_sampler.py new file mode 100644 index 0000000..be8a9a5 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/base_sampler.py @@ -0,0 +1,136 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import torch +from mmengine.structures import InstanceData + +from mmdet.structures.bbox import BaseBoxes, cat_boxes +from ..assigners import AssignResult +from .sampling_result import SamplingResult + + +class BaseSampler(metaclass=ABCMeta): + """Base class of samplers. 
+ + Args: + num (int): Number of samples + pos_fraction (float): Fraction of positive samples + neg_pos_up (int): Upper bound number of negative and + positive samples. Defaults to -1. + add_gt_as_proposals (bool): Whether to add ground truth + boxes as proposals. Defaults to True. + """ + + def __init__(self, + num: int, + pos_fraction: float, + neg_pos_ub: int = -1, + add_gt_as_proposals: bool = True, + **kwargs) -> None: + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + self.pos_sampler = self + self.neg_sampler = self + + @abstractmethod + def _sample_pos(self, assign_result: AssignResult, num_expected: int, + **kwargs): + """Sample positive samples.""" + pass + + @abstractmethod + def _sample_neg(self, assign_result: AssignResult, num_expected: int, + **kwargs): + """Sample negative samples.""" + pass + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, **kwargs) -> SamplingResult: + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Assigning results. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + + Returns: + :obj:`SamplingResult`: Sampling result. + + Example: + >>> from mmengine.structures import InstanceData + >>> from mmdet.models.task_modules.samplers import RandomSampler, + >>> from mmdet.models.task_modules.assigners import AssignResult + >>> from mmdet.models.task_modules.samplers. + ... sampling_result import ensure_rng, random_boxes + >>> rng = ensure_rng(None) + >>> assign_result = AssignResult.random(rng=rng) + >>> pred_instances = InstanceData() + >>> pred_instances.priors = random_boxes(assign_result.num_preds, + ... rng=rng) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = random_boxes(assign_result.num_gts, + ... rng=rng) + >>> gt_instances.labels = torch.randint( + ... 0, 5, (assign_result.num_gts,), dtype=torch.long) + >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1, + >>> add_gt_as_proposals=False) + >>> self = self.sample(assign_result, pred_instances, gt_instances) + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + if len(priors.shape) < 2: + priors = priors[None, :] + + gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8) + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + # When `gt_bboxes` and `priors` are all box type, convert + # `gt_bboxes` type to `priors` type. 
+ if (isinstance(gt_bboxes, BaseBoxes) + and isinstance(priors, BaseBoxes)): + gt_bboxes_ = gt_bboxes.convert_to(type(priors)) + else: + gt_bboxes_ = gt_bboxes + priors = cat_boxes([gt_bboxes_, priors], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = priors.new_ones(gt_bboxes_.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=priors, **kwargs) + # We found that sampled indices have duplicated items occasionally. + # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg( + assign_result, num_expected_neg, bboxes=priors, **kwargs) + neg_inds = neg_inds.unique() + + sampling_result = SamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_bboxes=gt_bboxes, + assign_result=assign_result, + gt_flags=gt_flags) + return sampling_result diff --git a/mmdetection/mmdet/models/task_modules/samplers/combined_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/combined_sampler.py new file mode 100644 index 0000000..8e0560e --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/combined_sampler.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import TASK_UTILS +from .base_sampler import BaseSampler + + +@TASK_UTILS.register_module() +class CombinedSampler(BaseSampler): + """A sampler that combines positive sampler and negative sampler.""" + + def __init__(self, pos_sampler, neg_sampler, **kwargs): + super(CombinedSampler, self).__init__(**kwargs) + self.pos_sampler = TASK_UTILS.build(pos_sampler, default_args=kwargs) + self.neg_sampler = TASK_UTILS.build(neg_sampler, default_args=kwargs) + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError diff --git a/mmdetection/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py new file mode 100644 index 0000000..e48d8e9 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.registry import TASK_UTILS +from .random_sampler import RandomSampler + + +@TASK_UTILS.register_module() +class InstanceBalancedPosSampler(RandomSampler): + """Instance balanced sampler that samples equal number of positive samples + for each instance.""" + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Sample positive boxes. + + Args: + assign_result (:obj:`AssignResult`): The assigned results of boxes. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. 
+ """ + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + unique_gt_inds = assign_result.gt_inds[pos_inds].unique() + num_gts = len(unique_gt_inds) + num_per_gt = int(round(num_expected / float(num_gts)) + 1) + sampled_inds = [] + for i in unique_gt_inds: + inds = torch.nonzero( + assign_result.gt_inds == i.item(), as_tuple=False) + if inds.numel() != 0: + inds = inds.squeeze(1) + else: + continue + if len(inds) > num_per_gt: + inds = self.random_choice(inds, num_per_gt) + sampled_inds.append(inds) + sampled_inds = torch.cat(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array( + list(set(pos_inds.cpu()) - set(sampled_inds.cpu()))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + extra_inds = torch.from_numpy(extra_inds).to( + assign_result.gt_inds.device).long() + sampled_inds = torch.cat([sampled_inds, extra_inds]) + elif len(sampled_inds) > num_expected: + sampled_inds = self.random_choice(sampled_inds, num_expected) + return sampled_inds diff --git a/mmdetection/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py new file mode 100644 index 0000000..dc1f464 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.registry import TASK_UTILS +from .random_sampler import RandomSampler + + +@TASK_UTILS.register_module() +class IoUBalancedNegSampler(RandomSampler): + """IoU Balanced Sampling. + + arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019) + + Sampling proposals according to their IoU. `floor_fraction` of needed RoIs + are sampled from proposals whose IoU are lower than `floor_thr` randomly. + The others are sampled from proposals whose IoU are higher than + `floor_thr`. These proposals are sampled from some bins evenly, which are + split by `num_bins` via IoU evenly. + + Args: + num (int): number of proposals. + pos_fraction (float): fraction of positive proposals. + floor_thr (float): threshold (minimum) IoU for IoU balanced sampling, + set to -1 if all using IoU balanced sampling. + floor_fraction (float): sampling fraction of proposals under floor_thr. + num_bins (int): number of bins in IoU balanced sampling. + """ + + def __init__(self, + num, + pos_fraction, + floor_thr=-1, + floor_fraction=0, + num_bins=3, + **kwargs): + super(IoUBalancedNegSampler, self).__init__(num, pos_fraction, + **kwargs) + assert floor_thr >= 0 or floor_thr == -1 + assert 0 <= floor_fraction <= 1 + assert num_bins >= 1 + + self.floor_thr = floor_thr + self.floor_fraction = floor_fraction + self.num_bins = num_bins + + def sample_via_interval(self, max_overlaps, full_set, num_expected): + """Sample according to the iou interval. + + Args: + max_overlaps (torch.Tensor): IoU between bounding boxes and ground + truth boxes. 
+ full_set (set(int)): A full set of indices of boxes。 + num_expected (int): Number of expected samples。 + + Returns: + np.ndarray: Indices of samples + """ + max_iou = max_overlaps.max() + iou_interval = (max_iou - self.floor_thr) / self.num_bins + per_num_expected = int(num_expected / self.num_bins) + + sampled_inds = [] + for i in range(self.num_bins): + start_iou = self.floor_thr + i * iou_interval + end_iou = self.floor_thr + (i + 1) * iou_interval + tmp_set = set( + np.where( + np.logical_and(max_overlaps >= start_iou, + max_overlaps < end_iou))[0]) + tmp_inds = list(tmp_set & full_set) + if len(tmp_inds) > per_num_expected: + tmp_sampled_set = self.random_choice(tmp_inds, + per_num_expected) + else: + tmp_sampled_set = np.array(tmp_inds, dtype=np.int64) + sampled_inds.append(tmp_sampled_set) + + sampled_inds = np.concatenate(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(full_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate([sampled_inds, extra_inds]) + + return sampled_inds + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Sample negative boxes. + + Args: + assign_result (:obj:`AssignResult`): The assigned results of boxes. + num_expected (int): The number of expected negative samples + + Returns: + Tensor or ndarray: sampled indices. + """ + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + max_overlaps = assign_result.max_overlaps.cpu().numpy() + # balance sampling for negative samples + neg_set = set(neg_inds.cpu().numpy()) + + if self.floor_thr > 0: + floor_set = set( + np.where( + np.logical_and(max_overlaps >= 0, + max_overlaps < self.floor_thr))[0]) + iou_sampling_set = set( + np.where(max_overlaps >= self.floor_thr)[0]) + elif self.floor_thr == 0: + floor_set = set(np.where(max_overlaps == 0)[0]) + iou_sampling_set = set( + np.where(max_overlaps > self.floor_thr)[0]) + else: + floor_set = set() + iou_sampling_set = set( + np.where(max_overlaps > self.floor_thr)[0]) + # for sampling interval calculation + self.floor_thr = 0 + + floor_neg_inds = list(floor_set & neg_set) + iou_sampling_neg_inds = list(iou_sampling_set & neg_set) + num_expected_iou_sampling = int(num_expected * + (1 - self.floor_fraction)) + if len(iou_sampling_neg_inds) > num_expected_iou_sampling: + if self.num_bins >= 2: + iou_sampled_inds = self.sample_via_interval( + max_overlaps, set(iou_sampling_neg_inds), + num_expected_iou_sampling) + else: + iou_sampled_inds = self.random_choice( + iou_sampling_neg_inds, num_expected_iou_sampling) + else: + iou_sampled_inds = np.array( + iou_sampling_neg_inds, dtype=np.int64) + num_expected_floor = num_expected - len(iou_sampled_inds) + if len(floor_neg_inds) > num_expected_floor: + sampled_floor_inds = self.random_choice( + floor_neg_inds, num_expected_floor) + else: + sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int64) + sampled_inds = np.concatenate( + (sampled_floor_inds, iou_sampled_inds)) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(neg_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate((sampled_inds, extra_inds)) + sampled_inds = 
torch.from_numpy(sampled_inds).long().to( + assign_result.gt_inds.device) + return sampled_inds diff --git a/mmdetection/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py new file mode 100644 index 0000000..307dd5d --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""copy from +https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py.""" + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from ..assigners import AssignResult +from .base_sampler import BaseSampler +from .mask_sampling_result import MaskSamplingResult + + +@TASK_UTILS.register_module() +class MaskPseudoSampler(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, *args, **kwargs): + """Directly returns the positive and negative indices of samples. + + Args: + assign_result (:obj:`AssignResult`): Mask assigning results. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``scores`` and ``masks`` predicted + by the model. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``labels`` and ``masks`` + attributes. + + Returns: + :obj:`SamplingResult`: sampler results + """ + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + gt_flags = pred_masks.new_zeros(pred_masks.shape[0], dtype=torch.uint8) + sampling_result = MaskSamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + masks=pred_masks, + gt_masks=gt_masks, + assign_result=assign_result, + gt_flags=gt_flags, + avg_factor_with_neg=False) + return sampling_result diff --git a/mmdetection/mmdet/models/task_modules/samplers/mask_sampling_result.py b/mmdetection/mmdet/models/task_modules/samplers/mask_sampling_result.py new file mode 100644 index 0000000..adaa62e --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/mask_sampling_result.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
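+# Usage sketch (illustrative only): MaskSamplingResult mirrors SamplingResult
+# but gathers masks instead of boxes. It is produced by the MaskPseudoSampler
+# defined above, assuming `pred_instances.masks` and `gt_instances.masks`
+# carry the predicted and ground-truth masks:
+#
+# >>> result = MaskPseudoSampler().sample(assign_result, pred_instances,
+# ...                                     gt_instances)
+# >>> result.pos_masks.shape, result.pos_gt_masks.shape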
+"""copy from +https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py.""" + +import torch +from torch import Tensor + +from ..assigners import AssignResult +from .sampling_result import SamplingResult + + +class MaskSamplingResult(SamplingResult): + """Mask sampling result.""" + + def __init__(self, + pos_inds: Tensor, + neg_inds: Tensor, + masks: Tensor, + gt_masks: Tensor, + assign_result: AssignResult, + gt_flags: Tensor, + avg_factor_with_neg: bool = True) -> None: + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.num_pos = max(pos_inds.numel(), 1) + self.num_neg = max(neg_inds.numel(), 1) + self.avg_factor = self.num_pos + self.num_neg \ + if avg_factor_with_neg else self.num_pos + + self.pos_masks = masks[pos_inds] + self.neg_masks = masks[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_masks.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_masks.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_masks = torch.empty_like(gt_masks) + else: + self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] + + @property + def masks(self) -> Tensor: + """torch.Tensor: concatenated positive and negative masks.""" + return torch.cat([self.pos_masks, self.neg_masks]) + + def __nice__(self) -> str: + data = self.info.copy() + data['pos_masks'] = data.pop('pos_masks').shape + data['neg_masks'] = data.pop('neg_masks').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def info(self) -> dict: + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_masks': self.pos_masks, + 'neg_masks': self.neg_masks, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + } diff --git a/mmdetection/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py new file mode 100644 index 0000000..8b74054 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmengine.structures import InstanceData +from numpy import ndarray +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from ..assigners import AssignResult +from .multi_instance_sampling_result import MultiInstanceSamplingResult +from .random_sampler import RandomSampler + + +@TASK_UTILS.register_module() +class MultiInsRandomSampler(RandomSampler): + """Random sampler for multi instance. + + Note: + Multi-instance means to predict multiple detection boxes with + one proposal box. `AssignResult` may assign multiple gt boxes + to each proposal box, in this case `RandomSampler` should be + replaced by `MultiInsRandomSampler` + """ + + def _sample_pos(self, assign_result: AssignResult, num_expected: int, + **kwargs) -> Union[Tensor, ndarray]: + """Randomly sample some positive samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. 
+ """ + pos_inds = torch.nonzero( + assign_result.labels[:, 0] > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result: AssignResult, num_expected: int, + **kwargs) -> Union[Tensor, ndarray]: + """Randomly sample some negative samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. + """ + neg_inds = torch.nonzero( + assign_result.labels[:, 0] == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, + **kwargs) -> MultiInstanceSamplingResult: + """Sample positive and negative bboxes. + + Args: + assign_result (:obj:`AssignResult`): Assigning results from + MultiInstanceAssigner. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + + Returns: + :obj:`MultiInstanceSamplingResult`: Sampling result. + """ + + assert 'batch_gt_instances_ignore' in kwargs, \ + 'batch_gt_instances_ignore is necessary for MultiInsRandomSampler' + + gt_bboxes = gt_instances.bboxes + ignore_bboxes = kwargs['batch_gt_instances_ignore'].bboxes + gt_and_ignore_bboxes = torch.cat([gt_bboxes, ignore_bboxes], dim=0) + priors = pred_instances.priors + if len(priors.shape) < 2: + priors = priors[None, :] + priors = priors[:, :4] + + gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8) + priors = torch.cat([priors, gt_and_ignore_bboxes], dim=0) + gt_ones = priors.new_ones( + gt_and_ignore_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_flags, gt_ones]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos(assign_result, + num_expected_pos) + # We found that sampled indices have duplicated items occasionally. 
+ # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg(assign_result, + num_expected_neg) + neg_inds = neg_inds.unique() + + sampling_result = MultiInstanceSamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_and_ignore_bboxes=gt_and_ignore_bboxes, + assign_result=assign_result, + gt_flags=gt_flags) + return sampling_result diff --git a/mmdetection/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py b/mmdetection/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py new file mode 100644 index 0000000..438a0aa --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +from ..assigners import AssignResult +from .sampling_result import SamplingResult + + +class MultiInstanceSamplingResult(SamplingResult): + """Bbox sampling result. Further encapsulation of SamplingResult. Three + attributes neg_assigned_gt_inds, neg_gt_labels, and neg_gt_bboxes have been + added for SamplingResult. + + Args: + pos_inds (Tensor): Indices of positive samples. + neg_inds (Tensor): Indices of negative samples. + priors (Tensor): The priors can be anchors or points, + or the bboxes predicted by the previous stage. + gt_and_ignore_bboxes (Tensor): Ground truth and ignore bboxes. + assign_result (:obj:`AssignResult`): Assigning results. + gt_flags (Tensor): The Ground truth flags. + avg_factor_with_neg (bool): If True, ``avg_factor`` equal to + the number of total priors; Otherwise, it is the number of + positive priors. Defaults to True. + """ + + def __init__(self, + pos_inds: Tensor, + neg_inds: Tensor, + priors: Tensor, + gt_and_ignore_bboxes: Tensor, + assign_result: AssignResult, + gt_flags: Tensor, + avg_factor_with_neg: bool = True) -> None: + self.neg_assigned_gt_inds = assign_result.gt_inds[neg_inds] + self.neg_gt_labels = assign_result.labels[neg_inds] + + if gt_and_ignore_bboxes.numel() == 0: + self.neg_gt_bboxes = torch.empty_like(gt_and_ignore_bboxes).view( + -1, 4) + else: + if len(gt_and_ignore_bboxes.shape) < 2: + gt_and_ignore_bboxes = gt_and_ignore_bboxes.view(-1, 4) + self.neg_gt_bboxes = gt_and_ignore_bboxes[ + self.neg_assigned_gt_inds.long(), :] + + # To resist the minus 1 operation in `SamplingResult.init()`. + assign_result.gt_inds += 1 + super().__init__( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_bboxes=gt_and_ignore_bboxes, + assign_result=assign_result, + gt_flags=gt_flags, + avg_factor_with_neg=avg_factor_with_neg) diff --git a/mmdetection/mmdet/models/task_modules/samplers/ohem_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/ohem_sampler.py new file mode 100644 index 0000000..f478a44 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/ohem_sampler.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
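+# Usage sketch (illustrative only): OHEM keeps the RoIs with the highest
+# classification loss instead of sampling them uniformly, so the sampler needs
+# a `context` (the RoI head) to run a forward pass and score candidates. It is
+# normally built from config by that head; the surrounding `train_cfg`/`rcnn`
+# keys below are assumptions, only the sampler fields come from this class:
+#
+# >>> sampler_cfg = dict(
+# ...     type='OHEMSampler',
+# ...     num=512,
+# ...     pos_fraction=0.25,
+# ...     neg_pos_ub=-1,
+# ...     add_gt_as_proposals=True)
+# >>> train_cfg = dict(rcnn=dict(sampler=sampler_cfg))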
+import torch + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox2roi +from .base_sampler import BaseSampler + + +@TASK_UTILS.register_module() +class OHEMSampler(BaseSampler): + r"""Online Hard Example Mining Sampler described in `Training Region-based + Object Detectors with Online Hard Example Mining + `_. + """ + + def __init__(self, + num, + pos_fraction, + context, + neg_pos_ub=-1, + add_gt_as_proposals=True, + loss_key='loss_cls', + **kwargs): + super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub, + add_gt_as_proposals) + self.context = context + if not hasattr(self.context, 'num_stages'): + self.bbox_head = self.context.bbox_head + else: + self.bbox_head = self.context.bbox_head[self.context.current_stage] + + self.loss_key = loss_key + + def hard_mining(self, inds, num_expected, bboxes, labels, feats): + with torch.no_grad(): + rois = bbox2roi([bboxes]) + if not hasattr(self.context, 'num_stages'): + bbox_results = self.context._bbox_forward(feats, rois) + else: + bbox_results = self.context._bbox_forward( + self.context.current_stage, feats, rois) + cls_score = bbox_results['cls_score'] + loss = self.bbox_head.loss( + cls_score=cls_score, + bbox_pred=None, + rois=rois, + labels=labels, + label_weights=cls_score.new_ones(cls_score.size(0)), + bbox_targets=None, + bbox_weights=None, + reduction_override='none')[self.loss_key] + _, topk_loss_inds = loss.topk(num_expected) + return inds[topk_loss_inds] + + def _sample_pos(self, + assign_result, + num_expected, + bboxes=None, + feats=None, + **kwargs): + """Sample positive boxes. + + Args: + assign_result (:obj:`AssignResult`): Assigned results + num_expected (int): Number of expected positive samples + bboxes (torch.Tensor, optional): Boxes. Defaults to None. + feats (list[torch.Tensor], optional): Multi-level features. + Defaults to None. + + Returns: + torch.Tensor: Indices of positive samples + """ + # Sample some hard positive samples + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds], + assign_result.labels[pos_inds], feats) + + def _sample_neg(self, + assign_result, + num_expected, + bboxes=None, + feats=None, + **kwargs): + """Sample negative boxes. + + Args: + assign_result (:obj:`AssignResult`): Assigned results + num_expected (int): Number of expected negative samples + bboxes (torch.Tensor, optional): Boxes. Defaults to None. + feats (list[torch.Tensor], optional): Multi-level features. + Defaults to None. + + Returns: + torch.Tensor: Indices of negative samples + """ + # Sample some hard negative samples + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + neg_labels = assign_result.labels.new_empty( + neg_inds.size(0)).fill_(self.bbox_head.num_classes) + return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds], + neg_labels, feats) diff --git a/mmdetection/mmdet/models/task_modules/samplers/pseudo_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/pseudo_sampler.py new file mode 100644 index 0000000..a8186cc --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/pseudo_sampler.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
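+# Usage sketch (illustrative only): PseudoSampler performs no real sampling.
+# `sample()` just splits the assignment into positive (gt_inds > 0) and
+# negative (gt_inds == 0) indices over `pred_instances.priors`, assuming
+# `assign_result`, `pred_instances` and `gt_instances` come from an assigner
+# as in the other samplers in this package:
+#
+# >>> sampler = PseudoSampler()
+# >>> result = sampler.sample(assign_result, pred_instances, gt_instances)
+# >>> pos_inds, neg_inds = result.pos_inds, result.neg_inds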
+import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from ..assigners import AssignResult +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +@TASK_UTILS.register_module() +class PseudoSampler(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, *args, **kwargs): + """Directly returns the positive and negative indices of samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors, points, or bboxes predicted by the model, + shape(n, 4). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + :obj:`SamplingResult`: sampler results + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + + gt_flags = priors.new_zeros(priors.shape[0], dtype=torch.uint8) + sampling_result = SamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_bboxes=gt_bboxes, + assign_result=assign_result, + gt_flags=gt_flags, + avg_factor_with_neg=False) + return sampling_result diff --git a/mmdetection/mmdet/models/task_modules/samplers/random_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/random_sampler.py new file mode 100644 index 0000000..fa03665 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/random_sampler.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from numpy import ndarray +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from ..assigners import AssignResult +from .base_sampler import BaseSampler + + +@TASK_UTILS.register_module() +class RandomSampler(BaseSampler): + """Random sampler. + + Args: + num (int): Number of samples + pos_fraction (float): Fraction of positive samples + neg_pos_up (int): Upper bound number of negative and + positive samples. Defaults to -1. + add_gt_as_proposals (bool): Whether to add ground truth + boxes as proposals. Defaults to True. + """ + + def __init__(self, + num: int, + pos_fraction: float, + neg_pos_ub: int = -1, + add_gt_as_proposals: bool = True, + **kwargs): + from .sampling_result import ensure_rng + super().__init__( + num=num, + pos_fraction=pos_fraction, + neg_pos_ub=neg_pos_ub, + add_gt_as_proposals=add_gt_as_proposals) + self.rng = ensure_rng(kwargs.get('rng', None)) + + def random_choice(self, gallery: Union[Tensor, ndarray, list], + num: int) -> Union[Tensor, ndarray]: + """Random select some elements from the gallery. + + If `gallery` is a Tensor, the returned indices will be a Tensor; + If `gallery` is a ndarray or list, the returned indices will be a + ndarray. + + Args: + gallery (Tensor | ndarray | list): indices pool. + num (int): expected sample num. + + Returns: + Tensor or ndarray: sampled indices. 
+ """ + assert len(gallery) >= num + + is_tensor = isinstance(gallery, torch.Tensor) + if not is_tensor: + if torch.cuda.is_available(): + device = torch.cuda.current_device() + else: + device = 'cpu' + gallery = torch.tensor(gallery, dtype=torch.long, device=device) + # This is a temporary fix. We can revert the following code + # when PyTorch fixes the abnormal return of torch.randperm. + # See: https://github.com/open-mmlab/mmdetection/pull/5014 + perm = torch.randperm(gallery.numel())[:num].to(device=gallery.device) + rand_inds = gallery[perm] + if not is_tensor: + rand_inds = rand_inds.cpu().numpy() + return rand_inds + + def _sample_pos(self, assign_result: AssignResult, num_expected: int, + **kwargs) -> Union[Tensor, ndarray]: + """Randomly sample some positive samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. + """ + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result: AssignResult, num_expected: int, + **kwargs) -> Union[Tensor, ndarray]: + """Randomly sample some negative samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. + """ + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) diff --git a/mmdetection/mmdet/models/task_modules/samplers/sampling_result.py b/mmdetection/mmdet/models/task_modules/samplers/sampling_result.py new file mode 100644 index 0000000..cb510ee --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/sampling_result.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch +from torch import Tensor + +from mmdet.structures.bbox import BaseBoxes, cat_boxes +from mmdet.utils import util_mixins +from mmdet.utils.util_random import ensure_rng +from ..assigners import AssignResult + + +def random_boxes(num=1, scale=1, rng=None): + """Simple version of ``kwimage.Boxes.random`` + + Returns: + Tensor: shape (n, 4) in x1, y1, x2, y2 format. + + References: + https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390 + + Example: + >>> num = 3 + >>> scale = 512 + >>> rng = 0 + >>> boxes = random_boxes(num, scale, rng) + >>> print(boxes) + tensor([[280.9925, 278.9802, 308.6148, 366.1769], + [216.9113, 330.6978, 224.0446, 456.5878], + [405.3632, 196.3221, 493.3953, 270.7942]]) + """ + rng = ensure_rng(rng) + + tlbr = rng.rand(num, 4).astype(np.float32) + + tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2]) + tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3]) + br_x = np.maximum(tlbr[:, 0], tlbr[:, 2]) + br_y = np.maximum(tlbr[:, 1], tlbr[:, 3]) + + tlbr[:, 0] = tl_x * scale + tlbr[:, 1] = tl_y * scale + tlbr[:, 2] = br_x * scale + tlbr[:, 3] = br_y * scale + + boxes = torch.from_numpy(tlbr) + return boxes + + +class SamplingResult(util_mixins.NiceRepr): + """Bbox sampling result. + + Args: + pos_inds (Tensor): Indices of positive samples. 
+ neg_inds (Tensor): Indices of negative samples. + priors (Tensor): The priors can be anchors or points, + or the bboxes predicted by the previous stage. + gt_bboxes (Tensor): Ground truth of bboxes. + assign_result (:obj:`AssignResult`): Assigning results. + gt_flags (Tensor): The Ground truth flags. + avg_factor_with_neg (bool): If True, ``avg_factor`` equal to + the number of total priors; Otherwise, it is the number of + positive priors. Defaults to True. + + Example: + >>> # xdoctest: +IGNORE_WANT + >>> from mmdet.models.task_modules.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random(rng=10) + >>> print(f'self = {self}') + self = + """ + + def __init__(self, + pos_inds: Tensor, + neg_inds: Tensor, + priors: Tensor, + gt_bboxes: Tensor, + assign_result: AssignResult, + gt_flags: Tensor, + avg_factor_with_neg: bool = True) -> None: + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.num_pos = max(pos_inds.numel(), 1) + self.num_neg = max(neg_inds.numel(), 1) + self.avg_factor_with_neg = avg_factor_with_neg + self.avg_factor = self.num_pos + self.num_neg \ + if avg_factor_with_neg else self.num_pos + self.pos_priors = priors[pos_inds] + self.neg_priors = priors[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + self.pos_gt_labels = assign_result.labels[pos_inds] + box_dim = gt_bboxes.box_dim if isinstance(gt_bboxes, BaseBoxes) else 4 + if gt_bboxes.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = gt_bboxes.view(-1, box_dim) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, box_dim) + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long()] + + @property + def priors(self): + """torch.Tensor: concatenated positive and negative priors""" + return cat_boxes([self.pos_priors, self.neg_priors]) + + @property + def bboxes(self): + """torch.Tensor: concatenated positive and negative boxes""" + warnings.warn('DeprecationWarning: bboxes is deprecated, ' + 'please use "priors" instead') + return self.priors + + @property + def pos_bboxes(self): + warnings.warn('DeprecationWarning: pos_bboxes is deprecated, ' + 'please use "pos_priors" instead') + return self.pos_priors + + @property + def neg_bboxes(self): + warnings.warn('DeprecationWarning: neg_bboxes is deprecated, ' + 'please use "neg_priors" instead') + return self.neg_priors + + def to(self, device): + """Change the device of the data inplace. 
+ + Example: + >>> self = SamplingResult.random() + >>> print(f'self = {self.to(None)}') + >>> # xdoctest: +REQUIRES(--gpu) + >>> print(f'self = {self.to(0)}') + """ + _dict = self.__dict__ + for key, value in _dict.items(): + if isinstance(value, (torch.Tensor, BaseBoxes)): + _dict[key] = value.to(device) + return self + + def __nice__(self): + data = self.info.copy() + data['pos_priors'] = data.pop('pos_priors').shape + data['neg_priors'] = data.pop('neg_priors').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_priors': self.pos_priors, + 'neg_priors': self.neg_priors, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + 'num_pos': self.num_pos, + 'num_neg': self.num_neg, + 'avg_factor': self.avg_factor + } + + @classmethod + def random(cls, rng=None, **kwargs): + """ + Args: + rng (None | int | numpy.random.RandomState): seed or state. + kwargs (keyword arguments): + - num_preds: Number of predicted boxes. + - num_gts: Number of true boxes. + - p_ignore (float): Probability of a predicted box assigned to + an ignored truth. + - p_assigned (float): probability of a predicted box not being + assigned. + + Returns: + :obj:`SamplingResult`: Randomly generated sampling result. + + Example: + >>> from mmdet.models.task_modules.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random() + >>> print(self.__dict__) + """ + from mmengine.structures import InstanceData + + from mmdet.models.task_modules.assigners import AssignResult + from mmdet.models.task_modules.samplers import RandomSampler + rng = ensure_rng(rng) + + # make probabilistic? + num = 32 + pos_fraction = 0.5 + neg_pos_ub = -1 + + assign_result = AssignResult.random(rng=rng, **kwargs) + + # Note we could just compute an assignment + priors = random_boxes(assign_result.num_preds, rng=rng) + gt_bboxes = random_boxes(assign_result.num_gts, rng=rng) + gt_labels = torch.randint( + 0, 5, (assign_result.num_gts, ), dtype=torch.long) + + pred_instances = InstanceData() + pred_instances.priors = priors + + gt_instances = InstanceData() + gt_instances.bboxes = gt_bboxes + gt_instances.labels = gt_labels + + add_gt_as_proposals = True + + sampler = RandomSampler( + num, + pos_fraction, + neg_pos_ub=neg_pos_ub, + add_gt_as_proposals=add_gt_as_proposals, + rng=rng) + self = sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + return self diff --git a/mmdetection/mmdet/models/task_modules/samplers/score_hlr_sampler.py b/mmdetection/mmdet/models/task_modules/samplers/score_hlr_sampler.py new file mode 100644 index 0000000..0227585 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/samplers/score_hlr_sampler.py @@ -0,0 +1,290 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
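+# Usage sketch (illustrative only): Score-HLR re-ranks negative proposals by
+# how confidently the current bbox head misclassifies them and converts that
+# rank into per-sample loss weights. Unlike the other samplers here, `sample()`
+# therefore returns a tuple, and the caller (assumed to be an ISR-style RoI
+# head that supplies FPN features as `feats`) must unpack both values:
+#
+# >>> sampling_result, neg_label_weights = sampler.sample(
+# ...     assign_result, pred_instances, gt_instances, feats=feats)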
+from typing import Union + +import torch +from mmcv.ops import nms_match +from mmengine.structures import InstanceData +from numpy import ndarray +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox2roi +from ..assigners import AssignResult +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +@TASK_UTILS.register_module() +class ScoreHLRSampler(BaseSampler): + r"""Importance-based Sample Reweighting (ISR_N), described in `Prime Sample + Attention in Object Detection `_. + + Score hierarchical local rank (HLR) differentiates with RandomSampler in + negative part. It firstly computes Score-HLR in a two-step way, + then linearly maps score hlr to the loss weights. + + Args: + num (int): Total number of sampled RoIs. + pos_fraction (float): Fraction of positive samples. + context (:obj:`BaseRoIHead`): RoI head that the sampler belongs to. + neg_pos_ub (int): Upper bound of the ratio of num negative to num + positive, -1 means no upper bound. Defaults to -1. + add_gt_as_proposals (bool): Whether to add ground truth as proposals. + Defaults to True. + k (float): Power of the non-linear mapping. Defaults to 0.5 + bias (float): Shift of the non-linear mapping. Defaults to 0. + score_thr (float): Minimum score that a negative sample is to be + considered as valid bbox. Defaults to 0.05. + iou_thr (float): IoU threshold for NMS match. Defaults to 0.5. + """ + + def __init__(self, + num: int, + pos_fraction: float, + context, + neg_pos_ub: int = -1, + add_gt_as_proposals: bool = True, + k: float = 0.5, + bias: float = 0, + score_thr: float = 0.05, + iou_thr: float = 0.5, + **kwargs) -> None: + super().__init__( + num=num, + pos_fraction=pos_fraction, + neg_pos_ub=neg_pos_ub, + add_gt_as_proposals=add_gt_as_proposals) + self.k = k + self.bias = bias + self.score_thr = score_thr + self.iou_thr = iou_thr + self.context = context + # context of cascade detectors is a list, so distinguish them here. + if not hasattr(context, 'num_stages'): + self.bbox_roi_extractor = context.bbox_roi_extractor + self.bbox_head = context.bbox_head + self.with_shared_head = context.with_shared_head + if self.with_shared_head: + self.shared_head = context.shared_head + else: + self.bbox_roi_extractor = context.bbox_roi_extractor[ + context.current_stage] + self.bbox_head = context.bbox_head[context.current_stage] + + @staticmethod + def random_choice(gallery: Union[Tensor, ndarray, list], + num: int) -> Union[Tensor, ndarray]: + """Randomly select some elements from the gallery. + + If `gallery` is a Tensor, the returned indices will be a Tensor; + If `gallery` is a ndarray or list, the returned indices will be a + ndarray. + + Args: + gallery (Tensor or ndarray or list): indices pool. + num (int): expected sample num. + + Returns: + Tensor or ndarray: sampled indices. + """ + assert len(gallery) >= num + + is_tensor = isinstance(gallery, torch.Tensor) + if not is_tensor: + if torch.cuda.is_available(): + device = torch.cuda.current_device() + else: + device = 'cpu' + gallery = torch.tensor(gallery, dtype=torch.long, device=device) + perm = torch.randperm(gallery.numel(), device=gallery.device)[:num] + rand_inds = gallery[perm] + if not is_tensor: + rand_inds = rand_inds.cpu().numpy() + return rand_inds + + def _sample_pos(self, assign_result: AssignResult, num_expected: int, + **kwargs) -> Union[Tensor, ndarray]: + """Randomly sample some positive samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. 
+ num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. + """ + pos_inds = torch.nonzero(assign_result.gt_inds > 0).flatten() + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result: AssignResult, num_expected: int, + bboxes: Tensor, feats: Tensor, + **kwargs) -> Union[Tensor, ndarray]: + """Sample negative samples. + + Score-HLR sampler is done in the following steps: + 1. Take the maximum positive score prediction of each negative samples + as s_i. + 2. Filter out negative samples whose s_i <= score_thr, the left samples + are called valid samples. + 3. Use NMS-Match to divide valid samples into different groups, + samples in the same group will greatly overlap with each other + 4. Rank the matched samples in two-steps to get Score-HLR. + (1) In the same group, rank samples with their scores. + (2) In the same score rank across different groups, + rank samples with their scores again. + 5. Linearly map Score-HLR to the final label weights. + + Args: + assign_result (:obj:`AssignResult`): result of assigner. + num_expected (int): Expected number of samples. + bboxes (Tensor): bbox to be sampled. + feats (Tensor): Features come from FPN. + + Returns: + Tensor or ndarray: sampled indices. + """ + neg_inds = torch.nonzero(assign_result.gt_inds == 0).flatten() + num_neg = neg_inds.size(0) + if num_neg == 0: + return neg_inds, None + with torch.no_grad(): + neg_bboxes = bboxes[neg_inds] + neg_rois = bbox2roi([neg_bboxes]) + bbox_result = self.context._bbox_forward(feats, neg_rois) + cls_score, bbox_pred = bbox_result['cls_score'], bbox_result[ + 'bbox_pred'] + + ori_loss = self.bbox_head.loss( + cls_score=cls_score, + bbox_pred=None, + rois=None, + labels=neg_inds.new_full((num_neg, ), + self.bbox_head.num_classes), + label_weights=cls_score.new_ones(num_neg), + bbox_targets=None, + bbox_weights=None, + reduction_override='none')['loss_cls'] + + # filter out samples with the max score lower than score_thr + max_score, argmax_score = cls_score.softmax(-1)[:, :-1].max(-1) + valid_inds = (max_score > self.score_thr).nonzero().view(-1) + invalid_inds = (max_score <= self.score_thr).nonzero().view(-1) + num_valid = valid_inds.size(0) + num_invalid = invalid_inds.size(0) + + num_expected = min(num_neg, num_expected) + num_hlr = min(num_valid, num_expected) + num_rand = num_expected - num_hlr + if num_valid > 0: + valid_rois = neg_rois[valid_inds] + valid_max_score = max_score[valid_inds] + valid_argmax_score = argmax_score[valid_inds] + valid_bbox_pred = bbox_pred[valid_inds] + + # valid_bbox_pred shape: [num_valid, #num_classes, 4] + valid_bbox_pred = valid_bbox_pred.view( + valid_bbox_pred.size(0), -1, 4) + selected_bbox_pred = valid_bbox_pred[range(num_valid), + valid_argmax_score] + pred_bboxes = self.bbox_head.bbox_coder.decode( + valid_rois[:, 1:], selected_bbox_pred) + pred_bboxes_with_score = torch.cat( + [pred_bboxes, valid_max_score[:, None]], -1) + group = nms_match(pred_bboxes_with_score, self.iou_thr) + + # imp: importance + imp = cls_score.new_zeros(num_valid) + for g in group: + g_score = valid_max_score[g] + # g_score has already sorted + rank = g_score.new_tensor(range(g_score.size(0))) + imp[g] = num_valid - rank + g_score + _, imp_rank_inds = imp.sort(descending=True) + _, imp_rank = imp_rank_inds.sort() + hlr_inds = imp_rank_inds[:num_expected] + + if num_rand > 0: + rand_inds = torch.randperm(num_invalid)[:num_rand] + 
select_inds = torch.cat( + [valid_inds[hlr_inds], invalid_inds[rand_inds]]) + else: + select_inds = valid_inds[hlr_inds] + + neg_label_weights = cls_score.new_ones(num_expected) + + up_bound = max(num_expected, num_valid) + imp_weights = (up_bound - + imp_rank[hlr_inds].float()) / up_bound + neg_label_weights[:num_hlr] = imp_weights + neg_label_weights[num_hlr:] = imp_weights.min() + neg_label_weights = (self.bias + + (1 - self.bias) * neg_label_weights).pow( + self.k) + ori_selected_loss = ori_loss[select_inds] + new_loss = ori_selected_loss * neg_label_weights + norm_ratio = ori_selected_loss.sum() / new_loss.sum() + neg_label_weights *= norm_ratio + else: + neg_label_weights = cls_score.new_ones(num_expected) + select_inds = torch.randperm(num_neg)[:num_expected] + + return neg_inds[select_inds], neg_label_weights + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, **kwargs) -> SamplingResult: + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Assigning results. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + + Returns: + :obj:`SamplingResult`: Sampling result. + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + + gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8) + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + priors = torch.cat([gt_bboxes, priors], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = priors.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=priors, **kwargs) + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds, neg_label_weights = self.neg_sampler._sample_neg( + assign_result, num_expected_neg, bboxes=priors, **kwargs) + + sampling_result = SamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_bboxes=gt_bboxes, + assign_result=assign_result, + gt_flags=gt_flags) + return sampling_result, neg_label_weights diff --git a/mmdetection/mmdet/models/task_modules/tracking/__init__.py b/mmdetection/mmdet/models/task_modules/tracking/__init__.py new file mode 100644 index 0000000..57a86d7 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/tracking/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
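Before moving on to the tracking task modules, the rank-to-weight mapping inside `_sample_neg` above is worth seeing in isolation, since it is the core of ISR-N. The following is a minimal sketch of just that step (the helper name and the toy inputs are illustrative, not part of the patch): Score-HLR ranks are mapped linearly into (0, 1], bent by the `(bias + (1 - bias) * w) ** k` curve, and rescaled so the total classification loss over the selected negatives is unchanged.

```python
import torch


def hlr_to_label_weights(imp_rank: torch.Tensor,
                         ori_loss: torch.Tensor,
                         num_expected: int,
                         num_valid: int,
                         bias: float = 0.0,
                         k: float = 0.5) -> torch.Tensor:
    """imp_rank: Score-HLR rank per kept negative (0 = most important).
    ori_loss: unweighted classification loss of the same negatives."""
    up_bound = max(num_expected, num_valid)
    weights = (up_bound - imp_rank.float()) / up_bound   # linear map into (0, 1]
    weights = (bias + (1 - bias) * weights).pow(k)       # non-linear remapping
    # rescale so the weighted loss sums to the original loss
    norm_ratio = ori_loss.sum() / (ori_loss * weights).sum()
    return weights * norm_ratio


# toy check: 4 negatives with ranks 0..3 and unit losses
w = hlr_to_label_weights(torch.arange(4), torch.ones(4),
                         num_expected=4, num_valid=4)
print(w, (w * torch.ones(4)).sum())  # decreasing weights; weighted loss sums to 4
```

In the sampler itself, the randomly filled negatives (those below `score_thr`) simply inherit the smallest of these weights, so easy background boxes never outweigh the hard, high-scoring false positives.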
+from .aflink import AppearanceFreeLink +from .camera_motion_compensation import CameraMotionCompensation +from .interpolation import InterpolateTracklets +from .kalman_filter import KalmanFilter +from .similarity import embed_similarity + +__all__ = [ + 'KalmanFilter', 'InterpolateTracklets', 'embed_similarity', + 'AppearanceFreeLink', 'CameraMotionCompensation' +] diff --git a/mmdetection/mmdet/models/task_modules/tracking/aflink.py b/mmdetection/mmdet/models/task_modules/tracking/aflink.py new file mode 100644 index 0000000..5246106 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/tracking/aflink.py @@ -0,0 +1,281 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import defaultdict +from typing import Tuple + +import numpy as np +import torch +from mmengine.model import BaseModule +from mmengine.runner.checkpoint import load_checkpoint +from scipy.optimize import linear_sum_assignment +from torch import Tensor, nn + +from mmdet.registry import TASK_UTILS + +INFINITY = 1e5 + + +class TemporalBlock(BaseModule): + """The temporal block of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. + out_channel (int): the dimension of the output channels. + """ + + def __init__(self, + in_channel: int, + out_channel: int, + kernel_size: tuple = (7, 1)): + super(TemporalBlock, self).__init__() + self.conv = nn.Conv2d(in_channel, out_channel, kernel_size, bias=False) + self.relu = nn.ReLU(inplace=True) + self.bnf = nn.BatchNorm1d(out_channel) + self.bnx = nn.BatchNorm1d(out_channel) + self.bny = nn.BatchNorm1d(out_channel) + + def bn(self, x: Tensor) -> Tensor: + x[:, :, :, 0] = self.bnf(x[:, :, :, 0]) + x[:, :, :, 1] = self.bnx(x[:, :, :, 1]) + x[:, :, :, 2] = self.bny(x[:, :, :, 2]) + return x + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class FusionBlock(BaseModule): + """The fusion block of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. + out_channel (int): the dimension of the output channels. + """ + + def __init__(self, in_channel: int, out_channel: int): + super(FusionBlock, self).__init__() + self.conv = nn.Conv2d(in_channel, out_channel, (1, 3), bias=False) + self.bn = nn.BatchNorm2d(out_channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class Classifier(BaseModule): + """The classifier of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. 
+ """ + + def __init__(self, in_channel: int, out_channel: int): + super(Classifier, self).__init__() + self.fc1 = nn.Linear(in_channel * 2, in_channel // 2) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Linear(in_channel // 2, out_channel) + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + x = torch.cat((x1, x2), dim=1) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + return x + + +class AFLinkModel(BaseModule): + """Appearance-Free Link Model.""" + + def __init__(self, + temporal_module_channels: list = [1, 32, 64, 128, 256], + fusion_module_channels: list = [256, 256], + classifier_channels: list = [256, 2]): + super(AFLinkModel, self).__init__() + self.TemporalModule_1 = nn.Sequential(*[ + TemporalBlock(temporal_module_channels[i], + temporal_module_channels[i + 1]) + for i in range(len(temporal_module_channels) - 1) + ]) + + self.TemporalModule_2 = nn.Sequential(*[ + TemporalBlock(temporal_module_channels[i], + temporal_module_channels[i + 1]) + for i in range(len(temporal_module_channels) - 1) + ]) + + self.FusionBlock_1 = FusionBlock(*fusion_module_channels) + self.FusionBlock_2 = FusionBlock(*fusion_module_channels) + + self.pooling = nn.AdaptiveAvgPool2d((1, 1)) + self.classifier = Classifier(*classifier_channels) + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + assert not self.training, 'Only testing is supported for AFLink.' + x1 = x1[:, :, :, :3] + x2 = x2[:, :, :, :3] + x1 = self.TemporalModule_1(x1) # [B,1,30,3] -> [B,256,6,3] + x2 = self.TemporalModule_2(x2) + x1 = self.FusionBlock_1(x1) + x2 = self.FusionBlock_2(x2) + x1 = self.pooling(x1).squeeze(-1).squeeze(-1) + x2 = self.pooling(x2).squeeze(-1).squeeze(-1) + y = self.classifier(x1, x2) + y = torch.softmax(y, dim=1)[0, 1] + return y + + +@TASK_UTILS.register_module() +class AppearanceFreeLink(BaseModule): + """Appearance-Free Link method. + + This method is proposed in + "StrongSORT: Make DeepSORT Great Again" + `StrongSORT`_. + + Args: + checkpoint (str): Checkpoint path. + temporal_threshold (tuple, optional): The temporal constraint + for tracklets association. Defaults to (0, 30). + spatial_threshold (int, optional): The spatial constraint for + tracklets association. Defaults to 75. + confidence_threshold (float, optional): The minimum confidence + threshold for tracklets association. Defaults to 0.95. + """ + + def __init__(self, + checkpoint: str, + temporal_threshold: tuple = (0, 30), + spatial_threshold: int = 75, + confidence_threshold: float = 0.95): + super(AppearanceFreeLink, self).__init__() + self.temporal_threshold = temporal_threshold + self.spatial_threshold = spatial_threshold + self.confidence_threshold = confidence_threshold + + self.model = AFLinkModel() + if checkpoint: + load_checkpoint(self.model, checkpoint) + if torch.cuda.is_available(): + self.model.cuda() + self.model.eval() + + self.device = next(self.model.parameters()).device + self.fn_l2 = lambda x, y: np.sqrt(x**2 + y**2) + + def data_transform(self, + track1: np.ndarray, + track2: np.ndarray, + length: int = 30) -> Tuple[np.ndarray]: + """Data Transformation. This is used to standardize the length of + tracks to a unified length. Then perform min-max normalization to the + motion embeddings. + + Args: + track1 (ndarray): the first track with shape (N,C). + track2 (ndarray): the second track with shape (M,C). + length (int): the unified length of tracks. Defaults to 30. + + Returns: + Tuple[ndarray]: the transformed track1 and track2. 
+ """ + # fill or cut track1 + length_1 = track1.shape[0] + track1 = track1[-length:] if length_1 >= length else \ + np.pad(track1, ((length - length_1, 0), (0, 0))) + + # fill or cut track1 + length_2 = track2.shape[0] + track2 = track2[:length] if length_2 >= length else \ + np.pad(track2, ((0, length - length_2), (0, 0))) + + # min-max normalization + min_ = np.concatenate((track1, track2), axis=0).min(axis=0) + max_ = np.concatenate((track1, track2), axis=0).max(axis=0) + subtractor = (max_ + min_) / 2 + divisor = (max_ - min_) / 2 + 1e-5 + track1 = (track1 - subtractor) / divisor + track2 = (track2 - subtractor) / divisor + + return track1, track2 + + def forward(self, pred_tracks: np.ndarray) -> np.ndarray: + """Forward function. + + pred_tracks (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + + Returns: + ndarray: The linked tracks with shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score) + """ + # sort tracks by the frame id + pred_tracks = pred_tracks[np.argsort(pred_tracks[:, 0])] + + # gather tracks information + id2info = defaultdict(list) + for row in pred_tracks: + frame_id, track_id, x1, y1, x2, y2 = row[:6] + id2info[track_id].append([frame_id, x1, y1, x2 - x1, y2 - y1]) + id2info = {k: np.array(v) for k, v in id2info.items()} + num_track = len(id2info) + track_ids = np.array(list(id2info)) + cost_matrix = np.full((num_track, num_track), INFINITY) + + # compute the cost matrix + for i, id_i in enumerate(track_ids): + for j, id_j in enumerate(track_ids): + if id_i == id_j: + continue + info_i, info_j = id2info[id_i], id2info[id_j] + frame_i, box_i = info_i[-1][0], info_i[-1][1:3] + frame_j, box_j = info_j[0][0], info_j[0][1:3] + # temporal constraint + if not self.temporal_threshold[0] <= \ + frame_j - frame_i <= self.temporal_threshold[1]: + continue + # spatial constraint + if self.fn_l2(box_i[0] - box_j[0], box_i[1] - box_j[1]) \ + > self.spatial_threshold: + continue + # confidence constraint + track_i, track_j = self.data_transform(info_i, info_j) + + # numpy to torch + track_i = torch.tensor( + track_i, dtype=torch.float).to(self.device) + track_j = torch.tensor( + track_j, dtype=torch.float).to(self.device) + track_i = track_i.unsqueeze(0).unsqueeze(0) + track_j = track_j.unsqueeze(0).unsqueeze(0) + + confidence = self.model(track_i, + track_j).detach().cpu().numpy() + if confidence >= self.confidence_threshold: + cost_matrix[i, j] = 1 - confidence + + # linear assignment + indices = linear_sum_assignment(cost_matrix) + _id2id = dict() # the temporary assignment results + id2id = dict() # the final assignment results + for i, j in zip(indices[0], indices[1]): + if cost_matrix[i, j] < INFINITY: + _id2id[i] = j + for k, v in _id2id.items(): + if k in id2id: + id2id[v] = id2id[k] + else: + id2id[v] = k + + # link + for k, v in id2id.items(): + pred_tracks[pred_tracks[:, 1] == k, 1] = v + + # deduplicate + _, index = np.unique(pred_tracks[:, :2], return_index=True, axis=0) + + return pred_tracks[index] diff --git a/mmdetection/mmdet/models/task_modules/tracking/camera_motion_compensation.py b/mmdetection/mmdet/models/task_modules/tracking/camera_motion_compensation.py new file mode 100644 index 0000000..1a62984 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/tracking/camera_motion_compensation.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import cv2 +import numpy as np +import torch +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox_cxcyah_to_xyxy, bbox_xyxy_to_cxcyah + + +@TASK_UTILS.register_module() +class CameraMotionCompensation: + """Camera motion compensation. + + Args: + warp_mode (str): Warp mode in opencv. + Defaults to 'cv2.MOTION_EUCLIDEAN'. + num_iters (int): Number of the iterations. Defaults to 50. + stop_eps (float): Terminate threshold. Defaults to 0.001. + """ + + def __init__(self, + warp_mode: str = 'cv2.MOTION_EUCLIDEAN', + num_iters: int = 50, + stop_eps: float = 0.001): + self.warp_mode = eval(warp_mode) + self.num_iters = num_iters + self.stop_eps = stop_eps + + def get_warp_matrix(self, img: np.ndarray, ref_img: np.ndarray) -> Tensor: + """Calculate warping matrix between two images.""" + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + ref_img = cv2.cvtColor(ref_img, cv2.COLOR_BGR2GRAY) + + warp_matrix = np.eye(2, 3, dtype=np.float32) + criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, + self.num_iters, self.stop_eps) + cc, warp_matrix = cv2.findTransformECC(img, ref_img, warp_matrix, + self.warp_mode, criteria, None, + 1) + warp_matrix = torch.from_numpy(warp_matrix) + return warp_matrix + + def warp_bboxes(self, bboxes: Tensor, warp_matrix: Tensor) -> Tensor: + """Warp bounding boxes according to the warping matrix.""" + tl, br = bboxes[:, :2], bboxes[:, 2:] + tl = torch.cat((tl, torch.ones(tl.shape[0], 1).to(bboxes.device)), + dim=1) + br = torch.cat((br, torch.ones(tl.shape[0], 1).to(bboxes.device)), + dim=1) + trans_tl = torch.mm(warp_matrix, tl.t()).t() + trans_br = torch.mm(warp_matrix, br.t()).t() + trans_bboxes = torch.cat((trans_tl, trans_br), dim=1) + return trans_bboxes.to(bboxes.device) + + def warp_means(self, means: np.ndarray, warp_matrix: Tensor) -> np.ndarray: + """Warp track.mean according to the warping matrix.""" + cxcyah = torch.from_numpy(means[:, :4]).float() + xyxy = bbox_cxcyah_to_xyxy(cxcyah) + warped_xyxy = self.warp_bboxes(xyxy, warp_matrix) + warped_cxcyah = bbox_xyxy_to_cxcyah(warped_xyxy).numpy() + means[:, :4] = warped_cxcyah + return means + + def track(self, img: Tensor, ref_img: Tensor, tracks: dict, + num_samples: int, frame_id: int, metainfo: dict) -> dict: + """Tracking forward.""" + img = img.squeeze(0).cpu().numpy().transpose((1, 2, 0)) + ref_img = ref_img.squeeze(0).cpu().numpy().transpose((1, 2, 0)) + warp_matrix = self.get_warp_matrix(img, ref_img) + + # rescale the warp_matrix due to the `resize` in pipeline + scale_factor_h, scale_factor_w = metainfo['scale_factor'] + warp_matrix[0, 2] = warp_matrix[0, 2] / scale_factor_w + warp_matrix[1, 2] = warp_matrix[1, 2] / scale_factor_h + + bboxes = [] + num_bboxes = [] + means = [] + for k, v in tracks.items(): + if int(v['frame_ids'][-1]) < frame_id - 1: + _num = 1 + else: + _num = min(num_samples, len(v.bboxes)) + num_bboxes.append(_num) + bboxes.extend(v.bboxes[-_num:]) + if len(v.mean) > 0: + means.append(v.mean) + bboxes = torch.cat(bboxes, dim=0) + warped_bboxes = self.warp_bboxes(bboxes, warp_matrix.to(bboxes.device)) + + warped_bboxes = torch.split(warped_bboxes, num_bboxes) + for b, (k, v) in zip(warped_bboxes, tracks.items()): + _num = b.shape[0] + b = torch.split(b, [1] * _num) + tracks[k].bboxes[-_num:] = b + + if means: + means = np.asarray(means) + warped_means = self.warp_means(means, warp_matrix) + for m, (k, v) in zip(warped_means, tracks.items()): + tracks[k].mean = m + + return tracks diff --git 
a/mmdetection/mmdet/models/task_modules/tracking/interpolation.py b/mmdetection/mmdet/models/task_modules/tracking/interpolation.py new file mode 100644 index 0000000..fb6a25a --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/tracking/interpolation.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +try: + from sklearn.gaussian_process import GaussianProcessRegressor as GPR + from sklearn.gaussian_process.kernels import RBF + HAS_SKIKIT_LEARN = True +except ImportError: + HAS_SKIKIT_LEARN = False + +from mmdet.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class InterpolateTracklets: + """Interpolate tracks to make tracks more complete. + + Args: + min_num_frames (int, optional): The minimum length of a track that will + be interpolated. Defaults to 5. + max_num_frames (int, optional): The maximum disconnected length in + a track. Defaults to 20. + use_gsi (bool, optional): Whether to use the GSI (Gaussian-smoothed + interpolation) method. Defaults to False. + smooth_tau (int, optional): smoothing parameter in GSI. Defaults to 10. + """ + + def __init__(self, + min_num_frames: int = 5, + max_num_frames: int = 20, + use_gsi: bool = False, + smooth_tau: int = 10): + if not HAS_SKIKIT_LEARN: + raise RuntimeError('sscikit-learn is not installed,\ + please install it by: pip install scikit-learn') + self.min_num_frames = min_num_frames + self.max_num_frames = max_num_frames + self.use_gsi = use_gsi + self.smooth_tau = smooth_tau + + def _interpolate_track(self, + track: np.ndarray, + track_id: int, + max_num_frames: int = 20) -> np.ndarray: + """Interpolate a track linearly to make the track more complete. + + This function is proposed in + "ByteTrack: Multi-Object Tracking by Associating Every Detection Box." + `ByteTrack`_. + + Args: + track (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + max_num_frames (int, optional): The maximum disconnected length in + the track. Defaults to 20. + + Returns: + ndarray: The interpolated track with shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score) + """ + assert (track[:, 1] == track_id).all(), \ + 'The track id should not changed when interpolate a track.' + + frame_ids = track[:, 0] + interpolated_track = np.zeros((0, 7)) + # perform interpolation for the disconnected frames in the track. + for i in np.where(np.diff(frame_ids) > 1)[0]: + left_frame_id = frame_ids[i] + right_frame_id = frame_ids[i + 1] + num_disconnected_frames = int(right_frame_id - left_frame_id) + + if 1 < num_disconnected_frames < max_num_frames: + left_bbox = track[i, 2:6] + right_bbox = track[i + 1, 2:6] + + # perform interpolation for two adjacent tracklets. + for j in range(1, num_disconnected_frames): + cur_bbox = j / (num_disconnected_frames) * ( + right_bbox - left_bbox) + left_bbox + cur_result = np.ones((7, )) + cur_result[0] = j + left_frame_id + cur_result[1] = track_id + cur_result[2:6] = cur_bbox + + interpolated_track = np.concatenate( + (interpolated_track, cur_result[None]), axis=0) + + interpolated_track = np.concatenate((track, interpolated_track), + axis=0) + return interpolated_track + + def gaussian_smoothed_interpolation(self, + track: np.ndarray, + smooth_tau: int = 10) -> np.ndarray: + """Gaussian-Smoothed Interpolation. + + This function is proposed in + "StrongSORT: Make DeepSORT Great Again" + `StrongSORT`_. + + Args: + track (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). 
+ smooth_tau (int, optional): smoothing parameter in GSI. + Defaults to 10. + + Returns: + ndarray: The interpolated tracks with shape (N, 7). Each row + denotes (frame_id, track_id, x1, y1, x2, y2, score) + """ + len_scale = np.clip(smooth_tau * np.log(smooth_tau**3 / len(track)), + smooth_tau**-1, smooth_tau**2) + gpr = GPR(RBF(len_scale, 'fixed')) + t = track[:, 0].reshape(-1, 1) + x1 = track[:, 2].reshape(-1, 1) + y1 = track[:, 3].reshape(-1, 1) + x2 = track[:, 4].reshape(-1, 1) + y2 = track[:, 5].reshape(-1, 1) + gpr.fit(t, x1) + x1_gpr = gpr.predict(t) + gpr.fit(t, y1) + y1_gpr = gpr.predict(t) + gpr.fit(t, x2) + x2_gpr = gpr.predict(t) + gpr.fit(t, y2) + y2_gpr = gpr.predict(t) + gsi_track = [[ + t[i, 0], track[i, 1], x1_gpr[i], y1_gpr[i], x2_gpr[i], y2_gpr[i], + track[i, 6] + ] for i in range(len(t))] + return np.array(gsi_track) + + def forward(self, pred_tracks: np.ndarray) -> np.ndarray: + """Forward function. + + pred_tracks (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + + Returns: + ndarray: The interpolated tracks with shape (N, 7). Each row + denotes (frame_id, track_id, x1, y1, x2, y2, score). + """ + max_track_id = int(np.max(pred_tracks[:, 1])) + min_track_id = int(np.min(pred_tracks[:, 1])) + + # perform interpolation for each track + interpolated_tracks = [] + for track_id in range(min_track_id, max_track_id + 1): + inds = pred_tracks[:, 1] == track_id + track = pred_tracks[inds] + num_frames = len(track) + if num_frames <= 2: + continue + + if num_frames > self.min_num_frames: + interpolated_track = self._interpolate_track( + track, track_id, self.max_num_frames) + else: + interpolated_track = track + + if self.use_gsi: + interpolated_track = self.gaussian_smoothed_interpolation( + interpolated_track, self.smooth_tau) + + interpolated_tracks.append(interpolated_track) + + interpolated_tracks = np.concatenate(interpolated_tracks) + return interpolated_tracks[interpolated_tracks[:, 0].argsort()] diff --git a/mmdetection/mmdet/models/task_modules/tracking/kalman_filter.py b/mmdetection/mmdet/models/task_modules/tracking/kalman_filter.py new file mode 100644 index 0000000..a8ae141 --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/tracking/kalman_filter.py @@ -0,0 +1,267 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import numpy as np +import torch + +try: + import scipy.linalg + HAS_SCIPY = True +except ImportError: + HAS_SCIPY = False + +from mmdet.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class KalmanFilter: + """A simple Kalman filter for tracking bounding boxes in image space. + + The implementation is referred to https://github.com/nwojke/deep_sort. + + Args: + center_only (bool): If True, distance computation is done with + respect to the bounding box center position only. + Defaults to False. + use_nsa (bool): Whether to use the NSA (Noise Scale Adaptive) Kalman + Filter, which adaptively modulates the noise scale according to + the quality of detections. More details in + https://arxiv.org/abs/2202.11983. Defaults to False. 
+ """ + chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919 + } + + def __init__(self, center_only: bool = False, use_nsa: bool = False): + if not HAS_SCIPY: + raise RuntimeError('sscikit-learn is not installed,\ + please install it by: pip install scikit-learn') + self.center_only = center_only + if self.center_only: + self.gating_threshold = self.chi2inv95[2] + else: + self.gating_threshold = self.chi2inv95[4] + + self.use_nsa = use_nsa + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement: np.array) -> Tuple[np.array, np.array]: + """Create track from unassociated measurement. + + Args: + measurement (ndarray): Bounding box coordinates (x, y, a, h) with + center position (x, y), aspect ratio a, and height h. + + Returns: + (ndarray, ndarray): Returns the mean vector (8 dimensional) and + covariance matrix (8x8 dimensional) of the new track. + Unobserved velocities are initialized to 0 mean. + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], 1e-5, + 10 * self._std_weight_velocity * measurement[3] + ] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean: np.array, + covariance: np.array) -> Tuple[np.array, np.array]: + """Run Kalman filter prediction step. + + Args: + mean (ndarray): The 8 dimensional mean vector of the object + state at the previous time step. + + covariance (ndarray): The 8x8 dimensional covariance matrix + of the object state at the previous time step. + + Returns: + (ndarray, ndarray): Returns the mean vector and covariance + matrix of the predicted state. Unobserved velocities are + initialized to 0 mean. + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], 1e-2, + self._std_weight_position * mean[3] + ] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], 1e-5, + self._std_weight_velocity * mean[3] + ] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(self._motion_mat, mean) + covariance = np.linalg.multi_dot( + (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, + mean: np.array, + covariance: np.array, + bbox_score: float = 0.) -> Tuple[np.array, np.array]: + """Project state distribution to measurement space. + + Args: + mean (ndarray): The state's mean vector (8 dimensional array). + covariance (ndarray): The state's covariance matrix (8x8 + dimensional). + bbox_score (float): The confidence score of the bbox. + Defaults to 0. + + Returns: + (ndarray, ndarray): Returns the projected mean and covariance + matrix of the given state estimate. 
+ """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], 1e-1, + self._std_weight_position * mean[3] + ] + + if self.use_nsa: + std = [(1 - bbox_score) * x for x in std] + + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot( + (self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def update(self, + mean: np.array, + covariance: np.array, + measurement: np.array, + bbox_score: float = 0.) -> Tuple[np.array, np.array]: + """Run Kalman filter correction step. + + Args: + mean (ndarray): The predicted state's mean vector (8 dimensional). + covariance (ndarray): The state's covariance matrix (8x8 + dimensional). + measurement (ndarray): The 4 dimensional measurement vector + (x, y, a, h), where (x, y) is the center position, a the + aspect ratio, and h the height of the bounding box. + bbox_score (float): The confidence score of the bbox. + Defaults to 0. + + Returns: + (ndarray, ndarray): Returns the measurement-corrected state + distribution. + """ + projected_mean, projected_cov = \ + self.project(mean, covariance, bbox_score) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve((chol_factor, lower), + np.dot(covariance, + self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot( + (kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, + mean: np.array, + covariance: np.array, + measurements: np.array, + only_position: bool = False) -> np.array: + """Compute gating distance between state distribution and measurements. + + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Args: + mean (ndarray): Mean vector over the state distribution (8 + dimensional). + covariance (ndarray): Covariance of the state distribution (8x8 + dimensional). + measurements (ndarray): An Nx4 dimensional matrix of N + measurements, each in format (x, y, a, h) where (x, y) is the + bounding box center position, a the aspect ratio, and h the + height. + only_position (bool, optional): If True, distance computation is + done with respect to the bounding box center position only. + Defaults to False. + + Returns: + ndarray: Returns an array of length N, where the i-th element + contains the squared Mahalanobis distance between + (mean, covariance) and `measurements[i]`. + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + cholesky_factor = np.linalg.cholesky(covariance) + d = measurements - mean + z = scipy.linalg.solve_triangular( + cholesky_factor, + d.T, + lower=True, + check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha + + def track(self, tracks: dict, + bboxes: torch.Tensor) -> Tuple[dict, np.array]: + """Track forward. + + Args: + tracks (dict[int:dict]): Track buffer. + bboxes (Tensor): Detected bounding boxes. + + Returns: + (dict[int:dict], ndarray): Updated tracks and bboxes. 
+ """ + costs = [] + for id, track in tracks.items(): + track.mean, track.covariance = self.predict( + track.mean, track.covariance) + gating_distance = self.gating_distance(track.mean, + track.covariance, + bboxes.cpu().numpy(), + self.center_only) + costs.append(gating_distance) + + costs = np.stack(costs, 0) + costs[costs > self.gating_threshold] = np.nan + return tracks, costs diff --git a/mmdetection/mmdet/models/task_modules/tracking/similarity.py b/mmdetection/mmdet/models/task_modules/tracking/similarity.py new file mode 100644 index 0000000..730e43b --- /dev/null +++ b/mmdetection/mmdet/models/task_modules/tracking/similarity.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F +from torch import Tensor + + +def embed_similarity(key_embeds: Tensor, + ref_embeds: Tensor, + method: str = 'dot_product', + temperature: int = -1) -> Tensor: + """Calculate feature similarity from embeddings. + + Args: + key_embeds (Tensor): Shape (N1, C). + ref_embeds (Tensor): Shape (N2, C). + method (str, optional): Method to calculate the similarity, + options are 'dot_product' and 'cosine'. Defaults to + 'dot_product'. + temperature (int, optional): Softmax temperature. Defaults to -1. + + Returns: + Tensor: Similarity matrix of shape (N1, N2). + """ + assert method in ['dot_product', 'cosine'] + + if method == 'cosine': + key_embeds = F.normalize(key_embeds, p=2, dim=1) + ref_embeds = F.normalize(ref_embeds, p=2, dim=1) + + similarity = torch.mm(key_embeds, ref_embeds.T) + + if temperature > 0: + similarity /= float(temperature) + return similarity diff --git a/mmdetection/mmdet/models/test_time_augs/__init__.py b/mmdetection/mmdet/models/test_time_augs/__init__.py new file mode 100644 index 0000000..f5e4926 --- /dev/null +++ b/mmdetection/mmdet/models/test_time_augs/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .det_tta import DetTTAModel +from .merge_augs import (merge_aug_bboxes, merge_aug_masks, + merge_aug_proposals, merge_aug_results, + merge_aug_scores) + +__all__ = [ + 'merge_aug_bboxes', 'merge_aug_masks', 'merge_aug_proposals', + 'merge_aug_scores', 'merge_aug_results', 'DetTTAModel' +] diff --git a/mmdetection/mmdet/models/test_time_augs/det_tta.py b/mmdetection/mmdet/models/test_time_augs/det_tta.py new file mode 100644 index 0000000..95f91db --- /dev/null +++ b/mmdetection/mmdet/models/test_time_augs/det_tta.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +from mmcv.ops import batched_nms +from mmengine.model import BaseTTAModel +from mmengine.registry import MODELS +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox_flip + + +@MODELS.register_module() +class DetTTAModel(BaseTTAModel): + """Merge augmented detection results, only bboxes corresponding score under + flipping and multi-scale resizing can be processed now. + + Examples: + >>> tta_model = dict( + >>> type='DetTTAModel', + >>> tta_cfg=dict(nms=dict( + >>> type='nms', + >>> iou_threshold=0.5), + >>> max_per_img=100)) + >>> + >>> tta_pipeline = [ + >>> dict(type='LoadImageFromFile', + >>> backend_args=None), + >>> dict( + >>> type='TestTimeAug', + >>> transforms=[[ + >>> dict(type='Resize', + >>> scale=(1333, 800), + >>> keep_ratio=True), + >>> ], [ + >>> dict(type='RandomFlip', prob=1.), + >>> dict(type='RandomFlip', prob=0.) 
+ >>> ], [ + >>> dict( + >>> type='PackDetInputs', + >>> meta_keys=('img_id', 'img_path', 'ori_shape', + >>> 'img_shape', 'scale_factor', 'flip', + >>> 'flip_direction')) + >>> ]])] + """ + + def __init__(self, tta_cfg=None, **kwargs): + super().__init__(**kwargs) + self.tta_cfg = tta_cfg + + def merge_aug_bboxes(self, aug_bboxes: List[Tensor], + aug_scores: List[Tensor], + img_metas: List[str]) -> Tuple[Tensor, Tensor]: + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + Returns: + tuple[Tensor]: ``bboxes`` with shape (n,4), where + 4 represent (tl_x, tl_y, br_x, br_y) + and ``scores`` with shape (n,). + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + ori_shape = img_info['ori_shape'] + flip = img_info['flip'] + flip_direction = img_info['flip_direction'] + if flip: + bboxes = bbox_flip( + bboxes=bboxes, + img_shape=ori_shape, + direction=flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.cat(recovered_bboxes, dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.cat(aug_scores, dim=0) + return bboxes, scores + + def merge_preds(self, data_samples_list: List[List[DetDataSample]]): + """Merge batch predictions of enhanced data. + + Args: + data_samples_list (List[List[DetDataSample]]): List of predictions + of all enhanced data. The outer list indicates images, and the + inner list corresponds to the different views of one image. + Each element of the inner list is a ``DetDataSample``. + Returns: + List[DetDataSample]: Merged batch prediction. + """ + merged_data_samples = [] + for data_samples in data_samples_list: + merged_data_samples.append(self._merge_single_sample(data_samples)) + return merged_data_samples + + def _merge_single_sample( + self, data_samples: List[DetDataSample]) -> DetDataSample: + """Merge predictions which come form the different views of one image + to one prediction. + + Args: + data_samples (List[DetDataSample]): List of predictions + of enhanced data which come form one image. + Returns: + List[DetDataSample]: Merged prediction. + """ + aug_bboxes = [] + aug_scores = [] + aug_labels = [] + img_metas = [] + # TODO: support instance segmentation TTA + assert data_samples[0].pred_instances.get('masks', None) is None, \ + 'TTA of instance segmentation does not support now.' 
+ for data_sample in data_samples: + aug_bboxes.append(data_sample.pred_instances.bboxes) + aug_scores.append(data_sample.pred_instances.scores) + aug_labels.append(data_sample.pred_instances.labels) + img_metas.append(data_sample.metainfo) + + merged_bboxes, merged_scores = self.merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas) + merged_labels = torch.cat(aug_labels, dim=0) + + if merged_bboxes.numel() == 0: + return data_samples[0] + + det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores, + merged_labels, self.tta_cfg.nms) + + det_bboxes = det_bboxes[:self.tta_cfg.max_per_img] + det_labels = merged_labels[keep_idxs][:self.tta_cfg.max_per_img] + + results = InstanceData() + _det_bboxes = det_bboxes.clone() + results.bboxes = _det_bboxes[:, :-1] + results.scores = _det_bboxes[:, -1] + results.labels = det_labels + det_results = data_samples[0] + det_results.pred_instances = results + return det_results diff --git a/mmdetection/mmdet/models/test_time_augs/merge_augs.py b/mmdetection/mmdet/models/test_time_augs/merge_augs.py new file mode 100644 index 0000000..5935a86 --- /dev/null +++ b/mmdetection/mmdet/models/test_time_augs/merge_augs.py @@ -0,0 +1,219 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from typing import List, Optional, Union + +import numpy as np +import torch +from mmcv.ops import nms +from mmengine.config import ConfigDict +from torch import Tensor + +from mmdet.structures.bbox import bbox_mapping_back + + +# TODO remove this, never be used in mmdet +def merge_aug_proposals(aug_proposals, img_metas, cfg): + """Merge augmented proposals (multiscale, flip, etc.) + + Args: + aug_proposals (list[Tensor]): proposals from different testing + schemes, shape (n, 5). Note that they are not rescaled to the + original image size. + + img_metas (list[dict]): list of image info dict where each dict has: + 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + cfg (dict): rpn test config. + + Returns: + Tensor: shape (n, 4), proposals corresponding to original image scale. + """ + + cfg = copy.deepcopy(cfg) + + # deprecate arguments warning + if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg: + warnings.warn( + 'In rpn_proposal or test_cfg, ' + 'nms_thr has been moved to a dict named nms as ' + 'iou_threshold, max_num has been renamed as max_per_img, ' + 'name of original arguments and the way to specify ' + 'iou_threshold of NMS will be deprecated.') + if 'nms' not in cfg: + cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr)) + if 'max_num' in cfg: + if 'max_per_img' in cfg: + assert cfg.max_num == cfg.max_per_img, f'You set max_num and ' \ + f'max_per_img at the same time, but get {cfg.max_num} ' \ + f'and {cfg.max_per_img} respectively' \ + f'Please delete max_num which will be deprecated.' + else: + cfg.max_per_img = cfg.max_num + if 'nms_thr' in cfg: + assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \ + f'iou_threshold in nms and ' \ + f'nms_thr at the same time, but get ' \ + f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \ + f' respectively. Please delete the nms_thr ' \ + f'which will be deprecated.' 
+ + recovered_proposals = [] + for proposals, img_info in zip(aug_proposals, img_metas): + img_shape = img_info['img_shape'] + scale_factor = img_info['scale_factor'] + flip = img_info['flip'] + flip_direction = img_info['flip_direction'] + _proposals = proposals.clone() + _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape, + scale_factor, flip, + flip_direction) + recovered_proposals.append(_proposals) + aug_proposals = torch.cat(recovered_proposals, dim=0) + merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(), + aug_proposals[:, -1].contiguous(), + cfg.nms.iou_threshold) + scores = merged_proposals[:, 4] + _, order = scores.sort(0, descending=True) + num = min(cfg.max_per_img, merged_proposals.shape[0]) + order = order[:num] + merged_proposals = merged_proposals[order, :] + return merged_proposals + + +# TODO remove this, never be used in mmdet +def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. + + Returns: + tuple: (bboxes, scores) + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, + flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.stack(recovered_bboxes).mean(dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.stack(aug_scores).mean(dim=0) + return bboxes, scores + + +def merge_aug_results(aug_batch_results, aug_batch_img_metas): + """Merge augmented detection results, only bboxes corresponding score under + flipping and multi-scale resizing can be processed now. + + Args: + aug_batch_results (list[list[[obj:`InstanceData`]]): + Detection results of multiple images with + different augmentations. + The outer list indicate the augmentation . The inter + list indicate the batch dimension. + Each item usually contains the following keys. + + - scores (Tensor): Classification scores, in shape + (num_instance,) + - labels (Tensor): Labels of bboxes, in shape + (num_instances,). + - bboxes (Tensor): In shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + aug_batch_img_metas (list[list[dict]]): The outer list + indicates test-time augs (multiscale, flip, etc.) + and the inner list indicates + images in a batch. Each dict in the list contains + information of an image in the batch. + + Returns: + batch_results (list[obj:`InstanceData`]): Same with + the input `aug_results` except that all bboxes have + been mapped to the original scale. 
+ """ + num_augs = len(aug_batch_results) + num_imgs = len(aug_batch_results[0]) + + batch_results = [] + aug_batch_results = copy.deepcopy(aug_batch_results) + for img_id in range(num_imgs): + aug_results = [] + for aug_id in range(num_augs): + img_metas = aug_batch_img_metas[aug_id][img_id] + results = aug_batch_results[aug_id][img_id] + + img_shape = img_metas['img_shape'] + scale_factor = img_metas['scale_factor'] + flip = img_metas['flip'] + flip_direction = img_metas['flip_direction'] + bboxes = bbox_mapping_back(results.bboxes, img_shape, scale_factor, + flip, flip_direction) + results.bboxes = bboxes + aug_results.append(results) + merged_aug_results = results.cat(aug_results) + batch_results.append(merged_aug_results) + + return batch_results + + +def merge_aug_scores(aug_scores): + """Merge augmented bbox scores.""" + if isinstance(aug_scores[0], torch.Tensor): + return torch.mean(torch.stack(aug_scores), dim=0) + else: + return np.mean(aug_scores, axis=0) + + +def merge_aug_masks(aug_masks: List[Tensor], + img_metas: dict, + weights: Optional[Union[list, Tensor]] = None) -> Tensor: + """Merge augmented mask prediction. + + Args: + aug_masks (list[Tensor]): each has shape + (n, c, h, w). + img_metas (dict): Image information. + weights (list or Tensor): Weight of each aug_masks, + the length should be n. + + Returns: + Tensor: has shape (n, c, h, w) + """ + recovered_masks = [] + for i, mask in enumerate(aug_masks): + if weights is not None: + assert len(weights) == len(aug_masks) + weight = weights[i] + else: + weight = 1 + flip = img_metas.get('flip', False) + if flip: + flip_direction = img_metas['flip_direction'] + if flip_direction == 'horizontal': + mask = mask[:, :, :, ::-1] + elif flip_direction == 'vertical': + mask = mask[:, :, ::-1, :] + elif flip_direction == 'diagonal': + mask = mask[:, :, :, ::-1] + mask = mask[:, :, ::-1, :] + else: + raise ValueError( + f"Invalid flipping direction '{flip_direction}'") + recovered_masks.append(mask[None, :] * weight) + + merged_masks = torch.cat(recovered_masks, 0).mean(dim=0) + if weights is not None: + merged_masks = merged_masks * len(weights) / sum(weights) + return merged_masks diff --git a/mmdetection/mmdet/models/trackers/__init__.py b/mmdetection/mmdet/models/trackers/__init__.py new file mode 100644 index 0000000..00284bb --- /dev/null +++ b/mmdetection/mmdet/models/trackers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_tracker import BaseTracker +from .byte_tracker import ByteTracker +from .masktrack_rcnn_tracker import MaskTrackRCNNTracker +from .ocsort_tracker import OCSORTTracker +from .quasi_dense_tracker import QuasiDenseTracker +from .sort_tracker import SORTTracker +from .strongsort_tracker import StrongSORTTracker + +__all__ = [ + 'BaseTracker', 'ByteTracker', 'QuasiDenseTracker', 'SORTTracker', + 'StrongSORTTracker', 'OCSORTTracker', 'MaskTrackRCNNTracker' +] diff --git a/mmdetection/mmdet/models/trackers/base_tracker.py b/mmdetection/mmdet/models/trackers/base_tracker.py new file mode 100644 index 0000000..0cf1886 --- /dev/null +++ b/mmdetection/mmdet/models/trackers/base_tracker.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from addict import Dict + + +class BaseTracker(metaclass=ABCMeta): + """Base tracker model. + + Args: + momentums (dict[str:float], optional): Momentums to update the buffers. 
+ The `str` indicates the name of the buffer while the `float` + indicates the momentum. Defaults to None. + num_frames_retain (int, optional). If a track is disappeared more than + `num_frames_retain` frames, it will be deleted in the memo. + Defaults to 10. + """ + + def __init__(self, + momentums: Optional[dict] = None, + num_frames_retain: int = 10) -> None: + super().__init__() + if momentums is not None: + assert isinstance(momentums, dict), 'momentums must be a dict' + self.momentums = momentums + self.num_frames_retain = num_frames_retain + + self.reset() + + def reset(self) -> None: + """Reset the buffer of the tracker.""" + self.num_tracks = 0 + self.tracks = dict() + + @property + def empty(self) -> bool: + """Whether the buffer is empty or not.""" + return False if self.tracks else True + + @property + def ids(self) -> List[dict]: + """All ids in the tracker.""" + return list(self.tracks.keys()) + + @property + def with_reid(self) -> bool: + """bool: whether the framework has a reid model""" + return hasattr(self, 'reid') and self.reid is not None + + def update(self, **kwargs) -> None: + """Update the tracker. + + Args: + kwargs (dict[str: Tensor | int]): The `str` indicates the + name of the input variable. `ids` and `frame_ids` are + obligatory in the keys. + """ + memo_items = [k for k, v in kwargs.items() if v is not None] + rm_items = [k for k in kwargs.keys() if k not in memo_items] + for item in rm_items: + kwargs.pop(item) + if not hasattr(self, 'memo_items'): + self.memo_items = memo_items + else: + assert memo_items == self.memo_items + + assert 'ids' in memo_items + num_objs = len(kwargs['ids']) + id_indice = memo_items.index('ids') + assert 'frame_ids' in memo_items + frame_id = int(kwargs['frame_ids']) + if isinstance(kwargs['frame_ids'], int): + kwargs['frame_ids'] = torch.tensor([kwargs['frame_ids']] * + num_objs) + # cur_frame_id = int(kwargs['frame_ids'][0]) + for k, v in kwargs.items(): + if len(v) != num_objs: + raise ValueError('kwargs value must both equal') + + for obj in zip(*kwargs.values()): + id = int(obj[id_indice]) + if id in self.tracks: + self.update_track(id, obj) + else: + self.init_track(id, obj) + + self.pop_invalid_tracks(frame_id) + + def pop_invalid_tracks(self, frame_id: int) -> None: + """Pop out invalid tracks.""" + invalid_ids = [] + for k, v in self.tracks.items(): + if frame_id - v['frame_ids'][-1] >= self.num_frames_retain: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + def update_track(self, id: int, obj: Tuple[torch.Tensor]): + """Update a track.""" + for k, v in zip(self.memo_items, obj): + v = v[None] + if self.momentums is not None and k in self.momentums: + m = self.momentums[k] + self.tracks[id][k] = (1 - m) * self.tracks[id][k] + m * v + else: + self.tracks[id][k].append(v) + + def init_track(self, id: int, obj: Tuple[torch.Tensor]): + """Initialize a track.""" + self.tracks[id] = Dict() + for k, v in zip(self.memo_items, obj): + v = v[None] + if self.momentums is not None and k in self.momentums: + self.tracks[id][k] = v + else: + self.tracks[id][k] = [v] + + @property + def memo(self) -> dict: + """Return all buffers in the tracker.""" + outs = Dict() + for k in self.memo_items: + outs[k] = [] + + for id, objs in self.tracks.items(): + for k, v in objs.items(): + if k not in outs: + continue + if self.momentums is not None and k in self.momentums: + v = v + else: + v = v[-1] + outs[k].append(v) + + for k, v in outs.items(): + outs[k] = torch.cat(v, dim=0) + return outs + + def 
get(self, + item: str, + ids: Optional[list] = None, + num_samples: Optional[int] = None, + behavior: Optional[str] = None) -> torch.Tensor: + """Get the buffer of a specific item. + + Args: + item (str): The demanded item. + ids (list[int], optional): The demanded ids. Defaults to None. + num_samples (int, optional): Number of samples to calculate the + results. Defaults to None. + behavior (str, optional): Behavior to calculate the results. + Options are `mean` | None. Defaults to None. + + Returns: + Tensor: The results of the demanded item. + """ + if ids is None: + ids = self.ids + + outs = [] + for id in ids: + out = self.tracks[id][item] + if isinstance(out, list): + if num_samples is not None: + out = out[-num_samples:] + out = torch.cat(out, dim=0) + if behavior == 'mean': + out = out.mean(dim=0, keepdim=True) + elif behavior is None: + out = out[None] + else: + raise NotImplementedError() + else: + out = out[-1] + outs.append(out) + return torch.cat(outs, dim=0) + + @abstractmethod + def track(self, *args, **kwargs): + """Tracking forward function.""" + pass + + def crop_imgs(self, + img: torch.Tensor, + meta_info: dict, + bboxes: torch.Tensor, + rescale: bool = False) -> torch.Tensor: + """Crop the images according to some bounding boxes. Typically for re- + identification sub-module. + + Args: + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + meta_info (dict): image information dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + bboxes (Tensor): of shape (N, 4) or (N, 5). + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the scale of the image. Defaults to False. + + Returns: + Tensor: Image tensor of shape (T, C, H, W). + """ + h, w = meta_info['img_shape'] + img = img[:, :, :h, :w] + if rescale: + factor_x, factor_y = meta_info['scale_factor'] + bboxes[:, :4] *= torch.tensor( + [factor_x, factor_y, factor_x, factor_y]).to(bboxes.device) + bboxes[:, 0] = torch.clamp(bboxes[:, 0], min=0, max=w - 1) + bboxes[:, 1] = torch.clamp(bboxes[:, 1], min=0, max=h - 1) + bboxes[:, 2] = torch.clamp(bboxes[:, 2], min=1, max=w) + bboxes[:, 3] = torch.clamp(bboxes[:, 3], min=1, max=h) + + crop_imgs = [] + for bbox in bboxes: + x1, y1, x2, y2 = map(int, bbox) + if x2 <= x1: + x2 = x1 + 1 + if y2 <= y1: + y2 = y1 + 1 + crop_img = img[:, :, y1:y2, x1:x2] + if self.reid.get('img_scale', False): + crop_img = F.interpolate( + crop_img, + size=self.reid['img_scale'], + mode='bilinear', + align_corners=False) + crop_imgs.append(crop_img) + + if len(crop_imgs) > 0: + return torch.cat(crop_imgs, dim=0) + elif self.reid.get('img_scale', False): + _h, _w = self.reid['img_scale'] + return img.new_zeros((0, 3, _h, _w)) + else: + return img.new_zeros((0, 3, h, w)) diff --git a/mmdetection/mmdet/models/trackers/byte_tracker.py b/mmdetection/mmdet/models/trackers/byte_tracker.py new file mode 100644 index 0000000..11f3adc --- /dev/null +++ b/mmdetection/mmdet/models/trackers/byte_tracker.py @@ -0,0 +1,334 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
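The trackers that follow all build on the buffer API of `BaseTracker` above. As a quick orientation, here is a tiny hypothetical subclass (the class name, the naive assign-new-id policy and the top-level import are illustrative assumptions, and presume the patched `mmdet` package is importable) showing how `update`, `ids` and `memo` are intended to be used by a concrete tracker:

```python
import torch

from mmdet.models.trackers import BaseTracker  # added by this patch


class EveryBoxNewTrack(BaseTracker):
    """Toy tracker: every detection starts a new track (no association)."""

    def track(self, bboxes: torch.Tensor, frame_id: int) -> torch.Tensor:
        ids = torch.arange(self.num_tracks, self.num_tracks + len(bboxes))
        self.num_tracks += len(bboxes)
        # `update` stores the per-id buffers and drops stale tracks
        self.update(ids=ids, bboxes=bboxes, frame_ids=frame_id)
        return ids


tracker = EveryBoxNewTrack(momentums=dict(bboxes=0.9), num_frames_retain=5)
tracker.track(torch.tensor([[0., 0., 10., 10.]]), frame_id=0)
tracker.track(torch.tensor([[1., 1., 11., 11.]]), frame_id=1)
print(tracker.ids)             # [0, 1]
print(tracker.memo['bboxes'])  # one row per live track, concatenated
```

`ByteTracker` below follows the same pattern, layering Kalman-filter prediction and two-stage IoU association on top of these per-id buffers.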
+from typing import List, Optional, Tuple + +try: + import lap +except ImportError: + lap = None +import numpy as np +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import (bbox_cxcyah_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcyah) +from .base_tracker import BaseTracker + + +@MODELS.register_module() +class ByteTracker(BaseTracker): + """Tracker for ByteTrack. + + Args: + motion (dict): Configuration of motion. Defaults to None. + obj_score_thrs (dict): Detection score threshold for matching objects. + - high (float): Threshold of the first matching. Defaults to 0.6. + - low (float): Threshold of the second matching. Defaults to 0.1. + init_track_thr (float): Detection score threshold for initializing a + new tracklet. Defaults to 0.7. + weight_iou_with_det_scores (bool): Whether using detection scores to + weight IOU which is used for matching. Defaults to True. + match_iou_thrs (dict): IOU distance threshold for matching between two + frames. + - high (float): Threshold of the first matching. Defaults to 0.1. + - low (float): Threshold of the second matching. Defaults to 0.5. + - tentative (float): Threshold of the matching for tentative + tracklets. Defaults to 0.3. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 3. + """ + + def __init__(self, + motion: Optional[dict] = None, + obj_score_thrs: dict = dict(high=0.6, low=0.1), + init_track_thr: float = 0.7, + weight_iou_with_det_scores: bool = True, + match_iou_thrs: dict = dict(high=0.1, low=0.5, tentative=0.3), + num_tentatives: int = 3, + **kwargs): + super().__init__(**kwargs) + + if lap is None: + raise RuntimeError('lap is not installed,\ + please install it by: pip install lap') + if motion is not None: + self.motion = TASK_UTILS.build(motion) + + self.obj_score_thrs = obj_score_thrs + self.init_track_thr = init_track_thr + + self.weight_iou_with_det_scores = weight_iou_with_det_scores + self.match_iou_thrs = match_iou_thrs + + self.num_tentatives = num_tentatives + + @property + def confirmed_ids(self) -> List: + """Confirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if not track.tentative] + return ids + + @property + def unconfirmed_ids(self) -> List: + """Unconfirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if track.tentative] + return ids + + def init_track(self, id: int, obj: Tuple[torch.Tensor]) -> None: + """Initialize a track.""" + super().init_track(id, obj) + if self.tracks[id].frame_ids[-1] == 0: + self.tracks[id].tentative = False + else: + self.tracks[id].tentative = True + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( + bbox) + + def update_track(self, id: int, obj: Tuple[torch.Tensor]) -> None: + """Update a track.""" + super().update_track(id, obj) + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + track_label = self.tracks[id]['labels'][-1] + label_idx = self.memo_items.index('labels') + obj_label = obj[label_idx] + assert obj_label == track_label + 
self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox) + + def pop_invalid_tracks(self, frame_id: int) -> None: + """Pop out invalid tracks.""" + invalid_ids = [] + for k, v in self.tracks.items(): + # case1: disappeared frames >= self.num_frames_retrain + case1 = frame_id - v['frame_ids'][-1] >= self.num_frames_retain + # case2: tentative tracks but not matched in this frame + case2 = v.tentative and v['frame_ids'][-1] != frame_id + if case1 or case2: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + def assign_ids( + self, + ids: List[int], + det_bboxes: torch.Tensor, + det_labels: torch.Tensor, + det_scores: torch.Tensor, + weight_iou_with_det_scores: Optional[bool] = False, + match_iou_thr: Optional[float] = 0.5 + ) -> Tuple[np.ndarray, np.ndarray]: + """Assign ids. + + Args: + ids (list[int]): Tracking ids. + det_bboxes (Tensor): of shape (N, 4) + det_labels (Tensor): of shape (N,) + det_scores (Tensor): of shape (N,) + weight_iou_with_det_scores (bool, optional): Whether using + detection scores to weight IOU which is used for matching. + Defaults to False. + match_iou_thr (float, optional): Matching threshold. + Defaults to 0.5. + + Returns: + tuple(np.ndarray, np.ndarray): The assigning ids. + """ + # get track_bboxes + track_bboxes = np.zeros((0, 4)) + for id in ids: + track_bboxes = np.concatenate( + (track_bboxes, self.tracks[id].mean[:4][None]), axis=0) + track_bboxes = torch.from_numpy(track_bboxes).to(det_bboxes) + track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes) + + # compute distance + ious = bbox_overlaps(track_bboxes, det_bboxes) + if weight_iou_with_det_scores: + ious *= det_scores + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in ids + ]).to(det_bboxes.device) + + cate_match = det_labels[None, :] == track_labels[:, None] + # to avoid det and track of different categories are matched + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + # bipartite match + if dists.size > 0: + cost, row, col = lap.lapjv( + dists, extend_cost=True, cost_limit=1 - match_iou_thr) + else: + row = np.zeros(len(ids)).astype(np.int32) - 1 + col = np.zeros(len(det_bboxes)).astype(np.int32) - 1 + return row, col + + def track(self, data_sample: DetDataSample, **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + data_sample (:obj:`DetDataSample`): The data sample. + It includes information such as `pred_instances`. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. + """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = self.motion + + if self.empty or bboxes.size(0) == 0: + valid_inds = scores > self.init_track_thr + scores = scores[valid_inds] + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + num_new_tracks = bboxes.size(0) + ids = torch.arange(self.num_tracks, + self.num_tracks + num_new_tracks).to(labels) + self.num_tracks += num_new_tracks + + else: + # 0. 
init + ids = torch.full((bboxes.size(0), ), + -1, + dtype=labels.dtype, + device=labels.device) + + # get the detection bboxes for the first association + first_det_inds = scores > self.obj_score_thrs['high'] + first_det_bboxes = bboxes[first_det_inds] + first_det_labels = labels[first_det_inds] + first_det_scores = scores[first_det_inds] + first_det_ids = ids[first_det_inds] + + # get the detection bboxes for the second association + second_det_inds = (~first_det_inds) & ( + scores > self.obj_score_thrs['low']) + second_det_bboxes = bboxes[second_det_inds] + second_det_labels = labels[second_det_inds] + second_det_scores = scores[second_det_inds] + second_det_ids = ids[second_det_inds] + + # 1. use Kalman Filter to predict current location + for id in self.confirmed_ids: + # track is lost in previous frame + if self.tracks[id].frame_ids[-1] != frame_id - 1: + self.tracks[id].mean[7] = 0 + (self.tracks[id].mean, + self.tracks[id].covariance) = self.kf.predict( + self.tracks[id].mean, self.tracks[id].covariance) + + # 2. first match + first_match_track_inds, first_match_det_inds = self.assign_ids( + self.confirmed_ids, first_det_bboxes, first_det_labels, + first_det_scores, self.weight_iou_with_det_scores, + self.match_iou_thrs['high']) + # '-1' mean a detection box is not matched with tracklets in + # previous frame + valid = first_match_det_inds > -1 + first_det_ids[valid] = torch.tensor( + self.confirmed_ids)[first_match_det_inds[valid]].to(labels) + + first_match_det_bboxes = first_det_bboxes[valid] + first_match_det_labels = first_det_labels[valid] + first_match_det_scores = first_det_scores[valid] + first_match_det_ids = first_det_ids[valid] + assert (first_match_det_ids > -1).all() + + first_unmatch_det_bboxes = first_det_bboxes[~valid] + first_unmatch_det_labels = first_det_labels[~valid] + first_unmatch_det_scores = first_det_scores[~valid] + first_unmatch_det_ids = first_det_ids[~valid] + assert (first_unmatch_det_ids == -1).all() + + # 3. use unmatched detection bboxes from the first match to match + # the unconfirmed tracks + (tentative_match_track_inds, + tentative_match_det_inds) = self.assign_ids( + self.unconfirmed_ids, first_unmatch_det_bboxes, + first_unmatch_det_labels, first_unmatch_det_scores, + self.weight_iou_with_det_scores, + self.match_iou_thrs['tentative']) + valid = tentative_match_det_inds > -1 + first_unmatch_det_ids[valid] = torch.tensor(self.unconfirmed_ids)[ + tentative_match_det_inds[valid]].to(labels) + + # 4. second match for unmatched tracks from the first match + first_unmatch_track_ids = [] + for i, id in enumerate(self.confirmed_ids): + # tracklet is not matched in the first match + case_1 = first_match_track_inds[i] == -1 + # tracklet is not lost in the previous frame + case_2 = self.tracks[id].frame_ids[-1] == frame_id - 1 + if case_1 and case_2: + first_unmatch_track_ids.append(id) + + second_match_track_inds, second_match_det_inds = self.assign_ids( + first_unmatch_track_ids, second_det_bboxes, second_det_labels, + second_det_scores, False, self.match_iou_thrs['low']) + valid = second_match_det_inds > -1 + second_det_ids[valid] = torch.tensor(first_unmatch_track_ids)[ + second_match_det_inds[valid]].to(ids) + + # 5. 
gather all matched detection bboxes from step 2-4 + # we only keep matched detection bboxes in second match, which + # means the id != -1 + valid = second_det_ids > -1 + bboxes = torch.cat( + (first_match_det_bboxes, first_unmatch_det_bboxes), dim=0) + bboxes = torch.cat((bboxes, second_det_bboxes[valid]), dim=0) + + labels = torch.cat( + (first_match_det_labels, first_unmatch_det_labels), dim=0) + labels = torch.cat((labels, second_det_labels[valid]), dim=0) + + scores = torch.cat( + (first_match_det_scores, first_unmatch_det_scores), dim=0) + scores = torch.cat((scores, second_det_scores[valid]), dim=0) + + ids = torch.cat((first_match_det_ids, first_unmatch_det_ids), + dim=0) + ids = torch.cat((ids, second_det_ids[valid]), dim=0) + + # 6. assign new ids + new_track_inds = ids == -1 + ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum()).to(labels) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + frame_ids=frame_id) + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmdetection/mmdet/models/trackers/masktrack_rcnn_tracker.py b/mmdetection/mmdet/models/trackers/masktrack_rcnn_tracker.py new file mode 100644 index 0000000..cc16778 --- /dev/null +++ b/mmdetection/mmdet/models/trackers/masktrack_rcnn_tracker.py @@ -0,0 +1,189 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox_overlaps +from .base_tracker import BaseTracker + + +@MODELS.register_module() +class MaskTrackRCNNTracker(BaseTracker): + """Tracker for MaskTrack R-CNN. + + Args: + match_weights (dict[str : float]): The Weighting factor when computing + the match score. It contains keys as follows: + + - det_score (float): The coefficient of `det_score` when computing + match score. + - iou (float): The coefficient of `ious` when computing match + score. + - det_label (float): The coefficient of `label_deltas` when + computing match score. + """ + + def __init__(self, + match_weights: dict = dict( + det_score=1.0, iou=2.0, det_label=10.0), + **kwargs): + super().__init__(**kwargs) + self.match_weights = match_weights + + def get_match_score(self, bboxes: Tensor, labels: Tensor, scores: Tensor, + prev_bboxes: Tensor, prev_labels: Tensor, + similarity_logits: Tensor) -> Tensor: + """Get the match score. + + Args: + bboxes (torch.Tensor): of shape (num_current_bboxes, 4) in + [tl_x, tl_y, br_x, br_y] format. Denoting the detection + bboxes of current frame. + labels (torch.Tensor): of shape (num_current_bboxes, ) + scores (torch.Tensor): of shape (num_current_bboxes, ) + prev_bboxes (torch.Tensor): of shape (num_previous_bboxes, 4) in + [tl_x, tl_y, br_x, br_y] format. Denoting the detection bboxes + of previous frame. + prev_labels (torch.Tensor): of shape (num_previous_bboxes, ) + similarity_logits (torch.Tensor): of shape (num_current_bboxes, + num_previous_bboxes + 1). Denoting the similarity logits from + track head. 
+ + Returns: + torch.Tensor: The matching score of shape (num_current_bboxes, + num_previous_bboxes + 1) + """ + similarity_scores = similarity_logits.softmax(dim=1) + + ious = bbox_overlaps(bboxes, prev_bboxes) + iou_dummy = ious.new_zeros(ious.shape[0], 1) + ious = torch.cat((iou_dummy, ious), dim=1) + + label_deltas = (labels.view(-1, 1) == prev_labels).float() + label_deltas_dummy = label_deltas.new_ones(label_deltas.shape[0], 1) + label_deltas = torch.cat((label_deltas_dummy, label_deltas), dim=1) + + match_score = similarity_scores.log() + match_score += self.match_weights['det_score'] * \ + scores.view(-1, 1).log() + match_score += self.match_weights['iou'] * ious + match_score += self.match_weights['det_label'] * label_deltas + + return match_score + + def assign_ids(self, match_scores: Tensor): + num_prev_bboxes = match_scores.shape[1] - 1 + _, match_ids = match_scores.max(dim=1) + + ids = match_ids.new_zeros(match_ids.shape[0]) - 1 + best_match_scores = match_scores.new_zeros(num_prev_bboxes) - 1e6 + for idx, match_id in enumerate(match_ids): + if match_id == 0: + ids[idx] = self.num_tracks + self.num_tracks += 1 + else: + match_score = match_scores[idx, match_id] + # TODO: fix the bug where multiple candidate might match + # with the same previous object. + if match_score > best_match_scores[match_id - 1]: + ids[idx] = self.ids[match_id - 1] + best_match_scores[match_id - 1] = match_score + return ids, best_match_scores + + def track(self, + model: torch.nn.Module, + feats: List[torch.Tensor], + data_sample: DetDataSample, + rescale=True, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): VIS model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + MaskTrackRCNN method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + True. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
+ """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + masks = data_sample.pred_instances.masks + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + # create pred_track_instances + pred_track_instances = InstanceData() + + if bboxes.shape[0] == 0: + ids = torch.zeros_like(labels) + pred_track_instances = data_sample.pred_instances.clone() + pred_track_instances.instances_id = ids + return pred_track_instances + + rescaled_bboxes = bboxes.clone() + if rescale: + scale_factor = rescaled_bboxes.new_tensor( + metainfo['scale_factor']).repeat((1, 2)) + rescaled_bboxes = rescaled_bboxes * scale_factor + roi_feats, _ = model.track_head.extract_roi_feats( + feats, [rescaled_bboxes]) + + if self.empty: + num_new_tracks = bboxes.size(0) + ids = torch.arange( + self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long) + self.num_tracks += num_new_tracks + else: + prev_bboxes = self.get('bboxes') + prev_labels = self.get('labels') + prev_roi_feats = self.get('roi_feats') + + similarity_logits = model.track_head.predict( + roi_feats, prev_roi_feats) + match_scores = self.get_match_score(bboxes, labels, scores, + prev_bboxes, prev_labels, + similarity_logits) + ids, _ = self.assign_ids(match_scores) + + valid_inds = ids > -1 + ids = ids[valid_inds] + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + masks = masks[valid_inds] + roi_feats = roi_feats[valid_inds] + + self.update( + ids=ids, + bboxes=bboxes, + labels=labels, + scores=scores, + masks=masks, + roi_feats=roi_feats, + frame_ids=frame_id) + # update pred_track_instances + pred_track_instances.bboxes = bboxes + pred_track_instances.masks = masks + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmdetection/mmdet/models/trackers/ocsort_tracker.py b/mmdetection/mmdet/models/trackers/ocsort_tracker.py new file mode 100644 index 0000000..4e09990 --- /dev/null +++ b/mmdetection/mmdet/models/trackers/ocsort_tracker.py @@ -0,0 +1,531 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +try: + import lap +except ImportError: + lap = None +import numpy as np +import torch +from addict import Dict +from mmengine.structures import InstanceData + +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import (bbox_cxcyah_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcyah) +from .sort_tracker import SORTTracker + + +@MODELS.register_module() +class OCSORTTracker(SORTTracker): + """Tracker for OC-SORT. + + Args: + motion (dict): Configuration of motion. Defaults to None. + obj_score_thrs (float): Detection score threshold for matching objects. + Defaults to 0.3. + init_track_thr (float): Detection score threshold for initializing a + new tracklet. Defaults to 0.7. + weight_iou_with_det_scores (bool): Whether using detection scores to + weight IOU which is used for matching. Defaults to True. + match_iou_thr (float): IOU distance threshold for matching between two + frames. Defaults to 0.3. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 3. + vel_consist_weight (float): Weight of the velocity consistency term in + association (OCM term in the paper). 
+ vel_delta_t (int): The difference of time step for calculating of the + velocity direction of tracklets. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + motion: Optional[dict] = None, + obj_score_thr: float = 0.3, + init_track_thr: float = 0.7, + weight_iou_with_det_scores: bool = True, + match_iou_thr: float = 0.3, + num_tentatives: int = 3, + vel_consist_weight: float = 0.2, + vel_delta_t: int = 3, + **kwargs): + if lap is None: + raise RuntimeError('lap is not installed,\ + please install it by: pip install lap') + super().__init__(motion=motion, **kwargs) + self.obj_score_thr = obj_score_thr + self.init_track_thr = init_track_thr + + self.weight_iou_with_det_scores = weight_iou_with_det_scores + self.match_iou_thr = match_iou_thr + self.vel_consist_weight = vel_consist_weight + self.vel_delta_t = vel_delta_t + + self.num_tentatives = num_tentatives + + @property + def unconfirmed_ids(self): + """Unconfirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if track.tentative] + return ids + + def init_track(self, id: int, obj: Tuple[torch.Tensor]): + """Initialize a track.""" + super().init_track(id, obj) + if self.tracks[id].frame_ids[-1] == 0: + self.tracks[id].tentative = False + else: + self.tracks[id].tentative = True + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( + bbox) + # track.obs maintains the history associated detections to this track + self.tracks[id].obs = [] + bbox_id = self.memo_items.index('bboxes') + self.tracks[id].obs.append(obj[bbox_id]) + # a placefolder to save mean/covariance before losing tracking it + # parameters to save: mean, covariance, measurement + self.tracks[id].tracked = True + self.tracks[id].saved_attr = Dict() + self.tracks[id].velocity = torch.tensor( + (-1, -1)).to(obj[bbox_id].device) # placeholder + + def update_track(self, id: int, obj: Tuple[torch.Tensor]): + """Update a track.""" + super().update_track(id, obj) + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox) + self.tracks[id].tracked = True + bbox_id = self.memo_items.index('bboxes') + self.tracks[id].obs.append(obj[bbox_id]) + + bbox1 = self.k_step_observation(self.tracks[id]) + bbox2 = obj[bbox_id] + self.tracks[id].velocity = self.vel_direction(bbox1, bbox2).to( + obj[bbox_id].device) + + def vel_direction(self, bbox1: torch.Tensor, bbox2: torch.Tensor): + """Estimate the direction vector between two boxes.""" + if bbox1.sum() < 0 or bbox2.sum() < 0: + return torch.tensor((-1, -1)) + cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0 + cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0 + speed = torch.tensor([cy2 - cy1, cx2 - cx1]) + norm = torch.sqrt((speed[0])**2 + (speed[1])**2) + 1e-6 + return speed / norm + + def vel_direction_batch(self, bboxes1: torch.Tensor, + bboxes2: torch.Tensor): + """Estimate the direction vector given two batches of boxes.""" + cx1, cy1 = (bboxes1[:, 0] + bboxes1[:, 2]) / 2.0, (bboxes1[:, 
1] + + bboxes1[:, 3]) / 2.0 + cx2, cy2 = (bboxes2[:, 0] + bboxes2[:, 2]) / 2.0, (bboxes2[:, 1] + + bboxes2[:, 3]) / 2.0 + speed_diff_y = cy2[None, :] - cy1[:, None] + speed_diff_x = cx2[None, :] - cx1[:, None] + speed = torch.cat((speed_diff_y[..., None], speed_diff_x[..., None]), + dim=-1) + norm = torch.sqrt((speed[:, :, 0])**2 + (speed[:, :, 1])**2) + 1e-6 + speed[:, :, 0] /= norm + speed[:, :, 1] /= norm + return speed + + def k_step_observation(self, track: Dict): + """return the observation k step away before.""" + obs_seqs = track.obs + num_obs = len(obs_seqs) + if num_obs == 0: + return torch.tensor((-1, -1, -1, -1)).to(track.obs[0].device) + elif num_obs > self.vel_delta_t: + if obs_seqs[num_obs - 1 - self.vel_delta_t] is not None: + return obs_seqs[num_obs - 1 - self.vel_delta_t] + else: + return self.last_obs(track) + else: + return self.last_obs(track) + + def ocm_assign_ids(self, + ids: List[int], + det_bboxes: torch.Tensor, + det_labels: torch.Tensor, + det_scores: torch.Tensor, + weight_iou_with_det_scores: Optional[bool] = False, + match_iou_thr: Optional[float] = 0.5): + """Apply Observation-Centric Momentum (OCM) to assign ids. + + OCM adds movement direction consistency into the association cost + matrix. This term requires no additional assumption but from the + same linear motion assumption as the canonical Kalman Filter in SORT. + + Args: + ids (list[int]): Tracking ids. + det_bboxes (Tensor): of shape (N, 4) + det_labels (Tensor): of shape (N,) + det_scores (Tensor): of shape (N,) + weight_iou_with_det_scores (bool, optional): Whether using + detection scores to weight IOU which is used for matching. + Defaults to False. + match_iou_thr (float, optional): Matching threshold. + Defaults to 0.5. + + Returns: + tuple(int): The assigning ids. + + OC-SORT uses velocity consistency besides IoU for association + """ + # get track_bboxes + track_bboxes = np.zeros((0, 4)) + for id in ids: + track_bboxes = np.concatenate( + (track_bboxes, self.tracks[id].mean[:4][None]), axis=0) + track_bboxes = torch.from_numpy(track_bboxes).to(det_bboxes) + track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes) + + # compute distance + ious = bbox_overlaps(track_bboxes, det_bboxes) + if weight_iou_with_det_scores: + ious *= det_scores + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in ids + ]).to(det_bboxes.device) + cate_match = det_labels[None, :] == track_labels[:, None] + # to avoid det and track of different categories are matched + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + if len(ids) > 0 and len(det_bboxes) > 0: + track_velocities = torch.stack( + [self.tracks[id].velocity for id in ids]).to(det_bboxes.device) + k_step_observations = torch.stack([ + self.k_step_observation(self.tracks[id]) for id in ids + ]).to(det_bboxes.device) + # valid1: if the track has previous observations to estimate speed + # valid2: if the associated observation k steps ago is a detection + valid1 = track_velocities.sum(dim=1) != -2 + valid2 = k_step_observations.sum(dim=1) != -4 + valid = valid1 & valid2 + + vel_to_match = self.vel_direction_batch(k_step_observations, + det_bboxes) + track_velocities = track_velocities[:, None, :].repeat( + 1, det_bboxes.shape[0], 1) + + angle_cos = (vel_to_match * track_velocities).sum(dim=-1) + angle_cos = torch.clamp(angle_cos, min=-1, max=1) + angle = torch.acos(angle_cos) # [0, pi] + norm_angle = (angle - np.pi / 2.) 
/ np.pi # [-0.5, 0.5] + valid_matrix = valid[:, None].int().repeat(1, det_bboxes.shape[0]) + # set non-valid entries 0 + valid_norm_angle = norm_angle * valid_matrix + + dists += valid_norm_angle.cpu().numpy() * self.vel_consist_weight + + # bipartite match + if dists.size > 0: + cost, row, col = lap.lapjv( + dists, extend_cost=True, cost_limit=1 - match_iou_thr) + else: + row = np.zeros(len(ids)).astype(np.int32) - 1 + col = np.zeros(len(det_bboxes)).astype(np.int32) - 1 + return row, col + + def last_obs(self, track: Dict): + """extract the last associated observation.""" + for bbox in track.obs[::-1]: + if bbox is not None: + return bbox + + def ocr_assign_ids(self, + track_obs: torch.Tensor, + last_track_labels: torch.Tensor, + det_bboxes: torch.Tensor, + det_labels: torch.Tensor, + det_scores: torch.Tensor, + weight_iou_with_det_scores: Optional[bool] = False, + match_iou_thr: Optional[float] = 0.5): + """association for Observation-Centric Recovery. + + As try to recover tracks from being lost whose estimated velocity is + out- to-date, we use IoU-only matching strategy. + + Args: + track_obs (Tensor): the list of historical associated + detections of tracks + det_bboxes (Tensor): of shape (N, 5), unmatched detections + det_labels (Tensor): of shape (N,) + det_scores (Tensor): of shape (N,) + weight_iou_with_det_scores (bool, optional): Whether using + detection scores to weight IOU which is used for matching. + Defaults to False. + match_iou_thr (float, optional): Matching threshold. + Defaults to 0.5. + + Returns: + tuple(int): The assigning ids. + """ + # compute distance + ious = bbox_overlaps(track_obs, det_bboxes) + if weight_iou_with_det_scores: + ious *= det_scores + + # support multi-class association + cate_match = det_labels[None, :] == last_track_labels[:, None] + # to avoid det and track of different categories are matched + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + # bipartite match + if dists.size > 0: + cost, row, col = lap.lapjv( + dists, extend_cost=True, cost_limit=1 - match_iou_thr) + else: + row = np.zeros(len(track_obs)).astype(np.int32) - 1 + col = np.zeros(len(det_bboxes)).astype(np.int32) - 1 + return row, col + + def online_smooth(self, track: Dict, obj: torch.Tensor): + """Once a track is recovered from being lost, online smooth its + parameters to fix the error accumulated during being lost. + + NOTE: you can use different virtual trajectory generation + strategies, we adopt the naive linear interpolation as default + """ + last_match_bbox = self.last_obs(track) + new_match_bbox = obj + unmatch_len = 0 + for bbox in track.obs[::-1]: + if bbox is None: + unmatch_len += 1 + else: + break + bbox_shift_per_step = (new_match_bbox - last_match_bbox) / ( + unmatch_len + 1) + track.mean = track.saved_attr.mean + track.covariance = track.saved_attr.covariance + for i in range(unmatch_len): + virtual_bbox = last_match_bbox + (i + 1) * bbox_shift_per_step + virtual_bbox = bbox_xyxy_to_cxcyah(virtual_bbox[None, :]) + virtual_bbox = virtual_bbox.squeeze(0).cpu().numpy() + track.mean, track.covariance = self.kf.update( + track.mean, track.covariance, virtual_bbox) + + def track(self, data_sample: DetDataSample, **kwargs) -> InstanceData: + """Tracking forward function. 
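The ocm_assign_ids method above augments the (1 - IoU) cost with a velocity-direction consistency term. The following standalone sketch reproduces that angle cost for a single track/detection pair with invented boxes; it follows the same center-direction and normalization formulas as vel_direction and ocm_assign_ids, and is only an illustration.

```python
# Sketch of the OCM angle cost for one (track, detection) pair, following the
# formulas above; all boxes and weights here are invented.
import math
import torch

def direction(b1, b2):
    """Unit direction (dy, dx) between the centers of two xyxy boxes."""
    c1 = torch.tensor([(b1[1] + b1[3]) / 2, (b1[0] + b1[2]) / 2])
    c2 = torch.tensor([(b2[1] + b2[3]) / 2, (b2[0] + b2[2]) / 2])
    v = c2 - c1
    return v / (v.norm() + 1e-6)

track_velocity = direction(torch.tensor([0., 0., 10., 10.]),
                           torch.tensor([5., 0., 15., 10.]))   # track moving right
to_detection = direction(torch.tensor([5., 0., 15., 10.]),
                         torch.tensor([10., 0., 20., 10.]))    # detection further right

angle_cos = torch.clamp((track_velocity * to_detection).sum(), -1, 1)
angle = torch.acos(angle_cos)                    # 0 when directions agree, pi when opposite
norm_angle = (angle - math.pi / 2) / math.pi     # mapped to [-0.5, 0.5]
vel_consist_weight = 0.2
cost_term = norm_angle * vel_consist_weight      # added to the (1 - IoU) cost matrix
print(cost_term)  # about -0.1 for a consistent direction, i.e. the match gets cheaper
```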
+ NOTE: this implementation is slightly different from the original + OC-SORT implementation (https://github.com/noahcao/OC_SORT)that we + do association between detections and tentative/non-tentative tracks + independently while the original implementation combines them together. + + Args: + data_sample (:obj:`DetDataSample`): The data sample. + It includes information such as `pred_instances`. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. + """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = self.motion + + if self.empty or bboxes.size(0) == 0: + valid_inds = scores > self.init_track_thr + scores = scores[valid_inds] + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + num_new_tracks = bboxes.size(0) + ids = torch.arange(self.num_tracks, + self.num_tracks + num_new_tracks).to(labels) + self.num_tracks += num_new_tracks + else: + # 0. init + ids = torch.full((bboxes.size(0), ), + -1, + dtype=labels.dtype, + device=labels.device) + + # get the detection bboxes for the first association + det_inds = scores > self.obj_score_thr + det_bboxes = bboxes[det_inds] + det_labels = labels[det_inds] + det_scores = scores[det_inds] + det_ids = ids[det_inds] + + # 1. predict by Kalman Filter + for id in self.confirmed_ids: + # track is lost in previous frame + if self.tracks[id].frame_ids[-1] != frame_id - 1: + self.tracks[id].mean[7] = 0 + if self.tracks[id].tracked: + self.tracks[id].saved_attr.mean = self.tracks[id].mean + self.tracks[id].saved_attr.covariance = self.tracks[ + id].covariance + (self.tracks[id].mean, + self.tracks[id].covariance) = self.kf.predict( + self.tracks[id].mean, self.tracks[id].covariance) + + # 2. match detections and tracks' predicted locations + match_track_inds, raw_match_det_inds = self.ocm_assign_ids( + self.confirmed_ids, det_bboxes, det_labels, det_scores, + self.weight_iou_with_det_scores, self.match_iou_thr) + # '-1' mean a detection box is not matched with tracklets in + # previous frame + valid = raw_match_det_inds > -1 + det_ids[valid] = torch.tensor( + self.confirmed_ids)[raw_match_det_inds[valid]].to(labels) + + match_det_bboxes = det_bboxes[valid] + match_det_labels = det_labels[valid] + match_det_scores = det_scores[valid] + match_det_ids = det_ids[valid] + assert (match_det_ids > -1).all() + + # unmatched tracks and detections + unmatch_det_bboxes = det_bboxes[~valid] + unmatch_det_labels = det_labels[~valid] + unmatch_det_scores = det_scores[~valid] + unmatch_det_ids = det_ids[~valid] + assert (unmatch_det_ids == -1).all() + + # 3. 
use unmatched detection bboxes from the first match to match + # the unconfirmed tracks + (tentative_match_track_inds, + tentative_match_det_inds) = self.ocm_assign_ids( + self.unconfirmed_ids, unmatch_det_bboxes, unmatch_det_labels, + unmatch_det_scores, self.weight_iou_with_det_scores, + self.match_iou_thr) + valid = tentative_match_det_inds > -1 + unmatch_det_ids[valid] = torch.tensor(self.unconfirmed_ids)[ + tentative_match_det_inds[valid]].to(labels) + + match_det_bboxes = torch.cat( + (match_det_bboxes, unmatch_det_bboxes[valid]), dim=0) + match_det_labels = torch.cat( + (match_det_labels, unmatch_det_labels[valid]), dim=0) + match_det_scores = torch.cat( + (match_det_scores, unmatch_det_scores[valid]), dim=0) + match_det_ids = torch.cat((match_det_ids, unmatch_det_ids[valid]), + dim=0) + assert (match_det_ids > -1).all() + + unmatch_det_bboxes = unmatch_det_bboxes[~valid] + unmatch_det_labels = unmatch_det_labels[~valid] + unmatch_det_scores = unmatch_det_scores[~valid] + unmatch_det_ids = unmatch_det_ids[~valid] + assert (unmatch_det_ids == -1).all() + + all_track_ids = [id for id, _ in self.tracks.items()] + unmatched_track_inds = torch.tensor( + [ind for ind in all_track_ids if ind not in match_det_ids]) + + if len(unmatched_track_inds) > 0: + # 4. still some tracks not associated yet, perform OCR + last_observations = [] + for id in unmatched_track_inds: + last_box = self.last_obs(self.tracks[id.item()]) + last_observations.append(last_box) + last_observations = torch.stack(last_observations) + last_track_labels = torch.tensor([ + self.tracks[id.item()]['labels'][-1] + for id in unmatched_track_inds + ]).to(det_bboxes.device) + + remain_det_ids = torch.full((unmatch_det_bboxes.size(0), ), + -1, + dtype=labels.dtype, + device=labels.device) + + _, ocr_match_det_inds = self.ocr_assign_ids( + last_observations, last_track_labels, unmatch_det_bboxes, + unmatch_det_labels, unmatch_det_scores, + self.weight_iou_with_det_scores, self.match_iou_thr) + + valid = ocr_match_det_inds > -1 + remain_det_ids[valid] = unmatched_track_inds.clone()[ + ocr_match_det_inds[valid]].to(labels) + + ocr_match_det_bboxes = unmatch_det_bboxes[valid] + ocr_match_det_labels = unmatch_det_labels[valid] + ocr_match_det_scores = unmatch_det_scores[valid] + ocr_match_det_ids = remain_det_ids[valid] + assert (ocr_match_det_ids > -1).all() + + ocr_unmatch_det_bboxes = unmatch_det_bboxes[~valid] + ocr_unmatch_det_labels = unmatch_det_labels[~valid] + ocr_unmatch_det_scores = unmatch_det_scores[~valid] + ocr_unmatch_det_ids = remain_det_ids[~valid] + assert (ocr_unmatch_det_ids == -1).all() + + unmatch_det_bboxes = ocr_unmatch_det_bboxes + unmatch_det_labels = ocr_unmatch_det_labels + unmatch_det_scores = ocr_unmatch_det_scores + unmatch_det_ids = ocr_unmatch_det_ids + match_det_bboxes = torch.cat( + (match_det_bboxes, ocr_match_det_bboxes), dim=0) + match_det_labels = torch.cat( + (match_det_labels, ocr_match_det_labels), dim=0) + match_det_scores = torch.cat( + (match_det_scores, ocr_match_det_scores), dim=0) + match_det_ids = torch.cat((match_det_ids, ocr_match_det_ids), + dim=0) + + # 5. 
summarize the track results + for i in range(len(match_det_ids)): + det_bbox = match_det_bboxes[i] + track_id = match_det_ids[i].item() + if not self.tracks[track_id].tracked: + # the track is lost before this step + self.online_smooth(self.tracks[track_id], det_bbox) + + for track_id in all_track_ids: + if track_id not in match_det_ids: + self.tracks[track_id].tracked = False + self.tracks[track_id].obs.append(None) + + bboxes = torch.cat((match_det_bboxes, unmatch_det_bboxes), dim=0) + labels = torch.cat((match_det_labels, unmatch_det_labels), dim=0) + scores = torch.cat((match_det_scores, unmatch_det_scores), dim=0) + ids = torch.cat((match_det_ids, unmatch_det_ids), dim=0) + # 6. assign new ids + new_track_inds = ids == -1 + + ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum()).to(labels) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + labels=labels, + scores=scores, + frame_ids=frame_id) + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + return pred_track_instances diff --git a/mmdetection/mmdet/models/trackers/quasi_dense_tracker.py b/mmdetection/mmdet/models/trackers/quasi_dense_tracker.py new file mode 100644 index 0000000..c93c3c4 --- /dev/null +++ b/mmdetection/mmdet/models/trackers/quasi_dense_tracker.py @@ -0,0 +1,316 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import TrackDataSample +from mmdet.structures.bbox import bbox_overlaps +from .base_tracker import BaseTracker + + +@MODELS.register_module() +class QuasiDenseTracker(BaseTracker): + """Tracker for Quasi-Dense Tracking. + + Args: + init_score_thr (float): The cls_score threshold to + initialize a new tracklet. Defaults to 0.8. + obj_score_thr (float): The cls_score threshold to + update a tracked tracklet. Defaults to 0.5. + match_score_thr (float): The match threshold. Defaults to 0.5. + memo_tracklet_frames (int): The most frames in a tracklet memory. + Defaults to 10. + memo_backdrop_frames (int): The most frames in the backdrops. + Defaults to 1. + memo_momentum (float): The momentum value for embeds updating. + Defaults to 0.8. + nms_conf_thr (float): The nms threshold for confidence. + Defaults to 0.5. + nms_backdrop_iou_thr (float): The nms threshold for backdrop IoU. + Defaults to 0.3. + nms_class_iou_thr (float): The nms threshold for class IoU. + Defaults to 0.7. + with_cats (bool): Whether to track with the same category. + Defaults to True. + match_metric (str): The match metric. Defaults to 'bisoftmax'. 
+ """ + + def __init__(self, + init_score_thr: float = 0.8, + obj_score_thr: float = 0.5, + match_score_thr: float = 0.5, + memo_tracklet_frames: int = 10, + memo_backdrop_frames: int = 1, + memo_momentum: float = 0.8, + nms_conf_thr: float = 0.5, + nms_backdrop_iou_thr: float = 0.3, + nms_class_iou_thr: float = 0.7, + with_cats: bool = True, + match_metric: str = 'bisoftmax', + **kwargs): + super().__init__(**kwargs) + assert 0 <= memo_momentum <= 1.0 + assert memo_tracklet_frames >= 0 + assert memo_backdrop_frames >= 0 + self.init_score_thr = init_score_thr + self.obj_score_thr = obj_score_thr + self.match_score_thr = match_score_thr + self.memo_tracklet_frames = memo_tracklet_frames + self.memo_backdrop_frames = memo_backdrop_frames + self.memo_momentum = memo_momentum + self.nms_conf_thr = nms_conf_thr + self.nms_backdrop_iou_thr = nms_backdrop_iou_thr + self.nms_class_iou_thr = nms_class_iou_thr + self.with_cats = with_cats + assert match_metric in ['bisoftmax', 'softmax', 'cosine'] + self.match_metric = match_metric + + self.num_tracks = 0 + self.tracks = dict() + self.backdrops = [] + + def reset(self): + """Reset the buffer of the tracker.""" + self.num_tracks = 0 + self.tracks = dict() + self.backdrops = [] + + def update(self, ids: Tensor, bboxes: Tensor, embeds: Tensor, + labels: Tensor, scores: Tensor, frame_id: int) -> None: + """Tracking forward function. + + Args: + ids (Tensor): of shape(N, ). + bboxes (Tensor): of shape (N, 5). + embeds (Tensor): of shape (N, 256). + labels (Tensor): of shape (N, ). + scores (Tensor): of shape (N, ). + frame_id (int): The id of current frame, 0-index. + """ + tracklet_inds = ids > -1 + + for id, bbox, embed, label, score in zip(ids[tracklet_inds], + bboxes[tracklet_inds], + embeds[tracklet_inds], + labels[tracklet_inds], + scores[tracklet_inds]): + id = int(id) + # update the tracked ones and initialize new tracks + if id in self.tracks.keys(): + velocity = (bbox - self.tracks[id]['bbox']) / ( + frame_id - self.tracks[id]['last_frame']) + self.tracks[id]['bbox'] = bbox + self.tracks[id]['embed'] = ( + 1 - self.memo_momentum + ) * self.tracks[id]['embed'] + self.memo_momentum * embed + self.tracks[id]['last_frame'] = frame_id + self.tracks[id]['label'] = label + self.tracks[id]['score'] = score + self.tracks[id]['velocity'] = ( + self.tracks[id]['velocity'] * self.tracks[id]['acc_frame'] + + velocity) / ( + self.tracks[id]['acc_frame'] + 1) + self.tracks[id]['acc_frame'] += 1 + else: + self.tracks[id] = dict( + bbox=bbox, + embed=embed, + label=label, + score=score, + last_frame=frame_id, + velocity=torch.zeros_like(bbox), + acc_frame=0) + # backdrop update according to IoU + backdrop_inds = torch.nonzero(ids == -1, as_tuple=False).squeeze(1) + ious = bbox_overlaps(bboxes[backdrop_inds], bboxes) + for i, ind in enumerate(backdrop_inds): + if (ious[i, :ind] > self.nms_backdrop_iou_thr).any(): + backdrop_inds[i] = -1 + backdrop_inds = backdrop_inds[backdrop_inds > -1] + # old backdrops would be removed at first + self.backdrops.insert( + 0, + dict( + bboxes=bboxes[backdrop_inds], + embeds=embeds[backdrop_inds], + labels=labels[backdrop_inds])) + + # pop memo + invalid_ids = [] + for k, v in self.tracks.items(): + if frame_id - v['last_frame'] >= self.memo_tracklet_frames: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + if len(self.backdrops) > self.memo_backdrop_frames: + self.backdrops.pop() + + @property + def memo(self) -> Tuple[Tensor, ...]: + """Get tracks memory.""" + memo_embeds = [] + 
memo_ids = [] + memo_bboxes = [] + memo_labels = [] + # velocity of tracks + memo_vs = [] + # get tracks + for k, v in self.tracks.items(): + memo_bboxes.append(v['bbox'][None, :]) + memo_embeds.append(v['embed'][None, :]) + memo_ids.append(k) + memo_labels.append(v['label'].view(1, 1)) + memo_vs.append(v['velocity'][None, :]) + memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1) + # get backdrops + for backdrop in self.backdrops: + backdrop_ids = torch.full((1, backdrop['embeds'].size(0)), + -1, + dtype=torch.long) + backdrop_vs = torch.zeros_like(backdrop['bboxes']) + memo_bboxes.append(backdrop['bboxes']) + memo_embeds.append(backdrop['embeds']) + memo_ids = torch.cat([memo_ids, backdrop_ids], dim=1) + memo_labels.append(backdrop['labels'][:, None]) + memo_vs.append(backdrop_vs) + + memo_bboxes = torch.cat(memo_bboxes, dim=0) + memo_embeds = torch.cat(memo_embeds, dim=0) + memo_labels = torch.cat(memo_labels, dim=0).squeeze(1) + memo_vs = torch.cat(memo_vs, dim=0) + return memo_bboxes, memo_labels, memo_embeds, memo_ids.squeeze( + 0), memo_vs + + def track(self, + model: torch.nn.Module, + img: torch.Tensor, + feats: List[torch.Tensor], + data_sample: TrackDataSample, + rescale=True, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + QDTrack method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_instances`. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + True. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
+ """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + # create pred_track_instances + pred_track_instances = InstanceData() + + # return zero bboxes if there is no track targets + if bboxes.shape[0] == 0: + ids = torch.zeros_like(labels) + pred_track_instances = data_sample.pred_instances.clone() + pred_track_instances.instances_id = ids + return pred_track_instances + + # get track feats + rescaled_bboxes = bboxes.clone() + if rescale: + scale_factor = rescaled_bboxes.new_tensor( + metainfo['scale_factor']).repeat((1, 2)) + rescaled_bboxes = rescaled_bboxes * scale_factor + track_feats = model.track_head.predict(feats, [rescaled_bboxes]) + # sort according to the object_score + _, inds = scores.sort(descending=True) + bboxes = bboxes[inds] + scores = scores[inds] + labels = labels[inds] + embeds = track_feats[inds, :] + + # duplicate removal for potential backdrops and cross classes + valids = bboxes.new_ones((bboxes.size(0))) + ious = bbox_overlaps(bboxes, bboxes) + for i in range(1, bboxes.size(0)): + thr = self.nms_backdrop_iou_thr if scores[ + i] < self.obj_score_thr else self.nms_class_iou_thr + if (ious[i, :i] > thr).any(): + valids[i] = 0 + valids = valids == 1 + bboxes = bboxes[valids] + scores = scores[valids] + labels = labels[valids] + embeds = embeds[valids, :] + + # init ids container + ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long) + + # match if buffer is not empty + if bboxes.size(0) > 0 and not self.empty: + (memo_bboxes, memo_labels, memo_embeds, memo_ids, + memo_vs) = self.memo + + if self.match_metric == 'bisoftmax': + feats = torch.mm(embeds, memo_embeds.t()) + d2t_scores = feats.softmax(dim=1) + t2d_scores = feats.softmax(dim=0) + match_scores = (d2t_scores + t2d_scores) / 2 + elif self.match_metric == 'softmax': + feats = torch.mm(embeds, memo_embeds.t()) + match_scores = feats.softmax(dim=1) + elif self.match_metric == 'cosine': + match_scores = torch.mm( + F.normalize(embeds, p=2, dim=1), + F.normalize(memo_embeds, p=2, dim=1).t()) + else: + raise NotImplementedError + # track with the same category + if self.with_cats: + cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) + match_scores *= cat_same.float().to(match_scores.device) + # track according to match_scores + for i in range(bboxes.size(0)): + conf, memo_ind = torch.max(match_scores[i, :], dim=0) + id = memo_ids[memo_ind] + if conf > self.match_score_thr: + if id > -1: + # keep bboxes with high object score + # and remove background bboxes + if scores[i] > self.obj_score_thr: + ids[i] = id + match_scores[:i, memo_ind] = 0 + match_scores[i + 1:, memo_ind] = 0 + else: + if conf > self.nms_conf_thr: + ids[i] = -2 + # initialize new tracks + new_inds = (ids == -1) & (scores > self.init_score_thr).cpu() + num_news = new_inds.sum() + ids[new_inds] = torch.arange( + self.num_tracks, self.num_tracks + num_news, dtype=torch.long) + self.num_tracks += num_news + + self.update(ids, bboxes, embeds, labels, scores, frame_id) + tracklet_inds = ids > -1 + # update pred_track_instances + pred_track_instances.bboxes = bboxes[tracklet_inds] + pred_track_instances.labels = labels[tracklet_inds] + pred_track_instances.scores = scores[tracklet_inds] + pred_track_instances.instances_id = ids[tracklet_inds] + + return pred_track_instances diff --git a/mmdetection/mmdet/models/trackers/sort_tracker.py 
b/mmdetection/mmdet/models/trackers/sort_tracker.py new file mode 100644 index 0000000..c4a4fed --- /dev/null +++ b/mmdetection/mmdet/models/trackers/sort_tracker.py @@ -0,0 +1,268 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import numpy as np +import torch +from mmengine.structures import InstanceData + +try: + import motmetrics + from motmetrics.lap import linear_sum_assignment +except ImportError: + motmetrics = None +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcyah +from mmdet.utils import OptConfigType +from ..utils import imrenormalize +from .base_tracker import BaseTracker + + +@MODELS.register_module() +class SORTTracker(BaseTracker): + """Tracker for SORT/DeepSORT. + + Args: + obj_score_thr (float, optional): Threshold to filter the objects. + Defaults to 0.3. + motion (dict): Configuration of motion. Defaults to None. + reid (dict, optional): Configuration for the ReID model. + - num_samples (int, optional): Number of samples to calculate the + feature embeddings of a track. Default to 10. + - image_scale (tuple, optional): Input scale of the ReID model. + Default to (256, 128). + - img_norm_cfg (dict, optional): Configuration to normalize the + input. Default to None. + - match_score_thr (float, optional): Similarity threshold for the + matching process. Default to 2.0. + match_iou_thr (float, optional): Threshold of the IoU matching process. + Defaults to 0.7. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 3. + """ + + def __init__(self, + motion: Optional[dict] = None, + obj_score_thr: float = 0.3, + reid: dict = dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr: float = 0.7, + num_tentatives: int = 3, + **kwargs): + if motmetrics is None: + raise RuntimeError('motmetrics is not installed,\ + please install it by: pip install motmetrics') + super().__init__(**kwargs) + if motion is not None: + self.motion = TASK_UTILS.build(motion) + assert self.motion is not None, 'SORT/Deep SORT need KalmanFilter' + self.obj_score_thr = obj_score_thr + self.reid = reid + self.match_iou_thr = match_iou_thr + self.num_tentatives = num_tentatives + + @property + def confirmed_ids(self) -> List: + """Confirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if not track.tentative] + return ids + + def init_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Initialize a track.""" + super().init_track(id, obj) + self.tracks[id].tentative = True + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( + bbox) + + def update_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Update a track.""" + super().update_track(id, obj) + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox) + + def pop_invalid_tracks(self, frame_id: int) -> None: + """Pop out 
invalid tracks.""" + invalid_ids = [] + for k, v in self.tracks.items(): + # case1: disappeared frames >= self.num_frames_retrain + case1 = frame_id - v['frame_ids'][-1] >= self.num_frames_retain + # case2: tentative tracks but not matched in this frame + case2 = v.tentative and v['frame_ids'][-1] != frame_id + if case1 or case2: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + def track(self, + model: torch.nn.Module, + img: Tensor, + data_sample: DetDataSample, + data_preprocessor: OptConfigType = None, + rescale: bool = False, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + SORT method. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + False. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. + """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = self.motion + + if self.with_reid: + if self.reid.get('img_norm_cfg', False): + img_norm_cfg = dict( + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + to_bgr=data_preprocessor['rgb_to_bgr']) + reid_img = imrenormalize(img, img_norm_cfg, + self.reid['img_norm_cfg']) + else: + reid_img = img.clone() + + valid_inds = scores > self.obj_score_thr + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + + if self.empty or bboxes.size(0) == 0: + num_new_tracks = bboxes.size(0) + ids = torch.arange( + self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long).to(bboxes.device) + self.num_tracks += num_new_tracks + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + if crops.size(0) > 0: + embeds = model.reid(crops, mode='tensor') + else: + embeds = crops.new_zeros((0, model.reid.head.out_channels)) + else: + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) + + # motion + self.tracks, costs = self.motion.track(self.tracks, + bbox_xyxy_to_cxcyah(bboxes)) + + active_ids = self.confirmed_ids + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + embeds = model.reid(crops, mode='tensor') + + # reid + if len(active_ids) > 0: + track_embeds = self.get( + 'embeds', + active_ids, + self.reid.get('num_samples', None), + behavior='mean') + reid_dists = torch.cdist(track_embeds, embeds) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, :] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + reid_dists = (reid_dists + cate_cost).cpu().numpy() + + 
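The category gate above is the same trick used throughout these trackers: cross-class track/detection pairs receive a huge constant cost so the assignment solver cannot pick them. A standalone sketch with invented labels:

```python
# Sketch of the multi-class gating cost used above; labels are invented.
import torch

track_labels = torch.tensor([0, 0, 1])    # one track per row
det_labels = torch.tensor([0, 1, 1, 0])   # one detection per column
cate_match = det_labels[None, :] == track_labels[:, None]   # (3, 4) bool
cate_cost = (1 - cate_match.int()) * 1e6  # 0 where classes agree, 1e6 otherwise
print(cate_cost)
# rows = tracks, cols = detections:
# [[0,   1e6, 1e6, 0  ],
#  [0,   1e6, 1e6, 0  ],
#  [1e6, 0,   0,   1e6]]
# Adding this to a (1 - IoU) or ReID distance matrix makes cross-class
# assignments effectively impossible for the Hungarian / lapjv solver.
```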
valid_inds = [list(self.ids).index(_) for _ in active_ids] + reid_dists[~np.isfinite(costs[valid_inds, :])] = np.nan + + row, col = linear_sum_assignment(reid_dists) + for r, c in zip(row, col): + dist = reid_dists[r, c] + if not np.isfinite(dist): + continue + if dist <= self.reid['match_score_thr']: + ids[c] = active_ids[r] + + active_ids = [ + id for id in self.ids if id not in ids + and self.tracks[id].frame_ids[-1] == frame_id - 1 + ] + if len(active_ids) > 0: + active_dets = torch.nonzero(ids == -1).squeeze(1) + track_bboxes = self.get('bboxes', active_ids) + ious = bbox_overlaps(track_bboxes, bboxes[active_dets]) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, active_dets] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + row, col = linear_sum_assignment(dists) + for r, c in zip(row, col): + dist = dists[r, c] + if dist < 1 - self.match_iou_thr: + ids[active_dets[c]] = active_ids[r] + + new_track_inds = ids == -1 + ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum(), + dtype=torch.long).to(bboxes.device) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + embeds=embeds if self.with_reid else None, + frame_ids=frame_id) + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmdetection/mmdet/models/trackers/strongsort_tracker.py b/mmdetection/mmdet/models/trackers/strongsort_tracker.py new file mode 100644 index 0000000..9d70757 --- /dev/null +++ b/mmdetection/mmdet/models/trackers/strongsort_tracker.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import numpy as np +import torch +from mmengine.structures import InstanceData + +try: + import motmetrics + from motmetrics.lap import linear_sum_assignment +except ImportError: + motmetrics = None +from torch import Tensor + +from mmdet.models.utils import imrenormalize +from mmdet.registry import MODELS +from mmdet.structures import TrackDataSample +from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcyah +from mmdet.utils import OptConfigType +from .sort_tracker import SORTTracker + + +def cosine_distance(x: Tensor, y: Tensor) -> np.ndarray: + """compute the cosine distance. + + Args: + x (Tensor): embeddings with shape (N,C). + y (Tensor): embeddings with shape (M,C). + + Returns: + ndarray: cosine distance with shape (N,M). + """ + x = x.cpu().numpy() + y = y.cpu().numpy() + x = x / np.linalg.norm(x, axis=1, keepdims=True) + y = y / np.linalg.norm(y, axis=1, keepdims=True) + dists = 1. - np.dot(x, y.T) + return dists + + +@MODELS.register_module() +class StrongSORTTracker(SORTTracker): + """Tracker for StrongSORT. + + Args: + obj_score_thr (float, optional): Threshold to filter the objects. + Defaults to 0.6. + motion (dict): Configuration of motion. Defaults to None. + reid (dict, optional): Configuration for the ReID model. + - num_samples (int, optional): Number of samples to calculate the + feature embeddings of a track. Default to None. + - image_scale (tuple, optional): Input scale of the ReID model. + Default to (256, 128). 
+ - img_norm_cfg (dict, optional): Configuration to normalize the + input. Default to None. + - match_score_thr (float, optional): Similarity threshold for the + matching process. Default to 0.3. + - motion_weight (float, optional): the weight of the motion cost. + Defaults to 0.02. + match_iou_thr (float, optional): Threshold of the IoU matching process. + Defaults to 0.7. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 2. + """ + + def __init__(self, + motion: Optional[dict] = None, + obj_score_thr: float = 0.6, + reid: dict = dict( + num_samples=None, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=0.3, + motion_weight=0.02), + match_iou_thr: float = 0.7, + num_tentatives: int = 2, + **kwargs): + if motmetrics is None: + raise RuntimeError('motmetrics is not installed,\ + please install it by: pip install motmetrics') + super().__init__(motion, obj_score_thr, reid, match_iou_thr, + num_tentatives, **kwargs) + + def update_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Update a track.""" + for k, v in zip(self.memo_items, obj): + v = v[None] + if self.momentums is not None and k in self.momentums: + m = self.momentums[k] + self.tracks[id][k] = (1 - m) * self.tracks[id][k] + m * v + else: + self.tracks[id][k].append(v) + + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + score = float(self.tracks[id].scores[-1].cpu()) + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox, score) + + def track(self, + model: torch.nn.Module, + img: Tensor, + data_sample: TrackDataSample, + data_preprocessor: OptConfigType = None, + rescale: bool = False, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + SORT method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + False. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
+ """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = self.motion + + if self.with_reid: + if self.reid.get('img_norm_cfg', False): + img_norm_cfg = dict( + mean=data_preprocessor.get('mean', [0, 0, 0]), + std=data_preprocessor.get('std', [1, 1, 1]), + to_bgr=data_preprocessor.get('rgb_to_bgr', False)) + reid_img = imrenormalize(img, img_norm_cfg, + self.reid['img_norm_cfg']) + else: + reid_img = img.clone() + + valid_inds = scores > self.obj_score_thr + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + + if self.empty or bboxes.size(0) == 0: + num_new_tracks = bboxes.size(0) + ids = torch.arange( + self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long).to(bboxes.device) + self.num_tracks += num_new_tracks + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + if crops.size(0) > 0: + embeds = model.reid(crops, mode='tensor') + else: + embeds = crops.new_zeros((0, model.reid.head.out_channels)) + else: + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) + + # motion + if model.with_cmc: + num_samples = 1 + self.tracks = model.cmc.track(self.last_img, img, self.tracks, + num_samples, frame_id, metainfo) + + self.tracks, motion_dists = self.motion.track( + self.tracks, bbox_xyxy_to_cxcyah(bboxes)) + + active_ids = self.confirmed_ids + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + embeds = model.reid(crops, mode='tensor') + + # reid + if len(active_ids) > 0: + track_embeds = self.get( + 'embeds', + active_ids, + self.reid.get('num_samples', None), + behavior='mean') + reid_dists = cosine_distance(track_embeds, embeds) + valid_inds = [list(self.ids).index(_) for _ in active_ids] + reid_dists[~np.isfinite(motion_dists[ + valid_inds, :])] = np.nan + + weight_motion = self.reid.get('motion_weight') + match_dists = (1 - weight_motion) * reid_dists + \ + weight_motion * motion_dists[valid_inds] + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, :] == track_labels[:, None] + cate_cost = ((1 - cate_match.int()) * 1e6).cpu().numpy() + match_dists = match_dists + cate_cost + + row, col = linear_sum_assignment(match_dists) + for r, c in zip(row, col): + dist = match_dists[r, c] + if not np.isfinite(dist): + continue + if dist <= self.reid['match_score_thr']: + ids[c] = active_ids[r] + + active_ids = [ + id for id in self.ids if id not in ids + and self.tracks[id].frame_ids[-1] == frame_id - 1 + ] + if len(active_ids) > 0: + active_dets = torch.nonzero(ids == -1).squeeze(1) + track_bboxes = self.get('bboxes', active_ids) + ious = bbox_overlaps(track_bboxes, bboxes[active_dets]) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, active_dets] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + row, col = linear_sum_assignment(dists) + for r, c in zip(row, col): + dist = dists[r, c] + if dist < 1 - self.match_iou_thr: + ids[active_dets[c]] = active_ids[r] + + new_track_inds = ids == -1 + 
ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum(), + dtype=torch.long).to(bboxes.device) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + embeds=embeds if self.with_reid else None, + frame_ids=frame_id) + self.last_img = img + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/mmdetection/mmdet/models/tracking_heads/__init__.py b/mmdetection/mmdet/models/tracking_heads/__init__.py new file mode 100644 index 0000000..bd1f056 --- /dev/null +++ b/mmdetection/mmdet/models/tracking_heads/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mask2former_track_head import Mask2FormerTrackHead +from .quasi_dense_embed_head import QuasiDenseEmbedHead +from .quasi_dense_track_head import QuasiDenseTrackHead +from .roi_embed_head import RoIEmbedHead +from .roi_track_head import RoITrackHead + +__all__ = [ + 'QuasiDenseEmbedHead', 'QuasiDenseTrackHead', 'Mask2FormerTrackHead', + 'RoIEmbedHead', 'RoITrackHead' +] diff --git a/mmdetection/mmdet/models/tracking_heads/mask2former_track_head.py b/mmdetection/mmdet/models/tracking_heads/mask2former_track_head.py new file mode 100644 index 0000000..0877241 --- /dev/null +++ b/mmdetection/mmdet/models/tracking_heads/mask2former_track_head.py @@ -0,0 +1,729 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from collections import defaultdict +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d +from mmcv.ops import point_sample +from mmengine.model import ModuleList +from mmengine.model.weight_init import caffe2_xavier_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.dense_heads import AnchorFreeHead, MaskFormerHead +from mmdet.models.utils import get_uncertain_point_coords_with_randomness +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import TrackDataSample, TrackSampleList +from mmdet.structures.mask import mask2bbox +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptMultiConfig, reduce_mean) +from ..layers import Mask2FormerTransformerDecoder + + +@MODELS.register_module() +class Mask2FormerTrackHead(MaskFormerHead): + """Implements the Mask2Former head. + + See `Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + num_classes (int): Number of VIS classes. + num_queries (int): Number of query in Transformer decoder. + Defaults to 100. + num_transformer_feat_level (int): Number of feats levels. + Defaults to 3. + pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel + decoder. + enforce_decoder_input_project (bool, optional): Whether to add + a layer to change the embed_dim of transformer encoder in + pixel decoder to the embed_dim of transformer decoder. + Defaults to False. + transformer_decoder (:obj:`ConfigDict` or dict): Config for + transformer decoder. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer decoder position encoding. 
+ Defaults to `SinePositionalEncoding3D`. + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to `CrossEntropyLoss`. + loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss. + Defaults to 'CrossEntropyLoss'. + loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss. + Defaults to 'DiceLoss'. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + Mask2Former head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + Mask2Former head. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + feat_channels: int, + out_channels: int, + num_classes: int, + num_frames: int = 2, + num_queries: int = 100, + num_transformer_feat_level: int = 3, + pixel_decoder: ConfigType = ..., + enforce_decoder_input_project: bool = False, + transformer_decoder: ConfigType = ..., + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * 133 + [0.1]), + loss_mask: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice: ConfigType = dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(AnchorFreeHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.num_frames = num_frames + self.num_queries = num_queries + self.num_transformer_feat_level = num_transformer_feat_level + self.num_transformer_feat_level = num_transformer_feat_level + self.num_heads = transformer_decoder.layer_cfg.cross_attn_cfg.num_heads + self.num_transformer_decoder_layers = transformer_decoder.num_layers + assert pixel_decoder.encoder.layer_cfg. 
\ + self_attn_cfg.num_levels == num_transformer_feat_level + pixel_decoder_ = copy.deepcopy(pixel_decoder) + pixel_decoder_.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = MODELS.build(pixel_decoder_) + self.transformer_decoder = Mask2FormerTransformerDecoder( + **transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + + self.decoder_input_projs = ModuleList() + # from low resolution to high resolution + for _ in range(num_transformer_feat_level): + if (self.decoder_embed_dims != feat_channels + or enforce_decoder_input_project): + self.decoder_input_projs.append( + Conv2d( + feat_channels, self.decoder_embed_dims, kernel_size=1)) + else: + self.decoder_input_projs.append(nn.Identity()) + self.decoder_positional_encoding = MODELS.build(positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, feat_channels) + self.query_feat = nn.Embedding(self.num_queries, feat_channels) + # from low resolution to high resolution + self.level_embed = nn.Embedding(self.num_transformer_feat_level, + feat_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.sampler = TASK_UTILS.build( + # self.train_cfg.sampler, default_args=dict(context=self)) + self.train_cfg['sampler'], + default_args=dict(context=self)) + self.num_points = self.train_cfg.get('num_points', 12544) + self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) + self.importance_sample_ratio = self.train_cfg.get( + 'importance_sample_ratio', 0.75) + + self.class_weight = loss_cls.class_weight + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.loss_dice = MODELS.build(loss_dice) + + def init_weights(self) -> None: + for m in self.decoder_input_projs: + if isinstance(m, Conv2d): + caffe2_xavier_init(m, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def preprocess_gt(self, batch_gt_instances: InstanceList) -> InstanceList: + """Preprocess the ground truth for all images. + + It aims to reorganize the `gt`. For example, in the + `batch_data_sample.gt_instances.mask`, its shape is + `(all_num_gts, h, w)`, but we don't know each gt belongs to which `img` + (assume `num_frames` is 2). So, this func used to reshape the `gt_mask` + to `(num_gts_per_img, num_frames, h, w)`. In addition, we can't + guarantee that the number of instances in these two images is equal, + so `-1` refers to nonexistent instances. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + ground truth labels of each bbox, with shape (num_gts, ) + and ``masks``, each is ground truth masks of each instances + of an image, shape (num_gts, h, w). + + Returns: + list[obj:`InstanceData`]: each contains the following keys + + - labels (Tensor): Ground truth class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Ground truth mask for a\ + image, with shape (n, t, h, w). 
+        """
+        final_batch_gt_instances = []
+        batch_size = len(batch_gt_instances) // self.num_frames
+        for batch_idx in range(batch_size):
+            pair_gt_instances = batch_gt_instances[batch_idx *
+                                                   self.num_frames:batch_idx *
+                                                   self.num_frames +
+                                                   self.num_frames]
+
+            assert len(
+                pair_gt_instances
+            ) > 1, f'mask2former for vis needs multiple frames to train, \
+                but you only use {len(pair_gt_instances)} frames'
+
+            _device = pair_gt_instances[0].labels.device
+
+            for gt_instances in pair_gt_instances:
+                gt_instances.masks = gt_instances.masks.to_tensor(
+                    dtype=torch.bool, device=_device)
+            all_ins_id = torch.cat([
+                gt_instances.instances_ids
+                for gt_instances in pair_gt_instances
+            ])
+            all_ins_id = all_ins_id.unique().tolist()
+            map_ins_id = dict()
+            for i, ins_id in enumerate(all_ins_id):
+                map_ins_id[ins_id] = i
+
+            num_instances = len(all_ins_id)
+            mask_shape = [
+                num_instances, self.num_frames,
+                pair_gt_instances[0].masks.shape[1],
+                pair_gt_instances[0].masks.shape[2]
+            ]
+            gt_masks_per_video = torch.zeros(
+                mask_shape, dtype=torch.bool, device=_device)
+            gt_ids_per_video = torch.full((num_instances, self.num_frames),
+                                          -1,
+                                          dtype=torch.long,
+                                          device=_device)
+            gt_labels_per_video = torch.full((num_instances, ),
+                                             -1,
+                                             dtype=torch.long,
+                                             device=_device)
+
+            for frame_id in range(self.num_frames):
+                cur_frame_gts = pair_gt_instances[frame_id]
+                ins_ids = cur_frame_gts.instances_ids.tolist()
+                for i, id in enumerate(ins_ids):
+                    gt_masks_per_video[map_ins_id[id],
+                                       frame_id, :, :] = cur_frame_gts.masks[i]
+                    gt_ids_per_video[map_ins_id[id],
+                                     frame_id] = cur_frame_gts.instances_ids[i]
+                    gt_labels_per_video[
+                        map_ins_id[id]] = cur_frame_gts.labels[i]
+
+            tmp_instances = InstanceData(
+                labels=gt_labels_per_video,
+                masks=gt_masks_per_video.long(),
+                instances_id=gt_ids_per_video)
+            final_batch_gt_instances.append(tmp_instances)
+
+        return final_batch_gt_instances
+
+    def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict) -> Tuple[Tensor]:
+        """Compute classification and mask targets for one image.
+
+        Args:
+            cls_score (Tensor): Mask score logits from a single decoder layer
+                for one image. Shape (num_queries, cls_out_channels).
+            mask_pred (Tensor): Mask logits for a single decoder layer for one
+                image. Shape (num_queries, num_frames, h, w).
+            gt_instances (:obj:`InstanceData`): It contains ``labels`` and
+                ``masks``.
+            img_meta (dict): Image information.
+
+        Returns:
+            tuple[Tensor]: A tuple containing the following for one image.
+
+                - labels (Tensor): Labels of each image. \
+                    shape (num_queries, ).
+                - label_weights (Tensor): Label weights of each image. \
+                    shape (num_queries, ).
+                - mask_targets (Tensor): Mask targets of each image. \
+                    shape (num_queries, num_frames, h, w).
+                - mask_weights (Tensor): Mask weights of each image. \
+                    shape (num_queries, ).
+                - pos_inds (Tensor): Sampled positive indices for each \
+                    image.
+                - neg_inds (Tensor): Sampled negative indices for each \
+                    image.
+                - sampling_result (:obj:`SamplingResult`): Sampling results.
+ """ + # (num_gts, ) + gt_labels = gt_instances.labels + # (num_gts, num_frames, h, w) + gt_masks = gt_instances.masks + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + + # shape (num_queries, num_points) + mask_points_pred = point_sample(mask_pred, + point_coords.repeat(num_queries, 1, + 1)).flatten(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample(gt_masks.float(), + point_coords.repeat(num_gts, 1, + 1)).flatten(1) + + sampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_points_masks) + sampled_pred_instances = InstanceData( + scores=cls_score, masks=mask_points_pred) + # assign and sample + assign_result = self.assigner.assign( + pred_instances=sampled_pred_instances, + gt_instances=sampled_gt_instances, + img_meta=img_meta) + pred_instances = InstanceData(scores=cls_score, masks=mask_pred) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones((self.num_queries, )) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds, sampling_result) + + def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should include + background. + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, num_frames,h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single \ + decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + avg_factor) = self.get_targets(cls_scores_list, mask_preds_list, + batch_gt_instances, batch_img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, num_frames, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, num_frames, h, w) + # -> (num_total_gts, num_frames, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.flatten(0, 1).unsqueeze(1), None, self.num_points, + self.oversample_ratio, self.importance_sample_ratio) + # shape (num_total_gts * num_frames, h, w) -> + # (num_total_gts, num_points) + mask_point_targets = point_sample( + mask_targets.flatten(0, 1).unsqueeze(1).float(), + points_coords).squeeze(1) + # shape (num_total_gts * num_frames, num_points) + mask_point_preds = point_sample( + mask_preds.flatten(0, 1).unsqueeze(1), points_coords).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_point_preds, mask_point_targets, avg_factor=num_total_masks) + + # mask loss + # shape (num_total_gts * num_frames, num_points) -> + # (num_total_gts * num_frames * num_points, ) + mask_point_preds = mask_point_preds.reshape(-1) + # shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) + mask_point_targets = mask_point_targets.reshape(-1) + loss_mask = self.loss_mask( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks * self.num_points / self.num_frames) + + return loss_cls, loss_mask, loss_dice + + def _forward_head( + self, decoder_out: Tensor, mask_feature: Tensor, + attn_mask_target_size: Tuple[int, + int]) -> Tuple[Tensor, Tensor, Tensor]: + """Forward for head part which is called after every decoder layer. + + Args: + decoder_out (Tensor): in shape (num_queries, batch_size, c). + mask_feature (Tensor): in shape (batch_size, t, c, h, w). + attn_mask_target_size (tuple[int, int]): target attention + mask size. + + Returns: + tuple: A tuple contain three elements. + + - cls_pred (Tensor): Classification scores in shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should include background. + - mask_pred (Tensor): Mask scores in shape \ + (batch_size, num_queries,h, w). + - attn_mask (Tensor): Attention mask in shape \ + (batch_size * num_heads, num_queries, h, w). 
+ """ + decoder_out = self.transformer_decoder.post_norm(decoder_out) + cls_pred = self.cls_embed(decoder_out) + mask_embed = self.mask_embed(decoder_out) + + # shape (batch_size, num_queries, t, h, w) + mask_pred = torch.einsum('bqc,btchw->bqthw', mask_embed, mask_feature) + b, q, t, _, _ = mask_pred.shape + + attn_mask = F.interpolate( + mask_pred.flatten(0, 1), + attn_mask_target_size, + mode='bilinear', + align_corners=False).view(b, q, t, attn_mask_target_size[0], + attn_mask_target_size[1]) + + # shape (batch_size, num_queries, t, h, w) -> + # (batch_size, num_queries, t*h*w) -> + # (batch_size, num_head, num_queries, t*h*w) -> + # (batch_size*num_head, num_queries, t*h*w) + attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat( + (1, self.num_heads, 1, 1)).flatten(0, 1) + attn_mask = attn_mask.sigmoid() < 0.5 + attn_mask = attn_mask.detach() + + return cls_pred, mask_pred, attn_mask + + def forward( + self, x: List[Tensor], data_samples: TrackDataSample + ) -> Tuple[List[Tensor], List[Tensor]]: + """Forward function. + + Args: + x (list[Tensor]): Multi scale Features from the + upstream network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + + Returns: + tuple[list[Tensor]]: A tuple contains two elements. + + - cls_pred_list (list[Tensor)]: Classification logits \ + for each decoder layer. Each is a 3D-tensor with shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should include background. + - mask_pred_list (list[Tensor]): Mask logits for each \ + decoder layer. Each with shape (batch_size, num_queries, \ + h, w). + """ + mask_features, multi_scale_memorys = self.pixel_decoder(x) + bt, c_m, h_m, w_m = mask_features.shape + batch_size = bt // self.num_frames if self.training else 1 + t = bt // batch_size + mask_features = mask_features.view(batch_size, t, c_m, h_m, w_m) + # multi_scale_memorys (from low resolution to high resolution) + decoder_inputs = [] + decoder_positional_encodings = [] + for i in range(self.num_transformer_feat_level): + decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i]) + decoder_input = decoder_input.flatten(2) + level_embed = self.level_embed.weight[i][None, :, None] + decoder_input = decoder_input + level_embed + _, c, hw = decoder_input.shape + # shape (batch_size*t, c, h, w) -> + # (batch_size, t, c, hw) -> + # (batch_size, t*h*w, c) + decoder_input = decoder_input.view(batch_size, t, c, + hw).permute(0, 1, 3, + 2).flatten(1, 2) + # shape (batch_size, c, h, w) -> (h*w, batch_size, c) + mask = decoder_input.new_zeros( + (batch_size, t) + multi_scale_memorys[i].shape[-2:], + dtype=torch.bool) + decoder_positional_encoding = self.decoder_positional_encoding( + mask) + decoder_positional_encoding = decoder_positional_encoding.flatten( + 3).permute(0, 1, 3, 2).flatten(1, 2) + decoder_inputs.append(decoder_input) + decoder_positional_encodings.append(decoder_positional_encoding) + # shape (num_queries, c) -> (batch_size, num_queries, c) + query_feat = self.query_feat.weight.unsqueeze(0).repeat( + (batch_size, 1, 1)) + query_embed = self.query_embed.weight.unsqueeze(0).repeat( + (batch_size, 1, 1)) + + cls_pred_list = [] + mask_pred_list = [] + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[0].shape[-2:]) + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + for i in range(self.num_transformer_decoder_layers): + level_idx = i % 
self.num_transformer_feat_level + # if a mask is all True(all background), then set it all False. + attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + + # cross_attn + self_attn + layer = self.transformer_decoder.layers[i] + query_feat = layer( + query=query_feat, + key=decoder_inputs[level_idx], + value=decoder_inputs[level_idx], + query_pos=query_embed, + key_pos=decoder_positional_encodings[level_idx], + cross_attn_mask=attn_mask, + query_key_padding_mask=None, + # here we do not apply masking on padded region + key_padding_mask=None) + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[ + (i + 1) % self.num_transformer_feat_level].shape[-2:]) + + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + return cls_pred_list, mask_pred_list + + def loss( + self, + x: Tuple[Tensor], + data_samples: TrackSampleList, + ) -> Dict[str, Tensor]: + """Perform forward propagation and loss calculation of the track head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + batch_img_metas = [] + batch_gt_instances = [] + + for data_sample in data_samples: + video_img_metas = defaultdict(list) + for image_idx in range(len(data_sample)): + batch_gt_instances.append(data_sample[image_idx].gt_instances) + for key, value in data_sample[image_idx].metainfo.items(): + video_img_metas[key].append(value) + batch_img_metas.append(video_img_metas) + + # forward + all_cls_scores, all_mask_preds = self(x, data_samples) + + # preprocess ground truth + batch_gt_instances = self.preprocess_gt(batch_gt_instances) + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, + x: Tuple[Tensor], + data_samples: TrackDataSample, + rescale: bool = True) -> InstanceList: + """Test without augmentation. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + list[obj:`InstanceData`]: each contains the following keys + - labels (Tensor): Prediction class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Prediction mask for a\ + image, with shape (n, t, h, w). 
+ """ + + batch_img_metas = [ + data_samples[img_idx].metainfo + for img_idx in range(len(data_samples)) + ] + all_cls_scores, all_mask_preds = self(x, data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + + mask_cls_results = mask_cls_results[0] + # upsample masks + img_shape = batch_img_metas[0]['batch_input_shape'] + mask_pred_results = F.interpolate( + mask_pred_results[0], + size=(img_shape[0], img_shape[1]), + mode='bilinear', + align_corners=False) + + results = self.predict_by_feat(mask_cls_results, mask_pred_results, + batch_img_metas) + return results + + def predict_by_feat(self, + mask_cls_results: List[Tensor], + mask_pred_results: List[Tensor], + batch_img_metas: List[dict], + rescale: bool = True) -> InstanceList: + """Get top-10 predictions. + + Args: + mask_cls_results (Tensor): Mask classification logits,\ + shape (batch_size, num_queries, cls_out_channels). + Note `cls_out_channels` should include background. + mask_pred_results (Tensor): Mask logits, shape \ + (batch_size, num_queries, h, w). + batch_img_metas (list[dict]): List of image meta information. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + list[obj:`InstanceData`]: each contains the following keys + - labels (Tensor): Prediction class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Prediction mask for a\ + image, with shape (n, t, h, w). + """ + results = [] + if len(mask_cls_results) > 0: + scores = F.softmax(mask_cls_results, dim=-1)[:, :-1] + labels = torch.arange(self.num_classes).unsqueeze(0).repeat( + self.num_queries, 1).flatten(0, 1).to(scores.device) + # keep top-10 predictions + scores_per_image, topk_indices = scores.flatten(0, 1).topk( + 10, sorted=False) + labels_per_image = labels[topk_indices] + topk_indices = topk_indices // self.num_classes + mask_pred_results = mask_pred_results[topk_indices] + + img_shape = batch_img_metas[0]['img_shape'] + mask_pred_results = \ + mask_pred_results[:, :, :img_shape[0], :img_shape[1]] + if rescale: + # return result in original resolution + ori_height, ori_width = batch_img_metas[0]['ori_shape'][:2] + mask_pred_results = F.interpolate( + mask_pred_results, + size=(ori_height, ori_width), + mode='bilinear', + align_corners=False) + + masks = mask_pred_results > 0. + + # format top-10 predictions + for img_idx in range(len(batch_img_metas)): + pred_track_instances = InstanceData() + + pred_track_instances.masks = masks[:, img_idx] + pred_track_instances.bboxes = mask2bbox(masks[:, img_idx]) + pred_track_instances.labels = labels_per_image + pred_track_instances.scores = scores_per_image + pred_track_instances.instances_id = torch.arange(10) + + results.append(pred_track_instances) + + return results diff --git a/mmdetection/mmdet/models/tracking_heads/quasi_dense_embed_head.py b/mmdetection/mmdet/models/tracking_heads/quasi_dense_embed_head.py new file mode 100644 index 0000000..55e3c05 --- /dev/null +++ b/mmdetection/mmdet/models/tracking_heads/quasi_dense_embed_head.py @@ -0,0 +1,347 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
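For reference, below is a minimal standalone sketch of the flattened top-k selection that the `predict_by_feat` method above performs on the query classification logits. It is illustrative only, not part of this patch; the query/class counts and the top-k value are made-up placeholders rather than values taken from any config.

import torch
import torch.nn.functional as F

num_queries, num_classes, topk = 100, 40, 10
# (num_queries, num_classes + 1) logits; the last channel is "no object"
mask_cls = torch.randn(num_queries, num_classes + 1)

scores = F.softmax(mask_cls, dim=-1)[:, :-1]  # drop the background column
labels = torch.arange(num_classes).unsqueeze(0).repeat(
    num_queries, 1).flatten(0, 1)

# rank all (query, class) pairs jointly and keep the best `topk`
scores_per_image, topk_indices = scores.flatten(0, 1).topk(topk, sorted=False)
labels_per_image = labels[topk_indices]        # class id of each kept pair
query_indices = topk_indices // num_classes    # query id of each kept pair
# `query_indices` is what selects the corresponding mask predictions.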
+from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.models.task_modules import SamplingResult +from mmdet.registry import MODELS +from ..task_modules.tracking import embed_similarity + + +@MODELS.register_module() +class QuasiDenseEmbedHead(BaseModule): + """The quasi-dense roi embed head. + + Args: + embed_channels (int): The input channel of embed features. + Defaults to 256. + softmax_temp (int): Softmax temperature. Defaults to -1. + loss_track (dict): The loss function for tracking. Defaults to + MultiPosCrossEntropyLoss. + loss_track_aux (dict): The auxiliary loss function for tracking. + Defaults to MarginL2Loss. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. + """ + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 0, + roi_feat_size: int = 7, + in_channels: int = 256, + conv_out_channels: int = 256, + with_avg_pool: bool = False, + fc_out_channels: int = 1024, + conv_cfg: Optional[dict] = None, + norm_cfg: Optional[dict] = None, + embed_channels: int = 256, + softmax_temp: int = -1, + loss_track: Optional[dict] = None, + loss_track_aux: dict = dict( + type='MarginL2Loss', + sample_ratio=3, + margin=0.3, + loss_weight=1.0, + hard_mining=True), + init_cfg: dict = dict( + type='Xavier', + layer='Linear', + distribution='uniform', + bias=0, + override=dict( + type='Normal', + name='fc_embed', + mean=0, + std=0.01, + bias=0))): + super(QuasiDenseEmbedHead, self).__init__(init_cfg=init_cfg) + self.num_convs = num_convs + self.num_fcs = num_fcs + self.roi_feat_size = _pair(roi_feat_size) + self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.with_avg_pool = with_avg_pool + self.fc_out_channels = fc_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + if self.with_avg_pool: + self.avg_pool = nn.AvgPool2d(self.roi_feat_size) + # add convs and fcs + self.convs, self.fcs, self.last_layer_dim = self._add_conv_fc_branch( + self.num_convs, self.num_fcs, self.in_channels) + self.relu = nn.ReLU(inplace=True) + + if loss_track is None: + loss_track = dict( + type='MultiPosCrossEntropyLoss', loss_weight=0.25) + + self.fc_embed = nn.Linear(self.last_layer_dim, embed_channels) + self.softmax_temp = softmax_temp + self.loss_track = MODELS.build(loss_track) + if loss_track_aux is not None: + self.loss_track_aux = MODELS.build(loss_track_aux) + else: + self.loss_track_aux = None + + def _add_conv_fc_branch( + self, num_branch_convs: int, num_branch_fcs: int, + in_channels: int) -> Tuple[nn.ModuleList, nn.ModuleList, int]: + """Add shared or separable branch. convs -> avg pool (optional) -> fcs. + + Args: + num_branch_convs (int): The number of convoluational layers. + num_branch_fcs (int): The number of fully connection layers. + in_channels (int): The input channel of roi features. + + Returns: + Tuple[nn.ModuleList, nn.ModuleList, int]: The convs, fcs and the + last layer dimension. 
+ """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, + self.conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + last_layer_dim = self.conv_out_channels + + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + if not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + + return branch_convs, branch_fcs, last_layer_dim + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): The input features from ROI head. + + Returns: + Tensor: The embedding feature map. + """ + + if self.num_convs > 0: + for conv in self.convs: + x = conv(x) + x = x.flatten(1) + if self.num_fcs > 0: + for fc in self.fcs: + x = self.relu(fc(x)) + x = self.fc_embed(x) + return x + + def get_targets( + self, gt_match_indices: List[Tensor], + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult]) -> Tuple[List, List]: + """Calculate the track targets and track weights for all samples in a + batch according to the sampling_results. + + Args: + gt_match_indices (list(Tensor)): Mapping from gt_instance_ids to + ref_gt_instance_ids of the same tracklet in a pair of images. + key_sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResult]): Assign results of + all reference images in a batch after sampling. + + Returns: + Tuple[list[Tensor]]: Association results. + Containing the following list of Tensors: + + - track_targets (list[Tensor]): The mapping instance ids from + all positive proposals in the key image to all proposals + in the reference image, each tensor in list has + shape (len(key_pos_bboxes), len(ref_bboxes)). + - track_weights (list[Tensor]): Loss weights for all positive + proposals in a batch, each tensor in list has + shape (len(key_pos_bboxes),). + """ + + track_targets = [] + track_weights = [] + for _gt_match_indices, key_res, ref_res in zip(gt_match_indices, + key_sampling_results, + ref_sampling_results): + targets = _gt_match_indices.new_zeros( + (key_res.pos_bboxes.size(0), ref_res.bboxes.size(0)), + dtype=torch.int) + _match_indices = _gt_match_indices[key_res.pos_assigned_gt_inds] + pos2pos = (_match_indices.view( + -1, 1) == ref_res.pos_assigned_gt_inds.view(1, -1)).int() + targets[:, :pos2pos.size(1)] = pos2pos + weights = (targets.sum(dim=1) > 0).float() + track_targets.append(targets) + track_weights.append(weights) + return track_targets, track_weights + + def match( + self, key_embeds: Tensor, ref_embeds: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult] + ) -> Tuple[List[Tensor], List[Tensor]]: + """Calculate the dist matrixes for loss measurement. + + Args: + key_embeds (Tensor): Embeds of positive bboxes in sampling results + of key image. + ref_embeds (Tensor): Embeds of all bboxes in sampling results + of the reference image. + key_sampling_results (List[obj:SamplingResults]): Assign results of + all images in a batch after sampling. 
+ ref_sampling_results (List[obj:SamplingResults]): Assign results of + all reference images in a batch after sampling. + + Returns: + Tuple[list[Tensor]]: Calculation results. + Containing the following list of Tensors: + + - dists (list[Tensor]): Dot-product dists between + key_embeds and ref_embeds, each tensor in list has + shape (len(key_pos_bboxes), len(ref_bboxes)). + - cos_dists (list[Tensor]): Cosine dists between + key_embeds and ref_embeds, each tensor in list has + shape (len(key_pos_bboxes), len(ref_bboxes)). + """ + + num_key_rois = [res.pos_bboxes.size(0) for res in key_sampling_results] + key_embeds = torch.split(key_embeds, num_key_rois) + num_ref_rois = [res.bboxes.size(0) for res in ref_sampling_results] + ref_embeds = torch.split(ref_embeds, num_ref_rois) + + dists, cos_dists = [], [] + for key_embed, ref_embed in zip(key_embeds, ref_embeds): + dist = embed_similarity( + key_embed, + ref_embed, + method='dot_product', + temperature=self.softmax_temp) + dists.append(dist) + if self.loss_track_aux is not None: + cos_dist = embed_similarity( + key_embed, ref_embed, method='cosine') + cos_dists.append(cos_dist) + else: + cos_dists.append(None) + return dists, cos_dists + + def loss(self, key_roi_feats: Tensor, ref_roi_feats: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult], + gt_match_indices_list: List[Tensor]) -> dict: + """Calculate the track loss and the auxiliary track loss. + + Args: + key_roi_feats (Tensor): Embeds of positive bboxes in sampling + results of key image. + ref_roi_feats (Tensor): Embeds of all bboxes in sampling results + of the reference image. + key_sampling_results (List[obj:SamplingResults]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResults]): Assign results of + all reference images in a batch after sampling. + gt_match_indices_list (list(Tensor)): Mapping from gt_instances_ids + to ref_gt_instances_ids of the same tracklet in a pair of + images. + + Returns: + Dict [str: Tensor]: Calculation results. + Containing the following list of Tensors: + + - loss_track (Tensor): Results of loss_track function. + - loss_track_aux (Tensor): Results of loss_track_aux function. + """ + key_track_feats = self(key_roi_feats) + ref_track_feats = self(ref_roi_feats) + + losses = self.loss_by_feat(key_track_feats, ref_track_feats, + key_sampling_results, ref_sampling_results, + gt_match_indices_list) + return losses + + def loss_by_feat(self, key_track_feats: Tensor, ref_track_feats: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult], + gt_match_indices_list: List[Tensor]) -> dict: + """Calculate the track loss and the auxiliary track loss. + + Args: + key_track_feats (Tensor): Embeds of positive bboxes in sampling + results of key image. + ref_track_feats (Tensor): Embeds of all bboxes in sampling results + of the reference image. + key_sampling_results (List[obj:SamplingResults]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResults]): Assign results of + all reference images in a batch after sampling. + gt_match_indices_list (list(Tensor)): Mapping from instances_ids + from key image to reference image of the same tracklet in a + pair of images. + + Returns: + Dict [str: Tensor]: Calculation results. + Containing the following list of Tensors: + + - loss_track (Tensor): Results of loss_track function. 
+ - loss_track_aux (Tensor): Results of loss_track_aux function. + """ + dists, cos_dists = self.match(key_track_feats, ref_track_feats, + key_sampling_results, + ref_sampling_results) + targets, weights = self.get_targets(gt_match_indices_list, + key_sampling_results, + ref_sampling_results) + losses = dict() + + loss_track = 0. + loss_track_aux = 0. + for _dists, _cos_dists, _targets, _weights in zip( + dists, cos_dists, targets, weights): + loss_track += self.loss_track( + _dists, _targets, _weights, avg_factor=_weights.sum()) + if self.loss_track_aux is not None: + loss_track_aux += self.loss_track_aux(_cos_dists, _targets) + losses['loss_track'] = loss_track / len(dists) + + if self.loss_track_aux is not None: + losses['loss_track_aux'] = loss_track_aux / len(dists) + + return losses + + def predict(self, bbox_feats: Tensor) -> Tensor: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + bbox_feats: The extracted roi features. + + Returns: + Tensor: The extracted track features. + """ + track_feats = self(bbox_feats) + return track_feats diff --git a/mmdetection/mmdet/models/tracking_heads/quasi_dense_track_head.py b/mmdetection/mmdet/models/tracking_heads/quasi_dense_track_head.py new file mode 100644 index 0000000..bd078da --- /dev/null +++ b/mmdetection/mmdet/models/tracking_heads/quasi_dense_track_head.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import TrackSampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList + + +@MODELS.register_module() +class QuasiDenseTrackHead(BaseModule): + """The quasi-dense track head.""" + + def __init__(self, + roi_extractor: Optional[dict] = None, + embed_head: Optional[dict] = None, + regress_head: Optional[dict] = None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + init_cfg: Optional[dict] = None, + **kwargs): + super().__init__(init_cfg=init_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if embed_head is not None: + self.init_embed_head(roi_extractor, embed_head) + + if regress_head is not None: + raise NotImplementedError('Regression head is not supported yet.') + + self.init_assigner_sampler() + + def init_embed_head(self, roi_extractor, embed_head) -> None: + """Initialize ``embed_head`` + + Args: + roi_extractor (dict, optional): Configuration of roi extractor. + Defaults to None. + embed_head (dict, optional): Configuration of embed head. Defaults + to None. + """ + self.roi_extractor = MODELS.build(roi_extractor) + self.embed_head = MODELS.build(embed_head) + + def init_assigner_sampler(self) -> None: + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.bbox_sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + + @property + def with_track(self) -> bool: + """bool: whether the multi-object tracker has an embed head""" + return hasattr(self, 'embed_head') and self.embed_head is not None + + def extract_roi_feats(self, feats: List[Tensor], + bboxes: List[Tensor]) -> Tensor: + """Extract roi features. + + Args: + feats (list[Tensor]): list of multi-level image features. 
+ bboxes (list[Tensor]): list of bboxes in sampling result. + + Returns: + Tensor: The extracted roi features. + """ + rois = bbox2roi(bboxes) + bbox_feats = self.roi_extractor(feats[:self.roi_extractor.num_inputs], + rois) + return bbox_feats + + def loss(self, key_feats: List[Tensor], ref_feats: List[Tensor], + rpn_results_list: InstanceList, + ref_rpn_results_list: InstanceList, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + key_feats (list[Tensor]): list of multi-level image features. + ref_feats (list[Tensor]): list of multi-level ref_img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals of key img. + ref_rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals of ref img. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + assert self.with_track + num_imgs = len(data_samples) + batch_gt_instances = [] + ref_batch_gt_instances = [] + batch_gt_instances_ignore = [] + gt_match_indices_list = [] + for track_data_sample in data_samples: + key_data_sample = track_data_sample.get_key_frames()[0] + ref_data_sample = track_data_sample.get_ref_frames()[0] + batch_gt_instances.append(key_data_sample.gt_instances) + ref_batch_gt_instances.append(ref_data_sample.gt_instances) + if 'ignored_instances' in key_data_sample: + batch_gt_instances_ignore.append( + key_data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + # get gt_match_indices + ins_ids = key_data_sample.gt_instances.instances_ids.tolist() + ref_ins_ids = ref_data_sample.gt_instances.instances_ids.tolist() + match_indices = Tensor([ + ref_ins_ids.index(i) if (i in ref_ins_ids and i > 0) else -1 + for i in ins_ids + ]).to(key_feats[0].device) + gt_match_indices_list.append(match_indices) + + key_sampling_results, ref_sampling_results = [], [] + for i in range(num_imgs): + rpn_results = rpn_results_list[i] + ref_rpn_results = ref_rpn_results_list[i] + # rename ref_rpn_results.bboxes to ref_rpn_results.priors + ref_rpn_results.priors = ref_rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in key_feats]) + key_sampling_results.append(sampling_result) + + ref_assign_result = self.bbox_assigner.assign( + ref_rpn_results, ref_batch_gt_instances[i], + batch_gt_instances_ignore[i]) + ref_sampling_result = self.bbox_sampler.sample( + ref_assign_result, + ref_rpn_results, + ref_batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in ref_feats]) + ref_sampling_results.append(ref_sampling_result) + + key_bboxes = [res.pos_bboxes for res in key_sampling_results] + key_roi_feats = self.extract_roi_feats(key_feats, key_bboxes) + ref_bboxes = [res.bboxes for res in ref_sampling_results] + ref_roi_feats = self.extract_roi_feats(ref_feats, ref_bboxes) + + loss_track = self.embed_head.loss(key_roi_feats, ref_roi_feats, + key_sampling_results, + ref_sampling_results, + gt_match_indices_list) + + return loss_track + + def predict(self, feats: List[Tensor], + rescaled_bboxes: List[Tensor]) -> Tensor: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. 
+
+        Args:
+            feats (list[Tensor]): Multi-level feature maps of `img`.
+            rescaled_bboxes (list[Tensor]): list of rescaled bboxes in sampling
+                result.
+
+        Returns:
+            Tensor: The extracted track features.
+        """
+        bbox_feats = self.extract_roi_feats(feats, rescaled_bboxes)
+        track_feats = self.embed_head.predict(bbox_feats)
+        return track_feats
diff --git a/mmdetection/mmdet/models/tracking_heads/roi_embed_head.py b/mmdetection/mmdet/models/tracking_heads/roi_embed_head.py
new file mode 100644
index 0000000..e18b81f
--- /dev/null
+++ b/mmdetection/mmdet/models/tracking_heads/roi_embed_head.py
@@ -0,0 +1,391 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import defaultdict
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.losses import accuracy
+from mmdet.models.task_modules import SamplingResult
+from mmdet.models.task_modules.tracking import embed_similarity
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class RoIEmbedHead(BaseModule):
+    """The roi embed head.
+
+    This module is used in multi-object tracking methods, such as MaskTrack
+    R-CNN.
+
+    Args:
+        num_convs (int): The number of convolutional layers to embed roi
+            features. Defaults to 0.
+        num_fcs (int): The number of fully connected layers to embed roi
+            features. Defaults to 0.
+        roi_feat_size (int|tuple(int)): The spatial size of roi features.
+            Defaults to 7.
+        in_channels (int): The input channel of roi features. Defaults to 256.
+        conv_out_channels (int): The output channel of roi features after
+            forwarding convolutional layers. Defaults to 256.
+        with_avg_pool (bool): Whether to use average pooling before passing roi
+            features into fully connected layers. Defaults to False.
+        fc_out_channels (int): The output channel of roi features after
+            forwarding fully connected layers. Defaults to 1024.
+        conv_cfg (dict): Config dict for convolution layer. Defaults to None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer. Defaults to None.
+        loss_match (dict): The loss function. Defaults to
+            dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)
+        init_cfg (dict): Configuration of initialization. Defaults to None.
+ """ + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 0, + roi_feat_size: int = 7, + in_channels: int = 256, + conv_out_channels: int = 256, + with_avg_pool: bool = False, + fc_out_channels: int = 1024, + conv_cfg: Optional[dict] = None, + norm_cfg: Optional[dict] = None, + loss_match: dict = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + init_cfg: Optional[dict] = None, + **kwargs): + super(RoIEmbedHead, self).__init__(init_cfg=init_cfg) + self.num_convs = num_convs + self.num_fcs = num_fcs + self.roi_feat_size = _pair(roi_feat_size) + self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.with_avg_pool = with_avg_pool + self.fc_out_channels = fc_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.loss_match = MODELS.build(loss_match) + self.fp16_enabled = False + + if self.with_avg_pool: + self.avg_pool = nn.AvgPool2d(self.roi_feat_size) + # add convs and fcs + self.convs, self.fcs, self.last_layer_dim = self._add_conv_fc_branch( + self.num_convs, self.num_fcs, self.in_channels) + self.relu = nn.ReLU(inplace=True) + + def _add_conv_fc_branch( + self, num_branch_convs: int, num_branch_fcs: int, + in_channels: int) -> Tuple[nn.ModuleList, nn.ModuleList, int]: + """Add shared or separable branch. + + convs -> avg pool (optional) -> fcs + """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, + self.conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + last_layer_dim = self.conv_out_channels + + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + if not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + + return branch_convs, branch_fcs, last_layer_dim + + @property + def custom_activation(self): + return getattr(self.loss_match, 'custom_activation', False) + + def extract_feat(self, x: Tensor, + num_x_per_img: List[int]) -> Tuple[Tensor]: + """Extract feature from the input `x`, and split the output to a list. + + Args: + x (Tensor): of shape [N, C, H, W]. N is the number of proposals. + num_x_per_img (list[int]): The `x` contains proposals of + multi-images. `num_x_per_img` denotes the number of proposals + for each image. + + Returns: + list[Tensor]: Each Tensor denotes the embed features belonging to + an image in a batch. + """ + if self.num_convs > 0: + for conv in self.convs: + x = conv(x) + + if self.num_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + x = x.flatten(1) + for fc in self.fcs: + x = self.relu(fc(x)) + else: + x = x.flatten(1) + + x_split = torch.split(x, num_x_per_img, dim=0) + return x_split + + def forward( + self, x: Tensor, ref_x: Tensor, num_x_per_img: List[int], + num_x_per_ref_img: List[int] + ) -> Tuple[Tuple[Tensor], Tuple[Tensor]]: + """Computing the similarity scores between `x` and `ref_x`. + + Args: + x (Tensor): of shape [N, C, H, W]. N is the number of key frame + proposals. + ref_x (Tensor): of shape [M, C, H, W]. 
M is the number of reference + frame proposals. + num_x_per_img (list[int]): The `x` contains proposals of + multi-images. `num_x_per_img` denotes the number of proposals + for each key image. + num_x_per_ref_img (list[int]): The `ref_x` contains proposals of + multi-images. `num_x_per_ref_img` denotes the number of + proposals for each reference image. + + Returns: + tuple[tuple[Tensor], tuple[Tensor]]: Each tuple of tensor denotes + the embed features belonging to an image in a batch. + """ + x_split = self.extract_feat(x, num_x_per_img) + ref_x_split = self.extract_feat(ref_x, num_x_per_ref_img) + + return x_split, ref_x_split + + def get_targets(self, sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor]) -> Tuple[List, List]: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of + all images in a batch, each tensor has shape (num_gt, ). + ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes + of all reference images in a batch, each tensor has shape + (num_gt, ). + + Returns: + Tuple[list[Tensor]]: Ground truth for proposals in a batch. + Containing the following list of Tensors: + + - track_id_targets (list[Tensor]): The instance ids of + Gt_labels for all proposals in a batch, each tensor in list + has shape (num_proposals,). + - track_id_weights (list[Tensor]): Labels_weights for + all proposals in a batch, each tensor in list has + shape (num_proposals,). + """ + track_id_targets = [] + track_id_weights = [] + + for res, gt_instance_id, ref_gt_instance_id in zip( + sampling_results, gt_instance_ids, ref_gt_instance_ids): + pos_instance_ids = gt_instance_id[res.pos_assigned_gt_inds] + pos_match_id = gt_instance_id.new_zeros(len(pos_instance_ids)) + for i, id in enumerate(pos_instance_ids): + if id in ref_gt_instance_id: + pos_match_id[i] = ref_gt_instance_id.tolist().index(id) + 1 + + track_id_target = gt_instance_id.new_zeros( + len(res.bboxes), dtype=torch.int64) + track_id_target[:len(res.pos_bboxes)] = pos_match_id + track_id_weight = res.bboxes.new_zeros(len(res.bboxes)) + track_id_weight[:len(res.pos_bboxes)] = 1.0 + + track_id_targets.append(track_id_target) + track_id_weights.append(track_id_weight) + + return track_id_targets, track_id_weights + + def loss( + self, + bbox_feats: Tensor, + ref_bbox_feats: Tensor, + num_bbox_per_img: int, + num_bbox_per_ref_img: int, + sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor], + reduction_override: Optional[str] = None, + ) -> dict: + """Calculate the loss in a batch. + + Args: + bbox_feats (Tensor): of shape [N, C, H, W]. N is the number of + bboxes. + ref_bbox_feats (Tensor): of shape [M, C, H, W]. M is the number of + reference bboxes. + num_bbox_per_img (list[int]): The `bbox_feats` contains proposals + of multi-images. `num_bbox_per_img` denotes the number of + proposals for each key image. + num_bbox_per_ref_img (list[int]): The `ref_bbox_feats` contains + proposals of multi-images. `num_bbox_per_ref_img` denotes the + number of proposals for each reference image. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. 
+ gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of + all images in a batch, each tensor has shape (num_gt, ). + ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes + of all reference images in a batch, each tensor has shape + (num_gt, ). + reduction_override (str, optional): The method used to reduce the + loss. Options are "none", "mean" and "sum". + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + x_split, ref_x_split = self(bbox_feats, ref_bbox_feats, + num_bbox_per_img, num_bbox_per_ref_img) + + losses = self.loss_by_feat(x_split, ref_x_split, sampling_results, + gt_instance_ids, ref_gt_instance_ids, + reduction_override) + return losses + + def loss_by_feat(self, + x_split: Tuple[Tensor], + ref_x_split: Tuple[Tensor], + sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor], + reduction_override: Optional[str] = None) -> dict: + """Calculate losses. + + Args: + x_split (Tensor): The embed features belonging to key image. + ref_x_split (Tensor): The embed features belonging to ref image. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of + all images in a batch, each tensor has shape (num_gt, ). + ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes + of all reference images in a batch, each tensor has shape + (num_gt, ). + reduction_override (str, optional): The method used to reduce the + loss. Options are "none", "mean" and "sum". + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + track_id_targets, track_id_weights = self.get_targets( + sampling_results, gt_instance_ids, ref_gt_instance_ids) + assert isinstance(track_id_targets, list) + assert isinstance(track_id_weights, list) + assert len(track_id_weights) == len(track_id_targets) + + losses = defaultdict(list) + similarity_logits = [] + for one_x, one_ref_x in zip(x_split, ref_x_split): + similarity_logit = embed_similarity( + one_x, one_ref_x, method='dot_product') + dummy = similarity_logit.new_zeros(one_x.shape[0], 1) + similarity_logit = torch.cat((dummy, similarity_logit), dim=1) + similarity_logits.append(similarity_logit) + assert isinstance(similarity_logits, list) + assert len(similarity_logits) == len(track_id_targets) + + for similarity_logit, track_id_target, track_id_weight in zip( + similarity_logits, track_id_targets, track_id_weights): + avg_factor = max(torch.sum(track_id_target > 0).float().item(), 1.) 
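# --- Illustrative aside (not part of this patch) --------------------------
# A minimal sketch of the matching logits built above: a dummy column is
# prepended so that target 0 means "no match / new object" and target j + 1
# points at the j-th reference proposal. The sizes, embeddings and targets
# below are invented for illustration; the real head additionally applies
# per-proposal weights and an avg_factor when computing the loss.
import torch
import torch.nn.functional as F

num_key, num_ref, embed_dim = 4, 6, 256
key_embeds = torch.randn(num_key, embed_dim)
ref_embeds = torch.randn(num_ref, embed_dim)

similarity = key_embeds @ ref_embeds.t()        # dot product, (num_key, num_ref)
dummy = similarity.new_zeros(num_key, 1)
logits = torch.cat((dummy, similarity), dim=1)  # (num_key, num_ref + 1)

targets = torch.tensor([0, 3, 1, 0])            # 0 = new object, j + 1 = ref j
loss_match = F.cross_entropy(logits, targets)
# ---------------------------------------------------------------------------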
+ if similarity_logit.numel() > 0: + loss_match = self.loss_match( + similarity_logit, + track_id_target, + track_id_weight, + avg_factor=avg_factor, + reduction_override=reduction_override) + if isinstance(loss_match, dict): + for key, value in loss_match.items(): + losses[key].append(value) + else: + losses['loss_match'].append(loss_match) + + valid_index = track_id_weight > 0 + valid_similarity_logit = similarity_logit[valid_index] + valid_track_id_target = track_id_target[valid_index] + if self.custom_activation: + match_accuracy = self.loss_match.get_accuracy( + valid_similarity_logit, valid_track_id_target) + for key, value in match_accuracy.items(): + losses[key].append(value) + else: + losses['match_accuracy'].append( + accuracy(valid_similarity_logit, + valid_track_id_target)) + + for key, value in losses.items(): + losses[key] = sum(losses[key]) / len(similarity_logits) + return losses + + def predict(self, roi_feats: Tensor, + prev_roi_feats: Tensor) -> List[Tensor]: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + roi_feats (Tensor): Feature map of current images rois. + prev_roi_feats (Tensor): Feature map of previous images rois. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. + """ + x_split, ref_x_split = self(roi_feats, prev_roi_feats, + [roi_feats.shape[0]], + [prev_roi_feats.shape[0]]) + + similarity_logits = self.predict_by_feat(x_split, ref_x_split) + + return similarity_logits + + def predict_by_feat(self, x_split: Tuple[Tensor], + ref_x_split: Tuple[Tensor]) -> List[Tensor]: + """Get similarity_logits. + + Args: + x_split (Tensor): The embed features belonging to key image. + ref_x_split (Tensor): The embed features belonging to ref image. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. + """ + similarity_logits = [] + for one_x, one_ref_x in zip(x_split, ref_x_split): + similarity_logit = embed_similarity( + one_x, one_ref_x, method='dot_product') + dummy = similarity_logit.new_zeros(one_x.shape[0], 1) + similarity_logit = torch.cat((dummy, similarity_logit), dim=1) + similarity_logits.append(similarity_logit) + return similarity_logits diff --git a/mmdetection/mmdet/models/tracking_heads/roi_track_head.py b/mmdetection/mmdet/models/tracking_heads/roi_track_head.py new file mode 100644 index 0000000..c51c810 --- /dev/null +++ b/mmdetection/mmdet/models/tracking_heads/roi_track_head.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta +from typing import List, Optional, Tuple + +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import TrackSampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList + + +@MODELS.register_module() +class RoITrackHead(BaseModule, metaclass=ABCMeta): + """The roi track head. + + This module is used in multi-object tracking methods, such as MaskTrack + R-CNN. + + Args: + roi_extractor (dict): Configuration of roi extractor. Defaults to None. + embed_head (dict): Configuration of embed head. Defaults to None. + train_cfg (dict): Configuration when training. Defaults to None. + test_cfg (dict): Configuration when testing. Defaults to None. + init_cfg (dict): Configuration of initialization. Defaults to None. 
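+
+    Example:
+        A minimal, illustrative config (the extractor and embed-head
+        settings below are assumptions for demonstration, not shipped
+        defaults):
+
+        >>> track_head_cfg = dict(
+        ...     type='RoITrackHead',
+        ...     roi_extractor=dict(
+        ...         type='SingleRoIExtractor',
+        ...         roi_layer=dict(type='RoIAlign', output_size=7),
+        ...         out_channels=256,
+        ...         featmap_strides=[4, 8, 16, 32]),
+        ...     embed_head=dict(
+        ...         type='RoIEmbedHead',
+        ...         num_fcs=2,
+        ...         roi_feat_size=7,
+        ...         in_channels=256,
+        ...         fc_out_channels=1024))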
+ """ + + def __init__(self, + roi_extractor: Optional[dict] = None, + embed_head: Optional[dict] = None, + regress_head: Optional[dict] = None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + init_cfg: Optional[dict] = None, + *args, + **kwargs): + super().__init__(init_cfg=init_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if embed_head is not None: + self.init_embed_head(roi_extractor, embed_head) + + if regress_head is not None: + raise NotImplementedError('Regression head is not supported yet.') + + self.init_assigner_sampler() + + def init_embed_head(self, roi_extractor, embed_head) -> None: + """Initialize ``embed_head``""" + self.roi_extractor = MODELS.build(roi_extractor) + self.embed_head = MODELS.build(embed_head) + + def init_assigner_sampler(self) -> None: + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.bbox_sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + + @property + def with_track(self) -> bool: + """bool: whether the multi-object tracker has an embed head""" + return hasattr(self, 'embed_head') and self.embed_head is not None + + def extract_roi_feats( + self, feats: List[Tensor], + bboxes: List[Tensor]) -> Tuple[Tuple[Tensor], List[int]]: + """Extract roi features. + + Args: + feats (list[Tensor]): list of multi-level image features. + bboxes (list[Tensor]): list of bboxes in sampling result. + + Returns: + tuple[tuple[Tensor], list[int]]: The extracted roi features and + the number of bboxes in each image. + """ + rois = bbox2roi(bboxes) + bbox_feats = self.roi_extractor(feats[:self.roi_extractor.num_inputs], + rois) + num_bbox_per_img = [len(bbox) for bbox in bboxes] + return bbox_feats, num_bbox_per_img + + def loss(self, key_feats: List[Tensor], ref_feats: List[Tensor], + rpn_results_list: InstanceList, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + key_feats (list[Tensor]): list of multi-level image features. + ref_feats (list[Tensor]): list of multi-level ref_img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. 
+ """ + assert self.with_track + batch_gt_instances = [] + ref_batch_gt_instances = [] + batch_gt_instances_ignore = [] + gt_instance_ids = [] + ref_gt_instance_ids = [] + for track_data_sample in data_samples: + key_data_sample = track_data_sample.get_key_frames()[0] + ref_data_sample = track_data_sample.get_ref_frames()[0] + batch_gt_instances.append(key_data_sample.gt_instances) + ref_batch_gt_instances.append(ref_data_sample.gt_instances) + if 'ignored_instances' in key_data_sample: + batch_gt_instances_ignore.append( + key_data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + + gt_instance_ids.append(key_data_sample.gt_instances.instances_ids) + ref_gt_instance_ids.append( + ref_data_sample.gt_instances.instances_ids) + + losses = dict() + num_imgs = len(data_samples) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + sampling_results = [] + for i in range(num_imgs): + rpn_results = rpn_results_list[i] + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in key_feats]) + sampling_results.append(sampling_result) + + bboxes = [res.bboxes for res in sampling_results] + bbox_feats, num_bbox_per_img = self.extract_roi_feats( + key_feats, bboxes) + + # batch_size is 1 + ref_gt_bboxes = [ + ref_batch_gt_instance.bboxes + for ref_batch_gt_instance in ref_batch_gt_instances + ] + ref_bbox_feats, num_bbox_per_ref_img = self.extract_roi_feats( + ref_feats, ref_gt_bboxes) + + loss_track = self.embed_head.loss(bbox_feats, ref_bbox_feats, + num_bbox_per_img, + num_bbox_per_ref_img, + sampling_results, gt_instance_ids, + ref_gt_instance_ids) + losses.update(loss_track) + + return losses + + def predict(self, roi_feats: Tensor, + prev_roi_feats: Tensor) -> List[Tensor]: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + roi_feats (Tensor): Feature map of current images rois. + prev_roi_feats (Tensor): Feature map of previous images rois. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. + """ + return self.embed_head.predict(roi_feats, prev_roi_feats)[0] diff --git a/mmdetection/mmdet/models/utils/__init__.py b/mmdetection/mmdet/models/utils/__init__.py new file mode 100644 index 0000000..a00d9a3 --- /dev/null +++ b/mmdetection/mmdet/models/utils/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .gaussian_target import (gather_feat, gaussian_radius, + gen_gaussian_target, get_local_maximum, + get_topk_from_heatmap, transpose_and_gather_feat) +from .image import imrenormalize +from .make_divisible import make_divisible +# Disable yapf because it conflicts with isort. 
+# yapf: disable +from .misc import (align_tensor, aligned_bilinear, center_of_mass, + empty_instances, filter_gt_instances, + filter_scores_and_topk, flip_tensor, generate_coordinate, + images_to_levels, interpolate_as, levels_to_images, + mask2ndarray, multi_apply, relative_coordinate_maps, + rename_loss_dict, reweight_loss_dict, + samplelist_boxtype2tensor, select_single_mlvl, + sigmoid_geometric_mean, unfold_wo_center, unmap, + unpack_gt_instances) +from .panoptic_gt_processing import preprocess_panoptic_gt +from .point_sample import (get_uncertain_point_coords_with_randomness, + get_uncertainty) +from .vlfuse_helper import BertEncoderLayer, VLFuse, permute_and_flatten +from .wbf import weighted_boxes_fusion + +__all__ = [ + 'gaussian_radius', 'gen_gaussian_target', 'make_divisible', + 'get_local_maximum', 'get_topk_from_heatmap', 'transpose_and_gather_feat', + 'interpolate_as', 'sigmoid_geometric_mean', 'gather_feat', + 'preprocess_panoptic_gt', 'get_uncertain_point_coords_with_randomness', + 'get_uncertainty', 'unpack_gt_instances', 'empty_instances', + 'center_of_mass', 'filter_scores_and_topk', 'flip_tensor', + 'generate_coordinate', 'levels_to_images', 'mask2ndarray', 'multi_apply', + 'select_single_mlvl', 'unmap', 'images_to_levels', + 'samplelist_boxtype2tensor', 'filter_gt_instances', 'rename_loss_dict', + 'reweight_loss_dict', 'relative_coordinate_maps', 'aligned_bilinear', + 'unfold_wo_center', 'imrenormalize', 'VLFuse', 'permute_and_flatten', + 'BertEncoderLayer', 'align_tensor', 'weighted_boxes_fusion' +] diff --git a/mmdetection/mmdet/models/utils/gaussian_target.py b/mmdetection/mmdet/models/utils/gaussian_target.py new file mode 100644 index 0000000..5bf4d55 --- /dev/null +++ b/mmdetection/mmdet/models/utils/gaussian_target.py @@ -0,0 +1,268 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from math import sqrt + +import torch +import torch.nn.functional as F + + +def gaussian2D(radius, sigma=1, dtype=torch.float32, device='cpu'): + """Generate 2D gaussian kernel. + + Args: + radius (int): Radius of gaussian kernel. + sigma (int): Sigma of gaussian function. Default: 1. + dtype (torch.dtype): Dtype of gaussian tensor. Default: torch.float32. + device (str): Device of gaussian tensor. Default: 'cpu'. + + Returns: + h (Tensor): Gaussian kernel with a + ``(2 * radius + 1) * (2 * radius + 1)`` shape. + """ + x = torch.arange( + -radius, radius + 1, dtype=dtype, device=device).view(1, -1) + y = torch.arange( + -radius, radius + 1, dtype=dtype, device=device).view(-1, 1) + + h = (-(x * x + y * y) / (2 * sigma * sigma)).exp() + + h[h < torch.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def gen_gaussian_target(heatmap, center, radius, k=1): + """Generate 2D gaussian heatmap. + + Args: + heatmap (Tensor): Input heatmap, the gaussian kernel will cover on + it and maintain the max value. + center (list[int]): Coord of gaussian kernel's center. + radius (int): Radius of gaussian kernel. + k (int): Coefficient of gaussian kernel. Default: 1. + + Returns: + out_heatmap (Tensor): Updated heatmap covered by gaussian kernel. 
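+
+    Example:
+        A minimal sketch (sizes chosen only for illustration):
+
+        >>> import torch
+        >>> heatmap = torch.zeros((10, 10))
+        >>> out = gen_gaussian_target(heatmap, center=[5, 5], radius=2)
+        >>> # the kernel peaks at the given center with value 1.0
+        >>> float(out[5, 5])
+        1.0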
+ """ + diameter = 2 * radius + 1 + gaussian_kernel = gaussian2D( + radius, sigma=diameter / 6, dtype=heatmap.dtype, device=heatmap.device) + + x, y = center + + height, width = heatmap.shape[:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian_kernel[radius - top:radius + bottom, + radius - left:radius + right] + out_heatmap = heatmap + torch.max( + masked_heatmap, + masked_gaussian * k, + out=out_heatmap[y - top:y + bottom, x - left:x + right]) + + return out_heatmap + + +def gaussian_radius(det_size, min_overlap): + r"""Generate 2D gaussian radius. + + This function is modified from the `official github repo + `_. + + Given ``min_overlap``, radius could computed by a quadratic equation + according to Vieta's formulas. + + There are 3 cases for computing gaussian radius, details are following: + + - Explanation of figure: ``lt`` and ``br`` indicates the left-top and + bottom-right corner of ground truth box. ``x`` indicates the + generated corner at the limited position when ``radius=r``. + + - Case1: one corner is inside the gt box and the other is outside. + + .. code:: text + + |< width >| + + lt-+----------+ - + | | | ^ + +--x----------+--+ + | | | | + | | | | height + | | overlap | | + | | | | + | | | | v + +--+---------br--+ - + | | | + +----------+--x + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{(w-r)*(h-r)}{w*h+(w+h)r-r^2} \ge {iou} \quad\Rightarrow\quad + {r^2-(w+h)r+\cfrac{1-iou}{1+iou}*w*h} \ge 0 \\ + {a} = 1,\quad{b} = {-(w+h)},\quad{c} = {\cfrac{1-iou}{1+iou}*w*h} + {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a} + + - Case2: both two corners are inside the gt box. + + .. code:: text + + |< width >| + + lt-+----------+ - + | | | ^ + +--x-------+ | + | | | | + | |overlap| | height + | | | | + | +-------x--+ + | | | v + +----------+-br - + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{(w-2*r)*(h-2*r)}{w*h} \ge {iou} \quad\Rightarrow\quad + {4r^2-2(w+h)r+(1-iou)*w*h} \ge 0 \\ + {a} = 4,\quad {b} = {-2(w+h)},\quad {c} = {(1-iou)*w*h} + {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a} + + - Case3: both two corners are outside the gt box. + + .. code:: text + + |< width >| + + x--+----------------+ + | | | + +-lt-------------+ | - + | | | | ^ + | | | | + | | overlap | | height + | | | | + | | | | v + | +------------br--+ - + | | | + +----------------+--x + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{w*h}{(w+2*r)*(h+2*r)} \ge {iou} \quad\Rightarrow\quad + {4*iou*r^2+2*iou*(w+h)r+(iou-1)*w*h} \le 0 \\ + {a} = {4*iou},\quad {b} = {2*iou*(w+h)},\quad {c} = {(iou-1)*w*h} \\ + {r} \le \cfrac{-b+\sqrt{b^2-4*a*c}}{2*a} + + Args: + det_size (list[int]): Shape of object. + min_overlap (float): Min IoU with ground truth for boxes generated by + keypoints inside the gaussian kernel. + + Returns: + radius (int): Radius of gaussian kernel. 
+ """ + height, width = det_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = sqrt(b1**2 - 4 * a1 * c1) + r1 = (b1 - sq1) / (2 * a1) + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = sqrt(b2**2 - 4 * a2 * c2) + r2 = (b2 - sq2) / (2 * a2) + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = sqrt(b3**2 - 4 * a3 * c3) + r3 = (b3 + sq3) / (2 * a3) + return min(r1, r2, r3) + + +def get_local_maximum(heat, kernel=3): + """Extract local maximum pixel with given kernel. + + Args: + heat (Tensor): Target heatmap. + kernel (int): Kernel size of max pooling. Default: 3. + + Returns: + heat (Tensor): A heatmap where local maximum pixels maintain its + own value and other positions are 0. + """ + pad = (kernel - 1) // 2 + hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) + keep = (hmax == heat).float() + return heat * keep + + +def get_topk_from_heatmap(scores, k=20): + """Get top k positions from heatmap. + + Args: + scores (Tensor): Target heatmap with shape + [batch, num_classes, height, width]. + k (int): Target number. Default: 20. + + Returns: + tuple[torch.Tensor]: Scores, indexes, categories and coords of + topk keypoint. Containing following Tensors: + + - topk_scores (Tensor): Max scores of each topk keypoint. + - topk_inds (Tensor): Indexes of each topk keypoint. + - topk_clses (Tensor): Categories of each topk keypoint. + - topk_ys (Tensor): Y-coord of each topk keypoint. + - topk_xs (Tensor): X-coord of each topk keypoint. + """ + batch, _, height, width = scores.size() + topk_scores, topk_inds = torch.topk(scores.view(batch, -1), k) + topk_clses = topk_inds // (height * width) + topk_inds = topk_inds % (height * width) + topk_ys = topk_inds // width + topk_xs = (topk_inds % width).int().float() + return topk_scores, topk_inds, topk_clses, topk_ys, topk_xs + + +def gather_feat(feat, ind, mask=None): + """Gather feature according to index. + + Args: + feat (Tensor): Target feature map. + ind (Tensor): Target coord index. + mask (Tensor | None): Mask of feature map. Default: None. + + Returns: + feat (Tensor): Gathered feature. + """ + dim = feat.size(2) + ind = ind.unsqueeze(2).repeat(1, 1, dim) + feat = feat.gather(1, ind) + if mask is not None: + mask = mask.unsqueeze(2).expand_as(feat) + feat = feat[mask] + feat = feat.view(-1, dim) + return feat + + +def transpose_and_gather_feat(feat, ind): + """Transpose and gather feature according to index. + + Args: + feat (Tensor): Target feature map. + ind (Tensor): Target coord index. + + Returns: + feat (Tensor): Transposed and gathered feature. + """ + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = gather_feat(feat, ind) + return feat diff --git a/mmdetection/mmdet/models/utils/image.py b/mmdetection/mmdet/models/utils/image.py new file mode 100644 index 0000000..16b5787 --- /dev/null +++ b/mmdetection/mmdet/models/utils/image.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import mmcv +import numpy as np +import torch +from torch import Tensor + + +def imrenormalize(img: Union[Tensor, np.ndarray], img_norm_cfg: dict, + new_img_norm_cfg: dict) -> Union[Tensor, np.ndarray]: + """Re-normalize the image. + + Args: + img (Tensor | ndarray): Input image. If the input is a Tensor, the + shape is (1, C, H, W). If the input is a ndarray, the shape + is (H, W, C). 
+ img_norm_cfg (dict): Original configuration for the normalization. + new_img_norm_cfg (dict): New configuration for the normalization. + + Returns: + Tensor | ndarray: Output image with the same type and shape of + the input. + """ + if isinstance(img, torch.Tensor): + assert img.ndim == 4 and img.shape[0] == 1 + new_img = img.squeeze(0).cpu().numpy().transpose(1, 2, 0) + new_img = _imrenormalize(new_img, img_norm_cfg, new_img_norm_cfg) + new_img = new_img.transpose(2, 0, 1)[None] + return torch.from_numpy(new_img).to(img) + else: + return _imrenormalize(img, img_norm_cfg, new_img_norm_cfg) + + +def _imrenormalize(img: Union[Tensor, np.ndarray], img_norm_cfg: dict, + new_img_norm_cfg: dict) -> Union[Tensor, np.ndarray]: + """Re-normalize the image.""" + img_norm_cfg = img_norm_cfg.copy() + new_img_norm_cfg = new_img_norm_cfg.copy() + for k, v in img_norm_cfg.items(): + if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray): + img_norm_cfg[k] = np.array(v, dtype=img.dtype) + # reverse cfg + if 'bgr_to_rgb' in img_norm_cfg: + img_norm_cfg['rgb_to_bgr'] = img_norm_cfg['bgr_to_rgb'] + img_norm_cfg.pop('bgr_to_rgb') + for k, v in new_img_norm_cfg.items(): + if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray): + new_img_norm_cfg[k] = np.array(v, dtype=img.dtype) + img = mmcv.imdenormalize(img, **img_norm_cfg) + img = mmcv.imnormalize(img, **new_img_norm_cfg) + return img diff --git a/mmdetection/mmdet/models/utils/make_divisible.py b/mmdetection/mmdet/models/utils/make_divisible.py new file mode 100644 index 0000000..ed42c2e --- /dev/null +++ b/mmdetection/mmdet/models/utils/make_divisible.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number to the nearest value that can be + divisible by the divisor. It is taken from the original tf repo. It ensures + that all layers have a channel number that is divisible by divisor. It can + be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa + + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. + min_ratio (float): The minimum ratio of the rounded channel number to + the original channel number. Default: 0.9. + + Returns: + int: The modified output channel number. + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). + if new_value < min_ratio * value: + new_value += divisor + return new_value diff --git a/mmdetection/mmdet/models/utils/misc.py b/mmdetection/mmdet/models/utils/misc.py new file mode 100644 index 0000000..2cf4291 --- /dev/null +++ b/mmdetection/mmdet/models/utils/misc.py @@ -0,0 +1,697 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from functools import partial +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from mmengine.structures import InstanceData +from mmengine.utils import digit_version +from six.moves import map, zip +from torch import Tensor +from torch.autograd import Function +from torch.nn import functional as F + +from mmdet.structures import SampleList +from mmdet.structures.bbox import BaseBoxes, get_box_type, stack_boxes +from mmdet.structures.mask import BitmapMasks, PolygonMasks +from mmdet.utils import OptInstanceList + + +class SigmoidGeometricMean(Function): + """Forward and backward function of geometric mean of two sigmoid + functions. + + This implementation with analytical gradient function substitutes + the autograd function of (x.sigmoid() * y.sigmoid()).sqrt(). The + original implementation incurs none during gradient backprapagation + if both x and y are very small values. + """ + + @staticmethod + def forward(ctx, x, y): + x_sigmoid = x.sigmoid() + y_sigmoid = y.sigmoid() + z = (x_sigmoid * y_sigmoid).sqrt() + ctx.save_for_backward(x_sigmoid, y_sigmoid, z) + return z + + @staticmethod + def backward(ctx, grad_output): + x_sigmoid, y_sigmoid, z = ctx.saved_tensors + grad_x = grad_output * z * (1 - x_sigmoid) / 2 + grad_y = grad_output * z * (1 - y_sigmoid) / 2 + return grad_x, grad_y + + +sigmoid_geometric_mean = SigmoidGeometricMean.apply + + +def interpolate_as(source, target, mode='bilinear', align_corners=False): + """Interpolate the `source` to the shape of the `target`. + + The `source` must be a Tensor, but the `target` can be a Tensor or a + np.ndarray with the shape (..., target_h, target_w). + + Args: + source (Tensor): A 3D/4D Tensor with the shape (N, H, W) or + (N, C, H, W). + target (Tensor | np.ndarray): The interpolation target with the shape + (..., target_h, target_w). + mode (str): Algorithm used for interpolation. The options are the + same as those in F.interpolate(). Default: ``'bilinear'``. + align_corners (bool): The same as the argument in F.interpolate(). + + Returns: + Tensor: The interpolated source Tensor. + """ + assert len(target.shape) >= 2 + + def _interpolate_as(source, target, mode='bilinear', align_corners=False): + """Interpolate the `source` (4D) to the shape of the `target`.""" + target_h, target_w = target.shape[-2:] + source_h, source_w = source.shape[-2:] + if target_h != source_h or target_w != source_w: + source = F.interpolate( + source, + size=(target_h, target_w), + mode=mode, + align_corners=align_corners) + return source + + if len(source.shape) == 3: + source = source[:, None, :, :] + source = _interpolate_as(source, target, mode, align_corners) + return source[:, 0, :, :] + else: + return _interpolate_as(source, target, mode, align_corners) + + +def unpack_gt_instances(batch_data_samples: SampleList) -> tuple: + """Unpack ``gt_instances``, ``gt_instances_ignore`` and ``img_metas`` based + on ``batch_data_samples`` + + Args: + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + tuple: + + - batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + - batch_gt_instances_ignore (list[:obj:`InstanceData`]): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. 
+ - batch_img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + """ + batch_gt_instances = [] + batch_gt_instances_ignore = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + if 'ignored_instances' in data_sample: + batch_gt_instances_ignore.append(data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + + return batch_gt_instances, batch_gt_instances_ignore, batch_img_metas + + +def empty_instances(batch_img_metas: List[dict], + device: torch.device, + task_type: str, + instance_results: OptInstanceList = None, + mask_thr_binary: Union[int, float] = 0, + box_type: Union[str, type] = 'hbox', + use_box_type: bool = False, + num_classes: int = 80, + score_per_cls: bool = False) -> List[InstanceData]: + """Handle predicted instances when RoI is empty. + + Note: If ``instance_results`` is not None, it will be modified + in place internally, and then return ``instance_results`` + + Args: + batch_img_metas (list[dict]): List of image information. + device (torch.device): Device of tensor. + task_type (str): Expected returned task type. it currently + supports bbox and mask. + instance_results (list[:obj:`InstanceData`]): List of instance + results. + mask_thr_binary (int, float): mask binarization threshold. + Defaults to 0. + box_type (str or type): The empty box type. Defaults to `hbox`. + use_box_type (bool): Whether to warp boxes with the box type. + Defaults to False. + num_classes (int): num_classes of bbox_head. Defaults to 80. + score_per_cls (bool): Whether to generate classwise score for + the empty instance. ``score_per_cls`` will be True when the model + needs to produce raw results without nms. Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + """ + assert task_type in ('bbox', 'mask'), 'Only support bbox and mask,' \ + f' but got {task_type}' + + if instance_results is not None: + assert len(instance_results) == len(batch_img_metas) + + results_list = [] + for img_id in range(len(batch_img_metas)): + if instance_results is not None: + results = instance_results[img_id] + assert isinstance(results, InstanceData) + else: + results = InstanceData() + + if task_type == 'bbox': + _, box_type = get_box_type(box_type) + bboxes = torch.zeros(0, box_type.box_dim, device=device) + if use_box_type: + bboxes = box_type(bboxes, clone=False) + results.bboxes = bboxes + score_shape = (0, num_classes + 1) if score_per_cls else (0, ) + results.scores = torch.zeros(score_shape, device=device) + results.labels = torch.zeros((0, ), + device=device, + dtype=torch.long) + else: + # TODO: Handle the case where rescale is false + img_h, img_w = batch_img_metas[img_id]['ori_shape'][:2] + # the type of `im_mask` will be torch.bool or torch.uint8, + # where uint8 if for visualization and debugging. + im_mask = torch.zeros( + 0, + img_h, + img_w, + device=device, + dtype=torch.bool if mask_thr_binary >= 0 else torch.uint8) + results.masks = im_mask + results_list.append(results) + return results_list + + +def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. 
+ + Args: + func (Function): A function that will be applied to a list of + arguments + + Returns: + tuple(list): A tuple containing multiple list, each list contains \ + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def unmap(data, count, inds, fill=0): + """Unmap a subset of item (data) back to the original set of items (of size + count)""" + if data.dim() == 1: + ret = data.new_full((count, ), fill) + ret[inds.type(torch.bool)] = data + else: + new_size = (count, ) + data.size()[1:] + ret = data.new_full(new_size, fill) + ret[inds.type(torch.bool), :] = data + return ret + + +def mask2ndarray(mask): + """Convert Mask to ndarray.. + + Args: + mask (:obj:`BitmapMasks` or :obj:`PolygonMasks` or + torch.Tensor or np.ndarray): The mask to be converted. + + Returns: + np.ndarray: Ndarray mask of shape (n, h, w) that has been converted + """ + if isinstance(mask, (BitmapMasks, PolygonMasks)): + mask = mask.to_ndarray() + elif isinstance(mask, torch.Tensor): + mask = mask.detach().cpu().numpy() + elif not isinstance(mask, np.ndarray): + raise TypeError(f'Unsupported {type(mask)} data type') + return mask + + +def flip_tensor(src_tensor, flip_direction): + """flip tensor base on flip_direction. + + Args: + src_tensor (Tensor): input feature map, shape (B, C, H, W). + flip_direction (str): The flipping direction. Options are + 'horizontal', 'vertical', 'diagonal'. + + Returns: + out_tensor (Tensor): Flipped tensor. + """ + assert src_tensor.ndim == 4 + valid_directions = ['horizontal', 'vertical', 'diagonal'] + assert flip_direction in valid_directions + if flip_direction == 'horizontal': + out_tensor = torch.flip(src_tensor, [3]) + elif flip_direction == 'vertical': + out_tensor = torch.flip(src_tensor, [2]) + else: + out_tensor = torch.flip(src_tensor, [2, 3]) + return out_tensor + + +def select_single_mlvl(mlvl_tensors, batch_id, detach=True): + """Extract a multi-scale single image tensor from a multi-scale batch + tensor based on batch index. + + Note: The default value of detach is True, because the proposal gradient + needs to be detached during the training of the two-stage model. E.g + Cascade Mask R-CNN. + + Args: + mlvl_tensors (list[Tensor]): Batch tensor for all scale levels, + each is a 4D-tensor. + batch_id (int): Batch index. + detach (bool): Whether detach gradient. Default True. + + Returns: + list[Tensor]: Multi-scale single image tensor. + """ + assert isinstance(mlvl_tensors, (list, tuple)) + num_levels = len(mlvl_tensors) + + if detach: + mlvl_tensor_list = [ + mlvl_tensors[i][batch_id].detach() for i in range(num_levels) + ] + else: + mlvl_tensor_list = [ + mlvl_tensors[i][batch_id] for i in range(num_levels) + ] + return mlvl_tensor_list + + +def filter_scores_and_topk(scores, score_thr, topk, results=None): + """Filter results using score threshold and topk candidates. + + Args: + scores (Tensor): The scores, shape (num_bboxes, K). + score_thr (float): The score filter threshold. + topk (int): The number of topk candidates. + results (dict or list or Tensor, Optional): The results to + which the filtering rule is to be applied. The shape + of each item is (num_bboxes, N). + + Returns: + tuple: Filtered results + + - scores (Tensor): The scores after being filtered, \ + shape (num_bboxes_filtered, ). + - labels (Tensor): The class labels, shape \ + (num_bboxes_filtered, ). 
+ - anchor_idxs (Tensor): The anchor indexes, shape \ + (num_bboxes_filtered, ). + - filtered_results (dict or list or Tensor, Optional): \ + The filtered results. The shape of each item is \ + (num_bboxes_filtered, N). + """ + valid_mask = scores > score_thr + scores = scores[valid_mask] + valid_idxs = torch.nonzero(valid_mask) + + num_topk = min(topk, valid_idxs.size(0)) + # torch.sort is actually faster than .topk (at least on GPUs) + scores, idxs = scores.sort(descending=True) + scores = scores[:num_topk] + topk_idxs = valid_idxs[idxs[:num_topk]] + keep_idxs, labels = topk_idxs.unbind(dim=1) + + filtered_results = None + if results is not None: + if isinstance(results, dict): + filtered_results = {k: v[keep_idxs] for k, v in results.items()} + elif isinstance(results, list): + filtered_results = [result[keep_idxs] for result in results] + elif isinstance(results, torch.Tensor): + filtered_results = results[keep_idxs] + else: + raise NotImplementedError(f'Only supports dict or list or Tensor, ' + f'but get {type(results)}.') + return scores, labels, keep_idxs, filtered_results + + +def center_of_mass(mask, esp=1e-6): + """Calculate the centroid coordinates of the mask. + + Args: + mask (Tensor): The mask to be calculated, shape (h, w). + esp (float): Avoid dividing by zero. Default: 1e-6. + + Returns: + tuple[Tensor]: the coordinates of the center point of the mask. + + - center_h (Tensor): the center point of the height. + - center_w (Tensor): the center point of the width. + """ + h, w = mask.shape + grid_h = torch.arange(h, device=mask.device)[:, None] + grid_w = torch.arange(w, device=mask.device) + normalizer = mask.sum().float().clamp(min=esp) + center_h = (mask * grid_h).sum() / normalizer + center_w = (mask * grid_w).sum() / normalizer + return center_h, center_w + + +def generate_coordinate(featmap_sizes, device='cuda'): + """Generate the coordinate. + + Args: + featmap_sizes (tuple): The feature to be calculated, + of shape (N, C, W, H). + device (str): The device where the feature will be put on. + Returns: + coord_feat (Tensor): The coordinate feature, of shape (N, 2, W, H). + """ + + x_range = torch.linspace(-1, 1, featmap_sizes[-1], device=device) + y_range = torch.linspace(-1, 1, featmap_sizes[-2], device=device) + y, x = torch.meshgrid(y_range, x_range) + y = y.expand([featmap_sizes[0], 1, -1, -1]) + x = x.expand([featmap_sizes[0], 1, -1, -1]) + coord_feat = torch.cat([x, y], 1) + + return coord_feat + + +def levels_to_images(mlvl_tensor: List[torch.Tensor]) -> List[torch.Tensor]: + """Concat multi-level feature maps by image. + + [feature_level0, feature_level1...] -> [feature_image0, feature_image1...] + Convert the shape of each element in mlvl_tensor from (N, C, H, W) to + (N, H*W , C), then split the element to N elements with shape (H*W, C), and + concat elements in same image of all level along first dimension. + + Args: + mlvl_tensor (list[Tensor]): list of Tensor which collect from + corresponding level. 
Each element is of shape (N, C, H, W) + + Returns: + list[Tensor]: A list that contains N tensors and each tensor is + of shape (num_elements, C) + """ + batch_size = mlvl_tensor[0].size(0) + batch_list = [[] for _ in range(batch_size)] + channels = mlvl_tensor[0].size(1) + for t in mlvl_tensor: + t = t.permute(0, 2, 3, 1) + t = t.view(batch_size, -1, channels).contiguous() + for img in range(batch_size): + batch_list[img].append(t[img]) + return [torch.cat(item, 0) for item in batch_list] + + +def images_to_levels(target, num_levels): + """Convert targets by image to targets by feature level. + + [target_img0, target_img1] -> [target_level0, target_level1, ...] + """ + target = stack_boxes(target, 0) + level_targets = [] + start = 0 + for n in num_levels: + end = start + n + # level_targets.append(target[:, start:end].squeeze(0)) + level_targets.append(target[:, start:end]) + start = end + return level_targets + + +def samplelist_boxtype2tensor(batch_data_samples: SampleList) -> SampleList: + for data_samples in batch_data_samples: + if 'gt_instances' in data_samples: + bboxes = data_samples.gt_instances.get('bboxes', None) + if isinstance(bboxes, BaseBoxes): + data_samples.gt_instances.bboxes = bboxes.tensor + if 'pred_instances' in data_samples: + bboxes = data_samples.pred_instances.get('bboxes', None) + if isinstance(bboxes, BaseBoxes): + data_samples.pred_instances.bboxes = bboxes.tensor + if 'ignored_instances' in data_samples: + bboxes = data_samples.ignored_instances.get('bboxes', None) + if isinstance(bboxes, BaseBoxes): + data_samples.ignored_instances.bboxes = bboxes.tensor + + +_torch_version_div_indexing = ( + 'parrots' not in torch.__version__ + and digit_version(torch.__version__) >= digit_version('1.8')) + + +def floordiv(dividend, divisor, rounding_mode='trunc'): + if _torch_version_div_indexing: + return torch.div(dividend, divisor, rounding_mode=rounding_mode) + else: + return dividend // divisor + + +def _filter_gt_instances_by_score(batch_data_samples: SampleList, + score_thr: float) -> SampleList: + """Filter ground truth (GT) instances by score. + + Args: + batch_data_samples (SampleList): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + score_thr (float): The score filter threshold. + + Returns: + SampleList: The Data Samples filtered by score. + """ + for data_samples in batch_data_samples: + assert 'scores' in data_samples.gt_instances, \ + 'there does not exit scores in instances' + if data_samples.gt_instances.bboxes.shape[0] > 0: + data_samples.gt_instances = data_samples.gt_instances[ + data_samples.gt_instances.scores > score_thr] + return batch_data_samples + + +def _filter_gt_instances_by_size(batch_data_samples: SampleList, + wh_thr: tuple) -> SampleList: + """Filter ground truth (GT) instances by size. + + Args: + batch_data_samples (SampleList): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + wh_thr (tuple): Minimum width and height of bbox. + + Returns: + SampleList: The Data Samples filtered by score. 
+ """ + for data_samples in batch_data_samples: + bboxes = data_samples.gt_instances.bboxes + if bboxes.shape[0] > 0: + w = bboxes[:, 2] - bboxes[:, 0] + h = bboxes[:, 3] - bboxes[:, 1] + data_samples.gt_instances = data_samples.gt_instances[ + (w > wh_thr[0]) & (h > wh_thr[1])] + return batch_data_samples + + +def filter_gt_instances(batch_data_samples: SampleList, + score_thr: float = None, + wh_thr: tuple = None): + """Filter ground truth (GT) instances by score and/or size. + + Args: + batch_data_samples (SampleList): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + score_thr (float): The score filter threshold. + wh_thr (tuple): Minimum width and height of bbox. + + Returns: + SampleList: The Data Samples filtered by score and/or size. + """ + + if score_thr is not None: + batch_data_samples = _filter_gt_instances_by_score( + batch_data_samples, score_thr) + if wh_thr is not None: + batch_data_samples = _filter_gt_instances_by_size( + batch_data_samples, wh_thr) + return batch_data_samples + + +def rename_loss_dict(prefix: str, losses: dict) -> dict: + """Rename the key names in loss dict by adding a prefix. + + Args: + prefix (str): The prefix for loss components. + losses (dict): A dictionary of loss components. + + Returns: + dict: A dictionary of loss components with prefix. + """ + return {prefix + k: v for k, v in losses.items()} + + +def reweight_loss_dict(losses: dict, weight: float) -> dict: + """Reweight losses in the dict by weight. + + Args: + losses (dict): A dictionary of loss components. + weight (float): Weight for loss components. + + Returns: + dict: A dictionary of weighted loss components. + """ + for name, loss in losses.items(): + if 'loss' in name: + if isinstance(loss, Sequence): + losses[name] = [item * weight for item in loss] + else: + losses[name] = loss * weight + return losses + + +def relative_coordinate_maps( + locations: Tensor, + centers: Tensor, + strides: Tensor, + size_of_interest: int, + feat_sizes: Tuple[int], +) -> Tensor: + """Generate the relative coordinate maps with feat_stride. + + Args: + locations (Tensor): The prior location of mask feature map. + It has shape (num_priors, 2). + centers (Tensor): The prior points of a object in + all feature pyramid. It has shape (num_pos, 2) + strides (Tensor): The prior strides of a object in + all feature pyramid. It has shape (num_pos, 1) + size_of_interest (int): The size of the region used in rel coord. + feat_sizes (Tuple[int]): The feature size H and W, which has 2 dims. + Returns: + rel_coord_feat (Tensor): The coordinate feature + of shape (num_pos, 2, H, W). 
+ """ + + H, W = feat_sizes + rel_coordinates = centers.reshape(-1, 1, 2) - locations.reshape(1, -1, 2) + rel_coordinates = rel_coordinates.permute(0, 2, 1).float() + rel_coordinates = rel_coordinates / ( + strides[:, None, None] * size_of_interest) + return rel_coordinates.reshape(-1, 2, H, W) + + +def aligned_bilinear(tensor: Tensor, factor: int) -> Tensor: + """aligned bilinear, used in original implement in CondInst: + + https://github.com/aim-uofa/AdelaiDet/blob/\ + c0b2092ce72442b0f40972f7c6dda8bb52c46d16/adet/utils/comm.py#L23 + """ + + assert tensor.dim() == 4 + assert factor >= 1 + assert int(factor) == factor + + if factor == 1: + return tensor + + h, w = tensor.size()[2:] + tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode='replicate') + oh = factor * h + 1 + ow = factor * w + 1 + tensor = F.interpolate( + tensor, size=(oh, ow), mode='bilinear', align_corners=True) + tensor = F.pad( + tensor, pad=(factor // 2, 0, factor // 2, 0), mode='replicate') + + return tensor[:, :, :oh - 1, :ow - 1] + + +def unfold_wo_center(x, kernel_size: int, dilation: int) -> Tensor: + """unfold_wo_center, used in original implement in BoxInst: + + https://github.com/aim-uofa/AdelaiDet/blob/\ + 4a3a1f7372c35b48ebf5f6adc59f135a0fa28d60/\ + adet/modeling/condinst/condinst.py#L53 + """ + assert x.dim() == 4 + assert kernel_size % 2 == 1 + + # using SAME padding + padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 + unfolded_x = F.unfold( + x, kernel_size=kernel_size, padding=padding, dilation=dilation) + unfolded_x = unfolded_x.reshape( + x.size(0), x.size(1), -1, x.size(2), x.size(3)) + # remove the center pixels + size = kernel_size**2 + unfolded_x = torch.cat( + (unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:]), + dim=2) + + return unfolded_x + + +def padding_to(input_tensor: Tensor, max_len: int = 300) -> Tensor: + """Pad the first dimension of `input_tensor` to `max_len`. + + Args: + input_tensor (Tensor): The tensor to be padded, + max_len (int): Padding target size in the first dimension. + Default: 300 + https://github.com/jshilong/DDQ/blob/ddq_detr/projects/models/utils.py#L19 + Returns: + Tensor: The tensor padded with the first dimension size `max_len`. + """ + if max_len is None: + return input_tensor + num_padding = max_len - len(input_tensor) + if input_tensor.dim() > 1: + padding = input_tensor.new_zeros( + num_padding, *input_tensor.size()[1:], dtype=input_tensor.dtype) + else: + padding = input_tensor.new_zeros(num_padding, dtype=input_tensor.dtype) + output_tensor = torch.cat([input_tensor, padding], dim=0) + return output_tensor + + +def align_tensor(inputs: List[Tensor], + max_len: Optional[int] = None) -> Tensor: + """Pad each input to `max_len`, then stack them. If `max_len` is None, then + it is the max size of the first dimension of each input. + + https://github.com/jshilong/DDQ/blob/ddq_detr/projects/models/\ + utils.py#L12 + + Args: + inputs (list[Tensor]): The tensors to be padded, + Each input should have the same shape except the first dimension. + max_len (int): Padding target size in the first dimension. + Default: None + Returns: + Tensor: Stacked inputs after padding in the first dimension. 
+ """ + if max_len is None: + max_len = max([len(item) for item in inputs]) + + return torch.stack([padding_to(item, max_len) for item in inputs]) diff --git a/mmdetection/mmdet/models/utils/panoptic_gt_processing.py b/mmdetection/mmdet/models/utils/panoptic_gt_processing.py new file mode 100644 index 0000000..7a3bc95 --- /dev/null +++ b/mmdetection/mmdet/models/utils/panoptic_gt_processing.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from torch import Tensor + + +def preprocess_panoptic_gt(gt_labels: Tensor, gt_masks: Tensor, + gt_semantic_seg: Tensor, num_things: int, + num_stuff: int) -> Tuple[Tensor, Tensor]: + """Preprocess the ground truth for a image. + + Args: + gt_labels (Tensor): Ground truth labels of each bbox, + with shape (num_gts, ). + gt_masks (BitmapMasks): Ground truth masks of each instances + of a image, shape (num_gts, h, w). + gt_semantic_seg (Tensor | None): Ground truth of semantic + segmentation with the shape (1, h, w). + [0, num_thing_class - 1] means things, + [num_thing_class, num_class-1] means stuff, + 255 means VOID. It's None when training instance segmentation. + + Returns: + tuple[Tensor, Tensor]: a tuple containing the following targets. + + - labels (Tensor): Ground truth class indices for a + image, with shape (n, ), n is the sum of number + of stuff type and number of instance in a image. + - masks (Tensor): Ground truth mask for a image, with + shape (n, h, w). Contains stuff and things when training + panoptic segmentation, and things only when training + instance segmentation. + """ + num_classes = num_things + num_stuff + things_masks = gt_masks.to_tensor( + dtype=torch.bool, device=gt_labels.device) + + if gt_semantic_seg is None: + masks = things_masks.long() + return gt_labels, masks + + things_labels = gt_labels + gt_semantic_seg = gt_semantic_seg.squeeze(0) + + semantic_labels = torch.unique( + gt_semantic_seg, + sorted=False, + return_inverse=False, + return_counts=False) + stuff_masks_list = [] + stuff_labels_list = [] + for label in semantic_labels: + if label < num_things or label >= num_classes: + continue + stuff_mask = gt_semantic_seg == label + stuff_masks_list.append(stuff_mask) + stuff_labels_list.append(label) + + if len(stuff_masks_list) > 0: + stuff_masks = torch.stack(stuff_masks_list, dim=0) + stuff_labels = torch.stack(stuff_labels_list, dim=0) + labels = torch.cat([things_labels, stuff_labels], dim=0) + masks = torch.cat([things_masks, stuff_masks], dim=0) + else: + labels = things_labels + masks = things_masks + + masks = masks.long() + return labels, masks diff --git a/mmdetection/mmdet/models/utils/point_sample.py b/mmdetection/mmdet/models/utils/point_sample.py new file mode 100644 index 0000000..1afc957 --- /dev/null +++ b/mmdetection/mmdet/models/utils/point_sample.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.ops import point_sample +from torch import Tensor + + +def get_uncertainty(mask_preds: Tensor, labels: Tensor) -> Tensor: + """Estimate uncertainty based on pred logits. + + We estimate uncertainty as L1 distance between 0.0 and the logits + prediction in 'mask_preds' for the foreground class in `classes`. + + Args: + mask_preds (Tensor): mask predication logits, shape (num_rois, + num_classes, mask_height, mask_width). + + labels (Tensor): Either predicted or ground truth label for + each predicted mask, of length num_rois. 
+ + Returns: + scores (Tensor): Uncertainty scores with the most uncertain + locations having the highest uncertainty score, + shape (num_rois, 1, mask_height, mask_width) + """ + if mask_preds.shape[1] == 1: + gt_class_logits = mask_preds.clone() + else: + inds = torch.arange(mask_preds.shape[0], device=mask_preds.device) + gt_class_logits = mask_preds[inds, labels].unsqueeze(1) + return -torch.abs(gt_class_logits) + + +def get_uncertain_point_coords_with_randomness( + mask_preds: Tensor, labels: Tensor, num_points: int, + oversample_ratio: float, importance_sample_ratio: float) -> Tensor: + """Get ``num_points`` most uncertain points with random points during + train. + + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + 'get_uncertainty()' function that takes point's logit prediction as + input. + + Args: + mask_preds (Tensor): A tensor of shape (num_rois, num_classes, + mask_height, mask_width) for class-specific or class-agnostic + prediction. + labels (Tensor): The ground truth class for each instance. + num_points (int): The number of points to sample. + oversample_ratio (float): Oversampling parameter. + importance_sample_ratio (float): Ratio of points that are sampled + via importnace sampling. + + Returns: + point_coords (Tensor): A tensor of shape (num_rois, num_points, 2) + that contains the coordinates sampled points. + """ + assert oversample_ratio >= 1 + assert 0 <= importance_sample_ratio <= 1 + batch_size = mask_preds.shape[0] + num_sampled = int(num_points * oversample_ratio) + point_coords = torch.rand( + batch_size, num_sampled, 2, device=mask_preds.device) + point_logits = point_sample(mask_preds, point_coords) + # It is crucial to calculate uncertainty based on the sampled + # prediction value for the points. Calculating uncertainties of the + # coarse predictions first and sampling them for points leads to + # incorrect results. To illustrate this: assume uncertainty func( + # logits)=-abs(logits), a sampled point between two coarse + # predictions with -1 and 1 logits has 0 logits, and therefore 0 + # uncertainty value. However, if we calculate uncertainties for the + # coarse predictions first, both will have -1 uncertainty, + # and sampled point will get -1 uncertainty. + point_uncertainties = get_uncertainty(point_logits, labels) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange( + batch_size, dtype=torch.long, device=mask_preds.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_roi_coords = torch.rand( + batch_size, num_random_points, 2, device=mask_preds.device) + point_coords = torch.cat((point_coords, rand_roi_coords), dim=1) + return point_coords diff --git a/mmdetection/mmdet/models/utils/vlfuse_helper.py b/mmdetection/mmdet/models/utils/vlfuse_helper.py new file mode 100644 index 0000000..76b54de --- /dev/null +++ b/mmdetection/mmdet/models/utils/vlfuse_helper.py @@ -0,0 +1,773 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# Modified from https://github.com/microsoft/GLIP/blob/main/maskrcnn_benchmark/utils/fuse_helper.py # noqa +# and https://github.com/microsoft/GLIP/blob/main/maskrcnn_benchmark/modeling/rpn/modeling_bert.py # noqa +import math +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from mmcv.cnn.bricks import DropPath +from torch import Tensor + +try: + from transformers import BertConfig, BertPreTrainedModel + from transformers.modeling_utils import apply_chunking_to_forward + from transformers.models.bert.modeling_bert import \ + BertAttention as HFBertAttention + from transformers.models.bert.modeling_bert import \ + BertIntermediate as HFBertIntermediate + from transformers.models.bert.modeling_bert import \ + BertOutput as HFBertOutput +except ImportError: + BertConfig = None + BertPreTrainedModel = object + apply_chunking_to_forward = None + HFBertAttention = object + HFBertIntermediate = object + HFBertOutput = object + +MAX_CLAMP_VALUE = 50000 + + +def permute_and_flatten(layer: Tensor, N: int, A: int, C: int, H: int, + W: int) -> Tensor: + """Permute and then flatten a tensor, + + from size (N, A, C, H, W) to (N, H * W * A, C). + + Args: + layer (Tensor): Tensor of shape (N, C, H, W). + N (int): Batch size. + A (int): Number of attention heads. + C (int): Number of channels. + H (int): Height of feature map. + W (int): Width of feature map. + + Returns: + Tensor: A Tensor of shape (N, H * W * A, C). + """ + layer = layer.view(N, A, C, H, W) + layer = layer.permute(0, 3, 4, 1, 2) + layer = layer.reshape(N, -1, C) + return layer + + +def clamp_values(vector: Tensor) -> Tensor: + """Clamp the values of a vector to the range [-MAX_CLAMP_VALUE, + MAX_CLAMP_VALUE]. + + Args: + vector (Tensor): Tensor of shape (N, C, H, W). + + Returns: + Tensor: A Tensor of shape (N, C, H, W) with clamped values. + """ + vector = torch.clamp(vector, min=-MAX_CLAMP_VALUE, max=MAX_CLAMP_VALUE) + return vector + + +class BiMultiHeadAttention(nn.Module): + """Bidirectional fusion Multi-Head Attention layer. + + Args: + v_dim (int): The dimension of the vision input. + l_dim (int): The dimension of the language input. + embed_dim (int): The embedding dimension for the attention operation. + num_heads (int): The number of attention heads. + dropout (float, optional): The dropout probability. Defaults to 0.1. + """ + + def __init__(self, + v_dim: int, + l_dim: int, + embed_dim: int, + num_heads: int, + dropout: float = 0.1): + super(BiMultiHeadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.v_dim = v_dim + self.l_dim = l_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), 'embed_dim must be divisible by num_heads ' \ + f'(got `embed_dim`: {self.embed_dim} ' \ + f'and `num_heads`: {self.num_heads}).' 
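+        # Scaled dot-product attention: queries are scaled by
+        # 1 / sqrt(head_dim) before the vision-language similarity matmul.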
+ self.scale = self.head_dim**(-0.5) + self.dropout = dropout + + self.v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.l_proj = nn.Linear(self.l_dim, self.embed_dim) + self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) + + self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) + self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) + + self.stable_softmax_2d = False + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.v_proj.weight) + self.v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.l_proj.weight) + self.l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_v_proj.weight) + self.values_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_l_proj.weight) + self.values_l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_v_proj.weight) + self.out_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_l_proj.weight) + self.out_l_proj.bias.data.fill_(0) + + def forward( + self, + vision: Tensor, + lang: Tensor, + attention_mask_v: Optional[Tensor] = None, + attention_mask_l: Optional[Tensor] = None, + ) -> Tuple[Tensor, Tensor]: + bsz, tgt_len, _ = vision.size() + + query_states = self.v_proj(vision) * self.scale + key_states = self._shape(self.l_proj(lang), -1, bsz) + value_v_states = self._shape(self.values_v_proj(vision), -1, bsz) + value_l_states = self._shape(self.values_l_proj(lang), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, + bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_v_states = value_v_states.view(*proj_shape) + value_l_states = value_l_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f'Attention weights should be of ' + f'size {(bsz * self.num_heads, tgt_len, src_len)}, ' + f'but is {attn_weights.size()}') + + if self.stable_softmax_2d: + attn_weights = attn_weights - attn_weights.max() + + if self.clamp_min_for_underflow: + # Do not increase -50000, data type half has quite limited range + attn_weights = torch.clamp(attn_weights, min=-MAX_CLAMP_VALUE) + if self.clamp_max_for_overflow: + # Do not increase 50000, data type half has quite limited range + attn_weights = torch.clamp(attn_weights, max=MAX_CLAMP_VALUE) + + attn_weights_T = attn_weights.transpose(1, 2) + attn_weights_l = ( + attn_weights_T - + torch.max(attn_weights_T, dim=-1, keepdim=True)[0]) + if self.clamp_min_for_underflow: + # Do not increase -50000, data type half has quite limited range + attn_weights_l = torch.clamp(attn_weights_l, min=-MAX_CLAMP_VALUE) + if self.clamp_max_for_overflow: + # Do not increase 50000, data type half has quite limited range + attn_weights_l = torch.clamp(attn_weights_l, max=MAX_CLAMP_VALUE) + + if attention_mask_v is not None: + attention_mask_v = ( + attention_mask_v[:, None, + None, :].repeat(1, self.num_heads, 1, + 1).flatten(0, 1)) + attn_weights_l.masked_fill_(attention_mask_v, float('-inf')) + + attn_weights_l = attn_weights_l.softmax(dim=-1) + + if attention_mask_l is not None: + assert (attention_mask_l.dim() == 
2) + attention_mask = attention_mask_l.unsqueeze(1).unsqueeze(1) + attention_mask = attention_mask.expand(bsz, 1, tgt_len, src_len) + attention_mask = attention_mask.masked_fill( + attention_mask == 0, -9e15) + + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError('Attention mask should be of ' + f'size {(bsz, 1, tgt_len, src_len)}') + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, + src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, + src_len) + + attn_weights_v = nn.functional.softmax(attn_weights, dim=-1) + + attn_probs_v = F.dropout( + attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout( + attn_weights_l, p=self.dropout, training=self.training) + + attn_output_v = torch.bmm(attn_probs_v, value_l_states) + attn_output_l = torch.bmm(attn_probs_l, value_v_states) + + if attn_output_v.size() != (bsz * self.num_heads, tgt_len, + self.head_dim): + raise ValueError( + '`attn_output_v` should be of ' + f'size {(bsz, self.num_heads, tgt_len, self.head_dim)}, ' + f'but is {attn_output_v.size()}') + + if attn_output_l.size() != (bsz * self.num_heads, src_len, + self.head_dim): + raise ValueError( + '`attn_output_l` should be of size ' + f'{(bsz, self.num_heads, src_len, self.head_dim)}, ' + f'but is {attn_output_l.size()}') + + attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, + self.head_dim) + attn_output_v = attn_output_v.transpose(1, 2) + attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) + + attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, + self.head_dim) + attn_output_l = attn_output_l.transpose(1, 2) + attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim) + + attn_output_v = self.out_v_proj(attn_output_v) + attn_output_l = self.out_l_proj(attn_output_l) + + return attn_output_v, attn_output_l + + +class BiAttentionBlock(nn.Module): + """BiAttentionBlock Module: + + First, multi-level visual features are concat; Then the concat visual + feature and lang feature are fused by attention; Finally the newly visual + feature are split into multi levels. + + Args: + v_dim (int): The dimension of the visual features. + l_dim (int): The dimension of the language feature. + embed_dim (int): The embedding dimension for the attention operation. + num_heads (int): The number of attention heads. + dropout (float, optional): The dropout probability. Defaults to 0.1. + drop_path (float, optional): The drop path probability. + Defaults to 0.0. + init_values (float, optional): + The initial value for the scaling parameter. + Defaults to 1e-4. + """ + + def __init__(self, + v_dim: int, + l_dim: int, + embed_dim: int, + num_heads: int, + dropout: float = 0.1, + drop_path: float = .0, + init_values: float = 1e-4): + super().__init__() + + # pre layer norm + self.layer_norm_v = nn.LayerNorm(v_dim) + self.layer_norm_l = nn.LayerNorm(l_dim) + self.attn = BiMultiHeadAttention( + v_dim=v_dim, + l_dim=l_dim, + embed_dim=embed_dim, + num_heads=num_heads, + dropout=dropout) + + # add layer scale for training stability + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity()
+        self.gamma_v = nn.Parameter(
+            init_values * torch.ones(v_dim), requires_grad=True)
+        self.gamma_l = nn.Parameter(
+            init_values * torch.ones(l_dim), requires_grad=True)
+
+    def forward(self,
+                vf0: Tensor,
+                vf1: Tensor,
+                vf2: Tensor,
+                vf3: Tensor,
+                vf4: Tensor,
+                lang_feature: Tensor,
+                attention_mask_l=None):
+        visual_features = [vf0, vf1, vf2, vf3, vf4]
+        size_per_level, visual_features_flatten = [], []
+        for i, feat_per_level in enumerate(visual_features):
+            bs, c, h, w = feat_per_level.shape
+            size_per_level.append([h, w])
+            feat = permute_and_flatten(feat_per_level, bs, -1, c, h, w)
+            visual_features_flatten.append(feat)
+        visual_features_flatten = torch.cat(visual_features_flatten, dim=1)
+        new_v, new_lang_feature = self.single_attention_call(
+            visual_features_flatten,
+            lang_feature,
+            attention_mask_l=attention_mask_l)
+        # [bs, N, C] -> [bs, C, N]
+        new_v = new_v.transpose(1, 2).contiguous()
+
+        start = 0
+        # fvfs means fused visual features
+        fvfs = []
+        for (h, w) in size_per_level:
+            new_v_per_level = new_v[:, :, start:start + h * w].view(
+                bs, -1, h, w).contiguous()
+            fvfs.append(new_v_per_level)
+            start += h * w
+
+        return fvfs[0], fvfs[1], fvfs[2], fvfs[3], fvfs[4], new_lang_feature
+
+    def single_attention_call(
+        self,
+        visual: Tensor,
+        lang: Tensor,
+        attention_mask_v: Optional[Tensor] = None,
+        attention_mask_l: Optional[Tensor] = None,
+    ) -> Tuple[Tensor, Tensor]:
+        """Perform a single attention call between the visual and language
+        inputs.
+
+        Args:
+            visual (Tensor): The visual input tensor.
+            lang (Tensor): The language input tensor.
+            attention_mask_v (Optional[Tensor]):
+                An optional attention mask tensor for the visual input.
+            attention_mask_l (Optional[Tensor]):
+                An optional attention mask tensor for the language input.
+
+        Returns:
+            Tuple[Tensor, Tensor]: A tuple containing the updated
+                visual and language tensors after the attention call.
+        """
+        visual = self.layer_norm_v(visual)
+        lang = self.layer_norm_l(lang)
+        delta_v, delta_l = self.attn(
+            visual,
+            lang,
+            attention_mask_v=attention_mask_v,
+            attention_mask_l=attention_mask_l)
+        # visual, lang = visual + delta_v, lang + delta_l
+        visual = visual + self.drop_path(self.gamma_v * delta_v)
+        lang = lang + self.drop_path(self.gamma_l * delta_l)
+        return visual, lang
+
+
+class SingleScaleBiAttentionBlock(BiAttentionBlock):
+    """This is a single-scale implementation of `BiAttentionBlock`.
+
+    The only difference between it and `BiAttentionBlock` is that the
+    `forward` function of `SingleScaleBiAttentionBlock` only accepts a single
+    flattened visual feature map, while the `forward` function of
+    `BiAttentionBlock` accepts multiple visual feature maps.
+    """
+
+    def forward(self,
+                visual_feature: Tensor,
+                lang_feature: Tensor,
+                attention_mask_v=None,
+                attention_mask_l=None):
+        """Single-scale forward pass.
+
+        Args:
+            visual_feature (Tensor): The visual input tensor. Tensor of
+                shape (bs, patch_len, ch).
+            lang_feature (Tensor): The language input tensor. Tensor of
+                shape (bs, text_len, ch).
+            attention_mask_v (_type_, optional): Visual feature attention
+                mask. Defaults to None.
+            attention_mask_l (_type_, optional): Language feature attention
+                mask. Defaults to None.
+        """
+        new_v, new_lang_feature = self.single_attention_call(
+            visual_feature,
+            lang_feature,
+            attention_mask_v=attention_mask_v,
+            attention_mask_l=attention_mask_l)
+        return new_v, new_lang_feature
+
+
+class VLFuse(nn.Module):
+    """Early Fusion Module.
+ + Args: + v_dim (int): Dimension of visual features. + l_dim (int): Dimension of language features. + embed_dim (int): The embedding dimension for the attention operation. + num_heads (int): Number of attention heads. + dropout (float): Dropout probability. + drop_path (float): Drop path probability. + use_checkpoint (bool): Whether to use PyTorch's checkpoint function. + """ + + def __init__(self, + v_dim: int = 256, + l_dim: int = 768, + embed_dim: int = 2048, + num_heads: int = 8, + dropout: float = 0.1, + drop_path: float = 0.0, + use_checkpoint: bool = False): + super().__init__() + self.use_checkpoint = use_checkpoint + self.b_attn = BiAttentionBlock( + v_dim=v_dim, + l_dim=l_dim, + embed_dim=embed_dim, + num_heads=num_heads, + dropout=dropout, + drop_path=drop_path, + init_values=1.0 / 6.0) + + def forward(self, x: dict) -> dict: + """Forward pass of the VLFuse module.""" + visual_features = x['visual'] + language_dict_features = x['lang'] + + if self.use_checkpoint: + # vf is mean visual_features + # checkpoint does not allow complex data structures as input, + # such as list, so we must split them. + vf0, vf1, vf2, vf3, vf4, language_features = checkpoint.checkpoint( + self.b_attn, *visual_features, + language_dict_features['hidden'], + language_dict_features['masks']) + else: + vf0, vf1, vf2, vf3, vf4, language_features = self.b_attn( + *visual_features, language_dict_features['hidden'], + language_dict_features['masks']) + + language_dict_features['hidden'] = language_features + fused_language_dict_features = language_dict_features + + features_dict = { + 'visual': [vf0, vf1, vf2, vf3, vf4], + 'lang': fused_language_dict_features + } + + return features_dict + + +class BertEncoderLayer(BertPreTrainedModel): + """A modified version of the `BertLayer` class from the + `transformers.models.bert.modeling_bert` module. + + Args: + config (:class:`~transformers.BertConfig`): + The configuration object that + contains various parameters for the model. + clamp_min_for_underflow (bool, optional): + Whether to clamp the minimum value of the hidden states + to prevent underflow. Defaults to `False`. + clamp_max_for_overflow (bool, optional): + Whether to clamp the maximum value of the hidden states + to prevent overflow. Defaults to `False`. 
+ """ + + def __init__(self, + config: BertConfig, + clamp_min_for_underflow: bool = False, + clamp_max_for_overflow: bool = False): + super().__init__(config) + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + + self.attention = BertAttention(config, clamp_min_for_underflow, + clamp_max_for_overflow) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, inputs: Dict[str, Dict[str, torch.Tensor]] + ) -> Dict[str, Dict[str, torch.Tensor]]: + """Applies the BertEncoderLayer to the input features.""" + language_dict_features = inputs['lang'] + hidden_states = language_dict_features['hidden'] + attention_mask = language_dict_features['masks'] + + device = hidden_states.device + input_shape = hidden_states.size()[:-1] + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + self_attention_outputs = self.attention( + hidden_states, + extended_attention_mask, + None, + output_attentions=False, + past_key_value=None) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + hidden_states = outputs[0] + + language_dict_features['hidden'] = hidden_states + + features_dict = { + 'visual': inputs['visual'], + 'lang': language_dict_features + } + + return features_dict + + def feed_forward_chunk(self, attention_output: Tensor) -> Tensor: + """Applies the intermediate and output layers of the BertEncoderLayer + to a chunk of the input sequence.""" + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# The following code is the same as the Huggingface code, +# with the only difference being the additional clamp operation. +class BertSelfAttention(nn.Module): + """BERT self-attention layer from Huggingface transformers. + + Compared to the BertSelfAttention of Huggingface, only add the clamp. + + Args: + config (:class:`~transformers.BertConfig`): + The configuration object that + contains various parameters for the model. + clamp_min_for_underflow (bool, optional): + Whether to clamp the minimum value of the hidden states + to prevent underflow. Defaults to `False`. + clamp_max_for_overflow (bool, optional): + Whether to clamp the maximum value of the hidden states + to prevent overflow. Defaults to `False`. 
+ """ + + def __init__(self, + config: BertConfig, + clamp_min_for_underflow: bool = False, + clamp_max_for_overflow: bool = False): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and \ + not hasattr(config, 'embedding_size'): + raise ValueError(f'The hidden size ({config.hidden_size}) is ' + 'not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / + config.num_attention_heads) + self.all_head_size = self.num_attention_heads * \ + self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or \ + self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + self.clamp_min_for_underflow = clamp_min_for_underflow + self.clamp_max_for_overflow = clamp_max_for_overflow + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: Tensor) -> Tensor: + """Transpose the dimensions of `x`.""" + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + head_mask: Optional[Tensor] = None, + encoder_hidden_states: Optional[Tensor] = None, + encoder_attention_mask: Optional[Tensor] = None, + past_key_value: Optional[Tuple[Tensor, Tensor]] = None, + output_attentions: bool = False, + ) -> Tuple[Tensor, ...]: + """Perform a forward pass through the BERT self-attention layer.""" + + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" + # to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or \ + self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + \ + relative_position_scores_query + \ + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + + if self.clamp_min_for_underflow: + attention_scores = torch.clamp( + attention_scores, min=-MAX_CLAMP_VALUE + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attention_scores = torch.clamp( + attention_scores, max=MAX_CLAMP_VALUE + ) # Do not increase 50000, data type half has quite limited range + + if attention_mask is not None: + # Apply the attention mask is + # (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class BertAttention(HFBertAttention): + """BertAttention is made up of self-attention and intermediate+output. + + Compared to the BertAttention of Huggingface, only add the clamp. + + Args: + config (:class:`~transformers.BertConfig`): + The configuration object that + contains various parameters for the model. + clamp_min_for_underflow (bool, optional): + Whether to clamp the minimum value of the hidden states + to prevent underflow. Defaults to `False`. + clamp_max_for_overflow (bool, optional): + Whether to clamp the maximum value of the hidden states + to prevent overflow. Defaults to `False`. 
+ """ + + def __init__(self, + config: BertConfig, + clamp_min_for_underflow: bool = False, + clamp_max_for_overflow: bool = False): + super().__init__(config) + self.self = BertSelfAttention(config, clamp_min_for_underflow, + clamp_max_for_overflow) + + +class BertIntermediate(HFBertIntermediate): + """Modified from transformers.models.bert.modeling_bert.BertIntermediate. + + Compared to the BertIntermediate of Huggingface, only add the clamp. + """ + + def forward(self, hidden_states: Tensor) -> Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = clamp_values(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = clamp_values(hidden_states) + return hidden_states + + +class BertOutput(HFBertOutput): + """Modified from transformers.models.bert.modeling_bert.BertOutput. + + Compared to the BertOutput of Huggingface, only add the clamp. + """ + + def forward(self, hidden_states: Tensor, input_tensor: Tensor) -> Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = clamp_values(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = clamp_values(hidden_states) + return hidden_states diff --git a/mmdetection/mmdet/models/utils/wbf.py b/mmdetection/mmdet/models/utils/wbf.py new file mode 100644 index 0000000..b26a2c6 --- /dev/null +++ b/mmdetection/mmdet/models/utils/wbf.py @@ -0,0 +1,250 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import warnings +from typing import Tuple + +import numpy as np +import torch +from torch import Tensor + + +# References: https://github.com/ZFTurbo/Weighted-Boxes-Fusion +def weighted_boxes_fusion( + bboxes_list: list, + scores_list: list, + labels_list: list, + weights: list = None, + iou_thr: float = 0.55, + skip_box_thr: float = 0.0, + conf_type: str = 'avg', + allows_overflow: bool = False) -> Tuple[Tensor, Tensor, Tensor]: + """weighted boxes fusion is a method for + fusing predictions from different object detection models, which utilizes + confidence scores of all proposed bounding boxes to construct averaged + boxes. + + Args: + bboxes_list(list): list of boxes predictions from each model, + each box is 4 numbers. + scores_list(list): list of scores for each model + labels_list(list): list of labels for each model + weights: list of weights for each model. + Default: None, which means weight == 1 for each model + iou_thr: IoU value for boxes to be a match + skip_box_thr: exclude boxes with score lower than this variable. + conf_type: how to calculate confidence in weighted boxes. + 'avg': average value, + 'max': maximum value, + 'box_and_model_avg': box and model wise hybrid weighted average, + 'absent_model_aware_avg': weighted average that takes into + account the absent model. + allows_overflow: false if we want confidence score not exceed 1.0. + + Returns: + bboxes(Tensor): boxes coordinates (Order of boxes: x1, y1, x2, y2). + scores(Tensor): confidence scores + labels(Tensor): boxes labels + """ + + if weights is None: + weights = np.ones(len(bboxes_list)) + if len(weights) != len(bboxes_list): + print('Warning: incorrect number of weights {}. Must be: ' + '{}. Set weights equal to 1.'.format( + len(weights), len(bboxes_list))) + weights = np.ones(len(bboxes_list)) + weights = np.array(weights) + + if conf_type not in [ + 'avg', 'max', 'box_and_model_avg', 'absent_model_aware_avg' + ]: + print('Unknown conf_type: {}. 
Must be "avg", ' + '"max" or "box_and_model_avg", ' + 'or "absent_model_aware_avg"'.format(conf_type)) + exit() + + filtered_boxes = prefilter_boxes(bboxes_list, scores_list, labels_list, + weights, skip_box_thr) + if len(filtered_boxes) == 0: + return torch.Tensor(), torch.Tensor(), torch.Tensor() + + overall_boxes = [] + + for label in filtered_boxes: + boxes = filtered_boxes[label] + new_boxes = [] + weighted_boxes = np.empty((0, 8)) + + # Clusterize boxes + for j in range(0, len(boxes)): + index, best_iou = find_matching_box_fast(weighted_boxes, boxes[j], + iou_thr) + + if index != -1: + new_boxes[index].append(boxes[j]) + weighted_boxes[index] = get_weighted_box( + new_boxes[index], conf_type) + else: + new_boxes.append([boxes[j].copy()]) + weighted_boxes = np.vstack((weighted_boxes, boxes[j].copy())) + + # Rescale confidence based on number of models and boxes + for i in range(len(new_boxes)): + clustered_boxes = new_boxes[i] + if conf_type == 'box_and_model_avg': + clustered_boxes = np.array(clustered_boxes) + # weighted average for boxes + weighted_boxes[i, 1] = weighted_boxes[i, 1] * len( + clustered_boxes) / weighted_boxes[i, 2] + # identify unique model index by model index column + _, idx = np.unique(clustered_boxes[:, 3], return_index=True) + # rescale by unique model weights + weighted_boxes[i, 1] = weighted_boxes[i, 1] * clustered_boxes[ + idx, 2].sum() / weights.sum() + elif conf_type == 'absent_model_aware_avg': + clustered_boxes = np.array(clustered_boxes) + # get unique model index in the cluster + models = np.unique(clustered_boxes[:, 3]).astype(int) + # create a mask to get unused model weights + mask = np.ones(len(weights), dtype=bool) + mask[models] = False + # absent model aware weighted average + weighted_boxes[ + i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / ( + weighted_boxes[i, 2] + weights[mask].sum()) + elif conf_type == 'max': + weighted_boxes[i, 1] = weighted_boxes[i, 1] / weights.max() + elif not allows_overflow: + weighted_boxes[i, 1] = weighted_boxes[i, 1] * min( + len(weights), len(clustered_boxes)) / weights.sum() + else: + weighted_boxes[i, 1] = weighted_boxes[i, 1] * len( + clustered_boxes) / weights.sum() + overall_boxes.append(weighted_boxes) + overall_boxes = np.concatenate(overall_boxes, axis=0) + overall_boxes = overall_boxes[overall_boxes[:, 1].argsort()[::-1]] + + bboxes = torch.Tensor(overall_boxes[:, 4:]) + scores = torch.Tensor(overall_boxes[:, 1]) + labels = torch.Tensor(overall_boxes[:, 0]).int() + + return bboxes, scores, labels + + +def prefilter_boxes(boxes, scores, labels, weights, thr): + + new_boxes = dict() + + for t in range(len(boxes)): + + if len(boxes[t]) != len(scores[t]): + print('Error. Length of boxes arrays not equal to ' + 'length of scores array: {} != {}'.format( + len(boxes[t]), len(scores[t]))) + exit() + + if len(boxes[t]) != len(labels[t]): + print('Error. Length of boxes arrays not equal to ' + 'length of labels array: {} != {}'.format( + len(boxes[t]), len(labels[t]))) + exit() + + for j in range(len(boxes[t])): + score = scores[t][j] + if score < thr: + continue + label = int(labels[t][j]) + box_part = boxes[t][j] + x1 = float(box_part[0]) + y1 = float(box_part[1]) + x2 = float(box_part[2]) + y2 = float(box_part[3]) + + # Box data checks + if x2 < x1: + warnings.warn('X2 < X1 value in box. Swap them.') + x1, x2 = x2, x1 + if y2 < y1: + warnings.warn('Y2 < Y1 value in box. 
Swap them.') + y1, y2 = y2, y1 + if (x2 - x1) * (y2 - y1) == 0.0: + warnings.warn('Zero area box skipped: {}.'.format(box_part)) + continue + + # [label, score, weight, model index, x1, y1, x2, y2] + b = [ + int(label), + float(score) * weights[t], weights[t], t, x1, y1, x2, y2 + ] + + if label not in new_boxes: + new_boxes[label] = [] + new_boxes[label].append(b) + + # Sort each list in dict by score and transform it to numpy array + for k in new_boxes: + current_boxes = np.array(new_boxes[k]) + new_boxes[k] = current_boxes[current_boxes[:, 1].argsort()[::-1]] + + return new_boxes + + +def get_weighted_box(boxes, conf_type='avg'): + + box = np.zeros(8, dtype=np.float32) + conf = 0 + conf_list = [] + w = 0 + for b in boxes: + box[4:] += (b[1] * b[4:]) + conf += b[1] + conf_list.append(b[1]) + w += b[2] + box[0] = boxes[0][0] + if conf_type in ('avg', 'box_and_model_avg', 'absent_model_aware_avg'): + box[1] = conf / len(boxes) + elif conf_type == 'max': + box[1] = np.array(conf_list).max() + box[2] = w + box[3] = -1 + box[4:] /= conf + + return box + + +def find_matching_box_fast(boxes_list, new_box, match_iou): + + def bb_iou_array(boxes, new_box): + # bb intersection over union + xA = np.maximum(boxes[:, 0], new_box[0]) + yA = np.maximum(boxes[:, 1], new_box[1]) + xB = np.minimum(boxes[:, 2], new_box[2]) + yB = np.minimum(boxes[:, 3], new_box[3]) + + interArea = np.maximum(xB - xA, 0) * np.maximum(yB - yA, 0) + + # compute the area of both the prediction and ground-truth rectangles + boxAArea = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + boxBArea = (new_box[2] - new_box[0]) * (new_box[3] - new_box[1]) + + iou = interArea / (boxAArea + boxBArea - interArea) + + return iou + + if boxes_list.shape[0] == 0: + return -1, match_iou + + boxes = boxes_list + + ious = bb_iou_array(boxes[:, 4:], new_box[4:]) + + ious[boxes[:, 0] != new_box[0]] = -1 + + best_idx = np.argmax(ious) + best_iou = ious[best_idx] + + if best_iou <= match_iou: + best_iou = match_iou + best_idx = -1 + + return best_idx, best_iou diff --git a/mmdetection/mmdet/models/vis/__init__.py b/mmdetection/mmdet/models/vis/__init__.py new file mode 100644 index 0000000..ab63a90 --- /dev/null +++ b/mmdetection/mmdet/models/vis/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mask2former_vis import Mask2FormerVideo +from .masktrack_rcnn import MaskTrackRCNN + +__all__ = ['Mask2FormerVideo', 'MaskTrackRCNN'] diff --git a/mmdetection/mmdet/models/vis/mask2former_vis.py b/mmdetection/mmdet/models/vis/mask2former_vis.py new file mode 100644 index 0000000..6ab0429 --- /dev/null +++ b/mmdetection/mmdet/models/vis/mask2former_vis.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +from torch import Tensor + +from mmdet.models.mot import BaseMOTModel +from mmdet.registry import MODELS +from mmdet.structures import TrackDataSample, TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class Mask2FormerVideo(BaseMOTModel): + r"""Implementation of `Masked-attention Mask + Transformer for Universal Image Segmentation + `_. + + Args: + backbone (dict): Configuration of backbone. Defaults to None. + track_head (dict): Configuration of track head. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + Defaults to None. 
+ init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + backbone: Optional[dict] = None, + track_head: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(BaseMOTModel, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + if backbone is not None: + self.backbone = MODELS.build(backbone) + + if track_head is not None: + self.track_head = MODELS.build(track_head) + + self.num_classes = self.track_head.num_classes + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """Overload in order to load mmdet pretrained ckpt.""" + for key in list(state_dict): + if key.startswith('panoptic_head'): + state_dict[key.replace('panoptic', + 'track')] = state_dict.pop(key) + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> Union[dict, tuple]: + """ + Args: + inputs (Tensor): Input images of shape (N, T, C, H, W). + These should usually be mean centered and std scaled. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + # shape (N * T, C, H, W) + img = inputs.flatten(0, 1) + + x = self.backbone(img) + losses = self.track_head.loss(x, data_samples) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True) -> TrackSampleList: + """Predict results from a batch of inputs and data samples with + postprocessing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: Tracking results of the inputs. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + + assert len(data_samples) == 1, \ + 'Mask2former only support 1 batch size per gpu for now.' + + # [T, C, H, W] + img = inputs[0] + track_data_sample = data_samples[0] + feats = self.backbone(img) + pred_track_ins_list = self.track_head.predict(feats, track_data_sample, + rescale) + + det_data_samples_list = [] + for idx, pred_track_ins in enumerate(pred_track_ins_list): + img_data_sample = track_data_sample[idx] + img_data_sample.pred_track_instances = pred_track_ins + det_data_samples_list.append(img_data_sample) + + results = TrackDataSample() + results.video_data_samples = det_data_samples_list + return [results] diff --git a/mmdetection/mmdet/models/vis/masktrack_rcnn.py b/mmdetection/mmdet/models/vis/masktrack_rcnn.py new file mode 100644 index 0000000..9c28e7b --- /dev/null +++ b/mmdetection/mmdet/models/vis/masktrack_rcnn.py @@ -0,0 +1,181 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
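Both video instance segmentors in this patch consume clip batches of shape (N, T, C, H, W) and run an image-level backbone per frame. As a rough orientation only (the shapes below are made up for illustration and are not part of the patch), a minimal sketch of the two access patterns used here: `inputs.flatten(0, 1)` as in `Mask2FormerVideo.loss` above, and per-frame slicing `inputs[:, frame_id]` as in `MaskTrackRCNN.predict` below.

import torch

# Hypothetical clip batch: N videos of T frames each (values are illustrative).
N, T, C, H, W = 2, 3, 3, 224, 224
clips = torch.randn(N, T, C, H, W)

# Image backbones expect (B, C, H, W), so the time axis is folded into the batch.
frames = clips.flatten(0, 1)
assert frames.shape == (N * T, C, H, W)

# Per-frame access, as used during tracking inference.
frame_0 = clips[:, 0].contiguous()
assert frame_0.shape == (N, C, H, W)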
+from typing import Optional + +import torch +from torch import Tensor + +from mmdet.models.mot import BaseMOTModel +from mmdet.registry import MODELS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class MaskTrackRCNN(BaseMOTModel): + """Video Instance Segmentation. + + This video instance segmentor is the implementation of`MaskTrack R-CNN + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + track_head (dict): Configuration of track head. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + track_head: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + assert hasattr(self.detector, 'roi_head'), \ + 'MaskTrack R-CNN only supports two stage detectors.' + + if track_head is not None: + self.track_head = MODELS.build(track_head) + if tracker is not None: + self.tracker = MODELS.build(tracker) + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size. The T denotes the number of + frames. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(1) == 2, \ + 'MaskTrackRCNN can only have 1 key frame and 1 reference frame.' + + # split the data_samples into two aspects: key frames and reference + # frames + ref_data_samples, key_data_samples = [], [] + key_frame_inds, ref_frame_inds = [], [] + + # set cat_id of gt_labels to 0 in RPN + for track_data_sample in data_samples: + key_data_sample = track_data_sample.get_key_frames()[0] + key_data_samples.append(key_data_sample) + ref_data_sample = track_data_sample.get_ref_frames()[0] + ref_data_samples.append(ref_data_sample) + key_frame_inds.append(track_data_sample.key_frames_inds[0]) + ref_frame_inds.append(track_data_sample.ref_frames_inds[0]) + + key_frame_inds = torch.tensor(key_frame_inds, dtype=torch.int64) + ref_frame_inds = torch.tensor(ref_frame_inds, dtype=torch.int64) + batch_inds = torch.arange(len(inputs)) + key_imgs = inputs[batch_inds, key_frame_inds].contiguous() + ref_imgs = inputs[batch_inds, ref_frame_inds].contiguous() + + x = self.detector.extract_feat(key_imgs) + ref_x = self.detector.extract_feat(ref_imgs) + + losses = dict() + + # RPN forward and loss + if self.detector.with_rpn: + proposal_cfg = self.detector.train_cfg.get( + 'rpn_proposal', self.detector.test_cfg.rpn) + + rpn_losses, rpn_results_list = self.detector.rpn_head. 
\ + loss_and_predict(x, + key_data_samples, + proposal_cfg=proposal_cfg, + **kwargs) + + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in keys: + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + else: + # TODO: Not support currently, should have a check at Fast R-CNN + assert key_data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. + rpn_results_list = [ + key_data_sample.proposals + for key_data_sample in key_data_samples + ] + + losses_detect = self.detector.roi_head.loss(x, rpn_results_list, + key_data_samples, **kwargs) + losses.update(losses_detect) + + losses_track = self.track_head.loss(x, ref_x, rpn_results_list, + data_samples, **kwargs) + losses.update(losses_track) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True, + **kwargs) -> TrackSampleList: + """Test without augmentation. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: Tracking results of the inputs. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + + assert len(data_samples) == 1, \ + 'MaskTrackRCNN only support 1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + if track_data_sample[0].frame_id == 0: + self.tracker.reset() + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + x = self.detector.extract_feat(single_img) + + rpn_results_list = self.detector.rpn_head.predict( + x, [img_data_sample]) + # det_results List[InstanceData] + det_results = self.detector.roi_head.predict( + x, rpn_results_list, [img_data_sample], rescale=rescale) + assert len(det_results) == 1, 'Batch inference is not supported.' + assert 'masks' in det_results[0], 'There are no mask results.' + + img_data_sample.pred_instances = det_results[0] + frame_pred_track_instances = self.tracker.track( + model=self, feats=x, data_sample=img_data_sample, **kwargs) + img_data_sample.pred_track_instances = frame_pred_track_instances + + return [track_data_sample] diff --git a/mmdetection/mmdet/registry.py b/mmdetection/mmdet/registry.py new file mode 100644 index 0000000..3a5b2b2 --- /dev/null +++ b/mmdetection/mmdet/registry.py @@ -0,0 +1,121 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMDetection provides 17 registry nodes to support using modules across +projects. Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. 
+""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry( + 'runner', parent=MMENGINE_RUNNERS, locations=['mmdet.engine.runner']) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', + parent=MMENGINE_RUNNER_CONSTRUCTORS, + locations=['mmdet.engine.runner']) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry( + 'loop', parent=MMENGINE_LOOPS, locations=['mmdet.engine.runner']) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 'hook', parent=MMENGINE_HOOKS, locations=['mmdet.engine.hooks']) + +# manage data-related modules +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmdet.datasets']) +DATA_SAMPLERS = Registry( + 'data sampler', + parent=MMENGINE_DATA_SAMPLERS, + locations=['mmdet.datasets.samplers']) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmdet.datasets.transforms']) + +# manage all kinds of modules inheriting `nn.Module` +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmdet.models']) +# manage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmdet.models']) +# manage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry( + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmdet.models']) + +# manage all kinds of optimizers like `SGD` and `Adam` +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmdet.engine.optimizers']) +# manage optimizer wrapper +OPTIM_WRAPPERS = Registry( + 'optim_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmdet.engine.optimizers']) +# manage constructors that customize the optimization hyperparameters. 
+OPTIM_WRAPPER_CONSTRUCTORS = Registry( + 'optimizer constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmdet.engine.optimizers']) +# manage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmdet.engine.schedulers']) +# manage all kinds of metrics +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmdet.evaluation']) +# manage evaluator +EVALUATOR = Registry( + 'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmdet.evaluation']) + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmdet.models']) + +# manage visualizer +VISUALIZERS = Registry( + 'visualizer', + parent=MMENGINE_VISUALIZERS, + locations=['mmdet.visualization']) +# manage visualizer backend +VISBACKENDS = Registry( + 'vis_backend', + parent=MMENGINE_VISBACKENDS, + locations=['mmdet.visualization']) + +# manage logprocessor +LOG_PROCESSORS = Registry( + 'log_processor', + parent=MMENGINE_LOG_PROCESSORS, + # TODO: update the location when mmdet has its own log processor + locations=['mmdet.engine']) diff --git a/mmdetection/mmdet/structures/__init__.py b/mmdetection/mmdet/structures/__init__.py new file mode 100644 index 0000000..381c6a4 --- /dev/null +++ b/mmdetection/mmdet/structures/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .det_data_sample import DetDataSample, OptSampleList, SampleList +from .reid_data_sample import ReIDDataSample +from .track_data_sample import (OptTrackSampleList, TrackDataSample, + TrackSampleList) + +__all__ = [ + 'DetDataSample', 'SampleList', 'OptSampleList', 'TrackDataSample', + 'TrackSampleList', 'OptTrackSampleList', 'ReIDDataSample' +] diff --git a/mmdetection/mmdet/structures/bbox/__init__.py b/mmdetection/mmdet/structures/bbox/__init__.py new file mode 100644 index 0000000..4d53198 --- /dev/null +++ b/mmdetection/mmdet/structures/bbox/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
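Among the box helpers re-exported just below, `bbox_cxcywh_to_xyxy` converts boxes from center/size format to corner format. The following is a self-contained sketch of that arithmetic, written as a stand-in function rather than a call into mmdet; the sample values are made up for illustration.

import torch

def cxcywh_to_xyxy(boxes: torch.Tensor) -> torch.Tensor:
    # (cx, cy, w, h) -> (x1, y1, x2, y2) for boxes of shape (..., 4).
    cx, cy, w, h = boxes.unbind(-1)
    return torch.stack((cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2), dim=-1)

print(cxcywh_to_xyxy(torch.tensor([[10., 10., 4., 6.]])))
# tensor([[ 8.,  7., 12., 13.]])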
+from .base_boxes import BaseBoxes +from .bbox_overlaps import bbox_overlaps +from .box_type import (autocast_box_type, convert_box_type, get_box_type, + register_box, register_box_converter) +from .horizontal_boxes import HorizontalBoxes +from .transforms import bbox_cxcyah_to_xyxy # noqa: E501 +from .transforms import (bbox2corner, bbox2distance, bbox2result, bbox2roi, + bbox_cxcywh_to_xyxy, bbox_flip, bbox_mapping, + bbox_mapping_back, bbox_project, bbox_rescale, + bbox_xyxy_to_cxcyah, bbox_xyxy_to_cxcywh, cat_boxes, + corner2bbox, distance2bbox, empty_box_as, + find_inside_bboxes, get_box_tensor, get_box_wh, + roi2bbox, scale_boxes, stack_boxes) + +__all__ = [ + 'bbox_overlaps', 'bbox_flip', 'bbox_mapping', 'bbox_mapping_back', + 'bbox2roi', 'roi2bbox', 'bbox2result', 'distance2bbox', 'bbox2distance', + 'bbox_rescale', 'bbox_cxcywh_to_xyxy', 'bbox_xyxy_to_cxcywh', + 'find_inside_bboxes', 'bbox2corner', 'corner2bbox', 'bbox_project', + 'BaseBoxes', 'convert_box_type', 'get_box_type', 'register_box', + 'register_box_converter', 'HorizontalBoxes', 'autocast_box_type', + 'cat_boxes', 'stack_boxes', 'scale_boxes', 'get_box_wh', 'get_box_tensor', + 'empty_box_as', 'bbox_xyxy_to_cxcyah', 'bbox_cxcyah_to_xyxy' +] diff --git a/mmdetection/mmdet/structures/bbox/base_boxes.py b/mmdetection/mmdet/structures/bbox/base_boxes.py new file mode 100644 index 0000000..0ed6676 --- /dev/null +++ b/mmdetection/mmdet/structures/bbox/base_boxes.py @@ -0,0 +1,549 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod, abstractproperty, abstractstaticmethod +from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Union + +import numpy as np +import torch +from torch import BoolTensor, Tensor + +from mmdet.structures.mask.structures import BitmapMasks, PolygonMasks + +T = TypeVar('T') +DeviceType = Union[str, torch.device] +IndexType = Union[slice, int, list, torch.LongTensor, torch.cuda.LongTensor, + torch.BoolTensor, torch.cuda.BoolTensor, np.ndarray] +MaskType = Union[BitmapMasks, PolygonMasks] + + +class BaseBoxes(metaclass=ABCMeta): + """The base class for 2D box types. + + The functions of ``BaseBoxes`` lie in three fields: + + - Verify the boxes shape. + - Support tensor-like operations. + - Define abstract functions for 2D boxes. + + In ``__init__`` , ``BaseBoxes`` verifies the validity of the data shape + w.r.t ``box_dim``. The tensor with the dimension >= 2 and the length + of the last dimension being ``box_dim`` will be regarded as valid. + ``BaseBoxes`` will restore them at the field ``tensor``. It's necessary + to override ``box_dim`` in subclass to guarantee the data shape is + correct. + + There are many basic tensor-like functions implemented in ``BaseBoxes``. + In most cases, users can operate ``BaseBoxes`` instance like a normal + tensor. To protect the validity of data shape, All tensor-like functions + cannot modify the last dimension of ``self.tensor``. + + When creating a new box type, users need to inherit from ``BaseBoxes`` + and override abstract methods and specify the ``box_dim``. Then, register + the new box type by using the decorator ``register_box_type``. + + Args: + data (Tensor or np.ndarray or Sequence): The box data with shape + (..., box_dim). + dtype (torch.dtype, Optional): data type of boxes. Defaults to None. + device (str or torch.device, Optional): device of boxes. + Default to None. + clone (bool): Whether clone ``boxes`` or not. Defaults to True. 
+ """ + + # Used to verify the last dimension length + # Should override it in subclass. + box_dim: int = 0 + + def __init__(self, + data: Union[Tensor, np.ndarray, Sequence], + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceType] = None, + clone: bool = True) -> None: + if isinstance(data, (np.ndarray, Tensor, Sequence)): + data = torch.as_tensor(data) + else: + raise TypeError('boxes should be Tensor, ndarray, or Sequence, ', + f'but got {type(data)}') + + if device is not None or dtype is not None: + data = data.to(dtype=dtype, device=device) + # Clone the data to avoid potential bugs + if clone: + data = data.clone() + # handle the empty input like [] + if data.numel() == 0: + data = data.reshape((-1, self.box_dim)) + + assert data.dim() >= 2 and data.size(-1) == self.box_dim, \ + ('The boxes dimension must >= 2 and the length of the last ' + f'dimension must be {self.box_dim}, but got boxes with ' + f'shape {data.shape}.') + self.tensor = data + + def convert_to(self, dst_type: Union[str, type]) -> 'BaseBoxes': + """Convert self to another box type. + + Args: + dst_type (str or type): destination box type. + + Returns: + :obj:`BaseBoxes`: destination box type object . + """ + from .box_type import convert_box_type + return convert_box_type(self, dst_type=dst_type) + + def empty_boxes(self: T, + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceType] = None) -> T: + """Create empty box. + + Args: + dtype (torch.dtype, Optional): data type of boxes. + device (str or torch.device, Optional): device of boxes. + + Returns: + T: empty boxes with shape of (0, box_dim). + """ + empty_box = self.tensor.new_zeros( + 0, self.box_dim, dtype=dtype, device=device) + return type(self)(empty_box, clone=False) + + def fake_boxes(self: T, + sizes: Tuple[int], + fill: float = 0, + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceType] = None) -> T: + """Create fake boxes with specific sizes and fill values. + + Args: + sizes (Tuple[int]): The size of fake boxes. The last value must + be equal with ``self.box_dim``. + fill (float): filling value. Defaults to 0. + dtype (torch.dtype, Optional): data type of boxes. + device (str or torch.device, Optional): device of boxes. + + Returns: + T: Fake boxes with shape of ``sizes``. + """ + fake_boxes = self.tensor.new_full( + sizes, fill, dtype=dtype, device=device) + return type(self)(fake_boxes, clone=False) + + def __getitem__(self: T, index: IndexType) -> T: + """Rewrite getitem to protect the last dimension shape.""" + boxes = self.tensor + if isinstance(index, np.ndarray): + index = torch.as_tensor(index, device=self.device) + if isinstance(index, Tensor) and index.dtype == torch.bool: + assert index.dim() < boxes.dim() + elif isinstance(index, tuple): + assert len(index) < boxes.dim() + # `Ellipsis`(...) is commonly used in index like [None, ...]. + # When `Ellipsis` is in index, it must be the last item. 
+ if Ellipsis in index: + assert index[-1] is Ellipsis + + boxes = boxes[index] + if boxes.dim() == 1: + boxes = boxes.reshape(1, -1) + return type(self)(boxes, clone=False) + + def __setitem__(self: T, index: IndexType, values: Union[Tensor, T]) -> T: + """Rewrite setitem to protect the last dimension shape.""" + assert type(values) is type(self), \ + 'The value to be set must be the same box type as self' + values = values.tensor + + if isinstance(index, np.ndarray): + index = torch.as_tensor(index, device=self.device) + if isinstance(index, Tensor) and index.dtype == torch.bool: + assert index.dim() < self.tensor.dim() + elif isinstance(index, tuple): + assert len(index) < self.tensor.dim() + # `Ellipsis`(...) is commonly used in index like [None, ...]. + # When `Ellipsis` is in index, it must be the last item. + if Ellipsis in index: + assert index[-1] is Ellipsis + + self.tensor[index] = values + + def __len__(self) -> int: + """Return the length of self.tensor first dimension.""" + return self.tensor.size(0) + + def __deepcopy__(self, memo): + """Only clone the ``self.tensor`` when applying deepcopy.""" + cls = self.__class__ + other = cls.__new__(cls) + memo[id(self)] = other + other.tensor = self.tensor.clone() + return other + + def __repr__(self) -> str: + """Return a strings that describes the object.""" + return self.__class__.__name__ + '(\n' + str(self.tensor) + ')' + + def new_tensor(self, *args, **kwargs) -> Tensor: + """Reload ``new_tensor`` from self.tensor.""" + return self.tensor.new_tensor(*args, **kwargs) + + def new_full(self, *args, **kwargs) -> Tensor: + """Reload ``new_full`` from self.tensor.""" + return self.tensor.new_full(*args, **kwargs) + + def new_empty(self, *args, **kwargs) -> Tensor: + """Reload ``new_empty`` from self.tensor.""" + return self.tensor.new_empty(*args, **kwargs) + + def new_ones(self, *args, **kwargs) -> Tensor: + """Reload ``new_ones`` from self.tensor.""" + return self.tensor.new_ones(*args, **kwargs) + + def new_zeros(self, *args, **kwargs) -> Tensor: + """Reload ``new_zeros`` from self.tensor.""" + return self.tensor.new_zeros(*args, **kwargs) + + def size(self, dim: Optional[int] = None) -> Union[int, torch.Size]: + """Reload new_zeros from self.tensor.""" + # self.tensor.size(dim) cannot work when dim=None. 
+ return self.tensor.size() if dim is None else self.tensor.size(dim) + + def dim(self) -> int: + """Reload ``dim`` from self.tensor.""" + return self.tensor.dim() + + @property + def device(self) -> torch.device: + """Reload ``device`` from self.tensor.""" + return self.tensor.device + + @property + def dtype(self) -> torch.dtype: + """Reload ``dtype`` from self.tensor.""" + return self.tensor.dtype + + @property + def shape(self) -> torch.Size: + return self.tensor.shape + + def numel(self) -> int: + """Reload ``numel`` from self.tensor.""" + return self.tensor.numel() + + def numpy(self) -> np.ndarray: + """Reload ``numpy`` from self.tensor.""" + return self.tensor.numpy() + + def to(self: T, *args, **kwargs) -> T: + """Reload ``to`` from self.tensor.""" + return type(self)(self.tensor.to(*args, **kwargs), clone=False) + + def cpu(self: T) -> T: + """Reload ``cpu`` from self.tensor.""" + return type(self)(self.tensor.cpu(), clone=False) + + def cuda(self: T, *args, **kwargs) -> T: + """Reload ``cuda`` from self.tensor.""" + return type(self)(self.tensor.cuda(*args, **kwargs), clone=False) + + def clone(self: T) -> T: + """Reload ``clone`` from self.tensor.""" + return type(self)(self.tensor) + + def detach(self: T) -> T: + """Reload ``detach`` from self.tensor.""" + return type(self)(self.tensor.detach(), clone=False) + + def view(self: T, *shape: Tuple[int]) -> T: + """Reload ``view`` from self.tensor.""" + return type(self)(self.tensor.view(shape), clone=False) + + def reshape(self: T, *shape: Tuple[int]) -> T: + """Reload ``reshape`` from self.tensor.""" + return type(self)(self.tensor.reshape(shape), clone=False) + + def expand(self: T, *sizes: Tuple[int]) -> T: + """Reload ``expand`` from self.tensor.""" + return type(self)(self.tensor.expand(sizes), clone=False) + + def repeat(self: T, *sizes: Tuple[int]) -> T: + """Reload ``repeat`` from self.tensor.""" + return type(self)(self.tensor.repeat(sizes), clone=False) + + def transpose(self: T, dim0: int, dim1: int) -> T: + """Reload ``transpose`` from self.tensor.""" + ndim = self.tensor.dim() + assert dim0 != -1 and dim0 != ndim - 1 + assert dim1 != -1 and dim1 != ndim - 1 + return type(self)(self.tensor.transpose(dim0, dim1), clone=False) + + def permute(self: T, *dims: Tuple[int]) -> T: + """Reload ``permute`` from self.tensor.""" + assert dims[-1] == -1 or dims[-1] == self.tensor.dim() - 1 + return type(self)(self.tensor.permute(dims), clone=False) + + def split(self: T, + split_size_or_sections: Union[int, Sequence[int]], + dim: int = 0) -> List[T]: + """Reload ``split`` from self.tensor.""" + assert dim != -1 and dim != self.tensor.dim() - 1 + boxes_list = self.tensor.split(split_size_or_sections, dim=dim) + return [type(self)(boxes, clone=False) for boxes in boxes_list] + + def chunk(self: T, chunks: int, dim: int = 0) -> List[T]: + """Reload ``chunk`` from self.tensor.""" + assert dim != -1 and dim != self.tensor.dim() - 1 + boxes_list = self.tensor.chunk(chunks, dim=dim) + return [type(self)(boxes, clone=False) for boxes in boxes_list] + + def unbind(self: T, dim: int = 0) -> T: + """Reload ``unbind`` from self.tensor.""" + assert dim != -1 and dim != self.tensor.dim() - 1 + boxes_list = self.tensor.unbind(dim=dim) + return [type(self)(boxes, clone=False) for boxes in boxes_list] + + def flatten(self: T, start_dim: int = 0, end_dim: int = -2) -> T: + """Reload ``flatten`` from self.tensor.""" + assert end_dim != -1 and end_dim != self.tensor.dim() - 1 + return type(self)(self.tensor.flatten(start_dim, end_dim), clone=False) + 
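To make the last-dimension protection described in the `BaseBoxes` docstring concrete, here is a short usage sketch, assuming `HorizontalBoxes` (the 4-dim subclass imported in the bbox `__init__` above) and random data: indexing a single row keeps the trailing `box_dim` axis rather than collapsing to a 1-D tensor, exactly as `__getitem__` above guarantees.

import torch
from mmdet.structures.bbox import HorizontalBoxes  # concrete BaseBoxes subclass

boxes = HorizontalBoxes(torch.rand(5, 4))
single = boxes[0]                     # __getitem__ restores the row axis
assert single.tensor.shape == (1, 4)  # never a bare (4,) tensor
assert len(boxes) == 5 and boxes.shape == (5, 4)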
+    def squeeze(self: T, dim: Optional[int] = None) -> T:
+        """Reload ``squeeze`` from self.tensor."""
+        boxes = self.tensor.squeeze() if dim is None else \
+            self.tensor.squeeze(dim)
+        return type(self)(boxes, clone=False)
+
+    def unsqueeze(self: T, dim: int) -> T:
+        """Reload ``unsqueeze`` from self.tensor."""
+        assert dim != -1 and dim != self.tensor.dim()
+        return type(self)(self.tensor.unsqueeze(dim), clone=False)
+
+    @classmethod
+    def cat(cls: Type[T], box_list: Sequence[T], dim: int = 0) -> T:
+        """Concatenates a box instance list into one single box instance.
+        Similar to ``torch.cat``.
+
+        Args:
+            box_list (Sequence[T]): A sequence of box instances.
+            dim (int): The dimension over which the boxes are concatenated.
+                Defaults to 0.
+
+        Returns:
+            T: Concatenated box instance.
+        """
+        assert isinstance(box_list, Sequence)
+        if len(box_list) == 0:
+            raise ValueError('box_list should not be an empty list.')
+
+        assert dim != -1 and dim != box_list[0].dim() - 1
+        assert all(isinstance(boxes, cls) for boxes in box_list)
+
+        th_box_list = [boxes.tensor for boxes in box_list]
+        return cls(torch.cat(th_box_list, dim=dim), clone=False)
+
+    @classmethod
+    def stack(cls: Type[T], box_list: Sequence[T], dim: int = 0) -> T:
+        """Concatenates a sequence of box instances along a new dimension.
+        Similar to ``torch.stack``.
+
+        Args:
+            box_list (Sequence[T]): A sequence of box instances.
+            dim (int): Dimension to insert. Defaults to 0.
+
+        Returns:
+            T: Concatenated box instance.
+        """
+        assert isinstance(box_list, Sequence)
+        if len(box_list) == 0:
+            raise ValueError('box_list should not be an empty list.')
+
+        assert dim != -1 and dim != box_list[0].dim()
+        assert all(isinstance(boxes, cls) for boxes in box_list)
+
+        th_box_list = [boxes.tensor for boxes in box_list]
+        return cls(torch.stack(th_box_list, dim=dim), clone=False)
+
+    @abstractproperty
+    def centers(self) -> Tensor:
+        """Return a tensor representing the centers of boxes."""
+        pass
+
+    @abstractproperty
+    def areas(self) -> Tensor:
+        """Return a tensor representing the areas of boxes."""
+        pass
+
+    @abstractproperty
+    def widths(self) -> Tensor:
+        """Return a tensor representing the widths of boxes."""
+        pass
+
+    @abstractproperty
+    def heights(self) -> Tensor:
+        """Return a tensor representing the heights of boxes."""
+        pass
+
+    @abstractmethod
+    def flip_(self,
+              img_shape: Tuple[int, int],
+              direction: str = 'horizontal') -> None:
+        """Flip boxes horizontally or vertically in-place.
+
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+            direction (str): Flip direction, options are "horizontal",
+                "vertical" and "diagonal". Defaults to "horizontal".
+        """
+        pass
+
+    @abstractmethod
+    def translate_(self, distances: Tuple[float, float]) -> None:
+        """Translate boxes in-place.
+
+        Args:
+            distances (Tuple[float, float]): translate distances. The first
+                is horizontal distance and the second is vertical distance.
+        """
+        pass
+
+    @abstractmethod
+    def clip_(self, img_shape: Tuple[int, int]) -> None:
+        """Clip boxes according to the image shape in-place.
+
+        Args:
+            img_shape (Tuple[int, int]): A tuple of image height and width.
+        """
+        pass
+
+    @abstractmethod
+    def rotate_(self, center: Tuple[float, float], angle: float) -> None:
+        """Rotate all boxes in-place.
+
+        Args:
+            center (Tuple[float, float]): Rotation origin.
+            angle (float): Rotation angle represented in degrees. Positive
+                values mean clockwise rotation.
+ """ + pass + + @abstractmethod + def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None: + """Geometric transformat boxes in-place. + + Args: + homography_matrix (Tensor or np.ndarray]): + Shape (3, 3) for geometric transformation. + """ + pass + + @abstractmethod + def rescale_(self, scale_factor: Tuple[float, float]) -> None: + """Rescale boxes w.r.t. rescale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling boxes. + The length should be 2. + """ + pass + + @abstractmethod + def resize_(self, scale_factor: Tuple[float, float]) -> None: + """Resize the box width and height w.r.t scale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling box + shapes. The length should be 2. + """ + pass + + @abstractmethod + def is_inside(self, + img_shape: Tuple[int, int], + all_inside: bool = False, + allowed_border: int = 0) -> BoolTensor: + """Find boxes inside the image. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + all_inside (bool): Whether the boxes are all inside the image or + part inside the image. Defaults to False. + allowed_border (int): Boxes that extend beyond the image shape + boundary by more than ``allowed_border`` are considered + "outside" Defaults to 0. + Returns: + BoolTensor: A BoolTensor indicating whether the box is inside + the image. Assuming the original boxes have shape (m, n, box_dim), + the output has shape (m, n). + """ + pass + + @abstractmethod + def find_inside_points(self, + points: Tensor, + is_aligned: bool = False) -> BoolTensor: + """Find inside box points. Boxes dimension must be 2. + + Args: + points (Tensor): Points coordinates. Has shape of (m, 2). + is_aligned (bool): Whether ``points`` has been aligned with boxes + or not. If True, the length of boxes and ``points`` should be + the same. Defaults to False. + + Returns: + BoolTensor: A BoolTensor indicating whether a point is inside + boxes. Assuming the boxes has shape of (n, box_dim), if + ``is_aligned`` is False. The index has shape of (m, n). If + ``is_aligned`` is True, m should be equal to n and the index has + shape of (m, ). + """ + pass + + @abstractstaticmethod + def overlaps(boxes1: 'BaseBoxes', + boxes2: 'BaseBoxes', + mode: str = 'iou', + is_aligned: bool = False, + eps: float = 1e-6) -> Tensor: + """Calculate overlap between two set of boxes with their types + converted to the present box type. + + Args: + boxes1 (:obj:`BaseBoxes`): BaseBoxes with shape of (m, box_dim) + or empty. + boxes2 (:obj:`BaseBoxes`): BaseBoxes with shape of (n, box_dim) + or empty. + mode (str): "iou" (intersection over union), "iof" (intersection + over foreground). Defaults to "iou". + is_aligned (bool): If True, then m and n must be equal. Defaults + to False. + eps (float): A value added to the denominator for numerical + stability. Defaults to 1e-6. 
+
+ Returns:
+ Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
+ """
+ pass
+
+ @abstractstaticmethod
+ def from_instance_masks(masks: MaskType) -> 'BaseBoxes':
+ """Create boxes from instance masks.
+
+ Args:
+ masks (:obj:`BitmapMasks` or :obj:`PolygonMasks`): BitmapMasks or
+ PolygonMasks instance with length of n.
+
+ Returns:
+ :obj:`BaseBoxes`: Converted boxes with shape of (n, box_dim).
+ """
+ pass
diff --git a/mmdetection/mmdet/structures/bbox/bbox_overlaps.py b/mmdetection/mmdet/structures/bbox/bbox_overlaps.py
new file mode 100644
index 0000000..8e3435d
--- /dev/null
+++ b/mmdetection/mmdet/structures/bbox/bbox_overlaps.py
@@ -0,0 +1,199 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def fp16_clamp(x, min=None, max=None):
+ if not x.is_cuda and x.dtype == torch.float16:
+ # clamp for cpu float16; fp16 tensors have no CPU clamp implementation
+ return x.float().clamp(min, max).half()
+
+ return x.clamp(min, max)
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
+ """Calculate overlap between two sets of bboxes.
+
+ FP16 contributed by https://github.com/open-mmlab/mmdetection/pull/4889
+ Note:
+ Assume bboxes1 is M x 4 and bboxes2 is N x 4. When mode is 'iou',
+ the following intermediate tensors are created while computing IoU
+ with the bbox_overlaps function:
+
+ 1) is_aligned is False
+ area1: M x 1
+ area2: N x 1
+ lt: M x N x 2
+ rb: M x N x 2
+ wh: M x N x 2
+ overlap: M x N x 1
+ union: M x N x 1
+ ious: M x N x 1
+
+ Total memory:
+ S = (9 x N x M + N + M) * 4 Byte
+
+ When using FP16, the saving is:
+ R = (9 x N x M + N + M) * 4 / 2 Byte
+ R is always larger than (N + M) * 4 * 2 when N and M >= 1, because
+ N + M <= N * M < 3 * N * M when N >= 2 and M >= 2, and
+ N + 1 < 3 * N when N or M is 1.
+
+ Given M = 40 (ground truth) and N = 400000 (three anchor boxes
+ per grid cell, FPN, R-CNN-style heads):
+ R = 275 MB (for a single image)
+
+ In a special dense-detection case with M = 512 (ground truth):
+ R = 3516 MB = 3.43 GB
+
+ With batch size B, the saving becomes B x R, so without FP16,
+ CUDA memory runs out frequently.
+
+ Experiments on GeForce RTX 2080Ti (11019 MiB):
+
+ | dtype | M | N | Use | Real | Ideal |
+ |:----:|:----:|:----:|:----:|:----:|:----:|
+ | FP32 | 512 | 400000 | 8020 MiB | -- | -- |
+ | FP16 | 512 | 400000 | 4504 MiB | 3516 MiB | 3516 MiB |
+ | FP32 | 40 | 400000 | 1540 MiB | -- | -- |
+ | FP16 | 40 | 400000 | 1264 MiB | 276 MiB | 275 MiB |
+
+ 2) is_aligned is True
+ area1: N x 1
+ area2: N x 1
+ lt: N x 2
+ rb: N x 2
+ wh: N x 2
+ overlap: N x 1
+ union: N x 1
+ ious: N x 1
+
+ Total memory:
+ S = 11 x N * 4 Byte
+
+ When using FP16, the saving is:
+ R = 11 x N * 4 / 2 Byte
+
+ The same holds for 'giou' mode, which uses even more memory than
+ 'iou'.
+
+ Time-wise, FP16 is generally faster than FP32.
+
+ When gpu_assign_thr is not -1, assigning on CPU takes more time but
+ does not reduce memory, whereas FP16 halves the memory while keeping
+ the speed.
+
+ If ``is_aligned`` is ``False``, calculate the overlaps between each bbox
+ of bboxes1 and bboxes2; otherwise, calculate the overlaps between each
+ aligned pair of bboxes1 and bboxes2.
+
+ Args:
+ bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
+ bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
+ B indicates the batch dim, in shape (B1, B2, ..., Bn).
+ If ``is_aligned`` is ``True``, then m and n must be equal.
+ mode (str): "iou" (intersection over union), "iof" (intersection over
+ foreground) or "giou" (generalized intersection over union).
+ Default "iou".
+ is_aligned (bool, optional): If True, then m and n must be equal. + Default False. + eps (float, optional): A value added to the denominator for numerical + stability. Default 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + + Example: + >>> empty = torch.empty(0, 4) + >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}' + # Either the boxes are empty or the length of boxes' last dimension is 4 + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( + bboxes1[..., 3] - bboxes1[..., 1]) + area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( + bboxes2[..., 3] - bboxes2[..., 1]) + + if is_aligned: + lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] + rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] + + wh = fp16_clamp(rb - lt, min=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2]) + enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:]) + else: + lt = torch.max(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) # [B, rows, cols, 2] + rb = torch.min(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) # [B, rows, cols, 2] + + wh = fp16_clamp(rb - lt, min=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + else: + union = area1[..., None] + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) + enclosed_rb = torch.max(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) + + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + if mode in ['iou', 'iof']: + return ious + # calculate gious + enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] + enclose_area = torch.max(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git a/mmdetection/mmdet/structures/bbox/box_type.py b/mmdetection/mmdet/structures/bbox/box_type.py new file mode 100644 index 0000000..c7eb549 --- /dev/null +++ b/mmdetection/mmdet/structures/bbox/box_type.py @@ -0,0 +1,296 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
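The file that starts here, box_type.py, keeps a small name-to-class registry plus converter functions. A hedged sketch of how it is meant to be queried, assuming HorizontalBoxes (added later in this patch) has registered itself under the name 'hbox':

from mmdet.structures.bbox.box_type import box_converters, get_box_type
from mmdet.structures.bbox.horizontal_boxes import HorizontalBoxes

name, cls = get_box_type('hbox')               # look up by name
name2, cls2 = get_box_type(HorizontalBoxes)    # or by class
assert name == name2 == 'hbox' and cls is cls2 is HorizontalBoxes

# Converters are keyed as '<src>2<dst>'; e.g. a registered hbox -> rbox
# converter would be stored under box_converters['hbox2rbox'].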
+from typing import Callable, Optional, Tuple, Type, Union + +import numpy as np +import torch +from torch import Tensor + +from .base_boxes import BaseBoxes + +BoxType = Union[np.ndarray, Tensor, BaseBoxes] + +box_types: dict = {} +_box_type_to_name: dict = {} +box_converters: dict = {} + + +def _register_box(name: str, box_type: Type, force: bool = False) -> None: + """Register a box type. + + Args: + name (str): The name of box type. + box_type (type): Box mode class to be registered. + force (bool): Whether to override an existing class with the same + name. Defaults to False. + """ + assert issubclass(box_type, BaseBoxes) + name = name.lower() + + if not force and (name in box_types or box_type in _box_type_to_name): + raise KeyError(f'box type {name} has been registered') + elif name in box_types: + _box_type = box_types.pop(name) + _box_type_to_name.pop(_box_type) + elif box_type in _box_type_to_name: + _name = _box_type_to_name.pop(box_type) + box_types.pop(_name) + + box_types[name] = box_type + _box_type_to_name[box_type] = name + + +def register_box(name: str, + box_type: Type = None, + force: bool = False) -> Union[Type, Callable]: + """Register a box type. + + A record will be added to ``bbox_types``, whose key is the box type name + and value is the box type itself. Simultaneously, a reverse dictionary + ``_box_type_to_name`` will be updated. It can be used as a decorator or + a normal function. + + Args: + name (str): The name of box type. + bbox_type (type, Optional): Box type class to be registered. + Defaults to None. + force (bool): Whether to override the existing box type with the same + name. Defaults to False. + + Examples: + >>> from mmdet.structures.bbox import register_box + >>> from mmdet.structures.bbox import BaseBoxes + + >>> # as a decorator + >>> @register_box('hbox') + >>> class HorizontalBoxes(BaseBoxes): + >>> pass + + >>> # as a normal function + >>> class RotatedBoxes(BaseBoxes): + >>> pass + >>> register_box('rbox', RotatedBoxes) + """ + if not isinstance(force, bool): + raise TypeError(f'force must be a boolean, but got {type(force)}') + + # use it as a normal method: register_box(name, box_type=BoxCls) + if box_type is not None: + _register_box(name=name, box_type=box_type, force=force) + return box_type + + # use it as a decorator: @register_box(name) + def _register(cls): + _register_box(name=name, box_type=cls, force=force) + return cls + + return _register + + +def _register_box_converter(src_type: Union[str, type], + dst_type: Union[str, type], + converter: Callable, + force: bool = False) -> None: + """Register a box converter. + + Args: + src_type (str or type): source box type name or class. + dst_type (str or type): destination box type name or class. + converter (Callable): Convert function. + force (bool): Whether to override the existing box type with the same + name. Defaults to False. + """ + assert callable(converter) + src_type_name, _ = get_box_type(src_type) + dst_type_name, _ = get_box_type(dst_type) + + converter_name = src_type_name + '2' + dst_type_name + if not force and converter_name in box_converters: + raise KeyError(f'The box converter from {src_type_name} to ' + f'{dst_type_name} has been registered.') + + box_converters[converter_name] = converter + + +def register_box_converter(src_type: Union[str, type], + dst_type: Union[str, type], + converter: Optional[Callable] = None, + force: bool = False) -> Callable: + """Register a box converter. 
+ + A record will be added to ``box_converter``, whose key is + '{src_type_name}2{dst_type_name}' and value is the convert function. + It can be used as a decorator or a normal function. + + Args: + src_type (str or type): source box type name or class. + dst_type (str or type): destination box type name or class. + converter (Callable): Convert function. Defaults to None. + force (bool): Whether to override the existing box type with the same + name. Defaults to False. + + Examples: + >>> from mmdet.structures.bbox import register_box_converter + >>> # as a decorator + >>> @register_box_converter('hbox', 'rbox') + >>> def converter_A(boxes): + >>> pass + + >>> # as a normal function + >>> def converter_B(boxes): + >>> pass + >>> register_box_converter('rbox', 'hbox', converter_B) + """ + if not isinstance(force, bool): + raise TypeError(f'force must be a boolean, but got {type(force)}') + + # use it as a normal method: + # register_box_converter(src_type, dst_type, converter=Func) + if converter is not None: + _register_box_converter( + src_type=src_type, + dst_type=dst_type, + converter=converter, + force=force) + return converter + + # use it as a decorator: @register_box_converter(name) + def _register(func): + _register_box_converter( + src_type=src_type, dst_type=dst_type, converter=func, force=force) + return func + + return _register + + +def get_box_type(box_type: Union[str, type]) -> Tuple[str, type]: + """get both box type name and class. + + Args: + box_type (str or type): Single box type name or class. + + Returns: + Tuple[str, type]: A tuple of box type name and class. + """ + if isinstance(box_type, str): + type_name = box_type.lower() + assert type_name in box_types, \ + f"Box type {type_name} hasn't been registered in box_types." + type_cls = box_types[type_name] + elif issubclass(box_type, BaseBoxes): + assert box_type in _box_type_to_name, \ + f"Box type {box_type} hasn't been registered in box_types." + type_name = _box_type_to_name[box_type] + type_cls = box_type + else: + raise KeyError('box_type must be a str or class inheriting from ' + f'BaseBoxes, but got {type(box_type)}.') + return type_name, type_cls + + +def convert_box_type(boxes: BoxType, + *, + src_type: Union[str, type] = None, + dst_type: Union[str, type] = None) -> BoxType: + """Convert boxes from source type to destination type. + + If ``boxes`` is a instance of BaseBoxes, the ``src_type`` will be set + as the type of ``boxes``. + + Args: + boxes (np.ndarray or Tensor or :obj:`BaseBoxes`): boxes need to + convert. + src_type (str or type, Optional): source box type. Defaults to None. + dst_type (str or type, Optional): destination box type. Defaults to + None. + + Returns: + Union[np.ndarray, Tensor, :obj:`BaseBoxes`]: Converted boxes. It's type + is consistent with the input's type. 
+ """ + assert dst_type is not None + dst_type_name, dst_type_cls = get_box_type(dst_type) + + is_box_cls = False + is_numpy = False + if isinstance(boxes, BaseBoxes): + src_type_name, _ = get_box_type(type(boxes)) + is_box_cls = True + elif isinstance(boxes, (Tensor, np.ndarray)): + assert src_type is not None + src_type_name, _ = get_box_type(src_type) + if isinstance(boxes, np.ndarray): + is_numpy = True + else: + raise TypeError('boxes must be a instance of BaseBoxes, Tensor or ' + f'ndarray, but get {type(boxes)}.') + + if src_type_name == dst_type_name: + return boxes + + converter_name = src_type_name + '2' + dst_type_name + assert converter_name in box_converters, \ + "Convert function hasn't been registered in box_converters." + converter = box_converters[converter_name] + + if is_box_cls: + boxes = converter(boxes.tensor) + return dst_type_cls(boxes) + elif is_numpy: + boxes = converter(torch.from_numpy(boxes)) + return boxes.numpy() + else: + return converter(boxes) + + +def autocast_box_type(dst_box_type='hbox') -> Callable: + """A decorator which automatically casts results['gt_bboxes'] to the + destination box type. + + It commenly used in mmdet.datasets.transforms to make the transforms up- + compatible with the np.ndarray type of results['gt_bboxes']. + + The speed of processing of np.ndarray and BaseBoxes data are the same: + + - np.ndarray: 0.0509 img/s + - BaseBoxes: 0.0551 img/s + + Args: + dst_box_type (str): Destination box type. + """ + _, box_type_cls = get_box_type(dst_box_type) + + def decorator(func: Callable) -> Callable: + + def wrapper(self, results: dict, *args, **kwargs) -> dict: + if ('gt_bboxes' not in results + or isinstance(results['gt_bboxes'], BaseBoxes)): + return func(self, results) + elif isinstance(results['gt_bboxes'], np.ndarray): + results['gt_bboxes'] = box_type_cls( + results['gt_bboxes'], clone=False) + if 'mix_results' in results: + for res in results['mix_results']: + if isinstance(res['gt_bboxes'], np.ndarray): + res['gt_bboxes'] = box_type_cls( + res['gt_bboxes'], clone=False) + + _results = func(self, results, *args, **kwargs) + + # In some cases, the function will process gt_bboxes in-place + # Simultaneously convert inputting and outputting gt_bboxes + # back to np.ndarray + if isinstance(_results, dict) and 'gt_bboxes' in _results: + if isinstance(_results['gt_bboxes'], BaseBoxes): + _results['gt_bboxes'] = _results['gt_bboxes'].numpy() + if isinstance(results['gt_bboxes'], BaseBoxes): + results['gt_bboxes'] = results['gt_bboxes'].numpy() + return _results + else: + raise TypeError( + "auto_box_type requires results['gt_bboxes'] to " + 'be BaseBoxes or np.ndarray, but got ' + f"{type(results['gt_bboxes'])}") + + return wrapper + + return decorator diff --git a/mmdetection/mmdet/structures/bbox/horizontal_boxes.py b/mmdetection/mmdet/structures/bbox/horizontal_boxes.py new file mode 100644 index 0000000..b3a7851 --- /dev/null +++ b/mmdetection/mmdet/structures/bbox/horizontal_boxes.py @@ -0,0 +1,432 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Tuple, TypeVar, Union + +import cv2 +import numpy as np +import torch +from torch import BoolTensor, Tensor + +from mmdet.structures.mask.structures import BitmapMasks, PolygonMasks +from .base_boxes import BaseBoxes +from .bbox_overlaps import bbox_overlaps +from .box_type import register_box + +T = TypeVar('T') +DeviceType = Union[str, torch.device] +MaskType = Union[BitmapMasks, PolygonMasks] + + +@register_box(name='hbox') +class HorizontalBoxes(BaseBoxes): + """The horizontal box class used in MMDetection by default. + + The ``box_dim`` of ``HorizontalBoxes`` is 4, which means the length of + the last dimension of the data should be 4. Two modes of box data are + supported in ``HorizontalBoxes``: + + - 'xyxy': Each row of data indicates (x1, y1, x2, y2), which are the + coordinates of the left-top and right-bottom points. + - 'cxcywh': Each row of data indicates (x, y, w, h), where (x, y) are the + coordinates of the box centers and (w, h) are the width and height. + + ``HorizontalBoxes`` only restores 'xyxy' mode of data. If the the data is + in 'cxcywh' mode, users need to input ``in_mode='cxcywh'`` and The code + will convert the 'cxcywh' data to 'xyxy' automatically. + + Args: + data (Tensor or np.ndarray or Sequence): The box data with shape of + (..., 4). + dtype (torch.dtype, Optional): data type of boxes. Defaults to None. + device (str or torch.device, Optional): device of boxes. + Default to None. + clone (bool): Whether clone ``boxes`` or not. Defaults to True. + mode (str, Optional): the mode of boxes. If it is 'cxcywh', the + `data` will be converted to 'xyxy' mode. Defaults to None. + """ + + box_dim: int = 4 + + def __init__(self, + data: Union[Tensor, np.ndarray], + dtype: torch.dtype = None, + device: DeviceType = None, + clone: bool = True, + in_mode: Optional[str] = None) -> None: + super().__init__(data=data, dtype=dtype, device=device, clone=clone) + if isinstance(in_mode, str): + if in_mode not in ('xyxy', 'cxcywh'): + raise ValueError(f'Get invalid mode {in_mode}.') + if in_mode == 'cxcywh': + self.tensor = self.cxcywh_to_xyxy(self.tensor) + + @staticmethod + def cxcywh_to_xyxy(boxes: Tensor) -> Tensor: + """Convert box coordinates from (cx, cy, w, h) to (x1, y1, x2, y2). + + Args: + boxes (Tensor): cxcywh boxes tensor with shape of (..., 4). + + Returns: + Tensor: xyxy boxes tensor with shape of (..., 4). + """ + ctr, wh = boxes.split((2, 2), dim=-1) + return torch.cat([(ctr - wh / 2), (ctr + wh / 2)], dim=-1) + + @staticmethod + def xyxy_to_cxcywh(boxes: Tensor) -> Tensor: + """Convert box coordinates from (x1, y1, x2, y2) to (cx, cy, w, h). + + Args: + boxes (Tensor): xyxy boxes tensor with shape of (..., 4). + + Returns: + Tensor: cxcywh boxes tensor with shape of (..., 4). 
+ """ + xy1, xy2 = boxes.split((2, 2), dim=-1) + return torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + + @property + def cxcywh(self) -> Tensor: + """Return a tensor representing the cxcywh boxes.""" + return self.xyxy_to_cxcywh(self.tensor) + + @property + def centers(self) -> Tensor: + """Return a tensor representing the centers of boxes.""" + boxes = self.tensor + return (boxes[..., :2] + boxes[..., 2:]) / 2 + + @property + def areas(self) -> Tensor: + """Return a tensor representing the areas of boxes.""" + boxes = self.tensor + return (boxes[..., 2] - boxes[..., 0]) * ( + boxes[..., 3] - boxes[..., 1]) + + @property + def widths(self) -> Tensor: + """Return a tensor representing the widths of boxes.""" + boxes = self.tensor + return boxes[..., 2] - boxes[..., 0] + + @property + def heights(self) -> Tensor: + """Return a tensor representing the heights of boxes.""" + boxes = self.tensor + return boxes[..., 3] - boxes[..., 1] + + def flip_(self, + img_shape: Tuple[int, int], + direction: str = 'horizontal') -> None: + """Flip boxes horizontally or vertically in-place. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + direction (str): Flip direction, options are "horizontal", + "vertical" and "diagonal". Defaults to "horizontal" + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + flipped = self.tensor + boxes = flipped.clone() + if direction == 'horizontal': + flipped[..., 0] = img_shape[1] - boxes[..., 2] + flipped[..., 2] = img_shape[1] - boxes[..., 0] + elif direction == 'vertical': + flipped[..., 1] = img_shape[0] - boxes[..., 3] + flipped[..., 3] = img_shape[0] - boxes[..., 1] + else: + flipped[..., 0] = img_shape[1] - boxes[..., 2] + flipped[..., 1] = img_shape[0] - boxes[..., 3] + flipped[..., 2] = img_shape[1] - boxes[..., 0] + flipped[..., 3] = img_shape[0] - boxes[..., 1] + + def translate_(self, distances: Tuple[float, float]) -> None: + """Translate boxes in-place. + + Args: + distances (Tuple[float, float]): translate distances. The first + is horizontal distance and the second is vertical distance. + """ + boxes = self.tensor + assert len(distances) == 2 + self.tensor = boxes + boxes.new_tensor(distances).repeat(2) + + def clip_(self, img_shape: Tuple[int, int]) -> None: + """Clip boxes according to the image shape in-place. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + """ + boxes = self.tensor + boxes[..., 0::2] = boxes[..., 0::2].clamp(0, img_shape[1]) + boxes[..., 1::2] = boxes[..., 1::2].clamp(0, img_shape[0]) + + def rotate_(self, center: Tuple[float, float], angle: float) -> None: + """Rotate all boxes in-place. + + Args: + center (Tuple[float, float]): Rotation origin. + angle (float): Rotation angle represented in degrees. Positive + values mean clockwise rotation. + """ + boxes = self.tensor + rotation_matrix = boxes.new_tensor( + cv2.getRotationMatrix2D(center, -angle, 1)) + + corners = self.hbox2corner(boxes) + corners = torch.cat( + [corners, corners.new_ones(*corners.shape[:-1], 1)], dim=-1) + corners_T = torch.transpose(corners, -1, -2) + corners_T = torch.matmul(rotation_matrix, corners_T) + corners = torch.transpose(corners_T, -1, -2) + self.tensor = self.corner2hbox(corners) + + def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None: + """Geometric transformat boxes in-place. + + Args: + homography_matrix (Tensor or np.ndarray]): + Shape (3, 3) for geometric transformation. 
+ """ + boxes = self.tensor + if isinstance(homography_matrix, np.ndarray): + homography_matrix = boxes.new_tensor(homography_matrix) + corners = self.hbox2corner(boxes) + corners = torch.cat( + [corners, corners.new_ones(*corners.shape[:-1], 1)], dim=-1) + corners_T = torch.transpose(corners, -1, -2) + corners_T = torch.matmul(homography_matrix, corners_T) + corners = torch.transpose(corners_T, -1, -2) + # Convert to homogeneous coordinates by normalization + corners = corners[..., :2] / corners[..., 2:3] + self.tensor = self.corner2hbox(corners) + + @staticmethod + def hbox2corner(boxes: Tensor) -> Tensor: + """Convert box coordinates from (x1, y1, x2, y2) to corners ((x1, y1), + (x2, y1), (x1, y2), (x2, y2)). + + Args: + boxes (Tensor): Horizontal box tensor with shape of (..., 4). + + Returns: + Tensor: Corner tensor with shape of (..., 4, 2). + """ + x1, y1, x2, y2 = torch.split(boxes, 1, dim=-1) + corners = torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=-1) + return corners.reshape(*corners.shape[:-1], 4, 2) + + @staticmethod + def corner2hbox(corners: Tensor) -> Tensor: + """Convert box coordinates from corners ((x1, y1), (x2, y1), (x1, y2), + (x2, y2)) to (x1, y1, x2, y2). + + Args: + corners (Tensor): Corner tensor with shape of (..., 4, 2). + + Returns: + Tensor: Horizontal box tensor with shape of (..., 4). + """ + if corners.numel() == 0: + return corners.new_zeros((0, 4)) + min_xy = corners.min(dim=-2)[0] + max_xy = corners.max(dim=-2)[0] + return torch.cat([min_xy, max_xy], dim=-1) + + def rescale_(self, scale_factor: Tuple[float, float]) -> None: + """Rescale boxes w.r.t. rescale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling boxes. + The length should be 2. + """ + boxes = self.tensor + assert len(scale_factor) == 2 + scale_factor = boxes.new_tensor(scale_factor).repeat(2) + self.tensor = boxes * scale_factor + + def resize_(self, scale_factor: Tuple[float, float]) -> None: + """Resize the box width and height w.r.t scale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling box + shapes. The length should be 2. + """ + boxes = self.tensor + assert len(scale_factor) == 2 + ctrs = (boxes[..., 2:] + boxes[..., :2]) / 2 + wh = boxes[..., 2:] - boxes[..., :2] + scale_factor = boxes.new_tensor(scale_factor) + wh = wh * scale_factor + xy1 = ctrs - 0.5 * wh + xy2 = ctrs + 0.5 * wh + self.tensor = torch.cat([xy1, xy2], dim=-1) + + def is_inside(self, + img_shape: Tuple[int, int], + all_inside: bool = False, + allowed_border: int = 0) -> BoolTensor: + """Find boxes inside the image. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + all_inside (bool): Whether the boxes are all inside the image or + part inside the image. Defaults to False. + allowed_border (int): Boxes that extend beyond the image shape + boundary by more than ``allowed_border`` are considered + "outside" Defaults to 0. + Returns: + BoolTensor: A BoolTensor indicating whether the box is inside + the image. 
Assuming the original boxes have shape (m, n, 4), + the output has shape (m, n). + """ + img_h, img_w = img_shape + boxes = self.tensor + if all_inside: + return (boxes[:, 0] >= -allowed_border) & \ + (boxes[:, 1] >= -allowed_border) & \ + (boxes[:, 2] < img_w + allowed_border) & \ + (boxes[:, 3] < img_h + allowed_border) + else: + return (boxes[..., 0] < img_w + allowed_border) & \ + (boxes[..., 1] < img_h + allowed_border) & \ + (boxes[..., 2] > -allowed_border) & \ + (boxes[..., 3] > -allowed_border) + + def find_inside_points(self, + points: Tensor, + is_aligned: bool = False) -> BoolTensor: + """Find inside box points. Boxes dimension must be 2. + + Args: + points (Tensor): Points coordinates. Has shape of (m, 2). + is_aligned (bool): Whether ``points`` has been aligned with boxes + or not. If True, the length of boxes and ``points`` should be + the same. Defaults to False. + + Returns: + BoolTensor: A BoolTensor indicating whether a point is inside + boxes. Assuming the boxes has shape of (n, 4), if ``is_aligned`` + is False. The index has shape of (m, n). If ``is_aligned`` is + True, m should be equal to n and the index has shape of (m, ). + """ + boxes = self.tensor + assert boxes.dim() == 2, 'boxes dimension must be 2.' + + if not is_aligned: + boxes = boxes[None, :, :] + points = points[:, None, :] + else: + assert boxes.size(0) == points.size(0) + + x_min, y_min, x_max, y_max = boxes.unbind(dim=-1) + return (points[..., 0] >= x_min) & (points[..., 0] <= x_max) & \ + (points[..., 1] >= y_min) & (points[..., 1] <= y_max) + + def create_masks(self, img_shape: Tuple[int, int]) -> BitmapMasks: + """ + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + + Returns: + :obj:`BitmapMasks`: Converted masks + """ + img_h, img_w = img_shape + boxes = self.tensor + + xmin, ymin = boxes[:, 0:1], boxes[:, 1:2] + xmax, ymax = boxes[:, 2:3], boxes[:, 3:4] + gt_masks = np.zeros((len(boxes), img_h, img_w), dtype=np.uint8) + for i in range(len(boxes)): + gt_masks[i, + int(ymin[i]):int(ymax[i]), + int(xmin[i]):int(xmax[i])] = 1 + return BitmapMasks(gt_masks, img_h, img_w) + + @staticmethod + def overlaps(boxes1: BaseBoxes, + boxes2: BaseBoxes, + mode: str = 'iou', + is_aligned: bool = False, + eps: float = 1e-6) -> Tensor: + """Calculate overlap between two set of boxes with their types + converted to ``HorizontalBoxes``. + + Args: + boxes1 (:obj:`BaseBoxes`): BaseBoxes with shape of (m, box_dim) + or empty. + boxes2 (:obj:`BaseBoxes`): BaseBoxes with shape of (n, box_dim) + or empty. + mode (str): "iou" (intersection over union), "iof" (intersection + over foreground). Defaults to "iou". + is_aligned (bool): If True, then m and n must be equal. Defaults + to False. + eps (float): A value added to the denominator for numerical + stability. Defaults to 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + """ + boxes1 = boxes1.convert_to('hbox') + boxes2 = boxes2.convert_to('hbox') + return bbox_overlaps( + boxes1.tensor, + boxes2.tensor, + mode=mode, + is_aligned=is_aligned, + eps=eps) + + @staticmethod + def from_instance_masks(masks: MaskType) -> 'HorizontalBoxes': + """Create horizontal boxes from instance masks. + + Args: + masks (:obj:`BitmapMasks` or :obj:`PolygonMasks`): BitmapMasks or + PolygonMasks instance with length of n. + + Returns: + :obj:`HorizontalBoxes`: Converted boxes with shape of (n, 4). 
+ """ + num_masks = len(masks) + boxes = np.zeros((num_masks, 4), dtype=np.float32) + if isinstance(masks, BitmapMasks): + x_any = masks.masks.any(axis=1) + y_any = masks.masks.any(axis=2) + for idx in range(num_masks): + x = np.where(x_any[idx, :])[0] + y = np.where(y_any[idx, :])[0] + if len(x) > 0 and len(y) > 0: + # use +1 for x_max and y_max so that the right and bottom + # boundary of instance masks are fully included by the box + boxes[idx, :] = np.array( + [x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=np.float32) + elif isinstance(masks, PolygonMasks): + for idx, poly_per_obj in enumerate(masks.masks): + # simply use a number that is big enough for comparison with + # coordinates + xy_min = np.array([masks.width * 2, masks.height * 2], + dtype=np.float32) + xy_max = np.zeros(2, dtype=np.float32) + for p in poly_per_obj: + xy = np.array(p).reshape(-1, 2).astype(np.float32) + xy_min = np.minimum(xy_min, np.min(xy, axis=0)) + xy_max = np.maximum(xy_max, np.max(xy, axis=0)) + boxes[idx, :2] = xy_min + boxes[idx, 2:] = xy_max + else: + raise TypeError( + '`masks` must be `BitmapMasks` or `PolygonMasks`, ' + f'but got {type(masks)}.') + return HorizontalBoxes(boxes) diff --git a/mmdetection/mmdet/structures/bbox/transforms.py b/mmdetection/mmdet/structures/bbox/transforms.py new file mode 100644 index 0000000..287e6aa --- /dev/null +++ b/mmdetection/mmdet/structures/bbox/transforms.py @@ -0,0 +1,498 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from torch import Tensor + +from mmdet.structures.bbox import BaseBoxes + + +def find_inside_bboxes(bboxes: Tensor, img_h: int, img_w: int) -> Tensor: + """Find bboxes as long as a part of bboxes is inside the image. + + Args: + bboxes (Tensor): Shape (N, 4). + img_h (int): Image height. + img_w (int): Image width. + + Returns: + Tensor: Index of the remaining bboxes. + """ + inside_inds = (bboxes[:, 0] < img_w) & (bboxes[:, 2] > 0) \ + & (bboxes[:, 1] < img_h) & (bboxes[:, 3] > 0) + return inside_inds + + +def bbox_flip(bboxes: Tensor, + img_shape: Tuple[int], + direction: str = 'horizontal') -> Tensor: + """Flip bboxes horizontally or vertically. + + Args: + bboxes (Tensor): Shape (..., 4*k) + img_shape (Tuple[int]): Image shape. + direction (str): Flip direction, options are "horizontal", "vertical", + "diagonal". Default: "horizontal" + + Returns: + Tensor: Flipped bboxes. 
+ """ + assert bboxes.shape[-1] % 4 == 0 + assert direction in ['horizontal', 'vertical', 'diagonal'] + flipped = bboxes.clone() + if direction == 'horizontal': + flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] + flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4] + elif direction == 'vertical': + flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4] + flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4] + else: + flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] + flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4] + flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4] + flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4] + return flipped + + +def bbox_mapping(bboxes: Tensor, + img_shape: Tuple[int], + scale_factor: Union[float, Tuple[float]], + flip: bool, + flip_direction: str = 'horizontal') -> Tensor: + """Map bboxes from the original image scale to testing scale.""" + new_bboxes = bboxes * bboxes.new_tensor(scale_factor) + if flip: + new_bboxes = bbox_flip(new_bboxes, img_shape, flip_direction) + return new_bboxes + + +def bbox_mapping_back(bboxes: Tensor, + img_shape: Tuple[int], + scale_factor: Union[float, Tuple[float]], + flip: bool, + flip_direction: str = 'horizontal') -> Tensor: + """Map bboxes from testing scale to original image scale.""" + new_bboxes = bbox_flip(bboxes, img_shape, + flip_direction) if flip else bboxes + new_bboxes = new_bboxes.view(-1, 4) / new_bboxes.new_tensor(scale_factor) + return new_bboxes.view(bboxes.shape) + + +def bbox2roi(bbox_list: List[Union[Tensor, BaseBoxes]]) -> Tensor: + """Convert a list of bboxes to roi format. + + Args: + bbox_list (List[Union[Tensor, :obj:`BaseBoxes`]): a list of bboxes + corresponding to a batch of images. + + Returns: + Tensor: shape (n, box_dim + 1), where ``box_dim`` depends on the + different box types. For example, If the box type in ``bbox_list`` + is HorizontalBoxes, the output shape is (n, 5). Each row of data + indicates [batch_ind, x1, y1, x2, y2]. + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + bboxes = get_box_tensor(bboxes) + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes], dim=-1) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def roi2bbox(rois: Tensor) -> List[Tensor]: + """Convert rois to bounding box format. + + Args: + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + List[Tensor]: Converted boxes of corresponding rois. + """ + bbox_list = [] + img_ids = torch.unique(rois[:, 0].cpu(), sorted=True) + for img_id in img_ids: + inds = (rois[:, 0] == img_id.item()) + bbox = rois[inds, 1:] + bbox_list.append(bbox) + return bbox_list + + +# TODO remove later +def bbox2result(bboxes: Union[Tensor, np.ndarray], labels: Union[Tensor, + np.ndarray], + num_classes: int) -> List[np.ndarray]: + """Convert detection results to a list of numpy arrays. 
+ + Args: + bboxes (Tensor | np.ndarray): shape (n, 5) + labels (Tensor | np.ndarray): shape (n, ) + num_classes (int): class number, including background class + + Returns: + List(np.ndarray]): bbox results of each class + """ + if bboxes.shape[0] == 0: + return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)] + else: + if isinstance(bboxes, torch.Tensor): + bboxes = bboxes.detach().cpu().numpy() + labels = labels.detach().cpu().numpy() + return [bboxes[labels == i, :] for i in range(num_classes)] + + +def distance2bbox( + points: Tensor, + distance: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None +) -> Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) or (N, 4) + max_shape (Union[Sequence[int], Tensor, Sequence[Sequence[int]]], + optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + + Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + + x1 = points[..., 0] - distance[..., 0] + y1 = points[..., 1] - distance[..., 1] + x2 = points[..., 0] + distance[..., 2] + y2 = points[..., 1] + distance[..., 3] + + bboxes = torch.stack([x1, y1, x2, y2], -1) + + if max_shape is not None: + if bboxes.dim() == 2 and not torch.onnx.is_in_onnx_export(): + # speed up + bboxes[:, 0::2].clamp_(min=0, max=max_shape[1]) + bboxes[:, 1::2].clamp_(min=0, max=max_shape[0]) + return bboxes + + # clip bboxes with dynamic `min` and `max` for onnx + if torch.onnx.is_in_onnx_export(): + # TODO: delete + from mmdet.core.export import dynamic_clip_for_onnx + x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return bboxes + if not isinstance(max_shape, torch.Tensor): + max_shape = x1.new_tensor(max_shape) + max_shape = max_shape[..., :2].type_as(x1) + if max_shape.ndim == 2: + assert bboxes.ndim == 3 + assert max_shape.size(0) == bboxes.size(0) + + min_xy = x1.new_tensor(0) + max_xy = torch.cat([max_shape, max_shape], + dim=-1).flip(-1).unsqueeze(-2) + bboxes = torch.where(bboxes < min_xy, min_xy, bboxes) + bboxes = torch.where(bboxes > max_xy, max_xy, bboxes) + + return bboxes + + +def bbox2distance(points: Tensor, + bbox: Tensor, + max_dis: Optional[float] = None, + eps: float = 0.1) -> Tensor: + """Decode bounding box based on distances. + + Args: + points (Tensor): Shape (n, 2) or (b, n, 2), [x, y]. + bbox (Tensor): Shape (n, 4) or (b, n, 4), "xyxy" format + max_dis (float, optional): Upper bound of the distance. + eps (float): a small value to ensure target < max_dis, instead <= + + Returns: + Tensor: Decoded distances. + """ + left = points[..., 0] - bbox[..., 0] + top = points[..., 1] - bbox[..., 1] + right = bbox[..., 2] - points[..., 0] + bottom = bbox[..., 3] - points[..., 1] + if max_dis is not None: + left = left.clamp(min=0, max=max_dis - eps) + top = top.clamp(min=0, max=max_dis - eps) + right = right.clamp(min=0, max=max_dis - eps) + bottom = bottom.clamp(min=0, max=max_dis - eps) + return torch.stack([left, top, right, bottom], -1) + + +def bbox_rescale(bboxes: Tensor, scale_factor: float = 1.0) -> Tensor: + """Rescale bounding box w.r.t. scale_factor. 
+ + Args: + bboxes (Tensor): Shape (n, 4) for bboxes or (n, 5) for rois + scale_factor (float): rescale factor + + Returns: + Tensor: Rescaled bboxes. + """ + if bboxes.size(1) == 5: + bboxes_ = bboxes[:, 1:] + inds_ = bboxes[:, 0] + else: + bboxes_ = bboxes + cx = (bboxes_[:, 0] + bboxes_[:, 2]) * 0.5 + cy = (bboxes_[:, 1] + bboxes_[:, 3]) * 0.5 + w = bboxes_[:, 2] - bboxes_[:, 0] + h = bboxes_[:, 3] - bboxes_[:, 1] + w = w * scale_factor + h = h * scale_factor + x1 = cx - 0.5 * w + x2 = cx + 0.5 * w + y1 = cy - 0.5 * h + y2 = cy + 0.5 * h + if bboxes.size(1) == 5: + rescaled_bboxes = torch.stack([inds_, x1, y1, x2, y2], dim=-1) + else: + rescaled_bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return rescaled_bboxes + + +def bbox_cxcywh_to_xyxy(bbox: Tensor) -> Tensor: + """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. + """ + cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1) + bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)] + return torch.cat(bbox_new, dim=-1) + + +def bbox_xyxy_to_cxcywh(bbox: Tensor) -> Tensor: + """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, w, h). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. + """ + x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1) + bbox_new = [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)] + return torch.cat(bbox_new, dim=-1) + + +def bbox2corner(bboxes: torch.Tensor) -> torch.Tensor: + """Convert bbox coordinates from (x1, y1, x2, y2) to corners ((x1, y1), + (x2, y1), (x1, y2), (x2, y2)). + + Args: + bboxes (Tensor): Shape (n, 4) for bboxes. + Returns: + Tensor: Shape (n*4, 2) for corners. + """ + x1, y1, x2, y2 = torch.split(bboxes, 1, dim=1) + return torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=1).reshape(-1, 2) + + +def corner2bbox(corners: torch.Tensor) -> torch.Tensor: + """Convert bbox coordinates from corners ((x1, y1), (x2, y1), (x1, y2), + (x2, y2)) to (x1, y1, x2, y2). + + Args: + corners (Tensor): Shape (n*4, 2) for corners. + Returns: + Tensor: Shape (n, 4) for bboxes. + """ + corners = corners.reshape(-1, 4, 2) + min_xy = corners.min(dim=1)[0] + max_xy = corners.max(dim=1)[0] + return torch.cat([min_xy, max_xy], dim=1) + + +def bbox_project( + bboxes: Union[torch.Tensor, np.ndarray], + homography_matrix: Union[torch.Tensor, np.ndarray], + img_shape: Optional[Tuple[int, int]] = None +) -> Union[torch.Tensor, np.ndarray]: + """Geometric transformation for bbox. + + Args: + bboxes (Union[torch.Tensor, np.ndarray]): Shape (n, 4) for bboxes. + homography_matrix (Union[torch.Tensor, np.ndarray]): + Shape (3, 3) for geometric transformation. + img_shape (Tuple[int, int], optional): Image shape. Defaults to None. + Returns: + Union[torch.Tensor, np.ndarray]: Converted bboxes. 
+ """ + bboxes_type = type(bboxes) + if bboxes_type is np.ndarray: + bboxes = torch.from_numpy(bboxes) + if isinstance(homography_matrix, np.ndarray): + homography_matrix = torch.from_numpy(homography_matrix) + corners = bbox2corner(bboxes) + corners = torch.cat( + [corners, corners.new_ones(corners.shape[0], 1)], dim=1) + corners = torch.matmul(homography_matrix, corners.t()).t() + # Convert to homogeneous coordinates by normalization + corners = corners[:, :2] / corners[:, 2:3] + bboxes = corner2bbox(corners) + if img_shape is not None: + bboxes[:, 0::2] = bboxes[:, 0::2].clamp(0, img_shape[1]) + bboxes[:, 1::2] = bboxes[:, 1::2].clamp(0, img_shape[0]) + if bboxes_type is np.ndarray: + bboxes = bboxes.numpy() + return bboxes + + +def cat_boxes(data_list: List[Union[Tensor, BaseBoxes]], + dim: int = 0) -> Union[Tensor, BaseBoxes]: + """Concatenate boxes with type of tensor or box type. + + Args: + data_list (List[Union[Tensor, :obj:`BaseBoxes`]]): A list of tensors + or box types need to be concatenated. + dim (int): The dimension over which the box are concatenated. + Defaults to 0. + + Returns: + Union[Tensor, :obj`BaseBoxes`]: Concatenated results. + """ + if data_list and isinstance(data_list[0], BaseBoxes): + return data_list[0].cat(data_list, dim=dim) + else: + return torch.cat(data_list, dim=dim) + + +def stack_boxes(data_list: List[Union[Tensor, BaseBoxes]], + dim: int = 0) -> Union[Tensor, BaseBoxes]: + """Stack boxes with type of tensor or box type. + + Args: + data_list (List[Union[Tensor, :obj:`BaseBoxes`]]): A list of tensors + or box types need to be stacked. + dim (int): The dimension over which the box are stacked. + Defaults to 0. + + Returns: + Union[Tensor, :obj`BaseBoxes`]: Stacked results. + """ + if data_list and isinstance(data_list[0], BaseBoxes): + return data_list[0].stack(data_list, dim=dim) + else: + return torch.stack(data_list, dim=dim) + + +def scale_boxes(boxes: Union[Tensor, BaseBoxes], + scale_factor: Tuple[float, float]) -> Union[Tensor, BaseBoxes]: + """Scale boxes with type of tensor or box type. + + Args: + boxes (Tensor or :obj:`BaseBoxes`): boxes need to be scaled. Its type + can be a tensor or a box type. + scale_factor (Tuple[float, float]): factors for scaling boxes. + The length should be 2. + + Returns: + Union[Tensor, :obj:`BaseBoxes`]: Scaled boxes. + """ + if isinstance(boxes, BaseBoxes): + boxes.rescale_(scale_factor) + return boxes + else: + # Tensor boxes will be treated as horizontal boxes + repeat_num = int(boxes.size(-1) / 2) + scale_factor = boxes.new_tensor(scale_factor).repeat((1, repeat_num)) + return boxes * scale_factor + + +def get_box_wh(boxes: Union[Tensor, BaseBoxes]) -> Tuple[Tensor, Tensor]: + """Get the width and height of boxes with type of tensor or box type. + + Args: + boxes (Tensor or :obj:`BaseBoxes`): boxes with type of tensor + or box type. + + Returns: + Tuple[Tensor, Tensor]: the width and height of boxes. + """ + if isinstance(boxes, BaseBoxes): + w = boxes.widths + h = boxes.heights + else: + # Tensor boxes will be treated as horizontal boxes by defaults + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + return w, h + + +def get_box_tensor(boxes: Union[Tensor, BaseBoxes]) -> Tensor: + """Get tensor data from box type boxes. + + Args: + boxes (Tensor or BaseBoxes): boxes with type of tensor or box type. + If its type is a tensor, the boxes will be directly returned. + If its type is a box type, the `boxes.tensor` will be returned. + + Returns: + Tensor: boxes tensor. 
+ """ + if isinstance(boxes, BaseBoxes): + boxes = boxes.tensor + return boxes + + +def empty_box_as(boxes: Union[Tensor, BaseBoxes]) -> Union[Tensor, BaseBoxes]: + """Generate empty box according to input ``boxes` type and device. + + Args: + boxes (Tensor or :obj:`BaseBoxes`): boxes with type of tensor + or box type. + + Returns: + Union[Tensor, BaseBoxes]: Generated empty box. + """ + if isinstance(boxes, BaseBoxes): + return boxes.empty_boxes() + else: + # Tensor boxes will be treated as horizontal boxes by defaults + return boxes.new_zeros(0, 4) + + +def bbox_xyxy_to_cxcyah(bboxes: torch.Tensor) -> torch.Tensor: + """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, ratio, h). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. + """ + cx = (bboxes[:, 2] + bboxes[:, 0]) / 2 + cy = (bboxes[:, 3] + bboxes[:, 1]) / 2 + w = bboxes[:, 2] - bboxes[:, 0] + h = bboxes[:, 3] - bboxes[:, 1] + xyah = torch.stack([cx, cy, w / h, h], -1) + return xyah + + +def bbox_cxcyah_to_xyxy(bboxes: torch.Tensor) -> torch.Tensor: + """Convert bbox coordinates from (cx, cy, ratio, h) to (x1, y1, x2, y2). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + Returns: + Tensor: Converted bboxes. + """ + cx, cy, ratio, h = bboxes.split((1, 1, 1, 1), dim=-1) + w = ratio * h + x1y1x2y2 = [cx - w / 2.0, cy - h / 2.0, cx + w / 2.0, cy + h / 2.0] + return torch.cat(x1y1x2y2, dim=-1) diff --git a/mmdetection/mmdet/structures/det_data_sample.py b/mmdetection/mmdet/structures/det_data_sample.py new file mode 100644 index 0000000..37dd747 --- /dev/null +++ b/mmdetection/mmdet/structures/det_data_sample.py @@ -0,0 +1,237 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +from mmengine.structures import BaseDataElement, InstanceData, PixelData + + +class DetDataSample(BaseDataElement): + """A data structure interface of MMDetection. They are used as interfaces + between different components. + + The attributes in ``DetDataSample`` are divided into several parts: + + - ``proposals``(InstanceData): Region proposals used in two-stage + detectors. + - ``gt_instances``(InstanceData): Ground truth of instance annotations. + - ``pred_instances``(InstanceData): Instances of detection predictions. + - ``pred_track_instances``(InstanceData): Instances of tracking + predictions. + - ``ignored_instances``(InstanceData): Instances to be ignored during + training/testing. + - ``gt_panoptic_seg``(PixelData): Ground truth of panoptic + segmentation. + - ``pred_panoptic_seg``(PixelData): Prediction of panoptic + segmentation. + - ``gt_sem_seg``(PixelData): Ground truth of semantic segmentation. + - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation. + + Examples: + >>> import torch + >>> import numpy as np + >>> from mmengine.structures import InstanceData + >>> from mmdet.structures import DetDataSample + + >>> data_sample = DetDataSample() + >>> img_meta = dict(img_shape=(800, 1196), + ... 
pad_shape=(800, 1216))
+ >>> gt_instances = InstanceData(metainfo=img_meta)
+ >>> gt_instances.bboxes = torch.rand((5, 4))
+ >>> gt_instances.labels = torch.rand((5,))
+ >>> data_sample.gt_instances = gt_instances
+ >>> assert 'img_shape' in data_sample.gt_instances.metainfo_keys()
+ >>> len(data_sample.gt_instances)
+ 5
+ >>> print(data_sample)
+ <DetDataSample(...) at 0x7f21fb1b9880>
+ >>> pred_instances = InstanceData(metainfo=img_meta)
+ >>> pred_instances.bboxes = torch.rand((5, 4))
+ >>> pred_instances.scores = torch.rand((5,))
+ >>> data_sample = DetDataSample(pred_instances=pred_instances)
+ >>> assert 'pred_instances' in data_sample
+
+ >>> pred_track_instances = InstanceData(metainfo=img_meta)
+ >>> pred_track_instances.bboxes = torch.rand((5, 4))
+ >>> pred_track_instances.scores = torch.rand((5,))
+ >>> data_sample = DetDataSample(
+ ... pred_track_instances=pred_track_instances)
+ >>> assert 'pred_track_instances' in data_sample
+
+ >>> data_sample = DetDataSample()
+ >>> gt_instances_data = dict(
+ ... bboxes=torch.rand(2, 4),
+ ... labels=torch.rand(2),
+ ... masks=np.random.rand(2, 2, 2))
+ >>> gt_instances = InstanceData(**gt_instances_data)
+ >>> data_sample.gt_instances = gt_instances
+ >>> assert 'gt_instances' in data_sample
+ >>> assert 'masks' in data_sample.gt_instances
+
+ >>> data_sample = DetDataSample()
+ >>> gt_panoptic_seg_data = dict(panoptic_seg=torch.rand(2, 4))
+ >>> gt_panoptic_seg = PixelData(**gt_panoptic_seg_data)
+ >>> data_sample.gt_panoptic_seg = gt_panoptic_seg
+ >>> print(data_sample)
+ <DetDataSample( ... gt_panoptic_seg: ... ) at 0x7f66c2bb7280>
+ >>> data_sample = DetDataSample()
+ >>> gt_segm_seg_data = dict(segm_seg=torch.rand(2, 2, 2))
+ >>> gt_segm_seg = PixelData(**gt_segm_seg_data)
+ >>> data_sample.gt_segm_seg = gt_segm_seg
+ >>> assert 'gt_segm_seg' in data_sample
+ >>> assert 'segm_seg' in data_sample.gt_segm_seg
+ """
+
+ @property
+ def proposals(self) -> InstanceData:
+ return self._proposals
+
+ @proposals.setter
+ def proposals(self, value: InstanceData):
+ self.set_field(value, '_proposals', dtype=InstanceData)
+
+ @proposals.deleter
+ def proposals(self):
+ del self._proposals
+
+ @property
+ def gt_instances(self) -> InstanceData:
+ return self._gt_instances
+
+ @gt_instances.setter
+ def gt_instances(self, value: InstanceData):
+ self.set_field(value, '_gt_instances', dtype=InstanceData)
+
+ @gt_instances.deleter
+ def gt_instances(self):
+ del self._gt_instances
+
+ @property
+ def pred_instances(self) -> InstanceData:
+ return self._pred_instances
+
+ @pred_instances.setter
+ def pred_instances(self, value: InstanceData):
+ self.set_field(value, '_pred_instances', dtype=InstanceData)
+
+ @pred_instances.deleter
+ def pred_instances(self):
+ del self._pred_instances
+
+ # directly add ``pred_track_instances`` in ``DetDataSample``
+ # so that the ``TrackDataSample`` does not bother to access the
+ # instance-level information.
+ @property + def pred_track_instances(self) -> InstanceData: + return self._pred_track_instances + + @pred_track_instances.setter + def pred_track_instances(self, value: InstanceData): + self.set_field(value, '_pred_track_instances', dtype=InstanceData) + + @pred_track_instances.deleter + def pred_track_instances(self): + del self._pred_track_instances + + @property + def ignored_instances(self) -> InstanceData: + return self._ignored_instances + + @ignored_instances.setter + def ignored_instances(self, value: InstanceData): + self.set_field(value, '_ignored_instances', dtype=InstanceData) + + @ignored_instances.deleter + def ignored_instances(self): + del self._ignored_instances + + @property + def gt_panoptic_seg(self) -> PixelData: + return self._gt_panoptic_seg + + @gt_panoptic_seg.setter + def gt_panoptic_seg(self, value: PixelData): + self.set_field(value, '_gt_panoptic_seg', dtype=PixelData) + + @gt_panoptic_seg.deleter + def gt_panoptic_seg(self): + del self._gt_panoptic_seg + + @property + def pred_panoptic_seg(self) -> PixelData: + return self._pred_panoptic_seg + + @pred_panoptic_seg.setter + def pred_panoptic_seg(self, value: PixelData): + self.set_field(value, '_pred_panoptic_seg', dtype=PixelData) + + @pred_panoptic_seg.deleter + def pred_panoptic_seg(self): + del self._pred_panoptic_seg + + @property + def gt_sem_seg(self) -> PixelData: + return self._gt_sem_seg + + @gt_sem_seg.setter + def gt_sem_seg(self, value: PixelData): + self.set_field(value, '_gt_sem_seg', dtype=PixelData) + + @gt_sem_seg.deleter + def gt_sem_seg(self): + del self._gt_sem_seg + + @property + def pred_sem_seg(self) -> PixelData: + return self._pred_sem_seg + + @pred_sem_seg.setter + def pred_sem_seg(self, value: PixelData): + self.set_field(value, '_pred_sem_seg', dtype=PixelData) + + @pred_sem_seg.deleter + def pred_sem_seg(self): + del self._pred_sem_seg + + +SampleList = List[DetDataSample] +OptSampleList = Optional[SampleList] diff --git a/mmdetection/mmdet/structures/mask/__init__.py b/mmdetection/mmdet/structures/mask/__init__.py new file mode 100644 index 0000000..f783947 --- /dev/null +++ b/mmdetection/mmdet/structures/mask/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mask_target import mask_target +from .structures import (BaseInstanceMasks, BitmapMasks, PolygonMasks, + bitmap_to_polygon, polygon_to_bitmap) +from .utils import encode_mask_results, mask2bbox, split_combined_polys + +__all__ = [ + 'split_combined_polys', 'mask_target', 'BaseInstanceMasks', 'BitmapMasks', + 'PolygonMasks', 'encode_mask_results', 'mask2bbox', 'polygon_to_bitmap', + 'bitmap_to_polygon' +] diff --git a/mmdetection/mmdet/structures/mask/mask_target.py b/mmdetection/mmdet/structures/mask/mask_target.py new file mode 100644 index 0000000..b2fc5f1 --- /dev/null +++ b/mmdetection/mmdet/structures/mask/mask_target.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from torch.nn.modules.utils import _pair + + +def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list, + cfg): + """Compute mask target for positive proposals in multiple images. + + Args: + pos_proposals_list (list[Tensor]): Positive proposals in multiple + images, each has shape (num_pos, 4). + pos_assigned_gt_inds_list (list[Tensor]): Assigned GT indices for each + positive proposals, each has shape (num_pos,). + gt_masks_list (list[:obj:`BaseInstanceMasks`]): Ground truth masks of + each image. 
+ cfg (dict): Config dict that specifies the mask size. + + Returns: + Tensor: Mask target of each image, has shape (num_pos, w, h). + + Example: + >>> from mmengine.config import Config + >>> import mmdet + >>> from mmdet.data_elements.mask import BitmapMasks + >>> from mmdet.data_elements.mask.mask_target import * + >>> H, W = 17, 18 + >>> cfg = Config({'mask_size': (13, 14)}) + >>> rng = np.random.RandomState(0) + >>> # Positive proposals (tl_x, tl_y, br_x, br_y) for each image + >>> pos_proposals_list = [ + >>> torch.Tensor([ + >>> [ 7.2425, 5.5929, 13.9414, 14.9541], + >>> [ 7.3241, 3.6170, 16.3850, 15.3102], + >>> ]), + >>> torch.Tensor([ + >>> [ 4.8448, 6.4010, 7.0314, 9.7681], + >>> [ 5.9790, 2.6989, 7.4416, 4.8580], + >>> [ 0.0000, 0.0000, 0.1398, 9.8232], + >>> ]), + >>> ] + >>> # Corresponding class index for each proposal for each image + >>> pos_assigned_gt_inds_list = [ + >>> torch.LongTensor([7, 0]), + >>> torch.LongTensor([5, 4, 1]), + >>> ] + >>> # Ground truth mask for each true object for each image + >>> gt_masks_list = [ + >>> BitmapMasks(rng.rand(8, H, W), height=H, width=W), + >>> BitmapMasks(rng.rand(6, H, W), height=H, width=W), + >>> ] + >>> mask_targets = mask_target( + >>> pos_proposals_list, pos_assigned_gt_inds_list, + >>> gt_masks_list, cfg) + >>> assert mask_targets.shape == (5,) + cfg['mask_size'] + """ + cfg_list = [cfg for _ in range(len(pos_proposals_list))] + mask_targets = map(mask_target_single, pos_proposals_list, + pos_assigned_gt_inds_list, gt_masks_list, cfg_list) + mask_targets = list(mask_targets) + if len(mask_targets) > 0: + mask_targets = torch.cat(mask_targets) + return mask_targets + + +def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg): + """Compute mask target for each positive proposal in the image. + + Args: + pos_proposals (Tensor): Positive proposals. + pos_assigned_gt_inds (Tensor): Assigned GT inds of positive proposals. + gt_masks (:obj:`BaseInstanceMasks`): GT masks in the format of Bitmap + or Polygon. + cfg (dict): Config dict that indicate the mask size. + + Returns: + Tensor: Mask target of each positive proposals in the image. 
+ + Example: + >>> from mmengine.config import Config + >>> import mmdet + >>> from mmdet.data_elements.mask import BitmapMasks + >>> from mmdet.data_elements.mask.mask_target import * # NOQA + >>> H, W = 32, 32 + >>> cfg = Config({'mask_size': (7, 11)}) + >>> rng = np.random.RandomState(0) + >>> # Masks for each ground truth box (relative to the image) + >>> gt_masks_data = rng.rand(3, H, W) + >>> gt_masks = BitmapMasks(gt_masks_data, height=H, width=W) + >>> # Predicted positive boxes in one image + >>> pos_proposals = torch.FloatTensor([ + >>> [ 16.2, 5.5, 19.9, 20.9], + >>> [ 17.3, 13.6, 19.3, 19.3], + >>> [ 14.8, 16.4, 17.0, 23.7], + >>> [ 0.0, 0.0, 16.0, 16.0], + >>> [ 4.0, 0.0, 20.0, 16.0], + >>> ]) + >>> # For each predicted proposal, its assignment to a gt mask + >>> pos_assigned_gt_inds = torch.LongTensor([0, 1, 2, 1, 1]) + >>> mask_targets = mask_target_single( + >>> pos_proposals, pos_assigned_gt_inds, gt_masks, cfg) + >>> assert mask_targets.shape == (5,) + cfg['mask_size'] + """ + device = pos_proposals.device + mask_size = _pair(cfg.mask_size) + binarize = not cfg.get('soft_mask_target', False) + num_pos = pos_proposals.size(0) + if num_pos > 0: + proposals_np = pos_proposals.cpu().numpy() + maxh, maxw = gt_masks.height, gt_masks.width + proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw) + proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh) + pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() + + mask_targets = gt_masks.crop_and_resize( + proposals_np, + mask_size, + device=device, + inds=pos_assigned_gt_inds, + binarize=binarize).to_ndarray() + + mask_targets = torch.from_numpy(mask_targets).float().to(device) + else: + mask_targets = pos_proposals.new_zeros((0, ) + mask_size) + + return mask_targets diff --git a/mmdetection/mmdet/structures/mask/structures.py b/mmdetection/mmdet/structures/mask/structures.py new file mode 100644 index 0000000..b4fdd27 --- /dev/null +++ b/mmdetection/mmdet/structures/mask/structures.py @@ -0,0 +1,1193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import itertools +from abc import ABCMeta, abstractmethod +from typing import Sequence, Type, TypeVar + +import cv2 +import mmcv +import numpy as np +import pycocotools.mask as maskUtils +import shapely.geometry as geometry +import torch +from mmcv.ops.roi_align import roi_align + +T = TypeVar('T') + + +class BaseInstanceMasks(metaclass=ABCMeta): + """Base class for instance masks.""" + + @abstractmethod + def rescale(self, scale, interpolation='nearest'): + """Rescale masks as large as possible while keeping the aspect ratio. + For details can refer to `mmcv.imrescale`. + + Args: + scale (tuple[int]): The maximum size (h, w) of rescaled mask. + interpolation (str): Same as :func:`mmcv.imrescale`. + + Returns: + BaseInstanceMasks: The rescaled masks. + """ + + @abstractmethod + def resize(self, out_shape, interpolation='nearest'): + """Resize masks to the given out_shape. + + Args: + out_shape: Target (h, w) of resized mask. + interpolation (str): See :func:`mmcv.imresize`. + + Returns: + BaseInstanceMasks: The resized masks. + """ + + @abstractmethod + def flip(self, flip_direction='horizontal'): + """Flip masks alone the given direction. + + Args: + flip_direction (str): Either 'horizontal' or 'vertical'. + + Returns: + BaseInstanceMasks: The flipped masks. + """ + + @abstractmethod + def pad(self, out_shape, pad_val): + """Pad masks to the given size of (h, w). + + Args: + out_shape (tuple[int]): Target (h, w) of padded mask. 
+ pad_val (int): The padded value. + + Returns: + BaseInstanceMasks: The padded masks. + """ + + @abstractmethod + def crop(self, bbox): + """Crop each mask by the given bbox. + + Args: + bbox (ndarray): Bbox in format [x1, y1, x2, y2], shape (4, ). + + Return: + BaseInstanceMasks: The cropped masks. + """ + + @abstractmethod + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device, + interpolation='bilinear', + binarize=True): + """Crop and resize masks by the given bboxes. + + This function is mainly used in mask targets computation. + It firstly align mask to bboxes by assigned_inds, then crop mask by the + assigned bbox and resize to the size of (mask_h, mask_w) + + Args: + bboxes (Tensor): Bboxes in format [x1, y1, x2, y2], shape (N, 4) + out_shape (tuple[int]): Target (h, w) of resized mask + inds (ndarray): Indexes to assign masks to each bbox, + shape (N,) and values should be between [0, num_masks - 1]. + device (str): Device of bboxes + interpolation (str): See `mmcv.imresize` + binarize (bool): if True fractional values are rounded to 0 or 1 + after the resize operation. if False and unsupported an error + will be raised. Defaults to True. + + Return: + BaseInstanceMasks: the cropped and resized masks. + """ + + @abstractmethod + def expand(self, expanded_h, expanded_w, top, left): + """see :class:`Expand`.""" + + @property + @abstractmethod + def areas(self): + """ndarray: areas of each instance.""" + + @abstractmethod + def to_ndarray(self): + """Convert masks to the format of ndarray. + + Return: + ndarray: Converted masks in the format of ndarray. + """ + + @abstractmethod + def to_tensor(self, dtype, device): + """Convert masks to the format of Tensor. + + Args: + dtype (str): Dtype of converted mask. + device (torch.device): Device of converted masks. + + Returns: + Tensor: Converted masks in the format of Tensor. + """ + + @abstractmethod + def translate(self, + out_shape, + offset, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Translate the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + offset (int | float): The offset for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + border_value (int | float): Border value. Default 0. + interpolation (str): Same as :func:`mmcv.imtranslate`. + + Returns: + Translated masks. + """ + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + magnitude (int | float): The magnitude used for shear. + direction (str): The shear direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. Default 0. + interpolation (str): Same as in :func:`mmcv.imshear`. + + Returns: + ndarray: Sheared masks. + """ + + @abstractmethod + def rotate(self, out_shape, angle, center=None, scale=1.0, border_value=0): + """Rotate the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + angle (int | float): Rotation angle in degrees. Positive values + mean counter-clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the + rotation in source image. If not specified, the center of + the image will be used. + scale (int | float): Isotropic scale factor. + border_value (int | float): Border value. Default 0 for masks. + + Returns: + Rotated masks. 
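+
+        Example (a minimal sketch of the expected behaviour, demonstrated
+        with the concrete ``BitmapMasks`` subclass defined below):
+            >>> masks = BitmapMasks.random(num_masks=2, height=32, width=32)
+            >>> rotated = masks.rotate(out_shape=(32, 32), angle=45.)
+            >>> assert len(rotated) == len(masks)
+            >>> assert (rotated.height, rotated.width) == (32, 32)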
+ """ + + def get_bboxes(self, dst_type='hbb'): + """Get the certain type boxes from masks. + + Please refer to ``mmdet.structures.bbox.box_type`` for more details of + the box type. + + Args: + dst_type: Destination box type. + + Returns: + :obj:`BaseBoxes`: Certain type boxes. + """ + from ..bbox import get_box_type + _, box_type_cls = get_box_type(dst_type) + return box_type_cls.from_instance_masks(self) + + @classmethod + @abstractmethod + def cat(cls: Type[T], masks: Sequence[T]) -> T: + """Concatenate a sequence of masks into one single mask instance. + + Args: + masks (Sequence[T]): A sequence of mask instances. + + Returns: + T: Concatenated mask instance. + """ + + +class BitmapMasks(BaseInstanceMasks): + """This class represents masks in the form of bitmaps. + + Args: + masks (ndarray): ndarray of masks in shape (N, H, W), where N is + the number of objects. + height (int): height of masks + width (int): width of masks + + Example: + >>> from mmdet.data_elements.mask.structures import * # NOQA + >>> num_masks, H, W = 3, 32, 32 + >>> rng = np.random.RandomState(0) + >>> masks = (rng.rand(num_masks, H, W) > 0.1).astype(np.int64) + >>> self = BitmapMasks(masks, height=H, width=W) + + >>> # demo crop_and_resize + >>> num_boxes = 5 + >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes) + >>> out_shape = (14, 14) + >>> inds = torch.randint(0, len(self), size=(num_boxes,)) + >>> device = 'cpu' + >>> interpolation = 'bilinear' + >>> new = self.crop_and_resize( + ... bboxes, out_shape, inds, device, interpolation) + >>> assert len(new) == num_boxes + >>> assert new.height, new.width == out_shape + """ + + def __init__(self, masks, height, width): + self.height = height + self.width = width + if len(masks) == 0: + self.masks = np.empty((0, self.height, self.width), dtype=np.uint8) + else: + assert isinstance(masks, (list, np.ndarray)) + if isinstance(masks, list): + assert isinstance(masks[0], np.ndarray) + assert masks[0].ndim == 2 # (H, W) + else: + assert masks.ndim == 3 # (N, H, W) + + self.masks = np.stack(masks).reshape(-1, height, width) + assert self.masks.shape[1] == self.height + assert self.masks.shape[2] == self.width + + def __getitem__(self, index): + """Index the BitmapMask. + + Args: + index (int | ndarray): Indices in the format of integer or ndarray. + + Returns: + :obj:`BitmapMasks`: Indexed bitmap masks. 
+ """ + masks = self.masks[index].reshape(-1, self.height, self.width) + return BitmapMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + def rescale(self, scale, interpolation='nearest'): + """See :func:`BaseInstanceMasks.rescale`.""" + if len(self.masks) == 0: + new_w, new_h = mmcv.rescale_size((self.width, self.height), scale) + rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8) + else: + rescaled_masks = np.stack([ + mmcv.imrescale(mask, scale, interpolation=interpolation) + for mask in self.masks + ]) + height, width = rescaled_masks.shape[1:] + return BitmapMasks(rescaled_masks, height, width) + + def resize(self, out_shape, interpolation='nearest'): + """See :func:`BaseInstanceMasks.resize`.""" + if len(self.masks) == 0: + resized_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + resized_masks = np.stack([ + mmcv.imresize( + mask, out_shape[::-1], interpolation=interpolation) + for mask in self.masks + ]) + return BitmapMasks(resized_masks, *out_shape) + + def flip(self, flip_direction='horizontal'): + """See :func:`BaseInstanceMasks.flip`.""" + assert flip_direction in ('horizontal', 'vertical', 'diagonal') + + if len(self.masks) == 0: + flipped_masks = self.masks + else: + flipped_masks = np.stack([ + mmcv.imflip(mask, direction=flip_direction) + for mask in self.masks + ]) + return BitmapMasks(flipped_masks, self.height, self.width) + + def pad(self, out_shape, pad_val=0): + """See :func:`BaseInstanceMasks.pad`.""" + if len(self.masks) == 0: + padded_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + padded_masks = np.stack([ + mmcv.impad(mask, shape=out_shape, pad_val=pad_val) + for mask in self.masks + ]) + return BitmapMasks(padded_masks, *out_shape) + + def crop(self, bbox): + """See :func:`BaseInstanceMasks.crop`.""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = np.empty((0, h, w), dtype=np.uint8) + else: + cropped_masks = self.masks[:, y1:y1 + h, x1:x1 + w] + return BitmapMasks(cropped_masks, h, w) + + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device='cpu', + interpolation='bilinear', + binarize=True): + """See :func:`BaseInstanceMasks.crop_and_resize`.""" + if len(self.masks) == 0: + empty_masks = np.empty((0, *out_shape), dtype=np.uint8) + return BitmapMasks(empty_masks, *out_shape) + + # convert bboxes to tensor + if isinstance(bboxes, np.ndarray): + bboxes = torch.from_numpy(bboxes).to(device=device) + if isinstance(inds, np.ndarray): + inds = torch.from_numpy(inds).to(device=device) + + num_bbox = bboxes.shape[0] + fake_inds = torch.arange( + num_bbox, device=device).to(dtype=bboxes.dtype)[:, None] + rois = torch.cat([fake_inds, bboxes], dim=1) # Nx5 + rois = rois.to(device=device) + if num_bbox > 0: + gt_masks_th = torch.from_numpy(self.masks).to(device).index_select( + 0, inds).to(dtype=rois.dtype) + targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape, + 1.0, 0, 'avg', True).squeeze(1) + if binarize: + resized_masks = (targets >= 
0.5).cpu().numpy() + else: + resized_masks = targets.cpu().numpy() + else: + resized_masks = [] + return BitmapMasks(resized_masks, *out_shape) + + def expand(self, expanded_h, expanded_w, top, left): + """See :func:`BaseInstanceMasks.expand`.""" + if len(self.masks) == 0: + expanded_mask = np.empty((0, expanded_h, expanded_w), + dtype=np.uint8) + else: + expanded_mask = np.zeros((len(self), expanded_h, expanded_w), + dtype=np.uint8) + expanded_mask[:, top:top + self.height, + left:left + self.width] = self.masks + return BitmapMasks(expanded_mask, expanded_h, expanded_w) + + def translate(self, + out_shape, + offset, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Translate the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + offset (int | float): The offset for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + border_value (int | float): Border value. Default 0 for masks. + interpolation (str): Same as :func:`mmcv.imtranslate`. + + Returns: + BitmapMasks: Translated BitmapMasks. + + Example: + >>> from mmdet.data_elements.mask.structures import BitmapMasks + >>> self = BitmapMasks.random(dtype=np.uint8) + >>> out_shape = (32, 32) + >>> offset = 4 + >>> direction = 'horizontal' + >>> border_value = 0 + >>> interpolation = 'bilinear' + >>> # Note, There seem to be issues when: + >>> # * the mask dtype is not supported by cv2.AffineWarp + >>> new = self.translate(out_shape, offset, direction, + >>> border_value, interpolation) + >>> assert len(new) == len(self) + >>> assert new.height, new.width == out_shape + """ + if len(self.masks) == 0: + translated_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + masks = self.masks + if masks.shape[-2:] != out_shape: + empty_masks = np.zeros((masks.shape[0], *out_shape), + dtype=masks.dtype) + min_h = min(out_shape[0], masks.shape[1]) + min_w = min(out_shape[1], masks.shape[2]) + empty_masks[:, :min_h, :min_w] = masks[:, :min_h, :min_w] + masks = empty_masks + translated_masks = mmcv.imtranslate( + masks.transpose((1, 2, 0)), + offset, + direction, + border_value=border_value, + interpolation=interpolation) + if translated_masks.ndim == 2: + translated_masks = translated_masks[:, :, None] + translated_masks = translated_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(translated_masks, *out_shape) + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + magnitude (int | float): The magnitude used for shear. + direction (str): The shear direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as in :func:`mmcv.imshear`. + + Returns: + BitmapMasks: The sheared masks. 
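+
+        Example (a minimal sketch, assuming a horizontal shear of
+        magnitude 0.5 on random masks):
+            >>> self = BitmapMasks.random(num_masks=2, height=32, width=32)
+            >>> new = self.shear((32, 32), magnitude=0.5)
+            >>> assert len(new) == len(self)
+            >>> assert (new.height, new.width) == (32, 32)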
+ """ + if len(self.masks) == 0: + sheared_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + sheared_masks = mmcv.imshear( + self.masks.transpose((1, 2, 0)), + magnitude, + direction, + border_value=border_value, + interpolation=interpolation) + if sheared_masks.ndim == 2: + sheared_masks = sheared_masks[:, :, None] + sheared_masks = sheared_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(sheared_masks, *out_shape) + + def rotate(self, + out_shape, + angle, + center=None, + scale=1.0, + border_value=0, + interpolation='bilinear'): + """Rotate the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + angle (int | float): Rotation angle in degrees. Positive values + mean counter-clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the + rotation in source image. If not specified, the center of + the image will be used. + scale (int | float): Isotropic scale factor. + border_value (int | float): Border value. Default 0 for masks. + interpolation (str): Same as in :func:`mmcv.imrotate`. + + Returns: + BitmapMasks: Rotated BitmapMasks. + """ + if len(self.masks) == 0: + rotated_masks = np.empty((0, *out_shape), dtype=self.masks.dtype) + else: + rotated_masks = mmcv.imrotate( + self.masks.transpose((1, 2, 0)), + angle, + center=center, + scale=scale, + border_value=border_value, + interpolation=interpolation) + if rotated_masks.ndim == 2: + # case when only one mask, (h, w) + rotated_masks = rotated_masks[:, :, None] # (h, w, 1) + rotated_masks = rotated_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(rotated_masks, *out_shape) + + @property + def areas(self): + """See :py:attr:`BaseInstanceMasks.areas`.""" + return self.masks.sum((1, 2)) + + def to_ndarray(self): + """See :func:`BaseInstanceMasks.to_ndarray`.""" + return self.masks + + def to_tensor(self, dtype, device): + """See :func:`BaseInstanceMasks.to_tensor`.""" + return torch.tensor(self.masks, dtype=dtype, device=device) + + @classmethod + def random(cls, + num_masks=3, + height=32, + width=32, + dtype=np.uint8, + rng=None): + """Generate random bitmap masks for demo / testing purposes. + + Example: + >>> from mmdet.data_elements.mask.structures import BitmapMasks + >>> self = BitmapMasks.random() + >>> print('self = {}'.format(self)) + self = BitmapMasks(num_masks=3, height=32, width=32) + """ + from mmdet.utils.util_random import ensure_rng + rng = ensure_rng(rng) + masks = (rng.rand(num_masks, height, width) > 0.1).astype(dtype) + self = cls(masks, height=height, width=width) + return self + + @classmethod + def cat(cls: Type[T], masks: Sequence[T]) -> T: + """Concatenate a sequence of masks into one single mask instance. + + Args: + masks (Sequence[BitmapMasks]): A sequence of mask instances. + + Returns: + BitmapMasks: Concatenated mask instance. + """ + assert isinstance(masks, Sequence) + if len(masks) == 0: + raise ValueError('masks should not be an empty list.') + assert all(isinstance(m, cls) for m in masks) + + mask_array = np.concatenate([m.masks for m in masks], axis=0) + return cls(mask_array, *mask_array.shape[1:]) + + +class PolygonMasks(BaseInstanceMasks): + """This class represents masks in the form of polygons. + + Polygons is a list of three levels. 
The first level of the list + corresponds to objects, the second level to the polys that compose the + object, the third level to the poly coordinates + + Args: + masks (list[list[ndarray]]): The first level of the list + corresponds to objects, the second level to the polys that + compose the object, the third level to the poly coordinates + height (int): height of masks + width (int): width of masks + + Example: + >>> from mmdet.data_elements.mask.structures import * # NOQA + >>> masks = [ + >>> [ np.array([0, 0, 10, 0, 10, 10., 0, 10, 0, 0]) ] + >>> ] + >>> height, width = 16, 16 + >>> self = PolygonMasks(masks, height, width) + + >>> # demo translate + >>> new = self.translate((16, 16), 4., direction='horizontal') + >>> assert np.all(new.masks[0][0][1::2] == masks[0][0][1::2]) + >>> assert np.all(new.masks[0][0][0::2] == masks[0][0][0::2] + 4) + + >>> # demo crop_and_resize + >>> num_boxes = 3 + >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes) + >>> out_shape = (16, 16) + >>> inds = torch.randint(0, len(self), size=(num_boxes,)) + >>> device = 'cpu' + >>> interpolation = 'bilinear' + >>> new = self.crop_and_resize( + ... bboxes, out_shape, inds, device, interpolation) + >>> assert len(new) == num_boxes + >>> assert new.height, new.width == out_shape + """ + + def __init__(self, masks, height, width): + assert isinstance(masks, list) + if len(masks) > 0: + assert isinstance(masks[0], list) + assert isinstance(masks[0][0], np.ndarray) + + self.height = height + self.width = width + self.masks = masks + + def __getitem__(self, index): + """Index the polygon masks. + + Args: + index (ndarray | List): The indices. + + Returns: + :obj:`PolygonMasks`: The indexed polygon masks. + """ + if isinstance(index, np.ndarray): + if index.dtype == bool: + index = np.where(index)[0].tolist() + else: + index = index.tolist() + if isinstance(index, list): + masks = [self.masks[i] for i in index] + else: + try: + masks = self.masks[index] + except Exception: + raise ValueError( + f'Unsupported input of type {type(index)} for indexing!') + if len(masks) and isinstance(masks[0], np.ndarray): + masks = [masks] # ensure a list of three levels + return PolygonMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + def rescale(self, scale, interpolation=None): + """see :func:`BaseInstanceMasks.rescale`""" + new_w, new_h = mmcv.rescale_size((self.width, self.height), scale) + if len(self.masks) == 0: + rescaled_masks = PolygonMasks([], new_h, new_w) + else: + rescaled_masks = self.resize((new_h, new_w)) + return rescaled_masks + + def resize(self, out_shape, interpolation=None): + """see :func:`BaseInstanceMasks.resize`""" + if len(self.masks) == 0: + resized_masks = PolygonMasks([], *out_shape) + else: + h_scale = out_shape[0] / self.height + w_scale = out_shape[1] / self.width + resized_masks = [] + for poly_per_obj in self.masks: + resized_poly = [] + for p in poly_per_obj: + p = p.copy() + p[0::2] = p[0::2] * w_scale + p[1::2] = p[1::2] * h_scale + resized_poly.append(p) + resized_masks.append(resized_poly) + resized_masks = PolygonMasks(resized_masks, *out_shape) + return resized_masks + + def flip(self, flip_direction='horizontal'): + """see :func:`BaseInstanceMasks.flip`""" + assert flip_direction in ('horizontal', 
'vertical', 'diagonal') + if len(self.masks) == 0: + flipped_masks = PolygonMasks([], self.height, self.width) + else: + flipped_masks = [] + for poly_per_obj in self.masks: + flipped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + if flip_direction == 'horizontal': + p[0::2] = self.width - p[0::2] + elif flip_direction == 'vertical': + p[1::2] = self.height - p[1::2] + else: + p[0::2] = self.width - p[0::2] + p[1::2] = self.height - p[1::2] + flipped_poly_per_obj.append(p) + flipped_masks.append(flipped_poly_per_obj) + flipped_masks = PolygonMasks(flipped_masks, self.height, + self.width) + return flipped_masks + + def crop(self, bbox): + """see :func:`BaseInstanceMasks.crop`""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = PolygonMasks([], h, w) + else: + # reference: https://github.com/facebookresearch/fvcore/blob/main/fvcore/transforms/transform.py # noqa + crop_box = geometry.box(x1, y1, x2, y2).buffer(0.0) + cropped_masks = [] + # suppress shapely warnings util it incorporates GEOS>=3.11.2 + # reference: https://github.com/shapely/shapely/issues/1345 + initial_settings = np.seterr() + np.seterr(invalid='ignore') + for poly_per_obj in self.masks: + cropped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + p = geometry.Polygon(p.reshape(-1, 2)).buffer(0.0) + # polygon must be valid to perform intersection. + if not p.is_valid: + continue + cropped = p.intersection(crop_box) + if cropped.is_empty: + continue + if isinstance(cropped, + geometry.collection.BaseMultipartGeometry): + cropped = cropped.geoms + else: + cropped = [cropped] + # one polygon may be cropped to multiple ones + for poly in cropped: + # ignore lines or points + if not isinstance( + poly, geometry.Polygon) or not poly.is_valid: + continue + coords = np.asarray(poly.exterior.coords) + # remove an extra identical vertex at the end + coords = coords[:-1] + coords[:, 0] -= x1 + coords[:, 1] -= y1 + cropped_poly_per_obj.append(coords.reshape(-1)) + # a dummy polygon to avoid misalignment between masks and boxes + if len(cropped_poly_per_obj) == 0: + cropped_poly_per_obj = [np.array([0, 0, 0, 0, 0, 0])] + cropped_masks.append(cropped_poly_per_obj) + np.seterr(**initial_settings) + cropped_masks = PolygonMasks(cropped_masks, h, w) + return cropped_masks + + def pad(self, out_shape, pad_val=0): + """padding has no effect on polygons`""" + return PolygonMasks(self.masks, *out_shape) + + def expand(self, *args, **kwargs): + """TODO: Add expand for polygon""" + raise NotImplementedError + + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device='cpu', + interpolation='bilinear', + binarize=True): + """see :func:`BaseInstanceMasks.crop_and_resize`""" + out_h, out_w = out_shape + if len(self.masks) == 0: + return PolygonMasks([], out_h, out_w) + + if not binarize: + raise ValueError('Polygons are always binary, ' + 'setting binarize=False is unsupported') + + resized_masks = [] + for i in range(len(bboxes)): + mask = self.masks[inds[i]] + bbox = bboxes[i, :] + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + h_scale = out_h / max(h, 0.1) # avoid too large scale + w_scale = out_w / max(w, 0.1) + + resized_mask = [] + for p in mask: + p = p.copy() + # crop + # pycocotools will clip 
the boundary + p[0::2] = p[0::2] - bbox[0] + p[1::2] = p[1::2] - bbox[1] + + # resize + p[0::2] = p[0::2] * w_scale + p[1::2] = p[1::2] * h_scale + resized_mask.append(p) + resized_masks.append(resized_mask) + return PolygonMasks(resized_masks, *out_shape) + + def translate(self, + out_shape, + offset, + direction='horizontal', + border_value=None, + interpolation=None): + """Translate the PolygonMasks. + + Example: + >>> self = PolygonMasks.random(dtype=np.int64) + >>> out_shape = (self.height, self.width) + >>> new = self.translate(out_shape, 4., direction='horizontal') + >>> assert np.all(new.masks[0][0][1::2] == self.masks[0][0][1::2]) + >>> assert np.all(new.masks[0][0][0::2] == self.masks[0][0][0::2] + 4) # noqa: E501 + """ + assert border_value is None or border_value == 0, \ + 'Here border_value is not '\ + f'used, and defaultly should be None or 0. got {border_value}.' + if len(self.masks) == 0: + translated_masks = PolygonMasks([], *out_shape) + else: + translated_masks = [] + for poly_per_obj in self.masks: + translated_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + if direction == 'horizontal': + p[0::2] = np.clip(p[0::2] + offset, 0, out_shape[1]) + elif direction == 'vertical': + p[1::2] = np.clip(p[1::2] + offset, 0, out_shape[0]) + translated_poly_per_obj.append(p) + translated_masks.append(translated_poly_per_obj) + translated_masks = PolygonMasks(translated_masks, *out_shape) + return translated_masks + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """See :func:`BaseInstanceMasks.shear`.""" + if len(self.masks) == 0: + sheared_masks = PolygonMasks([], *out_shape) + else: + sheared_masks = [] + if direction == 'horizontal': + shear_matrix = np.stack([[1, magnitude], + [0, 1]]).astype(np.float32) + elif direction == 'vertical': + shear_matrix = np.stack([[1, 0], [magnitude, + 1]]).astype(np.float32) + for poly_per_obj in self.masks: + sheared_poly = [] + for p in poly_per_obj: + p = np.stack([p[0::2], p[1::2]], axis=0) # [2, n] + new_coords = np.matmul(shear_matrix, p) # [2, n] + new_coords[0, :] = np.clip(new_coords[0, :], 0, + out_shape[1]) + new_coords[1, :] = np.clip(new_coords[1, :], 0, + out_shape[0]) + sheared_poly.append( + new_coords.transpose((1, 0)).reshape(-1)) + sheared_masks.append(sheared_poly) + sheared_masks = PolygonMasks(sheared_masks, *out_shape) + return sheared_masks + + def rotate(self, + out_shape, + angle, + center=None, + scale=1.0, + border_value=0, + interpolation='bilinear'): + """See :func:`BaseInstanceMasks.rotate`.""" + if len(self.masks) == 0: + rotated_masks = PolygonMasks([], *out_shape) + else: + rotated_masks = [] + rotate_matrix = cv2.getRotationMatrix2D(center, -angle, scale) + for poly_per_obj in self.masks: + rotated_poly = [] + for p in poly_per_obj: + p = p.copy() + coords = np.stack([p[0::2], p[1::2]], axis=1) # [n, 2] + # pad 1 to convert from format [x, y] to homogeneous + # coordinates format [x, y, 1] + coords = np.concatenate( + (coords, np.ones((coords.shape[0], 1), coords.dtype)), + axis=1) # [n, 3] + rotated_coords = np.matmul( + rotate_matrix[None, :, :], + coords[:, :, None])[..., 0] # [n, 2, 1] -> [n, 2] + rotated_coords[:, 0] = np.clip(rotated_coords[:, 0], 0, + out_shape[1]) + rotated_coords[:, 1] = np.clip(rotated_coords[:, 1], 0, + out_shape[0]) + rotated_poly.append(rotated_coords.reshape(-1)) + rotated_masks.append(rotated_poly) + rotated_masks = PolygonMasks(rotated_masks, *out_shape) + return rotated_masks + + def 
to_bitmap(self): + """convert polygon masks to bitmap masks.""" + bitmap_masks = self.to_ndarray() + return BitmapMasks(bitmap_masks, self.height, self.width) + + @property + def areas(self): + """Compute areas of masks. + + This func is modified from `detectron2 + `_. + The function only works with Polygons using the shoelace formula. + + Return: + ndarray: areas of each instance + """ # noqa: W501 + area = [] + for polygons_per_obj in self.masks: + area_per_obj = 0 + for p in polygons_per_obj: + area_per_obj += self._polygon_area(p[0::2], p[1::2]) + area.append(area_per_obj) + return np.asarray(area) + + def _polygon_area(self, x, y): + """Compute the area of a component of a polygon. + + Using the shoelace formula: + https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates + + Args: + x (ndarray): x coordinates of the component + y (ndarray): y coordinates of the component + + Return: + float: the are of the component + """ # noqa: 501 + return 0.5 * np.abs( + np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))) + + def to_ndarray(self): + """Convert masks to the format of ndarray.""" + if len(self.masks) == 0: + return np.empty((0, self.height, self.width), dtype=np.uint8) + bitmap_masks = [] + for poly_per_obj in self.masks: + bitmap_masks.append( + polygon_to_bitmap(poly_per_obj, self.height, self.width)) + return np.stack(bitmap_masks) + + def to_tensor(self, dtype, device): + """See :func:`BaseInstanceMasks.to_tensor`.""" + if len(self.masks) == 0: + return torch.empty((0, self.height, self.width), + dtype=dtype, + device=device) + ndarray_masks = self.to_ndarray() + return torch.tensor(ndarray_masks, dtype=dtype, device=device) + + @classmethod + def random(cls, + num_masks=3, + height=32, + width=32, + n_verts=5, + dtype=np.float32, + rng=None): + """Generate random polygon masks for demo / testing purposes. + + Adapted from [1]_ + + References: + .. [1] https://gitlab.kitware.com/computer-vision/kwimage/-/blob/928cae35ca8/kwimage/structs/polygon.py#L379 # noqa: E501 + + Example: + >>> from mmdet.data_elements.mask.structures import PolygonMasks + >>> self = PolygonMasks.random() + >>> print('self = {}'.format(self)) + """ + from mmdet.utils.util_random import ensure_rng + rng = ensure_rng(rng) + + def _gen_polygon(n, irregularity, spikeyness): + """Creates the polygon by sampling points on a circle around the + centre. Random noise is added by varying the angular spacing + between sequential points, and by varying the radial distance of + each point from the centre. + + Based on original code by Mike Ounsworth + + Args: + n (int): number of vertices + irregularity (float): [0,1] indicating how much variance there + is in the angular spacing of vertices. [0,1] will map to + [0, 2pi/numberOfVerts] + spikeyness (float): [0,1] indicating how much variance there is + in each vertex from the circle of radius aveRadius. [0,1] + will map to [0, aveRadius] + + Returns: + a list of vertices, in CCW order. 
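+
+            Example (a rough sketch only; ``_gen_polygon`` is a local
+            helper and relies on the enclosing ``rng``):
+                >>> verts = _gen_polygon(5, irregularity=0.9, spikeyness=0.9)
+                >>> assert verts.shape == (5, 2)
+                >>> assert (verts >= 0).all() and (verts <= 1).all()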
+ """ + from scipy.stats import truncnorm + + # Generate around the unit circle + cx, cy = (0.0, 0.0) + radius = 1 + + tau = np.pi * 2 + + irregularity = np.clip(irregularity, 0, 1) * 2 * np.pi / n + spikeyness = np.clip(spikeyness, 1e-9, 1) + + # generate n angle steps + lower = (tau / n) - irregularity + upper = (tau / n) + irregularity + angle_steps = rng.uniform(lower, upper, n) + + # normalize the steps so that point 0 and point n+1 are the same + k = angle_steps.sum() / (2 * np.pi) + angles = (angle_steps / k).cumsum() + rng.uniform(0, tau) + + # Convert high and low values to be wrt the standard normal range + # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html + low = 0 + high = 2 * radius + mean = radius + std = spikeyness + a = (low - mean) / std + b = (high - mean) / std + tnorm = truncnorm(a=a, b=b, loc=mean, scale=std) + + # now generate the points + radii = tnorm.rvs(n, random_state=rng) + x_pts = cx + radii * np.cos(angles) + y_pts = cy + radii * np.sin(angles) + + points = np.hstack([x_pts[:, None], y_pts[:, None]]) + + # Scale to 0-1 space + points = points - points.min(axis=0) + points = points / points.max(axis=0) + + # Randomly place within 0-1 space + points = points * (rng.rand() * .8 + .2) + min_pt = points.min(axis=0) + max_pt = points.max(axis=0) + + high = (1 - max_pt) + low = (0 - min_pt) + offset = (rng.rand(2) * (high - low)) + low + points = points + offset + return points + + def _order_vertices(verts): + """ + References: + https://stackoverflow.com/questions/1709283/how-can-i-sort-a-coordinate-list-for-a-rectangle-counterclockwise + """ + mlat = verts.T[0].sum() / len(verts) + mlng = verts.T[1].sum() / len(verts) + + tau = np.pi * 2 + angle = (np.arctan2(mlat - verts.T[0], verts.T[1] - mlng) + + tau) % tau + sortx = angle.argsort() + verts = verts.take(sortx, axis=0) + return verts + + # Generate a random exterior for each requested mask + masks = [] + for _ in range(num_masks): + exterior = _order_vertices(_gen_polygon(n_verts, 0.9, 0.9)) + exterior = (exterior * [(width, height)]).astype(dtype) + masks.append([exterior.ravel()]) + + self = cls(masks, height, width) + return self + + @classmethod + def cat(cls: Type[T], masks: Sequence[T]) -> T: + """Concatenate a sequence of masks into one single mask instance. + + Args: + masks (Sequence[PolygonMasks]): A sequence of mask instances. + + Returns: + PolygonMasks: Concatenated mask instance. + """ + assert isinstance(masks, Sequence) + if len(masks) == 0: + raise ValueError('masks should not be an empty list.') + assert all(isinstance(m, cls) for m in masks) + + mask_list = list(itertools.chain(*[m.masks for m in masks])) + return cls(mask_list, masks[0].height, masks[0].width) + + +def polygon_to_bitmap(polygons, height, width): + """Convert masks from the form of polygons to bitmaps. + + Args: + polygons (list[ndarray]): masks in polygon representation + height (int): mask height + width (int): mask width + + Return: + ndarray: the converted masks in bitmap representation + """ + rles = maskUtils.frPyObjects(polygons, height, width) + rle = maskUtils.merge(rles) + bitmap_mask = maskUtils.decode(rle).astype(bool) + return bitmap_mask + + +def bitmap_to_polygon(bitmap): + """Convert masks from the form of bitmaps to polygons. + + Args: + bitmap (ndarray): masks in bitmap representation. + + Return: + list[ndarray]: the converted mask in polygon representation. + bool: whether the mask has holes. 
+ """ + bitmap = np.ascontiguousarray(bitmap).astype(np.uint8) + # cv2.RETR_CCOMP: retrieves all of the contours and organizes them + # into a two-level hierarchy. At the top level, there are external + # boundaries of the components. At the second level, there are + # boundaries of the holes. If there is another contour inside a hole + # of a connected component, it is still put at the top level. + # cv2.CHAIN_APPROX_NONE: stores absolutely all the contour points. + outs = cv2.findContours(bitmap, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) + contours = outs[-2] + hierarchy = outs[-1] + if hierarchy is None: + return [], False + # hierarchy[i]: 4 elements, for the indexes of next, previous, + # parent, or nested contours. If there is no corresponding contour, + # it will be -1. + with_hole = (hierarchy.reshape(-1, 4)[:, 3] >= 0).any() + contours = [c.reshape(-1, 2) for c in contours] + return contours, with_hole diff --git a/mmdetection/mmdet/structures/mask/utils.py b/mmdetection/mmdet/structures/mask/utils.py new file mode 100644 index 0000000..6bd445e --- /dev/null +++ b/mmdetection/mmdet/structures/mask/utils.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import pycocotools.mask as mask_util +import torch +from mmengine.utils import slice_list + + +def split_combined_polys(polys, poly_lens, polys_per_mask): + """Split the combined 1-D polys into masks. + + A mask is represented as a list of polys, and a poly is represented as + a 1-D array. In dataset, all masks are concatenated into a single 1-D + tensor. Here we need to split the tensor into original representations. + + Args: + polys (list): a list (length = image num) of 1-D tensors + poly_lens (list): a list (length = image num) of poly length + polys_per_mask (list): a list (length = image num) of poly number + of each mask + + Returns: + list: a list (length = image num) of list (length = mask num) of \ + list (length = poly num) of numpy array. + """ + mask_polys_list = [] + for img_id in range(len(polys)): + polys_single = polys[img_id] + polys_lens_single = poly_lens[img_id].tolist() + polys_per_mask_single = polys_per_mask[img_id].tolist() + + split_polys = slice_list(polys_single, polys_lens_single) + mask_polys = slice_list(split_polys, polys_per_mask_single) + mask_polys_list.append(mask_polys) + return mask_polys_list + + +# TODO: move this function to more proper place +def encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. + + Args: + mask_results (list): bitmap mask results. + + Returns: + list | tuple: RLE encoded mask. + """ + encoded_mask_results = [] + for mask in mask_results: + encoded_mask_results.append( + mask_util.encode( + np.array(mask[:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return encoded_mask_results + + +def mask2bbox(masks): + """Obtain tight bounding boxes of binary masks. + + Args: + masks (Tensor): Binary mask of shape (n, h, w). + + Returns: + Tensor: Bboxe with shape (n, 4) of \ + positive region in binary mask. 
+ """ + N = masks.shape[0] + bboxes = masks.new_zeros((N, 4), dtype=torch.float32) + x_any = torch.any(masks, dim=1) + y_any = torch.any(masks, dim=2) + for i in range(N): + x = torch.where(x_any[i, :])[0] + y = torch.where(y_any[i, :])[0] + if len(x) > 0 and len(y) > 0: + bboxes[i, :] = bboxes.new_tensor( + [x[0], y[0], x[-1] + 1, y[-1] + 1]) + + return bboxes diff --git a/mmdetection/mmdet/structures/reid_data_sample.py b/mmdetection/mmdet/structures/reid_data_sample.py new file mode 100644 index 0000000..69958ee --- /dev/null +++ b/mmdetection/mmdet/structures/reid_data_sample.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from numbers import Number +from typing import Sequence, Union + +import mmengine +import numpy as np +import torch +from mmengine.structures import BaseDataElement, LabelData + + +def format_label(value: Union[torch.Tensor, np.ndarray, Sequence, int], + num_classes: int = None) -> LabelData: + """Convert label of various python types to :obj:`mmengine.LabelData`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence | int): Label value. + num_classes (int, optional): The number of classes. If not None, set + it to the metainfo. Defaults to None. + + Returns: + :obj:`mmengine.LabelData`: The foramtted label data. + """ + + # Handle single number + if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0: + value = int(value.item()) + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value) + elif isinstance(value, Sequence) and not mmengine.utils.is_str(value): + value = torch.tensor(value) + elif isinstance(value, int): + value = torch.LongTensor([value]) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + + metainfo = {} + if num_classes is not None: + metainfo['num_classes'] = num_classes + if value.max() >= num_classes: + raise ValueError(f'The label data ({value}) should not ' + f'exceed num_classes ({num_classes}).') + label = LabelData(label=value, metainfo=metainfo) + return label + + +class ReIDDataSample(BaseDataElement): + """A data structure interface of ReID task. + + It's used as interfaces between different components. + + Meta field: + img_shape (Tuple): The shape of the corresponding input image. + Used for visualization. + ori_shape (Tuple): The original shape of the corresponding image. + Used for visualization. + num_classes (int): The number of all categories. + Used for label format conversion. + + Data field: + gt_label (LabelData): The ground truth label. + pred_label (LabelData): The predicted label. + scores (torch.Tensor): The outputs of model. 
+ """ + + @property + def gt_label(self): + return self._gt_label + + @gt_label.setter + def gt_label(self, value: LabelData): + self.set_field(value, '_gt_label', dtype=LabelData) + + @gt_label.deleter + def gt_label(self): + del self._gt_label + + def set_gt_label( + self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number] + ) -> 'ReIDDataSample': + """Set label of ``gt_label``.""" + label = format_label(value, self.get('num_classes')) + if 'gt_label' in self: # setting for the second time + self.gt_label.label = label.label + else: # setting for the first time + self.gt_label = label + return self + + def set_gt_score(self, value: torch.Tensor) -> 'ReIDDataSample': + """Set score of ``gt_label``.""" + assert isinstance(value, torch.Tensor), \ + f'The value should be a torch.Tensor but got {type(value)}.' + assert value.ndim == 1, \ + f'The dims of value should be 1, but got {value.ndim}.' + + if 'num_classes' in self: + assert value.size(0) == self.num_classes, \ + f"The length of value ({value.size(0)}) doesn't "\ + f'match the num_classes ({self.num_classes}).' + metainfo = {'num_classes': self.num_classes} + else: + metainfo = {'num_classes': value.size(0)} + + if 'gt_label' in self: # setting for the second time + self.gt_label.score = value + else: # setting for the first time + self.gt_label = LabelData(score=value, metainfo=metainfo) + return self + + @property + def pred_feature(self): + return self._pred_feature + + @pred_feature.setter + def pred_feature(self, value: torch.Tensor): + self.set_field(value, '_pred_feature', dtype=torch.Tensor) + + @pred_feature.deleter + def pred_feature(self): + del self._pred_feature diff --git a/mmdetection/mmdet/structures/track_data_sample.py b/mmdetection/mmdet/structures/track_data_sample.py new file mode 100644 index 0000000..d005a5a --- /dev/null +++ b/mmdetection/mmdet/structures/track_data_sample.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence + +import numpy as np +import torch +from mmengine.structures import BaseDataElement + +from .det_data_sample import DetDataSample + + +class TrackDataSample(BaseDataElement): + """A data structure interface of tracking task in MMDetection. It is used + as interfaces between different components. + + This data structure can be viewd as a wrapper of multiple DetDataSample to + some extent. Specifically, it only contains a property: + ``video_data_samples`` which is a list of DetDataSample, each of which + corresponds to a single frame. If you want to get the property of a single + frame, you must first get the corresponding ``DetDataSample`` by indexing + and then get the property of the frame, such as ``gt_instances``, + ``pred_instances`` and so on. As for metainfo, it differs from + ``DetDataSample`` in that each value corresponds to the metainfo key is a + list where each element corresponds to information of a single frame. + + Examples: + >>> import torch + >>> from mmengine.structures import InstanceData + >>> from mmdet.structures import DetDataSample, TrackDataSample + >>> track_data_sample = TrackDataSample() + >>> # set the 1st frame + >>> frame1_data_sample = DetDataSample(metainfo=dict( + ... img_shape=(100, 100), frame_id=0)) + >>> frame1_gt_instances = InstanceData() + >>> frame1_gt_instances.bbox = torch.zeros([2, 4]) + >>> frame1_data_sample.gt_instances = frame1_gt_instances + >>> # set the 2nd frame + >>> frame2_data_sample = DetDataSample(metainfo=dict( + ... 
img_shape=(100, 100), frame_id=1)) + >>> frame2_gt_instances = InstanceData() + >>> frame2_gt_instances.bbox = torch.ones([3, 4]) + >>> frame2_data_sample.gt_instances = frame2_gt_instances + >>> track_data_sample.video_data_samples = [frame1_data_sample, + ... frame2_data_sample] + >>> # set metainfo for track_data_sample + >>> track_data_sample.set_metainfo(dict(key_frames_inds=[0])) + >>> track_data_sample.set_metainfo(dict(ref_frames_inds=[1])) + >>> print(track_data_sample) + + ) at 0x7f64bd223340>, + ) at 0x7f64bd1346d0>] + ) at 0x7f64bd2237f0> + >>> print(len(track_data_sample)) + 2 + >>> key_data_sample = track_data_sample.get_key_frames() + >>> print(key_data_sample[0].frame_id) + 0 + >>> ref_data_sample = track_data_sample.get_ref_frames() + >>> print(ref_data_sample[0].frame_id) + 1 + >>> frame1_data_sample = track_data_sample[0] + >>> print(frame1_data_sample.gt_instances.bbox) + tensor([[0., 0., 0., 0.], + [0., 0., 0., 0.]]) + >>> # Tensor-like methods + >>> cuda_track_data_sample = track_data_sample.to('cuda') + >>> cuda_track_data_sample = track_data_sample.cuda() + >>> cpu_track_data_sample = track_data_sample.cpu() + >>> cpu_track_data_sample = track_data_sample.to('cpu') + >>> fp16_instances = cuda_track_data_sample.to( + ... device=None, dtype=torch.float16, non_blocking=False, + ... copy=False, memory_format=torch.preserve_format) + """ + + @property + def video_data_samples(self) -> List[DetDataSample]: + return self._video_data_samples + + @video_data_samples.setter + def video_data_samples(self, value: List[DetDataSample]): + if isinstance(value, DetDataSample): + value = [value] + assert isinstance(value, list), 'video_data_samples must be a list' + assert isinstance( + value[0], DetDataSample + ), 'video_data_samples must be a list of DetDataSample, but got ' + f'{value[0]}' + self.set_field(value, '_video_data_samples', dtype=list) + + @video_data_samples.deleter + def video_data_samples(self): + del self._video_data_samples + + def __getitem__(self, index): + assert hasattr(self, + '_video_data_samples'), 'video_data_samples not set' + return self._video_data_samples[index] + + def get_key_frames(self): + assert hasattr(self, 'key_frames_inds'), \ + 'key_frames_inds not set' + assert isinstance(self.key_frames_inds, Sequence) + key_frames_info = [] + for index in self.key_frames_inds: + key_frames_info.append(self[index]) + return key_frames_info + + def get_ref_frames(self): + assert hasattr(self, 'ref_frames_inds'), \ + 'ref_frames_inds not set' + ref_frames_info = [] + assert isinstance(self.ref_frames_inds, Sequence) + for index in self.ref_frames_inds: + ref_frames_info.append(self[index]) + return ref_frames_info + + def __len__(self): + return len(self._video_data_samples) if hasattr( + self, '_video_data_samples') else 0 + + # TODO: add UT for this Tensor-like method + # Tensor-like methods + def to(self, *args, **kwargs) -> 'BaseDataElement': + """Apply same name function to all tensors in data_fields.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if hasattr(v, 'to'): + v = v.to(*args, **kwargs) + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def cpu(self) -> 'BaseDataElement': + """Convert all tensors to CPU in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.cpu() + data_list.append(v) + if len(data_list) 
> 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def cuda(self) -> 'BaseDataElement': + """Convert all tensors to GPU in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.cuda() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def npu(self) -> 'BaseDataElement': + """Convert all tensors to NPU in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.npu() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def detach(self) -> 'BaseDataElement': + """Detach all tensors in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.detach() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def numpy(self) -> 'BaseDataElement': + """Convert all tensors to np.ndarray in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.detach().cpu().numpy() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + def to_tensor(self) -> 'BaseDataElement': + """Convert all np.ndarray to tensor in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, np.ndarray): + v = torch.from_numpy(v) + elif isinstance(v, BaseDataElement): + v = v.to_tensor() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def clone(self) -> 'BaseDataElement': + """Deep copy the current data element. + + Returns: + BaseDataElement: The copy of current data element. + """ + clone_data = self.__class__() + clone_data.set_metainfo(dict(self.metainfo_items())) + + for k, v_list in self.items(): + clone_item_list = [] + for v in v_list: + clone_item_list.append(v.clone()) + clone_data.set_data({k: clone_item_list}) + return clone_data + + +TrackSampleList = List[TrackDataSample] +OptTrackSampleList = Optional[TrackSampleList] diff --git a/mmdetection/mmdet/testing/__init__.py b/mmdetection/mmdet/testing/__init__.py new file mode 100644 index 0000000..766fb47 --- /dev/null +++ b/mmdetection/mmdet/testing/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ._fast_stop_training_hook import FastStopTrainingHook # noqa: F401,F403 +from ._utils import (demo_mm_inputs, demo_mm_proposals, + demo_mm_sampling_results, demo_track_inputs, + get_detector_cfg, get_roi_head_cfg, random_boxes, + replace_to_ceph) + +__all__ = [ + 'demo_mm_inputs', 'get_detector_cfg', 'get_roi_head_cfg', + 'demo_mm_proposals', 'demo_mm_sampling_results', 'replace_to_ceph', + 'demo_track_inputs', 'VideoDataSampleFeeder', 'random_boxes' +] diff --git a/mmdetection/mmdet/testing/_fast_stop_training_hook.py b/mmdetection/mmdet/testing/_fast_stop_training_hook.py new file mode 100644 index 0000000..f8e3d11 --- /dev/null +++ b/mmdetection/mmdet/testing/_fast_stop_training_hook.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from mmengine.hooks import Hook + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class FastStopTrainingHook(Hook): + """Set runner's epoch information to the model.""" + + def __init__(self, by_epoch, save_ckpt=False, stop_iter_or_epoch=5): + self.by_epoch = by_epoch + self.save_ckpt = save_ckpt + self.stop_iter_or_epoch = stop_iter_or_epoch + + def after_train_iter(self, runner, batch_idx: int, data_batch: None, + outputs: None) -> None: + if self.save_ckpt and self.by_epoch: + # If it is epoch-based and want to save weights, + # we must run at least 1 epoch. + return + if runner.iter >= self.stop_iter_or_epoch: + raise RuntimeError('quick exit') + + def after_train_epoch(self, runner) -> None: + if runner.epoch >= self.stop_iter_or_epoch - 1: + raise RuntimeError('quick exit') diff --git a/mmdetection/mmdet/testing/_utils.py b/mmdetection/mmdet/testing/_utils.py new file mode 100644 index 0000000..c4d3a86 --- /dev/null +++ b/mmdetection/mmdet/testing/_utils.py @@ -0,0 +1,469 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from os.path import dirname, exists, join + +import numpy as np +import torch +from mmengine.config import Config +from mmengine.dataset import pseudo_collate +from mmengine.structures import InstanceData, PixelData + +from mmdet.utils.util_random import ensure_rng +from ..registry import TASK_UTILS +from ..structures import DetDataSample, TrackDataSample +from ..structures.bbox import HorizontalBoxes + + +def _get_config_directory(): + """Find the predefined detector config directory.""" + try: + # Assume we are running in the source mmdetection repo + repo_dpath = dirname(dirname(dirname(__file__))) + except NameError: + # For IPython development when this __file__ is not defined + import mmdet + repo_dpath = dirname(dirname(mmdet.__file__)) + config_dpath = join(repo_dpath, 'configs') + if not exists(config_dpath): + raise Exception('Cannot find config path') + return config_dpath + + +def _get_config_module(fname): + """Load a configuration as a python module.""" + config_dpath = _get_config_directory() + config_fpath = join(config_dpath, fname) + config_mod = Config.fromfile(config_fpath) + return config_mod + + +def get_detector_cfg(fname): + """Grab configs necessary to create a detector. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + return model + + +def get_roi_head_cfg(fname): + """Grab configs necessary to create a roi_head. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. 
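+
+    Example (a minimal sketch; the config path below is only an assumed
+    placeholder and must exist under ``configs/``):
+        >>> roi_head = get_roi_head_cfg(
+        ...     'faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py')
+        >>> assert 'train_cfg' in roi_head and 'test_cfg' in roi_head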
+ """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + + roi_head = model.roi_head + train_cfg = None if model.train_cfg is None else model.train_cfg.rcnn + test_cfg = None if model.test_cfg is None else model.test_cfg.rcnn + roi_head.update(dict(train_cfg=train_cfg, test_cfg=test_cfg)) + return roi_head + + +def _rand_bboxes(rng, num_boxes, w, h): + cx, cy, bw, bh = rng.rand(num_boxes, 4).T + + tl_x = ((cx * w) - (w * bw / 2)).clip(0, w) + tl_y = ((cy * h) - (h * bh / 2)).clip(0, h) + br_x = ((cx * w) + (w * bw / 2)).clip(0, w) + br_y = ((cy * h) + (h * bh / 2)).clip(0, h) + + bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T + return bboxes + + +def _rand_masks(rng, num_boxes, bboxes, img_w, img_h): + from mmdet.structures.mask import BitmapMasks + masks = np.zeros((num_boxes, img_h, img_w)) + for i, bbox in enumerate(bboxes): + bbox = bbox.astype(np.int32) + mask = (rng.rand(1, bbox[3] - bbox[1], bbox[2] - bbox[0]) > + 0.3).astype(np.int64) + masks[i:i + 1, bbox[1]:bbox[3], bbox[0]:bbox[2]] = mask + return BitmapMasks(masks, height=img_h, width=img_w) + + +def demo_mm_inputs(batch_size=2, + image_shapes=(3, 128, 128), + num_items=None, + num_classes=10, + sem_seg_output_strides=1, + with_mask=False, + with_semantic=False, + use_box_type=False, + device='cpu', + texts=None, + custom_entities=False): + """Create a superset of inputs needed to run test or train batches. + + Args: + batch_size (int): batch size. Defaults to 2. + image_shapes (List[tuple], Optional): image shape. + Defaults to (3, 128, 128) + num_items (None | List[int]): specifies the number + of boxes in each batch item. Default to None. + num_classes (int): number of different labels a + box might have. Defaults to 10. + with_mask (bool): Whether to return mask annotation. + Defaults to False. + with_semantic (bool): whether to return semantic. + Defaults to False. + device (str): Destination device type. Defaults to cpu. 
+ """ + rng = np.random.RandomState(0) + + if isinstance(image_shapes, list): + assert len(image_shapes) == batch_size + else: + image_shapes = [image_shapes] * batch_size + + if isinstance(num_items, list): + assert len(num_items) == batch_size + + if texts is not None: + assert batch_size == len(texts) + + packed_inputs = [] + for idx in range(batch_size): + image_shape = image_shapes[idx] + c, h, w = image_shape + + image = rng.randint(0, 255, size=image_shape, dtype=np.uint8) + + mm_inputs = dict() + mm_inputs['inputs'] = torch.from_numpy(image).to(device) + + img_meta = { + 'img_id': idx, + 'img_shape': image_shape[1:], + 'ori_shape': image_shape[1:], + 'filename': '.png', + 'scale_factor': np.array([1.1, 1.2]), + 'flip': False, + 'flip_direction': None, + 'border': [1, 1, 1, 1] # Only used by CenterNet + } + + if texts: + img_meta['text'] = texts[idx] + img_meta['custom_entities'] = custom_entities + + data_sample = DetDataSample() + data_sample.set_metainfo(img_meta) + + # gt_instances + gt_instances = InstanceData() + if num_items is None: + num_boxes = rng.randint(1, 10) + else: + num_boxes = num_items[idx] + + bboxes = _rand_bboxes(rng, num_boxes, w, h) + labels = rng.randint(1, num_classes, size=num_boxes) + # TODO: remove this part when all model adapted with BaseBoxes + if use_box_type: + gt_instances.bboxes = HorizontalBoxes(bboxes, dtype=torch.float32) + else: + gt_instances.bboxes = torch.FloatTensor(bboxes) + gt_instances.labels = torch.LongTensor(labels) + + if with_mask: + masks = _rand_masks(rng, num_boxes, bboxes, w, h) + gt_instances.masks = masks + + # TODO: waiting for ci to be fixed + # masks = np.random.randint(0, 2, (len(bboxes), h, w), dtype=np.uint8) + # gt_instances.mask = BitmapMasks(masks, h, w) + + data_sample.gt_instances = gt_instances + + # ignore_instances + ignore_instances = InstanceData() + bboxes = _rand_bboxes(rng, num_boxes, w, h) + if use_box_type: + ignore_instances.bboxes = HorizontalBoxes( + bboxes, dtype=torch.float32) + else: + ignore_instances.bboxes = torch.FloatTensor(bboxes) + data_sample.ignored_instances = ignore_instances + + # gt_sem_seg + if with_semantic: + # assume gt_semantic_seg using scale 1/8 of the img + gt_semantic_seg = torch.from_numpy( + np.random.randint( + 0, + num_classes, (1, h // sem_seg_output_strides, + w // sem_seg_output_strides), + dtype=np.uint8)) + gt_sem_seg_data = dict(sem_seg=gt_semantic_seg) + data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + + mm_inputs['data_samples'] = data_sample.to(device) + + # TODO: gt_ignore + + packed_inputs.append(mm_inputs) + data = pseudo_collate(packed_inputs) + return data + + +def demo_mm_proposals(image_shapes, num_proposals, device='cpu'): + """Create a list of fake porposals. + + Args: + image_shapes (list[tuple[int]]): Batch image shapes. + num_proposals (int): The number of fake proposals. 
+ """ + rng = np.random.RandomState(0) + + results = [] + for img_shape in image_shapes: + result = InstanceData() + w, h = img_shape[1:] + proposals = _rand_bboxes(rng, num_proposals, w, h) + result.bboxes = torch.from_numpy(proposals).float() + result.scores = torch.from_numpy(rng.rand(num_proposals)).float() + result.labels = torch.zeros(num_proposals).long() + results.append(result.to(device)) + return results + + +def demo_mm_sampling_results(proposals_list, + batch_gt_instances, + batch_gt_instances_ignore=None, + assigner_cfg=None, + sampler_cfg=None, + feats=None): + """Create sample results that can be passed to BBoxHead.get_targets.""" + assert len(proposals_list) == len(batch_gt_instances) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None for _ in batch_gt_instances] + else: + assert len(batch_gt_instances_ignore) == len(batch_gt_instances) + + default_assigner_cfg = dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1) + assigner_cfg = assigner_cfg if assigner_cfg is not None \ + else default_assigner_cfg + default_sampler_cfg = dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True) + sampler_cfg = sampler_cfg if sampler_cfg is not None \ + else default_sampler_cfg + bbox_assigner = TASK_UTILS.build(assigner_cfg) + bbox_sampler = TASK_UTILS.build(sampler_cfg) + + sampling_results = [] + for i in range(len(batch_gt_instances)): + if feats is not None: + feats = [lvl_feat[i][None] for lvl_feat in feats] + # rename proposals.bboxes to proposals.priors + proposals = proposals_list[i] + proposals.priors = proposals.pop('bboxes') + + assign_result = bbox_assigner.assign(proposals, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = bbox_sampler.sample( + assign_result, proposals, batch_gt_instances[i], feats=feats) + sampling_results.append(sampling_result) + + return sampling_results + + +def demo_track_inputs(batch_size=1, + num_frames=2, + key_frames_inds=None, + image_shapes=(3, 128, 128), + num_items=None, + num_classes=1, + with_mask=False, + with_semantic=False): + """Create a superset of inputs needed to run test or train batches. + + Args: + batch_size (int): batch size. Default to 1. + num_frames (int): The number of frames. + key_frames_inds (List): The indices of key frames. + image_shapes (List[tuple], Optional): image shape. + Default to (3, 128, 128) + num_items (None | List[int]): specifies the number + of boxes in each batch item. Default to None. + num_classes (int): number of different labels a + box might have. Default to 1. + with_mask (bool): Whether to return mask annotation. + Defaults to False. + with_semantic (bool): whether to return semantic. + Default to False. 
+ """ + rng = np.random.RandomState(0) + + # Make sure the length of image_shapes is equal to ``batch_size`` + if isinstance(image_shapes, list): + assert len(image_shapes) == batch_size + else: + image_shapes = [image_shapes] * batch_size + + packed_inputs = [] + for idx in range(batch_size): + mm_inputs = dict(inputs=dict()) + _, h, w = image_shapes[idx] + + imgs = rng.randint( + 0, 255, size=(num_frames, *image_shapes[idx]), dtype=np.uint8) + mm_inputs['inputs'] = torch.from_numpy(imgs) + + img_meta = { + 'img_id': idx, + 'img_shape': image_shapes[idx][-2:], + 'ori_shape': image_shapes[idx][-2:], + 'filename': '.png', + 'scale_factor': np.array([1.1, 1.2]), + 'flip': False, + 'flip_direction': None, + 'is_video_data': True, + } + + video_data_samples = [] + for i in range(num_frames): + data_sample = DetDataSample() + img_meta['frame_id'] = i + data_sample.set_metainfo(img_meta) + + # gt_instances + gt_instances = InstanceData() + if num_items is None: + num_boxes = rng.randint(1, 10) + else: + num_boxes = num_items[idx] + + bboxes = _rand_bboxes(rng, num_boxes, w, h) + labels = rng.randint(0, num_classes, size=num_boxes) + instances_id = rng.randint(100, num_classes + 100, size=num_boxes) + gt_instances.bboxes = torch.FloatTensor(bboxes) + gt_instances.labels = torch.LongTensor(labels) + gt_instances.instances_ids = torch.LongTensor(instances_id) + + if with_mask: + masks = _rand_masks(rng, num_boxes, bboxes, w, h) + gt_instances.masks = masks + + data_sample.gt_instances = gt_instances + # ignore_instances + ignore_instances = InstanceData() + bboxes = _rand_bboxes(rng, num_boxes, w, h) + ignore_instances.bboxes = bboxes + data_sample.ignored_instances = ignore_instances + + video_data_samples.append(data_sample) + + track_data_sample = TrackDataSample() + track_data_sample.video_data_samples = video_data_samples + if key_frames_inds is not None: + assert isinstance( + key_frames_inds, + list) and len(key_frames_inds) < num_frames and max( + key_frames_inds) < num_frames + ref_frames_inds = [ + i for i in range(num_frames) if i not in key_frames_inds + ] + track_data_sample.set_metainfo( + dict(key_frames_inds=key_frames_inds)) + track_data_sample.set_metainfo( + dict(ref_frames_inds=ref_frames_inds)) + mm_inputs['data_samples'] = track_data_sample + + # TODO: gt_ignore + packed_inputs.append(mm_inputs) + data = pseudo_collate(packed_inputs) + return data + + +def random_boxes(num=1, scale=1, rng=None): + """Simple version of ``kwimage.Boxes.random`` + Returns: + Tensor: shape (n, 4) in x1, y1, x2, y2 format. 
+ References: + https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390 # noqa: E501 + Example: + >>> num = 3 + >>> scale = 512 + >>> rng = 0 + >>> boxes = random_boxes(num, scale, rng) + >>> print(boxes) + tensor([[280.9925, 278.9802, 308.6148, 366.1769], + [216.9113, 330.6978, 224.0446, 456.5878], + [405.3632, 196.3221, 493.3953, 270.7942]]) + """ + rng = ensure_rng(rng) + + tlbr = rng.rand(num, 4).astype(np.float32) + + tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2]) + tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3]) + br_x = np.maximum(tlbr[:, 0], tlbr[:, 2]) + br_y = np.maximum(tlbr[:, 1], tlbr[:, 3]) + + tlbr[:, 0] = tl_x * scale + tlbr[:, 1] = tl_y * scale + tlbr[:, 2] = br_x * scale + tlbr[:, 3] = br_y * scale + + boxes = torch.from_numpy(tlbr) + return boxes + + +# TODO: Support full ceph +def replace_to_ceph(cfg): + backend_args = dict( + backend='petrel', + path_mapping=dict({ + './data/': 's3://openmmlab/datasets/detection/', + 'data/': 's3://openmmlab/datasets/detection/' + })) + + # TODO: name is a reserved interface, which will be used later. + def _process_pipeline(dataset, name): + + def replace_img(pipeline): + if pipeline['type'] == 'LoadImageFromFile': + pipeline['backend_args'] = backend_args + + def replace_ann(pipeline): + if pipeline['type'] == 'LoadAnnotations' or pipeline[ + 'type'] == 'LoadPanopticAnnotations': + pipeline['backend_args'] = backend_args + + if 'pipeline' in dataset: + replace_img(dataset.pipeline[0]) + replace_ann(dataset.pipeline[1]) + if 'dataset' in dataset: + # dataset wrapper + replace_img(dataset.dataset.pipeline[0]) + replace_ann(dataset.dataset.pipeline[1]) + else: + # dataset wrapper + replace_img(dataset.dataset.pipeline[0]) + replace_ann(dataset.dataset.pipeline[1]) + + def _process_evaluator(evaluator, name): + if evaluator['type'] == 'CocoPanopticMetric': + evaluator['backend_args'] = backend_args + + # half ceph + _process_pipeline(cfg.train_dataloader.dataset, cfg.filename) + _process_pipeline(cfg.val_dataloader.dataset, cfg.filename) + _process_pipeline(cfg.test_dataloader.dataset, cfg.filename) + _process_evaluator(cfg.val_evaluator, cfg.filename) + _process_evaluator(cfg.test_evaluator, cfg.filename) diff --git a/mmdetection/mmdet/utils/__init__.py b/mmdetection/mmdet/utils/__init__.py new file mode 100644 index 0000000..449a890 --- /dev/null +++ b/mmdetection/mmdet/utils/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .collect_env import collect_env +from .compat_config import compat_cfg +from .dist_utils import (all_reduce_dict, allreduce_grads, reduce_mean, + sync_random_seed) +from .logger import get_caller_name, log_img_scale +from .memory import AvoidCUDAOOM, AvoidOOM +from .misc import (find_latest_checkpoint, get_test_pipeline_cfg, + update_data_root) +from .mot_error_visualize import imshow_mot_errors +from .replace_cfg_vals import replace_cfg_vals +from .setup_env import (register_all_modules, setup_cache_size_limit_of_dynamo, + setup_multi_processes) +from .split_batch import split_batch +from .typing_utils import (ConfigType, InstanceList, MultiConfig, + OptConfigType, OptInstanceList, OptMultiConfig, + OptPixelList, PixelList, RangeType) + +__all__ = [ + 'collect_env', 'find_latest_checkpoint', 'update_data_root', + 'setup_multi_processes', 'get_caller_name', 'log_img_scale', 'compat_cfg', + 'split_batch', 'register_all_modules', 'replace_cfg_vals', 'AvoidOOM', + 'AvoidCUDAOOM', 'all_reduce_dict', 'allreduce_grads', 'reduce_mean', + 'sync_random_seed', 'ConfigType', 'InstanceList', 'MultiConfig', + 'OptConfigType', 'OptInstanceList', 'OptMultiConfig', 'OptPixelList', + 'PixelList', 'RangeType', 'get_test_pipeline_cfg', + 'setup_cache_size_limit_of_dynamo', 'imshow_mot_errors' +] diff --git a/mmdetection/mmdet/utils/benchmark.py b/mmdetection/mmdet/utils/benchmark.py new file mode 100644 index 0000000..5419b2d --- /dev/null +++ b/mmdetection/mmdet/utils/benchmark.py @@ -0,0 +1,529 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import time +from functools import partial +from typing import List, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import fuse_conv_bn +# TODO need update +# from mmcv.runner import wrap_fp16_model +from mmengine import MMLogger +from mmengine.config import Config +from mmengine.device import get_max_cuda_memory +from mmengine.dist import get_world_size +from mmengine.runner import Runner, load_checkpoint +from mmengine.utils.dl_utils import set_multi_processing +from torch.nn.parallel import DistributedDataParallel + +from mmdet.registry import DATASETS, MODELS + +try: + import psutil +except ImportError: + psutil = None + + +def custom_round(value: Union[int, float], + factor: Union[int, float], + precision: int = 2) -> float: + """Custom round function.""" + return round(value / factor, precision) + + +gb_round = partial(custom_round, factor=1024**3) + + +def print_log(msg: str, logger: Optional[MMLogger] = None) -> None: + """Print a log message.""" + if logger is None: + print(msg, flush=True) + else: + logger.info(msg) + + +def print_process_memory(p: psutil.Process, + logger: Optional[MMLogger] = None) -> None: + """print process memory info.""" + mem_used = gb_round(psutil.virtual_memory().used) + memory_full_info = p.memory_full_info() + uss_mem = gb_round(memory_full_info.uss) + if hasattr(memory_full_info, 'pss'): + pss_mem = gb_round(memory_full_info.pss) + + for children in p.children(): + child_mem_info = children.memory_full_info() + uss_mem += gb_round(child_mem_info.uss) + if hasattr(child_mem_info, 'pss'): + pss_mem += gb_round(child_mem_info.pss) + + process_count = 1 + len(p.children()) + + log_msg = f'(GB) mem_used: {mem_used:.2f} | uss: {uss_mem:.2f} | ' + if hasattr(memory_full_info, 'pss'): + log_msg += f'pss: {pss_mem:.2f} | ' + log_msg += f'total_proc: {process_count}' + print_log(log_msg, logger) + + +class BaseBenchmark: + """The benchmark base class. 
+ + The ``run`` method is an external calling interface, and it will + call the ``run_once`` method ``repeat_num`` times for benchmarking. + Finally, call the ``average_multiple_runs`` method to further process + the results of multiple runs. + + Args: + max_iter (int): maximum iterations of benchmark. + log_interval (int): interval of logging. + num_warmup (int): Number of Warmup. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + max_iter: int, + log_interval: int, + num_warmup: int, + logger: Optional[MMLogger] = None): + self.max_iter = max_iter + self.log_interval = log_interval + self.num_warmup = num_warmup + self.logger = logger + + def run(self, repeat_num: int = 1) -> dict: + """benchmark entry method. + + Args: + repeat_num (int): Number of repeat benchmark. + Defaults to 1. + """ + assert repeat_num >= 1 + + results = [] + for _ in range(repeat_num): + results.append(self.run_once()) + + results = self.average_multiple_runs(results) + return results + + def run_once(self) -> dict: + """Executes the benchmark once.""" + raise NotImplementedError() + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + raise NotImplementedError() + + +class InferenceBenchmark(BaseBenchmark): + """The inference benchmark class. It will be statistical inference FPS, + CUDA memory and CPU memory information. + + Args: + cfg (mmengine.Config): config. + checkpoint (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + distributed (bool): distributed testing flag. + is_fuse_conv_bn (bool): Whether to fuse conv and bn, this will + slightly increase the inference speed. + max_iter (int): maximum iterations of benchmark. Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + cfg: Config, + checkpoint: str, + distributed: bool, + is_fuse_conv_bn: bool, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + + assert get_world_size( + ) == 1, 'Inference benchmark does not allow distributed multi-GPU' + + self.cfg = copy.deepcopy(cfg) + self.distributed = distributed + + if psutil is None: + raise ImportError('psutil is not installed, please install it by: ' + 'pip install psutil') + + self._process = psutil.Process() + env_cfg = self.cfg.get('env_cfg') + if env_cfg.get('cudnn_benchmark'): + torch.backends.cudnn.benchmark = True + + mp_cfg: dict = env_cfg.get('mp_cfg', {}) + set_multi_processing(**mp_cfg, distributed=self.distributed) + + print_log('before build: ', self.logger) + print_process_memory(self._process, self.logger) + + self.model = self._init_model(checkpoint, is_fuse_conv_bn) + + # Because multiple processes will occupy additional CPU resources, + # FPS statistics will be more unstable when num_workers is not 0. + # It is reasonable to set num_workers to 0. 
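+        # Benchmark on the test split with a batch size of 1 so that the
+        # reported FPS corresponds to single-image inference.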
+ dataloader_cfg = cfg.test_dataloader + dataloader_cfg['num_workers'] = 0 + dataloader_cfg['batch_size'] = 1 + dataloader_cfg['persistent_workers'] = False + self.data_loader = Runner.build_dataloader(dataloader_cfg) + + print_log('after build: ', self.logger) + print_process_memory(self._process, self.logger) + + def _init_model(self, checkpoint: str, is_fuse_conv_bn: bool) -> nn.Module: + """Initialize the model.""" + model = MODELS.build(self.cfg.model) + # TODO need update + # fp16_cfg = self.cfg.get('fp16', None) + # if fp16_cfg is not None: + # wrap_fp16_model(model) + + load_checkpoint(model, checkpoint, map_location='cpu') + if is_fuse_conv_bn: + model = fuse_conv_bn(model) + + model = model.cuda() + + if self.distributed: + model = DistributedDataParallel( + model, + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=False) + + model.eval() + return model + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + for i, data in enumerate(self.data_loader): + + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + torch.cuda.synchronize() + start_time = time.perf_counter() + + with torch.no_grad(): + self.model.test_step(data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= self.num_warmup: + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + cuda_memory = get_max_cuda_memory() + + print_log( + f'Done image [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} img/s, ' + f'times per image: {1000 / fps:.1f} ms/img, ' + f'cuda memory: {cuda_memory} MB', self.logger) + print_process_memory(self._process, self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - self.num_warmup) / pure_inf_time + break + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' + 'times per image: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + 'ms/img', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} img/s, ' + f'times per image: {1000 / fps_list_[0]:.1f} ms/img', + self.logger) + + print_log(f'cuda memory: {get_max_cuda_memory()} MB', self.logger) + print_process_memory(self._process, self.logger) + + return outputs + + +class DataLoaderBenchmark(BaseBenchmark): + """The dataloader benchmark class. It will be statistical inference FPS and + CPU memory information. + + Args: + cfg (mmengine.Config): config. + distributed (bool): distributed testing flag. + dataset_type (str): benchmark data type, only supports ``train``, + ``val`` and ``test``. + max_iter (int): maximum iterations of benchmark. Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. 
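+
+    Example:
+        >>> # Illustrative sketch; ``cfg`` is assumed to be an already loaded
+        >>> # mmengine.Config that contains a ``val_dataloader`` section.
+        >>> benchmark = DataLoaderBenchmark(
+        ...     cfg, distributed=False, dataset_type='val', max_iter=100)
+        >>> results = benchmark.run(repeat_num=1)
+        >>> # results: {'avg_fps': ..., 'fps_list': [...]}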
+ """ + + def __init__(self, + cfg: Config, + distributed: bool, + dataset_type: str, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + + assert dataset_type in ['train', 'val', 'test'], \ + 'dataset_type only supports train,' \ + f' val and test, but got {dataset_type}' + assert get_world_size( + ) == 1, 'Dataloader benchmark does not allow distributed multi-GPU' + + self.cfg = copy.deepcopy(cfg) + self.distributed = distributed + + if psutil is None: + raise ImportError('psutil is not installed, please install it by: ' + 'pip install psutil') + self._process = psutil.Process() + + mp_cfg = self.cfg.get('env_cfg', {}).get('mp_cfg') + if mp_cfg is not None: + set_multi_processing(distributed=self.distributed, **mp_cfg) + else: + set_multi_processing(distributed=self.distributed) + + print_log('before build: ', self.logger) + print_process_memory(self._process, self.logger) + + if dataset_type == 'train': + self.data_loader = Runner.build_dataloader(cfg.train_dataloader) + elif dataset_type == 'test': + self.data_loader = Runner.build_dataloader(cfg.test_dataloader) + else: + self.data_loader = Runner.build_dataloader(cfg.val_dataloader) + + self.batch_size = self.data_loader.batch_size + self.num_workers = self.data_loader.num_workers + + print_log('after build: ', self.logger) + print_process_memory(self._process, self.logger) + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + # benchmark with 2000 image and take the average + start_time = time.perf_counter() + for i, data in enumerate(self.data_loader): + elapsed = time.perf_counter() - start_time + + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + if i >= self.num_warmup: + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + + print_log( + f'Done batch [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} batch/s, ' + f'times per batch: {1000 / fps:.1f} ms/batch, ' + f'batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + print_process_memory(self._process, self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - self.num_warmup) / pure_inf_time + break + + start_time = time.perf_counter() + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' + 'times per batch: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + f'ms/batch, batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} batch/s, ' + f'times per batch: {1000 / fps_list_[0]:.1f} ms/batch, ' + f'batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + + print_process_memory(self._process, self.logger) + + return outputs + + +class 
DatasetBenchmark(BaseBenchmark): + """The dataset benchmark class. It will be statistical inference FPS, FPS + pre transform and CPU memory information. + + Args: + cfg (mmengine.Config): config. + dataset_type (str): benchmark data type, only supports ``train``, + ``val`` and ``test``. + max_iter (int): maximum iterations of benchmark. Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + cfg: Config, + dataset_type: str, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + assert dataset_type in ['train', 'val', 'test'], \ + 'dataset_type only supports train,' \ + f' val and test, but got {dataset_type}' + assert get_world_size( + ) == 1, 'Dataset benchmark does not allow distributed multi-GPU' + self.cfg = copy.deepcopy(cfg) + + if dataset_type == 'train': + dataloader_cfg = copy.deepcopy(cfg.train_dataloader) + elif dataset_type == 'test': + dataloader_cfg = copy.deepcopy(cfg.test_dataloader) + else: + dataloader_cfg = copy.deepcopy(cfg.val_dataloader) + + dataset_cfg = dataloader_cfg.pop('dataset') + dataset = DATASETS.build(dataset_cfg) + if hasattr(dataset, 'full_init'): + dataset.full_init() + self.dataset = dataset + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + total_index = list(range(len(self.dataset))) + np.random.shuffle(total_index) + + start_time = time.perf_counter() + for i, idx in enumerate(total_index): + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + get_data_info_start_time = time.perf_counter() + data_info = self.dataset.get_data_info(idx) + get_data_info_elapsed = time.perf_counter( + ) - get_data_info_start_time + + if (i + 1) % self.log_interval == 0: + print_log(f'get_data_info - {get_data_info_elapsed * 1000} ms', + self.logger) + + for t in self.dataset.pipeline.transforms: + transform_start_time = time.perf_counter() + data_info = t(data_info) + transform_elapsed = time.perf_counter() - transform_start_time + + if (i + 1) % self.log_interval == 0: + print_log( + f'{t.__class__.__name__} - ' + f'{transform_elapsed * 1000} ms', self.logger) + + if data_info is None: + break + + elapsed = time.perf_counter() - start_time + + if i >= self.num_warmup: + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + + print_log( + f'Done img [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} img/s, ' + f'times per img: {1000 / fps:.1f} ms/img', self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - self.num_warmup) / pure_inf_time + break + + start_time = time.perf_counter() + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' 
+ 'times per img: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + 'ms/img', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} img/s, ' + f'times per img: {1000 / fps_list_[0]:.1f} ms/img', + self.logger) + + return outputs diff --git a/mmdetection/mmdet/utils/collect_env.py b/mmdetection/mmdet/utils/collect_env.py new file mode 100644 index 0000000..b0eed80 --- /dev/null +++ b/mmdetection/mmdet/utils/collect_env.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_base_env + +import mmdet + + +def collect_env(): + """Collect the information of the running environments.""" + env_info = collect_base_env() + env_info['MMDetection'] = mmdet.__version__ + '+' + get_git_hash()[:7] + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmdetection/mmdet/utils/compat_config.py b/mmdetection/mmdet/utils/compat_config.py new file mode 100644 index 0000000..133adb6 --- /dev/null +++ b/mmdetection/mmdet/utils/compat_config.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings + +from mmengine.config import ConfigDict + + +def compat_cfg(cfg): + """This function would modify some filed to keep the compatibility of + config. + + For example, it will move some args which will be deprecated to the correct + fields. + """ + cfg = copy.deepcopy(cfg) + cfg = compat_imgs_per_gpu(cfg) + cfg = compat_loader_args(cfg) + cfg = compat_runner_args(cfg) + return cfg + + +def compat_runner_args(cfg): + if 'runner' not in cfg: + cfg.runner = ConfigDict({ + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + }) + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + return cfg + + +def compat_imgs_per_gpu(cfg): + cfg = copy.deepcopy(cfg) + if 'imgs_per_gpu' in cfg.data: + warnings.warn('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + warnings.warn( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + warnings.warn('Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + return cfg + + +def compat_loader_args(cfg): + """Deprecated sample_per_gpu in cfg.data.""" + + cfg = copy.deepcopy(cfg) + if 'train_dataloader' not in cfg.data: + cfg.data['train_dataloader'] = ConfigDict() + if 'val_dataloader' not in cfg.data: + cfg.data['val_dataloader'] = ConfigDict() + if 'test_dataloader' not in cfg.data: + cfg.data['test_dataloader'] = ConfigDict() + + # special process for train_dataloader + if 'samples_per_gpu' in cfg.data: + + samples_per_gpu = cfg.data.pop('samples_per_gpu') + assert 'samples_per_gpu' not in \ + cfg.data.train_dataloader, ('`samples_per_gpu` are set ' + 'in `data` field and ` ' + 'data.train_dataloader` ' + 'at the same time. ' + 'Please only set it in ' + '`data.train_dataloader`. 
') + cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu + + if 'persistent_workers' in cfg.data: + + persistent_workers = cfg.data.pop('persistent_workers') + assert 'persistent_workers' not in \ + cfg.data.train_dataloader, ('`persistent_workers` are set ' + 'in `data` field and ` ' + 'data.train_dataloader` ' + 'at the same time. ' + 'Please only set it in ' + '`data.train_dataloader`. ') + cfg.data.train_dataloader['persistent_workers'] = persistent_workers + + if 'workers_per_gpu' in cfg.data: + + workers_per_gpu = cfg.data.pop('workers_per_gpu') + cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu + cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu + cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu + + # special process for val_dataloader + if 'samples_per_gpu' in cfg.data.val: + # keep default value of `sample_per_gpu` is 1 + assert 'samples_per_gpu' not in \ + cfg.data.val_dataloader, ('`samples_per_gpu` are set ' + 'in `data.val` field and ` ' + 'data.val_dataloader` at ' + 'the same time. ' + 'Please only set it in ' + '`data.val_dataloader`. ') + cfg.data.val_dataloader['samples_per_gpu'] = \ + cfg.data.val.pop('samples_per_gpu') + # special process for val_dataloader + + # in case the test dataset is concatenated + if isinstance(cfg.data.test, dict): + if 'samples_per_gpu' in cfg.data.test: + assert 'samples_per_gpu' not in \ + cfg.data.test_dataloader, ('`samples_per_gpu` are set ' + 'in `data.test` field and ` ' + 'data.test_dataloader` ' + 'at the same time. ' + 'Please only set it in ' + '`data.test_dataloader`. ') + + cfg.data.test_dataloader['samples_per_gpu'] = \ + cfg.data.test.pop('samples_per_gpu') + + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + if 'samples_per_gpu' in ds_cfg: + assert 'samples_per_gpu' not in \ + cfg.data.test_dataloader, ('`samples_per_gpu` are set ' + 'in `data.test` field and ` ' + 'data.test_dataloader` at' + ' the same time. ' + 'Please only set it in ' + '`data.test_dataloader`. ') + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu + + return cfg diff --git a/mmdetection/mmdet/utils/contextmanagers.py b/mmdetection/mmdet/utils/contextmanagers.py new file mode 100644 index 0000000..fa12bfc --- /dev/null +++ b/mmdetection/mmdet/utils/contextmanagers.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import asyncio +import contextlib +import logging +import os +import time +from typing import List + +import torch + +logger = logging.getLogger(__name__) + +DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False)) + + +@contextlib.asynccontextmanager +async def completed(trace_name='', + name='', + sleep_interval=0.05, + streams: List[torch.cuda.Stream] = None): + """Async context manager that waits for work to complete on given CUDA + streams.""" + if not torch.cuda.is_available(): + yield + return + + stream_before_context_switch = torch.cuda.current_stream() + if not streams: + streams = [stream_before_context_switch] + else: + streams = [s if s else stream_before_context_switch for s in streams] + + end_events = [ + torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams + ] + + if DEBUG_COMPLETED_TIME: + start = torch.cuda.Event(enable_timing=True) + stream_before_context_switch.record_event(start) + + cpu_start = time.monotonic() + logger.debug('%s %s starting, streams: %s', trace_name, name, streams) + grad_enabled_before = torch.is_grad_enabled() + try: + yield + finally: + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_end = time.monotonic() + for i, stream in enumerate(streams): + event = end_events[i] + stream.record_event(event) + + grad_enabled_after = torch.is_grad_enabled() + + # observed change of torch.is_grad_enabled() during concurrent run of + # async_test_bboxes code + assert (grad_enabled_before == grad_enabled_after + ), 'Unexpected is_grad_enabled() value change' + + are_done = [e.query() for e in end_events] + logger.debug('%s %s completed: %s streams: %s', trace_name, name, + are_done, streams) + with torch.cuda.stream(stream_before_context_switch): + while not all(are_done): + await asyncio.sleep(sleep_interval) + are_done = [e.query() for e in end_events] + logger.debug( + '%s %s completed: %s streams: %s', + trace_name, + name, + are_done, + streams, + ) + + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_time = (cpu_end - cpu_start) * 1000 + stream_times_ms = '' + for i, stream in enumerate(streams): + elapsed_time = start.elapsed_time(end_events[i]) + stream_times_ms += f' {stream} {elapsed_time:.2f} ms' + logger.info('%s %s %.2f ms %s', trace_name, name, cpu_time, + stream_times_ms) + + +@contextlib.asynccontextmanager +async def concurrent(streamqueue: asyncio.Queue, + trace_name='concurrent', + name='stream'): + """Run code concurrently in different streams. + + :param streamqueue: asyncio.Queue instance. + + Queue tasks define the pool of streams used for concurrent execution. 
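+
+    Example:
+        >>> # Illustrative sketch; ``streamqueue`` is assumed to be pre-filled
+        >>> # with ``torch.cuda.Stream`` objects by the caller.
+        >>> async def _forward(model, inputs, streamqueue):
+        ...     async with concurrent(streamqueue):
+        ...         return model(inputs)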
+ """ + if not torch.cuda.is_available(): + yield + return + + initial_stream = torch.cuda.current_stream() + + with torch.cuda.stream(initial_stream): + stream = await streamqueue.get() + assert isinstance(stream, torch.cuda.Stream) + + try: + with torch.cuda.stream(stream): + logger.debug('%s %s is starting, stream: %s', trace_name, name, + stream) + yield + current = torch.cuda.current_stream() + assert current == stream + logger.debug('%s %s has finished, stream: %s', trace_name, + name, stream) + finally: + streamqueue.task_done() + streamqueue.put_nowait(stream) diff --git a/mmdetection/mmdet/utils/dist_utils.py b/mmdetection/mmdet/utils/dist_utils.py new file mode 100644 index 0000000..2f2c861 --- /dev/null +++ b/mmdetection/mmdet/utils/dist_utils.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import pickle +import warnings +from collections import OrderedDict + +import numpy as np +import torch +import torch.distributed as dist +from mmengine.dist import get_dist_info +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Defaults to True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Defaults to -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +def reduce_mean(tensor): + """"Obtain the mean of tensor on different GPUs.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor + + +def obj2tensor(pyobj, device='cuda'): + """Serialize picklable python object to tensor.""" + storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj)) + return torch.ByteTensor(storage).to(device=device) + + +def tensor2obj(tensor): + """Deserialize tensor to picklable python object.""" + return pickle.loads(tensor.cpu().numpy().tobytes()) + + +@functools.lru_cache() +def _get_global_gloo_group(): + """Return a process group based on gloo backend, containing all the ranks + The result is cached.""" + if dist.get_backend() == 'nccl': + return dist.new_group(backend='gloo') + else: + return dist.group.WORLD + + +def all_reduce_dict(py_dict, op='sum', group=None, to_float=True): + """Apply all reduce function for python dict object. + + The code is modified from https://github.com/Megvii- + BaseDetection/YOLOX/blob/main/yolox/utils/allreduce_norm.py. 
+ + NOTE: make sure that py_dict in different ranks has the same keys and + the values should be in the same shape. Currently only supports + nccl backend. + + Args: + py_dict (dict): Dict to be applied all reduce op. + op (str): Operator, could be 'sum' or 'mean'. Default: 'sum' + group (:obj:`torch.distributed.group`, optional): Distributed group, + Default: None. + to_float (bool): Whether to convert all values of dict to float. + Default: True. + + Returns: + OrderedDict: reduced python dict object. + """ + warnings.warn( + 'group` is deprecated. Currently only supports NCCL backend.') + _, world_size = get_dist_info() + if world_size == 1: + return py_dict + + # all reduce logic across different devices. + py_key = list(py_dict.keys()) + if not isinstance(py_dict, OrderedDict): + py_key_tensor = obj2tensor(py_key) + dist.broadcast(py_key_tensor, src=0) + py_key = tensor2obj(py_key_tensor) + + tensor_shapes = [py_dict[k].shape for k in py_key] + tensor_numels = [py_dict[k].numel() for k in py_key] + + if to_float: + warnings.warn('Note: the "to_float" is True, you need to ' + 'ensure that the behavior is reasonable.') + flatten_tensor = torch.cat( + [py_dict[k].flatten().float() for k in py_key]) + else: + flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key]) + + dist.all_reduce(flatten_tensor, op=dist.ReduceOp.SUM) + if op == 'mean': + flatten_tensor /= world_size + + split_tensors = [ + x.reshape(shape) for x, shape in zip( + torch.split(flatten_tensor, tensor_numels), tensor_shapes) + ] + out_dict = {k: v for k, v in zip(py_key, split_tensors)} + if isinstance(py_dict, OrderedDict): + out_dict = OrderedDict(out_dict) + return out_dict + + +def sync_random_seed(seed=None, device='cuda'): + """Make sure different ranks share the same seed. + + All workers must call this function, otherwise it will deadlock. + This method is generally used in `DistributedSampler`, + because the seed should be identical across all processes + in the distributed group. + + In distributed sampling, different ranks should sample non-overlapped + data in the dataset. Therefore, this function is used to make sure that + each rank shuffles the data indices in the same order based + on the same seed. Then different ranks could use different indices + to select non-overlapped data from the same data list. + + Args: + seed (int, Optional): The seed. Default to None. + device (str): The device where the seed will be put on. + Default to 'cuda'. + + Returns: + int: Seed to be used. + """ + if seed is None: + seed = np.random.randint(2**31) + assert isinstance(seed, int) + + rank, world_size = get_dist_info() + + if world_size == 1: + return seed + + if rank == 0: + random_num = torch.tensor(seed, dtype=torch.int32, device=device) + else: + random_num = torch.tensor(0, dtype=torch.int32, device=device) + dist.broadcast(random_num, src=0) + return random_num.item() diff --git a/mmdetection/mmdet/utils/large_image.py b/mmdetection/mmdet/utils/large_image.py new file mode 100644 index 0000000..f1f07c2 --- /dev/null +++ b/mmdetection/mmdet/utils/large_image.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Tuple + +import torch +from mmcv.ops import batched_nms +from mmengine.structures import InstanceData + +from mmdet.structures import DetDataSample, SampleList + + +def shift_rbboxes(bboxes: torch.Tensor, offset: Sequence[int]): + """Shift rotated bboxes with offset. + + Args: + bboxes (Tensor): The rotated bboxes need to be translated. 
+ With shape (n, 5), which means (x, y, w, h, a). + offset (Sequence[int]): The translation offsets with shape of (2, ). + Returns: + Tensor: Shifted rotated bboxes. + """ + offset_tensor = bboxes.new_tensor(offset) + shifted_bboxes = bboxes.clone() + shifted_bboxes[:, 0:2] = shifted_bboxes[:, 0:2] + offset_tensor + return shifted_bboxes + + +def shift_predictions(det_data_samples: SampleList, + offsets: Sequence[Tuple[int, int]], + src_image_shape: Tuple[int, int]) -> SampleList: + """Shift predictions to the original image. + + Args: + det_data_samples (List[:obj:`DetDataSample`]): A list of patch results. + offsets (Sequence[Tuple[int, int]]): Positions of the left top points + of patches. + src_image_shape (Tuple[int, int]): A (height, width) tuple of the large + image's width and height. + Returns: + (List[:obj:`DetDataSample`]): shifted results. + """ + try: + from sahi.slicing import shift_bboxes, shift_masks + except ImportError: + raise ImportError('Please run "pip install -U sahi" ' + 'to install sahi first for large image inference.') + + assert len(det_data_samples) == len( + offsets), 'The `results` should has the ' 'same length with `offsets`.' + shifted_predictions = [] + for det_data_sample, offset in zip(det_data_samples, offsets): + pred_inst = det_data_sample.pred_instances.clone() + + # Check bbox type + if pred_inst.bboxes.size(-1) == 4: + # Horizontal bboxes + shifted_bboxes = shift_bboxes(pred_inst.bboxes, offset) + elif pred_inst.bboxes.size(-1) == 5: + # Rotated bboxes + shifted_bboxes = shift_rbboxes(pred_inst.bboxes, offset) + else: + raise NotImplementedError + + # shift bboxes and masks + pred_inst.bboxes = shifted_bboxes + if 'masks' in det_data_sample: + pred_inst.masks = shift_masks(pred_inst.masks, offset, + src_image_shape) + + shifted_predictions.append(pred_inst.clone()) + + shifted_predictions = InstanceData.cat(shifted_predictions) + + return shifted_predictions + + +def merge_results_by_nms(results: SampleList, offsets: Sequence[Tuple[int, + int]], + src_image_shape: Tuple[int, int], + nms_cfg: dict) -> DetDataSample: + """Merge patch results by nms. + + Args: + results (List[:obj:`DetDataSample`]): A list of patch results. + offsets (Sequence[Tuple[int, int]]): Positions of the left top points + of patches. + src_image_shape (Tuple[int, int]): A (height, width) tuple of the large + image's width and height. + nms_cfg (dict): it should specify nms type and other parameters + like `iou_threshold`. + Returns: + :obj:`DetDataSample`: merged results. + """ + shifted_instances = shift_predictions(results, offsets, src_image_shape) + + _, keeps = batched_nms( + boxes=shifted_instances.bboxes, + scores=shifted_instances.scores, + idxs=shifted_instances.labels, + nms_cfg=nms_cfg) + merged_instances = shifted_instances[keeps] + + merged_result = results[0].clone() + merged_result.pred_instances = merged_instances + return merged_result diff --git a/mmdetection/mmdet/utils/logger.py b/mmdetection/mmdet/utils/logger.py new file mode 100644 index 0000000..9fec08b --- /dev/null +++ b/mmdetection/mmdet/utils/logger.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import inspect + +from mmengine.logging import print_log + + +def get_caller_name(): + """Get name of caller method.""" + # this_func_frame = inspect.stack()[0][0] # i.e., get_caller_name + # callee_frame = inspect.stack()[1][0] # e.g., log_img_scale + caller_frame = inspect.stack()[2][0] # e.g., caller of log_img_scale + caller_method = caller_frame.f_code.co_name + try: + caller_class = caller_frame.f_locals['self'].__class__.__name__ + return f'{caller_class}.{caller_method}' + except KeyError: # caller is a function + return caller_method + + +def log_img_scale(img_scale, shape_order='hw', skip_square=False): + """Log image size. + + Args: + img_scale (tuple): Image size to be logged. + shape_order (str, optional): The order of image shape. + 'hw' for (height, width) and 'wh' for (width, height). + Defaults to 'hw'. + skip_square (bool, optional): Whether to skip logging for square + img_scale. Defaults to False. + + Returns: + bool: Whether to have done logging. + """ + if shape_order == 'hw': + height, width = img_scale + elif shape_order == 'wh': + width, height = img_scale + else: + raise ValueError(f'Invalid shape_order {shape_order}.') + + if skip_square and (height == width): + return False + + caller = get_caller_name() + print_log( + f'image shape: height={height}, width={width} in {caller}', + logger='current') + + return True diff --git a/mmdetection/mmdet/utils/memory.py b/mmdetection/mmdet/utils/memory.py new file mode 100644 index 0000000..b6f9cbc --- /dev/null +++ b/mmdetection/mmdet/utils/memory.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import abc +from contextlib import contextmanager +from functools import wraps + +import torch +from mmengine.logging import MMLogger + + +def cast_tensor_type(inputs, src_type=None, dst_type=None): + """Recursively convert Tensor in inputs from ``src_type`` to ``dst_type``. + + Args: + inputs: Inputs that to be casted. + src_type (torch.dtype | torch.device): Source type. + src_type (torch.dtype | torch.device): Destination type. + + Returns: + The same type with inputs, but all contained Tensors have been cast. + """ + assert dst_type is not None + if isinstance(inputs, torch.Tensor): + if isinstance(dst_type, torch.device): + # convert Tensor to dst_device + if hasattr(inputs, 'to') and \ + hasattr(inputs, 'device') and \ + (inputs.device == src_type or src_type is None): + return inputs.to(dst_type) + else: + return inputs + else: + # convert Tensor to dst_dtype + if hasattr(inputs, 'to') and \ + hasattr(inputs, 'dtype') and \ + (inputs.dtype == src_type or src_type is None): + return inputs.to(dst_type) + else: + return inputs + # we need to ensure that the type of inputs to be casted are the same + # as the argument `src_type`. + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ + k: cast_tensor_type(v, src_type=src_type, dst_type=dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( + cast_tensor_type(item, src_type=src_type, dst_type=dst_type) + for item in inputs) + # TODO: Currently not supported + # elif isinstance(inputs, InstanceData): + # for key, value in inputs.items(): + # inputs[key] = cast_tensor_type( + # value, src_type=src_type, dst_type=dst_type) + # return inputs + else: + return inputs + + +@contextmanager +def _ignore_torch_cuda_oom(): + """A context which ignores CUDA OOM exception from pytorch. 
+ + Code is modified from + # noqa: E501 + """ + try: + yield + except RuntimeError as e: + # NOTE: the string may change? + if 'CUDA out of memory. ' in str(e): + pass + else: + raise + + +class AvoidOOM: + """Try to convert inputs to FP16 and CPU if got a PyTorch's CUDA Out of + Memory error. It will do the following steps: + + 1. First retry after calling `torch.cuda.empty_cache()`. + 2. If that still fails, it will then retry by converting inputs + to FP16. + 3. If that still fails trying to convert inputs to CPUs. + In this case, it expects the function to dispatch to + CPU implementation. + + Args: + to_cpu (bool): Whether to convert outputs to CPU if get an OOM + error. This will slow down the code significantly. + Defaults to True. + test (bool): Skip `_ignore_torch_cuda_oom` operate that can use + lightweight data in unit test, only used in + test unit. Defaults to False. + + Examples: + >>> from mmdet.utils.memory import AvoidOOM + >>> AvoidCUDAOOM = AvoidOOM() + >>> output = AvoidOOM.retry_if_cuda_oom( + >>> some_torch_function)(input1, input2) + >>> # To use as a decorator + >>> # from mmdet.utils import AvoidCUDAOOM + >>> @AvoidCUDAOOM.retry_if_cuda_oom + >>> def function(*args, **kwargs): + >>> return None + ``` + + Note: + 1. The output may be on CPU even if inputs are on GPU. Processing + on CPU will slow down the code significantly. + 2. When converting inputs to CPU, it will only look at each argument + and check if it has `.device` and `.to` for conversion. Nested + structures of tensors are not supported. + 3. Since the function might be called more than once, it has to be + stateless. + """ + + def __init__(self, to_cpu=True, test=False): + self.to_cpu = to_cpu + self.test = test + + def retry_if_cuda_oom(self, func): + """Makes a function retry itself after encountering pytorch's CUDA OOM + error. + + The implementation logic is referred to + https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/memory.py + + Args: + func: a stateless callable that takes tensor-like objects + as arguments. + Returns: + func: a callable which retries `func` if OOM is encountered. + """ # noqa: W605 + + @wraps(func) + def wrapped(*args, **kwargs): + + # raw function + if not self.test: + with _ignore_torch_cuda_oom(): + return func(*args, **kwargs) + + # Clear cache and retry + torch.cuda.empty_cache() + with _ignore_torch_cuda_oom(): + return func(*args, **kwargs) + + # get the type and device of first tensor + dtype, device = None, None + values = args + tuple(kwargs.values()) + for value in values: + if isinstance(value, torch.Tensor): + dtype = value.dtype + device = value.device + break + if dtype is None or device is None: + raise ValueError('There is no tensor in the inputs, ' + 'cannot get dtype and device.') + + # Convert to FP16 + fp16_args = cast_tensor_type(args, dst_type=torch.half) + fp16_kwargs = cast_tensor_type(kwargs, dst_type=torch.half) + logger = MMLogger.get_current_instance() + logger.warning(f'Attempting to copy inputs of {str(func)} ' + 'to FP16 due to CUDA OOM') + + # get input tensor type, the output type will same as + # the first parameter type. + with _ignore_torch_cuda_oom(): + output = func(*fp16_args, **fp16_kwargs) + output = cast_tensor_type( + output, src_type=torch.half, dst_type=dtype) + if not self.test: + return output + logger.warning('Using FP16 still meet CUDA OOM') + + # Try on CPU. This will slow down the code significantly, + # therefore print a notice. 
+ if self.to_cpu: + logger.warning(f'Attempting to copy inputs of {str(func)} ' + 'to CPU due to CUDA OOM') + cpu_device = torch.empty(0).device + cpu_args = cast_tensor_type(args, dst_type=cpu_device) + cpu_kwargs = cast_tensor_type(kwargs, dst_type=cpu_device) + + # convert outputs to GPU + with _ignore_torch_cuda_oom(): + logger.warning(f'Convert outputs to GPU (device={device})') + output = func(*cpu_args, **cpu_kwargs) + output = cast_tensor_type( + output, src_type=cpu_device, dst_type=device) + return output + + warnings.warn('Cannot convert output to GPU due to CUDA OOM, ' + 'the output is now on CPU, which might cause ' + 'errors if the output need to interact with GPU ' + 'data in subsequent operations') + logger.warning('Cannot convert output to GPU due to ' + 'CUDA OOM, the output is on CPU now.') + + return func(*cpu_args, **cpu_kwargs) + else: + # may still get CUDA OOM error + return func(*args, **kwargs) + + return wrapped + + +# To use AvoidOOM as a decorator +AvoidCUDAOOM = AvoidOOM() diff --git a/mmdetection/mmdet/utils/misc.py b/mmdetection/mmdet/utils/misc.py new file mode 100644 index 0000000..8dfb394 --- /dev/null +++ b/mmdetection/mmdet/utils/misc.py @@ -0,0 +1,149 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import glob +import os +import os.path as osp +import urllib +import warnings +from typing import Union + +import torch +from mmengine.config import Config, ConfigDict +from mmengine.logging import print_log +from mmengine.utils import scandir + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def find_latest_checkpoint(path, suffix='pth'): + """Find the latest checkpoint from the working directory. + + Args: + path(str): The path to find checkpoints. + suffix(str): File extension. + Defaults to pth. + + Returns: + latest_path(str | None): File path of the latest checkpoint. + References: + .. [1] https://github.com/microsoft/SoftTeacher + /blob/main/ssod/utils/patch.py + """ + if not osp.exists(path): + warnings.warn('The path of checkpoints does not exist.') + return None + if osp.exists(osp.join(path, f'latest.{suffix}')): + return osp.join(path, f'latest.{suffix}') + + checkpoints = glob.glob(osp.join(path, f'*.{suffix}')) + if len(checkpoints) == 0: + warnings.warn('There are no checkpoints in the path.') + return None + latest = -1 + latest_path = None + for checkpoint in checkpoints: + count = int(osp.basename(checkpoint).split('_')[-1].split('.')[0]) + if count > latest: + latest = count + latest_path = checkpoint + return latest_path + + +def update_data_root(cfg, logger=None): + """Update data root according to env MMDET_DATASETS. + + If set env MMDET_DATASETS, update cfg.data_root according to + MMDET_DATASETS. Otherwise, using cfg.data_root as default. + + Args: + cfg (:obj:`Config`): The model config need to modify + logger (logging.Logger | str | None): the way to print msg + """ + assert isinstance(cfg, Config), \ + f'cfg got wrong type: {type(cfg)}, expected mmengine.Config' + + if 'MMDET_DATASETS' in os.environ: + dst_root = os.environ['MMDET_DATASETS'] + print_log(f'MMDET_DATASETS has been set to be {dst_root}.' 
+ f'Using {dst_root} as data root.') + else: + return + + assert isinstance(cfg, Config), \ + f'cfg got wrong type: {type(cfg)}, expected mmengine.Config' + + def update(cfg, src_str, dst_str): + for k, v in cfg.items(): + if isinstance(v, ConfigDict): + update(cfg[k], src_str, dst_str) + if isinstance(v, str) and src_str in v: + cfg[k] = v.replace(src_str, dst_str) + + update(cfg.data, cfg.data_root, dst_root) + cfg.data_root = dst_root + + +def get_test_pipeline_cfg(cfg: Union[str, ConfigDict]) -> ConfigDict: + """Get the test dataset pipeline from entire config. + + Args: + cfg (str or :obj:`ConfigDict`): the entire config. Can be a config + file or a ``ConfigDict``. + + Returns: + :obj:`ConfigDict`: the config of test dataset. + """ + if isinstance(cfg, str): + cfg = Config.fromfile(cfg) + + def _get_test_pipeline_cfg(dataset_cfg): + if 'pipeline' in dataset_cfg: + return dataset_cfg.pipeline + # handle dataset wrapper + elif 'dataset' in dataset_cfg: + return _get_test_pipeline_cfg(dataset_cfg.dataset) + # handle dataset wrappers like ConcatDataset + elif 'datasets' in dataset_cfg: + return _get_test_pipeline_cfg(dataset_cfg.datasets[0]) + + raise RuntimeError('Cannot find `pipeline` in `test_dataloader`') + + return _get_test_pipeline_cfg(cfg.test_dataloader.dataset) + + +def get_file_list(source_root: str) -> [list, dict]: + """Get file list. + + Args: + source_root (str): image or video source path + + Return: + source_file_path_list (list): A list for all source file. + source_type (dict): Source type: file or url or dir. + """ + is_dir = os.path.isdir(source_root) + is_url = source_root.startswith(('http:/', 'https:/')) + is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS + + source_file_path_list = [] + if is_dir: + # when input source is dir + for file in scandir(source_root, IMG_EXTENSIONS, recursive=True): + source_file_path_list.append(os.path.join(source_root, file)) + elif is_url: + # when input source is url + filename = os.path.basename( + urllib.parse.unquote(source_root).split('?')[0]) + file_save_path = os.path.join(os.getcwd(), filename) + print(f'Downloading source file to {file_save_path}') + torch.hub.download_url_to_file(source_root, file_save_path) + source_file_path_list = [file_save_path] + elif is_file: + # when input source is single image + source_file_path_list = [source_root] + else: + print('Cannot find image file.') + + source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file) + + return source_file_path_list, source_type diff --git a/mmdetection/mmdet/utils/mot_error_visualize.py b/mmdetection/mmdet/utils/mot_error_visualize.py new file mode 100644 index 0000000..01bf864 --- /dev/null +++ b/mmdetection/mmdet/utils/mot_error_visualize.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Union + +try: + import seaborn as sns +except ImportError: + sns = None +import cv2 +import matplotlib.pyplot as plt +import mmcv +import numpy as np +from matplotlib.patches import Rectangle +from mmengine.utils import mkdir_or_exist + + +def imshow_mot_errors(*args, backend: str = 'cv2', **kwargs): + """Show the wrong tracks on the input image. + + Args: + backend (str, optional): Backend of visualization. + Defaults to 'cv2'. 
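+        *args: Positional arguments forwarded to the chosen backend
+            (``_cv2_show_wrong_tracks`` or ``_plt_show_wrong_tracks``).
+        **kwargs: Keyword arguments forwarded to the chosen backend.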
+ """ + if backend == 'cv2': + return _cv2_show_wrong_tracks(*args, **kwargs) + elif backend == 'plt': + return _plt_show_wrong_tracks(*args, **kwargs) + else: + raise NotImplementedError() + + +def _cv2_show_wrong_tracks(img: Union[str, np.ndarray], + bboxes: np.ndarray, + ids: np.ndarray, + error_types: np.ndarray, + thickness: int = 2, + font_scale: float = 0.4, + text_width: int = 10, + text_height: int = 15, + show: bool = False, + wait_time: int = 100, + out_file: str = None) -> np.ndarray: + """Show the wrong tracks with opencv. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (ndarray): A ndarray of shape (k, 5). + ids (ndarray): A ndarray of shape (k, ). + error_types (ndarray): A ndarray of shape (k, ), where 0 denotes + false positives, 1 denotes false negative and 2 denotes ID switch. + thickness (int, optional): Thickness of lines. + Defaults to 2. + font_scale (float, optional): Font scale to draw id and score. + Defaults to 0.4. + text_width (int, optional): Width to draw id and score. + Defaults to 10. + text_height (int, optional): Height to draw id and score. + Defaults to 15. + show (bool, optional): Whether to show the image on the fly. + Defaults to False. + wait_time (int, optional): Value of waitKey param. + Defaults to 100. + out_file (str, optional): The filename to write the image. + Defaults to None. + + Returns: + ndarray: Visualized image. + """ + if sns is None: + raise ImportError('please run pip install seaborn') + assert bboxes.ndim == 2, \ + f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.' + assert ids.ndim == 1, \ + f' ids ndim should be 1, but its ndim is {ids.ndim}.' + assert error_types.ndim == 1, \ + f' error_types ndim should be 1, but its ndim is {error_types.ndim}.' + assert bboxes.shape[0] == ids.shape[0], \ + 'bboxes.shape[0] and ids.shape[0] should have the same length.' + assert bboxes.shape[1] == 5, \ + f' bboxes.shape[1] should be 5, but its {bboxes.shape[1]}.' 
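+
+    # error_type indexes the palette built below:
+    # 0 (false positive) -> red, 1 (false negative) -> yellow,
+    # 2 (ID switch) -> blue.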
+ + bbox_colors = sns.color_palette() + # red, yellow, blue + bbox_colors = [bbox_colors[3], bbox_colors[1], bbox_colors[0]] + bbox_colors = [[int(255 * _c) for _c in bbox_color][::-1] + for bbox_color in bbox_colors] + + if isinstance(img, str): + img = mmcv.imread(img) + else: + assert img.ndim == 3 + + img_shape = img.shape + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + + for bbox, error_type, id in zip(bboxes, error_types, ids): + x1, y1, x2, y2 = bbox[:4].astype(np.int32) + score = float(bbox[-1]) + + # bbox + bbox_color = bbox_colors[error_type] + cv2.rectangle(img, (x1, y1), (x2, y2), bbox_color, thickness=thickness) + + # FN does not have id and score + if error_type == 1: + continue + + # score + text = '{:.02f}'.format(score) + width = (len(text) - 1) * text_width + img[y1:y1 + text_height, x1:x1 + width, :] = bbox_color + cv2.putText( + img, + text, (x1, y1 + text_height - 2), + cv2.FONT_HERSHEY_COMPLEX, + font_scale, + color=(0, 0, 0)) + + # id + text = str(id) + width = len(text) * text_width + img[y1 + text_height:y1 + text_height * 2, + x1:x1 + width, :] = bbox_color + cv2.putText( + img, + str(id), (x1, y1 + text_height * 2 - 2), + cv2.FONT_HERSHEY_COMPLEX, + font_scale, + color=(0, 0, 0)) + + if show: + mmcv.imshow(img, wait_time=wait_time) + if out_file is not None: + mmcv.imwrite(img, out_file) + + return img + + +def _plt_show_wrong_tracks(img: Union[str, np.ndarray], + bboxes: np.ndarray, + ids: np.ndarray, + error_types: np.ndarray, + thickness: float = 0.1, + font_scale: float = 3.0, + text_width: int = 8, + text_height: int = 13, + show: bool = False, + wait_time: int = 100, + out_file: str = None) -> np.ndarray: + """Show the wrong tracks with matplotlib. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (ndarray): A ndarray of shape (k, 5). + ids (ndarray): A ndarray of shape (k, ). + error_types (ndarray): A ndarray of shape (k, ), where 0 denotes + false positives, 1 denotes false negative and 2 denotes ID switch. + thickness (float, optional): Thickness of lines. + Defaults to 0.1. + font_scale (float, optional): Font scale to draw id and score. + Defaults to 3.0. + text_width (int, optional): Width to draw id and score. + Defaults to 8. + text_height (int, optional): Height to draw id and score. + Defaults to 13. + show (bool, optional): Whether to show the image on the fly. + Defaults to False. + wait_time (int, optional): Value of waitKey param. + Defaults to 100. + out_file (str, optional): The filename to write the image. + Defaults to None. + + Returns: + ndarray: Original image. + """ + assert bboxes.ndim == 2, \ + f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.' + assert ids.ndim == 1, \ + f' ids ndim should be 1, but its ndim is {ids.ndim}.' + assert error_types.ndim == 1, \ + f' error_types ndim should be 1, but its ndim is {error_types.ndim}.' + assert bboxes.shape[0] == ids.shape[0], \ + 'bboxes.shape[0] and ids.shape[0] should have the same length.' + assert bboxes.shape[1] == 5, \ + f' bboxes.shape[1] should be 5, but its {bboxes.shape[1]}.' 
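+
+    # Same color convention as the cv2 backend: error_type 0 (FP) red,
+    # 1 (FN) yellow, 2 (ID switch) blue. seaborn is required here as well.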
+ + bbox_colors = sns.color_palette() + # red, yellow, blue + bbox_colors = [bbox_colors[3], bbox_colors[1], bbox_colors[0]] + + if isinstance(img, str): + img = plt.imread(img) + else: + assert img.ndim == 3 + img = mmcv.bgr2rgb(img) + + img_shape = img.shape + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + + plt.imshow(img) + plt.gca().set_axis_off() + plt.autoscale(False) + plt.subplots_adjust( + top=1, bottom=0, right=1, left=0, hspace=None, wspace=None) + plt.margins(0, 0) + plt.gca().xaxis.set_major_locator(plt.NullLocator()) + plt.gca().yaxis.set_major_locator(plt.NullLocator()) + plt.rcParams['figure.figsize'] = img_shape[1], img_shape[0] + + for bbox, error_type, id in zip(bboxes, error_types, ids): + x1, y1, x2, y2, score = bbox + w, h = int(x2 - x1), int(y2 - y1) + left_top = (int(x1), int(y1)) + + # bbox + plt.gca().add_patch( + Rectangle( + left_top, + w, + h, + thickness, + edgecolor=bbox_colors[error_type], + facecolor='none')) + + # FN does not have id and score + if error_type == 1: + continue + + # score + text = '{:.02f}'.format(score) + width = len(text) * text_width + plt.gca().add_patch( + Rectangle((left_top[0], left_top[1]), + width, + text_height, + thickness, + edgecolor=bbox_colors[error_type], + facecolor=bbox_colors[error_type])) + + plt.text( + left_top[0], + left_top[1] + text_height + 2, + text, + fontsize=font_scale) + + # id + text = str(id) + width = len(text) * text_width + plt.gca().add_patch( + Rectangle((left_top[0], left_top[1] + text_height + 1), + width, + text_height, + thickness, + edgecolor=bbox_colors[error_type], + facecolor=bbox_colors[error_type])) + plt.text( + left_top[0], + left_top[1] + 2 * (text_height + 1), + text, + fontsize=font_scale) + + if out_file is not None: + mkdir_or_exist(osp.abspath(osp.dirname(out_file))) + plt.savefig(out_file, dpi=300, bbox_inches='tight', pad_inches=0.0) + + if show: + plt.draw() + plt.pause(wait_time / 1000.) + + plt.clf() + return img diff --git a/mmdetection/mmdet/utils/profiling.py b/mmdetection/mmdet/utils/profiling.py new file mode 100644 index 0000000..2f53f45 --- /dev/null +++ b/mmdetection/mmdet/utils/profiling.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import contextlib +import sys +import time + +import torch + +if sys.version_info >= (3, 7): + + @contextlib.contextmanager + def profile_time(trace_name, + name, + enabled=True, + stream=None, + end_stream=None): + """Print time spent by CPU and GPU. + + Useful as a temporary context manager to find sweet spots of code + suitable for async implementation. 
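+
+        Example (illustrative sketch; ``model`` and ``data`` are
+        placeholders for any CUDA workload)::
+
+            >>> with profile_time('trace', 'forward'):
+            ...     output = model(data)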
+ """ + if (not enabled) or not torch.cuda.is_available(): + yield + return + stream = stream if stream else torch.cuda.current_stream() + end_stream = end_stream if end_stream else stream + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + stream.record_event(start) + try: + cpu_start = time.monotonic() + yield + finally: + cpu_end = time.monotonic() + end_stream.record_event(end) + end.synchronize() + cpu_time = (cpu_end - cpu_start) * 1000 + gpu_time = start.elapsed_time(end) + msg = f'{trace_name} {name} cpu_time {cpu_time:.2f} ms ' + msg += f'gpu_time {gpu_time:.2f} ms stream {stream}' + print(msg, end_stream) diff --git a/mmdetection/mmdet/utils/replace_cfg_vals.py b/mmdetection/mmdet/utils/replace_cfg_vals.py new file mode 100644 index 0000000..a3331a3 --- /dev/null +++ b/mmdetection/mmdet/utils/replace_cfg_vals.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import re + +from mmengine.config import Config + + +def replace_cfg_vals(ori_cfg): + """Replace the string "${key}" with the corresponding value. + + Replace the "${key}" with the value of ori_cfg.key in the config. And + support replacing the chained ${key}. Such as, replace "${key0.key1}" + with the value of cfg.key0.key1. Code is modified from `vars.py + < https://github.com/microsoft/SoftTeacher/blob/main/ssod/utils/vars.py>`_ # noqa: E501 + + Args: + ori_cfg (mmengine.config.Config): + The origin config with "${key}" generated from a file. + + Returns: + updated_cfg [mmengine.config.Config]: + The config with "${key}" replaced by the corresponding value. + """ + + def get_value(cfg, key): + for k in key.split('.'): + cfg = cfg[k] + return cfg + + def replace_value(cfg): + if isinstance(cfg, dict): + return {key: replace_value(value) for key, value in cfg.items()} + elif isinstance(cfg, list): + return [replace_value(item) for item in cfg] + elif isinstance(cfg, tuple): + return tuple([replace_value(item) for item in cfg]) + elif isinstance(cfg, str): + # the format of string cfg may be: + # 1) "${key}", which will be replaced with cfg.key directly + # 2) "xxx${key}xxx" or "xxx${key1}xxx${key2}xxx", + # which will be replaced with the string of the cfg.key + keys = pattern_key.findall(cfg) + values = [get_value(ori_cfg, key[2:-1]) for key in keys] + if len(keys) == 1 and keys[0] == cfg: + # the format of string cfg is "${key}" + cfg = values[0] + else: + for key, value in zip(keys, values): + # the format of string cfg is + # "xxx${key}xxx" or "xxx${key1}xxx${key2}xxx" + assert not isinstance(value, (dict, list, tuple)), \ + f'for the format of string cfg is ' \ + f"'xxxxx${key}xxxxx' or 'xxx${key}xxx${key}xxx', " \ + f"the type of the value of '${key}' " \ + f'can not be dict, list, or tuple' \ + f'but you input {type(value)} in {cfg}' + cfg = cfg.replace(key, str(value)) + return cfg + else: + return cfg + + # the pattern of string "${key}" + pattern_key = re.compile(r'\$\{[a-zA-Z\d_.]*\}') + # the type of ori_cfg._cfg_dict is mmengine.config.ConfigDict + updated_cfg = Config( + replace_value(ori_cfg._cfg_dict), filename=ori_cfg.filename) + # replace the model with model_wrapper + if updated_cfg.get('model_wrapper', None) is not None: + updated_cfg.model = updated_cfg.model_wrapper + updated_cfg.pop('model_wrapper') + return updated_cfg diff --git a/mmdetection/mmdet/utils/setup_env.py b/mmdetection/mmdet/utils/setup_env.py new file mode 100644 index 0000000..a7b3784 --- /dev/null +++ b/mmdetection/mmdet/utils/setup_env.py @@ -0,0 +1,118 @@ +# Copyright (c) 
OpenMMLab. All rights reserved. +import datetime +import logging +import os +import platform +import warnings + +import cv2 +import torch.multiprocessing as mp +from mmengine import DefaultScope +from mmengine.logging import print_log +from mmengine.utils import digit_version + + +def setup_cache_size_limit_of_dynamo(): + """Setup cache size limit of dynamo. + + Note: Due to the dynamic shape of the loss calculation and + post-processing parts in the object detection algorithm, these + functions must be compiled every time they are run. + Setting a large value for torch._dynamo.config.cache_size_limit + may result in repeated compilation, which can slow down training + and testing speed. Therefore, we need to set the default value of + cache_size_limit smaller. An empirical value is 4. + """ + + import torch + if digit_version(torch.__version__) >= digit_version('2.0.0'): + if 'DYNAMO_CACHE_SIZE_LIMIT' in os.environ: + import torch._dynamo + cache_size_limit = int(os.environ['DYNAMO_CACHE_SIZE_LIMIT']) + torch._dynamo.config.cache_size_limit = cache_size_limit + print_log( + f'torch._dynamo.config.cache_size_limit is force ' + f'set to {cache_size_limit}.', + logger='current', + level=logging.WARNING) + + +def setup_multi_processes(cfg): + """Setup multi-processing environment variables.""" + # set multi-process start method as `fork` to speed up the training + if platform.system() != 'Windows': + mp_start_method = cfg.get('mp_start_method', 'fork') + current_method = mp.get_start_method(allow_none=True) + if current_method is not None and current_method != mp_start_method: + warnings.warn( + f'Multi-processing start method `{mp_start_method}` is ' + f'different from the previous setting `{current_method}`.' + f'It will be force set to `{mp_start_method}`. You can change ' + f'this behavior by changing `mp_start_method` in your config.') + mp.set_start_method(mp_start_method, force=True) + + # disable opencv multithreading to avoid system being overloaded + opencv_num_threads = cfg.get('opencv_num_threads', 0) + cv2.setNumThreads(opencv_num_threads) + + # setup OMP threads + # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa + workers_per_gpu = cfg.data.get('workers_per_gpu', 1) + if 'train_dataloader' in cfg.data: + workers_per_gpu = \ + max(cfg.data.train_dataloader.get('workers_per_gpu', 1), + workers_per_gpu) + + if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1: + omp_num_threads = 1 + warnings.warn( + f'Setting OMP_NUM_THREADS environment variable for each process ' + f'to be {omp_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + + # setup MKL threads + if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1: + mkl_num_threads = 1 + warnings.warn( + f'Setting MKL_NUM_THREADS environment variable for each process ' + f'to be {mkl_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) + + +def register_all_modules(init_default_scope: bool = True) -> None: + """Register all modules in mmdet into the registries. + + Args: + init_default_scope (bool): Whether initialize the mmdet default scope. 
+ When `init_default_scope=True`, the global default scope will be + set to `mmdet`, and all registries will build modules from mmdet's + registry node. To understand more about the registry, please refer + to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md + Defaults to True. + """ # noqa + import mmdet.datasets # noqa: F401,F403 + import mmdet.engine # noqa: F401,F403 + import mmdet.evaluation # noqa: F401,F403 + import mmdet.models # noqa: F401,F403 + import mmdet.visualization # noqa: F401,F403 + + if init_default_scope: + never_created = DefaultScope.get_current_instance() is None \ + or not DefaultScope.check_instance_created('mmdet') + if never_created: + DefaultScope.get_instance('mmdet', scope_name='mmdet') + return + current_scope = DefaultScope.get_current_instance() + if current_scope.scope_name != 'mmdet': + warnings.warn('The current default scope ' + f'"{current_scope.scope_name}" is not "mmdet", ' + '`register_all_modules` will force the current' + 'default scope to be "mmdet". If this is not ' + 'expected, please set `init_default_scope=False`.') + # avoid name conflict + new_instance_name = f'mmdet-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name='mmdet') diff --git a/mmdetection/mmdet/utils/split_batch.py b/mmdetection/mmdet/utils/split_batch.py new file mode 100644 index 0000000..0276fb3 --- /dev/null +++ b/mmdetection/mmdet/utils/split_batch.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def split_batch(img, img_metas, kwargs): + """Split data_batch by tags. + + Code is modified from + # noqa: E501 + + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys, see + :class:`mmdet.datasets.pipelines.Collect`. + kwargs (dict): Specific to concrete implementation. + + Returns: + data_groups (dict): a dict that data_batch splited by tags, + such as 'sup', 'unsup_teacher', and 'unsup_student'. + """ + + # only stack img in the batch + def fuse_list(obj_list, obj): + return torch.stack(obj_list) if isinstance(obj, + torch.Tensor) else obj_list + + # select data with tag from data_batch + def select_group(data_batch, current_tag): + group_flag = [tag == current_tag for tag in data_batch['tag']] + return { + k: fuse_list([vv for vv, gf in zip(v, group_flag) if gf], v) + for k, v in data_batch.items() + } + + kwargs.update({'img': img, 'img_metas': img_metas}) + kwargs.update({'tag': [meta['tag'] for meta in img_metas]}) + tags = list(set(kwargs['tag'])) + data_groups = {tag: select_group(kwargs, tag) for tag in tags} + for tag, group in data_groups.items(): + group.pop('tag') + return data_groups diff --git a/mmdetection/mmdet/utils/typing_utils.py b/mmdetection/mmdet/utils/typing_utils.py new file mode 100644 index 0000000..6caf6de --- /dev/null +++ b/mmdetection/mmdet/utils/typing_utils.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+"""Collecting some commonly used type hint in mmdetection.""" +from typing import List, Optional, Sequence, Tuple, Union + +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData, PixelData + +# TODO: Need to avoid circular import with assigner and sampler +# Type hint of config data +ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] +# Type hint of one or more config data +MultiConfig = Union[ConfigType, List[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + +InstanceList = List[InstanceData] +OptInstanceList = Optional[InstanceList] + +PixelList = List[PixelData] +OptPixelList = Optional[PixelList] + +RangeType = Sequence[Tuple[int, int]] diff --git a/mmdetection/mmdet/utils/util_mixins.py b/mmdetection/mmdet/utils/util_mixins.py new file mode 100644 index 0000000..b83b661 --- /dev/null +++ b/mmdetection/mmdet/utils/util_mixins.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This module defines the :class:`NiceRepr` mixin class, which defines a +``__repr__`` and ``__str__`` method that only depend on a custom ``__nice__`` +method, which you must define. This means you only have to overload one +function instead of two. Furthermore, if the object defines a ``__len__`` +method, then the ``__nice__`` method defaults to something sensible, otherwise +it is treated as abstract and raises ``NotImplementedError``. + +To use simply have your object inherit from :class:`NiceRepr` +(multi-inheritance should be ok). + +This code was copied from the ubelt library: https://github.com/Erotemic/ubelt + +Example: + >>> # Objects that define __nice__ have a default __str__ and __repr__ + >>> class Student(NiceRepr): + ... def __init__(self, name): + ... self.name = name + ... def __nice__(self): + ... return self.name + >>> s1 = Student('Alice') + >>> s2 = Student('Bob') + >>> print(f's1 = {s1}') + >>> print(f's2 = {s2}') + s1 = + s2 = + +Example: + >>> # Objects that define __len__ have a default __nice__ + >>> class Group(NiceRepr): + ... def __init__(self, data): + ... self.data = data + ... def __len__(self): + ... return len(self.data) + >>> g = Group([1, 2, 3]) + >>> print(f'g = {g}') + g = +""" +import warnings + + +class NiceRepr: + """Inherit from this class and define ``__nice__`` to "nicely" print your + objects. + + Defines ``__str__`` and ``__repr__`` in terms of ``__nice__`` function + Classes that inherit from :class:`NiceRepr` should redefine ``__nice__``. + If the inheriting class has a ``__len__``, method then the default + ``__nice__`` method will return its length. + + Example: + >>> class Foo(NiceRepr): + ... def __nice__(self): + ... return 'info' + >>> foo = Foo() + >>> assert str(foo) == '' + >>> assert repr(foo).startswith('>> class Bar(NiceRepr): + ... pass + >>> bar = Bar() + >>> import pytest + >>> with pytest.warns(None) as record: + >>> assert 'object at' in str(bar) + >>> assert 'object at' in repr(bar) + + Example: + >>> class Baz(NiceRepr): + ... def __len__(self): + ... 
return 5 + >>> baz = Baz() + >>> assert str(baz) == '' + """ + + def __nice__(self): + """str: a "nice" summary string describing this module""" + if hasattr(self, '__len__'): + # It is a common pattern for objects to use __len__ in __nice__ + # As a convenience we define a default __nice__ for these objects + return str(len(self)) + else: + # In all other cases force the subclass to overload __nice__ + raise NotImplementedError( + f'Define the __nice__ method for {self.__class__!r}') + + def __repr__(self): + """str: the string of the module""" + try: + nice = self.__nice__() + classname = self.__class__.__name__ + return f'<{classname}({nice}) at {hex(id(self))}>' + except NotImplementedError as ex: + warnings.warn(str(ex), category=RuntimeWarning) + return object.__repr__(self) + + def __str__(self): + """str: the string of the module""" + try: + classname = self.__class__.__name__ + nice = self.__nice__() + return f'<{classname}({nice})>' + except NotImplementedError as ex: + warnings.warn(str(ex), category=RuntimeWarning) + return object.__repr__(self) diff --git a/mmdetection/mmdet/utils/util_random.py b/mmdetection/mmdet/utils/util_random.py new file mode 100644 index 0000000..dc1ecb6 --- /dev/null +++ b/mmdetection/mmdet/utils/util_random.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Helpers for random number generators.""" +import numpy as np + + +def ensure_rng(rng=None): + """Coerces input into a random number generator. + + If the input is None, then a global random state is returned. + + If the input is a numeric value, then that is used as a seed to construct a + random state. Otherwise the input is returned as-is. + + Adapted from [1]_. + + Args: + rng (int | numpy.random.RandomState | None): + if None, then defaults to the global rng. Otherwise this can be an + integer or a RandomState class + Returns: + (numpy.random.RandomState) : rng - + a numpy random number generator + + References: + .. [1] https://gitlab.kitware.com/computer-vision/kwarray/blob/master/kwarray/util_random.py#L270 # noqa: E501 + """ + + if rng is None: + rng = np.random.mtrand._rand + elif isinstance(rng, int): + rng = np.random.RandomState(rng) + else: + rng = rng + return rng diff --git a/mmdetection/mmdet/version.py b/mmdetection/mmdet/version.py new file mode 100644 index 0000000..38ce834 --- /dev/null +++ b/mmdetection/mmdet/version.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +__version__ = '3.2.0' +short_version = __version__ + + +def parse_version_info(version_str): + """Parse a version string into a tuple. + + Args: + version_str (str): The version string. + Returns: + tuple[int | str]: The version info, e.g., "1.3.0" is parsed into + (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1'). + """ + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/mmdetection/mmdet/visualization/__init__.py b/mmdetection/mmdet/visualization/__init__.py new file mode 100644 index 0000000..a7edaed --- /dev/null +++ b/mmdetection/mmdet/visualization/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .local_visualizer import DetLocalVisualizer, TrackLocalVisualizer +from .palette import get_palette, jitter_color, palette_val + +__all__ = [ + 'palette_val', 'get_palette', 'DetLocalVisualizer', 'jitter_color', + 'TrackLocalVisualizer' +] diff --git a/mmdetection/mmdet/visualization/local_visualizer.py b/mmdetection/mmdet/visualization/local_visualizer.py new file mode 100644 index 0000000..cc6521c --- /dev/null +++ b/mmdetection/mmdet/visualization/local_visualizer.py @@ -0,0 +1,699 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import cv2 +import mmcv +import numpy as np + +try: + import seaborn as sns +except ImportError: + sns = None +import torch +from mmengine.dist import master_only +from mmengine.structures import InstanceData, PixelData +from mmengine.visualization import Visualizer + +from ..evaluation import INSTANCE_OFFSET +from ..registry import VISUALIZERS +from ..structures import DetDataSample +from ..structures.mask import BitmapMasks, PolygonMasks, bitmap_to_polygon +from .palette import _get_adaptive_scales, get_palette, jitter_color + + +@VISUALIZERS.register_module() +class DetLocalVisualizer(Visualizer): + """MMDetection Local Visualizer. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): the origin image to draw. The format + should be RGB. Defaults to None. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + bbox_color (str, tuple(int), optional): Color of bbox lines. + The tuple of color should be in BGR order. Defaults to None. + text_color (str, tuple(int), optional): Color of texts. + The tuple of color should be in BGR order. + Defaults to (200, 200, 200). + mask_color (str, tuple(int), optional): Color of masks. + The tuple of color should be in BGR order. + Defaults to None. + line_width (int, float): The linewidth of lines. + Defaults to 3. + alpha (int, float): The transparency of bboxes or mask. + Defaults to 0.8. + + Examples: + >>> import numpy as np + >>> import torch + >>> from mmengine.structures import InstanceData + >>> from mmdet.structures import DetDataSample + >>> from mmdet.visualization import DetLocalVisualizer + + >>> det_local_visualizer = DetLocalVisualizer() + >>> image = np.random.randint(0, 256, + ... size=(10, 12, 3)).astype('uint8') + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[1, 2, 2, 5]]) + >>> gt_instances.labels = torch.randint(0, 2, (1,)) + >>> gt_det_data_sample = DetDataSample() + >>> gt_det_data_sample.gt_instances = gt_instances + >>> det_local_visualizer.add_datasample('image', image, + ... gt_det_data_sample) + >>> det_local_visualizer.add_datasample( + ... 'image', image, gt_det_data_sample, + ... out_file='out_file.jpg') + >>> det_local_visualizer.add_datasample( + ... 'image', image, gt_det_data_sample, + ... show=True) + >>> pred_instances = InstanceData() + >>> pred_instances.bboxes = torch.Tensor([[2, 4, 4, 8]]) + >>> pred_instances.labels = torch.randint(0, 2, (1,)) + >>> pred_det_data_sample = DetDataSample() + >>> pred_det_data_sample.pred_instances = pred_instances + >>> det_local_visualizer.add_datasample('image', image, + ... gt_det_data_sample, + ... 
pred_det_data_sample) + """ + + def __init__(self, + name: str = 'visualizer', + image: Optional[np.ndarray] = None, + vis_backends: Optional[Dict] = None, + save_dir: Optional[str] = None, + bbox_color: Optional[Union[str, Tuple[int]]] = None, + text_color: Optional[Union[str, + Tuple[int]]] = (200, 200, 200), + mask_color: Optional[Union[str, Tuple[int]]] = None, + line_width: Union[int, float] = 3, + alpha: float = 0.8) -> None: + super().__init__( + name=name, + image=image, + vis_backends=vis_backends, + save_dir=save_dir) + self.bbox_color = bbox_color + self.text_color = text_color + self.mask_color = mask_color + self.line_width = line_width + self.alpha = alpha + # Set default value. When calling + # `DetLocalVisualizer().dataset_meta=xxx`, + # it will override the default value. + self.dataset_meta = {} + + def _draw_instances(self, image: np.ndarray, instances: ['InstanceData'], + classes: Optional[List[str]], + palette: Optional[List[tuple]]) -> np.ndarray: + """Draw instances of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + instances (:obj:`InstanceData`): Data structure for + instance-level annotations or predictions. + classes (List[str], optional): Category information. + palette (List[tuple], optional): Palette information + corresponding to the category. + + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + self.set_image(image) + + if 'bboxes' in instances and instances.bboxes.sum() > 0: + bboxes = instances.bboxes + labels = instances.labels + + max_label = int(max(labels) if len(labels) > 0 else 0) + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in labels] + + bbox_color = palette if self.bbox_color is None \ + else self.bbox_color + bbox_palette = get_palette(bbox_color, max_label + 1) + colors = [bbox_palette[label] for label in labels] + self.draw_bboxes( + bboxes, + edge_colors=colors, + alpha=self.alpha, + line_widths=self.line_width) + + positions = bboxes[:, :2] + self.line_width + areas = (bboxes[:, 3] - bboxes[:, 1]) * ( + bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas) + + for i, (pos, label) in enumerate(zip(positions, labels)): + if 'label_names' in instances: + label_text = instances.label_names[i] + else: + label_text = classes[ + label] if classes is not None else f'class {label}' + if 'scores' in instances: + score = round(float(instances.scores[i]) * 100, 1) + label_text += f': {score}' + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + if 'masks' in instances: + labels = instances.labels + masks = instances.masks + if isinstance(masks, torch.Tensor): + masks = masks.numpy() + elif isinstance(masks, (PolygonMasks, BitmapMasks)): + masks = masks.to_ndarray() + + masks = masks.astype(bool) + + max_label = int(max(labels) if len(labels) > 0 else 0) + mask_color = palette if self.mask_color is None \ + else self.mask_color + mask_palette = get_palette(mask_color, max_label + 1) + colors = [jitter_color(mask_palette[label]) for label in labels] + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in labels] + + polygons = [] + for i, mask in enumerate(masks): + contours, _ = bitmap_to_polygon(mask) + polygons.extend(contours) + self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha) + self.draw_binary_masks(masks, colors=colors, 
alphas=self.alpha) + + if len(labels) > 0 and \ + ('bboxes' not in instances or + instances.bboxes.sum() == 0): + # instances.bboxes.sum()==0 represent dummy bboxes. + # A typical example of SOLO does not exist bbox branch. + areas = [] + positions = [] + for mask in masks: + _, _, stats, centroids = cv2.connectedComponentsWithStats( + mask.astype(np.uint8), connectivity=8) + if stats.shape[0] > 1: + largest_id = np.argmax(stats[1:, -1]) + 1 + positions.append(centroids[largest_id]) + areas.append(stats[largest_id, -1]) + areas = np.stack(areas, axis=0) + scales = _get_adaptive_scales(areas) + + for i, (pos, label) in enumerate(zip(positions, labels)): + if 'label_names' in instances: + label_text = instances.label_names[i] + else: + label_text = classes[ + label] if classes is not None else f'class {label}' + if 'scores' in instances: + score = round(float(instances.scores[i]) * 100, 1) + label_text += f': {score}' + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + horizontal_alignments='center', + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + return self.get_image() + + def _draw_panoptic_seg(self, image: np.ndarray, + panoptic_seg: ['PixelData'], + classes: Optional[List[str]], + palette: Optional[List]) -> np.ndarray: + """Draw panoptic seg of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + panoptic_seg (:obj:`PixelData`): Data structure for + pixel-level annotations or predictions. + classes (List[str], optional): Category information. + + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + # TODO: Is there a way to bypass? + num_classes = len(classes) + + panoptic_seg_data = panoptic_seg.sem_seg[0] + + ids = np.unique(panoptic_seg_data)[::-1] + + if 'label_names' in panoptic_seg: + # open set panoptic segmentation + classes = panoptic_seg.metainfo['label_names'] + ignore_index = panoptic_seg.metainfo.get('ignore_index', + len(classes)) + ids = ids[ids != ignore_index] + else: + # for VOID label + ids = ids[ids != num_classes] + + labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64) + segms = (panoptic_seg_data[None] == ids[:, None, None]) + + max_label = int(max(labels) if len(labels) > 0 else 0) + + mask_color = palette if self.mask_color is None \ + else self.mask_color + mask_palette = get_palette(mask_color, max_label + 1) + colors = [mask_palette[label] for label in labels] + + self.set_image(image) + + # draw segm + polygons = [] + for i, mask in enumerate(segms): + contours, _ = bitmap_to_polygon(mask) + polygons.extend(contours) + self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha) + self.draw_binary_masks(segms, colors=colors, alphas=self.alpha) + + # draw label + areas = [] + positions = [] + for mask in segms: + _, _, stats, centroids = cv2.connectedComponentsWithStats( + mask.astype(np.uint8), connectivity=8) + max_id = np.argmax(stats[1:, -1]) + 1 + positions.append(centroids[max_id]) + areas.append(stats[max_id, -1]) + areas = np.stack(areas, axis=0) + scales = _get_adaptive_scales(areas) + + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in labels] + + for i, (pos, label) in enumerate(zip(positions, labels)): + label_text = classes[label] + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }], + 
horizontal_alignments='center') + return self.get_image() + + def _draw_sem_seg(self, image: np.ndarray, sem_seg: PixelData, + classes: Optional[List], + palette: Optional[List]) -> np.ndarray: + """Draw semantic seg of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + sem_seg (:obj:`PixelData`): Data structure for pixel-level + annotations or predictions. + classes (list, optional): Input classes for result rendering, as + the prediction of segmentation model is a segment map with + label indices, `classes` is a list which includes items + responding to the label indices. If classes is not defined, + visualizer will take `cityscapes` classes by default. + Defaults to None. + palette (list, optional): Input palette for result rendering, which + is a list of color palette responding to the classes. + Defaults to None. + + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + sem_seg_data = sem_seg.sem_seg + if isinstance(sem_seg_data, torch.Tensor): + sem_seg_data = sem_seg_data.numpy() + + # 0 ~ num_class, the value 0 means background + ids = np.unique(sem_seg_data) + ignore_index = sem_seg.metainfo.get('ignore_index', 255) + ids = ids[ids != ignore_index] + + if 'label_names' in sem_seg: + # open set semseg + label_names = sem_seg.metainfo['label_names'] + else: + label_names = classes + + labels = np.array(ids, dtype=np.int64) + colors = [palette[label] for label in labels] + + self.set_image(image) + + # draw semantic masks + for i, (label, color) in enumerate(zip(labels, colors)): + masks = sem_seg_data == label + self.draw_binary_masks(masks, colors=[color], alphas=self.alpha) + label_text = label_names[label] + _, _, stats, centroids = cv2.connectedComponentsWithStats( + masks[0].astype(np.uint8), connectivity=8) + if stats.shape[0] > 1: + largest_id = np.argmax(stats[1:, -1]) + 1 + centroids = centroids[largest_id] + + areas = stats[largest_id, -1] + scales = _get_adaptive_scales(areas) + + self.draw_texts( + label_text, + centroids, + colors=(255, 255, 255), + font_sizes=int(13 * scales), + horizontal_alignments='center', + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + return self.get_image() + + @master_only + def add_datasample( + self, + name: str, + image: np.ndarray, + data_sample: Optional['DetDataSample'] = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: float = 0, + # TODO: Supported in mmengine's Viusalizer. + out_file: Optional[str] = None, + pred_score_thr: float = 0.3, + step: int = 0) -> None: + """Draw datasample and save to all backends. + + - If GT and prediction are plotted at the same time, they are + displayed in a stitched image where the left image is the + ground truth and the right image is the prediction. + - If ``show`` is True, all storage backends are ignored, and + the images will be displayed in a local window. + - If ``out_file`` is specified, the drawn image will be + saved to ``out_file``. t is usually used when the display + is not available. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to draw. + data_sample (:obj:`DetDataSample`, optional): A data + sample that contain annotations and predictions. + Defaults to None. + draw_gt (bool): Whether to draw GT DetDataSample. Default to True. + draw_pred (bool): Whether to draw Prediction DetDataSample. + Defaults to True. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). 
Defaults to 0. + out_file (str): Path to output file. Defaults to None. + pred_score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + step (int): Global step value to record. Defaults to 0. + """ + image = image.clip(0, 255).astype(np.uint8) + classes = self.dataset_meta.get('classes', None) + palette = self.dataset_meta.get('palette', None) + + gt_img_data = None + pred_img_data = None + + if data_sample is not None: + data_sample = data_sample.cpu() + + if draw_gt and data_sample is not None: + gt_img_data = image + if 'gt_instances' in data_sample: + gt_img_data = self._draw_instances(image, + data_sample.gt_instances, + classes, palette) + if 'gt_sem_seg' in data_sample: + gt_img_data = self._draw_sem_seg(gt_img_data, + data_sample.gt_sem_seg, + classes, palette) + + if 'gt_panoptic_seg' in data_sample: + assert classes is not None, 'class information is ' \ + 'not provided when ' \ + 'visualizing panoptic ' \ + 'segmentation results.' + gt_img_data = self._draw_panoptic_seg( + gt_img_data, data_sample.gt_panoptic_seg, classes, palette) + + if draw_pred and data_sample is not None: + pred_img_data = image + if 'pred_instances' in data_sample: + pred_instances = data_sample.pred_instances + pred_instances = pred_instances[ + pred_instances.scores > pred_score_thr] + pred_img_data = self._draw_instances(image, pred_instances, + classes, palette) + + if 'pred_sem_seg' in data_sample: + pred_img_data = self._draw_sem_seg(pred_img_data, + data_sample.pred_sem_seg, + classes, palette) + + if 'pred_panoptic_seg' in data_sample: + assert classes is not None, 'class information is ' \ + 'not provided when ' \ + 'visualizing panoptic ' \ + 'segmentation results.' + pred_img_data = self._draw_panoptic_seg( + pred_img_data, data_sample.pred_panoptic_seg.numpy(), + classes, palette) + + if gt_img_data is not None and pred_img_data is not None: + drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1) + elif gt_img_data is not None: + drawn_img = gt_img_data + elif pred_img_data is not None: + drawn_img = pred_img_data + else: + # Display the original image directly if nothing is drawn. + drawn_img = image + + # It is convenient for users to obtain the drawn image. + # For example, the user wants to obtain the drawn image and + # save it as a video during video inference. + self.set_image(drawn_img) + + if show: + self.show(drawn_img, win_name=name, wait_time=wait_time) + + if out_file is not None: + mmcv.imwrite(drawn_img[..., ::-1], out_file) + else: + self.add_image(name, drawn_img, step) + + +def random_color(seed): + """Random a color according to the input seed.""" + if sns is None: + raise RuntimeError('motmetrics is not installed,\ + please install it by: pip install seaborn') + np.random.seed(seed) + colors = sns.color_palette() + color = colors[np.random.choice(range(len(colors)))] + color = tuple([int(255 * c) for c in color]) + return color + + +@VISUALIZERS.register_module() +class TrackLocalVisualizer(Visualizer): + """Tracking Local Visualizer for the MOT, VIS tasks. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): the origin image to draw. The format + should be RGB. Defaults to None. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + line_width (int, float): The linewidth of lines. + Defaults to 3. 
+ alpha (int, float): The transparency of bboxes or mask. + Defaults to 0.8. + """ + + def __init__(self, + name: str = 'visualizer', + image: Optional[np.ndarray] = None, + vis_backends: Optional[Dict] = None, + save_dir: Optional[str] = None, + line_width: Union[int, float] = 3, + alpha: float = 0.8) -> None: + super().__init__(name, image, vis_backends, save_dir) + self.line_width = line_width + self.alpha = alpha + # Set default value. When calling + # `TrackLocalVisualizer().dataset_meta=xxx`, + # it will override the default value. + self.dataset_meta = {} + + def _draw_instances(self, image: np.ndarray, + instances: InstanceData) -> np.ndarray: + """Draw instances of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + instances (:obj:`InstanceData`): Data structure for + instance-level annotations or predictions. + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + self.set_image(image) + classes = self.dataset_meta.get('classes', None) + + # get colors and texts + # for the MOT and VIS tasks + colors = [random_color(_id) for _id in instances.instances_id] + categories = [ + classes[label] if classes is not None else f'cls{label}' + for label in instances.labels + ] + if 'scores' in instances: + texts = [ + f'{category_name}\n{instance_id} | {score:.2f}' + for category_name, instance_id, score in zip( + categories, instances.instances_id, instances.scores) + ] + else: + texts = [ + f'{category_name}\n{instance_id}' for category_name, + instance_id in zip(categories, instances.instances_id) + ] + + # draw bboxes and texts + if 'bboxes' in instances: + # draw bboxes + bboxes = instances.bboxes.clone() + self.draw_bboxes( + bboxes, + edge_colors=colors, + alpha=self.alpha, + line_widths=self.line_width) + # draw texts + if texts is not None: + positions = bboxes[:, :2] + self.line_width + areas = (bboxes[:, 3] - bboxes[:, 1]) * ( + bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas.cpu().numpy()) + for i, pos in enumerate(positions): + self.draw_texts( + texts[i], + pos, + colors='black', + font_sizes=int(13 * scales[i]), + bboxes=[{ + 'facecolor': [c / 255 for c in colors[i]], + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + # draw masks + if 'masks' in instances: + masks = instances.masks + polygons = [] + for i, mask in enumerate(masks): + contours, _ = bitmap_to_polygon(mask) + polygons.extend(contours) + self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha) + self.draw_binary_masks(masks, colors=colors, alphas=self.alpha) + + return self.get_image() + + @master_only + def add_datasample( + self, + name: str, + image: np.ndarray, + data_sample: DetDataSample = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: int = 0, + # TODO: Supported in mmengine's Viusalizer. + out_file: Optional[str] = None, + pred_score_thr: float = 0.3, + step: int = 0) -> None: + """Draw datasample and save to all backends. + + - If GT and prediction are plotted at the same time, they are + displayed in a stitched image where the left image is the + ground truth and the right image is the prediction. + - If ``show`` is True, all storage backends are ignored, and + the images will be displayed in a local window. + - If ``out_file`` is specified, the drawn image will be + saved to ``out_file``. t is usually used when the display + is not available. + Args: + name (str): The image identifier. + image (np.ndarray): The image to draw. 
+ data_sample (OptTrackSampleList): A data + sample that contain annotations and predictions. + Defaults to None. + draw_gt (bool): Whether to draw GT TrackDataSample. + Default to True. + draw_pred (bool): Whether to draw Prediction TrackDataSample. + Defaults to True. + show (bool): Whether to display the drawn image. Default to False. + wait_time (int): The interval of show (s). Defaults to 0. + out_file (str): Path to output file. Defaults to None. + pred_score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + step (int): Global step value to record. Defaults to 0. + """ + gt_img_data = None + pred_img_data = None + + if data_sample is not None: + data_sample = data_sample.cpu() + + if draw_gt and data_sample is not None: + assert 'gt_instances' in data_sample + gt_img_data = self._draw_instances(image, data_sample.gt_instances) + + if draw_pred and data_sample is not None: + assert 'pred_track_instances' in data_sample + pred_instances = data_sample.pred_track_instances + if 'scores' in pred_instances: + pred_instances = pred_instances[ + pred_instances.scores > pred_score_thr].cpu() + pred_img_data = self._draw_instances(image, pred_instances) + + if gt_img_data is not None and pred_img_data is not None: + drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1) + elif gt_img_data is not None: + drawn_img = gt_img_data + else: + drawn_img = pred_img_data + + if show: + self.show(drawn_img, win_name=name, wait_time=wait_time) + + if out_file is not None: + mmcv.imwrite(drawn_img[..., ::-1], out_file) + else: + self.add_image(name, drawn_img, step) diff --git a/mmdetection/mmdet/visualization/palette.py b/mmdetection/mmdet/visualization/palette.py new file mode 100644 index 0000000..3c402c0 --- /dev/null +++ b/mmdetection/mmdet/visualization/palette.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import mmcv +import numpy as np +from mmengine.utils import is_str + + +def palette_val(palette: List[tuple]) -> List[tuple]: + """Convert palette to matplotlib palette. + + Args: + palette (List[tuple]): A list of color tuples. + + Returns: + List[tuple[float]]: A list of RGB matplotlib color tuples. + """ + new_palette = [] + for color in palette: + color = [c / 255 for c in color] + new_palette.append(tuple(color)) + return new_palette + + +def get_palette(palette: Union[List[tuple], str, tuple], + num_classes: int) -> List[Tuple[int]]: + """Get palette from various inputs. + + Args: + palette (list[tuple] | str | tuple): palette inputs. + num_classes (int): the number of classes. + + Returns: + list[tuple[int]]: A list of color tuples. 
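+
+    Examples:
+        >>> # A single RGB tuple is repeated for every class:
+        >>> get_palette((255, 0, 0), 3)
+        [(255, 0, 0), (255, 0, 0), (255, 0, 0)]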
+ """ + assert isinstance(num_classes, int) + + if isinstance(palette, list): + dataset_palette = palette + elif isinstance(palette, tuple): + dataset_palette = [palette] * num_classes + elif palette == 'random' or palette is None: + state = np.random.get_state() + # random color + np.random.seed(42) + palette = np.random.randint(0, 256, size=(num_classes, 3)) + np.random.set_state(state) + dataset_palette = [tuple(c) for c in palette] + elif palette == 'coco': + from mmdet.datasets import CocoDataset, CocoPanopticDataset + dataset_palette = CocoDataset.METAINFO['palette'] + if len(dataset_palette) < num_classes: + dataset_palette = CocoPanopticDataset.METAINFO['palette'] + elif palette == 'citys': + from mmdet.datasets import CityscapesDataset + dataset_palette = CityscapesDataset.METAINFO['palette'] + elif palette == 'voc': + from mmdet.datasets import VOCDataset + dataset_palette = VOCDataset.METAINFO['palette'] + elif is_str(palette): + dataset_palette = [mmcv.color_val(palette)[::-1]] * num_classes + else: + raise TypeError(f'Invalid type for palette: {type(palette)}') + + assert len(dataset_palette) >= num_classes, \ + 'The length of palette should not be less than `num_classes`.' + return dataset_palette + + +def _get_adaptive_scales(areas: np.ndarray, + min_area: int = 800, + max_area: int = 30000) -> np.ndarray: + """Get adaptive scales according to areas. + + The scale range is [0.5, 1.0]. When the area is less than + ``min_area``, the scale is 0.5 while the area is larger than + ``max_area``, the scale is 1.0. + + Args: + areas (ndarray): The areas of bboxes or masks with the + shape of (n, ). + min_area (int): Lower bound areas for adaptive scales. + Defaults to 800. + max_area (int): Upper bound areas for adaptive scales. + Defaults to 30000. + + Returns: + ndarray: The adaotive scales with the shape of (n, ). + """ + scales = 0.5 + (areas - min_area) // (max_area - min_area) + scales = np.clip(scales, 0.5, 1.0) + return scales + + +def jitter_color(color: tuple) -> tuple: + """Randomly jitter the given color in order to better distinguish instances + with the same class. + + Args: + color (tuple): The RGB color tuple. Each value is between [0, 255]. + + Returns: + tuple: The jittered color tuple. 
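+
+    Example:
+        >>> # The output is random but always stays within [0, 255]:
+        >>> jittered = jitter_color((0, 127, 255))
+        >>> assert all(0 <= c <= 255 for c in jittered)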
+ """ + jitter = np.random.rand(3) + jitter = (jitter / np.linalg.norm(jitter) - 0.5) * 0.5 * 255 + color = np.clip(jitter + color, 0, 255).astype(np.uint8) + return tuple(color) diff --git a/mmdetection/model-index.yml b/mmdetection/model-index.yml new file mode 100644 index 0000000..f1704c0 --- /dev/null +++ b/mmdetection/model-index.yml @@ -0,0 +1,101 @@ +Import: + - configs/albu_example/metafile.yml + - configs/atss/metafile.yml + - configs/autoassign/metafile.yml + - configs/boxinst/metafile.yml + - configs/carafe/metafile.yml + - configs/cascade_rcnn/metafile.yml + - configs/cascade_rpn/metafile.yml + - configs/centernet/metafile.yml + - configs/centripetalnet/metafile.yml + - configs/condinst/metafile.yml + - configs/conditional_detr/metafile.yml + - configs/cornernet/metafile.yml + - configs/convnext/metafile.yml + - configs/crowddet/metafile.yml + - configs/dab_detr/metafile.yml + - configs/dcn/metafile.yml + - configs/dcnv2/metafile.yml + - configs/ddod/metafile.yml + - configs/deformable_detr/metafile.yml + - configs/detectors/metafile.yml + - configs/detr/metafile.yml + - configs/dino/metafile.yml + - configs/double_heads/metafile.yml + - configs/dyhead/metafile.yml + - configs/dynamic_rcnn/metafile.yml + - configs/efficientnet/metafile.yml + - configs/empirical_attention/metafile.yml + - configs/faster_rcnn/metafile.yml + - configs/fcos/metafile.yml + - configs/foveabox/metafile.yml + - configs/fpg/metafile.yml + - configs/free_anchor/metafile.yml + - configs/fsaf/metafile.yml + - configs/gcnet/metafile.yml + - configs/gfl/metafile.yml + - configs/ghm/metafile.yml + - configs/gn/metafile.yml + - configs/gn+ws/metafile.yml + - configs/grid_rcnn/metafile.yml + - configs/groie/metafile.yml + - configs/guided_anchoring/metafile.yml + - configs/hrnet/metafile.yml + - configs/htc/metafile.yml + - configs/instaboost/metafile.yml + - configs/lad/metafile.yml + - configs/ld/metafile.yml + - configs/libra_rcnn/metafile.yml + - configs/lvis/metafile.yml + - configs/mask2former/metafile.yml + - configs/mask_rcnn/metafile.yml + - configs/maskformer/metafile.yml + - configs/ms_rcnn/metafile.yml + - configs/nas_fcos/metafile.yml + - configs/nas_fpn/metafile.yml + - configs/openimages/metafile.yml + - configs/paa/metafile.yml + - configs/pafpn/metafile.yml + - configs/panoptic_fpn/metafile.yml + - configs/pvt/metafile.yml + - configs/pisa/metafile.yml + - configs/point_rend/metafile.yml + - configs/queryinst/metafile.yml + - configs/regnet/metafile.yml + - configs/reppoints/metafile.yml + - configs/res2net/metafile.yml + - configs/resnest/metafile.yml + - configs/resnet_strikes_back/metafile.yml + - configs/retinanet/metafile.yml + - configs/rpn/metafile.yml + - configs/rtmdet/metafile.yml + - configs/sabl/metafile.yml + - configs/scnet/metafile.yml + - configs/scratch/metafile.yml + - configs/seesaw_loss/metafile.yml + - configs/simple_copy_paste/metafile.yml + - configs/soft_teacher/metafile.yml + - configs/sparse_rcnn/metafile.yml + - configs/solo/metafile.yml + - configs/solov2/metafile.yml + - configs/ssd/metafile.yml + - configs/strong_baselines/metafile.yml + - configs/swin/metafile.yml + - configs/tridentnet/metafile.yml + - configs/tood/metafile.yml + - configs/vfnet/metafile.yml + - configs/yolact/metafile.yml + - configs/yolo/metafile.yml + - configs/yolof/metafile.yml + - configs/yolox/metafile.yml + - configs/bytetrack/metafile.yml + - configs/strongsort/metafile.yml + - configs/ocsort/metafile.yml + - configs/sort/metafile.yml + - configs/deepsort/metafile.yml + - 
configs/qdtrack/metafile.yml + - configs/mask2former_vis/metafile.yml + - configs/masktrack_rcnn/metafile.yml + - configs/glip/metafile.yml + - configs/ddq/metafile.yml + - configs/grounding_dino/metafile.yml diff --git a/mmdetection/projects/AlignDETR/README.md b/mmdetection/projects/AlignDETR/README.md new file mode 100644 index 0000000..33690fe --- /dev/null +++ b/mmdetection/projects/AlignDETR/README.md @@ -0,0 +1,33 @@ +# AlignDETR + +> [Align-DETR: Improving DETR with Simple IoU-aware BCE loss](https://arxiv.org/abs/2304.07527) + + + +## Abstract + +DETR has set up a simple end-to-end pipeline for object detection by formulating this task as a set prediction problem, showing promising potential. However, despite the significant progress in improving DETR, this paper identifies a problem of misalignment in the output distribution, which prevents the best-regressed samples from being assigned with high confidence, hindering the model's accuracy. We propose a metric, recall of best-regressed samples, to quantitively evaluate the misalignment problem. Observing its importance, we propose a novel Align-DETR that incorporates a localization precision-aware classification loss in optimization. The proposed loss, IA-BCE, guides the training of DETR to build a strong correlation between classification score and localization precision. We also adopt the mixed-matching strategy, to facilitate DETR-based detectors with faster training convergence while keeping an end-to-end scheme. Moreover, to overcome the dramatic decrease in sample quality induced by the sparsity of queries, we introduce a prime sample weighting mechanism to suppress the interference of unimportant samples. Extensive experiments are conducted with very competitive results reported. In particular, it delivers a 46 (+3.8)% AP on the DAB-DETR baseline with the ResNet-50 backbone and reaches a new SOTA performance of 50.2% AP in the 1x setting on the COCO validation set when employing the strong baseline DINO. 
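+The mixed-matching strategy amounts to tiling the matching-cost matrix `k` times and running a single Hungarian assignment, so that each ground-truth box is matched to `k` distinct queries. Below is a small illustrative sketch of that step (NumPy/SciPy only; `one_to_k_match` is a made-up name, not part of this project); the real logic lives in `align_detr/mixed_hungarian_assigner.py`.
+
+```python
+# Illustrative sketch of 1-to-k Hungarian matching; not the project's API.
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+
+def one_to_k_match(cost: np.ndarray, k: int) -> np.ndarray:
+    """cost: (num_preds, num_gts). Returns the assigned gt index per
+    prediction, or -1 for background."""
+    num_preds, num_gts = cost.shape
+    k = max(1, min(k, num_preds // num_gts))   # never request more pairs than preds
+    tiled = np.tile(cost, (1, k))              # (num_preds, k * num_gts)
+    rows, cols = linear_sum_assignment(tiled)  # each gt copy gets a unique query
+    assigned = np.full(num_preds, -1, dtype=int)
+    assigned[rows] = cols % num_gts            # fold the k copies back to gt ids
+    return assigned
+
+
+# e.g. 900 queries, 3 gt boxes, k = 2  ->  6 queries become positives
+print((one_to_k_match(np.random.rand(900, 3), k=2) >= 0).sum())
+```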
+ +![image](https://github.com/open-mmlab/mmdetection/assets/33146359/5a4fa664-b4c6-487d-b6d8-22be9d59a2bc) + +## Results and Models + +| Backbone | Model | Lr schd | box AP | Config | Download | +| :------: | :---------: | :-----: | :----: | :------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | DINO-4scale | 12e | 50.5 | [config](./align_detr-4scale_r50_8xb2-12e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/align_detr/align_detr-4scale_r50_8xb2-12e_coco/align_detr-4scale_r50_8xb2-12e_coco_20230914_095734-61f921af.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/align_detr/align_detr-4scale_r50_8xb2-12e_coco/align_detr-4scale_r50_8xb2-12e_coco_20230914_095734.log.json) | +| R-50 | DINO-4scale | 24e | 51.4 | [config](./align_detr-4scale_r50_8xb2-24e_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/align_detr/align_detr-4scale_r50_8xb2-24e_coco/align_detr-4scale_r50_8xb2-24e_coco_20230919_152414-f4b6cf76.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/align_detr/align_detr-4scale_r50_8xb2-24e_coco/align_detr-4scale_r50_8xb2-24e_coco_20230919_152414.log.json) | + +## Citation + +We provide the config files for AlignDETR: [Align-DETR: Improving DETR with Simple IoU-aware BCE loss](https://arxiv.org/abs/2304.07527). + +```latex +@misc{cai2023aligndetr, + title={Align-DETR: Improving DETR with Simple IoU-aware BCE loss}, + author={Zhi Cai and Songtao Liu and Guodong Wang and Zheng Ge and Xiangyu Zhang and Di Huang}, + year={2023}, + eprint={2304.07527}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/mmdetection/projects/AlignDETR/align_detr/__init__.py b/mmdetection/projects/AlignDETR/align_detr/__init__.py new file mode 100644 index 0000000..26a49b5 --- /dev/null +++ b/mmdetection/projects/AlignDETR/align_detr/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .align_detr_head import AlignDETRHead +from .mixed_hungarian_assigner import MixedHungarianAssigner + +__all__ = ['AlignDETRHead', 'MixedHungarianAssigner'] diff --git a/mmdetection/projects/AlignDETR/align_detr/align_detr_head.py b/mmdetection/projects/AlignDETR/align_detr/align_detr_head.py new file mode 100644 index 0000000..c06d1bd --- /dev/null +++ b/mmdetection/projects/AlignDETR/align_detr/align_detr_head.py @@ -0,0 +1,508 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, List, Tuple, Union + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.dense_heads import DINOHead +from mmdet.registry import MODELS +from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcywh) +from mmdet.utils import InstanceList +from .utils import KeysRecorder + + +@MODELS.register_module() +class AlignDETRHead(DINOHead): + r"""Head of the Align-DETR: Improving DETR with Simple IoU-aware BCE loss + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . + + Args: + all_layers_num_gt_repeat List[int]: Number to repeat gt for 1-to-k + matching between ground truth and predictions of each decoder + layer. 
Only used for matching queries, not for denoising queries. + Element count is `num_pred_layer`. If `as_two_stage` is True, then + the last element is for encoder output, and the others for + decoder layers. Otherwise, all elements are for decoder layers. + Defaults to a list of `1` for the last decoder layer and `2` for + the others. + alpha (float): Hyper-parameter of classification loss that controls + the proportion of each item to calculate `t`, the weighted + geometric average of the confident score and the IoU score, to + align classification and regression scores. Defaults to `0.25`. + gamma (float): Hyper-parameter of classification loss to do the hard + negative mining. Defaults to `2.0`. + tau (float): Hyper-parameter of classification and regression losses, + it is the temperature controlling the sharpness of the function + to calculate positive sample weight. Defaults to `1.5`. + """ + + def __init__(self, + *args, + all_layers_num_gt_repeat: List[int] = None, + alpha: float = 0.25, + gamma: float = 2.0, + tau: float = 1.5, + **kwargs) -> None: + self.all_layers_num_gt_repeat = all_layers_num_gt_repeat + self.alpha = alpha + self.gamma = gamma + self.tau = tau + self.weight_table = torch.zeros( + len(all_layers_num_gt_repeat), max(all_layers_num_gt_repeat)) + for layer_index, num_gt_repeat in enumerate(all_layers_num_gt_repeat): + self.weight_table[layer_index][:num_gt_repeat] = torch.exp( + -torch.arange(num_gt_repeat) / tau) + + super().__init__(*args, **kwargs) + assert len(self.all_layers_num_gt_repeat) == self.num_pred_layer + + def loss_by_feat(self, all_layers_cls_scores: Tensor, *args, + **kwargs) -> Any: + """Loss function. + AlignDETR: This method is based on `DINOHead.loss_by_feat`. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels), where + `num_queries_total` is the sum of `num_denoising_queries` + and `num_matching_queries`. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + # Wrap `all_layers_cls_scores` with KeysRecorder to record its + # `__getitem__` keys and get decoder layer index. + all_layers_cls_scores = KeysRecorder(all_layers_cls_scores) + result = super(AlignDETRHead, + self).loss_by_feat(all_layers_cls_scores, *args, + **kwargs) + return result + + def loss_by_feat_single(self, cls_scores: Union[KeysRecorder, Tensor], + bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer of a single + feature level. + AlignDETR: This method is based on `DINOHead.loss_by_feat_single`. + + Args: + cls_scores (Union[KeysRecorder, Tensor]): Box score logits from a + single decoder layer for all images, has shape (bs, + num_queries, cls_out_channels). + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape (bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. + """ + # AlignDETR: Get layer_index. + if isinstance(cls_scores, KeysRecorder): + # Outputs are from decoder layer. Get layer_index from + # `__getitem__` keys history. 
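+            # By this point the wrapper has typically been indexed twice:
+            # once with a tuple of slices when DINOHead split the matching
+            # and denoising queries, and once with an int when `multi_apply`
+            # iterated over decoder layers, so keeping only the int keys
+            # leaves exactly the decoder-layer index.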
+ keys = [key for key in cls_scores.keys if isinstance(key, int)] + assert len(keys) == 1, \ + 'Failed to extract key from cls_scores.keys: {}'.format(keys) + layer_index = keys[0] + # Get dn_cls_scores tensor. + cls_scores = cls_scores.obj + else: + # Outputs are from encoder layer. + layer_index = self.num_pred_layer - 1 + + for img_meta in batch_img_metas: + img_meta['layer_index'] = layer_index + + results = super(AlignDETRHead, self).loss_by_feat_single( + cls_scores, + bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + return results + + def get_targets(self, cls_scores_list: List[Tensor], + bbox_preds_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> tuple: + """Compute regression and classification targets for a batch image. + + Outputs from a single decoder layer of a single feature level are used. + AlignDETR: This method is based on `DETRHead.get_targets`. + + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image, has shape [num_queries, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_queries, 4]. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. + - num_total_neg (int): Number of negative samples in all images. + """ + results = super(AlignDETRHead, + self).get_targets(cls_scores_list, bbox_preds_list, + batch_gt_instances, batch_img_metas) + + # AlignDETR: `num_total_pos` for matching queries is the number of + # unique gt bboxes in the batch. Refer to AlignDETR official code: + # https://github.com/FelixCaae/AlignDETR/blob/8c2b1806026e1b33fe1c282577de1647e352d7f0/aligndetr/criterions/base_criterion.py#L195C15-L195C15 # noqa: E501 + num_total_pos = sum( + len(gt_instances) for gt_instances in batch_gt_instances) + + results = list(results) + results[-2] = num_total_pos + return tuple(results) + + def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> tuple: + """Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + AlignDETR: This method is based on `DETRHead._get_targets_single`. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_queries, 4]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + layer_index (int): Decoder layer index for the outputs. Defaults + to `-1`. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. 
+ + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + # convert bbox_pred from xywh, normalized to xyxy, unnormalized + bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + bbox_pred = bbox_pred * factor + + pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred) + + # assigner and sampler + # AlignDETR: Get `k` of current layer. + layer_index = img_meta['layer_index'] + num_gt_repeat = self.all_layers_num_gt_repeat[layer_index] + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta, + k=num_gt_repeat) + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] + + # AlignDETR: Get label targets, label weights, and bbox weights. + target_results = self._get_align_detr_targets_single( + cls_score, + bbox_pred, + gt_labels, + pos_gt_bboxes, + pos_inds, + pos_assigned_gt_inds, + layer_index, + is_matching_queries=True) + + label_targets, label_weights, bbox_weights = target_results + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + pos_gt_bboxes_normalized = pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + return (label_targets, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def _loss_dn_single(self, dn_cls_scores: KeysRecorder, + dn_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int]) -> Tuple[Tensor]: + """Denoising loss for outputs from a single decoder layer. + AlignDETR: This method is based on `DINOHead._loss_dn_single`. + + Args: + dn_cls_scores (KeysRecorder): Classification scores of a single + decoder layer in denoising part, has shape (bs, + num_denoising_queries, cls_out_channels). + dn_bbox_preds (Tensor): Regression outputs of a single decoder + layer in denoising part. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and has shape + (bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
+ """ + # AlignDETR: Get dn_cls_scores tensor. + dn_cls_scores = dn_cls_scores.obj + + # AlignDETR: Add layer outputs to meta info because they are not + # variables of method `_get_dn_targets_single`. + for image_index, img_meta in enumerate(batch_img_metas): + img_meta['dn_cls_score'] = dn_cls_scores[image_index] + img_meta['dn_bbox_pred'] = dn_bbox_preds[image_index] + + results = super()._loss_dn_single(dn_cls_scores, dn_bbox_preds, + batch_gt_instances, batch_img_metas, + dn_meta) + return results + + def _get_dn_targets_single(self, gt_instances: InstanceData, + img_meta: dict, dn_meta: Dict[str, + int]) -> tuple: + """Get targets in denoising part for one image. + AlignDETR: This method is based on + `DINOHead._get_dn_targets_single`. + and 1) Added passing `dn_cls_score`, `dn_bbox_pred` to this + method; 2) Modified the way to get targets. + Args: + dn_cls_score (Tensor): Box score logits from a single decoder + layer in denoising part for one image, has shape + [num_denoising_queries, cls_out_channels]. + dn_bbox_pred (Tensor): Sigmoid outputs from a single decoder + layer in denoising part for one image, with + normalized coordinate (cx, cy, w, h) and shape + [num_denoising_queries, 4]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_groups = dn_meta['num_denoising_groups'] + num_denoising_queries = dn_meta['num_denoising_queries'] + num_queries_each_group = int(num_denoising_queries / num_groups) + device = gt_bboxes.device + + if len(gt_labels) > 0: + t = torch.arange(len(gt_labels), dtype=torch.long, device=device) + t = t.unsqueeze(0).repeat(num_groups, 1) + pos_assigned_gt_inds = t.flatten() + pos_inds = torch.arange( + num_groups, dtype=torch.long, device=device) + pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t + pos_inds = pos_inds.flatten() + else: + pos_inds = pos_assigned_gt_inds = \ + gt_bboxes.new_tensor([], dtype=torch.long) + + neg_inds = pos_inds + num_queries_each_group // 2 + + # AlignDETR: Get meta info and layer outputs. + img_h, img_w = img_meta['img_shape'] + dn_cls_score = img_meta['dn_cls_score'] + dn_bbox_pred = img_meta['dn_bbox_pred'] + factor = dn_bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + + # AlignDETR: Convert dn_bbox_pred from xywh, normalized to xyxy, + # unnormalized. + dn_bbox_pred = bbox_cxcywh_to_xyxy(dn_bbox_pred) + dn_bbox_pred = dn_bbox_pred * factor + + # AlignDETR: Get label targets, label weights, and bbox weights. 
+ target_results = self._get_align_detr_targets_single( + dn_cls_score, dn_bbox_pred, gt_labels, + gt_bboxes.repeat([num_groups, 1]), pos_inds, pos_assigned_gt_inds) + + label_targets, label_weights, bbox_weights = target_results + + # bbox targets + bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + gt_bboxes_normalized = gt_bboxes / factor + gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) + bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) + + return (label_targets, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def _get_align_detr_targets_single(self, + cls_score: Tensor, + bbox_pred: Tensor, + gt_labels: Tensor, + pos_gt_bboxes: Tensor, + pos_inds: Tensor, + pos_assigned_gt_inds: Tensor, + layer_index: int = -1, + is_matching_queries: bool = False): + '''AlignDETR: Get label targets, label weights, and bbox weights based + on `t`, the weighted geometric average of the confident score and + the IoU score, to align classification and regression scores. + + Args: + cls_score (Tensor): Box score logits from the last encoder layer + or a single decoder layer for one image. Shape + [num_queries or num_denoising_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from the last encoder layer + or a single decoder layer for one image, with unnormalized + coordinate (x, y, x, y) and shape + [num_queries or num_denoising_queries, 4]. + gt_labels (Tensor): Ground truth classification labels for one + image, has shape [num_gt]. + pos_gt_bboxes (Tensor): Positive ground truth bboxes for one + image, with unnormalized coordinate (x, y, x, y) and shape + [num_positive, 4]. + pos_inds (Tensor): Positive prediction box indices, has shape + [num_positive]. + pos_assigned_gt_inds Tensor: Positive ground truth box indices, + has shape [num_positive]. + layer_index (int): decoder layer index for the outputs. Defaults + to `-1`. + is_matching_queries (bool): The outputs are from matching + queries or denoising queries. Defaults to `False`. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - label_targets (Tensor): Labels of one image. Shape + [num_queries or num_denoising_queries, cls_out_channels]. + - label_weights (Tensor): Label weights of one image. Shape + [num_queries or num_denoising_queries, cls_out_channels]. + - bbox_weights (Tensor): BBox weights of one image. Shape + [num_queries or num_denoising_queries, 4]. + ''' + + # Classification loss + # = 1 * BCE(prob, t * rank_weights) for positive sample; + # = prob**gamma * BCE(prob, 0) for negative sample. + # That is, + # label_targets = 0 for negative sample; + # = t * rank_weights for positive sample. + # label_weights = pred**gamma for negative sample; + # = 1 for positive sample. 
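+        # Illustrative aside: with the default alpha = 0.25, a positive
+        # prediction with classification prob 0.6 and IoU 0.8 gets
+        # t = 0.6**0.25 * 0.8**0.75 ~= 0.74, i.e. the BCE target leans more
+        # on localization quality than on the raw confidence.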
+ cls_prob = cls_score.sigmoid() + label_targets = torch.zeros_like( + cls_score, device=pos_gt_bboxes.device) + label_weights = cls_prob**self.gamma + + bbox_weights = torch.zeros_like(bbox_pred, dtype=pos_gt_bboxes.dtype) + + if len(pos_inds) == 0: + return label_targets, label_weights, bbox_weights + + pos_cls_score_inds = (pos_inds, gt_labels[pos_assigned_gt_inds]) + iou_scores = bbox_overlaps( + bbox_pred[pos_inds], pos_gt_bboxes, is_aligned=True) + + # t (Tensor): The weighted geometric average of the confident score + # and the IoU score, to align classification and regression scores. + # Shape [num_positive]. + t = ( + cls_prob[pos_cls_score_inds]**self.alpha * + iou_scores**(1 - self.alpha)) + t = torch.clamp(t, 0.01).detach() + + # Calculate rank_weights for matching queries. + if is_matching_queries: + # rank_weights (Tensor): Weights of each group of predictions + # assigned to the same positive gt bbox. Shape [num_positive]. + rank_weights = torch.zeros_like(t, dtype=self.weight_table.dtype) + + assert 0 <= layer_index < len(self.weight_table), layer_index + rank_to_weight = self.weight_table[layer_index].to( + rank_weights.device) + unique_gt_inds = torch.unique(pos_assigned_gt_inds) + + # For each positive gt bbox, get all predictions assigned to it, + # then calculate rank weights for this group of predictions. + for gt_index in unique_gt_inds: + pred_group_cond = pos_assigned_gt_inds == gt_index + # Weights are based on their rank sorted by t in the group. + pred_group = t[pred_group_cond] + indices = pred_group.sort(descending=True)[1] + group_weights = torch.zeros_like( + indices, dtype=self.weight_table.dtype) + group_weights[indices] = rank_to_weight[:len(indices)] + rank_weights[pred_group_cond] = group_weights + + t = t * rank_weights + pos_bbox_weights = rank_weights.unsqueeze(-1).repeat( + 1, bbox_pred.size(-1)) + bbox_weights[pos_inds] = pos_bbox_weights + else: + bbox_weights[pos_inds] = 1.0 + + label_targets[pos_cls_score_inds] = t + label_weights[pos_cls_score_inds] = 1.0 + + return label_targets, label_weights, bbox_weights diff --git a/mmdetection/projects/AlignDETR/align_detr/mixed_hungarian_assigner.py b/mmdetection/projects/AlignDETR/align_detr/mixed_hungarian_assigner.py new file mode 100644 index 0000000..cc31b5e --- /dev/null +++ b/mmdetection/projects/AlignDETR/align_detr/mixed_hungarian_assigner.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import torch +from mmengine import ConfigDict +from mmengine.structures import InstanceData +from scipy.optimize import linear_sum_assignment +from torch import Tensor + +from mmdet.models.task_modules import AssignResult, BaseAssigner +from mmdet.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class MixedHungarianAssigner(BaseAssigner): + """Computes 1-to-k matching between ground truth and predictions. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of some components. + For DETR the costs are weighted sum of classification cost, regression L1 + cost and regression iou cost. The targets don't include the no_object, so + generally there are more predictions than targets. After the 1-to-k + gt-pred matching, the un-matched are treated as backgrounds. 
Thus + each query prediction will be assigned with `0` or a positive integer + indicating the ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + match_costs (:obj:`ConfigDict` or dict or \ + List[Union[:obj:`ConfigDict`, dict]]): Match cost configs. + """ + + def __init__( + self, match_costs: Union[List[Union[dict, ConfigDict]], dict, + ConfigDict] + ) -> None: + + if isinstance(match_costs, dict): + match_costs = [match_costs] + elif isinstance(match_costs, list): + assert len(match_costs) > 0, \ + 'match_costs must not be a empty list.' + + self.match_costs = [ + TASK_UTILS.build(match_cost) for match_cost in match_costs + ] + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + k: int = 1, + **kwargs) -> AssignResult: + """Computes 1-to-k gt-pred matching based on the weighted costs. + + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. Assign every prediction to -1. + 2. Compute the weighted costs, each cost has shape + (num_preds, num_gts). + 3. Update k according to num_preds and num_gts, then repeat + costs k times to shape: (num_preds, k * num_gts), so that each + gt will match k predictions. + 4. Do Hungarian matching on CPU based on the costs. + 5. Assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. It may includes ``masks``, with shape + (n, h, w) or (n, l). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + ``labels``, with shape (k, ) and ``masks``, with shape + (k, h, w) or (k, l). + img_meta (dict): Image information for one image. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert isinstance(gt_instances.labels, Tensor) + num_gts, num_preds = len(gt_instances), len(pred_instances) + gt_labels = gt_instances.labels + device = gt_labels.device + + # 1. Assign -1 by default. + assigned_gt_inds = torch.full((num_preds, ), + -1, + dtype=torch.long, + device=device) + assigned_labels = torch.full((num_preds, ), + -1, + dtype=torch.long, + device=device) + + if num_gts == 0 or num_preds == 0: + # No ground truth or boxes, return empty assignment. + if num_gts == 0: + # No ground truth, assign all to background. + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) + + # 2. Compute weighted costs. + cost_list = [] + for match_cost in self.match_costs: + cost = match_cost( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta) + cost_list.append(cost) + cost = torch.stack(cost_list).sum(dim=0) + + # 3. 
Update k according to num_preds and num_gts, then + # repeat the ground truth k times to perform 1-to-k gt-pred + # matching. For example, if num_preds = 900, num_gts = 3, then + # there are only 3 gt-pred pairs in sum for 1-1 matching. + # However, for 1-k gt-pred matching, if k = 4, then each + # gt is assigned 4 unique predictions, so there would be 12 + # gt-pred pairs in sum. + k = max(1, min(k, num_preds // num_gts)) + cost = cost.repeat(1, k) + + # 4. Do Hungarian matching on CPU using linear_sum_assignment. + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to(device) + matched_col_inds = torch.from_numpy(matched_col_inds).to(device) + + matched_col_inds = matched_col_inds % num_gts + # 5. Assign backgrounds and foregrounds. + # Assign all indices to backgrounds first. + assigned_gt_inds[:] = 0 + # Assign foregrounds based on matching results. + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + assign_result = AssignResult( + num_gts=k * num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) + + return assign_result diff --git a/mmdetection/projects/AlignDETR/align_detr/utils.py b/mmdetection/projects/AlignDETR/align_detr/utils.py new file mode 100644 index 0000000..5a3c17e --- /dev/null +++ b/mmdetection/projects/AlignDETR/align_detr/utils.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, List, Optional + + +class KeysRecorder: + """Wrap object to record its `__getitem__` keys in the history. + + Args: + obj (object): Any object that supports `__getitem__`. + keys (List): List of keys already recorded. Default to None. + """ + + def __init__(self, obj: Any, keys: Optional[List[Any]] = None) -> None: + self.obj = obj + + if keys is None: + keys = [] + self.keys = keys + + def __getitem__(self, key: Any) -> 'KeysRecorder': + """Wrap method `__getitem__` to record its keys. + + Args: + key: Key that is passed to the object. + + Returns: + result (KeysRecorder): KeysRecorder instance that wraps sub_obj. + """ + sub_obj = self.obj.__getitem__(key) + keys = self.keys.copy() + keys.append(key) + # Create a KeysRecorder instance from the sub_obj. + result = KeysRecorder(sub_obj, keys) + return result diff --git a/mmdetection/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-12e_coco.py b/mmdetection/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-12e_coco.py new file mode 100644 index 0000000..0fe0699 --- /dev/null +++ b/mmdetection/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-12e_coco.py @@ -0,0 +1,185 @@ +_base_ = [ + '../../../configs/_base_/datasets/coco_detection.py', + '../../../configs/_base_/default_runtime.py' +] +custom_imports = dict( + imports=['projects.AlignDETR.align_detr'], allow_failed_imports=False) + +model = dict( + type='DINO', + num_queries=900, # num_matching_queries + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + # AlignDETR: Only freeze stem. 
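+        # For comparison: in mmdet's ResNet, `frozen_stages=0` freezes only
+        # the stem (conv1 + bn1), while the DINO baseline config uses
+        # `frozen_stages=1`, which also freezes the first residual stage.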
+ frozen_stages=0, + norm_cfg=dict(type='FrozenBN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + # AlignDETR: Add conv bias. + bias=True, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0))), # 0.1 for DeformDETR + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, + dropout=0.0), # 0.1 for DeformDETR + cross_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0)), # 0.1 for DeformDETR + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, + normalize=True, + # AlignDETR: Set offset and temperature the same as DeformDETR. + offset=-0.5, # -0.5 for DeformDETR + temperature=10000), # 10000 for DeformDETR + bbox_head=dict( + type='AlignDETRHead', + # AlignDETR: First 6 elements of `all_layers_num_gt_repeat` are for + # decoder layers' outputs. The last element is for encoder layer. + all_layers_num_gt_repeat=[2, 2, 2, 2, 2, 1, 2], + alpha=0.25, + gamma=2.0, + tau=1.5, + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0), # 2.0 in DeformDETR + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + dn_cfg=dict( # TODO: Move to model.train_cfg ? + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, + num_dn_queries=100)), # TODO: half num_dn_queries + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MixedHungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) # 100 for DeformDETR + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] +train_dataloader = dict( + dataset=dict( + # AlignDETR: Filter empty gt. 
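+        # For comparison: the DINO baseline trains with
+        # `filter_empty_gt=False`; re-enabling the filter here drops images
+        # without any gt box from training.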
+ filter_cfg=dict(filter_empty_gt=True), + pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001, # 0.0002 for DeformDETR + weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1)}, + # AlignDETR: No norm decay. + norm_decay_mult=0.0) +) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa + +# learning policy +max_epochs = 12 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=False, + begin=0, + end=2000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-24e_coco.py b/mmdetection/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-24e_coco.py new file mode 100644 index 0000000..f62114c --- /dev/null +++ b/mmdetection/projects/AlignDETR/configs/align_detr-4scale_r50_8xb2-24e_coco.py @@ -0,0 +1,19 @@ +_base_ = './align_detr-4scale_r50_8xb2-12e_coco.py' +max_epochs = 24 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=False, + begin=0, + end=2000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[20], + gamma=0.1) +] diff --git a/mmdetection/projects/CO-DETR/README.md b/mmdetection/projects/CO-DETR/README.md new file mode 100644 index 0000000..787592a --- /dev/null +++ b/mmdetection/projects/CO-DETR/README.md @@ -0,0 +1,32 @@ +# CO-DETR + +> [DETRs with Collaborative Hybrid Assignments Training](https://arxiv.org/abs/2211.12860) + + + +## Abstract + +In this paper, we provide the observation that too few queries assigned as positive samples in DETR with one-to-one set matching leads to sparse supervision on the encoder's output which considerably hurt the discriminative feature learning of the encoder and vice visa for attention learning in the decoder. To alleviate this, we present a novel collaborative hybrid assignments training scheme, namely Co-DETR, to learn more efficient and effective DETR-based detectors from versatile label assignment manners. This new training scheme can easily enhance the encoder's learning ability in end-to-end detectors by training the multiple parallel auxiliary heads supervised by one-to-many label assignments such as ATSS and Faster RCNN. In addition, we conduct extra customized positive queries by extracting the positive coordinates from these auxiliary heads to improve the training efficiency of positive samples in the decoder. In inference, these auxiliary heads are discarded and thus our method introduces no additional parameters and computational cost to the original detector while requiring no hand-crafted non-maximum suppression (NMS). We conduct extensive experiments to evaluate the effectiveness of the proposed approach on DETR variants, including DAB-DETR, Deformable-DETR, and DINO-Deformable-DETR. 
The state-of-the-art DINO-Deformable-DETR with Swin-L can be improved from 58.5% to 59.5% AP on COCO val. Surprisingly, incorporated with ViT-L backbone, we achieve 66.0% AP on COCO test-dev and 67.9% AP on LVIS val, outperforming previous methods by clear margins with much fewer model sizes. + +
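+The scheme can be summarized as adding one-to-many supervision only at training time. The sketch below is purely conceptual (the identifiers are invented, not CO-DETR's actual API): the one-to-one DETR loss is summed with the auxiliary-head and customized-positive-query losses, and the auxiliary branches are discarded at inference.
+
+```python
+# Conceptual sketch only; identifiers are illustrative, not CO-DETR's API.
+def co_detr_training_loss(one_to_one_loss: float,
+                          aux_head_losses: list,   # e.g. ATSS / Faster R-CNN heads
+                          pos_query_losses: list,  # customized positive queries
+                          aux_weight: float = 1.0) -> float:
+    # Only the one-to-one (DETR) head is kept at inference time.
+    return one_to_one_loss + aux_weight * (
+        sum(aux_head_losses) + sum(pos_query_losses))
+```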
    + +## Results and Models + +| Model | Backbone | Epochs | Aug | Dataset | box AP | Config | Download | +| :-------: | :------: | :----: | :--: | :---------------------------: | :----: | :--------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Co-DINO | R50 | 12 | LSJ | COCO | 52.0 | [config](configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_r50_lsj_8xb2_1x_coco/co_dino_5scale_r50_lsj_8xb2_1x_coco-69a72d67.pth)\\ [log](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_r50_lsj_8xb2_1x_coco/co_dino_5scale_r50_lsj_8xb2_1x_coco_20230818_150457.json) | +| Co-DINO\* | R50 | 12 | DETR | COCO | 52.1 | [config](configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_r50_1x_coco-7481f903.pth) | +| Co-DINO\* | R50 | 36 | LSJ | COCO | 54.8 | [config](configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_lsj_r50_3x_coco-fe5a6829.pth) | +| Co-DINO\* | Swin-L | 12 | DETR | COCO | 58.9 | [config](configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_swin_large_1x_coco-27c13da4.pth) | +| Co-DINO\* | Swin-L | 12 | LSJ | COCO | 59.3 | [config](configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_lsj_swin_large_1x_coco-3af73af2.pth) | +| Co-DINO\* | Swin-L | 36 | DETR | COCO | 60.0 | [config](configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_swin_large_3x_coco-d7a6d8af.pth) | +| Co-DINO\* | Swin-L | 36 | LSJ | COCO | 60.7 | [config](configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_lsj_swin_large_1x_coco-3af73af2.pth) | +| Co-DINO\* | Swin-L | 16 | DETR | Objects365 pre-trained + COCO | 64.1 | [config](configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_swin_large_16e_o365tococo-614254c9.pth) | + +Note + +- Models labeled * are not trained by us, but from [CO-DETR](https://github.com/Sense-X/Co-DETR) official website. +- We find that the performance is unstable and may fluctuate by about 0.3 mAP. +- If you want to save GPU memory by enabling checkpointing, please use the `pip install fairscale` command. diff --git a/mmdetection/projects/CO-DETR/codetr/__init__.py b/mmdetection/projects/CO-DETR/codetr/__init__.py new file mode 100644 index 0000000..2ca4c02 --- /dev/null +++ b/mmdetection/projects/CO-DETR/codetr/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .co_atss_head import CoATSSHead +from .co_dino_head import CoDINOHead +from .co_roi_head import CoStandardRoIHead +from .codetr import CoDETR +from .transformer import (CoDinoTransformer, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DinoTransformerDecoder) + +__all__ = [ + 'CoDETR', 'CoDinoTransformer', 'DinoTransformerDecoder', 'CoDINOHead', + 'CoATSSHead', 'CoStandardRoIHead', 'DetrTransformerEncoder', + 'DetrTransformerDecoderLayer' +] diff --git a/mmdetection/projects/CO-DETR/codetr/co_atss_head.py b/mmdetection/projects/CO-DETR/codetr/co_atss_head.py new file mode 100644 index 0000000..c6ae018 --- /dev/null +++ b/mmdetection/projects/CO-DETR/codetr/co_atss_head.py @@ -0,0 +1,153 @@ +from typing import List + +import torch +from torch import Tensor + +from mmdet.models.dense_heads import ATSSHead +from mmdet.models.utils import images_to_levels, multi_apply +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptInstanceList, reduce_mean + + +@MODELS.register_module() +class CoATSSHead(ATSSHead): + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + centernesses (list[Tensor]): Centerness for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor, ori_anchors, ori_labels, + ori_bbox_targets) = cls_reg_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + losses_cls, losses_bbox, loss_centerness, \ + bbox_avg_factor = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + centernesses, + labels_list, + label_weights_list, + bbox_targets_list, + avg_factor=avg_factor) + + bbox_avg_factor = sum(bbox_avg_factor) + bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + + # diff + pos_coords = (ori_anchors, ori_labels, ori_bbox_targets, 'atss') + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_centerness=loss_centerness, + pos_coords=pos_coords) + + def get_targets(self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Get targets for ATSS head. + + This method is almost the same as `AnchorHead.get_targets()`. Besides + returning the targets as the parent method does, it also returns the + anchors as the first element of the returned tuple. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. 
multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + + # diff + ori_anchors = all_anchors + ori_labels = all_labels + ori_bbox_targets = all_bbox_targets + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor, ori_anchors, + ori_labels, ori_bbox_targets) diff --git a/mmdetection/projects/CO-DETR/codetr/co_dino_head.py b/mmdetection/projects/CO-DETR/codetr/co_dino_head.py new file mode 100644 index 0000000..192acf9 --- /dev/null +++ b/mmdetection/projects/CO-DETR/codetr/co_dino_head.py @@ -0,0 +1,677 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Linear +from mmcv.ops import batched_nms +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models import DINOHead +from mmdet.models.layers import CdnQueryGenerator +from mmdet.models.layers.transformer import inverse_sigmoid +from mmdet.models.utils import multi_apply +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcywh) +from mmdet.utils import InstanceList, reduce_mean + + +@MODELS.register_module() +class CoDINOHead(DINOHead): + + def __init__(self, + *args, + num_query=900, + transformer=None, + in_channels=2048, + max_pos_coords=300, + dn_cfg=None, + use_zero_padding=False, + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True), + **kwargs): + self.with_box_refine = True + self.mixed_selection = True + self.in_channels = in_channels + self.max_pos_coords = max_pos_coords + self.positional_encoding = positional_encoding + self.num_query = num_query + self.use_zero_padding = use_zero_padding + + if 'two_stage_num_proposals' in transformer: + assert transformer['two_stage_num_proposals'] == num_query, \ + 'two_stage_num_proposals must be equal to num_query for DINO' + else: + transformer['two_stage_num_proposals'] = num_query + transformer['as_two_stage'] = True + if self.mixed_selection: + transformer['mixed_selection'] = self.mixed_selection + self.transformer = transformer + self.act_cfg = transformer.get('act_cfg', + dict(type='ReLU', inplace=True)) + + super().__init__(*args, **kwargs) + + self.activate = MODELS.build(self.act_cfg) + self.positional_encoding = MODELS.build(self.positional_encoding) + self.init_denoising(dn_cfg) + + def _init_layers(self): + self.transformer = MODELS.build(self.transformer) + self.embed_dims = self.transformer.embed_dims + assert hasattr(self.positional_encoding, 'num_feats') + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ + f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ + f' and {num_feats}.' 
+ """Initialize classification branch and regression branch of head.""" + fc_cls = Linear(self.embed_dims, self.cls_out_channels) + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, 4)) + reg_branch = nn.Sequential(*reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. + num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + + self.downsample = nn.Sequential( + nn.Conv2d( + self.embed_dims, + self.embed_dims, + kernel_size=3, + stride=2, + padding=1), nn.GroupNorm(32, self.embed_dims)) + + def init_denoising(self, dn_cfg): + if dn_cfg is not None: + dn_cfg['num_classes'] = self.num_classes + dn_cfg['num_matching_queries'] = self.num_query + dn_cfg['embed_dims'] = self.embed_dims + self.dn_generator = CdnQueryGenerator(**dn_cfg) + + def forward(self, + mlvl_feats, + img_metas, + dn_label_query=None, + dn_bbox_query=None, + attn_mask=None): + batch_size = mlvl_feats[0].size(0) + input_img_h, input_img_w = img_metas[0]['batch_input_shape'] + img_masks = mlvl_feats[0].new_ones( + (batch_size, input_img_h, input_img_w)) + for img_id in range(batch_size): + img_h, img_w = img_metas[img_id]['img_shape'] + img_masks[img_id, :img_h, :img_w] = 0 + + mlvl_masks = [] + mlvl_positional_encodings = [] + for feat in mlvl_feats: + mlvl_masks.append( + F.interpolate(img_masks[None], + size=feat.shape[-2:]).to(torch.bool).squeeze(0)) + mlvl_positional_encodings.append( + self.positional_encoding(mlvl_masks[-1])) + + query_embeds = None + hs, inter_references, topk_score, topk_anchor, enc_outputs = \ + self.transformer( + mlvl_feats, + mlvl_masks, + query_embeds, + mlvl_positional_encodings, + dn_label_query, + dn_bbox_query, + attn_mask, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None # noqa:E501 + ) + outs = [] + num_level = len(mlvl_feats) + start = 0 + for lvl in range(num_level): + bs, c, h, w = mlvl_feats[lvl].shape + end = start + h * w + feat = enc_outputs[start:end].permute(1, 2, 0).contiguous() + start = end + outs.append(feat.reshape(bs, c, h, w)) + outs.append(self.downsample(outs[-1])) + + hs = hs.permute(0, 2, 1, 3) + + if dn_label_query is not None and dn_label_query.size(1) == 0: + # NOTE: If there is no target in the image, the parameters of + # label_embedding won't be used in producing loss, which raises + # RuntimeError when using distributed mode. 
+ hs[0] += self.dn_generator.label_embedding.weight[0, 0] * 0.0 + + outputs_classes = [] + outputs_coords = [] + + for lvl in range(hs.shape[0]): + reference = inter_references[lvl] + reference = inverse_sigmoid(reference, eps=1e-3) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + if reference.shape[-1] == 4: + tmp += reference + else: + assert reference.shape[-1] == 2 + tmp[..., :2] += reference + outputs_coord = tmp.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + return outputs_classes, outputs_coords, topk_score, topk_anchor, outs + + def predict(self, + feats: List[Tensor], + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + outs = self.forward(feats, batch_img_metas) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + + return predictions + + def predict_by_feat(self, + all_cls_scores, + all_bbox_preds, + enc_cls_scores, + enc_bbox_preds, + enc_outputs, + batch_img_metas, + rescale=True): + + cls_scores = all_cls_scores[-1] + bbox_preds = all_bbox_preds[-1] + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + img_meta, rescale) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = True) -> InstanceData: + """Transform outputs from the last decoder layer into bbox predictions + for each image. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_queries, 4]. + img_meta (dict): Image meta info. + rescale (bool): If True, return boxes in original image + space. Default True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + assert len(cls_score) == len(bbox_pred) # num_queries + max_per_img = self.test_cfg.get('max_per_img', self.num_query) + score_thr = self.test_cfg.get('score_thr', 0) + with_nms = self.test_cfg.get('nms', None) + + img_shape = img_meta['img_shape'] + # exclude background + if self.loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + scores, indexes = cls_score.view(-1).topk(max_per_img) + det_labels = indexes % self.num_classes + bbox_index = indexes // self.num_classes + bbox_pred = bbox_pred[bbox_index] + else: + scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) + scores, bbox_index = scores.topk(max_per_img) + bbox_pred = bbox_pred[bbox_index] + det_labels = det_labels[bbox_index] + + if score_thr > 0: + valid_mask = scores > score_thr + scores = scores[valid_mask] + bbox_pred = bbox_pred[valid_mask] + det_labels = det_labels[valid_mask] + + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + assert img_meta.get('scale_factor') is not None + det_bboxes /= det_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + results = InstanceData() + results.bboxes = det_bboxes + results.scores = scores + results.labels = det_labels + + if with_nms and results.bboxes.numel() > 0: + det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores, + results.labels, + self.test_cfg.nms) + results = results[keep_idxs] + results.scores = det_bboxes[:, -1] + results = results[:max_per_img] + + return results + + def loss(self, x, batch_data_samples): + assert self.dn_generator is not None, '"dn_cfg" must be set' + + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + dn_label_query, dn_bbox_query, attn_mask, dn_meta = \ + self.dn_generator(batch_data_samples) + + outs = self(x, batch_img_metas, dn_label_query, dn_bbox_query, + attn_mask) + + loss_inputs = outs[:-1] + (batch_gt_instances, batch_img_metas, + dn_meta) + losses = self.loss_by_feat(*loss_inputs) + enc_outputs = outs[-1] + return losses, enc_outputs + + def forward_aux(self, mlvl_feats, img_metas, aux_targets, head_idx): + """Forward function. + + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 4D-tensor with shape + (N, C, H, W). + img_metas (list[dict]): List of image information. + + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, h). \ + Shape [nb_dec, bs, num_query, 4]. + enc_outputs_class (Tensor): The score of each point on encode \ + feature map, has shape (N, h*w, num_class). Only when \ + as_two_stage is True it would be returned, otherwise \ + `None` would be returned. + enc_outputs_coord (Tensor): The proposal generate from the \ + encode feature map, has shape (N, h*w, 4). Only when \ + as_two_stage is True it would be returned, otherwise \ + `None` would be returned. 
+ """ + aux_coords, aux_labels, aux_targets, aux_label_weights, \ + aux_bbox_weights, aux_feats, attn_masks = aux_targets + batch_size = mlvl_feats[0].size(0) + input_img_h, input_img_w = img_metas[0]['batch_input_shape'] + img_masks = mlvl_feats[0].new_ones( + (batch_size, input_img_h, input_img_w)) + for img_id in range(batch_size): + img_h, img_w = img_metas[img_id]['img_shape'] + img_masks[img_id, :img_h, :img_w] = 0 + + mlvl_masks = [] + mlvl_positional_encodings = [] + for feat in mlvl_feats: + mlvl_masks.append( + F.interpolate(img_masks[None], + size=feat.shape[-2:]).to(torch.bool).squeeze(0)) + mlvl_positional_encodings.append( + self.positional_encoding(mlvl_masks[-1])) + + query_embeds = None + hs, inter_references = self.transformer.forward_aux( + mlvl_feats, + mlvl_masks, + query_embeds, + mlvl_positional_encodings, + aux_coords, + pos_feats=aux_feats, + reg_branches=self.reg_branches if self.with_box_refine else None, + cls_branches=self.cls_branches if self.as_two_stage else None, + return_encoder_output=True, + attn_masks=attn_masks, + head_idx=head_idx) + + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + + for lvl in range(hs.shape[0]): + reference = inter_references[lvl] + reference = inverse_sigmoid(reference, eps=1e-3) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + if reference.shape[-1] == 4: + tmp += reference + else: + assert reference.shape[-1] == 2 + tmp[..., :2] += reference + outputs_coord = tmp.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + return outputs_classes, outputs_coords, None, None + + def loss_aux(self, + x, + pos_coords=None, + head_idx=0, + batch_data_samples=None): + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + gt_bboxes = [b.bboxes for b in batch_gt_instances] + gt_labels = [b.labels for b in batch_gt_instances] + + aux_targets = self.get_aux_targets(pos_coords, batch_img_metas, x, + head_idx) + outs = self.forward_aux(x[:-1], batch_img_metas, aux_targets, head_idx) + outs = outs + aux_targets + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, batch_img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, batch_img_metas) + losses = self.loss_aux_by_feat(*loss_inputs) + return losses + + def get_aux_targets(self, pos_coords, img_metas, mlvl_feats, head_idx): + coords, labels, targets = pos_coords[:3] + head_name = pos_coords[-1] + bs, c = len(coords), mlvl_feats[0].shape[1] + max_num_coords = 0 + all_feats = [] + for i in range(bs): + label = labels[i] + feats = [ + feat[i].reshape(c, -1).transpose(1, 0) for feat in mlvl_feats + ] + feats = torch.cat(feats, dim=0) + bg_class_ind = self.num_classes + pos_inds = ((label >= 0) + & (label < bg_class_ind)).nonzero().squeeze(1) + max_num_coords = max(max_num_coords, len(pos_inds)) + all_feats.append(feats) + max_num_coords = min(self.max_pos_coords, max_num_coords) + max_num_coords = max(9, max_num_coords) + + if self.use_zero_padding: + attn_masks = [] + label_weights = coords[0].new_zeros([bs, max_num_coords]) + else: + attn_masks = None + label_weights = coords[0].new_ones([bs, max_num_coords]) + bbox_weights = coords[0].new_zeros([bs, max_num_coords, 4]) + + aux_coords, aux_labels, aux_targets, aux_feats = [], [], [], [] + + 
for i in range(bs): + coord, label, target = coords[i], labels[i], targets[i] + feats = all_feats[i] + if 'rcnn' in head_name: + feats = pos_coords[-2][i] + num_coords_per_point = 1 + else: + num_coords_per_point = coord.shape[0] // feats.shape[0] + feats = feats.unsqueeze(1).repeat(1, num_coords_per_point, 1) + feats = feats.reshape(feats.shape[0] * num_coords_per_point, + feats.shape[-1]) + img_meta = img_metas[i] + img_h, img_w = img_meta['img_shape'] + factor = coord.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + bg_class_ind = self.num_classes + pos_inds = ((label >= 0) + & (label < bg_class_ind)).nonzero().squeeze(1) + neg_inds = (label == bg_class_ind).nonzero().squeeze(1) + if pos_inds.shape[0] > max_num_coords: + indices = torch.randperm( + pos_inds.shape[0])[:max_num_coords].cuda() + pos_inds = pos_inds[indices] + + coord = bbox_xyxy_to_cxcywh(coord[pos_inds] / factor) + label = label[pos_inds] + target = bbox_xyxy_to_cxcywh(target[pos_inds] / factor) + feat = feats[pos_inds] + + if self.use_zero_padding: + label_weights[i][:len(label)] = 1 + bbox_weights[i][:len(label)] = 1 + attn_mask = torch.zeros([ + max_num_coords, + max_num_coords, + ]).bool().to(coord.device) + else: + bbox_weights[i][:len(label)] = 1 + + if coord.shape[0] < max_num_coords: + padding_shape = max_num_coords - coord.shape[0] + if self.use_zero_padding: + padding_coord = coord.new_zeros([padding_shape, 4]) + padding_label = label.new_ones([padding_shape + ]) * self.num_classes + padding_target = target.new_zeros([padding_shape, 4]) + padding_feat = feat.new_zeros([padding_shape, c]) + attn_mask[coord.shape[0]:, 0:coord.shape[0], ] = True + attn_mask[:, coord.shape[0]:, ] = True + else: + indices = torch.randperm( + neg_inds.shape[0])[:padding_shape].cuda() + neg_inds = neg_inds[indices] + padding_coord = bbox_xyxy_to_cxcywh(coords[i][neg_inds] / + factor) + padding_label = labels[i][neg_inds] + padding_target = bbox_xyxy_to_cxcywh(targets[i][neg_inds] / + factor) + padding_feat = feats[neg_inds] + coord = torch.cat((coord, padding_coord), dim=0) + label = torch.cat((label, padding_label), dim=0) + target = torch.cat((target, padding_target), dim=0) + feat = torch.cat((feat, padding_feat), dim=0) + if self.use_zero_padding: + attn_masks.append(attn_mask.unsqueeze(0)) + aux_coords.append(coord.unsqueeze(0)) + aux_labels.append(label.unsqueeze(0)) + aux_targets.append(target.unsqueeze(0)) + aux_feats.append(feat.unsqueeze(0)) + + if self.use_zero_padding: + attn_masks = torch.cat( + attn_masks, dim=0).unsqueeze(1).repeat(1, 8, 1, 1) + attn_masks = attn_masks.reshape(bs * 8, max_num_coords, + max_num_coords) + else: + attn_masks = None + + aux_coords = torch.cat(aux_coords, dim=0) + aux_labels = torch.cat(aux_labels, dim=0) + aux_targets = torch.cat(aux_targets, dim=0) + aux_feats = torch.cat(aux_feats, dim=0) + aux_label_weights = label_weights + aux_bbox_weights = bbox_weights + return (aux_coords, aux_labels, aux_targets, aux_label_weights, + aux_bbox_weights, aux_feats, attn_masks) + + def loss_aux_by_feat(self, + all_cls_scores, + all_bbox_preds, + enc_cls_scores, + enc_bbox_preds, + aux_coords, + aux_labels, + aux_targets, + aux_label_weights, + aux_bbox_weights, + aux_feats, + attn_masks, + gt_bboxes_list, + gt_labels_list, + img_metas, + gt_bboxes_ignore=None): + num_dec_layers = len(all_cls_scores) + all_labels = [aux_labels for _ in range(num_dec_layers)] + all_label_weights = [aux_label_weights for _ in range(num_dec_layers)] + all_bbox_targets = [aux_targets for _ in range(num_dec_layers)] 
+ all_bbox_weights = [aux_bbox_weights for _ in range(num_dec_layers)] + img_metas_list = [img_metas for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox, losses_iou = multi_apply( + self._loss_aux_by_feat_single, all_cls_scores, all_bbox_preds, + all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + img_metas_list, all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + + # loss from the last decoder layer + loss_dict['loss_cls_aux'] = losses_cls[-1] + loss_dict['loss_bbox_aux'] = losses_bbox[-1] + loss_dict['loss_iou_aux'] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1], + losses_bbox[:-1], + losses_iou[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls_aux'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox_aux'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou_aux'] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def _loss_aux_by_feat_single(self, + cls_scores, + bbox_preds, + labels, + label_weights, + bbox_targets, + bbox_weights, + img_metas, + gt_bboxes_ignore_list=None): + num_imgs = cls_scores.size(0) + num_q = cls_scores.size(1) + + try: + labels = labels.reshape(num_imgs * num_q) + label_weights = label_weights.reshape(num_imgs * num_q) + bbox_targets = bbox_targets.reshape(num_imgs * num_q, 4) + bbox_weights = bbox_weights.reshape(num_imgs * num_q, 4) + except Exception: + return cls_scores.mean() * 0, cls_scores.mean( + ) * 0, cls_scores.mean() * 0 + + bg_class_ind = self.num_classes + num_total_pos = len( + ((labels >= 0) & (labels < bg_class_ind)).nonzero().squeeze(1)) + num_total_neg = num_imgs * num_q - num_total_pos + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + scores = label_weights.new_zeros(labels.shape) + pos_bbox_targets = bbox_targets[pos_inds] + pos_decode_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets) + pos_bbox_pred = bbox_preds.reshape(-1, 4)[pos_inds] + pos_decode_bbox_pred = bbox_cxcywh_to_xyxy(pos_bbox_pred) + scores[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + loss_cls = self.loss_cls( + cls_scores, (labels, scores), + weight=label_weights, + avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(img_metas, bbox_preds): + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. 
So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou diff --git a/mmdetection/projects/CO-DETR/codetr/co_roi_head.py b/mmdetection/projects/CO-DETR/codetr/co_roi_head.py new file mode 100644 index 0000000..9aafb53 --- /dev/null +++ b/mmdetection/projects/CO-DETR/codetr/co_roi_head.py @@ -0,0 +1,108 @@ +from typing import List, Tuple + +import torch +from torch import Tensor + +from mmdet.models.roi_heads import StandardRoIHead +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.utils import unpack_gt_instances +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList + + +@MODELS.register_module() +class CoStandardRoIHead(StandardRoIHead): + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + max_proposal = 2000 + + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + losses = dict() + # bbox head forward and loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results) + losses.update(bbox_results['loss_bbox']) + + bbox_targets = bbox_results['bbox_targets'] + for res in sampling_results: + max_proposal = min(max_proposal, res.bboxes.shape[0]) + ori_coords = bbox2roi([res.bboxes for res in sampling_results]) + ori_proposals, ori_labels, \ + ori_bbox_targets, ori_bbox_feats = [], [], [], [] + for i in range(num_imgs): + idx = (ori_coords[:, 0] == i).nonzero().squeeze(1) + idx = idx[:max_proposal] + ori_proposal = ori_coords[idx][:, 1:].unsqueeze(0) + ori_label = bbox_targets[0][idx].unsqueeze(0) + ori_bbox_target = bbox_targets[2][idx].unsqueeze(0) + ori_bbox_feat = bbox_results['bbox_feats'].mean(-1).mean(-1) + ori_bbox_feat = ori_bbox_feat[idx].unsqueeze(0) + ori_proposals.append(ori_proposal) + ori_labels.append(ori_label) + ori_bbox_targets.append(ori_bbox_target) + ori_bbox_feats.append(ori_bbox_feat) + ori_coords = torch.cat(ori_proposals, dim=0) + ori_labels = torch.cat(ori_labels, dim=0) + ori_bbox_targets = torch.cat(ori_bbox_targets, dim=0) + ori_bbox_feats = torch.cat(ori_bbox_feats, dim=0) + pos_coords = (ori_coords, ori_labels, ori_bbox_targets, + ori_bbox_feats, 'rcnn') + losses.update(pos_coords=pos_coords) + + return losses + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Perform 
forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + # diff + bbox_results.update(bbox_targets=bbox_loss_and_target['bbox_targets']) + return bbox_results diff --git a/mmdetection/projects/CO-DETR/codetr/codetr.py b/mmdetection/projects/CO-DETR/codetr/codetr.py new file mode 100644 index 0000000..82826f6 --- /dev/null +++ b/mmdetection/projects/CO-DETR/codetr/codetr.py @@ -0,0 +1,320 @@ +import copy +from typing import Tuple, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.models.detectors.base import BaseDetector +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class CoDETR(BaseDetector): + + def __init__( + self, + backbone, + neck=None, + query_head=None, # detr head + rpn_head=None, # two-stage rpn + roi_head=[None], # two-stage + bbox_head=[None], # one-stage + train_cfg=[None, None], + test_cfg=[None, None], + # Control whether to consider positive samples + # from the auxiliary head as additional positive queries. + with_pos_coord=True, + use_lsj=True, + eval_module='detr', + # Evaluate the Nth head. 
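+                 # e.g. eval_module='two-stage' with eval_index=0 evaluates
+                 # roi_head[0], while eval_module='one-stage' evaluates
+                 # bbox_head[eval_index].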
+ eval_index=0, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(CoDETR, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.with_pos_coord = with_pos_coord + self.use_lsj = use_lsj + + assert eval_module in ['detr', 'one-stage', 'two-stage'] + self.eval_module = eval_module + + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + # Module index for evaluation + self.eval_index = eval_index + head_idx = 0 + if query_head is not None: + query_head.update(train_cfg=train_cfg[head_idx] if ( + train_cfg is not None and train_cfg[head_idx] is not None + ) else None) + query_head.update(test_cfg=test_cfg[head_idx]) + self.query_head = MODELS.build(query_head) + self.query_head.init_weights() + head_idx += 1 + + if rpn_head is not None: + rpn_train_cfg = train_cfg[head_idx].rpn if ( + train_cfg is not None + and train_cfg[head_idx] is not None) else None + rpn_head_ = rpn_head.copy() + rpn_head_.update( + train_cfg=rpn_train_cfg, test_cfg=test_cfg[head_idx].rpn) + self.rpn_head = MODELS.build(rpn_head_) + self.rpn_head.init_weights() + + self.roi_head = nn.ModuleList() + for i in range(len(roi_head)): + if roi_head[i]: + rcnn_train_cfg = train_cfg[i + head_idx].rcnn if ( + train_cfg + and train_cfg[i + head_idx] is not None) else None + roi_head[i].update(train_cfg=rcnn_train_cfg) + roi_head[i].update(test_cfg=test_cfg[i + head_idx].rcnn) + self.roi_head.append(MODELS.build(roi_head[i])) + self.roi_head[-1].init_weights() + + self.bbox_head = nn.ModuleList() + for i in range(len(bbox_head)): + if bbox_head[i]: + bbox_head[i].update( + train_cfg=train_cfg[i + head_idx + len(self.roi_head)] if ( + train_cfg and train_cfg[i + head_idx + + len(self.roi_head)] is not None + ) else None) + bbox_head[i].update(test_cfg=test_cfg[i + head_idx + + len(self.roi_head)]) + self.bbox_head.append(MODELS.build(bbox_head[i])) + self.bbox_head[-1].init_weights() + + self.head_idx = head_idx + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + @property + def with_rpn(self): + """bool: whether the detector has RPN""" + return hasattr(self, 'rpn_head') and self.rpn_head is not None + + @property + def with_query_head(self): + """bool: whether the detector has a RoI head""" + return hasattr(self, 'query_head') and self.query_head is not None + + @property + def with_roi_head(self): + """bool: whether the detector has a RoI head""" + return hasattr(self, 'roi_head') and self.roi_head is not None and len( + self.roi_head) > 0 + + @property + def with_shared_head(self): + """bool: whether the detector has a shared head in the RoI Head""" + return hasattr(self, 'roi_head') and self.roi_head[0].with_shared_head + + @property + def with_bbox(self): + """bool: whether the detector has a bbox head""" + return ((hasattr(self, 'roi_head') and self.roi_head is not None + and len(self.roi_head) > 0) + or (hasattr(self, 'bbox_head') and self.bbox_head is not None + and len(self.bbox_head) > 0)) + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor, has shape (bs, dim, H, W). + + Returns: + tuple[Tensor]: Tuple of feature maps from neck. Each feature map + has shape (bs, dim, H, W). 
+ """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x + + def _forward(self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None): + pass + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + batch_input_shape = batch_data_samples[0].batch_input_shape + if self.use_lsj: + for data_samples in batch_data_samples: + img_metas = data_samples.metainfo + input_img_h, input_img_w = batch_input_shape + img_metas['img_shape'] = [input_img_h, input_img_w] + + x = self.extract_feat(batch_inputs) + + losses = dict() + + def upd_loss(losses, idx, weight=1): + new_losses = dict() + for k, v in losses.items(): + new_k = '{}{}'.format(k, idx) + if isinstance(v, list) or isinstance(v, tuple): + new_losses[new_k] = [i * weight for i in v] + else: + new_losses[new_k] = v * weight + return new_losses + + # DETR encoder and decoder forward + if self.with_query_head: + bbox_losses, x = self.query_head.loss(x, batch_data_samples) + losses.update(bbox_losses) + + # RPN forward and loss + if self.with_rpn: + proposal_cfg = self.train_cfg[self.head_idx].get( + 'rpn_proposal', self.test_cfg[self.head_idx].rpn) + + rpn_data_samples = copy.deepcopy(batch_data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, proposal_list = self.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) + + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in list(keys): + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + + losses.update(rpn_losses) + else: + assert batch_data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. + proposal_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + positive_coords = [] + for i in range(len(self.roi_head)): + roi_losses = self.roi_head[i].loss(x, proposal_list, + batch_data_samples) + if self.with_pos_coord: + positive_coords.append(roi_losses.pop('pos_coords')) + else: + if 'pos_coords' in roi_losses.keys(): + roi_losses.pop('pos_coords') + roi_losses = upd_loss(roi_losses, idx=i) + losses.update(roi_losses) + + for i in range(len(self.bbox_head)): + bbox_losses = self.bbox_head[i].loss(x, batch_data_samples) + if self.with_pos_coord: + pos_coords = bbox_losses.pop('pos_coords') + positive_coords.append(pos_coords) + else: + if 'pos_coords' in bbox_losses.keys(): + bbox_losses.pop('pos_coords') + bbox_losses = upd_loss(bbox_losses, idx=i + len(self.roi_head)) + losses.update(bbox_losses) + + if self.with_pos_coord and len(positive_coords) > 0: + for i in range(len(positive_coords)): + bbox_losses = self.query_head.loss_aux(x, positive_coords[i], + i, batch_data_samples) + bbox_losses = upd_loss(bbox_losses, idx=i) + losses.update(bbox_losses) + + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs, has shape (bs, dim, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + rescale (bool): Whether to rescale the results. 
+ Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the input images. + Each DetDataSample usually contain 'pred_instances'. And the + `pred_instances` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert self.eval_module in ['detr', 'one-stage', 'two-stage'] + + if self.use_lsj: + for data_samples in batch_data_samples: + img_metas = data_samples.metainfo + input_img_h, input_img_w = img_metas['batch_input_shape'] + img_metas['img_shape'] = [input_img_h, input_img_w] + + img_feats = self.extract_feat(batch_inputs) + if self.with_bbox and self.eval_module == 'one-stage': + results_list = self.predict_bbox_head( + img_feats, batch_data_samples, rescale=rescale) + elif self.with_roi_head and self.eval_module == 'two-stage': + results_list = self.predict_roi_head( + img_feats, batch_data_samples, rescale=rescale) + else: + results_list = self.predict_query_head( + img_feats, batch_data_samples, rescale=rescale) + + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples + + def predict_query_head(self, + mlvl_feats: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + return self.query_head.predict( + mlvl_feats, batch_data_samples=batch_data_samples, rescale=rescale) + + def predict_roi_head(self, + mlvl_feats: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + assert self.with_bbox, 'Bbox head must be implemented.' + if self.with_query_head: + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + results = self.query_head.forward(mlvl_feats, batch_img_metas) + mlvl_feats = results[-1] + rpn_results_list = self.rpn_head.predict( + mlvl_feats, batch_data_samples, rescale=False) + return self.roi_head[self.eval_index].predict( + mlvl_feats, rpn_results_list, batch_data_samples, rescale=rescale) + + def predict_bbox_head(self, + mlvl_feats: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + assert self.with_bbox, 'Bbox head must be implemented.' 
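+        # When a query head is present, run it first and reuse the encoder
+        # feature maps it returns (the last element of its outputs) as the
+        # input features of the auxiliary one-stage head.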
+ if self.with_query_head: + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + results = self.query_head.forward(mlvl_feats, batch_img_metas) + mlvl_feats = results[-1] + return self.bbox_head[self.eval_index].predict( + mlvl_feats, batch_data_samples, rescale=rescale) diff --git a/mmdetection/projects/CO-DETR/codetr/transformer.py b/mmdetection/projects/CO-DETR/codetr/transformer.py new file mode 100644 index 0000000..009f94a --- /dev/null +++ b/mmdetection/projects/CO-DETR/codetr/transformer.py @@ -0,0 +1,1376 @@ +import math +import warnings + +import torch +import torch.nn as nn +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, + TransformerLayerSequence, + build_transformer_layer_sequence) +from mmcv.ops import MultiScaleDeformableAttention +from mmengine.model import BaseModule +from mmengine.model.weight_init import xavier_init +from torch.nn.init import normal_ + +from mmdet.models.layers.transformer import inverse_sigmoid +from mmdet.registry import MODELS + +try: + from fairscale.nn.checkpoint import checkpoint_wrapper +except Exception: + checkpoint_wrapper = None + +# In order to save the cost and effort of reproduction, +# I did not refactor it into the style of mmdet 3.x DETR. + + +class Transformer(BaseModule): + """Implements the DETR transformer. + + Following the official DETR implementation, this module copy-paste + from torch.nn.Transformer with modifications: + + * positional encodings are passed in MultiheadAttention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers + + See `paper: End-to-End Object Detection with Transformers + `_ for details. + + Args: + encoder (`mmcv.ConfigDict` | Dict): Config of + TransformerEncoder. Defaults to None. + decoder ((`mmcv.ConfigDict` | Dict)): Config of + TransformerDecoder. Defaults to None + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Defaults to None. + """ + + def __init__(self, encoder=None, decoder=None, init_cfg=None): + super(Transformer, self).__init__(init_cfg=init_cfg) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = self.encoder.embed_dims + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + self._is_init = True + + def forward(self, x, mask, query_embed, pos_embed): + """Forward function for `Transformer`. + + Args: + x (Tensor): Input query with shape [bs, c, h, w] where + c = embed_dims. + mask (Tensor): The key_padding_mask used for encoder and decoder, + with shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, with shape + [num_query, c]. + pos_embed (Tensor): The positional encoding for encoder and + decoder, with the same shape as `x`. + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - out_dec: Output from decoder. If return_intermediate_dec \ + is True output has shape [num_dec_layers, bs, + num_query, embed_dims], else has shape [1, bs, \ + num_query, embed_dims]. + - memory: Output results from encoder, with shape \ + [bs, embed_dims, h, w]. 
+ """ + bs, c, h, w = x.shape + # use `view` instead of `flatten` for dynamically exporting to ONNX + x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c] + pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat( + 1, bs, 1) # [num_query, dim] -> [num_query, bs, dim] + mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w] + memory = self.encoder( + query=x, + key=None, + value=None, + query_pos=pos_embed, + query_key_padding_mask=mask) + target = torch.zeros_like(query_embed) + # out_dec: [num_layers, num_query, bs, dim] + out_dec = self.decoder( + query=target, + key=memory, + value=memory, + key_pos=pos_embed, + query_pos=query_embed, + key_padding_mask=mask) + out_dec = out_dec.transpose(1, 2) + memory = memory.permute(1, 2, 0).reshape(bs, c, h, w) + return out_dec, memory + + +@MODELS.register_module(force=True) +class DeformableDetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + + super(DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + def forward(self, + query, + *args, + reference_points=None, + valid_ratios=None, + reg_branches=None, + **kwargs): + """Forward function for `TransformerDecoder`. + + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = reference_points[:, :, None] * \ + torch.cat([valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * \ + valid_ratios[:, None] + output = layer( + output, + *args, + reference_points=reference_points_input, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid( + reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + assert reference_points.shape[-1] == 2 + new_reference_points = tmp + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@MODELS.register_module(force=True) +class DeformableDetrTransformer(Transformer): + """Implements the DeformableDETR transformer. + + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. + """ + + def __init__(self, + as_two_stage=False, + num_feature_levels=4, + two_stage_num_proposals=300, + **kwargs): + super(DeformableDetrTransformer, self).__init__(**kwargs) + self.as_two_stage = as_two_stage + self.num_feature_levels = num_feature_levels + self.two_stage_num_proposals = two_stage_num_proposals + self.embed_dims = self.encoder.embed_dims + self.init_layers() + + def init_layers(self): + """Initialize layers of the DeformableDetrTransformer.""" + self.level_embeds = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + + if self.as_two_stage: + self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) + self.enc_output_norm = nn.LayerNorm(self.embed_dims) + self.pos_trans = nn.Linear(self.embed_dims * 2, + self.embed_dims * 2) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) + else: + self.reference_points = nn.Linear(self.embed_dims, 2) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + if not self.as_two_stage: + xavier_init(self.reference_points, distribution='uniform', bias=0.) + normal_(self.level_embeds) + + def gen_encoder_output_proposals(self, memory, memory_padding_mask, + spatial_shapes): + """Generate proposals from encoded memory. + + Args: + memory (Tensor) : The output of encoder, + has shape (bs, num_key, embed_dim). num_key is + equal the number of points on feature map from + all level. + memory_padding_mask (Tensor): Padding mask for memory. + has shape (bs, num_key). + spatial_shapes (Tensor): The shape of all feature maps. + has shape (num_level, 2). 
+ + Returns: + tuple: A tuple of feature map and bbox prediction. + + - output_memory (Tensor): The input of decoder, \ + has shape (bs, num_key, embed_dim). num_key is \ + equal the number of points on feature map from \ + all levels. + - output_proposals (Tensor): The normalized proposal \ + after a inverse sigmoid, has shape \ + (bs, num_keys, 4). + """ + + N, S, C = memory.shape + proposals = [] + _cur = 0 + for lvl, (H, W) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].view( + N, H, W, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid( + torch.linspace( + 0, H - 1, H, dtype=torch.float32, device=memory.device), + torch.linspace( + 0, W - 1, W, dtype=torch.float32, device=memory.device)) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_W.unsqueeze(-1), + valid_H.unsqueeze(-1)], 1).view(N, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + proposal = torch.cat((grid, wh), -1).view(N, -1, 4) + proposals.append(proposal) + _cur += (H * W) + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & + (output_proposals < 0.99)).all( + -1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill( + memory_padding_mask.unsqueeze(-1), float('inf')) + output_proposals = output_proposals.masked_fill( + ~output_proposals_valid, float('inf')) + + output_memory = memory + output_memory = output_memory.masked_fill( + memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, + float(0)) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + return output_memory, output_proposals + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """Get the reference points used in decoder. + + Args: + spatial_shapes (Tensor): The shape of all + feature maps, has shape (num_level, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + device (obj:`device`): The device where + reference_points should be. + + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). 
+ """ + reference_points_list = [] + for lvl, (H, W) in enumerate(spatial_shapes): + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=torch.float32, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 1] * H) + ref_x = ref_x.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 0] * W) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def get_valid_ratio(self, mask): + """Get the valid radios of feature maps of all level.""" + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def get_proposal_pos_embed(self, + proposals, + num_pos_feats=128, + temperature=10000): + """Get the position embedding of proposal.""" + scale = 2 * math.pi + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) + # N, L, 4 + proposals = proposals.sigmoid() * scale + # N, L, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 4, 64, 2 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), + dim=4).flatten(2) + return pos + + def forward(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + reg_branches=None, + cls_branches=None, + **kwargs): + """Forward function for `Transformer`. + + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, embed_dims, h, w]. + mlvl_masks (list(Tensor)): The key_padding_mask from + different level used for encoder and decoder, + each element has shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + mlvl_pos_embeds (list(Tensor)): The positional encoding + of feats from different level, has the shape + [bs, embed_dims, h, w]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when + `with_box_refine` is True. Default to None. + cls_branches (obj:`nn.ModuleList`): Classification heads + for feature maps from each decoder layer. Only would + be passed when `as_two_stage` + is True. Default to None. + + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. 
+ """ + assert self.as_two_stage or query_embed is not None + + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = \ + self.get_reference_points(spatial_shapes, + valid_ratios, + device=feat.device) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( + 1, 0, 2) # (H*W, bs, embed_dims) + memory = self.encoder( + query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs) + + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + if self.as_two_stage: + output_memory, output_proposals = \ + self.gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = \ + reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + topk = self.two_stage_num_proposals + # We only use the first channel in enc_outputs_class as foreground, + # the other (num_classes - 1) channels are actually not used. + # Its targets are set to be 0s, which indicates the first + # class (foreground) because we use [0, num_classes - 1] to + # indicate class labels, background class is indicated by + # num_classes (similar convention in RPN). + # See https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/deformable_detr_head.py#L241 # noqa + # This follows the official implementation of Deformable DETR. 
+ topk_proposals = torch.topk( + enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords_unact = topk_coords_unact.detach() + reference_points = topk_coords_unact.sigmoid() + init_reference_out = reference_points + pos_trans_out = self.pos_trans_norm( + self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) + query_pos, query = torch.split(pos_trans_out, c, dim=2) + else: + query_pos, query = torch.split(query_embed, c, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos).sigmoid() + init_reference_out = reference_points + + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + query_pos=query_pos, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + **kwargs) + + inter_references_out = inter_references + if self.as_two_stage: + return inter_states, init_reference_out,\ + inter_references_out, enc_outputs_class,\ + enc_outputs_coord_unact + return inter_states, init_reference_out, \ + inter_references_out, None, None + + +@MODELS.register_module() +class CoDeformableDetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, + *args, + return_intermediate=False, + look_forward_twice=False, + **kwargs): + + super(CoDeformableDetrTransformerDecoder, + self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.look_forward_twice = look_forward_twice + + def forward(self, + query, + *args, + reference_points=None, + valid_ratios=None, + reg_branches=None, + **kwargs): + """Forward function for `TransformerDecoder`. + + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
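+            When `look_forward_twice` is True, the stored intermediate
+            reference points are the refined (un-detached) predictions of
+            each layer instead of the detached inputs of the next layer.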
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = reference_points[:, :, None] * \ + torch.cat([valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * \ + valid_ratios[:, None] + output = layer( + output, + *args, + reference_points=reference_points_input, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid( + reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + assert reference_points.shape[-1] == 2 + new_reference_points = tmp + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append( + new_reference_points if self. + look_forward_twice else reference_points) + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@MODELS.register_module() +class CoDeformableDetrTransformer(DeformableDetrTransformer): + + def __init__(self, + mixed_selection=True, + with_pos_coord=True, + with_coord_feat=True, + num_co_heads=1, + **kwargs): + self.mixed_selection = mixed_selection + self.with_pos_coord = with_pos_coord + self.with_coord_feat = with_coord_feat + self.num_co_heads = num_co_heads + super(CoDeformableDetrTransformer, self).__init__(**kwargs) + self._init_layers() + + def _init_layers(self): + """Initialize layers of the CoDeformableDetrTransformer.""" + if self.with_pos_coord: + if self.num_co_heads > 0: + # bug: this code should be 'self.head_pos_embed = + # nn.Embedding(self.num_co_heads, self.embed_dims)', + # we keep this bug for reproducing our results with ResNet-50. + # You can fix this bug when reproducing results with + # swin transformer. 
+ self.head_pos_embed = nn.Embedding(self.num_co_heads, 1, 1, + self.embed_dims) + self.aux_pos_trans = nn.ModuleList() + self.aux_pos_trans_norm = nn.ModuleList() + self.pos_feats_trans = nn.ModuleList() + self.pos_feats_norm = nn.ModuleList() + for i in range(self.num_co_heads): + self.aux_pos_trans.append( + nn.Linear(self.embed_dims * 2, self.embed_dims * 2)) + self.aux_pos_trans_norm.append( + nn.LayerNorm(self.embed_dims * 2)) + if self.with_coord_feat: + self.pos_feats_trans.append( + nn.Linear(self.embed_dims, self.embed_dims)) + self.pos_feats_norm.append( + nn.LayerNorm(self.embed_dims)) + + def get_proposal_pos_embed(self, + proposals, + num_pos_feats=128, + temperature=10000): + """Get the position embedding of proposal.""" + num_pos_feats = self.embed_dims // 2 + scale = 2 * math.pi + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) + # N, L, 4 + proposals = proposals.sigmoid() * scale + # N, L, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 4, 64, 2 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), + dim=4).flatten(2) + return pos + + def forward(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + reg_branches=None, + cls_branches=None, + return_encoder_output=False, + attn_masks=None, + **kwargs): + """Forward function for `Transformer`. + + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, embed_dims, h, w]. + mlvl_masks (list(Tensor)): The key_padding_mask from + different level used for encoder and decoder, + each element has shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + mlvl_pos_embeds (list(Tensor)): The positional encoding + of feats from different level, has the shape + [bs, embed_dims, h, w]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when + `with_box_refine` is True. Default to None. + cls_branches (obj:`nn.ModuleList`): Classification heads + for feature maps from each decoder layer. Only would + be passed when `as_two_stage` + is True. Default to None. + + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. 
+ """ + assert self.as_two_stage or query_embed is not None + + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = \ + self.get_reference_points(spatial_shapes, + valid_ratios, + device=feat.device) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( + 1, 0, 2) # (H*W, bs, embed_dims) + memory = self.encoder( + query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs) + + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + if self.as_two_stage: + output_memory, output_proposals = \ + self.gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = \ + reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + topk = self.two_stage_num_proposals + topk = query_embed.shape[0] + topk_proposals = torch.topk( + enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords_unact = topk_coords_unact.detach() + reference_points = topk_coords_unact.sigmoid() + init_reference_out = reference_points + pos_trans_out = self.pos_trans_norm( + self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) + + if not self.mixed_selection: + query_pos, query = torch.split(pos_trans_out, c, dim=2) + else: + # query_embed here is the content embed for deformable DETR + query = query_embed.unsqueeze(0).expand(bs, -1, -1) + query_pos, _ = torch.split(pos_trans_out, c, dim=2) + else: + query_pos, query = torch.split(query_embed, c, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos).sigmoid() + init_reference_out = reference_points + + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + query_pos=query_pos, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + attn_masks=attn_masks, + **kwargs) + + 
inter_references_out = inter_references + if self.as_two_stage: + if return_encoder_output: + return inter_states, init_reference_out,\ + inter_references_out, enc_outputs_class,\ + enc_outputs_coord_unact, memory + return inter_states, init_reference_out,\ + inter_references_out, enc_outputs_class,\ + enc_outputs_coord_unact + if return_encoder_output: + return inter_states, init_reference_out, \ + inter_references_out, None, None, memory + return inter_states, init_reference_out, \ + inter_references_out, None, None + + def forward_aux(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + pos_anchors, + pos_feats=None, + reg_branches=None, + cls_branches=None, + return_encoder_output=False, + attn_masks=None, + head_idx=0, + **kwargs): + feat_flatten = [] + mask_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + + memory = feat_flatten + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + + topk_coords_unact = inverse_sigmoid(pos_anchors) + reference_points = pos_anchors + init_reference_out = reference_points + if self.num_co_heads > 0: + pos_trans_out = self.aux_pos_trans_norm[head_idx]( + self.aux_pos_trans[head_idx]( + self.get_proposal_pos_embed(topk_coords_unact))) + query_pos, query = torch.split(pos_trans_out, c, dim=2) + if self.with_coord_feat: + query = query + self.pos_feats_norm[head_idx]( + self.pos_feats_trans[head_idx](pos_feats)) + query_pos = query_pos + self.head_pos_embed.weight[head_idx] + + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + query_pos=query_pos, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + attn_masks=attn_masks, + **kwargs) + + inter_references_out = inter_references + return inter_states, init_reference_out, \ + inter_references_out + + +def build_MLP(input_dim, hidden_dim, output_dim, num_layers): + assert num_layers > 1, \ + f'num_layers should be greater than 1 but got {num_layers}' + h = [hidden_dim] * (num_layers - 1) + layers = list() + for n, k in zip([input_dim] + h[:-1], h): + layers.extend((nn.Linear(n, k), nn.ReLU())) + # Note that the relu func of MLP in original DETR repo is set + # 'inplace=False', however the ReLU cfg of FFN in mmdet is set + # 'inplace=True' by default. 
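+    # For example (illustrative values), build_MLP(512, 256, 256, 2) builds
+    # Linear(512, 256) -> ReLU() -> Linear(256, 256); this is how
+    # `ref_point_head` is constructed below when embed_dims is 256.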
+ layers.append(nn.Linear(hidden_dim, output_dim)) + return nn.Sequential(*layers) + + +@MODELS.register_module() +class DinoTransformerDecoder(DeformableDetrTransformerDecoder): + + def __init__(self, *args, **kwargs): + super(DinoTransformerDecoder, self).__init__(*args, **kwargs) + self._init_layers() + + def _init_layers(self): + self.ref_point_head = build_MLP(self.embed_dims * 2, self.embed_dims, + self.embed_dims, 2) + self.norm = nn.LayerNorm(self.embed_dims) + + @staticmethod + def gen_sineembed_for_position(pos_tensor, pos_feat): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = torch.arange( + pos_feat, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000**(2 * (dim_t // 2) / pos_feat) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), + dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), + dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack( + (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), + dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack( + (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), + dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError('Unknown pos_tensor shape(-1):{}'.format( + pos_tensor.size(-1))) + return pos + + def forward(self, + query, + *args, + reference_points=None, + valid_ratios=None, + reg_branches=None, + **kwargs): + output = query + intermediate = [] + intermediate_reference_points = [reference_points] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = \ + reference_points[:, :, None] * torch.cat( + [valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = \ + reference_points[:, :, None] * valid_ratios[:, None] + + query_sine_embed = self.gen_sineembed_for_position( + reference_points_input[:, :, 0, :], self.embed_dims // 2) + query_pos = self.ref_point_head(query_sine_embed) + + query_pos = query_pos.permute(1, 0, 2) + output = layer( + output, + *args, + query_pos=query_pos, + reference_points=reference_points_input, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + assert reference_points.shape[-1] == 4 + new_reference_points = tmp + inverse_sigmoid( + reference_points, eps=1e-3) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(self.norm(output)) + intermediate_reference_points.append(new_reference_points) + # NOTE this is for the "Look Forward Twice" module, + # in the DeformDETR, reference_points was appended. 
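+                # Appending the un-detached `new_reference_points` (rather
+                # than the detached `reference_points`) lets the box loss of
+                # layer i+1 also back-propagate into layer i's refinement,
+                # while the next layer still consumes the detached boxes.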
+ + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@MODELS.register_module() +class CoDinoTransformer(CoDeformableDetrTransformer): + + def __init__(self, *args, **kwargs): + super(CoDinoTransformer, self).__init__(*args, **kwargs) + + def init_layers(self): + """Initialize layers of the DinoTransformer.""" + self.level_embeds = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) + self.enc_output_norm = nn.LayerNorm(self.embed_dims) + self.query_embed = nn.Embedding(self.two_stage_num_proposals, + self.embed_dims) + + def _init_layers(self): + if self.with_pos_coord: + if self.num_co_heads > 0: + self.aux_pos_trans = nn.ModuleList() + self.aux_pos_trans_norm = nn.ModuleList() + self.pos_feats_trans = nn.ModuleList() + self.pos_feats_norm = nn.ModuleList() + for i in range(self.num_co_heads): + self.aux_pos_trans.append( + nn.Linear(self.embed_dims * 2, self.embed_dims)) + self.aux_pos_trans_norm.append( + nn.LayerNorm(self.embed_dims)) + if self.with_coord_feat: + self.pos_feats_trans.append( + nn.Linear(self.embed_dims, self.embed_dims)) + self.pos_feats_norm.append( + nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + super().init_weights() + nn.init.normal_(self.query_embed.weight.data) + + def forward(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + dn_label_query, + dn_bbox_query, + attn_mask, + reg_branches=None, + cls_branches=None, + **kwargs): + assert self.as_two_stage and query_embed is None, \ + 'as_two_stage must be True for DINO' + + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios, device=feat.device) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( + 1, 0, 2) # (H*W, bs, embed_dims) + memory = self.encoder( + query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs) + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = 
reg_branches[self.decoder.num_layers]( + output_memory) + output_proposals + cls_out_features = cls_branches[self.decoder.num_layers].out_features + topk = self.two_stage_num_proposals + # NOTE In DeformDETR, enc_outputs_class[..., 0] is used for topk + topk_indices = torch.topk(enc_outputs_class.max(-1)[0], topk, dim=1)[1] + + topk_score = torch.gather( + enc_outputs_class, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, 4)) + topk_anchor = topk_coords_unact.sigmoid() + topk_coords_unact = topk_coords_unact.detach() + + query = self.query_embed.weight[:, None, :].repeat(1, bs, + 1).transpose(0, 1) + # NOTE the query_embed here is not spatial query as in DETR. + # It is actually content query, which is named tgt in other + # DETR-like models + if dn_label_query is not None: + query = torch.cat([dn_label_query, query], dim=1) + if dn_bbox_query is not None: + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], + dim=1) + else: + reference_points = topk_coords_unact + reference_points = reference_points.sigmoid() + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + attn_masks=attn_mask, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + **kwargs) + + inter_references_out = inter_references + + return inter_states, inter_references_out, \ + topk_score, topk_anchor, memory + + def forward_aux(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + pos_anchors, + pos_feats=None, + reg_branches=None, + cls_branches=None, + return_encoder_output=False, + attn_masks=None, + head_idx=0, + **kwargs): + feat_flatten = [] + mask_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + + memory = feat_flatten + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + + topk_coords_unact = inverse_sigmoid(pos_anchors) + reference_points = pos_anchors + if self.num_co_heads > 0: + pos_trans_out = self.aux_pos_trans_norm[head_idx]( + self.aux_pos_trans[head_idx]( + self.get_proposal_pos_embed(topk_coords_unact))) + query = pos_trans_out + if self.with_coord_feat: + query = query + self.pos_feats_norm[head_idx]( + self.pos_feats_trans[head_idx](pos_feats)) + + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + attn_masks=None, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + 
level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + **kwargs) + + inter_references_out = inter_references + + return inter_states, inter_references_out + + +@MODELS.register_module() +class DetrTransformerEncoder(TransformerLayerSequence): + """TransformerEncoder of DETR. + + Args: + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. Only used when `self.pre_norm` is `True` + """ + + def __init__(self, + *args, + post_norm_cfg=dict(type='LN'), + with_cp=-1, + **kwargs): + super(DetrTransformerEncoder, self).__init__(*args, **kwargs) + if post_norm_cfg is not None: + self.post_norm = build_norm_layer( + post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None + else: + assert not self.pre_norm, f'Use prenorm in ' \ + f'{self.__class__.__name__},' \ + f'Please specify post_norm_cfg' + self.post_norm = None + self.with_cp = with_cp + if self.with_cp > 0: + if checkpoint_wrapper is None: + warnings.warn('If you want to reduce GPU memory usage, \ + please install fairscale by executing the \ + following command: pip install fairscale.') + return + for i in range(self.with_cp): + self.layers[i] = checkpoint_wrapper(self.layers[i]) + + +@MODELS.register_module() +class DetrTransformerDecoderLayer(BaseTransformerLayer): + """Implements decoder layer in DETR transformer. + + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. + """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(DetrTransformerDecoderLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) diff --git a/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py new file mode 100644 index 0000000..1a41304 --- /dev/null +++ b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_8xb2_1x_coco.py @@ -0,0 +1,68 @@ +_base_ = './co_dino_5scale_r50_lsj_8xb2_1x_coco.py' + +model = dict( + use_lsj=False, data_preprocessor=dict(pad_mask=False, batch_augments=None)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. 
+train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + dataset=dict( + _delete_=True, + type=_base_.dataset_type, + data_root=_base_.data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline, + backend_args=_base_.backend_args)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='Resize', scale=(1333, 800), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py new file mode 100644 index 0000000..876b90f --- /dev/null +++ b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_1x_coco.py @@ -0,0 +1,359 @@ +_base_ = 'mmdet::common/ssj_scp_270k_coco-instance.py' + +custom_imports = dict( + imports=['projects.CO-DETR.codetr'], allow_failed_imports=False) + +# model settings +num_dec_layer = 6 +loss_lambda = 2.0 +num_classes = 80 + +image_size = (1024, 1024) +batch_augments = [ + dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) +] +model = dict( + type='CoDETR', + # If using the lsj augmentation, + # it is recommended to set it to True. 
+ use_lsj=True, + # detr: 52.1 + # one-stage: 49.4 + # two-stage: 47.9 + eval_module='detr', # in ['detr', 'one-stage', 'two-stage'] + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + batch_augments=batch_augments), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[256, 512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=5), + query_head=dict( + type='CoDINOHead', + num_query=900, + num_classes=num_classes, + in_channels=2048, + as_two_stage=True, + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)), + transformer=dict( + type='CoDinoTransformer', + with_coord_feat=False, + num_co_heads=2, # ATSS Aux Head + Faster RCNN Aux Head + num_feature_levels=5, + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + # number of layers that use checkpoint. + # The maximum value for the setting is num_layers. + # FairScale must be installed for it to work. + with_cp=4, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_levels=5, + dropout=0.0), + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DinoTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.0), + dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_levels=5, + dropout=0.0), + ], + feedforward_channels=2048, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + temperature=20, + normalize=True), + loss_cls=dict( # Different from the DINO + type='QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0 * num_dec_layer * loss_lambda), + loss_bbox=dict( + type='L1Loss', loss_weight=1.0 * num_dec_layer * loss_lambda)), + roi_head=[ + dict( + type='CoStandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32, 64], + finest_scale=56), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=num_classes, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 
0.1, 0.2, 0.2]), + reg_class_agnostic=False, + reg_decoded_bbox=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0 * num_dec_layer * loss_lambda), + loss_bbox=dict( + type='GIoULoss', + loss_weight=10.0 * num_dec_layer * loss_lambda))) + ], + bbox_head=[ + dict( + type='CoATSSHead', + num_classes=num_classes, + in_channels=256, + stacked_convs=1, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + strides=[4, 8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0 * num_dec_layer * loss_lambda), + loss_bbox=dict( + type='GIoULoss', + loss_weight=2.0 * num_dec_layer * loss_lambda), + loss_centerness=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0 * num_dec_layer * loss_lambda)), + ], + # model training and testing settings + train_cfg=[ + dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=4000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + dict( + assigner=dict(type='ATSSAssigner', topk=9), + allowed_border=-1, + pos_weight=-1, + debug=False) + ], + test_cfg=[ + # Deferent from the DINO, we use the NMS. + dict( + max_per_img=300, + # NMS can improve the mAP by 0.2. 
+ nms=dict(type='soft_nms', iou_threshold=0.8)), + dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.0, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)), + dict( + # atss bbox head: + nms_pre=1000, + min_bbox_size=0, + score_thr=0.0, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100), + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + ]) + +# LSJ + CopyPaste +load_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))), +] + +train_pipeline = [ + dict(type='CopyPaste', max_num_pasted=100), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + pipeline=train_pipeline, + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=load_pipeline))) + +# follow ViTDet +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=image_size, keep_ratio=True), # diff + dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)})) + +val_evaluator = dict(metric='bbox') +test_evaluator = val_evaluator + +max_epochs = 12 +train_cfg = dict( + _delete_=True, + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=1) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +default_hooks = dict( + checkpoint=dict(by_epoch=True, interval=1, max_keep_ckpts=3)) +log_processor = dict(by_epoch=True) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py new file mode 100644 index 0000000..9a9fc34 --- /dev/null +++ b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_r50_lsj_8xb2_3x_coco.py @@ -0,0 +1,4 @@ +_base_ = ['co_dino_5scale_r50_lsj_8xb2_1x_coco.py'] + +param_scheduler = [dict(milestones=[30])] +train_cfg = dict(max_epochs=36) diff --git a/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py new file mode 100644 index 0000000..8fdb732 --- /dev/null +++ b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_16e_o365tococo.py @@ -0,0 +1,115 @@ +_base_ = ['co_dino_5scale_r50_8xb2_1x_coco.py'] + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +load_from = 'https://download.openmmlab.com/mmdetection/v3.0/codetr/co_dino_5scale_swin_large_22e_o365-0a33e247.pth' # noqa + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=True, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768, 1536]), + query_head=dict( + dn_cfg=dict(box_noise_scale=0.4, group_cfg=dict(num_dn_queries=500)), + transformer=dict(encoder=dict(with_cp=6)))) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 2048), (512, 2048), (544, 2048), (576, 2048), + (608, 2048), (640, 2048), (672, 2048), (704, 2048), + (736, 2048), (768, 2048), (800, 2048), (832, 2048), + (864, 2048), (896, 2048), (928, 2048), (960, 2048), + (992, 2048), (1024, 2048), (1056, 2048), + (1088, 2048), (1120, 2048), (1152, 2048), + (1184, 2048), (1216, 2048), (1248, 2048), + (1280, 2048), (1312, 2048), (1344, 2048), + (1376, 2048), (1408, 2048), (1440, 2048), + (1472, 2048), (1504, 2048), (1536, 2048)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 2048), (512, 2048), (544, 2048), (576, 2048), + (608, 2048), (640, 2048), (672, 2048), (704, 2048), + (736, 2048), (768, 2048), (800, 2048), (832, 2048), + (864, 2048), (896, 2048), (928, 2048), (960, 2048), + (992, 2048), (1024, 2048), (1056, 2048), + (1088, 2048), (1120, 2048), (1152, 2048), + (1184, 2048), (1216, 2048), (1248, 2048), + (1280, 2048), (1312, 2048), (1344, 2048), + (1376, 2048), (1408, 2048), (1440, 2048), + (1472, 2048), (1504, 2048), (1536, 
2048)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + batch_size=1, num_workers=1, dataset=dict(pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1280), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +optim_wrapper = dict(optimizer=dict(lr=1e-4)) + +max_epochs = 16 +train_cfg = dict(max_epochs=max_epochs) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[8], + gamma=0.1) +] diff --git a/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py new file mode 100644 index 0000000..d4a8734 --- /dev/null +++ b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_1x_coco.py @@ -0,0 +1,31 @@ +_base_ = ['co_dino_5scale_r50_8xb2_1x_coco.py'] + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768, 1536]), + query_head=dict(transformer=dict(encoder=dict(with_cp=6)))) + +train_dataloader = dict(batch_size=1, num_workers=1) diff --git a/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py new file mode 100644 index 0000000..c2fce29 --- /dev/null +++ b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_16xb1_3x_coco.py @@ -0,0 +1,6 @@ +_base_ = ['co_dino_5scale_swin_l_16xb1_1x_coco.py'] +# model settings +model = dict(backbone=dict(drop_path_rate=0.6)) + +param_scheduler = [dict(milestones=[30])] +train_cfg = dict(max_epochs=36) diff --git a/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py new file mode 100644 index 0000000..4a9b368 --- /dev/null +++ b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py @@ -0,0 +1,72 @@ +_base_ = ['co_dino_5scale_r50_lsj_8xb2_1x_coco.py'] + +image_size = (1280, 1280) +batch_augments = [ + dict(type='BatchFixedSizePad', size=image_size, pad_mask=True) +] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=batch_augments), + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + 
qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768, 1536]), + query_head=dict(transformer=dict(encoder=dict(with_cp=6)))) + +load_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))), +] + +train_dataloader = dict( + batch_size=1, + num_workers=1, + dataset=dict(dataset=dict(pipeline=load_pipeline))) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=image_size, keep_ratio=True), + dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py new file mode 100644 index 0000000..0e5c00b --- /dev/null +++ b/mmdetection/projects/CO-DETR/configs/codino/co_dino_5scale_swin_l_lsj_16xb1_3x_coco.py @@ -0,0 +1,6 @@ +_base_ = ['co_dino_5scale_swin_l_lsj_16xb1_1x_coco.py'] + +model = dict(backbone=dict(drop_path_rate=0.5)) + +param_scheduler = [dict(milestones=[30])] +train_cfg = dict(max_epochs=36) diff --git a/mmdetection/projects/ConvNeXt-V2/README.md b/mmdetection/projects/ConvNeXt-V2/README.md new file mode 100644 index 0000000..7a9f56c --- /dev/null +++ b/mmdetection/projects/ConvNeXt-V2/README.md @@ -0,0 +1,37 @@ +# ConvNeXt-V2 + +> [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](http://arxiv.org/abs/2301.00808) + +## Abstract + +Driven by improved architectures and better representation learning frameworks, the field of visual recognition has enjoyed rapid modernization and performance boost in the early 2020s. For example, modern ConvNets, represented by ConvNeXt \[52\], have demonstrated strong performance in various scenarios. While these models were originally designed for supervised learning with ImageNet labels, they can also potentially benefit from self-supervised learning techniques such as masked autoencoders (MAE) . However, we found that simply combining these two approaches leads to subpar performance. In this paper, we propose a fully convolutional masked autoencoder framework and a new Global Response Normalization (GRN) layer that can be added to the ConvNeXt architecture to enhance inter-channel feature competition. 
This co-design of self-supervised learning techniques and architectural improvement results in a new model family called ConvNeXt V2, which significantly improves the performance of pure ConvNets on various recognition benchmarks, including ImageNet classification, COCO detection, and ADE20K segmentation. We also provide pre-trained ConvNeXt V2 models of various sizes, ranging from an efficient 3.7M-parameter Atto model with 76.7% top-1 accuracy on ImageNet, to a 650M Huge model that achieves a state-of-the-art 88.9% accuracy using only public training data.
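+
+As a rough illustration of the GRN layer described above (the class name,
+channels-last layout and epsilon below are assumptions for this sketch, not the
+vendored implementation), a minimal PyTorch version could look like the
+following; the backbone actually used by the config in this project comes from
+`mmpretrain.ConvNeXt` with `use_grn=True`.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class GRN(nn.Module):
+    """Global Response Normalization over spatial dims (channels-last sketch)."""
+
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        # Zero-init keeps the layer close to identity at the start of training.
+        self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
+        self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: (N, H, W, C)
+        # Per-channel global response: L2 norm over the spatial dimensions.
+        gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
+        # Divisive normalization across channels (feature competition).
+        nx = gx / (gx.mean(dim=-1, keepdim=True) + self.eps)
+        # Learnable affine plus residual connection.
+        return self.gamma * (x * nx) + self.beta + x
+```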
    + +
    + +## Results and models + +| Method | Backbone | Pretrain | Lr schd | Augmentation | Mem (GB) | box AP | mask AP | Config | Download | +| :--------: | :-----------: | :------: | :-----: | :----------: | :------: | :----: | :-----: | :----------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Mask R-CNN | ConvNeXt-V2-B | FCMAE | 3x | LSJ | 22.5 | 52.9 | 46.4 | [config](./mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/convnextv2/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco_20230113_110947-757ee2dd.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/convnextv2/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco_20230113_110947.log.json) | + +**Note**: + +- This is a pre-release version of ConvNeXt-V2 object detection. The official finetuning setting of ConvNeXt-V2 has not been released yet. +- ConvNeXt backbone needs to install [MMPretrain](https://github.com/open-mmlab/mmpretrain/) first, which has abundant backbones for downstream tasks. + +```shell +pip install mmpretrain +``` + +## Citation + +```bibtex +@article{Woo2023ConvNeXtV2, + title={ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders}, + author={Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon and Saining Xie}, + year={2023}, + journal={arXiv preprint arXiv:2301.00808}, +} +``` diff --git a/mmdetection/projects/ConvNeXt-V2/configs/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py b/mmdetection/projects/ConvNeXt-V2/configs/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py new file mode 100644 index 0000000..59e8955 --- /dev/null +++ b/mmdetection/projects/ConvNeXt-V2/configs/mask-rcnn_convnext-v2-b_fpn_lsj-3x-fcmae_coco.py @@ -0,0 +1,92 @@ +_base_ = [ + 'mmdet::_base_/models/mask-rcnn_r50_fpn.py', + 'mmdet::_base_/datasets/coco_instance.py', + 'mmdet::_base_/schedules/schedule_1x.py', + 'mmdet::_base_/default_runtime.py' +] + +# please install the mmpretrain +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict( + imports=['mmpretrain.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext-v2/convnext-v2-base_3rdparty-fcmae_in1k_20230104-8a798eaf.pth' # noqa +image_size = (1024, 1024) + +model = dict( + backbone=dict( + _delete_=True, + type='mmpretrain.ConvNeXt', + arch='base', + out_indices=[0, 1, 2, 3], + # TODO: verify stochastic depth rate {0.1, 0.2, 0.3, 0.4} + drop_path_rate=0.4, + layer_scale_init_value=0., # disable layer scale when using GRN + gap_before_final_norm=False, + use_grn=True, # V2 uses GRN + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + neck=dict(in_channels=[128, 256, 512, 1024]), + test_cfg=dict( + rpn=dict(nms=dict(type='nms')), # TODO: does RPN use soft_nms? 
+ rcnn=dict(nms=dict(type='soft_nms')))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + batch_size=4, # total_batch_size 32 = 8 GPUS x 4 images + num_workers=8, + dataset=dict(pipeline=train_pipeline)) + +max_epochs = 36 +train_cfg = dict(max_epochs=max_epochs) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[27, 33], + gamma=0.1) +] + +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='AmpOptimWrapper', + constructor='LearningRateDecayOptimizerConstructor', + paramwise_cfg={ + 'decay_rate': 0.95, + 'decay_type': 'layer_wise', # TODO: sweep layer-wise lr decay? + 'num_layers': 12 + }, + optimizer=dict( + _delete_=True, + type='AdamW', + lr=0.0001, + betas=(0.9, 0.999), + weight_decay=0.05, + )) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1)) diff --git a/mmdetection/projects/Detic/README.md b/mmdetection/projects/Detic/README.md new file mode 100644 index 0000000..98cd705 --- /dev/null +++ b/mmdetection/projects/Detic/README.md @@ -0,0 +1,156 @@ +# Note: This project has been deprecated, please use [Detic_new](../Detic_new). + +# Detecting Twenty-thousand Classes using Image-level Supervision + +## Description + +**Detic**: A **Det**ector with **i**mage **c**lasses that can use image-level labels to easily train detectors. + +

    + +> [**Detecting Twenty-thousand Classes using Image-level Supervision**](http://arxiv.org/abs/2201.02605), +> Xingyi Zhou, Rohit Girdhar, Armand Joulin, Philipp Krähenbühl, Ishan Misra, +> *ECCV 2022 ([arXiv 2201.02605](http://arxiv.org/abs/2201.02605))* + +## Usage + + + +## Installation + +Detic requires to install CLIP. + +```shell +pip install git+https://github.com/openai/CLIP.git +``` + +### Demo + +#### Inference with existing dataset vocabulary embeddings + +First, go to the Detic project folder. + +```shell +cd projects/Detic +``` + +Then, download the pre-computed CLIP embeddings from [dataset metainfo](https://github.com/facebookresearch/Detic/tree/main/datasets/metadata) to the `datasets/metadata` folder. +The CLIP embeddings will be loaded to the zero-shot classifier during inference. +For example, you can download LVIS's class name embeddings with the following command: + +```shell +wget -P datasets/metadata https://raw.githubusercontent.com/facebookresearch/Detic/main/datasets/metadata/lvis_v1_clip_a%2Bcname.npy +``` + +You can run demo like this: + +```shell +python demo.py \ + ${IMAGE_PATH} \ + ${CONFIG_PATH} \ + ${MODEL_PATH} \ + --show \ + --score-thr 0.5 \ + --dataset lvis +``` + +![image](https://user-images.githubusercontent.com/12907710/213624759-f0a2ba0c-0f5c-4424-a350-5ba5349e5842.png) + +### Inference with custom vocabularies + +- Detic can detects any class given class names by using CLIP. + +You can detect custom classes with `--class-name` command: + +``` +python demo.py \ + ${IMAGE_PATH} \ + ${CONFIG_PATH} \ + ${MODEL_PATH} \ + --show \ + --score-thr 0.3 \ + --class-name headphone webcam paper coffe +``` + +![image](https://user-images.githubusercontent.com/12907710/213624637-e9e8a313-9821-4782-a18a-4408c876852b.png) + +Note that `headphone`, `paper` and `coffe` (typo intended) are not LVIS classes. Despite the misspelled class name, Detic can produce a reasonable detection for `coffe`. + +## Results + +Here we only provide the Detic Swin-B model for the open vocabulary demo. Multi-dataset training and open-vocabulary testing will be supported in the future. + +To find more variants, please visit the [official model zoo](https://github.com/facebookresearch/Detic/blob/main/docs/MODEL_ZOO.md). + +| Backbone | Training data | Config | Download | +| :------: | :------------------------: | :-------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Swin-B | ImageNet-21K & LVIS & COCO | [config](./configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k_20230120-0d301978.pth) | + +## Citation + +If you find Detic is useful in your research or applications, please consider giving a star 🌟 to the [official repository](https://github.com/facebookresearch/Detic) and citing Detic by the following BibTeX entry. + +```BibTeX +@inproceedings{zhou2022detecting, + title={Detecting Twenty-thousand Classes using Image-level Supervision}, + author={Zhou, Xingyi and Girdhar, Rohit and Joulin, Armand and Kr{\"a}henb{\"u}hl, Philipp and Misra, Ishan}, + booktitle={ECCV}, + year={2022} +} + +``` + +## Checklist + + + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. 
+ + - [x] Finish the code + + + + - [x] Basic docstrings & proper citation + + + + - [x] Test-time correctness + + + + - [x] A full README + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + + + - [ ] Unit tests + + + + - [ ] Code polishing + + + + - [ ] Metafile.yml + + + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/mmdetection/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py b/mmdetection/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py new file mode 100644 index 0000000..d554c40 --- /dev/null +++ b/mmdetection/projects/Detic/configs/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k.py @@ -0,0 +1,298 @@ +_base_ = 'mmdet::common/lsj-200e_coco-detection.py' + +custom_imports = dict( + imports=['projects.Detic.detic'], allow_failed_imports=False) + +image_size = (1024, 1024) +batch_augments = [dict(type='BatchFixedSizePad', size=image_size)] + +cls_layer = dict( + type='ZeroShotClassifier', + zs_weight_path='rand', + zs_weight_dim=512, + use_bias=0.0, + norm_weight=True, + norm_temperature=50.0) +reg_layer = [ + dict(type='Linear', in_features=1024, out_features=1024), + dict(type='ReLU', inplace=True), + dict(type='Linear', in_features=1024, out_features=4) +] + +num_classes = 22047 + +model = dict( + type='CascadeRCNN', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + batch_augments=batch_augments), + backbone=dict( + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024], + out_channels=256, + start_level=0, + add_extra_convs='on_output', + num_outs=5, + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), + relu_before_extra_convs=True), + rpn_head=dict( + type='CenterNetRPNHead', + num_classes=1, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + conv_bias=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + loss_cls=dict( + type='GaussianFocalLoss', + pos_weight=0.25, + neg_weight=0.75, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + ), + roi_head=dict( + type='DeticRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=0, + use_torchvision=True), + out_channels=256, + featmap_strides=[8, 16, 32], + # approximately equal to + # canonical_box_size=224, canonical_level=4 in D2 + finest_scale=112), + bbox_head=[ + dict( + type='DeticBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=num_classes, + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + 
loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='DeticBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=num_classes, + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='DeticBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=num_classes, + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[8, 16, 32], + # approximately equal to + # canonical_box_size=224, canonical_level=4 in D2 + finest_scale=112), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + class_agnostic=True, + num_classes=num_classes, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.8, + neg_iou_thr=0.8, + min_pos_iou=0.8, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + score_thr=0.0001, + nms_pre=1000, + max_per_img=256, + nms=dict(type='nms', iou_threshold=0.9), + min_bbox_size=0), + rcnn=dict( + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=300, + mask_thr_binary=0.5))) + +backend = 'pillow' +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args, + imdecode_backend=backend), + dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend), + dict( + 
type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict(batch_size=8, num_workers=4) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='AmpOptimWrapper', + optimizer=dict( + type='SGD', lr=0.01 * 4, momentum=0.9, weight_decay=0.00004), + paramwise_cfg=dict(norm_decay_mult=0.)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.00025, + by_epoch=False, + begin=0, + end=4000), + dict( + type='MultiStepLR', + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (8 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/mmdetection/projects/Detic/demo.py b/mmdetection/projects/Detic/demo.py new file mode 100644 index 0000000..d5c80c9 --- /dev/null +++ b/mmdetection/projects/Detic/demo.py @@ -0,0 +1,142 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import urllib +from argparse import ArgumentParser + +import mmcv +import torch +from mmengine.logging import print_log +from mmengine.utils import ProgressBar, scandir + +from mmdet.apis import inference_detector, init_detector +from mmdet.registry import VISUALIZERS +from mmdet.utils import register_all_modules + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def get_file_list(source_root: str) -> [list, dict]: + """Get file list. + + Args: + source_root (str): image or video source path + + Return: + source_file_path_list (list): A list for all source file. + source_type (dict): Source type: file or url or dir. 
+ """ + is_dir = os.path.isdir(source_root) + is_url = source_root.startswith(('http:/', 'https:/')) + is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS + + source_file_path_list = [] + if is_dir: + # when input source is dir + for file in scandir(source_root, IMG_EXTENSIONS, recursive=True): + source_file_path_list.append(os.path.join(source_root, file)) + elif is_url: + # when input source is url + filename = os.path.basename( + urllib.parse.unquote(source_root).split('?')[0]) + file_save_path = os.path.join(os.getcwd(), filename) + print(f'Downloading source file to {file_save_path}') + torch.hub.download_url_to_file(source_root, file_save_path) + source_file_path_list = [file_save_path] + elif is_file: + # when input source is single image + source_file_path_list = [source_root] + else: + print('Cannot find image file.') + + source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file) + + return source_file_path_list, source_type + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--dataset', type=str, help='dataset name to load the text embedding') + parser.add_argument( + '--class-name', nargs='+', type=str, help='custom class names') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + # register all modules in mmdet into the registries + register_all_modules() + + # build the model from a config file and a checkpoint file + model = init_detector(args.config, args.checkpoint, device=args.device) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + files, source_type = get_file_list(args.img) + from detic.utils import (get_class_names, get_text_embeddings, + reset_cls_layer_weight) + + # class name embeddings + if args.class_name: + dataset_classes = args.class_name + elif args.dataset: + dataset_classes = get_class_names(args.dataset) + embedding = get_text_embeddings( + dataset=args.dataset, custom_vocabulary=args.class_name) + visualizer.dataset_meta['classes'] = dataset_classes + reset_cls_layer_weight(model, embedding) + + # start detector inference + progress_bar = ProgressBar(len(files)) + for file in files: + result = inference_detector(model, file) + + img = mmcv.imread(file) + img = mmcv.imconvert(img, 'bgr', 'rgb') + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + progress_bar.update() + + visualizer.add_datasample( + filename, + img, + data_sample=result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=out_file, + pred_score_thr=args.score_thr) + + if not args.show: + print_log( + f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +if __name__ == '__main__': + main() diff --git 
a/mmdetection/projects/Detic/detic/__init__.py b/mmdetection/projects/Detic/detic/__init__.py new file mode 100644 index 0000000..d0ad070 --- /dev/null +++ b/mmdetection/projects/Detic/detic/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .centernet_rpn_head import CenterNetRPNHead +from .detic_bbox_head import DeticBBoxHead +from .detic_roi_head import DeticRoIHead +from .zero_shot_classifier import ZeroShotClassifier + +__all__ = [ + 'CenterNetRPNHead', 'DeticBBoxHead', 'DeticRoIHead', 'ZeroShotClassifier' +] diff --git a/mmdetection/projects/Detic/detic/centernet_rpn_head.py b/mmdetection/projects/Detic/detic/centernet_rpn_head.py new file mode 100644 index 0000000..765d6df --- /dev/null +++ b/mmdetection/projects/Detic/detic/centernet_rpn_head.py @@ -0,0 +1,196 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import Scale +from mmengine import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.dense_heads import CenterNetUpdateHead +from mmdet.models.utils import multi_apply +from mmdet.registry import MODELS + +INF = 1000000000 +RangeType = Sequence[Tuple[int, int]] + + +@MODELS.register_module(force=True) # avoid bug +class CenterNetRPNHead(CenterNetUpdateHead): + """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2. + + Paper link ``_. + """ + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self._init_reg_convs() + self._init_predictor() + + def _init_predictor(self) -> None: + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d( + self.feat_channels, self.num_classes, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of each level outputs. + + - cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for each \ + scale level, each is a 4D-tensor, the channel number is 4. + """ + res = multi_apply(self.forward_single, x, self.scales, self.strides) + return res + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps. + + Returns: + tuple: scores for each class, bbox predictions of + input feature maps. + """ + for m in self.reg_convs: + x = m(x) + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. 
So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + return cls_score, bbox_pred # score aligned, box larger + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
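+
+        Note:
+            As implemented below, the kept scores are the square root of the
+            sigmoid heatmap values, and decoded boxes are clamped to be at
+            least 0.01 px wide and high so downstream RoI heads never receive
+            degenerate proposals.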
+ """ + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_labels = [] + + for level_idx, (cls_score, bbox_pred, score_factor, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + score_factor_list, mlvl_priors)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + heatmap = cls_score.sigmoid() + score_thr = cfg.get('score_thr', 0) + + candidate_inds = heatmap > score_thr # 0.05 + pre_nms_top_n = candidate_inds.sum() # N + pre_nms_top_n = pre_nms_top_n.clamp(max=nms_pre) # N + + heatmap = heatmap[candidate_inds] # n + + candidate_nonzeros = candidate_inds.nonzero() # n + box_loc = candidate_nonzeros[:, 0] # n + labels = candidate_nonzeros[:, 1] # n + + bbox_pred = bbox_pred[box_loc] # n x 4 + per_grids = priors[box_loc] # n x 2 + + if candidate_inds.sum().item() > pre_nms_top_n.item(): + heatmap, top_k_indices = \ + heatmap.topk(pre_nms_top_n, sorted=False) + labels = labels[top_k_indices] + bbox_pred = bbox_pred[top_k_indices] + per_grids = per_grids[top_k_indices] + + bboxes = self.bbox_coder.decode(per_grids, bbox_pred) + # avoid invalid boxes in RoI heads + bboxes[:, 2] = torch.max(bboxes[:, 2], bboxes[:, 0] + 0.01) + bboxes[:, 3] = torch.max(bboxes[:, 3], bboxes[:, 1] + 0.01) + + mlvl_bbox_preds.append(bboxes) + mlvl_valid_priors.append(priors) + mlvl_scores.append(torch.sqrt(heatmap)) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bbox_preds) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) diff --git a/mmdetection/projects/Detic/detic/detic_bbox_head.py b/mmdetection/projects/Detic/detic/detic_bbox_head.py new file mode 100644 index 0000000..9408cbe --- /dev/null +++ b/mmdetection/projects/Detic/detic/detic_bbox_head.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.layers import multiclass_nms +from mmdet.models.roi_heads.bbox_heads import Shared2FCBBoxHead +from mmdet.models.utils import empty_instances +from mmdet.registry import MODELS +from mmdet.structures.bbox import get_box_tensor, scale_boxes + + +@MODELS.register_module(force=True) # avoid bug +class DeticBBoxHead(Shared2FCBBoxHead): + + def __init__(self, + *args, + init_cfg: Optional[Union[dict, ConfigDict]] = None, + **kwargs) -> None: + super().__init__(*args, init_cfg=init_cfg, **kwargs) + # reconstruct fc_cls and fc_reg since input channels are changed + assert self.with_cls + cls_channels = self.num_classes + cls_predictor_cfg_ = self.cls_predictor_cfg.copy() + cls_predictor_cfg_.update( + in_features=self.cls_last_dim, out_features=cls_channels) + self.fc_cls = MODELS.build(cls_predictor_cfg_) + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. 
+ + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. + has shape (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image\ + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results = InstanceData() + if roi.shape[0] == 0: + return empty_instances([img_meta], + roi.device, + task_type='bbox', + instance_results=[results], + box_type=self.predict_box_type, + use_box_type=False, + num_classes=self.num_classes, + score_per_cls=rcnn_test_cfg is None)[0] + scores = cls_score + img_shape = img_meta['img_shape'] + num_rois = roi.size(0) + + num_classes = 1 if self.reg_class_agnostic else self.num_classes + roi = roi.repeat_interleave(num_classes, dim=0) + bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size) + bboxes = self.bbox_coder.decode( + roi[..., 1:], bbox_pred, max_shape=img_shape) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + bboxes = scale_boxes(bboxes, scale_factor) + + # Get the inside tensor when `bboxes` is a box type + bboxes = get_box_tensor(bboxes) + box_dim = bboxes.size(-1) + bboxes = bboxes.view(num_rois, -1) + + if rcnn_test_cfg is None: + # This means that it is aug test. + # It needs to return the raw results without nms. + results.bboxes = bboxes + results.scores = scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, + scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img, + box_dim=box_dim) + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results diff --git a/mmdetection/projects/Detic/detic/detic_roi_head.py b/mmdetection/projects/Detic/detic/detic_roi_head.py new file mode 100644 index 0000000..a09c11c --- /dev/null +++ b/mmdetection/projects/Detic/detic/detic_roi_head.py @@ -0,0 +1,326 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.roi_heads import CascadeRoIHead +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.test_time_augs import merge_aug_masks +from mmdet.models.utils.misc import empty_instances +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi, get_box_tensor +from mmdet.utils import ConfigType, InstanceList, MultiConfig + + +@MODELS.register_module(force=True) # avoid bug +class DeticRoIHead(CascadeRoIHead): + + def init_mask_head(self, mask_roi_extractor: MultiConfig, + mask_head: MultiConfig) -> None: + """Initialize mask head and mask roi extractor. + + Args: + mask_head (dict): Config of mask in mask head. + mask_roi_extractor (:obj:`ConfigDict`, dict or list): + Config of mask roi extractor. 
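+
+        Note:
+            If ``mask_roi_extractor`` is None, the mask branch reuses the box
+            RoI extractor (``self.share_roi_extractor`` is set to True in the
+            body below).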
+ """ + self.mask_head = MODELS.build(mask_head) + + if mask_roi_extractor is not None: + self.share_roi_extractor = False + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + + def _refine_roi(self, x: Tuple[Tensor], rois: Tensor, + batch_img_metas: List[dict], + num_proposals_per_img: Sequence[int], **kwargs) -> tuple: + """Multi-stage refinement of RoI. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2] + batch_img_metas (list[dict]): List of image information. + num_proposals_per_img (sequence[int]): number of proposals + in each image. + + Returns: + tuple: + + - rois (Tensor): Refined RoI. + - cls_scores (list[Tensor]): Average predicted + cls score per image. + - bbox_preds (list[Tensor]): Bbox branch predictions + for the last stage of per image. + """ + # "ms" in variable names means multi-stage + ms_scores = [] + for stage in range(self.num_stages): + bbox_results = self._bbox_forward( + stage=stage, x=x, rois=rois, **kwargs) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'].sigmoid() + bbox_preds = bbox_results['bbox_pred'] + + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + ms_scores.append(cls_scores) + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + refine_rois_list = [] + for i in range(len(batch_img_metas)): + if rois[i].shape[0] > 0: + bbox_label = cls_scores[i][:, :-1].argmax(dim=1) + # Refactor `bbox_head.regress_by_class` to only accept + # box tensor without img_idx concatenated. + refined_bboxes = bbox_head.regress_by_class( + rois[i][:, 1:], bbox_label, bbox_preds[i], + batch_img_metas[i]) + refined_bboxes = get_box_tensor(refined_bboxes) + refined_rois = torch.cat( + [rois[i][:, [0]], refined_bboxes], dim=1) + refine_rois_list.append(refined_rois) + rois = torch.cat(refine_rois_list) + # ms_scores aligned + # average scores of each image by stages + cls_scores = [ + sum([score[i] for score in ms_scores]) / float(len(ms_scores)) + for i in range(len(batch_img_metas)) + ] # aligned + return rois, cls_scores, bbox_preds + + def _bbox_forward(self, stage: int, x: Tuple[Tensor], + rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. 
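+
+        Note:
+            With the configs shipped in this project, ``cls_score`` is
+            produced by the ``ZeroShotClassifier`` predictor of
+            ``DeticBBoxHead``, i.e. a temperature-scaled similarity against
+            CLIP text embeddings rather than a plain linear classifier.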
+ """ + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False, + **kwargs) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + proposals = [res.bboxes for res in rpn_results_list] + proposal_scores = [res.scores for res in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, + rois.device, + task_type='bbox', + box_type=self.bbox_head[-1].predict_box_type, + num_classes=self.bbox_head[-1].num_classes, + score_per_cls=rcnn_test_cfg is None) + # rois aligned + rois, cls_scores, bbox_preds = self._refine_roi( + x=x, + rois=rois, + batch_img_metas=batch_img_metas, + num_proposals_per_img=num_proposals_per_img, + **kwargs) + + # score reweighting in centernet2 + cls_scores = [(s * ps[:, None])**0.5 + for s, ps in zip(cls_scores, proposal_scores)] + cls_scores = [ + s * (s == s[:, :-1].max(dim=1)[0][:, None]).float() + for s in cls_scores + ] + + # fast_rcnn_inference + results_list = self.bbox_head[-1].predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rescale=rescale, + rcnn_test_cfg=rcnn_test_cfg) + return results_list + + def _mask_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Mask head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + """ + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + # do not support caffe_c4 model anymore + mask_preds = self.mask_head(mask_feats) + + mask_results = dict(mask_preds=mask_preds) + return mask_results + + def mask_loss(self, x, sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. 
+ sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `loss_mask` (dict): A dictionary of mask loss components. + """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[-1]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + raise NotImplementedError + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: List[InstanceData], + rescale: bool = False) -> List[InstanceData]: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). 
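+
+        Note:
+            Mask logits are converted to probabilities with ``sigmoid``
+            before ``merge_aug_masks``, and ``predict_by_feat`` is called
+            with ``activate_map=True`` so the merged maps are not activated
+            a second time.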
+ """ + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + num_mask_rois_per_img = [len(res) for res in results_list] + aug_masks = [] + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + aug_masks.append([m.sigmoid().detach() for m in mask_preds]) + + merged_masks = [] + for i in range(len(batch_img_metas)): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results_list = self.mask_head.predict_by_feat( + mask_preds=merged_masks, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale, + activate_map=True) + return results_list diff --git a/mmdetection/projects/Detic/detic/text_encoder.py b/mmdetection/projects/Detic/detic/text_encoder.py new file mode 100644 index 0000000..f0024ef --- /dev/null +++ b/mmdetection/projects/Detic/detic/text_encoder.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch +import torch.nn as nn + + +class CLIPTextEncoder(nn.Module): + + def __init__(self, model_name='ViT-B/32'): + super().__init__() + import clip + from clip.simple_tokenizer import SimpleTokenizer + self.tokenizer = SimpleTokenizer() + pretrained_model, _ = clip.load(model_name, device='cpu') + self.clip = pretrained_model + + @property + def device(self): + return self.clip.device + + @property + def dtype(self): + return self.clip.dtype + + def tokenize(self, + texts: Union[str, List[str]], + context_length: int = 77) -> torch.LongTensor: + if isinstance(texts, str): + texts = [texts] + + sot_token = self.tokenizer.encoder['<|startoftext|>'] + eot_token = self.tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] + for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + st = torch.randint(len(tokens) - context_length + 1, + (1, ))[0].item() + tokens = tokens[st:st + context_length] + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + + def forward(self, text): + text = self.tokenize(text) + text_features = self.clip.encode_text(text) + return text_features diff --git a/mmdetection/projects/Detic/detic/utils.py b/mmdetection/projects/Detic/detic/utils.py new file mode 100644 index 0000000..56d4fd4 --- /dev/null +++ b/mmdetection/projects/Detic/detic/utils.py @@ -0,0 +1,78 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
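+
+# Overview of the helpers below:
+#   * get_text_embeddings(): return the path to pre-computed CLIP embeddings
+#     for known datasets, or compute embeddings on the fly for a custom
+#     vocabulary, e.g. (illustrative call, requires the `clip` package):
+#         embeddings = get_text_embeddings(custom_vocabulary=['cat', 'dog'])
+#   * get_class_names(): look up the class names of common datasets
+#     (coco, cityscapes, voc, openimages, lvis).
+#   * reset_cls_layer_weight(): load new text embeddings into the zero-shot
+#     classification layer of every bbox head of a built model.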
+import numpy as np +import torch +import torch.nn.functional as F +from mmengine.logging import print_log + +from .text_encoder import CLIPTextEncoder + +# download from +# https://github.com/facebookresearch/Detic/tree/main/datasets/metadata +DATASET_EMBEDDINGS = { + 'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy', + 'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy', + 'openimages': 'datasets/metadata/oid_clip_a+cname.npy', + 'coco': 'datasets/metadata/coco_clip_a+cname.npy', +} + + +def get_text_embeddings(dataset=None, + custom_vocabulary=None, + prompt_prefix='a '): + assert (dataset is None) ^ (custom_vocabulary is None), \ + 'Either `dataset` or `custom_vocabulary` should be specified.' + if dataset: + if dataset in DATASET_EMBEDDINGS: + return DATASET_EMBEDDINGS[dataset] + else: + custom_vocabulary = get_class_names(dataset) + + text_encoder = CLIPTextEncoder() + text_encoder.eval() + texts = [prompt_prefix + x for x in custom_vocabulary] + print_log( + f'Computing text embeddings for {len(custom_vocabulary)} classes.') + embeddings = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() + return embeddings + + +def get_class_names(dataset): + if dataset == 'coco': + from mmdet.datasets import CocoDataset + class_names = CocoDataset.METAINFO['classes'] + elif dataset == 'cityscapes': + from mmdet.datasets import CityscapesDataset + class_names = CityscapesDataset.METAINFO['classes'] + elif dataset == 'voc': + from mmdet.datasets import VOCDataset + class_names = VOCDataset.METAINFO['classes'] + elif dataset == 'openimages': + from mmdet.datasets import OpenImagesDataset + class_names = OpenImagesDataset.METAINFO['classes'] + elif dataset == 'lvis': + from mmdet.datasets import LVISV1Dataset + class_names = LVISV1Dataset.METAINFO['classes'] + else: + raise TypeError(f'Invalid type for dataset name: {type(dataset)}') + return class_names + + +def reset_cls_layer_weight(model, weight): + if type(weight) == str: + print_log(f'Resetting cls_layer_weight from file: {weight}') + zs_weight = torch.tensor( + np.load(weight), + dtype=torch.float32).permute(1, 0).contiguous() # D x C + else: + zs_weight = weight + zs_weight = torch.cat( + [zs_weight, zs_weight.new_zeros( + (zs_weight.shape[0], 1))], dim=1) # D x (C + 1) + zs_weight = F.normalize(zs_weight, p=2, dim=0) + zs_weight = zs_weight.to('cuda') + num_classes = zs_weight.shape[-1] + + for bbox_head in model.roi_head.bbox_head: + bbox_head.num_classes = num_classes + del bbox_head.fc_cls.zs_weight + bbox_head.fc_cls.zs_weight = zs_weight diff --git a/mmdetection/projects/Detic/detic/zero_shot_classifier.py b/mmdetection/projects/Detic/detic/zero_shot_classifier.py new file mode 100644 index 0000000..35c9e49 --- /dev/null +++ b/mmdetection/projects/Detic/detic/zero_shot_classifier.py @@ -0,0 +1,73 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
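+
+# ZeroShotClassifier (overview): projects RoI features into the CLIP text
+# embedding space with a single Linear layer and scores them against
+# per-class text embeddings loaded from ``zs_weight_path`` (or random
+# embeddings when ``zs_weight_path == 'rand'``). When ``norm_weight`` is
+# True both sides are L2-normalized, so the logits are cosine similarities
+# scaled by ``norm_temperature``. An extra all-zero column is appended to
+# the weight matrix, giving ``num_classes + 1`` outputs with the last one
+# serving as the background class.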
+import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from mmdet.registry import MODELS + + +@MODELS.register_module(force=True) # avoid bug +class ZeroShotClassifier(nn.Module): + + def __init__( + self, + in_features: int, + out_features: int, # num_classes + zs_weight_path: str, + zs_weight_dim: int = 512, + use_bias: float = 0.0, + norm_weight: bool = True, + norm_temperature: float = 50.0, + ): + super().__init__() + num_classes = out_features + self.norm_weight = norm_weight + self.norm_temperature = norm_temperature + + self.use_bias = use_bias < 0 + if self.use_bias: + self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) + + self.linear = nn.Linear(in_features, zs_weight_dim) + + if zs_weight_path == 'rand': + zs_weight = torch.randn((zs_weight_dim, num_classes)) + nn.init.normal_(zs_weight, std=0.01) + else: + zs_weight = torch.tensor( + np.load(zs_weight_path), + dtype=torch.float32).permute(1, 0).contiguous() # D x C + zs_weight = torch.cat( + [zs_weight, zs_weight.new_zeros( + (zs_weight_dim, 1))], dim=1) # D x (C + 1) + + if self.norm_weight: + zs_weight = F.normalize(zs_weight, p=2, dim=0) + + if zs_weight_path == 'rand': + self.zs_weight = nn.Parameter(zs_weight) + else: + self.register_buffer('zs_weight', zs_weight) + + assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape + + def forward(self, x, classifier=None): + ''' + Inputs: + x: B x D' + classifier_info: (C', C' x D) + ''' + x = self.linear(x) + if classifier is not None: + zs_weight = classifier.permute(1, 0).contiguous() # D x C' + zs_weight = F.normalize(zs_weight, p=2, dim=0) \ + if self.norm_weight else zs_weight + else: + zs_weight = self.zs_weight + if self.norm_weight: + x = self.norm_temperature * F.normalize(x, p=2, dim=1) + x = torch.mm(x, zs_weight) + if self.use_bias: + x = x + self.cls_bias + return x diff --git a/mmdetection/projects/Detic_new/README.md b/mmdetection/projects/Detic_new/README.md new file mode 100644 index 0000000..3c7714c --- /dev/null +++ b/mmdetection/projects/Detic_new/README.md @@ -0,0 +1,248 @@ +# Detecting Twenty-thousand Classes using Image-level Supervision + +## Description + +**Detic**: A **Det**ector with **i**mage **c**lasses that can use image-level labels to easily train detectors. + +

    + +> [**Detecting Twenty-thousand Classes using Image-level Supervision**](http://arxiv.org/abs/2201.02605), +> Xingyi Zhou, Rohit Girdhar, Armand Joulin, Philipp Krähenbühl, Ishan Misra, +> *ECCV 2022 ([arXiv 2201.02605](http://arxiv.org/abs/2201.02605))* + +## Usage + + + +## Installation + +Detic requires to install CLIP. + +```shell +pip install git+https://github.com/openai/CLIP.git +``` + +## Prepare Datasets + +It is recommended to download and extract the dataset somewhere outside the project directory and symlink the dataset root to `$MMDETECTION/data` as below. If your folder structure is different, you may need to change the corresponding paths in config files. + +### LVIS + +LVIS dataset is adopted as box-labeled data, [LVIS](https://www.lvisdataset.org/) is available from official website or mirror. You need to generate `lvis_v1_train_norare.json` according to the [official prepare datasets](https://github.com/facebookresearch/Detic/blob/main/datasets/README.md#coco-and-lvis) for open-vocabulary LVIS, which removes the labels of 337 rare-class from training. You can also download [lvis_v1_train_norare.json](https://download.openmmlab.com/mmdetection/v3.0/detic/data/lvis/annotations/lvis_v1_train_norare.json) from our backup. The directory should be like this. + +```shell +mmdetection +├── data +│ ├── lvis +│ │ ├── annotations +│ │ | ├── lvis_v1_train.json +│ │ | ├── lvis_v1_val.json +│ │ | ├── lvis_v1_train_norare.json +│ │ ├── train2017 +│ │ ├── val2017 +``` + +### ImageNet-LVIS + +ImageNet-LVIS is adopted as image-labeled data. You can download [ImageNet-21K](https://www.image-net.org/download.php) dataset from the official website. Then you need to unzip the overlapping classes of LVIS and convert them into LVIS annotation format according to the [official prepare datasets](https://github.com/facebookresearch/Detic/blob/main/datasets/README.md#imagenet-21k). The directory should be like this. + +```shell +mmdetection +├── data +│ ├── imagenet +│ │ ├── annotations +│ │ | ├── imagenet_lvis_image_info.json +│ │ ├── ImageNet-21K +│ │ | ├── n00007846 +│ │ | ├── n01318894 +│ │ | ├── ... +``` + +### Metadata + +`data/metadata/` is the preprocessed meta-data (included in the repo). Please follow the [official instruction](https://github.com/facebookresearch/Detic/blob/main/datasets/README.md#metadata) to pre-process the LVIS dataset. You will generate `lvis_v1_train_cat_info.json` for Federated loss, which contains the frequency of each category of training set of LVIS. In addition, `lvis_v1_clip_a+cname.npy` is the pre-computed CLIP embeddings for each category of LVIS. You can also choose to directly download [lvis_v1_train_cat_info](https://download.openmmlab.com/mmdetection/v3.0/detic/data/metadata/lvis_v1_train_cat_info.json) and [lvis_v1_clip_a+cname.npy](https://download.openmmlab.com/mmdetection/v3.0/detic/data/metadata/lvis_v1_clip_a%2Bcname.npy) form our backup. The directory should be like this. + +```shell +mmdetection +├── data +│ ├── metadata +│ │ ├── lvis_v1_train_cat_info.json +│ │ ├── lvis_v1_clip_a+cname.npy +``` + +## Demo + +Here we provide the Detic model for the open vocabulary demo. This model is trained on combined LVIS-COCO and ImageNet-21K for better demo purposes. LVIS models do not detect persons well due to its federated annotation protocol. LVIS+COCO models give better visual results. 
+ +| Backbone | Training data | Config | Download | +| :------: | :----------------------------: | :-------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Swin-B | LVIS & COCO & ImageNet-21K | [config](./configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k/detic_centernet2_swin-b_fpn_4x_lvis-coco-in21k_20230120-0d301978.pth) | + +You can also download other models from [official model zoo](https://github.com/facebookresearch/Detic/blob/main/docs/MODEL_ZOO.md), and convert the format by run + +```shell +python tools/model_converters/detic_to_mmdet.py --src /path/to/detic_weight.pth --dst /path/to/mmdet_weight.pth +``` + +### Inference with existing dataset vocabulary + +You can detect classes of existing dataset with `--texts` command: + +```shell +python demo/image_demo.py \ + ${IMAGE_PATH} \ + ${CONFIG_PATH} \ + ${MODEL_PATH} \ + --texts lvis \ + --pred-score-thr 0.5 \ + --palette 'random' +``` + +![image](https://user-images.githubusercontent.com/12907710/213624759-f0a2ba0c-0f5c-4424-a350-5ba5349e5842.png) + +### Inference with custom vocabularies + +Detic can detects any class given class names by using CLIP. You can detect customized classes with `--texts` command: + +```shell +python demo/image_demo.py \ + ${IMAGE_PATH} \ + ${CONFIG_PATH} \ + ${MODEL_PATH} \ + --texts 'headphone . webcam . paper . coffe.' \ + --pred-score-thr 0.3 \ + --palette 'random' +``` + +![image](https://user-images.githubusercontent.com/12907710/213624637-e9e8a313-9821-4782-a18a-4408c876852b.png) + +Note that `headphone`, `paper` and `coffe` (typo intended) are not LVIS classes. Despite the misspelled class name, Detic can produce a reasonable detection for `coffe`. + +## Models and Results + +### Training + +There are two stages in the whole training process. The first stage is to train a model using images with box labels as the baseline. The second stage is to finetune from the baseline model and leverage image-labeled data. + +#### First stage + +To train the baseline with box-supervised, run + +```shell +bash ./tools/dist_train.sh projects/Detic_new/detic_centernet2_r50_fpn_4x_lvis_boxsup.py 8 +``` + +| Model (Config) | mask mAP | mask mAP(official) | mask mAP_rare | mask mAP_rare(officical) | +| :---------------------------------------------------------------------------------------------: | :------: | :----------------: | :-----------: | :----------------------: | +| [detic_centernet2_r50_fpn_4x_lvis_boxsup](./configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py) | 31.6 | 31.5 | 26.6 | 25.6 | + +#### Second stage + +The second stage uses both object detection and image classification datasets. + +##### Multi-Datasets Config + +We provide improved dataset_wrapper `ConcatDataset` to concatenate multiple datasets, all datasets could have different annotation types and different pipelines (e.g., image_size). You can also obtain the index of `dataset_source` for each sample through ` get_dataset_source` . We provide sampler `MultiDataSampler` to custom the ratios of different datasets. Beside, we provide batch_sampler `MultiDataAspectRatioBatchSampler` to enable different datasets to have different batchsizes. 
The config of multiple datasets is as follows: + +```python +dataset_det = dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type='LVISV1Dataset', + data_root='data/lvis/', + ann_file='annotations/lvis_v1_train.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline_det, + backend_args=backend_args)) + +dataset_cls = dict( + type='ImageNetLVISV1Dataset', + data_root='data/imagenet', + ann_file='annotations/imagenet_lvis_image_info.json', + data_prefix=dict(img='ImageNet-LVIS/'), + pipeline=train_pipeline_cls, + backend_args=backend_args) + +train_dataloader = dict( + batch_size=[8, 32], + num_workers=2, + persistent_workers=True, + sampler=dict( + type='MultiDataSampler', + dataset_ratio=[1, 4]), + batch_sampler=dict( + type='MultiDataAspectRatioBatchSampler', + num_datasets=2), + dataset=dict( + type='ConcatDataset', + datasets=[dataset_det, dataset_cls])) +``` + +###### Note: + +- If the one of the multiple datasets is `ConcatDataset` , it is still considered as a dataset for `num_datasets` in `MultiDataAspectRatioBatchSampler`. + +To finetune the baseline model with image-labeled data, run: + +```shell +bash ./tools/dist_train.sh projects/Detic_new/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py 8 +``` + +| Model (Config) | mask mAP | mask mAP(official) | mask mAP_rare | mask mAP_rare(officical) | +| :-----------------------------------------------------------------------------------------------------: | :------: | :----------------: | :-----------: | :----------------------: | +| [detic_centernet2_r50_fpn_4x_lvis_in21k-lvis](./configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py) | 32.9 | 33.2 | 30.9 | 29.7 | + +#### Standard LVIS Results + +| Model (Config) | mask mAP | mask mAP(official) | mask mAP_rare | mask mAP_rare(officical) | Download | +| :-----------------------------------------------------------------------------------------------------------: | :------: | :----------------: | :-----------: | :----------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [detic_centernet2_r50_fpn_4x_lvis_boxsup](./configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py) | 31.6 | 31.5 | 26.6 | 25.6 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis_boxsup/detic_centernet2_r50_fpn_4x_lvis_boxsup_20230911_233514-54116677.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis_boxsup/detic_centernet2_r50_fpn_4x_lvis_boxsup_20230911_233514.log.json) | +| [detic_centernet2_r50_fpn_4x_lvis_in21k-lvis](./configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py) | 32.9 | 33.2 | 30.9 | 29.7 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis_20230912_040619-9e7a3258.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis_20230912_040619.log.json) | +| [detic_centernet2_swin-b_fpn_4x_lvis_boxsup](./configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py) | 40.7 | 40.7 | 38.0 | 35.9 | 
[model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis_boxsup/detic_centernet2_swin-b_fpn_4x_lvis_boxsup_20230825_061737-328e85f9.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis_boxsup/detic_centernet2_swin-b_fpn_4x_lvis_boxsup_20230825_061737.log.json) | +| [detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis](./configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py) | 41.7 | 41.7 | 41.7 | 41.7 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis_20230926_235410-0c152391.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis_20230926_235410.log.json) | + +#### Open-vocabulary LVIS Results + +| Model (Config) | mask mAP | mask mAP(official) | mask mAP_rare | mask mAP_rare(officical) | Download | +| :---------------------------------------------------------------------------------------------------------------: | :------: | :----------------: | :-----------: | :----------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [detic_centernet2_r50_fpn_4x_lvis-base_boxsup](./configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py) | 30.4 | 30.2 | 16.2 | 16.4 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_boxsup/detic_centernet2_r50_fpn_4x_lvis-base_boxsup_20230921_180638-c1685ee2.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_boxsup/detic_centernet2_r50_fpn_4x_lvis-base_boxsup_20230921_180638.log.json) | +| [detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis](./configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py) | 32.6 | 32.4 | 27.4 | 24.9 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis_20230925_014315-2d2cc8b7.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis_20230925_014315.log.json) | + +### Testing + +#### Test Command + +To evaluate a model with a trained model, run + +```shell +python ./tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} +``` + +#### Open-vocabulary LVIS Results + +The models are converted from the official model zoo. 
+ +| Model (Config) | mask mAP | mask mAP_novel | Download | +| :---------------------------------------------------------------------------------------------------------------------: | :------: | :------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup](./configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py) | 38.4 | 21.9 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup-481281c8.pth) | +| [detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis](./configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py) | 40.7 | 34.0 | [model](https://download.openmmlab.com/mmdetection/v3.0/detic/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth) | + +###### Note: + +- The open-vocabulary LVIS setup is LVIS without rare class annotations in training, termed `lvisbase`. We evaluate rare classes as novel classes in testing. +- ` in21k-lvis` denotes that the model use the overlap classes between ImageNet-21K and LVIS as image-labeled data. + +## Citation + +If you find Detic is useful in your research or applications, please consider giving a star 🌟 to the [official repository](https://github.com/facebookresearch/Detic) and citing Detic by the following BibTeX entry. + +```BibTeX +@inproceedings{zhou2022detecting, + title={Detecting Twenty-thousand Classes using Image-level Supervision}, + author={Zhou, Xingyi and Girdhar, Rohit and Joulin, Armand and Kr{\"a}henb{\"u}hl, Philipp and Misra, Ishan}, + booktitle={ECCV}, + year={2022} +} +``` diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py new file mode 100644 index 0000000..8ca57b7 --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.py @@ -0,0 +1,9 @@ +_base_ = './detic_centernet2_r50_fpn_4x_lvis_boxsup.py' + +# 'lvis_v1_train_norare.json' is the annotations of lvis_v1 +# removing the labels of 337 rare-class +train_dataloader = dict( + dataset=dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict(ann_file='annotations/lvis_v1_train_norare.json'))) diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py new file mode 100644 index 0000000..034acb6 --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis-base_in21k-lvis.py @@ -0,0 +1,93 @@ +_base_ = './detic_centernet2_r50_fpn_4x_lvis_boxsup.py' +dataset_type = ['LVISV1Dataset', 'ImageNetLVISV1Dataset'] +image_size_det = (640, 640) +image_size_cls = (320, 320) + +# backend = 'pillow' +backend_args = None + +train_pipeline_det = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size_det, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size_det, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + 
dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_pipeline_cls = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=False, with_label=True), + dict( + type='RandomResize', + scale=image_size_cls, + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size_cls, + recompute_bbox=False, + bbox_clip_border=False, + allow_negative_crop=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +# 'lvis_v1_train_norare.json' is the annotations of lvis_v1 +# removing the labels of 337 rare-class +dataset_det = dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type='LVISV1Dataset', + data_root='data/lvis/', + ann_file='annotations/lvis_v1_train_norare.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline_det, + backend_args=backend_args)) + +dataset_cls = dict( + type='ImageNetLVISV1Dataset', + data_root='data/imagenet', + ann_file='annotations/imagenet_lvis_image_info.json', + data_prefix=dict(img='ImageNet-LVIS/'), + pipeline=train_pipeline_cls, + backend_args=backend_args) + +train_dataloader = dict( + _delete_=True, + batch_size=[8, 32], + num_workers=2, + persistent_workers=True, + sampler=dict(type='MultiDataSampler', dataset_ratio=[1, 4]), + batch_sampler=dict( + type='MultiDataAspectRatioBatchSampler', num_datasets=2), + dataset=dict(type='ConcatDataset', datasets=[dataset_det, dataset_cls])) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + begin=0, + by_epoch=False, + T_max=90000, + ) +] + +load_from = './first_stage/detic_centernet2_r50_fpn_4x_lvis-base_boxsup.pth' + +find_unused_parameters = True diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py new file mode 100644 index 0000000..a11be37 --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_boxsup.py @@ -0,0 +1,410 @@ +_base_ = 'mmdet::_base_/default_runtime.py' +dataset_type = 'LVISV1Dataset' +custom_imports = dict( + imports=['projects.Detic_new.detic'], allow_failed_imports=False) + +num_classes = 1203 +lvis_cat_frequency_info = 'data/metadata/lvis_v1_train_cat_info.json' + +# 'data/metadata/lvis_v1_clip_a+cname.npy' is pre-computed +# CLIP embeddings for each category +cls_layer = dict( + type='ZeroShotClassifier', + zs_weight_path='data/metadata/lvis_v1_clip_a+cname.npy', + zs_weight_dim=512, + use_bias=0.0, + norm_weight=True, + norm_temperature=50.0) +reg_layer = [ + dict(type='Linear', in_features=1024, out_features=1024), + dict(type='ReLU', inplace=True), + dict(type='Linear', in_features=1024, out_features=4) +] + +model = dict( + type='Detic', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + init_cfg=dict( + type='Pretrained', + checkpoint='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/' + 'model-zoo/ImageNet_21K_P/models/resnet50_miil_21k.pth')), + neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=256, + start_level=0, 
+ add_extra_convs='on_output', + num_outs=5, + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d'), + relu_before_extra_convs=True), + rpn_head=dict( + type='CenterNetRPNHead', + num_classes=1, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128], + conv_bias=True, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + loss_cls=dict( + type='HeatmapFocalLoss', + alpha=0.25, + beta=4.0, + gamma=2.0, + pos_weight=0.5, + neg_weight=0.5, + loss_weight=1.0, + ignore_high_fp=0.85, + ), + loss_bbox=dict(type='GIoULoss', eps=1e-6, loss_weight=1.0), + ), + roi_head=dict( + type='DeticRoIHead', + num_stages=3, + stage_loss_weights=[1.0, 1.0, 1.0], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', + output_size=7, + sampling_ratio=0, + use_torchvision=True), + out_channels=256, + featmap_strides=[8, 16, 32], + # approximately equal to + # canonical_box_size=224, canonical_level=4 in D2 + finest_scale=112), + bbox_head=[ + dict( + type='DeticBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=num_classes, + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, + use_fed_loss=True, + cat_freq_path=lvis_cat_frequency_info, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='DeticBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=num_classes, + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, + use_fed_loss=True, + cat_freq_path=lvis_cat_frequency_info, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.1, + loss_weight=1.0)), + dict( + type='DeticBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=num_classes, + cls_predictor_cfg=cls_layer, + reg_predictor_cfg=reg_layer, + use_fed_loss=True, + cat_freq_path=lvis_cat_frequency_info, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=0.1, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[8, 16, 32], + # approximately equal to + # canonical_box_size=224, canonical_level=4 in D2 + finest_scale=112), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + class_agnostic=True, + num_classes=num_classes, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + 
rpn_proposal=dict( + score_thr=0.0001, + nms_pre=4000, + max_per_img=2000, + nms=dict(type='nms', iou_threshold=0.9), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=False), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.8, + neg_iou_thr=0.8, + min_pos_iou=0.8, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=False), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + score_thr=0.0001, + nms_pre=1000, + max_per_img=256, + nms=dict(type='nms', iou_threshold=0.9), + min_bbox_size=0), + rcnn=dict( + score_thr=0.02, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=300, + mask_thr_binary=0.5))) + +# backend = 'pillow' +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=(640, 640), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=backend_args, + imdecode_backend=backend_args), + dict( + type='Resize', + scale=(1333, 800), + keep_ratio=True, + backend=backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities')) +] + +val_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=backend_args, + imdecode_backend=backend_args), + dict( + type='Resize', + scale=(1333, 800), + keep_ratio=True, + backend=backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + poly2mask=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=8, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type='LVISV1Dataset', + data_root='data/lvis/', + ann_file='annotations/lvis_v1_train_norare.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) + +val_dataloader = dict( + batch_size=8, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='LVISV1Dataset', + 
data_root='data/lvis/', + ann_file='annotations/lvis_v1_val.json', + data_prefix=dict(img=''), + pipeline=val_pipeline, + return_classes=False)) + +test_dataloader = dict( + batch_size=8, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='LVISV1Dataset', + data_root='data/lvis/', + ann_file='annotations/lvis_v1_val.json', + data_prefix=dict(img=''), + pipeline=test_pipeline, + return_classes=True)) + +val_evaluator = dict( + type='LVISMetric', + ann_file='data/lvis/annotations/lvis_v1_val.json', + metric=['bbox', 'segm']) +test_evaluator = val_evaluator + +# training schedule for 90k with batch_size of 64 +# with total batch_size of 16, 90k iters is equivalent to '1x' (12 epochs) +# with total batch_size of 64, 90k iters is equivalent to '4x' +max_iter = 90000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iter, val_interval=90000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +# Enable automatic-mixed-precision training with AmpOptimWrapper. +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + paramwise_cfg=dict(norm_decay_mult=0.), + clip_grad=dict(max_norm=1.0, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=False, + begin=0, + end=10000), + dict( + type='CosineAnnealingLR', + begin=0, + by_epoch=False, + T_max=max_iter, + ) +] + +# only keep latest 5 checkpoints +default_hooks = dict( + checkpoint=dict(by_epoch=False, interval=30000, max_keep_ckpts=5), + logger=dict(type='LoggerHook', interval=50)) diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py new file mode 100644 index 0000000..ce97ed6 --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py @@ -0,0 +1,91 @@ +_base_ = './detic_centernet2_r50_fpn_4x_lvis_boxsup.py' +dataset_type = ['LVISV1Dataset', 'ImageNetLVISV1Dataset'] +image_size_det = (640, 640) +image_size_cls = (320, 320) + +# backend = 'pillow' +backend_args = None + +train_pipeline_det = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size_det, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size_det, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_pipeline_cls = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=False, with_label=True), + dict( + type='RandomResize', + scale=image_size_cls, + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size_cls, + recompute_bbox=False, + bbox_clip_border=False, + allow_negative_crop=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +dataset_det = dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type='LVISV1Dataset', + data_root='data/lvis/', + ann_file='annotations/lvis_v1_train.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + 
pipeline=train_pipeline_det, + backend_args=backend_args)) + +dataset_cls = dict( + type='ImageNetLVISV1Dataset', + data_root='data/imagenet', + ann_file='annotations/imagenet_lvis_image_info.json', + data_prefix=dict(img='ImageNet-LVIS/'), + pipeline=train_pipeline_cls, + backend_args=backend_args) + +train_dataloader = dict( + _delete_=True, + batch_size=[8, 32], + num_workers=2, + persistent_workers=True, + sampler=dict(type='MultiDataSampler', dataset_ratio=[1, 4]), + batch_sampler=dict( + type='MultiDataAspectRatioBatchSampler', num_datasets=2), + dataset=dict(type='ConcatDataset', datasets=[dataset_det, dataset_cls])) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + begin=0, + by_epoch=False, + T_max=90000, + ) +] + +load_from = './first_stage/detic_centernet2_r50_fpn_4x_lvis_boxsup.pth' + +find_unused_parameters = True diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py new file mode 100644 index 0000000..efedd11 --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.py @@ -0,0 +1,9 @@ +_base_ = './detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py' + +# 'lvis_v1_train_norare.json' is the annotations of lvis_v1 +# removing the labels of 337 rare-class +train_dataloader = dict( + dataset=dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict(ann_file='annotations/lvis_v1_train_norare.json'))) diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py new file mode 100644 index 0000000..1df7097 --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis.py @@ -0,0 +1,118 @@ +_base_ = './detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py' + +image_size_det = (896, 896) +image_size_cls = (448, 448) + +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False), + neck=dict(in_channels=[256, 512, 1024])) + +backend_args = None +train_pipeline_det = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size_det, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size_det, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_pipeline_cls = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=False, with_label=True), + dict( + type='RandomResize', + scale=image_size_cls, + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size_cls, + recompute_bbox=False, + bbox_clip_border=False, + allow_negative_crop=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +# 
'lvis_v1_train_norare.json' is the annotations of lvis_v1 +# removing the labels of 337 rare-class +dataset_det = dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type='LVISV1Dataset', + data_root='data/lvis/', + ann_file='annotations/lvis_v1_train_norare.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline_det, + backend_args=backend_args)) + +dataset_cls = dict( + type='ImageNetLVISV1Dataset', + data_root='data/imagenet', + ann_file='annotations/imagenet_lvis_image_info.json', + data_prefix=dict(img='ImageNet-LVIS/'), + pipeline=train_pipeline_cls, + backend_args=backend_args) + +train_dataloader = dict( + _delete_=True, + batch_size=[4, 16], + num_workers=2, + persistent_workers=True, + sampler=dict(type='MultiDataSampler', dataset_ratio=[1, 4]), + batch_sampler=dict( + type='MultiDataAspectRatioBatchSampler', num_datasets=2), + dataset=dict(type='ConcatDataset', datasets=[dataset_det, dataset_cls])) + +# training schedule for 180k +max_iter = 180000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iter, val_interval=180000) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001)) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + begin=0, + by_epoch=False, + T_max=max_iter, + ) +] + +load_from = './first_stage/detic_centernet2_swin-b_fpn_4x_lvis-base_boxsup.pth' +find_unused_parameters = True diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py new file mode 100644 index 0000000..ce04a81 --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.py @@ -0,0 +1,78 @@ +_base_ = './detic_centernet2_r50_fpn_4x_lvis_boxsup.py' + +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict( + type='Pretrained', + checkpoint='https://github.com/SwinTransformer/storage/releases/' + 'download/v1.0.0/swin_base_patch4_window7_224_22k.pth')), + neck=dict(in_channels=[256, 512, 1024])) + +# backend = 'pillow' +backend_args = None + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=(896, 896), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(896, 896), + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_dataloader = dict( + dataset=dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict(pipeline=train_pipeline))) + +# training schedule for 180k +max_iter = 180000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iter, val_interval=180000) + +# Enable automatic-mixed-precision training with AmpOptimWrapper. 
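+# Note: the wrapper below is the plain OptimWrapper, so AMP is not actually
+# switched on in this config. A minimal sketch of how AMP could be enabled in
+# MMEngine (illustrative assumption, not part of the original config):
+# optim_wrapper = dict(
+#     type='AmpOptimWrapper',
+#     loss_scale='dynamic',
+#     optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001))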
+optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=False, + begin=0, + end=10000), + dict( + type='CosineAnnealingLR', + begin=0, + by_epoch=False, + T_max=max_iter, + ) +] diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py new file mode 100644 index 0000000..a9ab2c6 --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_coco_in21k.py @@ -0,0 +1,2 @@ +# not support training, only for testing +_base_ = './detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py' diff --git a/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py new file mode 100644 index 0000000..de358ac --- /dev/null +++ b/mmdetection/projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis_in21k-lvis.py @@ -0,0 +1,116 @@ +_base_ = './detic_centernet2_r50_fpn_4x_lvis_in21k-lvis.py' + +image_size_det = (896, 896) +image_size_cls = (448, 448) + +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False), + neck=dict(in_channels=[256, 512, 1024])) + +backend_args = None +train_pipeline_det = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='RandomResize', + scale=image_size_det, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size_det, + recompute_bbox=True, + allow_negative_crop=True), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +train_pipeline_cls = [ + dict(type='LoadImageFromFile', backend_args=backend_args), + dict(type='LoadAnnotations', with_bbox=False, with_label=True), + dict( + type='RandomResize', + scale=image_size_cls, + ratio_range=(0.5, 1.5), + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=image_size_cls, + recompute_bbox=False, + bbox_clip_border=False, + allow_negative_crop=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] + +dataset_det = dict( + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type='LVISV1Dataset', + data_root='data/lvis/', + ann_file='annotations/lvis_v1_train.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline_det, + backend_args=backend_args)) + +dataset_cls = dict( + type='ImageNetLVISV1Dataset', + data_root='data/imagenet', + ann_file='annotations/imagenet_lvis_image_info.json', + data_prefix=dict(img='ImageNet-LVIS/'), + pipeline=train_pipeline_cls, + backend_args=backend_args) + +train_dataloader = dict( + _delete_=True, + batch_size=[4, 16], + num_workers=2, + persistent_workers=True, + sampler=dict(type='MultiDataSampler', dataset_ratio=[1, 4]), + batch_sampler=dict( + type='MultiDataAspectRatioBatchSampler', num_datasets=2), + dataset=dict(type='ConcatDataset', 
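+        # ConcatDataset joins the LVIS detection split and the ImageNet-LVIS
+        # classification split; MultiDataSampler above draws them at a 1:4
+        # ratio with per-dataset batch sizes [4, 16] for the Swin-B backbone.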
datasets=[dataset_det, dataset_cls])) + +# training schedule for 180k +max_iter = 180000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iter, val_interval=180000) + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001)) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='CosineAnnealingLR', + begin=0, + by_epoch=False, + T_max=max_iter, + ) +] + +load_from = './first_stage/detic_centernet2_swin-b_fpn_4x_lvis_boxsup.pth' +find_unused_parameters = True diff --git a/mmdetection/projects/Detic_new/detic/__init__.py b/mmdetection/projects/Detic_new/detic/__init__.py new file mode 100644 index 0000000..e4b0d7b --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .centernet_rpn_head import CenterNetRPNHead +from .detic import Detic +from .detic_bbox_head import DeticBBoxHead +from .detic_roi_head import DeticRoIHead +from .heatmap_focal_loss import HeatmapFocalLoss +from .imagenet_lvis import ImageNetLVISV1Dataset +from .zero_shot_classifier import ZeroShotClassifier + +__all__ = [ + 'CenterNetRPNHead', 'Detic', 'DeticBBoxHead', 'DeticRoIHead', + 'ZeroShotClassifier', 'HeatmapFocalLoss', 'ImageNetLVISV1Dataset' +] diff --git a/mmdetection/projects/Detic_new/detic/centernet_rpn_head.py b/mmdetection/projects/Detic_new/detic/centernet_rpn_head.py new file mode 100644 index 0000000..6298728 --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/centernet_rpn_head.py @@ -0,0 +1,573 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import Scale +from mmengine import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.dense_heads import CenterNetUpdateHead +from mmdet.models.utils import unpack_gt_instances +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2distance +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, reduce_mean) +from .iou_loss import IOULoss + +# from .heatmap_focal_loss import binary_heatmap_focal_loss_jit +INF = 1000000000 +RangeType = Sequence[Tuple[int, int]] + + +@MODELS.register_module() +class CenterNetRPNHead(CenterNetUpdateHead): + """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2. + + Paper link ``_. + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channel in the input feature map. + regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple + level points. + hm_min_radius (int): Heatmap target minimum radius of cls branch. + Defaults to 4. + hm_min_overlap (float): Heatmap target minimum overlap of cls branch. + Defaults to 0.8. + more_pos_thresh (float): The filtering threshold when the cls branch + adds more positive samples. Defaults to 0.2. + more_pos_topk (int): The maximum number of additional positive samples + added to each gt. Defaults to 9. + soft_weight_on_reg (bool): Whether to use the soft target of the + cls branch as the soft weight of the bbox branch. + Defaults to False. + loss_cls (:obj:`ConfigDict` or dict): Config of cls loss. Defaults to + dict(type='GaussianFocalLoss', loss_weight=1.0) + loss_bbox (:obj:`ConfigDict` or dict): Config of bbox loss. 
Defaults to + dict(type='GIoULoss', loss_weight=2.0). + norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct + and config norm layer. Defaults to + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Unused in CenterNet. Reserved for compatibility with + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config + of CenterNet. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + regress_ranges: RangeType = ((0, 80), (64, 160), (128, 320), + (256, 640), (512, INF)), + hm_min_radius: int = 4, + hm_min_overlap: float = 0.8, + more_pos: bool = False, + more_pos_thresh: float = 0.2, + more_pos_topk: int = 9, + soft_weight_on_reg: bool = False, + not_clamp_box: bool = False, + loss_cls: ConfigType = dict( + type='HeatmapFocalLoss', + alpha=0.25, + beta=4.0, + gamma=2.0, + pos_weight=1.0, + neg_weight=1.0, + sigmoid_clamp=1e-4, + ignore_high_fp=-1.0, + loss_weight=1.0, + ), + loss_bbox: ConfigType = dict( + type='GIoULoss', loss_weight=2.0), + norm_cfg: OptConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + # loss_bbox=loss_bbox, + loss_cls=loss_cls, + norm_cfg=norm_cfg, + train_cfg=train_cfg, + test_cfg=test_cfg, + **kwargs) + self.soft_weight_on_reg = soft_weight_on_reg + self.hm_min_radius = hm_min_radius + self.more_pos_thresh = more_pos_thresh + self.more_pos_topk = more_pos_topk + self.more_pos = more_pos + self.not_clamp_box = not_clamp_box + self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap) + self.loss_bbox = IOULoss('giou') + + # GaussianFocalLoss must be sigmoid mode + self.use_sigmoid_cls = True + self.cls_out_channels = num_classes + + self.regress_ranges = regress_ranges + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self._init_reg_convs() + self._init_predictor() + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps. + + Returns: + tuple: scores for each class, bbox predictions of + input feature maps. + """ + for m in self.reg_convs: + x = m(x) + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + return cls_score, bbox_pred # score aligned, box larger + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is num_classes. 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + num_imgs = cls_scores[0].size(0) + assert len(cls_scores) == len(bbox_preds) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + + # 1 flatten outputs + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + assert (torch.isfinite(flatten_bbox_preds).all().item()) + + # 2 calc reg and cls branch targets + cls_targets, bbox_targets = self.get_targets(all_level_points, + batch_gt_instances) + + # 3 pos index for cls branch + featmap_sizes = flatten_points.new_tensor(featmap_sizes) + + if self.more_pos: + pos_inds, cls_labels = self.add_cls_pos_inds( + flatten_points, flatten_bbox_preds, featmap_sizes, + batch_gt_instances) + else: + pos_inds = self._get_label_inds(batch_gt_instances, + batch_img_metas, featmap_sizes) + + # 4 calc cls loss + if pos_inds is None: + # num_gts=0 + num_pos_cls = bbox_preds[0].new_tensor(0, dtype=torch.float) + else: + num_pos_cls = bbox_preds[0].new_tensor( + len(pos_inds), dtype=torch.float) + num_pos_cls = max(reduce_mean(num_pos_cls), 1.0) + + cat_agn_cls_targets = cls_targets.max(dim=1)[0] # M + + cls_pos_loss, cls_neg_loss = self.loss_cls( + flatten_cls_scores.squeeze(1), cat_agn_cls_targets, pos_inds, + num_pos_cls) + + # 5 calc reg loss + pos_bbox_inds = torch.nonzero( + bbox_targets.max(dim=1)[0] >= 0).squeeze(1) + pos_bbox_preds = flatten_bbox_preds[pos_bbox_inds] + pos_bbox_targets = bbox_targets[pos_bbox_inds] + + bbox_weight_map = cls_targets.max(dim=1)[0] + bbox_weight_map = bbox_weight_map[pos_bbox_inds] + bbox_weight_map = bbox_weight_map if self.soft_weight_on_reg \ + else torch.ones_like(bbox_weight_map) + + num_pos_bbox = max(reduce_mean(bbox_weight_map.sum()), 1.0) + + if len(pos_bbox_inds) > 0: + bbox_loss = self.loss_bbox( + pos_bbox_preds, + pos_bbox_targets, + bbox_weight_map, + reduction='sum') / num_pos_bbox + else: + bbox_loss = flatten_bbox_preds.sum() * 0 + + return dict( + loss_bbox=bbox_loss, + loss_cls_pos=cls_pos_loss, + loss_cls_neg=cls_neg_loss) + + def loss_and_predict( + self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + proposal_cfg: Optional[ConfigDict] = None + ) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. + + Args: + x (tuple[Tensor]): Features from FPN. 
+ batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + proposal_cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + + Returns: + tuple: the return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + outs = self(x) + + loss_inputs = outs + (batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + losses = self.loss_by_feat(*loss_inputs) + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg) + return losses, predictions + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
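+
+        Note:
+            Scores are the square root of the sigmoid heatmap responses, and
+            boxes are decoded as the prior location minus / plus the predicted
+            (l, t, r, b) distances, mirroring the CenterNet2 proposal
+            generator.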
+ """ + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_labels = [] + + for level_idx, (cls_score, bbox_pred, score_factor, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + score_factor_list, mlvl_priors)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + bbox_pred = bbox_pred * self.strides[level_idx] + + dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + heatmap = cls_score.sigmoid() + score_thr = cfg.get('score_thr', 0) + + candidate_inds = heatmap > score_thr # 0.05 + pre_nms_top_n = candidate_inds.sum() # N + pre_nms_top_n = pre_nms_top_n.clamp(max=nms_pre) # N + + heatmap = heatmap[candidate_inds] # n + + candidate_nonzeros = candidate_inds.nonzero() # n + box_loc = candidate_nonzeros[:, 0] # n + labels = candidate_nonzeros[:, 1] # n + + bbox_pred = bbox_pred[box_loc] # n x 4 + per_grids = priors[box_loc] # n x 2 + + if candidate_inds.sum().item() > pre_nms_top_n.item(): + heatmap, top_k_indices = \ + heatmap.topk(pre_nms_top_n, sorted=False) + labels = labels[top_k_indices] + bbox_pred = bbox_pred[top_k_indices] + per_grids = per_grids[top_k_indices] + + bboxes = torch.stack([ + per_grids[:, 0] - bbox_pred[:, 0], + per_grids[:, 1] - bbox_pred[:, 1], + per_grids[:, 0] + bbox_pred[:, 2], + per_grids[:, 1] + bbox_pred[:, 3], + ], + dim=1) # n x 4 + + # avoid invalid boxes in RoI heads + bboxes[:, 2] = torch.max(bboxes[:, 2], bboxes[:, 0] + 0.01) + bboxes[:, 3] = torch.max(bboxes[:, 3], bboxes[:, 1] + 0.01) + + # bboxes = self.bbox_coder.decode(per_grids, bbox_pred) + # # avoid invalid boxes in RoI heads + # bboxes[:, 2] = torch.max(bboxes[:, 2], bboxes[:, 0] + 0.01) + # bboxes[:, 3] = torch.max(bboxes[:, 3], bboxes[:, 1] + 0.01) + + mlvl_bbox_preds.append(bboxes) + mlvl_valid_priors.append(priors) + mlvl_scores.append(torch.sqrt(heatmap)) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bbox_preds) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def _get_label_inds(self, batch_gt_instances, batch_img_metas, + shapes_per_level): + ''' + Inputs: + batch_gt_instances: [n_i], sum n_i = N + shapes_per_level: L x 2 [(h_l, w_l)]_L + Returns: + pos_inds: N' + labels: N' + ''' + pos_inds = [] + L = len(self.strides) + B = len(batch_gt_instances) + shapes_per_level = shapes_per_level.long() + loc_per_level = (shapes_per_level[:, 0] * + shapes_per_level[:, 1]).long() # L + level_bases = [] + s = 0 + for i in range(L): + level_bases.append(s) + s = s + B * loc_per_level[i] + level_bases = shapes_per_level.new_tensor(level_bases).long() # L + strides_default = shapes_per_level.new_tensor( + self.strides).float() # L + for im_i in range(B): + targets_per_im = batch_gt_instances[im_i] + if hasattr(targets_per_im, 'bboxes'): + bboxes = targets_per_im.bboxes # n x 4 + else: + bboxes = targets_per_im.labels.new_tensor( + [], dtype=torch.float).reshape(-1, 4) + n = bboxes.shape[0] + centers = ((bboxes[:, [0, 1]] + bboxes[:, [2, 3]]) / 2) # n x 2 + centers = centers.view(n, 1, 2).expand(n, L, 2).contiguous() + if self.not_clamp_box: + h, w = batch_img_metas[im_i]._image_size + centers[:, :, 0].clamp_(min=0).clamp_(max=w 
- 1) + centers[:, :, 1].clamp_(min=0).clamp_(max=h - 1) + strides = strides_default.view(1, L, 1).expand(n, L, 2) + centers_inds = (centers / strides).long() # n x L x 2 + Ws = shapes_per_level[:, 1].view(1, L).expand(n, L) + pos_ind = level_bases.view(1, L).expand(n, L) \ + + im_i * loc_per_level.view(1, L).expand(n, L) \ + + centers_inds[:, :, 1] * Ws + centers_inds[:, :, 0] # n x L + is_cared_in_the_level = self.assign_fpn_level(bboxes) + pos_ind = pos_ind[is_cared_in_the_level].view(-1) + + pos_inds.append(pos_ind) # n' + pos_inds = torch.cat(pos_inds, dim=0).long() + return pos_inds # N, N + + def assign_fpn_level(self, boxes): + ''' + Inputs: + boxes: n x 4 + size_ranges: L x 2 + Return: + is_cared_in_the_level: n x L + ''' + size_ranges = boxes.new_tensor(self.regress_ranges).view( + len(self.regress_ranges), 2) # L x 2 + crit = ((boxes[:, 2:] - boxes[:, :2])**2).sum(dim=1)**0.5 / 2 # n + n, L = crit.shape[0], size_ranges.shape[0] + crit = crit.view(n, 1).expand(n, L) + size_ranges_expand = size_ranges.view(1, L, 2).expand(n, L, 2) + is_cared_in_the_level = (crit >= size_ranges_expand[:, :, 0]) & \ + (crit <= size_ranges_expand[:, :, 1]) + return is_cared_in_the_level + + def _get_targets_single(self, gt_instances: InstanceData, points: Tensor, + regress_ranges: Tensor, + strides: Tensor) -> Tuple[Tensor, Tensor]: + """Compute classification and bbox targets for a single image.""" + num_points = points.size(0) + num_gts = len(gt_instances) + gt_labels = gt_instances.labels + + if not hasattr(gt_instances, 'bboxes'): + gt_bboxes = gt_labels.new_tensor([], dtype=torch.float) + else: + gt_bboxes = gt_instances.bboxes + + if not hasattr(gt_instances, 'bboxes') or num_gts == 0: + return gt_labels.new_full((num_points, + self.num_classes), + self.num_classes, + dtype=torch.float), \ + gt_bboxes.new_full((num_points, 4), -1) + + # Calculate the regression tblr target corresponding to all points + points = points[:, None].expand(num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + strides = strides[:, None, None].expand(num_points, num_gts, 2) + + bbox_target = bbox2distance(points, gt_bboxes) # M x N x 4 + + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_target.min(dim=2)[0] > 0 # M x N + + # condition2: Calculate the nearest points from + # the upper, lower, left and right ranges from + # the center of the gt bbox + centers = ((gt_bboxes[..., [0, 1]] + gt_bboxes[..., [2, 3]]) / 2) + centers_discret = ((centers / strides).int() * strides).float() + \ + strides / 2 + + centers_discret_dist = points - centers_discret + dist_x = centers_discret_dist[..., 0].abs() + dist_y = centers_discret_dist[..., 1].abs() + inside_gt_center3x3_mask = (dist_x <= strides[..., 0]) & \ + (dist_y <= strides[..., 0]) + + # condition3: limit the regression range for each location + bbox_target_wh = bbox_target[..., :2] + bbox_target[..., 2:] + crit = (bbox_target_wh**2).sum(dim=2)**0.5 / 2 + inside_fpn_level_mask = (crit >= regress_ranges[:, [0]]) & \ + (crit <= regress_ranges[:, [1]]) + bbox_target_mask = inside_gt_bbox_mask & \ + inside_gt_center3x3_mask & \ + inside_fpn_level_mask + + # Calculate the distance weight map + gt_center_peak_mask = ((centers_discret_dist**2).sum(dim=2) == 0) + weighted_dist = ((points - centers)**2).sum(dim=2) # M x N + weighted_dist[gt_center_peak_mask] = 0 + + areas = (gt_bboxes[..., 2] - gt_bboxes[..., 0]) * ( + gt_bboxes[..., 3] - gt_bboxes[..., 1]) + radius = self.delta**2 * 2 * areas + radius = torch.clamp(radius, 
min=self.hm_min_radius**2) + weighted_dist = weighted_dist / radius + + # Calculate bbox_target + bbox_weighted_dist = weighted_dist.clone() + bbox_weighted_dist[bbox_target_mask == 0] = INF * 1.0 + min_dist, min_inds = bbox_weighted_dist.min(dim=1) + bbox_target = bbox_target[range(len(bbox_target)), + min_inds] # M x N x 4 --> M x 4 + bbox_target[min_dist == INF] = -INF + + # Convert to feature map scale + bbox_target /= strides[:, 0, :].repeat(1, 2) + + # Calculate cls_target + cls_target = self._create_heatmaps_from_dist(weighted_dist, gt_labels) + + return cls_target, bbox_target diff --git a/mmdetection/projects/Detic_new/detic/detic.py b/mmdetection/projects/Detic_new/detic/detic.py new file mode 100644 index 0000000..7028690 --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/detic.py @@ -0,0 +1,274 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.logging import print_log +from torch import Tensor + +from mmdet.datasets import LVISV1Dataset +from mmdet.models.detectors.cascade_rcnn import CascadeRCNN +from mmdet.registry import MODELS +from mmdet.structures import SampleList + + +class CLIPTextEncoder(nn.Module): + + def __init__(self, model_name='ViT-B/32'): + super().__init__() + import clip + from clip.simple_tokenizer import SimpleTokenizer + self.tokenizer = SimpleTokenizer() + pretrained_model, _ = clip.load(model_name, device='cpu') + self.clip = pretrained_model + + @property + def device(self): + return self.clip.device + + @property + def dtype(self): + return self.clip.dtype + + def tokenize(self, + texts: Union[str, List[str]], + context_length: int = 77) -> torch.LongTensor: + if isinstance(texts, str): + texts = [texts] + + sot_token = self.tokenizer.encoder['<|startoftext|>'] + eot_token = self.tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] + for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + st = torch.randint(len(tokens) - context_length + 1, + (1, ))[0].item() + tokens = tokens[st:st + context_length] + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + + def forward(self, text): + text = self.tokenize(text) + text_features = self.clip.encode_text(text) + return text_features + + +def get_class_weight(original_caption, prompt_prefix='a '): + if isinstance(original_caption, str): + if original_caption == 'coco': + from mmdet.datasets import CocoDataset + class_names = CocoDataset.METAINFO['classes'] + elif original_caption == 'cityscapes': + from mmdet.datasets import CityscapesDataset + class_names = CityscapesDataset.METAINFO['classes'] + elif original_caption == 'voc': + from mmdet.datasets import VOCDataset + class_names = VOCDataset.METAINFO['classes'] + elif original_caption == 'openimages': + from mmdet.datasets import OpenImagesDataset + class_names = OpenImagesDataset.METAINFO['classes'] + elif original_caption == 'lvis': + from mmdet.datasets import LVISV1Dataset + class_names = LVISV1Dataset.METAINFO['classes'] + else: + if not original_caption.endswith('.'): + original_caption = original_caption + ' . ' + original_caption = original_caption.split(' . 
') + class_names = list(filter(lambda x: len(x) > 0, original_caption)) + + # for test.py + else: + class_names = list(original_caption) + + text_encoder = CLIPTextEncoder() + text_encoder.eval() + texts = [prompt_prefix + x for x in class_names] + print_log(f'Computing text embeddings for {len(class_names)} classes.') + embeddings = text_encoder(texts).detach().permute(1, 0).contiguous().cpu() + return class_names, embeddings + + +def reset_cls_layer_weight(roi_head, weight): + if type(weight) == str: + print_log(f'Resetting cls_layer_weight from file: {weight}') + zs_weight = torch.tensor( + np.load(weight), + dtype=torch.float32).permute(1, 0).contiguous() # D x C + else: + zs_weight = weight + zs_weight = torch.cat( + [zs_weight, zs_weight.new_zeros( + (zs_weight.shape[0], 1))], dim=1) # D x (C + 1) + zs_weight = F.normalize(zs_weight, p=2, dim=0) + zs_weight = zs_weight.to('cuda') + num_classes = zs_weight.shape[-1] + + for bbox_head in roi_head.bbox_head: + bbox_head.num_classes = num_classes + del bbox_head.fc_cls.zs_weight + bbox_head.fc_cls.zs_weight = zs_weight + + +@MODELS.register_module() +class Detic(CascadeRCNN): + + def __init__(self, + with_image_labels: bool = False, + sync_caption_batch: bool = False, + fp16: bool = False, + roi_head_name: str = '', + cap_batch_ratio: int = 4, + with_caption: bool = False, + dynamic_classifier: bool = False, + **kwargs) -> None: + super().__init__(**kwargs) + + self._entities = LVISV1Dataset.METAINFO['classes'] + self._text_prompts = None + # Turn on co-training with classification data + self.with_image_labels = with_image_labels + # Caption losses + self.with_caption = with_caption + # synchronize across GPUs to enlarge # "classes" + self.sync_caption_batch = sync_caption_batch + # Ratio between detection data and caption data + self.cap_batch_ratio = cap_batch_ratio + self.fp16 = fp16 + self.roi_head_name = roi_head_name + # dynamic class sampling when training with 21K classes, + # Federated loss is enabled when DYNAMIC_CLASSIFIER is on + self.dynamic_classifier = dynamic_classifier + self.return_proposal = False + if self.dynamic_classifier: + self.freq_weight = kwargs.pop('freq_weight') + self.num_classes = kwargs.pop('num_classes') + self.num_sample_cats = kwargs.pop('num_sample_cats') + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + Returns: + dict: A dictionary of loss components + """ + + x = self.extract_feat(batch_inputs) + losses = dict() + + # RPN forward and loss + if self.with_rpn: + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + rpn_data_samples = copy.deepcopy(batch_data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) + + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in list(keys): + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + # if not hasattr(batch_data_samples[0].gt_instances, 'bboxes'): + # losses.update({k: v * 0 for k, v in rpn_losses.items()}) + # else: + # losses.update(rpn_losses) + else: + assert batch_data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + roi_losses = self.roi_head.loss(x, rpn_results_list, + batch_data_samples) + + losses.update(roi_losses) + + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Return the detection results of the + input images. The returns value is DetDataSample, + which usually contain 'pred_instances'. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + # For single image inference + if 'custom_entities' in batch_data_samples[0]: + text_prompts = batch_data_samples[0].text + if text_prompts != self._text_prompts: + self._text_prompts = text_prompts + class_names, zs_weight = get_class_weight(text_prompts) + self._entities = class_names + reset_cls_layer_weight(self.roi_head, zs_weight) + + assert self.with_bbox, 'Bbox head must be implemented.' 
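+        # Standard two-stage inference follows: RPN proposals (unless the data
+        # samples already carry `proposals`), then the cascade RoI head;
+        # predicted label indices are mapped back to self._entities for
+        # visualization.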
+ + x = self.extract_feat(batch_inputs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + rpn_results_list = self.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + results_list = self.roi_head.predict( + x, rpn_results_list, batch_data_samples, rescale=rescale) + + for data_sample, pred_instances in zip(batch_data_samples, + results_list): + if len(pred_instances) > 0: + label_names = [] + for labels in pred_instances.labels: + label_names.append(self._entities[labels]) + # for visualization + pred_instances.label_names = label_names + data_sample.pred_instances = pred_instances + + return batch_data_samples diff --git a/mmdetection/projects/Detic_new/detic/detic_bbox_head.py b/mmdetection/projects/Detic_new/detic/detic_bbox_head.py new file mode 100644 index 0000000..8779494 --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/detic_bbox_head.py @@ -0,0 +1,434 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +from typing import List, Optional + +import torch +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor +from torch.nn import functional as F + +from mmdet.models.layers import multiclass_nms +from mmdet.models.losses import accuracy +from mmdet.models.roi_heads.bbox_heads import Shared2FCBBoxHead +from mmdet.models.utils import empty_instances +from mmdet.registry import MODELS +from mmdet.structures.bbox import get_box_tensor, scale_boxes +from mmdet.utils import ConfigType, InstanceList + + +def load_class_freq(path='datasets/metadata/lvis_v1_train_cat_info.json', + freq_weight=0.5): + cat_info = json.load(open(path, 'r')) + cat_info = torch.tensor( + [c['image_count'] for c in sorted(cat_info, key=lambda x: x['id'])]) + freq_weight = cat_info.float()**freq_weight + return freq_weight + + +def get_fed_loss_inds(labels, num_sample_cats, C, weight=None): + + appeared = torch.unique(labels) # C' + prob = appeared.new_ones(C + 1).float() + prob[-1] = 0 + if len(appeared) < num_sample_cats: + if weight is not None: + prob[:C] = weight.float().clone() + prob[appeared] = 0 + more_appeared = torch.multinomial( + prob, num_sample_cats - len(appeared), replacement=False) + appeared = torch.cat([appeared, more_appeared]) + return appeared + + +@MODELS.register_module() +class DeticBBoxHead(Shared2FCBBoxHead): + + def __init__(self, + image_loss_weight: float = 0.1, + use_fed_loss: bool = False, + cat_freq_path: str = '', + fed_loss_freq_weight: float = 0.5, + fed_loss_num_cat: int = 50, + cls_predictor_cfg: ConfigType = dict( + type='ZeroShotClassifier'), + *args, + **kwargs) -> None: + super().__init__(*args, **kwargs) + # reconstruct fc_cls and fc_reg since input channels are changed + assert self.with_cls + + self.cls_predictor_cfg = cls_predictor_cfg + cls_channels = self.num_classes + self.cls_predictor_cfg.update( + in_features=self.cls_last_dim, out_features=cls_channels) + self.fc_cls = MODELS.build(self.cls_predictor_cfg) + + self.init_cfg += [ + dict(type='Caffe2Xavier', override=dict(name='reg_fcs')) + ] + + self.image_loss_weight = image_loss_weight + self.use_fed_loss = use_fed_loss + self.cat_freq_path = cat_freq_path + self.fed_loss_freq_weight = fed_loss_freq_weight + self.fed_loss_num_cat = fed_loss_num_cat + + if self.use_fed_loss: + freq_weight = load_class_freq(cat_freq_path, fed_loss_freq_weight) + 
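+            # freq_weight holds the per-class LVIS image counts raised to
+            # fed_loss_freq_weight (0.5 by default); get_fed_loss_inds uses it
+            # to sample additional negative categories for the federated loss.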
self.register_buffer('freq_weight', freq_weight) + else: + self.freq_weight = None + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. + has shape (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image\ + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results = InstanceData() + if roi.shape[0] == 0: + return empty_instances([img_meta], + roi.device, + task_type='bbox', + instance_results=[results], + box_type=self.predict_box_type, + use_box_type=False, + num_classes=self.num_classes, + score_per_cls=rcnn_test_cfg is None)[0] + scores = cls_score + img_shape = img_meta['img_shape'] + num_rois = roi.size(0) + + num_classes = 1 if self.reg_class_agnostic else self.num_classes + roi = roi.repeat_interleave(num_classes, dim=0) + bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size) + bboxes = self.bbox_coder.decode( + roi[..., 1:], bbox_pred, max_shape=img_shape) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + bboxes = scale_boxes(bboxes, scale_factor) + + # Get the inside tensor when `bboxes` is a box type + bboxes = get_box_tensor(bboxes) + box_dim = bboxes.size(-1) + bboxes = bboxes.view(num_rois, -1) + + if rcnn_test_cfg is None: + # This means that it is aug test. + # It needs to return the raw results without nms. + results.bboxes = bboxes + results.scores = scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, + scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img, + box_dim=box_dim) + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results + + def loss(self, + cls_score: Tensor, + bbox_pred: Tensor, + rois: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + bbox_weights: Tensor, + reduction_override: Optional[str] = None) -> dict: + """Calculate the loss based on the network predictions and targets. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): Regression prediction results, + has shape + (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. 
+ labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, ). + label_weights (Tensor): Labels_weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, ). + bbox_targets (Tensor): Regression target for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, 4), + the last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + bbox_weights (Tensor): Regression weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, 4). + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None, + + Returns: + dict: A dictionary of loss. + """ + + losses = dict() + + if cls_score is not None: + + if cls_score.numel() > 0: + loss_cls_ = self.sigmoid_cross_entropy_loss(cls_score, labels) + if isinstance(loss_cls_, dict): + losses.update(loss_cls_) + else: + losses['loss_cls'] = loss_cls_ + if self.custom_activation: + acc_ = self.loss_cls.get_accuracy(cls_score, labels) + losses.update(acc_) + else: + losses['acc'] = accuracy(cls_score, labels) + if bbox_pred is not None: + bg_class_ind = self.num_classes + # 0~self.num_classes-1 are FG, self.num_classes is BG + pos_inds = (labels >= 0) & (labels < bg_class_ind) + # do not perform bounding box regression for BG anymore. + if pos_inds.any(): + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, + # `GIouLoss`, `DIouLoss`) is applied directly on + # the decoded bounding boxes, it decodes the + # already encoded coordinates to absolute format. + bbox_pred = self.bbox_coder.decode(rois[:, 1:], bbox_pred) + bbox_pred = get_box_tensor(bbox_pred) + if self.reg_class_agnostic: + pos_bbox_pred = bbox_pred.view( + bbox_pred.size(0), -1)[pos_inds.type(torch.bool)] + else: + pos_bbox_pred = bbox_pred.view( + bbox_pred.size(0), self.num_classes, + -1)[pos_inds.type(torch.bool), + labels[pos_inds.type(torch.bool)]] + + losses['loss_bbox'] = self.loss_bbox( + pos_bbox_pred, + bbox_targets[pos_inds.type(torch.bool)], + bbox_weights[pos_inds.type(torch.bool)], + avg_factor=bbox_targets.size(0), + reduction_override=reduction_override) + else: + losses['loss_bbox'] = bbox_pred[pos_inds].sum() + return losses + + def sigmoid_cross_entropy_loss(self, cls_score, labels): + if cls_score.numel() == 0: + return cls_score.new_zeros( + [1])[0] # This is more robust than .sum() * 0. 
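+        # Per-class binary cross-entropy: the last logit (background) is
+        # dropped, targets are one-hot over the C foreground classes, and,
+        # when federated loss is enabled, the loss is masked to the sampled
+        # category subset.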
+ B = cls_score.shape[0] + C = cls_score.shape[1] - 1 + + target = cls_score.new_zeros(B, C + 1) + target[range(len(labels)), labels] = 1 # B x (C + 1) + target = target[:, :C] # B x C + + weight = 1 + if self.use_fed_loss and (self.freq_weight is not None): # fedloss + appeared = get_fed_loss_inds( + labels, + num_sample_cats=self.fed_loss_num_cat, + C=C, + weight=self.freq_weight) + appeared_mask = appeared.new_zeros(C + 1) + appeared_mask[appeared] = 1 # C + 1 + appeared_mask = appeared_mask[:C] + fed_w = appeared_mask.view(1, C).expand(B, C) + weight = weight * fed_w.float() + # if self.ignore_zero_cats and (self.freq_weight is not None): + # w = (self.freq_weight.view(-1) > 1e-4).float() + # weight = weight * w.view(1, C).expand(B, C) + # # import pdb; pdb.set_trace() + + cls_loss = F.binary_cross_entropy_with_logits( + cls_score[:, :-1], target, reduction='none') # B x C + loss = torch.sum(cls_loss * weight) / B + return loss + + def image_label_losses(self, cls_score, sampling_results, image_labels): + ''' + Inputs: + cls_score: N x (C + 1) + image_labels B x 1 + ''' + num_inst_per_image = [ + len(pred_instances) for pred_instances in sampling_results + ] + cls_score = cls_score.split( + num_inst_per_image, dim=0) # B x n x (C + 1) + B = len(cls_score) + loss = cls_score[0].new_zeros([1])[0] + for (score, labels, pred_instances) in zip(cls_score, image_labels, + sampling_results): + if score.shape[0] == 0: + loss += score.new_zeros([1])[0] + continue + # find out max-size idx + bboxes = pred_instances.bboxes + areas = (bboxes[:, 2] - bboxes[:, 0]) * ( + bboxes[:, 3] - bboxes[:, 1]) + idx = areas[:-1].argmax().item() if len(areas) > 1 else 0 + + for label in labels: + target = score.new_zeros(score.shape[1]) + target[label] = 1 + loss_i = F.binary_cross_entropy_with_logits( + score[idx], target, reduction='sum') + loss += loss_i / len(labels) + loss = loss / B + + return loss * self.image_loss_weight + + def refine_bboxes(self, bbox_results: dict, + batch_img_metas: List[dict]) -> InstanceList: + """Refine bboxes during training. + + Args: + bbox_results (dict): Usually is a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + batch_img_metas (List[dict]): List of image information. + + Returns: + list[:obj:`InstanceData`]: Refined bboxes of each image. + + Example: + >>> # xdoctest: +REQUIRES(module:kwarray) + >>> import numpy as np + >>> from mmdet.models.task_modules.samplers. + ... sampling_result import random_boxes + >>> from mmdet.models.task_modules.samplers import SamplingResult + >>> self = BBoxHead(reg_class_agnostic=True) + >>> n_roi = 2 + >>> n_img = 4 + >>> scale = 512 + >>> rng = np.random.RandomState(0) + ... batch_img_metas = [{'img_shape': (scale, scale)} + >>> for _ in range(n_img)] + >>> sampling_results = [SamplingResult.random(rng=10) + ... 
for _ in range(n_img)] + >>> # Create rois in the expected format + >>> roi_boxes = random_boxes(n_roi, scale=scale, rng=rng) + >>> img_ids = torch.randint(0, n_img, (n_roi,)) + >>> img_ids = img_ids.float() + >>> rois = torch.cat([img_ids[:, None], roi_boxes], dim=1) + >>> # Create other args + >>> labels = torch.randint(0, 81, (scale,)).long() + >>> bbox_preds = random_boxes(n_roi, scale=scale, rng=rng) + >>> cls_score = torch.randn((scale, 81)) + ... # For each image, pretend random positive boxes are gts + >>> bbox_targets = (labels, None, None, None) + ... bbox_results = dict(rois=rois, bbox_pred=bbox_preds, + ... cls_score=cls_score, + ... bbox_targets=bbox_targets) + >>> bboxes_list = self.refine_bboxes(sampling_results, + ... bbox_results, + ... batch_img_metas) + >>> print(bboxes_list) + """ + # bbox_targets is a tuple + cls_scores = bbox_results['cls_score'] + rois = bbox_results['rois'] + bbox_preds = bbox_results['bbox_pred'] + if self.custom_activation: + # TODO: Create a SeasawBBoxHead to simplified logic in BBoxHead + cls_scores = self.loss_cls.get_activation(cls_scores) + if cls_scores.numel() == 0: + return None + if cls_scores.shape[-1] == self.num_classes + 1: + # remove background class + cls_scores = cls_scores[:, :-1] + elif cls_scores.shape[-1] != self.num_classes: + raise ValueError('The last dim of `cls_scores` should equal to ' + '`num_classes` or `num_classes + 1`,' + f'but got {cls_scores.shape[-1]}.') + + img_ids = rois[:, 0].long().unique(sorted=True) + assert img_ids.numel() <= len(batch_img_metas) + + results_list = [] + for i in range(len(batch_img_metas)): + inds = torch.nonzero( + rois[:, 0] == i, as_tuple=False).squeeze(dim=1) + + bboxes_ = rois[inds, 1:] + bbox_pred_ = bbox_preds[inds] + img_meta_ = batch_img_metas[i] + + bboxes = self.regress(bboxes_, bbox_pred_, img_meta_) + + # don't filter gt bboxes like D2 + results = InstanceData(bboxes=bboxes) + results_list.append(results) + + return results_list + + def regress(self, priors: Tensor, bbox_pred: Tensor, + img_meta: dict) -> Tensor: + """Regress the bbox for the predicted class. Used in Cascade R-CNN. + + Args: + priors (Tensor): Priors from `rpn_head` or last stage + `bbox_head`, has shape (num_proposals, 4). + label (Tensor): Only used when `self.reg_class_agnostic` + is False, has shape (num_proposals, ). + bbox_pred (Tensor): Regression prediction of + current stage `bbox_head`. When `self.reg_class_agnostic` + is False, it has shape (n, num_classes * 4), otherwise + it has shape (n, 4). + img_meta (dict): Image meta info. + + Returns: + Tensor: Regressed bboxes, the same shape as input rois. + """ + reg_dim = self.bbox_coder.encode_size + assert bbox_pred.size()[1] == reg_dim + + max_shape = img_meta['img_shape'] + regressed_bboxes = self.bbox_coder.decode( + priors, bbox_pred, max_shape=max_shape) + return regressed_bboxes diff --git a/mmdetection/projects/Detic_new/detic/detic_roi_head.py b/mmdetection/projects/Detic_new/detic/detic_roi_head.py new file mode 100644 index 0000000..35785cd --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/detic_roi_head.py @@ -0,0 +1,440 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
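+# DeticRoIHead extends CascadeRoIHead with Detic-specific behavior: sigmoid
+# class scores averaged across cascade stages, proposal-score reweighting at
+# test time, and optional weak supervision from image-level labels.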
+from typing import List, Sequence, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.roi_heads import CascadeRoIHead +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.test_time_augs import merge_aug_masks +from mmdet.models.utils import empty_instances, unpack_gt_instances +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi, get_box_tensor +from mmdet.utils import ConfigType, InstanceList, MultiConfig + + +@MODELS.register_module() +class DeticRoIHead(CascadeRoIHead): + + def __init__( + self, + *, + mult_proposal_score: bool = False, + with_image_labels: bool = False, + add_image_box: bool = False, + image_box_size: float = 1.0, + ws_num_props: int = 128, + add_feature_to_prop: bool = False, + mask_weight: float = 1.0, + one_class_per_proposal: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + self.mult_proposal_score = mult_proposal_score + self.with_image_labels = with_image_labels + self.add_image_box = add_image_box + self.image_box_size = image_box_size + self.ws_num_props = ws_num_props + self.add_feature_to_prop = add_feature_to_prop + self.mask_weight = mask_weight + self.one_class_per_proposal = one_class_per_proposal + + def init_mask_head(self, mask_roi_extractor: MultiConfig, + mask_head: MultiConfig) -> None: + """Initialize mask head and mask roi extractor. + + Args: + mask_head (dict): Config of mask in mask head. + mask_roi_extractor (:obj:`ConfigDict`, dict or list): + Config of mask roi extractor. + """ + self.mask_head = MODELS.build(mask_head) + + if mask_roi_extractor is not None: + self.share_roi_extractor = False + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + + def _refine_roi(self, x: Tuple[Tensor], rois: Tensor, + batch_img_metas: List[dict], + num_proposals_per_img: Sequence[int], **kwargs) -> tuple: + """Multi-stage refinement of RoI. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2] + batch_img_metas (list[dict]): List of image information. + num_proposals_per_img (sequence[int]): number of proposals + in each image. + + Returns: + tuple: + + - rois (Tensor): Refined RoI. + - cls_scores (list[Tensor]): Average predicted + cls score per image. + - bbox_preds (list[Tensor]): Bbox branch predictions + for the last stage of per image. + """ + # "ms" in variable names means multi-stage + ms_scores = [] + for stage in range(self.num_stages): + bbox_results = self._bbox_forward( + stage=stage, x=x, rois=rois, **kwargs) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'].sigmoid() + bbox_preds = bbox_results['bbox_pred'] + + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + ms_scores.append(cls_scores) + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + refine_rois_list = [] + for i in range(len(batch_img_metas)): + if rois[i].shape[0] > 0: + bbox_label = cls_scores[i][:, :-1].argmax(dim=1) + # Refactor `bbox_head.regress_by_class` to only accept + # box tensor without img_idx concatenated. 
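+                        # Between stages, each kept RoI is refined with the
+                        # box delta of its current highest-scoring foreground
+                        # class and passed to the next stage; the sigmoid
+                        # scores of all stages are averaged afterwards.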
+ refined_bboxes = bbox_head.regress_by_class( + rois[i][:, 1:], bbox_label, bbox_preds[i], + batch_img_metas[i]) + refined_bboxes = get_box_tensor(refined_bboxes) + refined_rois = torch.cat( + [rois[i][:, [0]], refined_bboxes], dim=1) + refine_rois_list.append(refined_rois) + rois = torch.cat(refine_rois_list) + # ms_scores aligned + # average scores of each image by stages + cls_scores = [ + sum([score[i] for score in ms_scores]) / float(len(ms_scores)) + for i in range(len(batch_img_metas)) + ] # aligned + return rois, cls_scores, bbox_preds + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False, + **kwargs) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + proposals = [res.bboxes for res in rpn_results_list] + proposal_scores = [res.scores for res in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, + rois.device, + task_type='bbox', + box_type=self.bbox_head[-1].predict_box_type, + num_classes=self.bbox_head[-1].num_classes, + score_per_cls=rcnn_test_cfg is None) + # rois aligned + rois, cls_scores, bbox_preds = self._refine_roi( + x=x, + rois=rois, + batch_img_metas=batch_img_metas, + num_proposals_per_img=num_proposals_per_img, + **kwargs) + + # score reweighting in centernet2 + cls_scores = [(s * ps[:, None])**0.5 + for s, ps in zip(cls_scores, proposal_scores)] + # # for demo + # cls_scores = [ + # s * (s == s[:, :-1].max(dim=1)[0][:, None]).float() + # for s in cls_scores + # ] + + # fast_rcnn_inference + results_list = self.bbox_head[-1].predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rescale=rescale, + rcnn_test_cfg=rcnn_test_cfg) + return results_list + + def _mask_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Mask head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. 
+ """ + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + # do not support caffe_c4 model anymore + mask_preds = self.mask_head(mask_feats) + + mask_results = dict(mask_preds=mask_preds) + return mask_results + + def mask_loss(self, x, sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `loss_mask` (dict): A dictionary of mask loss components. + """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[-1]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + num_imgs = len(batch_data_samples) + image_labels = [x.gt_instances.labels for x in batch_data_samples] + losses = dict() + results_list = rpn_results_list + + for stage in range(self.num_stages): + self.current_stage = stage + stage_loss_weight = self.stage_loss_weights[stage] + if hasattr(batch_gt_instances[0], 'bboxes'): + # assign gts and sample proposals + sampling_results = [] + if self.with_bbox or self.with_mask: + bbox_assigner = self.bbox_assigner[stage] + bbox_sampler = self.bbox_sampler[stage] + + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + results.priors = results.pop('bboxes') + + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self.bbox_loss(stage, x, sampling_results) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + losses[f's{stage}.image_loss'] = x[0].new_zeros([1])[0] + + # mask head forward and loss + # D2 only forward stage.0 + if self.with_mask and stage == 0: + mask_results = self.mask_loss(x, sampling_results, + batch_gt_instances) + for name, value in mask_results['loss_mask'].items(): + losses[name] = ( + value * + stage_loss_weight if 'loss' in name else value) + + else: + # get ws_num_props pred_instances for each image + sampling_results = [ + pred_instances[:self.ws_num_props] + for pred_instances in results_list + ] + for i, pred_instances in enumerate(sampling_results): + pred_instances.bboxes = pred_instances.bboxes.detach() + bbox_results = self.image_loss(stage, x, sampling_results, + image_labels) + losses[f's{stage}.image_loss'] = bbox_results['image_loss'] + + for name in ['loss_cls', 'loss_bbox']: + losses[f's{stage}.{name}'] = x[0].new_zeros([1])[0] + if stage == 0: + losses['loss_mask'] = x[0].new_zeros([1])[0] + + # refine bboxes + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + bbox_results, batch_img_metas) + # Empty proposal + if results_list is None: + break + + return losses + + def image_loss(self, stage: int, x: Tuple[Tensor], + sampling_results: List[SamplingResult], + image_labels) -> dict: + """Run forward function and calculate loss for box head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. 
Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + """ + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.bboxes for res in sampling_results]) + bbox_results = self._bbox_forward(stage, x, rois) + bbox_results.update(rois=rois) + + image_loss = bbox_head.image_label_losses( + cls_score=bbox_results['cls_score'], + sampling_results=sampling_results, + image_labels=image_labels) + bbox_results.update(dict(image_loss=image_loss)) + + return bbox_results + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: List[InstanceData], + rescale: bool = False) -> List[InstanceData]: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + num_mask_rois_per_img = [len(res) for res in results_list] + aug_masks = [] + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + aug_masks.append([m.sigmoid().detach() for m in mask_preds]) + + merged_masks = [] + for i in range(len(batch_img_metas)): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results_list = self.mask_head.predict_by_feat( + mask_preds=merged_masks, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale, + activate_map=True) + return results_list diff --git a/mmdetection/projects/Detic_new/detic/heatmap_focal_loss.py b/mmdetection/projects/Detic_new/detic/heatmap_focal_loss.py new file mode 100644 index 0000000..021a5b2 --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/heatmap_focal_loss.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
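Before the loss definitions below, a minimal usage sketch of `heatmap_focal_loss_with_pos_inds` (shapes and the `alpha` value are illustrative assumptions; the function divides by `avg_factor`, so a positive value must be passed):

```python
import torch
# assumes heatmap_focal_loss_with_pos_inds (defined below) is importable

# illustrative shapes: 8 candidate locations x 80 classes
pred = torch.randn(8, 80)           # raw logits; sigmoid is applied in place inside
target = torch.rand(8, 80)          # gaussian heatmap targets in [0, 1]
pos_inds = torch.tensor([2, 5])     # rows treated as positive samples

pos_loss, neg_loss = heatmap_focal_loss_with_pos_inds(
    pred, target, pos_inds,
    alpha=0.25,                      # illustrative value; the module default is 2.0
    avg_factor=max(pos_inds.numel(), 1))
```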
+from typing import Optional, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS + + +# support class-agnostic heatmap_focal_loss +def heatmap_focal_loss_with_pos_inds( + pred: Tensor, + targets: Tensor, + pos_inds: Tensor, + alpha: float = 2.0, + beta: float = 4.0, + gamma: float = 4.0, + sigmoid_clamp: float = 1e-4, + ignore_high_fp: float = -1.0, + pos_weight: float = 1.0, + neg_weight: float = 1.0, + avg_factor: Optional[Union[int, float]] = None) -> Tensor: + + pred = torch.clamp( + pred.sigmoid_(), min=sigmoid_clamp, max=1 - sigmoid_clamp) + + neg_weights = torch.pow(1 - targets, beta) + + pos_pred = pred[pos_inds] + pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, gamma) + neg_loss = torch.log(1 - pred) * torch.pow(pred, gamma) * neg_weights + if ignore_high_fp > 0: + not_high_fp = (pred < ignore_high_fp).float() + neg_loss = not_high_fp * neg_loss + + pos_loss = -pos_loss.sum() + neg_loss = -neg_loss.sum() + if alpha >= 0: + pos_loss = alpha * pos_loss + neg_loss = (1 - alpha) * neg_loss + + pos_loss = pos_weight * pos_loss / avg_factor + neg_loss = neg_weight * neg_loss / avg_factor + + return pos_loss, neg_loss + + +@MODELS.register_module() +class HeatmapFocalLoss(nn.Module): + """GaussianFocalLoss is a variant of focal loss. + + More details can be found in the `paper + `_ + Code is modified from `kp_utils.py + `_ # noqa: E501 + Please notice that the target in GaussianFocalLoss is a gaussian heatmap, + not 0/1 binary target. + + Args: + alpha (float): Power of prediction. + gamma (float): Power of target for negative samples. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Loss weight of current loss. + pos_weight(float): Positive sample loss weight. Defaults to 1.0. + neg_weight(float): Negative sample loss weight. Defaults to 1.0. + """ + + def __init__( + self, + alpha: float = 2.0, + beta: float = 4.0, + gamma: float = 4.0, + sigmoid_clamp: float = 1e-4, + ignore_high_fp: float = -1.0, + loss_weight: float = 1.0, + pos_weight: float = 1.0, + neg_weight: float = 1.0, + ) -> None: + super().__init__() + self.alpha = alpha + self.beta = beta + self.gamma = gamma + self.sigmoid_clamp = sigmoid_clamp + self.ignore_high_fp = ignore_high_fp + self.loss_weight = loss_weight + self.pos_weight = pos_weight + self.neg_weight = neg_weight + + def forward(self, + pred: Tensor, + target: Tensor, + pos_inds: Optional[Tensor] = None, + avg_factor: Optional[Union[int, float]] = None) -> Tensor: + """Forward function. + + If you want to manually determine which positions are + positive samples, you can set the pos_index and pos_label + parameter. Currently, only the CenterNet update version uses + the parameter. + + Args: + pred (torch.Tensor): The prediction. The shape is (N, num_classes). + target (torch.Tensor): The learning target of the prediction + in gaussian distribution. The shape is (N, num_classes). + pos_inds (torch.Tensor): The positive sample index. + Defaults to None. + pos_labels (torch.Tensor): The label corresponding to the positive + sample index. Defaults to None. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, float, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
+ """ + + pos_loss, neg_loss = heatmap_focal_loss_with_pos_inds( + pred, + target, + pos_inds, + alpha=self.alpha, + beta=self.beta, + gamma=self.gamma, + sigmoid_clamp=self.sigmoid_clamp, + ignore_high_fp=self.ignore_high_fp, + pos_weight=self.pos_weight, + neg_weight=self.neg_weight, + avg_factor=avg_factor) + return pos_loss, neg_loss diff --git a/mmdetection/projects/Detic_new/detic/imagenet_lvis.py b/mmdetection/projects/Detic_new/detic/imagenet_lvis.py new file mode 100644 index 0000000..3375a08 --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/imagenet_lvis.py @@ -0,0 +1,395 @@ +# Copyright (c) OpenMMLab. All rights reserved.METAINFO +import copy +import os.path as osp +import pickle +import warnings +from typing import List, Union + +from mmengine.fileio import get_local_path + +from mmdet.datasets import LVISV1Dataset +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class ImageNetLVISV1Dataset(LVISV1Dataset): + """LVIS v1 dataset for detection.""" + + METAINFO = { + 'classes': + ('aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', + 'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', + 'antenna', 'apple', 'applesauce', 'apricot', 'apron', 'aquarium', + 'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor', + 'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer', + 'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy', + 'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel', + 'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon', + 'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo', + 'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow', + 'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap', + 'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)', + 'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)', + 'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie', + 'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper', + 'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', + 'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', + 'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath', + 'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card', + 'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket', + 'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry', + 'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg', + 'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase', + 'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle', + 'bottle_opener', 'bouquet', 'bow_(weapon)', + 'bow_(decorative_ribbons)', 'bow-tie', 'bowl', 'pipe_bowl', + 'bowler_hat', 'bowling_ball', 'box', 'boxing_glove', 'suspenders', + 'bracelet', 'brass_plaque', 'brassiere', 'bread-bin', 'bread', + 'breechcloth', 'bridal_gown', 'briefcase', 'broccoli', 'broach', + 'broom', 'brownie', 'brussels_sprouts', 'bubble_gum', 'bucket', + 'horse_buggy', 'bull', 'bulldog', 'bulldozer', 'bullet_train', + 'bulletin_board', 'bulletproof_vest', 'bullhorn', 'bun', 'bunk_bed', + 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butter', + 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', 'cabinet', + 'locker', 'cake', 'calculator', 'calendar', 'calf', 'camcorder', + 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', 'can', + 'can_opener', 'candle', 'candle_holder', 'candy_bar', 'candy_cane', + 'walking_cane', 'canister', 'canoe', 'cantaloup', 
'canteen', + 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino', + 'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car', + 'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship', + 'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton', + 'cash_register', 'casserole', 'cassette', 'cast', 'cat', + 'cauliflower', 'cayenne_(spice)', 'CD_player', 'celery', + 'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue', + 'chalice', 'chandelier', 'chap', 'checkbook', 'checkerboard', + 'cherry', 'chessboard', 'chicken_(animal)', 'chickpea', + 'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)', + 'poker_chip', 'chocolate_bar', 'chocolate_cake', 'chocolate_milk', + 'chocolate_mousse', 'choker', 'chopping_board', 'chopstick', + 'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette', + 'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent', + 'cleat_(for_securing_rope)', 'clementine', 'clip', 'clipboard', + 'clippers_(for_plants)', 'cloak', 'clock', 'clock_tower', + 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat', + 'coat_hanger', 'coatrack', 'cock', 'cockroach', 'cocoa_(beverage)', + 'coconut', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil', + 'coin', 'colander', 'coleslaw', 'coloring_material', + 'combination_lock', 'pacifier', 'comic_book', 'compass', + 'computer_keyboard', 'condiment', 'cone', 'control', + 'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie', + 'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)', + 'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet', + 'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall', + 'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker', + 'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib', + 'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown', + 'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch', + 'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup', + 'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain', + 'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard', + 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', + 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', + 'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', + 'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup', + 'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin', + 'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', + 'dragonfly', 'drawer', 'underdrawers', 'dress', 'dress_hat', + 'dress_suit', 'dresser', 'drill', 'drone', 'dropper', + 'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling', + 'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', 'eagle', + 'earphone', 'earplug', 'earring', 'easel', 'eclair', 'eel', 'egg', + 'egg_roll', 'egg_yolk', 'eggbeater', 'eggplant', 'electric_chair', + 'refrigerator', 'elephant', 'elk', 'envelope', 'eraser', 'escargot', + 'eyepatch', 'falcon', 'fan', 'faucet', 'fedora', 'ferret', + 'Ferris_wheel', 'ferry', 'fig_(fruit)', 'fighter_jet', 'figurine', + 'file_cabinet', 'file_(tool)', 'fire_alarm', 'fire_engine', + 'fire_extinguisher', 'fire_hose', 'fireplace', 'fireplug', + 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl', 'fishing_rod', + 'flag', 'flagpole', 'flamingo', 'flannel', 'flap', 'flash', + 'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)', + 'flower_arrangement', 'flute_glass', 'foal', 'folding_chair', + 'food_processor', 
'football_(American)', 'football_helmet', + 'footstool', 'fork', 'forklift', 'freight_car', 'French_toast', + 'freshener', 'frisbee', 'frog', 'fruit_juice', 'frying_pan', 'fudge', + 'funnel', 'futon', 'gag', 'garbage', 'garbage_truck', 'garden_hose', + 'gargle', 'gargoyle', 'garlic', 'gasmask', 'gazelle', 'gelatin', + 'gemstone', 'generator', 'giant_panda', 'gift_wrap', 'ginger', + 'giraffe', 'cincture', 'glass_(drink_container)', 'globe', 'glove', + 'goat', 'goggles', 'goldfish', 'golf_club', 'golfcart', + 'gondola_(boat)', 'goose', 'gorilla', 'gourd', 'grape', 'grater', + 'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle', + 'grill', 'grits', 'grizzly', 'grocery_bag', 'guitar', 'gull', 'gun', + 'hairbrush', 'hairnet', 'hairpin', 'halter_top', 'ham', 'hamburger', + 'hammer', 'hammock', 'hamper', 'hamster', 'hair_dryer', 'hand_glass', + 'hand_towel', 'handcart', 'handcuff', 'handkerchief', 'handle', + 'handsaw', 'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', + 'headband', 'headboard', 'headlight', 'headscarf', 'headset', + 'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet', + 'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog', + 'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah', + 'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', + 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', + 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', + 'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board', + 'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey', + 'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak', + 'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono', + 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit', + 'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)', + 'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', + 'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard', + 'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather', + 'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', + 'lettuce', 'license_plate', 'life_buoy', 'life_jacket', 'lightbulb', + 'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor', + 'lizard', 'log', 'lollipop', 'speaker_(stereo_equipment)', 'loveseat', + 'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)', + 'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', + 'manger', 'manhole', 'map', 'marker', 'martini', 'mascot', + 'mashed_potato', 'masher', 'mask', 'mast', 'mat_(gym_equipment)', + 'matchbox', 'mattress', 'measuring_cup', 'measuring_stick', + 'meatball', 'medicine', 'melon', 'microphone', 'microscope', + 'microwave_oven', 'milestone', 'milk', 'milk_can', 'milkshake', + 'minivan', 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', + 'money', 'monitor_(computer_equipment) computer_monitor', 'monkey', + 'motor', 'motor_scooter', 'motor_vehicle', 'motorcycle', + 'mound_(baseball)', 'mouse_(computer_equipment)', 'mousepad', + 'muffin', 'mug', 'mushroom', 'music_stool', 'musical_instrument', + 'nailfile', 'napkin', 'neckerchief', 'necklace', 'necktie', 'needle', + 'nest', 'newspaper', 'newsstand', 'nightshirt', + 'nosebag_(for_animals)', 'noseband_(for_animals)', 'notebook', + 'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)', + 'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion', + 'orange_(fruit)', 'orange_juice', 'ostrich', 'ottoman', 'oven', + 
'overalls_(clothing)', 'owl', 'packet', 'inkpad', 'pad', 'paddle', + 'padlock', 'paintbrush', 'painting', 'pajamas', 'palette', + 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose', + 'papaya', 'paper_plate', 'paper_towel', 'paperback_book', + 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', + 'parasol', 'parchment', 'parka', 'parking_meter', 'parrot', + 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', + 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', + 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg', + 'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box', + 'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)', + 'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet', + 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', + 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', + 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', + 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', + 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', + 'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)', + 'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)', + 'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)', + 'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', + 'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', + 'pretzel', 'printer', 'projectile_(weapon)', 'projector', 'propeller', + 'prune', 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', + 'puncher', 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', + 'rabbit', 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', + 'radish', 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', + 'rat', 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', + 'recliner', 'record_player', 'reflector', 'remote_control', + 'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map', + 'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade', + 'rolling_pin', 'root_beer', 'router_(computer_equipment)', + 'rubber_band', 'runner_(carpet)', 'plastic_bag', + 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin', + 'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)', + 'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)', + 'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse', + 'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf', + 'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver', + 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', + 'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark', + 'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl', + 'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt', + 'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass', + 'shoulder_bag', 'shovel', 'shower_head', 'shower_cap', + 'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink', + 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole', + 'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)', + 'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman', + 'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball', + 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', + 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', + 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 
'crawfish', + 'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)', + 'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish', + 'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel', + 'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', + 'stirrer', 'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', + 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry', + 'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer', + 'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', + 'sunglasses', 'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', + 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', + 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table', + 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight', + 'tambourine', 'army_tank', 'tank_(storage_vessel)', + 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', + 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', + 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', + 'telephone_pole', 'telephoto_lens', 'television_camera', + 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', + 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', + 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', + 'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', + 'toaster_oven', 'toilet', 'toilet_tissue', 'tomato', 'tongs', + 'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover', + 'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy', + 'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike', + 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray', + 'trench_coat', 'triangle_(musical_instrument)', 'tricycle', 'tripod', + 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat', 'turban', + 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)', + 'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn', + 'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest', + 'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture', + 'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick', + 'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe', + 'washbasin', 'automatic_washer', 'watch', 'water_bottle', + 'water_cooler', 'water_faucet', 'water_heater', 'water_jug', + 'water_gun', 'water_scooter', 'water_ski', 'water_tower', + 'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake', + 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream', + 'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)', + 'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket', + 'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', + 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt', + 'yoke_(animal_equipment)', 'zebra', 'zucchini'), + 'palette': + None + } + + def get_data_info(self, idx: int) -> dict: + """Get annotation by index and automatically call ``full_init`` if the + dataset has not been fully initialized. + + Args: + idx (int): The index of data. + + Returns: + dict: The idx-th annotation of the dataset. 
+ """ + if self.serialize_data: + start_addr = 0 if idx == 0 else self.data_address[idx - 1].item() + end_addr = self.data_address[idx].item() + bytes = memoryview( + self.data_bytes[start_addr:end_addr]) # type: ignore + data_info = pickle.loads(bytes) # type: ignore + else: + data_info = copy.deepcopy(self.data_list[idx]) + + # Some codebase needs `sample_idx` of data information. Here we convert + # the idx to a positive number and save it in data information. + if idx >= 0: + data_info['sample_idx'] = idx + else: + data_info['sample_idx'] = len(self) + idx + + return data_info + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + try: + import lvis + if getattr(lvis, '__version__', '0') >= '10.5.3': + warnings.warn( + 'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"', # noqa: E501 + UserWarning) + from lvis import LVIS + except ImportError: + raise ImportError( + 'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".' # noqa: E501 + ) + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.lvis = LVIS(local_path) + self.cat_ids = self.lvis.get_cat_ids() + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.lvis.cat_img_map) + img_ids = self.lvis.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.lvis.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.lvis.get_ann_ids(img_ids=[img_id]) + total_ann_ids.extend(ann_ids) + parsed_data_info = self.parse_data_info( + {'raw_img_info': raw_img_info}) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.lvis + # print(data_list) + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + img_info = raw_data_info['raw_img_info'] + + data_info = {} + + # TODO: need to change data_prefix['img'] to data_prefix['img_path'] + img_path = osp.join(self.data_prefix['img'], img_info['file_name']) + if self.data_prefix.get('seg', None): + seg_map_path = osp.join( + self.data_prefix['seg'], + img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) + else: + seg_map_path = None + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['seg_map_path'] = seg_map_path + data_info['height'] = img_info['height'] + data_info['width'] = img_info['width'] + + if self.return_classes: + data_info['text'] = self.metainfo['classes'] + data_info['custom_entities'] = True + + instances = [] + image_labels = [ + self.cat2label[x] for x in img_info['pos_category_ids'] + ] + for image_label in image_labels: + instance = {} + instance['bbox_label'] = image_label + instances.append(instance) + data_info['instances'] = instances + + return data_info + + def get_cat_ids(self, idx: int) -> List[int]: + """Get COCO category ids by index. + + Args: + idx (int): Index of data. + + Returns: + List[int]: All categories in the image of specified index. 
+ """ + data_info = self.get_data_info(idx) + image_labels = [] + for instance in data_info['instances']: + image_labels.append(instance['bbox_label']) + + return image_labels diff --git a/mmdetection/projects/Detic_new/detic/iou_loss.py b/mmdetection/projects/Detic_new/detic/iou_loss.py new file mode 100644 index 0000000..349545c --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/iou_loss.py @@ -0,0 +1,125 @@ +import torch +from torch import nn + + +# support calculate IOULoss with box_pred +class IOULoss(nn.Module): + + def __init__(self, loc_loss_type='iou'): + super(IOULoss, self).__init__() + self.loc_loss_type = loc_loss_type + + def forward(self, pred, target, weight=None, reduction='sum'): + pred_left = pred[:, 0] + pred_top = pred[:, 1] + pred_right = pred[:, 2] + pred_bottom = pred[:, 3] + + target_left = target[:, 0] + target_top = target[:, 1] + target_right = target[:, 2] + target_bottom = target[:, 3] + + target_aera = (target_left + target_right) * ( + target_top + target_bottom) + pred_aera = (pred_left + pred_right) * (pred_top + pred_bottom) + + w_intersect = torch.min(pred_left, target_left) + torch.min( + pred_right, target_right) + h_intersect = torch.min(pred_bottom, target_bottom) + torch.min( + pred_top, target_top) + + g_w_intersect = torch.max(pred_left, target_left) + torch.max( + pred_right, target_right) + g_h_intersect = torch.max(pred_bottom, target_bottom) + torch.max( + pred_top, target_top) + ac_uion = g_w_intersect * g_h_intersect + + area_intersect = w_intersect * h_intersect + area_union = target_aera + pred_aera - area_intersect + + ious = (area_intersect + 1.0) / (area_union + 1.0) + gious = ious - (ac_uion - area_union) / ac_uion + if self.loc_loss_type == 'iou': + losses = -torch.log(ious) + elif self.loc_loss_type == 'linear_iou': + losses = 1 - ious + elif self.loc_loss_type == 'giou': + losses = 1 - gious + else: + raise NotImplementedError + + if weight is not None: + losses = losses * weight + else: + losses = losses + + if reduction == 'sum': + return losses.sum() + elif reduction == 'batch': + return losses.sum(dim=[1]) + elif reduction == 'none': + return losses + else: + raise NotImplementedError + + +def giou_loss( + boxes1: torch.Tensor, + boxes2: torch.Tensor, + reduction: str = 'none', + eps: float = 1e-7, +) -> torch.Tensor: + """Generalized Intersection over Union Loss (Hamid Rezatofighi et. + + al) + https://arxiv.org/abs/1902.09630 + Gradient-friendly IoU loss with an additional penalty that is + non-zero when the boxes do not overlap and scales with the size + of their smallest enclosing box. This loss is symmetric, so the + boxes1 and boxes2 arguments are interchangeable. + Args: + boxes1, boxes2 (Tensor): box locations in XYXY format, shape + (N, 4) or (4,). + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. 
+ eps (float): small number to prevent division by zero + """ + + x1, y1, x2, y2 = boxes1.unbind(dim=-1) + x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) + + assert (x2 >= x1).all(), 'bad box: x1 larger than x2' + assert (y2 >= y1).all(), 'bad box: y1 larger than y2' + + # Intersection keypoints + xkis1 = torch.max(x1, x1g) + ykis1 = torch.max(y1, y1g) + xkis2 = torch.min(x2, x2g) + ykis2 = torch.min(y2, y2g) + + intsctk = torch.zeros_like(x1) + mask = (ykis2 > ykis1) & (xkis2 > xkis1) + intsctk[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) + unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk + iouk = intsctk / (unionk + eps) + + # smallest enclosing box + xc1 = torch.min(x1, x1g) + yc1 = torch.min(y1, y1g) + xc2 = torch.max(x2, x2g) + yc2 = torch.max(y2, y2g) + + area_c = (xc2 - xc1) * (yc2 - yc1) + miouk = iouk - ((area_c - unionk) / (area_c + eps)) + + loss = 1 - miouk + + if reduction == 'mean': + loss = loss.mean() + elif reduction == 'sum': + loss = loss.sum() + + return loss diff --git a/mmdetection/projects/Detic_new/detic/zero_shot_classifier.py b/mmdetection/projects/Detic_new/detic/zero_shot_classifier.py new file mode 100644 index 0000000..cb9946d --- /dev/null +++ b/mmdetection/projects/Detic_new/detic/zero_shot_classifier.py @@ -0,0 +1,73 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class ZeroShotClassifier(nn.Module): + + def __init__( + self, + in_features: int, + out_features: int, # num_classes + zs_weight_path: str, + zs_weight_dim: int = 512, + use_bias: float = 0.0, + norm_weight: bool = True, + norm_temperature: float = 50.0, + ): + super().__init__() + num_classes = out_features + self.norm_weight = norm_weight + self.norm_temperature = norm_temperature + + self.use_bias = use_bias < 0 + if self.use_bias: + self.cls_bias = nn.Parameter(torch.ones(1) * use_bias) + + self.linear = nn.Linear(in_features, zs_weight_dim) + + if zs_weight_path == 'rand': + zs_weight = torch.randn((zs_weight_dim, num_classes)) + nn.init.normal_(zs_weight, std=0.01) + else: + zs_weight = torch.tensor( + np.load(zs_weight_path), + dtype=torch.float32).permute(1, 0).contiguous() # D x C + zs_weight = torch.cat( + [zs_weight, zs_weight.new_zeros( + (zs_weight_dim, 1))], dim=1) # D x (C + 1) + + if self.norm_weight: + zs_weight = F.normalize(zs_weight, p=2, dim=0) + + if zs_weight_path == 'rand': + self.zs_weight = nn.Parameter(zs_weight) + else: + self.register_buffer('zs_weight', zs_weight) + + assert self.zs_weight.shape[1] == num_classes + 1, self.zs_weight.shape + + def forward(self, x, classifier=None): + ''' + Inputs: + x: B x D' + classifier_info: (C', C' x D) + ''' + x = self.linear(x) + if classifier is not None: + zs_weight = classifier.permute(1, 0).contiguous() # D x C' + zs_weight = F.normalize(zs_weight, p=2, dim=0) \ + if self.norm_weight else zs_weight + else: + zs_weight = self.zs_weight + if self.norm_weight: + x = self.norm_temperature * F.normalize(x, p=2, dim=1) + x = torch.mm(x, zs_weight) + if self.use_bias: + x = x + self.cls_bias + return x diff --git a/mmdetection/projects/DiffusionDet/README.md b/mmdetection/projects/DiffusionDet/README.md new file mode 100644 index 0000000..5542d9a --- /dev/null +++ b/mmdetection/projects/DiffusionDet/README.md @@ -0,0 +1,172 @@ +## Description + +This is an implementation of 
[DiffusionDet](https://github.com/ShoufaChen/DiffusionDet) based on [MMDetection](https://github.com/open-mmlab/mmdetection/tree/main), [MMCV](https://github.com/open-mmlab/mmcv), and [MMEngine](https://github.com/open-mmlab/mmengine). + +
    + +## Usage + + + +### Comparison of results + +1. Download the [DiffusionDet released model](https://github.com/ShoufaChen/DiffusionDet#models). + +2. Convert model from DiffusionDet version to MMDetection version. We give a [sample script](model_converters/diffusiondet_resnet_to_mmdet.py) + to convert `DiffusionDet-resnet50` model. Users can download the corresponding models from [here](https://github.com/ShoufaChen/DiffusionDet/releases/download/v0.1/diffdet_coco_res50.pth). + + ```shell + python projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py ${DiffusionDet ckpt path} ${MMDetectron ckpt path} + ``` + +3. Testing the model in MMDetection. + + ```shell + python tools/test.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py ${CHECKPOINT_PATH} + ``` + +**Note:** During inference time, DiffusionDet will randomly generate noisy boxes, +which may affect the AP results. If users want to get the same result every inference time, setting seed is a good way. +We give a table to compare the inference results on `ResNet50-500-proposals` between DiffusionDet and MMDetection. + +| Config | Step | AP | +| :---------------------------------------------------------------------------------------------------------------------: | :--: | :-------: | +| [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet/blob/main/configs/diffdet.coco.res50.yaml) (released results) | 1 | 45.5 | +| [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet/blob/main/configs/diffdet.coco.res50.yaml) (seed=0) | 1 | 45.66 | +| [MMDetection](configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) (seed=0) | 1 | 45.7 | +| [MMDetection](configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) (random seed) | 1 | 45.6~45.8 | +| [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet/blob/main/configs/diffdet.coco.res50.yaml) (released results) | 4 | 46.1 | +| [DiffusionDet](https://github.com/ShoufaChen/DiffusionDet/blob/main/configs/diffdet.coco.res50.yaml) (seed=0) | 4 | 46.38 | +| [MMDetection](configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) (seed=0) | 4 | 46.4 | +| [MMDetection](configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) (random seed) | 4 | 46.2~46.4 | + +- `seed=0` means hard set seed before generating random boxes. + ```python + # hard set seed=0 before generating random boxes + seed = 0 + random.seed(seed) + torch.manual_seed(seed) + # torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + ... + noise_bboxes_raw = torch.randn( + (self.num_proposals, 4), + device=device) + ... + ``` +- `random seed` means do not hard set seed before generating random boxes. 
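- Alternatively, instead of hard-setting the seed in code, you can pin MMEngine's global seed from the config; this relies on the test runner honoring the `randomness` field, which is an assumption here and not part of the released project.
  ```python
  # hypothetical config override for a reproducible run, e.g. appended to the
  # DiffusionDet config or passed as --cfg-options randomness.seed=0
  randomness = dict(seed=0, deterministic=False)
  ```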
+ +### Training commands + +In MMDetection's root directory, run the following command to train the model: + +```bash +python tools/train.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py +``` + +For multi-gpu training, run: + +```bash +python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${NUM_GPUS} --master_port=29506 --master_addr="127.0.0.1" tools/train.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py +``` + +### Testing commands + +In MMDetection's root directory, run the following command to test the model: + +```bash +# for 1 step inference +# test command +python tools/test.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py ${CHECKPOINT_PATH} + +# for 4 steps inference + +# test command +python tools/test.py projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py ${CHECKPOINT_PATH} --cfg-options model.bbox_head.sampling_timesteps=4 +``` + +**Note:** There is no difference between 1 step or 4 steps (or other multi-step) during training. Users can set different steps during inference through `--cfg-options model.bbox_head.sampling_timesteps=${STEPS}`, but larger `sampling_timesteps` will affect the inference time. + +## Results + +Here we provide the baseline version of DiffusionDet with ResNet50 backbone. + +To find more variants, please visit the [official model zoo](https://github.com/ShoufaChen/DiffusionDet#models). + +| Backbone | Style | Lr schd | AP (Step=1) | AP (Step=4) | Config | Download | +| :------: | :-----: | :-----: | :---------: | :---------: | :----------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| R-50 | PyTorch | 450k | 44.5 | 46.2 | [config](./configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/diffusiondet/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco_20230215_090925-7d6ed504.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/diffusiondet/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco_20230215_090925.log.json) | + +## License + +DiffusionDet is under the [CC-BY-NC 4.0 license](https://github.com/ShoufaChen/DiffusionDet/blob/main/LICENSE). Users should be careful about adopting these features in any commercial matters. + +## Citation + +If you find DiffusionDet is useful in your research or applications, please consider giving a star 🌟 to the [official repository](https://github.com/ShoufaChen/DiffusionDet) and citing DiffusionDet by the following BibTeX entry. 
+ +```BibTeX +@article{chen2022diffusiondet, + title={DiffusionDet: Diffusion Model for Object Detection}, + author={Chen, Shoufa and Sun, Peize and Song, Yibing and Luo, Ping}, + journal={arXiv preprint arXiv:2211.09788}, + year={2022} +} +``` + +## Checklist + + + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + + + - [x] Basic docstrings & proper citation + + + + - [x] Test-time correctness + + + + - [x] A full README + + + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + + + - [ ] Unit tests + + + + - [ ] Code polishing + + + + - [ ] Metafile.yml + + + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/mmdetection/projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py b/mmdetection/projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py new file mode 100644 index 0000000..187cdc3 --- /dev/null +++ b/mmdetection/projects/DiffusionDet/configs/diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py @@ -0,0 +1,185 @@ +_base_ = [ + 'mmdet::_base_/datasets/coco_detection.py', + 'mmdet::_base_/schedules/schedule_1x.py', + 'mmdet::_base_/default_runtime.py' +] + +custom_imports = dict( + imports=['projects.DiffusionDet.diffusiondet'], allow_failed_imports=False) + +# model settings +model = dict( + type='DiffusionDet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=4), + bbox_head=dict( + type='DynamicDiffusionDetHead', + num_classes=80, + feat_channels=256, + num_proposals=500, + num_heads=6, + deep_supervision=True, + prior_prob=0.01, + snr_scale=2.0, + sampling_timesteps=1, + ddim_sampling_eta=1.0, + single_head=dict( + type='SingleDiffusionDetHead', + num_cls_convs=1, + num_reg_convs=3, + dim_feedforward=2048, + num_heads=8, + dropout=0.0, + act_cfg=dict(type='ReLU', inplace=True), + dynamic_conv=dict(dynamic_dim=64, dynamic_num=2)), + roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + # criterion + criterion=dict( + type='DiffusionDetCriterion', + num_classes=80, + assigner=dict( + type='DiffusionDetMatcher', + match_costs=[ + dict( + type='FocalLossCost', + alpha=0.25, + gamma=2.0, + weight=2.0, + eps=1e-8), + dict(type='BBoxL1Cost', weight=5.0, box_format='xyxy'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ], + center_radius=2.5, + candidate_topk=5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + alpha=0.25, + gamma=2.0, + reduction='sum', + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=5.0), + loss_giou=dict(type='GIoULoss', reduction='sum', + 
loss_weight=2.0))), + test_cfg=dict( + use_nms=True, + score_thr=0.5, + min_bbox_size=0, + nms=dict(type='nms', iou_threshold=0.5), + )) + +backend = 'pillow' +train_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args, + imdecode_backend=backend), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[[ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True, + backend=backend), + ], + [ + dict( + type='RandomChoiceResize', + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True, + backend=backend), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True, + backend=backend) + ]]), + dict(type='PackDetInputs') +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args, + imdecode_backend=backend), + dict(type='Resize', scale=(1333, 800), keep_ratio=True, backend=backend), + # If you don't have a gt annotation, delete the pipeline + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + sampler=dict(type='InfiniteSampler'), + dataset=dict( + filter_cfg=dict(filter_empty_gt=False, min_size=1e-5), + pipeline=train_pipeline)) + +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + _delete_=True, type='AdamW', lr=0.000025, weight_decay=0.0001), + clip_grad=dict(max_norm=1.0, norm_type=2)) +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=450000, + val_interval=75000) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=1000), + dict( + type='MultiStepLR', + begin=0, + end=450000, + by_epoch=False, + milestones=[350000, 420000], + gamma=0.1) +] + +default_hooks = dict( + checkpoint=dict(by_epoch=False, interval=75000, max_keep_ckpts=3)) +log_processor = dict(by_epoch=False) diff --git a/mmdetection/projects/DiffusionDet/diffusiondet/__init__.py b/mmdetection/projects/DiffusionDet/diffusiondet/__init__.py new file mode 100644 index 0000000..35d6032 --- /dev/null +++ b/mmdetection/projects/DiffusionDet/diffusiondet/__init__.py @@ -0,0 +1,10 @@ +from .diffusiondet import DiffusionDet +from .head import (DynamicConv, DynamicDiffusionDetHead, + SingleDiffusionDetHead, SinusoidalPositionEmbeddings) +from .loss import DiffusionDetCriterion, DiffusionDetMatcher + +__all__ = [ + 'DiffusionDet', 'DynamicDiffusionDetHead', 'SingleDiffusionDetHead', + 'SinusoidalPositionEmbeddings', 'DynamicConv', 'DiffusionDetCriterion', + 'DiffusionDetMatcher' +] diff --git a/mmdetection/projects/DiffusionDet/diffusiondet/diffusiondet.py b/mmdetection/projects/DiffusionDet/diffusiondet/diffusiondet.py new file mode 100644 index 0000000..5a46ddf --- /dev/null +++ b/mmdetection/projects/DiffusionDet/diffusiondet/diffusiondet.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
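As a quick sanity check of the DiffusionDet config added above, it can be loaded and inspected with MMEngine. This is a minimal sketch and assumes a development install of mmdetection so that the `mmdet::` base configs resolve:

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'projects/DiffusionDet/configs/'
    'diffusiondet_r50_fpn_500-proposals_1-step_crop-ms-480-800-450k_coco.py')

print(cfg.model.bbox_head.num_proposals)       # 500
print(cfg.model.bbox_head.sampling_timesteps)  # 1
print(cfg.train_cfg.max_iters)                 # 450000
```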
+from mmdet.models import SingleStageDetector +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class DiffusionDet(SingleStageDetector): + """Implementation of `DiffusionDet <>`_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/projects/DiffusionDet/diffusiondet/head.py b/mmdetection/projects/DiffusionDet/diffusiondet/head.py new file mode 100644 index 0000000..794c9c9 --- /dev/null +++ b/mmdetection/projects/DiffusionDet/diffusiondet/head.py @@ -0,0 +1,1034 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Modified from https://github.com/ShoufaChen/DiffusionDet/blob/main/diffusiondet/detector.py # noqa +# Modified from https://github.com/ShoufaChen/DiffusionDet/blob/main/diffusiondet/head.py # noqa + +# This work is licensed under the CC-BY-NC 4.0 License. +# Users should be careful about adopting these features in any commercial matters. # noqa +# For more details, please refer to https://github.com/ShoufaChen/DiffusionDet/blob/main/LICENSE # noqa + +import copy +import math +import random +import warnings +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_activation_layer +from mmcv.ops import batched_nms +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.structures.bbox import (bbox2roi, bbox_cxcywh_to_xyxy, + bbox_xyxy_to_cxcywh, get_box_wh, + scale_boxes) +from mmdet.utils import InstanceList + +_DEFAULT_SCALE_CLAMP = math.log(100000.0 / 16) + + +def cosine_beta_schedule(timesteps, s=0.008): + """Cosine schedule as proposed in + https://openreview.net/forum?id=-NEXDKk8gZ.""" + steps = timesteps + 1 + x = torch.linspace(0, timesteps, steps, dtype=torch.float64) + alphas_cumprod = torch.cos( + ((x / timesteps) + s) / (1 + s) * math.pi * 0.5)**2 + alphas_cumprod = alphas_cumprod / alphas_cumprod[0] + betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1]) + return torch.clip(betas, 0, 0.999) + + +def extract(a, t, x_shape): + """extract the appropriate t index for a batch of indices.""" + batch_size = t.shape[0] + out = a.gather(-1, t) + return out.reshape(batch_size, *((1, ) * (len(x_shape) - 1))) + + +class SinusoidalPositionEmbeddings(nn.Module): + + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, time): + device = time.device + half_dim = self.dim // 2 + embeddings = math.log(10000) / (half_dim - 1) + embeddings = torch.exp( + torch.arange(half_dim, device=device) * -embeddings) + embeddings = time[:, None] * embeddings[None, :] + embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1) + return embeddings + + +@MODELS.register_module() +class DynamicDiffusionDetHead(nn.Module): + + def __init__(self, + num_classes=80, + feat_channels=256, + num_proposals=500, + num_heads=6, + prior_prob=0.01, + snr_scale=2.0, + timesteps=1000, + sampling_timesteps=1, + self_condition=False, + 
box_renewal=True, + use_ensemble=True, + deep_supervision=True, + ddim_sampling_eta=1.0, + criterion=dict( + type='DiffusionDetCriterion', + num_classes=80, + assigner=dict( + type='DiffusionDetMatcher', + match_costs=[ + dict( + type='FocalLossCost', + alpha=2.0, + gamma=0.25, + weight=2.0), + dict( + type='BBoxL1Cost', + weight=5.0, + box_format='xyxy'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ], + center_radius=2.5, + candidate_topk=5), + ), + single_head=dict( + type='DiffusionDetHead', + num_cls_convs=1, + num_reg_convs=3, + dim_feedforward=2048, + num_heads=8, + dropout=0.0, + act_cfg=dict(type='ReLU'), + dynamic_conv=dict(dynamic_dim=64, dynamic_num=2)), + roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict( + type='RoIAlign', output_size=7, sampling_ratio=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + test_cfg=None, + **kwargs) -> None: + super().__init__() + self.roi_extractor = MODELS.build(roi_extractor) + + self.num_classes = num_classes + self.num_classes = num_classes + self.feat_channels = feat_channels + self.num_proposals = num_proposals + self.num_heads = num_heads + # Build Diffusion + assert isinstance(timesteps, int), 'The type of `timesteps` should ' \ + f'be int but got {type(timesteps)}' + assert sampling_timesteps <= timesteps + self.timesteps = timesteps + self.sampling_timesteps = sampling_timesteps + self.snr_scale = snr_scale + + self.ddim_sampling = self.sampling_timesteps < self.timesteps + self.ddim_sampling_eta = ddim_sampling_eta + self.self_condition = self_condition + self.box_renewal = box_renewal + self.use_ensemble = use_ensemble + + self._build_diffusion() + + # Build assigner + assert criterion.get('assigner', None) is not None + assigner = TASK_UTILS.build(criterion.get('assigner')) + # Init parameters. + self.use_focal_loss = assigner.use_focal_loss + self.use_fed_loss = assigner.use_fed_loss + + # build criterion + criterion.update(deep_supervision=deep_supervision) + self.criterion = TASK_UTILS.build(criterion) + + # Build Dynamic Head. 
+ single_head_ = single_head.copy() + single_head_num_classes = single_head_.get('num_classes', None) + if single_head_num_classes is None: + single_head_.update(num_classes=num_classes) + else: + if single_head_num_classes != num_classes: + warnings.warn( + 'The `num_classes` of `DynamicDiffusionDetHead` and ' + '`SingleDiffusionDetHead` should be same, changing ' + f'`single_head.num_classes` to {num_classes}') + single_head_.update(num_classes=num_classes) + + single_head_feat_channels = single_head_.get('feat_channels', None) + if single_head_feat_channels is None: + single_head_.update(feat_channels=feat_channels) + else: + if single_head_feat_channels != feat_channels: + warnings.warn( + 'The `feat_channels` of `DynamicDiffusionDetHead` and ' + '`SingleDiffusionDetHead` should be same, changing ' + f'`single_head.feat_channels` to {feat_channels}') + single_head_.update(feat_channels=feat_channels) + + default_pooler_resolution = roi_extractor['roi_layer'].get( + 'output_size') + assert default_pooler_resolution is not None + single_head_pooler_resolution = single_head_.get('pooler_resolution') + if single_head_pooler_resolution is None: + single_head_.update(pooler_resolution=default_pooler_resolution) + else: + if single_head_pooler_resolution != default_pooler_resolution: + warnings.warn( + 'The `pooler_resolution` of `DynamicDiffusionDetHead` ' + 'and `SingleDiffusionDetHead` should be same, changing ' + f'`single_head.pooler_resolution` to {num_classes}') + single_head_.update( + pooler_resolution=default_pooler_resolution) + + single_head_.update( + use_focal_loss=self.use_focal_loss, use_fed_loss=self.use_fed_loss) + single_head_module = MODELS.build(single_head_) + + self.num_heads = num_heads + self.head_series = nn.ModuleList( + [copy.deepcopy(single_head_module) for _ in range(num_heads)]) + + self.deep_supervision = deep_supervision + + # Gaussian random feature embedding layer for time + time_dim = feat_channels * 4 + self.time_mlp = nn.Sequential( + SinusoidalPositionEmbeddings(feat_channels), + nn.Linear(feat_channels, time_dim), nn.GELU(), + nn.Linear(time_dim, time_dim)) + + self.prior_prob = prior_prob + self.test_cfg = test_cfg + self.use_nms = self.test_cfg.get('use_nms', True) + self._init_weights() + + def _init_weights(self): + # init all parameters. + bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + # initialize the bias for focal loss and fed loss. + if self.use_focal_loss or self.use_fed_loss: + if p.shape[-1] == self.num_classes or \ + p.shape[-1] == self.num_classes + 1: + nn.init.constant_(p, bias_value) + + def _build_diffusion(self): + betas = cosine_beta_schedule(self.timesteps) + alphas = 1. - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value=1.) + + self.register_buffer('betas', betas) + self.register_buffer('alphas_cumprod', alphas_cumprod) + self.register_buffer('alphas_cumprod_prev', alphas_cumprod_prev) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod)) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + torch.sqrt(1. - alphas_cumprod)) + self.register_buffer('log_one_minus_alphas_cumprod', + torch.log(1. - alphas_cumprod)) + self.register_buffer('sqrt_recip_alphas_cumprod', + torch.sqrt(1. / alphas_cumprod)) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + torch.sqrt(1. 
/ alphas_cumprod - 1)) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + # equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t) + posterior_variance = betas * (1. - alphas_cumprod_prev) / ( + 1. - alphas_cumprod) + self.register_buffer('posterior_variance', posterior_variance) + + # log calculation clipped because the posterior variance is 0 at + # the beginning of the diffusion chain + self.register_buffer('posterior_log_variance_clipped', + torch.log(posterior_variance.clamp(min=1e-20))) + self.register_buffer( + 'posterior_mean_coef1', + betas * torch.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)) + self.register_buffer('posterior_mean_coef2', + (1. - alphas_cumprod_prev) * torch.sqrt(alphas) / + (1. - alphas_cumprod)) + + def forward(self, features, init_bboxes, init_t, init_features=None): + time = self.time_mlp(init_t, ) + + inter_class_logits = [] + inter_pred_bboxes = [] + + bs = len(features[0]) + bboxes = init_bboxes + + if init_features is not None: + init_features = init_features[None].repeat(1, bs, 1) + proposal_features = init_features.clone() + else: + proposal_features = None + + for head_idx, single_head in enumerate(self.head_series): + class_logits, pred_bboxes, proposal_features = single_head( + features, bboxes, proposal_features, self.roi_extractor, time) + if self.deep_supervision: + inter_class_logits.append(class_logits) + inter_pred_bboxes.append(pred_bboxes) + bboxes = pred_bboxes.detach() + + if self.deep_supervision: + return torch.stack(inter_class_logits), torch.stack( + inter_pred_bboxes) + else: + return class_logits[None, ...], pred_bboxes[None, ...] + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. 
+ """ + prepare_outputs = self.prepare_training_targets(batch_data_samples) + (batch_gt_instances, batch_pred_instances, batch_gt_instances_ignore, + batch_img_metas) = prepare_outputs + + batch_diff_bboxes = torch.stack([ + pred_instances.diff_bboxes_abs + for pred_instances in batch_pred_instances + ]) + batch_time = torch.stack( + [pred_instances.time for pred_instances in batch_pred_instances]) + + pred_logits, pred_bboxes = self(x, batch_diff_bboxes, batch_time) + + output = { + 'pred_logits': pred_logits[-1], + 'pred_boxes': pred_bboxes[-1] + } + if self.deep_supervision: + output['aux_outputs'] = [{ + 'pred_logits': a, + 'pred_boxes': b + } for a, b in zip(pred_logits[:-1], pred_bboxes[:-1])] + + losses = self.criterion(output, batch_gt_instances, batch_img_metas) + return losses + + def prepare_training_targets(self, batch_data_samples): + # hard-setting seed to keep results same (if necessary) + # random.seed(0) + # torch.manual_seed(0) + # torch.cuda.manual_seed_all(0) + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False + + batch_gt_instances = [] + batch_pred_instances = [] + batch_gt_instances_ignore = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + img_meta = data_sample.metainfo + gt_instances = data_sample.gt_instances + + gt_bboxes = gt_instances.bboxes + h, w = img_meta['img_shape'] + image_size = gt_bboxes.new_tensor([w, h, w, h]) + + norm_gt_bboxes = gt_bboxes / image_size + norm_gt_bboxes_cxcywh = bbox_xyxy_to_cxcywh(norm_gt_bboxes) + pred_instances = self.prepare_diffusion(norm_gt_bboxes_cxcywh, + image_size) + + gt_instances.set_metainfo(dict(image_size=image_size)) + gt_instances.norm_bboxes_cxcywh = norm_gt_bboxes_cxcywh + + batch_gt_instances.append(gt_instances) + batch_pred_instances.append(pred_instances) + batch_img_metas.append(data_sample.metainfo) + if 'ignored_instances' in data_sample: + batch_gt_instances_ignore.append(data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + return (batch_gt_instances, batch_pred_instances, + batch_gt_instances_ignore, batch_img_metas) + + def prepare_diffusion(self, gt_boxes, image_size): + device = gt_boxes.device + time = torch.randint( + 0, self.timesteps, (1, ), dtype=torch.long, device=device) + noise = torch.randn(self.num_proposals, 4, device=device) + + num_gt = gt_boxes.shape[0] + if num_gt < self.num_proposals: + # 3 * sigma = 1/2 --> sigma: 1/6 + box_placeholder = torch.randn( + self.num_proposals - num_gt, 4, device=device) / 6. + 0.5 + box_placeholder[:, 2:] = torch.clip( + box_placeholder[:, 2:], min=1e-4) + x_start = torch.cat((gt_boxes, box_placeholder), dim=0) + else: + select_mask = [True] * self.num_proposals + \ + [False] * (num_gt - self.num_proposals) + random.shuffle(select_mask) + x_start = gt_boxes[select_mask] + + x_start = (x_start * 2. - 1.) * self.snr_scale + + # noise sample + x = self.q_sample(x_start=x_start, time=time, noise=noise) + + x = torch.clamp(x, min=-1 * self.snr_scale, max=self.snr_scale) + x = ((x / self.snr_scale) + 1) / 2. 
+ + diff_bboxes = bbox_cxcywh_to_xyxy(x) + # convert to abs bboxes + diff_bboxes_abs = diff_bboxes * image_size + + metainfo = dict(time=time.squeeze(-1)) + pred_instances = InstanceData(metainfo=metainfo) + pred_instances.diff_bboxes = diff_bboxes + pred_instances.diff_bboxes_abs = diff_bboxes_abs + pred_instances.noise = noise + return pred_instances + + # forward diffusion + def q_sample(self, x_start, time, noise=None): + if noise is None: + noise = torch.randn_like(x_start) + + x_start_shape = x_start.shape + + sqrt_alphas_cumprod_t = extract(self.sqrt_alphas_cumprod, time, + x_start_shape) + sqrt_one_minus_alphas_cumprod_t = extract( + self.sqrt_one_minus_alphas_cumprod, time, x_start_shape) + + return sqrt_alphas_cumprod_t * x_start + \ + sqrt_one_minus_alphas_cumprod_t * noise + + def predict(self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + # hard-setting seed to keep results same (if necessary) + # seed = 0 + # random.seed(seed) + # torch.manual_seed(seed) + # torch.cuda.manual_seed_all(seed) + + device = x[-1].device + + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + (time_pairs, batch_noise_bboxes, batch_noise_bboxes_raw, + batch_image_size) = self.prepare_testing_targets( + batch_img_metas, device) + + predictions = self.predict_by_feat( + x, + time_pairs=time_pairs, + batch_noise_bboxes=batch_noise_bboxes, + batch_noise_bboxes_raw=batch_noise_bboxes_raw, + batch_image_size=batch_image_size, + device=device, + batch_img_metas=batch_img_metas) + return predictions + + def predict_by_feat(self, + x, + time_pairs, + batch_noise_bboxes, + batch_noise_bboxes_raw, + batch_image_size, + device, + batch_img_metas=None, + cfg=None, + rescale=True): + + batch_size = len(batch_img_metas) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + ensemble_score, ensemble_label, ensemble_coord = [], [], [] + for time, time_next in time_pairs: + batch_time = torch.full((batch_size, ), + time, + device=device, + dtype=torch.long) + # self_condition = x_start if self.self_condition else None + pred_logits, pred_bboxes = self(x, batch_noise_bboxes, batch_time) + + x_start = pred_bboxes[-1] + + x_start = x_start / batch_image_size[:, None, :] + x_start = bbox_xyxy_to_cxcywh(x_start) + x_start = (x_start * 2 - 1.) 
* self.snr_scale + x_start = torch.clamp( + x_start, min=-1 * self.snr_scale, max=self.snr_scale) + pred_noise = self.predict_noise_from_start(batch_noise_bboxes_raw, + batch_time, x_start) + pred_noise_list, x_start_list = [], [] + noise_bboxes_list, num_remain_list = [], [] + if self.box_renewal: # filter + score_thr = cfg.get('score_thr', 0) + for img_id in range(batch_size): + score_per_image = pred_logits[-1][img_id] + + score_per_image = torch.sigmoid(score_per_image) + value, _ = torch.max(score_per_image, -1, keepdim=False) + keep_idx = value > score_thr + + num_remain_list.append(torch.sum(keep_idx)) + pred_noise_list.append(pred_noise[img_id, keep_idx, :]) + x_start_list.append(x_start[img_id, keep_idx, :]) + noise_bboxes_list.append(batch_noise_bboxes[img_id, + keep_idx, :]) + if time_next < 0: + # Not same as original DiffusionDet + if self.use_ensemble and self.sampling_timesteps > 1: + box_pred_per_image, scores_per_image, labels_per_image = \ + self.inference( + box_cls=pred_logits[-1], + box_pred=pred_bboxes[-1], + cfg=cfg, + device=device) + ensemble_score.append(scores_per_image) + ensemble_label.append(labels_per_image) + ensemble_coord.append(box_pred_per_image) + continue + + alpha = self.alphas_cumprod[time] + alpha_next = self.alphas_cumprod[time_next] + + sigma = self.ddim_sampling_eta * ((1 - alpha / alpha_next) * + (1 - alpha_next) / + (1 - alpha)).sqrt() + c = (1 - alpha_next - sigma**2).sqrt() + + batch_noise_bboxes_list = [] + batch_noise_bboxes_raw_list = [] + for idx in range(batch_size): + pred_noise = pred_noise_list[idx] + x_start = x_start_list[idx] + noise_bboxes = noise_bboxes_list[idx] + num_remain = num_remain_list[idx] + noise = torch.randn_like(noise_bboxes) + + noise_bboxes = x_start * alpha_next.sqrt() + \ + c * pred_noise + sigma * noise + + if self.box_renewal: # filter + # replenish with randn boxes + if num_remain < self.num_proposals: + noise_bboxes = torch.cat( + (noise_bboxes, + torch.randn( + self.num_proposals - num_remain, + 4, + device=device)), + dim=0) + else: + select_mask = [True] * self.num_proposals + \ + [False] * (num_remain - + self.num_proposals) + random.shuffle(select_mask) + noise_bboxes = noise_bboxes[select_mask] + + # raw noise boxes + batch_noise_bboxes_raw_list.append(noise_bboxes) + # resize to xyxy + noise_bboxes = torch.clamp( + noise_bboxes, + min=-1 * self.snr_scale, + max=self.snr_scale) + noise_bboxes = ((noise_bboxes / self.snr_scale) + 1) / 2 + noise_bboxes = bbox_cxcywh_to_xyxy(noise_bboxes) + noise_bboxes = noise_bboxes * batch_image_size[idx] + + batch_noise_bboxes_list.append(noise_bboxes) + batch_noise_bboxes = torch.stack(batch_noise_bboxes_list) + batch_noise_bboxes_raw = torch.stack(batch_noise_bboxes_raw_list) + if self.use_ensemble and self.sampling_timesteps > 1: + box_pred_per_image, scores_per_image, labels_per_image = \ + self.inference( + box_cls=pred_logits[-1], + box_pred=pred_bboxes[-1], + cfg=cfg, + device=device) + ensemble_score.append(scores_per_image) + ensemble_label.append(labels_per_image) + ensemble_coord.append(box_pred_per_image) + if self.use_ensemble and self.sampling_timesteps > 1: + steps = len(ensemble_score) + results_list = [] + for idx in range(batch_size): + ensemble_score_per_img = [ + ensemble_score[i][idx] for i in range(steps) + ] + ensemble_label_per_img = [ + ensemble_label[i][idx] for i in range(steps) + ] + ensemble_coord_per_img = [ + ensemble_coord[i][idx] for i in range(steps) + ] + + scores_per_image = torch.cat(ensemble_score_per_img, dim=0) + labels_per_image 
= torch.cat(ensemble_label_per_img, dim=0) + box_pred_per_image = torch.cat(ensemble_coord_per_img, dim=0) + + if self.use_nms: + det_bboxes, keep_idxs = batched_nms( + box_pred_per_image, scores_per_image, labels_per_image, + cfg.nms) + box_pred_per_image = box_pred_per_image[keep_idxs] + labels_per_image = labels_per_image[keep_idxs] + scores_per_image = det_bboxes[:, -1] + results = InstanceData() + results.bboxes = box_pred_per_image + results.scores = scores_per_image + results.labels = labels_per_image + results_list.append(results) + else: + box_cls = pred_logits[-1] + box_pred = pred_bboxes[-1] + results_list = self.inference(box_cls, box_pred, cfg, device) + if rescale: + results_list = self.do_results_post_process( + results_list, cfg, batch_img_metas=batch_img_metas) + return results_list + + @staticmethod + def do_results_post_process(results_list, cfg, batch_img_metas=None): + processed_results = [] + for results, img_meta in zip(results_list, batch_img_metas): + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + # clip w, h + h, w = img_meta['ori_shape'] + results.bboxes[:, 0::2] = results.bboxes[:, 0::2].clamp( + min=0, max=w) + results.bboxes[:, 1::2] = results.bboxes[:, 1::2].clamp( + min=0, max=h) + + # filter small size bboxes + if cfg.get('min_bbox_size', 0) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + processed_results.append(results) + + return processed_results + + def prepare_testing_targets(self, batch_img_metas, device): + # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == timesteps + times = torch.linspace( + -1, self.timesteps - 1, steps=self.sampling_timesteps + 1) + times = list(reversed(times.int().tolist())) + # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)] + time_pairs = list(zip(times[:-1], times[1:])) + + noise_bboxes_list = [] + noise_bboxes_raw_list = [] + image_size_list = [] + for img_meta in batch_img_metas: + h, w = img_meta['img_shape'] + image_size = torch.tensor([w, h, w, h], + dtype=torch.float32, + device=device) + noise_bboxes_raw = torch.randn((self.num_proposals, 4), + device=device) + noise_bboxes = torch.clamp( + noise_bboxes_raw, min=-1 * self.snr_scale, max=self.snr_scale) + noise_bboxes = ((noise_bboxes / self.snr_scale) + 1) / 2 + noise_bboxes = bbox_cxcywh_to_xyxy(noise_bboxes) + noise_bboxes = noise_bboxes * image_size + + noise_bboxes_raw_list.append(noise_bboxes_raw) + noise_bboxes_list.append(noise_bboxes) + image_size_list.append(image_size[None]) + batch_noise_bboxes = torch.stack(noise_bboxes_list) + batch_image_size = torch.cat(image_size_list) + batch_noise_bboxes_raw = torch.stack(noise_bboxes_raw_list) + return (time_pairs, batch_noise_bboxes, batch_noise_bboxes_raw, + batch_image_size) + + def predict_noise_from_start(self, x_t, t, x0): + results = (extract( + self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / \ + extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + return results + + def inference(self, box_cls, box_pred, cfg, device): + """ + Args: + box_cls (Tensor): tensor of shape (batch_size, num_proposals, K). + The tensor predicts the classification probability for + each proposal. + box_pred (Tensor): tensors of shape (batch_size, num_proposals, 4). 
+ The tensor predicts 4-vector (x,y,w,h) box + regression values for every proposal + + Returns: + results (List[Instances]): a list of #images elements. + """ + results = [] + + if self.use_focal_loss or self.use_fed_loss: + scores = torch.sigmoid(box_cls) + labels = torch.arange( + self.num_classes, + device=device).unsqueeze(0).repeat(self.num_proposals, + 1).flatten(0, 1) + box_pred_list = [] + scores_list = [] + labels_list = [] + for i, (scores_per_image, + box_pred_per_image) in enumerate(zip(scores, box_pred)): + + scores_per_image, topk_indices = scores_per_image.flatten( + 0, 1).topk( + self.num_proposals, sorted=False) + labels_per_image = labels[topk_indices] + box_pred_per_image = box_pred_per_image.view(-1, 1, 4).repeat( + 1, self.num_classes, 1).view(-1, 4) + box_pred_per_image = box_pred_per_image[topk_indices] + + if self.use_ensemble and self.sampling_timesteps > 1: + box_pred_list.append(box_pred_per_image) + scores_list.append(scores_per_image) + labels_list.append(labels_per_image) + continue + + if self.use_nms: + det_bboxes, keep_idxs = batched_nms( + box_pred_per_image, scores_per_image, labels_per_image, + cfg.nms) + box_pred_per_image = box_pred_per_image[keep_idxs] + labels_per_image = labels_per_image[keep_idxs] + # some nms would reweight the score, such as softnms + scores_per_image = det_bboxes[:, -1] + result = InstanceData() + result.bboxes = box_pred_per_image + result.scores = scores_per_image + result.labels = labels_per_image + results.append(result) + + else: + # For each box we assign the best class or the second + # best if the best on is `no_object`. + scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) + + for i, (scores_per_image, labels_per_image, + box_pred_per_image) in enumerate( + zip(scores, labels, box_pred)): + if self.use_ensemble and self.sampling_timesteps > 1: + return box_pred_per_image, scores_per_image, \ + labels_per_image + + if self.use_nms: + det_bboxes, keep_idxs = batched_nms( + box_pred_per_image, scores_per_image, labels_per_image, + cfg.nms) + box_pred_per_image = box_pred_per_image[keep_idxs] + labels_per_image = labels_per_image[keep_idxs] + # some nms would reweight the score, such as softnms + scores_per_image = det_bboxes[:, -1] + + result = InstanceData() + result.bboxes = box_pred_per_image + result.scores = scores_per_image + result.labels = labels_per_image + results.append(result) + if self.use_ensemble and self.sampling_timesteps > 1: + return box_pred_list, scores_list, labels_list + else: + return results + + +@MODELS.register_module() +class SingleDiffusionDetHead(nn.Module): + + def __init__( + self, + num_classes=80, + feat_channels=256, + dim_feedforward=2048, + num_cls_convs=1, + num_reg_convs=3, + num_heads=8, + dropout=0.0, + pooler_resolution=7, + scale_clamp=_DEFAULT_SCALE_CLAMP, + bbox_weights=(2.0, 2.0, 1.0, 1.0), + use_focal_loss=True, + use_fed_loss=False, + act_cfg=dict(type='ReLU', inplace=True), + dynamic_conv=dict(dynamic_dim=64, dynamic_num=2) + ) -> None: + super().__init__() + self.feat_channels = feat_channels + + # Dynamic + self.self_attn = nn.MultiheadAttention( + feat_channels, num_heads, dropout=dropout) + self.inst_interact = DynamicConv( + feat_channels=feat_channels, + pooler_resolution=pooler_resolution, + dynamic_dim=dynamic_conv['dynamic_dim'], + dynamic_num=dynamic_conv['dynamic_num']) + + self.linear1 = nn.Linear(feat_channels, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, feat_channels) + + self.norm1 = 
nn.LayerNorm(feat_channels) + self.norm2 = nn.LayerNorm(feat_channels) + self.norm3 = nn.LayerNorm(feat_channels) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = build_activation_layer(act_cfg) + + # block time mlp + self.block_time_mlp = nn.Sequential( + nn.SiLU(), nn.Linear(feat_channels * 4, feat_channels * 2)) + + # cls. + cls_module = list() + for _ in range(num_cls_convs): + cls_module.append(nn.Linear(feat_channels, feat_channels, False)) + cls_module.append(nn.LayerNorm(feat_channels)) + cls_module.append(nn.ReLU(inplace=True)) + self.cls_module = nn.ModuleList(cls_module) + + # reg. + reg_module = list() + for _ in range(num_reg_convs): + reg_module.append(nn.Linear(feat_channels, feat_channels, False)) + reg_module.append(nn.LayerNorm(feat_channels)) + reg_module.append(nn.ReLU(inplace=True)) + self.reg_module = nn.ModuleList(reg_module) + + # pred. + self.use_focal_loss = use_focal_loss + self.use_fed_loss = use_fed_loss + if self.use_focal_loss or self.use_fed_loss: + self.class_logits = nn.Linear(feat_channels, num_classes) + else: + self.class_logits = nn.Linear(feat_channels, num_classes + 1) + self.bboxes_delta = nn.Linear(feat_channels, 4) + self.scale_clamp = scale_clamp + self.bbox_weights = bbox_weights + + def forward(self, features, bboxes, pro_features, pooler, time_emb): + """ + :param bboxes: (N, num_boxes, 4) + :param pro_features: (N, num_boxes, feat_channels) + """ + + N, num_boxes = bboxes.shape[:2] + + # roi_feature. + proposal_boxes = list() + for b in range(N): + proposal_boxes.append(bboxes[b]) + rois = bbox2roi(proposal_boxes) + + roi_features = pooler(features, rois) + + if pro_features is None: + pro_features = roi_features.view(N, num_boxes, self.feat_channels, + -1).mean(-1) + + roi_features = roi_features.view(N * num_boxes, self.feat_channels, + -1).permute(2, 0, 1) + + # self_att. + pro_features = pro_features.view(N, num_boxes, + self.feat_channels).permute(1, 0, 2) + pro_features2 = self.self_attn( + pro_features, pro_features, value=pro_features)[0] + pro_features = pro_features + self.dropout1(pro_features2) + pro_features = self.norm1(pro_features) + + # inst_interact. + pro_features = pro_features.view( + num_boxes, N, + self.feat_channels).permute(1, 0, + 2).reshape(1, N * num_boxes, + self.feat_channels) + pro_features2 = self.inst_interact(pro_features, roi_features) + pro_features = pro_features + self.dropout2(pro_features2) + obj_features = self.norm2(pro_features) + + # obj_feature. 
+ obj_features2 = self.linear2( + self.dropout(self.activation(self.linear1(obj_features)))) + obj_features = obj_features + self.dropout3(obj_features2) + obj_features = self.norm3(obj_features) + + fc_feature = obj_features.transpose(0, 1).reshape(N * num_boxes, -1) + + scale_shift = self.block_time_mlp(time_emb) + scale_shift = torch.repeat_interleave(scale_shift, num_boxes, dim=0) + scale, shift = scale_shift.chunk(2, dim=1) + fc_feature = fc_feature * (scale + 1) + shift + + cls_feature = fc_feature.clone() + reg_feature = fc_feature.clone() + for cls_layer in self.cls_module: + cls_feature = cls_layer(cls_feature) + for reg_layer in self.reg_module: + reg_feature = reg_layer(reg_feature) + class_logits = self.class_logits(cls_feature) + bboxes_deltas = self.bboxes_delta(reg_feature) + pred_bboxes = self.apply_deltas(bboxes_deltas, bboxes.view(-1, 4)) + + return (class_logits.view(N, num_boxes, + -1), pred_bboxes.view(N, num_boxes, + -1), obj_features) + + def apply_deltas(self, deltas, boxes): + """Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`. + + Args: + deltas (Tensor): transformation deltas of shape (N, k*4), + where k >= 1. deltas[i] represents k potentially + different class-specific box transformations for + the single box boxes[i]. + boxes (Tensor): boxes to transform, of shape (N, 4) + """ + boxes = boxes.to(deltas.dtype) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = self.bbox_weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + + # Prevent sending too large values into torch.exp() + dw = torch.clamp(dw, max=self.scale_clamp) + dh = torch.clamp(dh, max=self.scale_clamp) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + pred_boxes = torch.zeros_like(deltas) + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1 + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2 + + return pred_boxes + + +class DynamicConv(nn.Module): + + def __init__(self, + feat_channels: int, + dynamic_dim: int = 64, + dynamic_num: int = 2, + pooler_resolution: int = 7) -> None: + super().__init__() + + self.feat_channels = feat_channels + self.dynamic_dim = dynamic_dim + self.dynamic_num = dynamic_num + self.num_params = self.feat_channels * self.dynamic_dim + self.dynamic_layer = nn.Linear(self.feat_channels, + self.dynamic_num * self.num_params) + + self.norm1 = nn.LayerNorm(self.dynamic_dim) + self.norm2 = nn.LayerNorm(self.feat_channels) + + self.activation = nn.ReLU(inplace=True) + + num_output = self.feat_channels * pooler_resolution**2 + self.out_layer = nn.Linear(num_output, self.feat_channels) + self.norm3 = nn.LayerNorm(self.feat_channels) + + def forward(self, pro_features: Tensor, roi_features: Tensor) -> Tensor: + """Forward function. 
+ + Args: + pro_features: (1, N * num_boxes, self.feat_channels) + roi_features: (49, N * num_boxes, self.feat_channels) + + Returns: + """ + features = roi_features.permute(1, 0, 2) + parameters = self.dynamic_layer(pro_features).permute(1, 0, 2) + + param1 = parameters[:, :, :self.num_params].view( + -1, self.feat_channels, self.dynamic_dim) + param2 = parameters[:, :, + self.num_params:].view(-1, self.dynamic_dim, + self.feat_channels) + + features = torch.bmm(features, param1) + features = self.norm1(features) + features = self.activation(features) + + features = torch.bmm(features, param2) + features = self.norm2(features) + features = self.activation(features) + + features = features.flatten(1) + features = self.out_layer(features) + features = self.norm3(features) + features = self.activation(features) + + return features diff --git a/mmdetection/projects/DiffusionDet/diffusiondet/loss.py b/mmdetection/projects/DiffusionDet/diffusiondet/loss.py new file mode 100644 index 0000000..3d532f1 --- /dev/null +++ b/mmdetection/projects/DiffusionDet/diffusiondet/loss.py @@ -0,0 +1,341 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Modified from https://github.com/ShoufaChen/DiffusionDet/blob/main/diffusiondet/loss.py # noqa + +# This work is licensed under the CC-BY-NC 4.0 License. +# Users should be careful about adopting these features in any commercial matters. # noqa +# For more details, please refer to https://github.com/ShoufaChen/DiffusionDet/blob/main/LICENSE # noqa + +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from mmdet.utils import ConfigType + + +@TASK_UTILS.register_module() +class DiffusionDetCriterion(nn.Module): + + def __init__( + self, + num_classes, + assigner: Union[ConfigDict, nn.Module], + deep_supervision=True, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + alpha=0.25, + gamma=2.0, + reduction='sum', + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=5.0), + loss_giou=dict(type='GIoULoss', reduction='sum', loss_weight=2.0), + ): + + super().__init__() + self.num_classes = num_classes + + if isinstance(assigner, nn.Module): + self.assigner = assigner + else: + self.assigner = TASK_UTILS.build(assigner) + + self.deep_supervision = deep_supervision + + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_giou = MODELS.build(loss_giou) + + def forward(self, outputs, batch_gt_instances, batch_img_metas): + batch_indices = self.assigner(outputs, batch_gt_instances, + batch_img_metas) + # Compute all the requested losses + loss_cls = self.loss_classification(outputs, batch_gt_instances, + batch_indices) + loss_bbox, loss_giou = self.loss_boxes(outputs, batch_gt_instances, + batch_indices) + + losses = dict( + loss_cls=loss_cls, loss_bbox=loss_bbox, loss_giou=loss_giou) + + if self.deep_supervision: + assert 'aux_outputs' in outputs + for i, aux_outputs in enumerate(outputs['aux_outputs']): + batch_indices = self.assigner(aux_outputs, batch_gt_instances, + batch_img_metas) + loss_cls = self.loss_classification(aux_outputs, + batch_gt_instances, + batch_indices) + loss_bbox, loss_giou = self.loss_boxes(aux_outputs, + batch_gt_instances, + 
batch_indices) + tmp_losses = dict( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_giou=loss_giou) + for name, value in tmp_losses.items(): + losses[f's.{i}.{name}'] = value + return losses + + def loss_classification(self, outputs, batch_gt_instances, indices): + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + target_classes_list = [ + gt.labels[J] for gt, (_, J) in zip(batch_gt_instances, indices) + ] + target_classes = torch.full( + src_logits.shape[:2], + self.num_classes, + dtype=torch.int64, + device=src_logits.device) + for idx in range(len(batch_gt_instances)): + target_classes[idx, indices[idx][0]] = target_classes_list[idx] + + src_logits = src_logits.flatten(0, 1) + target_classes = target_classes.flatten(0, 1) + # comp focal loss. + num_instances = max(torch.cat(target_classes_list).shape[0], 1) + loss_cls = self.loss_cls( + src_logits, + target_classes, + ) / num_instances + return loss_cls + + def loss_boxes(self, outputs, batch_gt_instances, indices): + assert 'pred_boxes' in outputs + pred_boxes = outputs['pred_boxes'] + + target_bboxes_norm_list = [ + gt.norm_bboxes_cxcywh[J] + for gt, (_, J) in zip(batch_gt_instances, indices) + ] + target_bboxes_list = [ + gt.bboxes[J] for gt, (_, J) in zip(batch_gt_instances, indices) + ] + + pred_bboxes_list = [] + pred_bboxes_norm_list = [] + for idx in range(len(batch_gt_instances)): + pred_bboxes_list.append(pred_boxes[idx, indices[idx][0]]) + image_size = batch_gt_instances[idx].image_size + pred_bboxes_norm_list.append(pred_boxes[idx, indices[idx][0]] / + image_size) + + pred_boxes_cat = torch.cat(pred_bboxes_list) + pred_boxes_norm_cat = torch.cat(pred_bboxes_norm_list) + target_bboxes_cat = torch.cat(target_bboxes_list) + target_bboxes_norm_cat = torch.cat(target_bboxes_norm_list) + + if len(pred_boxes_cat) > 0: + num_instances = pred_boxes_cat.shape[0] + + loss_bbox = self.loss_bbox( + pred_boxes_norm_cat, + bbox_cxcywh_to_xyxy(target_bboxes_norm_cat)) / num_instances + loss_giou = self.loss_giou(pred_boxes_cat, + target_bboxes_cat) / num_instances + else: + loss_bbox = pred_boxes.sum() * 0 + loss_giou = pred_boxes.sum() * 0 + return loss_bbox, loss_giou + + +@TASK_UTILS.register_module() +class DiffusionDetMatcher(nn.Module): + """This class computes an assignment between the targets and the + predictions of the network For efficiency reasons, the targets don't + include the no_object. + + Because of this, in general, there are more predictions than targets. In + this case, we do a 1-to-k (dynamic) matching of the best predictions, while + the others are un-matched (and thus treated as non-objects). + """ + + def __init__(self, + match_costs: Union[List[Union[dict, ConfigDict]], dict, + ConfigDict], + center_radius: float = 2.5, + candidate_topk: int = 5, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D'), + **kwargs): + super().__init__() + + self.center_radius = center_radius + self.candidate_topk = candidate_topk + + if isinstance(match_costs, dict): + match_costs = [match_costs] + elif isinstance(match_costs, list): + assert len(match_costs) > 0, \ + 'match_costs must not be a empty list.' 
+ self.use_focal_loss = False + self.use_fed_loss = False + for _match_cost in match_costs: + if _match_cost.get('type') == 'FocalLossCost': + self.use_focal_loss = True + if _match_cost.get('type') == 'FedLoss': + self.use_fed_loss = True + raise NotImplementedError + + self.match_costs = [ + TASK_UTILS.build(match_cost) for match_cost in match_costs + ] + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def forward(self, outputs, batch_gt_instances, batch_img_metas): + assert 'pred_logits' in outputs and 'pred_boxes' in outputs + + pred_logits = outputs['pred_logits'] + pred_bboxes = outputs['pred_boxes'] + batch_size = len(batch_gt_instances) + + assert batch_size == pred_logits.shape[0] == pred_bboxes.shape[0] + batch_indices = [] + for i in range(batch_size): + pred_instances = InstanceData() + pred_instances.bboxes = pred_bboxes[i, ...] + pred_instances.scores = pred_logits[i, ...] + gt_instances = batch_gt_instances[i] + img_meta = batch_img_metas[i] + indices = self.single_assigner(pred_instances, gt_instances, + img_meta) + batch_indices.append(indices) + return batch_indices + + def single_assigner(self, pred_instances, gt_instances, img_meta): + with torch.no_grad(): + gt_bboxes = gt_instances.bboxes + pred_bboxes = pred_instances.bboxes + num_gt = gt_bboxes.size(0) + + if num_gt == 0: # empty object in key frame + valid_mask = pred_bboxes.new_zeros((pred_bboxes.shape[0], ), + dtype=torch.bool) + matched_gt_inds = pred_bboxes.new_zeros((gt_bboxes.shape[0], ), + dtype=torch.long) + return valid_mask, matched_gt_inds + + valid_mask, is_in_boxes_and_center = \ + self.get_in_gt_and_in_center_info( + bbox_xyxy_to_cxcywh(pred_bboxes), + bbox_xyxy_to_cxcywh(gt_bboxes) + ) + + cost_list = [] + for match_cost in self.match_costs: + cost = match_cost( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta) + cost_list.append(cost) + + pairwise_ious = self.iou_calculator(pred_bboxes, gt_bboxes) + + cost_list.append((~is_in_boxes_and_center) * 100.0) + cost_matrix = torch.stack(cost_list).sum(0) + cost_matrix[~valid_mask] = cost_matrix[~valid_mask] + 10000.0 + + fg_mask_inboxes, matched_gt_inds = \ + self.dynamic_k_matching( + cost_matrix, pairwise_ious, num_gt) + return fg_mask_inboxes, matched_gt_inds + + def get_in_gt_and_in_center_info( + self, pred_bboxes: Tensor, + gt_bboxes: Tensor) -> Tuple[Tensor, Tensor]: + """Get the information of which prior is in gt bboxes and gt center + priors.""" + xy_target_gts = bbox_cxcywh_to_xyxy(gt_bboxes) # (x1, y1, x2, y2) + + pred_bboxes_center_x = pred_bboxes[:, 0].unsqueeze(1) + pred_bboxes_center_y = pred_bboxes[:, 1].unsqueeze(1) + + # whether the center of each anchor is inside a gt box + b_l = pred_bboxes_center_x > xy_target_gts[:, 0].unsqueeze(0) + b_r = pred_bboxes_center_x < xy_target_gts[:, 2].unsqueeze(0) + b_t = pred_bboxes_center_y > xy_target_gts[:, 1].unsqueeze(0) + b_b = pred_bboxes_center_y < xy_target_gts[:, 3].unsqueeze(0) + # (b_l.long()+b_r.long()+b_t.long()+b_b.long())==4 [300,num_gt] , + is_in_boxes = ((b_l.long() + b_r.long() + b_t.long() + + b_b.long()) == 4) + is_in_boxes_all = is_in_boxes.sum(1) > 0 # [num_query] + # in fixed center + center_radius = 2.5 + # Modified to self-adapted sampling --- the center size depends + # on the size of the gt boxes + # https://github.com/dulucas/UVO_Challenge/blob/main/Track1/detection/mmdet/core/bbox/assigners/rpn_sim_ota_assigner.py#L212 # noqa + b_l = pred_bboxes_center_x > ( + gt_bboxes[:, 0] - + (center_radius * + (xy_target_gts[:, 2] - 
xy_target_gts[:, 0]))).unsqueeze(0) + b_r = pred_bboxes_center_x < ( + gt_bboxes[:, 0] + + (center_radius * + (xy_target_gts[:, 2] - xy_target_gts[:, 0]))).unsqueeze(0) + b_t = pred_bboxes_center_y > ( + gt_bboxes[:, 1] - + (center_radius * + (xy_target_gts[:, 3] - xy_target_gts[:, 1]))).unsqueeze(0) + b_b = pred_bboxes_center_y < ( + gt_bboxes[:, 1] + + (center_radius * + (xy_target_gts[:, 3] - xy_target_gts[:, 1]))).unsqueeze(0) + + is_in_centers = ((b_l.long() + b_r.long() + b_t.long() + + b_b.long()) == 4) + is_in_centers_all = is_in_centers.sum(1) > 0 + + is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all + is_in_boxes_and_center = (is_in_boxes & is_in_centers) + + return is_in_boxes_anchor, is_in_boxes_and_center + + def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + num_gt: int) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets.""" + matching_matrix = torch.zeros_like(cost) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + _, cost_argmin = torch.min(cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + + while (matching_matrix.sum(0) == 0).any(): + matched_query_id = matching_matrix.sum(1) > 0 + cost[matched_query_id] += 100000.0 + unmatch_id = torch.nonzero( + matching_matrix.sum(0) == 0, as_tuple=False).squeeze(1) + for gt_idx in unmatch_id: + pos_idx = torch.argmin(cost[:, gt_idx]) + matching_matrix[:, gt_idx][pos_idx] = 1.0 + if (matching_matrix.sum(1) > 1).sum() > 0: + _, cost_argmin = torch.min(cost[prior_match_gt_mask], dim=1) + matching_matrix[prior_match_gt_mask] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin, ] = 1 + + assert not (matching_matrix.sum(0) == 0).any() + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + + return fg_mask_inboxes, matched_gt_inds diff --git a/mmdetection/projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py b/mmdetection/projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py new file mode 100644 index 0000000..101abd8 --- /dev/null +++ b/mmdetection/projects/DiffusionDet/model_converters/diffusiondet_resnet_to_mmdet.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from collections import OrderedDict + +import numpy as np +import torch +from mmengine.fileio import load + + +def convert(src, dst): + if src.endswith('pth'): + src_model = torch.load(src) + else: + src_model = load(src) + + dst_state_dict = OrderedDict() + for k, v in src_model['model'].items(): + key_name_split = k.split('.') + if 'backbone.fpn_lateral' in k: + lateral_id = int(key_name_split[-2][-1]) + name = f'neck.lateral_convs.{lateral_id - 2}.' \ + f'conv.{key_name_split[-1]}' + elif 'backbone.fpn_output' in k: + lateral_id = int(key_name_split[-2][-1]) + name = f'neck.fpn_convs.{lateral_id - 2}.conv.' 
\ + f'{key_name_split[-1]}' + elif 'backbone.bottom_up.stem.conv1.norm.' in k: + name = f'backbone.bn1.{key_name_split[-1]}' + elif 'backbone.bottom_up.stem.conv1.' in k: + name = f'backbone.conv1.{key_name_split[-1]}' + elif 'backbone.bottom_up.res' in k: + # weight_type = key_name_split[-1] + res_id = int(key_name_split[2][-1]) - 1 + # deal with short cut + if 'shortcut' in key_name_split[4]: + if 'shortcut' == key_name_split[-2]: + name = f'backbone.layer{res_id}.' \ + f'{key_name_split[3]}.downsample.0.' \ + f'{key_name_split[-1]}' + elif 'shortcut' == key_name_split[-3]: + name = f'backbone.layer{res_id}.' \ + f'{key_name_split[3]}.downsample.1.' \ + f'{key_name_split[-1]}' + else: + print(f'Unvalid key {k}') + # deal with conv + elif 'conv' in key_name_split[-2]: + conv_id = int(key_name_split[-2][-1]) + name = f'backbone.layer{res_id}.{key_name_split[3]}' \ + f'.conv{conv_id}.{key_name_split[-1]}' + # deal with BN + elif key_name_split[-2] == 'norm': + conv_id = int(key_name_split[-3][-1]) + name = f'backbone.layer{res_id}.{key_name_split[3]}.' \ + f'bn{conv_id}.{key_name_split[-1]}' + else: + print(f'{k} is invalid') + + elif key_name_split[0] == 'head': + # d2: head.xxx -> mmdet: bbox_head.xxx + name = f'bbox_{k}' + else: + # some base parameters such as beta will not convert + print(f'{k} is not converted!!') + continue + + if not isinstance(v, np.ndarray) and not isinstance(v, torch.Tensor): + raise ValueError( + 'Unsupported type found in checkpoint! {}: {}'.format( + k, type(v))) + if not isinstance(v, torch.Tensor): + dst_state_dict[name] = torch.from_numpy(v) + else: + dst_state_dict[name] = v + mmdet_model = dict(state_dict=dst_state_dict, meta=dict()) + torch.save(mmdet_model, dst) + + +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument('src', help='src detectron model path') + parser.add_argument('dst', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/mmdetection/projects/EfficientDet/README.md b/mmdetection/projects/EfficientDet/README.md new file mode 100644 index 0000000..36f4ed4 --- /dev/null +++ b/mmdetection/projects/EfficientDet/README.md @@ -0,0 +1,154 @@ +# EfficientDet + +> [**EfficientDet: Scalable and Efficient Object Detection**](https://arxiv.org/pdf/1911.09070.pdf), +> Mingxing Tan, Ruoming Pang, Quoc V. Le, +> *CVPR 2020* + +## Abstract + +This is an implementation of [EfficientDet](https://github.com/google/automl) based on [MMDetection](https://github.com/open-mmlab/mmdetection/tree/main), [MMCV](https://github.com/open-mmlab/mmcv), and [MMEngine](https://github.com/open-mmlab/mmengine). +
+EfficientDet is a new family of object detectors that consistently achieve much better efficiency than prior art across a wide +spectrum of resource constraints. +In particular, with a single model and single-scale testing, EfficientDet-D7 achieves state-of-the-art 55.1 AP on COCO test-dev with 77M parameters and 410B FLOPs. +
+BiFPN is a simple yet highly effective weighted bi-directional feature pyramid network, which introduces learnable weights to learn the importance of different input features while repeatedly applying top-down and bottom-up multi-scale feature fusion. +
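To make the weighted fusion concrete, the snippet below is a minimal PyTorch sketch of the fast normalized fusion that BiFPN applies at each fusion node. It is illustrative only, not code from this project: the module name, the two-input example, and the epsilon value are assumptions.

```python
import torch
import torch.nn as nn


class FastNormalizedFusion(nn.Module):
    """Illustrative BiFPN-style weighted feature fusion.

    Each input feature map gets a learnable, non-negative weight; the
    weights are normalized to (approximately) sum to one before fusion.
    """

    def __init__(self, num_inputs: int, eps: float = 1e-4):
        super().__init__()
        self.weights = nn.Parameter(torch.ones(num_inputs))
        self.eps = eps

    def forward(self, *feats: torch.Tensor) -> torch.Tensor:
        # ReLU keeps the learned fusion weights non-negative.
        w = torch.relu(self.weights)
        w = w / (w.sum() + self.eps)
        # All inputs are assumed to be resized to a common resolution.
        return sum(wi * f for wi, f in zip(w, feats))


# Example: fuse a top-down feature with the lateral input at one level.
p4_td = torch.randn(1, 64, 32, 32)
p4_in = torch.randn(1, 64, 32, 32)
fused = FastNormalizedFusion(num_inputs=2)(p4_td, p4_in)
print(fused.shape)  # torch.Size([1, 64, 32, 32])
```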
+In contrast to other feature pyramid networks, such as FPN, FPN + PAN, and NAS-FPN, BiFPN achieves the best accuracy with fewer parameters and FLOPs. +
+ +## Usage + +## Official TensorFlow Model + +This project also supports the [official tensorflow model](https://github.com/google/automl), which uses 90 categories and yxyx box encoding in training. If you want to use the original model weights to reproduce the official results, please refer to the following steps. + +### Model conversion + +First, download the EfficientDet [weights](https://github.com/google/automl/tree/master/efficientdet) and unpack them with the following command: + +```bash +tar -xzvf {EFFICIENTDET_WEIGHT} +``` + +Then, install TensorFlow with the following command: + +```bash +pip install tensorflow-gpu==2.6.0 +``` + +Lastly, convert the weights from TensorFlow to PyTorch with the following command: + +```bash +python projects/EfficientDet/convert_tf_to_pt.py --backbone {BACKBONE_NAME} --tensorflow_weight {TENSORFLOW_WEIGHT_PATH} --out_weight {OUT_PATH} +``` + +### Testing commands + +In MMDetection's root directory, run the following command to test the model: + +```bash +python tools/test.py projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py ${CHECKPOINT_PATH} +``` + +## Reproduce Model + +For convenience, we recommend the current implementation, which uses 80 categories and xyxy box encoding in training; on this basis, it ultimately achieves a higher result. + +### Training commands + +In MMDetection's root directory, run the following command to train the model: + +```bash +python tools/train.py projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py +``` + +### Testing commands + +In MMDetection's root directory, run the following command to test the model: + +```bash +python tools/test.py projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py ${CHECKPOINT_PATH} +``` + +## Results + +Based on mmdetection, this project aligns the accuracy of the [official model](https://github.com/google/automl).
+ +| Method | Backbone | Pretrained Model | Training set | Test set | Epoch | Val Box AP | Official AP | Download | +| :------------------------------------------------------------------------------------------------------------------: | :-------------: | :--------------: | :------------: | :----------: | :---: | :--------: | :---------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [efficientdet-d0\*](projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py) | efficientnet-b0 | ImageNet | COCO2017 Train | COCO2017 Val | 300 | 34.4 | 34.3 | | +| [efficientdet-d3](projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py) | efficientnet-b3 | ImageNet | COCO2017 Train | COCO2017 Val | 300 | 47.2 | 46.8 | [model](https://download.openmmlab.com/mmdetection/v3.0/efficientdet/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco_20230223_122457-e6f7a833.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/efficientdet/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco_20230223_122457.log.json) | + +**Note**: +\*means use [official tensorflow model](https://github.com/google/automl) weights to test. + +## Citation + +```BibTeX +@inproceedings{tan2020efficientdet, + title={Efficientdet: Scalable and efficient object detection}, + author={Tan, Mingxing and Pang, Ruoming and Le, Quoc V}, + booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition}, + pages={10781--10790}, + year={2020} +} +``` + +## Checklist + + + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + + + - [x] Basic docstrings & proper citation + + + + - [x] Test-time correctness + + + + - [x] A full README + + + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + + + - [ ] Unit tests + + + + - [ ] Code polishing + + + + - [ ] Metafile.yml + + + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
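As a quick sanity check beyond the `tools/test.py` command shown above, you can also run single-image inference through MMDetection's high-level Python API. The snippet below is a minimal sketch under stated assumptions: the checkpoint path and the test image are placeholders, and the config's `custom_imports` entry is relied on to register the EfficientDet modules when the config is loaded.

```python
# Minimal inference sketch for a trained EfficientDet checkpoint.
# The checkpoint path and image path below are placeholders.
from mmdet.apis import init_detector, inference_detector

config_file = (
    'projects/EfficientDet/configs/'
    'efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py')
checkpoint_file = 'work_dirs/efficientdet_effb3/epoch_300.pth'  # placeholder

# Build the model and load weights; the config's `custom_imports`
# registers the project-specific modules (BiFPN, EfficientDet head, ...).
model = init_detector(config_file, checkpoint_file, device='cuda:0')

# Run inference on a single image (placeholder path).
result = inference_detector(model, 'demo/demo.jpg')

# `result` is a DetDataSample; predicted boxes, scores and labels live
# in `result.pred_instances`.
print(result.pred_instances.bboxes.shape)
print(result.pred_instances.scores[:5])
```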
diff --git a/mmdetection/projects/EfficientDet/configs/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco.py b/mmdetection/projects/EfficientDet/configs/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco.py new file mode 100644 index 0000000..c7a3b30 --- /dev/null +++ b/mmdetection/projects/EfficientDet/configs/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco.py @@ -0,0 +1,171 @@ +_base_ = [ + 'mmdet::_base_/datasets/coco_detection.py', + 'mmdet::_base_/schedules/schedule_1x.py', + 'mmdet::_base_/default_runtime.py' +] +custom_imports = dict( + imports=['projects.EfficientDet.efficientdet'], allow_failed_imports=False) + +image_size = 512 +batch_augments = [ + dict(type='BatchFixedSizePad', size=(image_size, image_size)) +] +dataset_type = 'CocoDataset' +evalute_type = 'CocoMetric' +norm_cfg = dict(type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01) +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth' # noqa +model = dict( + type='EfficientDet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=image_size, + batch_augments=batch_augments), + backbone=dict( + type='EfficientNet', + arch='b0', + drop_path_rate=0.2, + out_indices=(3, 4, 5), + frozen_stages=0, + conv_cfg=dict(type='Conv2dSamePadding'), + norm_cfg=norm_cfg, + norm_eval=False, + init_cfg=dict( + type='Pretrained', prefix='backbone', checkpoint=checkpoint)), + neck=dict( + type='BiFPN', + num_stages=3, + in_channels=[40, 112, 320], + out_channels=64, + start_level=0, + norm_cfg=norm_cfg), + bbox_head=dict( + type='EfficientDetSepBNHead', + num_classes=80, + num_ins=5, + in_channels=64, + feat_channels=64, + stacked_convs=3, + norm_cfg=norm_cfg, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[1.0, 0.5, 2.0], + strides=[8, 16, 32, 64, 128], + center_offset=0.5), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=1.5, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='HuberLoss', beta=0.1, loss_weight=50)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0, + ignore_iof_thr=-1), + sampler=dict( + type='PseudoSampler'), # Focal loss should use PseudoSampler + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict( + type='soft_nms', + iou_threshold=0.3, + sigma=0.5, + min_score=1e-3, + method='gaussian'), + max_per_img=100)) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(image_size, image_size), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(image_size, image_size)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(image_size, image_size), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = 
dict( + batch_size=16, + num_workers=8, + dataset=dict(type=dataset_type, pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(type=dataset_type, pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type=evalute_type) +test_evaluator = val_evaluator + +optim_wrapper = dict( + optimizer=dict(lr=0.16, weight_decay=4e-5), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True), + clip_grad=dict(max_norm=10, norm_type=2)) + +# learning policy +max_epochs = 300 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=917), + dict( + type='CosineAnnealingLR', + eta_min=0.0, + begin=1, + T_max=299, + end=300, + by_epoch=True, + convert_to_iter_based=True) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=15)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49) +] +# cudnn_benchmark=True can accelerate fix-size training +env_cfg = dict(cudnn_benchmark=True) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/mmdetection/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco-90cls.py b/mmdetection/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco-90cls.py new file mode 100644 index 0000000..fe82a5e --- /dev/null +++ b/mmdetection/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco-90cls.py @@ -0,0 +1,171 @@ +_base_ = [ + 'mmdet::_base_/datasets/coco_detection.py', + 'mmdet::_base_/schedules/schedule_1x.py', + 'mmdet::_base_/default_runtime.py' +] +custom_imports = dict( + imports=['projects.EfficientDet.efficientdet'], allow_failed_imports=False) + +image_size = 896 +batch_augments = [ + dict(type='BatchFixedSizePad', size=(image_size, image_size)) +] +dataset_type = 'Coco90Dataset' +evalute_type = 'Coco90Metric' +norm_cfg = dict(type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01) +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth' # noqa +model = dict( + type='EfficientDet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=image_size, + batch_augments=batch_augments), + backbone=dict( + type='EfficientNet', + arch='b3', + drop_path_rate=0.3, + out_indices=(3, 4, 5), + frozen_stages=0, + conv_cfg=dict(type='Conv2dSamePadding'), + norm_cfg=norm_cfg, + norm_eval=False, + init_cfg=dict( + type='Pretrained', prefix='backbone', checkpoint=checkpoint)), + neck=dict( + type='BiFPN', + num_stages=6, + in_channels=[48, 136, 384], + out_channels=160, + start_level=0, + norm_cfg=norm_cfg), + bbox_head=dict( + type='EfficientDetSepBNHead', + num_classes=90, + num_ins=5, + in_channels=160, + feat_channels=160, + stacked_convs=4, + norm_cfg=norm_cfg, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[1.0, 0.5, 2.0], + strides=[8, 16, 32, 64, 128], + center_offset=0.5), + 
bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=1.5, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='HuberLoss', beta=0.1, loss_weight=50)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0, + ignore_iof_thr=-1), + sampler=dict( + type='PseudoSampler'), # Focal loss should use PseudoSampler + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict( + type='soft_nms', + iou_threshold=0.3, + sigma=0.5, + min_score=1e-3, + method='gaussian'), + max_per_img=100)) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(image_size, image_size), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(image_size, image_size)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(image_size, image_size), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + dataset=dict(type=dataset_type, pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(type=dataset_type, pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type=evalute_type) +test_evaluator = val_evaluator + +optim_wrapper = dict( + optimizer=dict(lr=0.16, weight_decay=4e-5), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True), + clip_grad=dict(max_norm=10, norm_type=2)) + +# learning policy +max_epochs = 300 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=917), + dict( + type='CosineAnnealingLR', + eta_min=0.0, + begin=1, + T_max=299, + end=300, + by_epoch=True, + convert_to_iter_based=True) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=15)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49) +] +# cudnn_benchmark=True can accelerate fix-size training +env_cfg = dict(cudnn_benchmark=True) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
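+# Passing `--auto-scale-lr` to the standard MMDetection tools/train.py entry
+# point (an assumption about how this config is launched) rescales lr=0.16
+# linearly by actual_batch_size / base_batch_size.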
+# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/mmdetection/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py b/mmdetection/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py new file mode 100644 index 0000000..2079e2a --- /dev/null +++ b/mmdetection/projects/EfficientDet/configs/efficientdet_effb3_bifpn_8xb16-crop896-300e_coco.py @@ -0,0 +1,171 @@ +_base_ = [ + 'mmdet::_base_/datasets/coco_detection.py', + 'mmdet::_base_/schedules/schedule_1x.py', + 'mmdet::_base_/default_runtime.py' +] +custom_imports = dict( + imports=['projects.EfficientDet.efficientdet'], allow_failed_imports=False) + +image_size = 896 +batch_augments = [ + dict(type='BatchFixedSizePad', size=(image_size, image_size)) +] +dataset_type = 'CocoDataset' +evalute_type = 'CocoMetric' +norm_cfg = dict(type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01) +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth' # noqa +model = dict( + type='EfficientDet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=image_size, + batch_augments=batch_augments), + backbone=dict( + type='EfficientNet', + arch='b3', + drop_path_rate=0.3, + out_indices=(3, 4, 5), + frozen_stages=0, + conv_cfg=dict(type='Conv2dSamePadding'), + norm_cfg=norm_cfg, + norm_eval=False, + init_cfg=dict( + type='Pretrained', prefix='backbone', checkpoint=checkpoint)), + neck=dict( + type='BiFPN', + num_stages=6, + in_channels=[48, 136, 384], + out_channels=160, + start_level=0, + norm_cfg=norm_cfg), + bbox_head=dict( + type='EfficientDetSepBNHead', + num_classes=80, + num_ins=5, + in_channels=160, + feat_channels=160, + stacked_convs=4, + norm_cfg=norm_cfg, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[1.0, 0.5, 2.0], + strides=[8, 16, 32, 64, 128], + center_offset=0.5), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=1.5, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='HuberLoss', beta=0.1, loss_weight=50)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0, + ignore_iof_thr=-1), + sampler=dict( + type='PseudoSampler'), # Focal loss should use PseudoSampler + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict( + type='soft_nms', + iou_threshold=0.3, + sigma=0.5, + min_score=1e-3, + method='gaussian'), + max_per_img=100)) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(image_size, image_size), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(image_size, image_size)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(image_size, image_size), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + 
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + dataset=dict(type=dataset_type, pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(type=dataset_type, pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type=evalute_type) +test_evaluator = val_evaluator + +optim_wrapper = dict( + optimizer=dict(lr=0.16, weight_decay=4e-5), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True), + clip_grad=dict(max_norm=10, norm_type=2)) + +# learning policy +max_epochs = 300 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=917), + dict( + type='CosineAnnealingLR', + eta_min=0.0, + begin=1, + T_max=299, + end=300, + by_epoch=True, + convert_to_iter_based=True) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=15)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49) +] +# cudnn_benchmark=True can accelerate fix-size training +env_cfg = dict(cudnn_benchmark=True) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/mmdetection/projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py b/mmdetection/projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py new file mode 100644 index 0000000..bf3d3fc --- /dev/null +++ b/mmdetection/projects/EfficientDet/configs/tensorflow/efficientdet_effb0_bifpn_8xb16-crop512-300e_coco_tf.py @@ -0,0 +1,171 @@ +_base_ = [ + 'mmdet::_base_/datasets/coco_detection.py', + 'mmdet::_base_/schedules/schedule_1x.py', + 'mmdet::_base_/default_runtime.py' +] +custom_imports = dict( + imports=['projects.EfficientDet.efficientdet'], allow_failed_imports=False) + +image_size = 512 +batch_augments = [ + dict(type='BatchFixedSizePad', size=(image_size, image_size)) +] +dataset_type = 'Coco90Dataset' +evalute_type = 'Coco90Metric' +norm_cfg = dict(type='SyncBN', requires_grad=True, eps=1e-3, momentum=0.01) +checkpoint = 'https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth' # noqa +model = dict( + type='EfficientDet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=image_size, + batch_augments=batch_augments), + backbone=dict( + type='EfficientNet', + arch='b0', + drop_path_rate=0.2, + out_indices=(3, 4, 5), + frozen_stages=0, + conv_cfg=dict(type='Conv2dSamePadding'), + norm_cfg=norm_cfg, + norm_eval=False, + init_cfg=dict( + type='Pretrained', prefix='backbone', checkpoint=checkpoint)), + neck=dict( + type='BiFPN', + num_stages=3, + in_channels=[40, 112, 320], + out_channels=64, + start_level=0, + norm_cfg=norm_cfg), + bbox_head=dict( + type='EfficientDetSepBNHead', + num_classes=90, + num_ins=5, + in_channels=64, + feat_channels=64, + stacked_convs=3, + norm_cfg=norm_cfg, + anchor_generator=dict( + type='YXYXAnchorGenerator', + 
octave_base_scale=4, + scales_per_octave=3, + ratios=[1.0, 0.5, 2.0], + strides=[8, 16, 32, 64, 128], + center_offset=0.5), + bbox_coder=dict( + type='YXYXDeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=1.5, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='HuberLoss', beta=0.1, loss_weight=50)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='TransMaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0, + ignore_iof_thr=-1), + sampler=dict( + type='PseudoSampler'), # Focal loss should use PseudoSampler + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict( + type='soft_nms', + iou_threshold=0.3, + sigma=0.5, + min_score=1e-3, + method='gaussian'), + max_per_img=100)) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='RandomResize', + scale=(image_size, image_size), + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(image_size, image_size)), + dict(type='RandomFlip', prob=0.5), + dict(type='PackDetInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args={{_base_.backend_args}}), + dict(type='Resize', scale=(image_size, image_size), keep_ratio=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + dataset=dict(type=dataset_type, pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(type=dataset_type, pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type=evalute_type) +test_evaluator = val_evaluator + +optim_wrapper = dict( + optimizer=dict(lr=0.16, weight_decay=4e-5), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True), + clip_grad=dict(max_norm=10, norm_type=2)) + +# learning policy +max_epochs = 300 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=917), + dict( + type='CosineAnnealingLR', + eta_min=0.0, + begin=1, + T_max=299, + end=300, + by_epoch=True, + convert_to_iter_based=True) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=15)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + priority=49) +] +# cudnn_benchmark=True can accelerate fix-size training +env_cfg = dict(cudnn_benchmark=True) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. 
+# base_batch_size = (8 GPUs) x (16 samples per GPU) +auto_scale_lr = dict(base_batch_size=128) diff --git a/mmdetection/projects/EfficientDet/convert_tf_to_pt.py b/mmdetection/projects/EfficientDet/convert_tf_to_pt.py new file mode 100644 index 0000000..f3b127f --- /dev/null +++ b/mmdetection/projects/EfficientDet/convert_tf_to_pt.py @@ -0,0 +1,626 @@ +import argparse + +import numpy as np +import torch +from tensorflow.python.training import py_checkpoint_reader + +torch.set_printoptions(precision=20) + + +def tf2pth(v): + if v.ndim == 4: + return np.ascontiguousarray(v.transpose(3, 2, 0, 1)) + elif v.ndim == 2: + return np.ascontiguousarray(v.transpose()) + return v + + +def convert_key(model_name, bifpn_repeats, weights): + + p6_w1 = [ + torch.tensor([-1e4, -1e4], dtype=torch.float64) + for _ in range(bifpn_repeats) + ] + p5_w1 = [ + torch.tensor([-1e4, -1e4], dtype=torch.float64) + for _ in range(bifpn_repeats) + ] + p4_w1 = [ + torch.tensor([-1e4, -1e4], dtype=torch.float64) + for _ in range(bifpn_repeats) + ] + p3_w1 = [ + torch.tensor([-1e4, -1e4], dtype=torch.float64) + for _ in range(bifpn_repeats) + ] + p4_w2 = [ + torch.tensor([-1e4, -1e4, -1e4], dtype=torch.float64) + for _ in range(bifpn_repeats) + ] + p5_w2 = [ + torch.tensor([-1e4, -1e4, -1e4], dtype=torch.float64) + for _ in range(bifpn_repeats) + ] + p6_w2 = [ + torch.tensor([-1e4, -1e4, -1e4], dtype=torch.float64) + for _ in range(bifpn_repeats) + ] + p7_w2 = [ + torch.tensor([-1e4, -1e4], dtype=torch.float64) + for _ in range(bifpn_repeats) + ] + idx2key = { + 0: '1.0', + 1: '2.0', + 2: '2.1', + 3: '3.0', + 4: '3.1', + 5: '4.0', + 6: '4.1', + 7: '4.2', + 8: '4.3', + 9: '4.4', + 10: '4.5', + 11: '5.0', + 12: '5.1', + 13: '5.2', + 14: '5.3', + 15: '5.4' + } + m = dict() + for k, v in weights.items(): + + if 'Exponential' in k or 'global_step' in k: + continue + + seg = k.split('/') + if len(seg) == 1: + continue + if seg[2] == 'depthwise_conv2d': + v = v.transpose(1, 0) + + if seg[0] == model_name: + if seg[1] == 'stem': + prefix = 'backbone.layers.0' + mapping = { + 'conv2d/kernel': 'conv.weight', + 'tpu_batch_normalization/beta': 'bn.bias', + 'tpu_batch_normalization/gamma': 'bn.weight', + 'tpu_batch_normalization/moving_mean': 'bn.running_mean', + 'tpu_batch_normalization/moving_variance': + 'bn.running_var', + } + suffix = mapping['/'.join(seg[2:])] + m[prefix + '.' 
+ suffix] = v + + elif seg[1].startswith('blocks_'): + idx = int(seg[1][7:]) + prefix = '.'.join(['backbone', 'layers', idx2key[idx]]) + base_mapping = { + 'depthwise_conv2d/depthwise_kernel': + 'depthwise_conv.conv.weight', + 'se/conv2d/kernel': 'se.conv1.conv.weight', + 'se/conv2d/bias': 'se.conv1.conv.bias', + 'se/conv2d_1/kernel': 'se.conv2.conv.weight', + 'se/conv2d_1/bias': 'se.conv2.conv.bias' + } + if idx == 0: + mapping = { + 'conv2d/kernel': + 'linear_conv.conv.weight', + 'tpu_batch_normalization/beta': + 'depthwise_conv.bn.bias', + 'tpu_batch_normalization/gamma': + 'depthwise_conv.bn.weight', + 'tpu_batch_normalization/moving_mean': + 'depthwise_conv.bn.running_mean', + 'tpu_batch_normalization/moving_variance': + 'depthwise_conv.bn.running_var', + 'tpu_batch_normalization_1/beta': + 'linear_conv.bn.bias', + 'tpu_batch_normalization_1/gamma': + 'linear_conv.bn.weight', + 'tpu_batch_normalization_1/moving_mean': + 'linear_conv.bn.running_mean', + 'tpu_batch_normalization_1/moving_variance': + 'linear_conv.bn.running_var', + } + else: + mapping = { + 'depthwise_conv2d/depthwise_kernel': + 'depthwise_conv.conv.weight', + 'conv2d/kernel': + 'expand_conv.conv.weight', + 'conv2d_1/kernel': + 'linear_conv.conv.weight', + 'tpu_batch_normalization/beta': + 'expand_conv.bn.bias', + 'tpu_batch_normalization/gamma': + 'expand_conv.bn.weight', + 'tpu_batch_normalization/moving_mean': + 'expand_conv.bn.running_mean', + 'tpu_batch_normalization/moving_variance': + 'expand_conv.bn.running_var', + 'tpu_batch_normalization_1/beta': + 'depthwise_conv.bn.bias', + 'tpu_batch_normalization_1/gamma': + 'depthwise_conv.bn.weight', + 'tpu_batch_normalization_1/moving_mean': + 'depthwise_conv.bn.running_mean', + 'tpu_batch_normalization_1/moving_variance': + 'depthwise_conv.bn.running_var', + 'tpu_batch_normalization_2/beta': + 'linear_conv.bn.bias', + 'tpu_batch_normalization_2/gamma': + 'linear_conv.bn.weight', + 'tpu_batch_normalization_2/moving_mean': + 'linear_conv.bn.running_mean', + 'tpu_batch_normalization_2/moving_variance': + 'linear_conv.bn.running_var', + } + mapping.update(base_mapping) + suffix = mapping['/'.join(seg[2:])] + m[prefix + '.' + suffix] = v + elif seg[0] == 'resample_p6': + prefix = 'neck.bifpn.0.p5_to_p6.0' + mapping = { + 'conv2d/kernel': 'down_conv.weight', + 'conv2d/bias': 'down_conv.bias', + 'bn/beta': 'bn.bias', + 'bn/gamma': 'bn.weight', + 'bn/moving_mean': 'bn.running_mean', + 'bn/moving_variance': 'bn.running_var', + } + suffix = mapping['/'.join(seg[1:])] + m[prefix + '.' + suffix] = v + elif seg[0] == 'fpn_cells': + fpn_idx = int(seg[1][5:]) + prefix = '.'.join(['neck', 'bifpn', str(fpn_idx)]) + fnode_id = int(seg[2][5]) + if fnode_id == 0: + mapping = { + 'op_after_combine5/conv/depthwise_kernel': + 'conv6_up.depthwise_conv.weight', + 'op_after_combine5/conv/pointwise_kernel': + 'conv6_up.pointwise_conv.weight', + 'op_after_combine5/conv/bias': + 'conv6_up.pointwise_conv.bias', + 'op_after_combine5/bn/beta': + 'conv6_up.bn.bias', + 'op_after_combine5/bn/gamma': + 'conv6_up.bn.weight', + 'op_after_combine5/bn/moving_mean': + 'conv6_up.bn.running_mean', + 'op_after_combine5/bn/moving_variance': + 'conv6_up.bn.running_var', + } + if seg[3] != 'WSM' and seg[3] != 'WSM_1': + suffix = mapping['/'.join(seg[3:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' 
+ suffix] = v + elif seg[3] == 'WSM': + p6_w1[fpn_idx][0] = v + elif seg[3] == 'WSM_1': + p6_w1[fpn_idx][1] = v + if torch.min(p6_w1[fpn_idx]) > -1e4: + m[prefix + '.p6_w1'] = p6_w1[fpn_idx] + elif fnode_id == 1: + base_mapping = { + 'op_after_combine6/conv/depthwise_kernel': + 'conv5_up.depthwise_conv.weight', + 'op_after_combine6/conv/pointwise_kernel': + 'conv5_up.pointwise_conv.weight', + 'op_after_combine6/conv/bias': + 'conv5_up.pointwise_conv.bias', + 'op_after_combine6/bn/beta': + 'conv5_up.bn.bias', + 'op_after_combine6/bn/gamma': + 'conv5_up.bn.weight', + 'op_after_combine6/bn/moving_mean': + 'conv5_up.bn.running_mean', + 'op_after_combine6/bn/moving_variance': + 'conv5_up.bn.running_var', + } + if fpn_idx == 0: + mapping = { + 'resample_0_2_6/conv2d/kernel': + 'p5_down_channel.down_conv.weight', + 'resample_0_2_6/conv2d/bias': + 'p5_down_channel.down_conv.bias', + 'resample_0_2_6/bn/beta': + 'p5_down_channel.bn.bias', + 'resample_0_2_6/bn/gamma': + 'p5_down_channel.bn.weight', + 'resample_0_2_6/bn/moving_mean': + 'p5_down_channel.bn.running_mean', + 'resample_0_2_6/bn/moving_variance': + 'p5_down_channel.bn.running_var', + } + base_mapping.update(mapping) + if seg[3] != 'WSM' and seg[3] != 'WSM_1': + suffix = base_mapping['/'.join(seg[3:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' + suffix] = v + elif seg[3] == 'WSM': + p5_w1[fpn_idx][0] = v + elif seg[3] == 'WSM_1': + p5_w1[fpn_idx][1] = v + if torch.min(p5_w1[fpn_idx]) > -1e4: + m[prefix + '.p5_w1'] = p5_w1[fpn_idx] + elif fnode_id == 2: + base_mapping = { + 'op_after_combine7/conv/depthwise_kernel': + 'conv4_up.depthwise_conv.weight', + 'op_after_combine7/conv/pointwise_kernel': + 'conv4_up.pointwise_conv.weight', + 'op_after_combine7/conv/bias': + 'conv4_up.pointwise_conv.bias', + 'op_after_combine7/bn/beta': + 'conv4_up.bn.bias', + 'op_after_combine7/bn/gamma': + 'conv4_up.bn.weight', + 'op_after_combine7/bn/moving_mean': + 'conv4_up.bn.running_mean', + 'op_after_combine7/bn/moving_variance': + 'conv4_up.bn.running_var', + } + if fpn_idx == 0: + mapping = { + 'resample_0_1_7/conv2d/kernel': + 'p4_down_channel.down_conv.weight', + 'resample_0_1_7/conv2d/bias': + 'p4_down_channel.down_conv.bias', + 'resample_0_1_7/bn/beta': + 'p4_down_channel.bn.bias', + 'resample_0_1_7/bn/gamma': + 'p4_down_channel.bn.weight', + 'resample_0_1_7/bn/moving_mean': + 'p4_down_channel.bn.running_mean', + 'resample_0_1_7/bn/moving_variance': + 'p4_down_channel.bn.running_var', + } + base_mapping.update(mapping) + if seg[3] != 'WSM' and seg[3] != 'WSM_1': + suffix = base_mapping['/'.join(seg[3:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' 
+ suffix] = v + elif seg[3] == 'WSM': + p4_w1[fpn_idx][0] = v + elif seg[3] == 'WSM_1': + p4_w1[fpn_idx][1] = v + if torch.min(p4_w1[fpn_idx]) > -1e4: + m[prefix + '.p4_w1'] = p4_w1[fpn_idx] + elif fnode_id == 3: + + base_mapping = { + 'op_after_combine8/conv/depthwise_kernel': + 'conv3_up.depthwise_conv.weight', + 'op_after_combine8/conv/pointwise_kernel': + 'conv3_up.pointwise_conv.weight', + 'op_after_combine8/conv/bias': + 'conv3_up.pointwise_conv.bias', + 'op_after_combine8/bn/beta': + 'conv3_up.bn.bias', + 'op_after_combine8/bn/gamma': + 'conv3_up.bn.weight', + 'op_after_combine8/bn/moving_mean': + 'conv3_up.bn.running_mean', + 'op_after_combine8/bn/moving_variance': + 'conv3_up.bn.running_var', + } + if fpn_idx == 0: + mapping = { + 'resample_0_0_8/conv2d/kernel': + 'p3_down_channel.down_conv.weight', + 'resample_0_0_8/conv2d/bias': + 'p3_down_channel.down_conv.bias', + 'resample_0_0_8/bn/beta': + 'p3_down_channel.bn.bias', + 'resample_0_0_8/bn/gamma': + 'p3_down_channel.bn.weight', + 'resample_0_0_8/bn/moving_mean': + 'p3_down_channel.bn.running_mean', + 'resample_0_0_8/bn/moving_variance': + 'p3_down_channel.bn.running_var', + } + base_mapping.update(mapping) + if seg[3] != 'WSM' and seg[3] != 'WSM_1': + suffix = base_mapping['/'.join(seg[3:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' + suffix] = v + elif seg[3] == 'WSM': + p3_w1[fpn_idx][0] = v + elif seg[3] == 'WSM_1': + p3_w1[fpn_idx][1] = v + if torch.min(p3_w1[fpn_idx]) > -1e4: + m[prefix + '.p3_w1'] = p3_w1[fpn_idx] + elif fnode_id == 4: + base_mapping = { + 'op_after_combine9/conv/depthwise_kernel': + 'conv4_down.depthwise_conv.weight', + 'op_after_combine9/conv/pointwise_kernel': + 'conv4_down.pointwise_conv.weight', + 'op_after_combine9/conv/bias': + 'conv4_down.pointwise_conv.bias', + 'op_after_combine9/bn/beta': + 'conv4_down.bn.bias', + 'op_after_combine9/bn/gamma': + 'conv4_down.bn.weight', + 'op_after_combine9/bn/moving_mean': + 'conv4_down.bn.running_mean', + 'op_after_combine9/bn/moving_variance': + 'conv4_down.bn.running_var', + } + if fpn_idx == 0: + mapping = { + 'resample_0_1_9/conv2d/kernel': + 'p4_level_connection.down_conv.weight', + 'resample_0_1_9/conv2d/bias': + 'p4_level_connection.down_conv.bias', + 'resample_0_1_9/bn/beta': + 'p4_level_connection.bn.bias', + 'resample_0_1_9/bn/gamma': + 'p4_level_connection.bn.weight', + 'resample_0_1_9/bn/moving_mean': + 'p4_level_connection.bn.running_mean', + 'resample_0_1_9/bn/moving_variance': + 'p4_level_connection.bn.running_var', + } + base_mapping.update(mapping) + if seg[3] != 'WSM' and seg[3] != 'WSM_1' and seg[3] != 'WSM_2': + suffix = base_mapping['/'.join(seg[3:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' 
+ suffix] = v + elif seg[3] == 'WSM': + p4_w2[fpn_idx][0] = v + elif seg[3] == 'WSM_1': + p4_w2[fpn_idx][1] = v + elif seg[3] == 'WSM_2': + p4_w2[fpn_idx][2] = v + if torch.min(p4_w2[fpn_idx]) > -1e4: + m[prefix + '.p4_w2'] = p4_w2[fpn_idx] + elif fnode_id == 5: + base_mapping = { + 'op_after_combine10/conv/depthwise_kernel': + 'conv5_down.depthwise_conv.weight', + 'op_after_combine10/conv/pointwise_kernel': + 'conv5_down.pointwise_conv.weight', + 'op_after_combine10/conv/bias': + 'conv5_down.pointwise_conv.bias', + 'op_after_combine10/bn/beta': + 'conv5_down.bn.bias', + 'op_after_combine10/bn/gamma': + 'conv5_down.bn.weight', + 'op_after_combine10/bn/moving_mean': + 'conv5_down.bn.running_mean', + 'op_after_combine10/bn/moving_variance': + 'conv5_down.bn.running_var', + } + if fpn_idx == 0: + mapping = { + 'resample_0_2_10/conv2d/kernel': + 'p5_level_connection.down_conv.weight', + 'resample_0_2_10/conv2d/bias': + 'p5_level_connection.down_conv.bias', + 'resample_0_2_10/bn/beta': + 'p5_level_connection.bn.bias', + 'resample_0_2_10/bn/gamma': + 'p5_level_connection.bn.weight', + 'resample_0_2_10/bn/moving_mean': + 'p5_level_connection.bn.running_mean', + 'resample_0_2_10/bn/moving_variance': + 'p5_level_connection.bn.running_var', + } + base_mapping.update(mapping) + if seg[3] != 'WSM' and seg[3] != 'WSM_1' and seg[3] != 'WSM_2': + suffix = base_mapping['/'.join(seg[3:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' + suffix] = v + elif seg[3] == 'WSM': + p5_w2[fpn_idx][0] = v + elif seg[3] == 'WSM_1': + p5_w2[fpn_idx][1] = v + elif seg[3] == 'WSM_2': + p5_w2[fpn_idx][2] = v + if torch.min(p5_w2[fpn_idx]) > -1e4: + m[prefix + '.p5_w2'] = p5_w2[fpn_idx] + elif fnode_id == 6: + base_mapping = { + 'op_after_combine11/conv/depthwise_kernel': + 'conv6_down.depthwise_conv.weight', + 'op_after_combine11/conv/pointwise_kernel': + 'conv6_down.pointwise_conv.weight', + 'op_after_combine11/conv/bias': + 'conv6_down.pointwise_conv.bias', + 'op_after_combine11/bn/beta': + 'conv6_down.bn.bias', + 'op_after_combine11/bn/gamma': + 'conv6_down.bn.weight', + 'op_after_combine11/bn/moving_mean': + 'conv6_down.bn.running_mean', + 'op_after_combine11/bn/moving_variance': + 'conv6_down.bn.running_var', + } + if seg[3] != 'WSM' and seg[3] != 'WSM_1' and seg[3] != 'WSM_2': + suffix = base_mapping['/'.join(seg[3:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' + suffix] = v + elif seg[3] == 'WSM': + p6_w2[fpn_idx][0] = v + elif seg[3] == 'WSM_1': + p6_w2[fpn_idx][1] = v + elif seg[3] == 'WSM_2': + p6_w2[fpn_idx][2] = v + if torch.min(p6_w2[fpn_idx]) > -1e4: + m[prefix + '.p6_w2'] = p6_w2[fpn_idx] + elif fnode_id == 7: + base_mapping = { + 'op_after_combine12/conv/depthwise_kernel': + 'conv7_down.depthwise_conv.weight', + 'op_after_combine12/conv/pointwise_kernel': + 'conv7_down.pointwise_conv.weight', + 'op_after_combine12/conv/bias': + 'conv7_down.pointwise_conv.bias', + 'op_after_combine12/bn/beta': + 'conv7_down.bn.bias', + 'op_after_combine12/bn/gamma': + 'conv7_down.bn.weight', + 'op_after_combine12/bn/moving_mean': + 'conv7_down.bn.running_mean', + 'op_after_combine12/bn/moving_variance': + 'conv7_down.bn.running_var', + } + if seg[3] != 'WSM' and seg[3] != 'WSM_1' and seg[3] != 'WSM_2': + suffix = base_mapping['/'.join(seg[3:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' 
+ suffix] = v + elif seg[3] == 'WSM': + p7_w2[fpn_idx][0] = v + elif seg[3] == 'WSM_1': + p7_w2[fpn_idx][1] = v + if torch.min(p7_w2[fpn_idx]) > -1e4: + m[prefix + '.p7_w2'] = p7_w2[fpn_idx] + elif seg[0] == 'box_net': + if 'box-predict' in seg[1]: + prefix = '.'.join(['bbox_head', 'reg_header']) + base_mapping = { + 'depthwise_kernel': 'depthwise_conv.weight', + 'pointwise_kernel': 'pointwise_conv.weight', + 'bias': 'pointwise_conv.bias' + } + suffix = base_mapping['/'.join(seg[2:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' + suffix] = v + elif 'bn' in seg[1]: + bbox_conv_idx = int(seg[1][4]) + bbox_bn_idx = int(seg[1][9]) - 3 + prefix = '.'.join([ + 'bbox_head', 'reg_bn_list', + str(bbox_conv_idx), + str(bbox_bn_idx) + ]) + base_mapping = { + 'beta': 'bias', + 'gamma': 'weight', + 'moving_mean': 'running_mean', + 'moving_variance': 'running_var' + } + suffix = base_mapping['/'.join(seg[2:])] + m[prefix + '.' + suffix] = v + else: + bbox_conv_idx = int(seg[1][4]) + prefix = '.'.join( + ['bbox_head', 'reg_conv_list', + str(bbox_conv_idx)]) + base_mapping = { + 'depthwise_kernel': 'depthwise_conv.weight', + 'pointwise_kernel': 'pointwise_conv.weight', + 'bias': 'pointwise_conv.bias' + } + suffix = base_mapping['/'.join(seg[2:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' + suffix] = v + elif seg[0] == 'class_net': + if 'class-predict' in seg[1]: + prefix = '.'.join(['bbox_head', 'cls_header']) + base_mapping = { + 'depthwise_kernel': 'depthwise_conv.weight', + 'pointwise_kernel': 'pointwise_conv.weight', + 'bias': 'pointwise_conv.bias' + } + suffix = base_mapping['/'.join(seg[2:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' + suffix] = v + elif 'bn' in seg[1]: + cls_conv_idx = int(seg[1][6]) + cls_bn_idx = int(seg[1][11]) - 3 + prefix = '.'.join([ + 'bbox_head', 'cls_bn_list', + str(cls_conv_idx), + str(cls_bn_idx) + ]) + base_mapping = { + 'beta': 'bias', + 'gamma': 'weight', + 'moving_mean': 'running_mean', + 'moving_variance': 'running_var' + } + suffix = base_mapping['/'.join(seg[2:])] + m[prefix + '.' + suffix] = v + else: + cls_conv_idx = int(seg[1][6]) + prefix = '.'.join( + ['bbox_head', 'cls_conv_list', + str(cls_conv_idx)]) + base_mapping = { + 'depthwise_kernel': 'depthwise_conv.weight', + 'pointwise_kernel': 'pointwise_conv.weight', + 'bias': 'pointwise_conv.bias' + } + suffix = base_mapping['/'.join(seg[2:])] + if 'depthwise_conv' in suffix: + v = v.transpose(1, 0) + m[prefix + '.' 
+ suffix] = v + return m + + +def parse_args(): + parser = argparse.ArgumentParser( + description='convert efficientdet weight from tensorflow to pytorch') + parser.add_argument( + '--backbone', + type=str, + help='efficientnet model name, like efficientnet-b0') + parser.add_argument( + '--tensorflow_weight', + type=str, + help='efficientdet tensorflow weight name, like efficientdet-d0/model') + parser.add_argument( + '--out_weight', + type=str, + help='efficientdet pytorch weight name like demo.pth') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + model_name = args.backbone + ori_weight_name = args.tensorflow_weight + out_name = args.out_weight + + repeat_map = { + 0: 3, + 1: 4, + 2: 5, + 3: 6, + 4: 7, + 5: 7, + 6: 8, + 7: 8, + } + + reader = py_checkpoint_reader.NewCheckpointReader(ori_weight_name) + weights = { + n: torch.as_tensor(tf2pth(reader.get_tensor(n))) + for (n, _) in reader.get_variable_to_shape_map().items() + } + bifpn_repeats = repeat_map[int(model_name[14])] + out = convert_key(model_name, bifpn_repeats, weights) + result = {'state_dict': out} + torch.save(result, out_name) + + +if __name__ == '__main__': + main() diff --git a/mmdetection/projects/EfficientDet/efficientdet/__init__.py b/mmdetection/projects/EfficientDet/efficientdet/__init__.py new file mode 100644 index 0000000..b6c66bc --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/__init__.py @@ -0,0 +1,16 @@ +from .bifpn import BiFPN +from .efficientdet import EfficientDet +from .efficientdet_head import EfficientDetSepBNHead +from .huber_loss import HuberLoss +from .tensorflow.anchor_generator import YXYXAnchorGenerator +from .tensorflow.coco_90class import Coco90Dataset +from .tensorflow.coco_90metric import Coco90Metric +from .tensorflow.trans_max_iou_assigner import TransMaxIoUAssigner +from .tensorflow.yxyx_bbox_coder import YXYXDeltaXYWHBBoxCoder +from .utils import Conv2dSamePadding + +__all__ = [ + 'EfficientDet', 'BiFPN', 'HuberLoss', 'EfficientDetSepBNHead', + 'Conv2dSamePadding', 'Coco90Dataset', 'Coco90Metric', + 'YXYXAnchorGenerator', 'TransMaxIoUAssigner', 'YXYXDeltaXYWHBBoxCoder' +] diff --git a/mmdetection/projects/EfficientDet/efficientdet/bifpn.py b/mmdetection/projects/EfficientDet/efficientdet/bifpn.py new file mode 100644 index 0000000..56356c3 --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/bifpn.py @@ -0,0 +1,306 @@ +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn.bricks import Swish +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig, OptConfigType +from .utils import DepthWiseConvBlock, DownChannelBlock, MaxPool2dSamePadding + + +class BiFPNStage(nn.Module): + """ + in_channels: List[int], input dim for P3, P4, P5 + out_channels: int, output dim for P2 - P7 + first_time: int, whether is the first bifpnstage + conv_bn_act_pattern: bool, whether use conv_bn_act_pattern + norm_cfg: (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. 
+ epsilon: float, hyperparameter in fusion features + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + first_time: bool = False, + apply_bn_for_resampling: bool = True, + conv_bn_act_pattern: bool = False, + norm_cfg: OptConfigType = dict( + type='BN', momentum=1e-2, eps=1e-3), + epsilon: float = 1e-4) -> None: + super().__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.first_time = first_time + self.apply_bn_for_resampling = apply_bn_for_resampling + self.conv_bn_act_pattern = conv_bn_act_pattern + self.norm_cfg = norm_cfg + self.epsilon = epsilon + + if self.first_time: + self.p5_down_channel = DownChannelBlock( + self.in_channels[-1], + self.out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.p4_down_channel = DownChannelBlock( + self.in_channels[-2], + self.out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.p3_down_channel = DownChannelBlock( + self.in_channels[-3], + self.out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.p5_to_p6 = nn.Sequential( + DownChannelBlock( + self.in_channels[-1], + self.out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg), MaxPool2dSamePadding(3, 2)) + self.p6_to_p7 = MaxPool2dSamePadding(3, 2) + self.p4_level_connection = DownChannelBlock( + self.in_channels[-2], + self.out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.p5_level_connection = DownChannelBlock( + self.in_channels[-1], + self.out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + + self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.p3_upsample = nn.Upsample(scale_factor=2, mode='nearest') + + # bottom to up: feature map down_sample module + self.p4_down_sample = MaxPool2dSamePadding(3, 2) + self.p5_down_sample = MaxPool2dSamePadding(3, 2) + self.p6_down_sample = MaxPool2dSamePadding(3, 2) + self.p7_down_sample = MaxPool2dSamePadding(3, 2) + + # Fuse Conv Layers + self.conv6_up = DepthWiseConvBlock( + out_channels, + out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.conv5_up = DepthWiseConvBlock( + out_channels, + out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.conv4_up = DepthWiseConvBlock( + out_channels, + out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.conv3_up = DepthWiseConvBlock( + out_channels, + out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.conv4_down = DepthWiseConvBlock( + out_channels, + out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.conv5_down = DepthWiseConvBlock( + out_channels, + out_channels, + apply_norm=self.apply_bn_for_resampling, + 
conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.conv6_down = DepthWiseConvBlock( + out_channels, + out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + self.conv7_down = DepthWiseConvBlock( + out_channels, + out_channels, + apply_norm=self.apply_bn_for_resampling, + conv_bn_act_pattern=self.conv_bn_act_pattern, + norm_cfg=norm_cfg) + # weights + self.p6_w1 = nn.Parameter( + torch.ones(2, dtype=torch.float32), requires_grad=True) + self.p6_w1_relu = nn.ReLU() + self.p5_w1 = nn.Parameter( + torch.ones(2, dtype=torch.float32), requires_grad=True) + self.p5_w1_relu = nn.ReLU() + self.p4_w1 = nn.Parameter( + torch.ones(2, dtype=torch.float32), requires_grad=True) + self.p4_w1_relu = nn.ReLU() + self.p3_w1 = nn.Parameter( + torch.ones(2, dtype=torch.float32), requires_grad=True) + self.p3_w1_relu = nn.ReLU() + + self.p4_w2 = nn.Parameter( + torch.ones(3, dtype=torch.float32), requires_grad=True) + self.p4_w2_relu = nn.ReLU() + self.p5_w2 = nn.Parameter( + torch.ones(3, dtype=torch.float32), requires_grad=True) + self.p5_w2_relu = nn.ReLU() + self.p6_w2 = nn.Parameter( + torch.ones(3, dtype=torch.float32), requires_grad=True) + self.p6_w2_relu = nn.ReLU() + self.p7_w2 = nn.Parameter( + torch.ones(2, dtype=torch.float32), requires_grad=True) + self.p7_w2_relu = nn.ReLU() + + self.swish = Swish() + + def combine(self, x): + if not self.conv_bn_act_pattern: + x = self.swish(x) + + return x + + def forward(self, x): + if self.first_time: + p3, p4, p5 = x + # build feature map P6 + p6_in = self.p5_to_p6(p5) + # build feature map P7 + p7_in = self.p6_to_p7(p6_in) + + p3_in = self.p3_down_channel(p3) + p4_in = self.p4_down_channel(p4) + p5_in = self.p5_down_channel(p5) + + else: + p3_in, p4_in, p5_in, p6_in, p7_in = x + + # Weights for P6_0 and P7_0 to P6_1 + p6_w1 = self.p6_w1_relu(self.p6_w1) + weight = p6_w1 / (torch.sum(p6_w1, dim=0) + self.epsilon) + # Connections for P6_0 and P7_0 to P6_1 respectively + p6_up = self.conv6_up( + self.combine(weight[0] * p6_in + + weight[1] * self.p6_upsample(p7_in))) + + # Weights for P5_0 and P6_1 to P5_1 + p5_w1 = self.p5_w1_relu(self.p5_w1) + weight = p5_w1 / (torch.sum(p5_w1, dim=0) + self.epsilon) + # Connections for P5_0 and P6_1 to P5_1 respectively + p5_up = self.conv5_up( + self.combine(weight[0] * p5_in + + weight[1] * self.p5_upsample(p6_up))) + + # Weights for P4_0 and P5_1 to P4_1 + p4_w1 = self.p4_w1_relu(self.p4_w1) + weight = p4_w1 / (torch.sum(p4_w1, dim=0) + self.epsilon) + # Connections for P4_0 and P5_1 to P4_1 respectively + p4_up = self.conv4_up( + self.combine(weight[0] * p4_in + + weight[1] * self.p4_upsample(p5_up))) + + # Weights for P3_0 and P4_1 to P3_2 + p3_w1 = self.p3_w1_relu(self.p3_w1) + weight = p3_w1 / (torch.sum(p3_w1, dim=0) + self.epsilon) + # Connections for P3_0 and P4_1 to P3_2 respectively + p3_out = self.conv3_up( + self.combine(weight[0] * p3_in + + weight[1] * self.p3_upsample(p4_up))) + + if self.first_time: + p4_in = self.p4_level_connection(p4) + p5_in = self.p5_level_connection(p5) + + # Weights for P4_0, P4_1 and P3_2 to P4_2 + p4_w2 = self.p4_w2_relu(self.p4_w2) + weight = p4_w2 / (torch.sum(p4_w2, dim=0) + self.epsilon) + # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively + p4_out = self.conv4_down( + self.combine(weight[0] * p4_in + weight[1] * p4_up + + weight[2] * self.p4_down_sample(p3_out))) + + # Weights for P5_0, P5_1 and P4_2 to P5_2 + p5_w2 = self.p5_w2_relu(self.p5_w2) + weight = p5_w2 / 
(torch.sum(p5_w2, dim=0) + self.epsilon) + # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively + p5_out = self.conv5_down( + self.combine(weight[0] * p5_in + weight[1] * p5_up + + weight[2] * self.p5_down_sample(p4_out))) + + # Weights for P6_0, P6_1 and P5_2 to P6_2 + p6_w2 = self.p6_w2_relu(self.p6_w2) + weight = p6_w2 / (torch.sum(p6_w2, dim=0) + self.epsilon) + # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively + p6_out = self.conv6_down( + self.combine(weight[0] * p6_in + weight[1] * p6_up + + weight[2] * self.p6_down_sample(p5_out))) + + # Weights for P7_0 and P6_2 to P7_2 + p7_w2 = self.p7_w2_relu(self.p7_w2) + weight = p7_w2 / (torch.sum(p7_w2, dim=0) + self.epsilon) + # Connections for P7_0 and P6_2 to P7_2 + p7_out = self.conv7_down( + self.combine(weight[0] * p7_in + + weight[1] * self.p7_down_sample(p6_out))) + return p3_out, p4_out, p5_out, p6_out, p7_out + + +@MODELS.register_module() +class BiFPN(BaseModule): + """ + num_stages: int, bifpn number of repeats + in_channels: List[int], input dim for P3, P4, P5 + out_channels: int, output dim for P2 - P7 + start_level: int, Index of input features in backbone + epsilon: float, hyperparameter in fusion features + apply_bn_for_resampling: bool, whether use bn after resampling + conv_bn_act_pattern: bool, whether use conv_bn_act_pattern + norm_cfg: (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. + init_cfg: MultiConfig: init method + """ + + def __init__(self, + num_stages: int, + in_channels: List[int], + out_channels: int, + start_level: int = 0, + epsilon: float = 1e-4, + apply_bn_for_resampling: bool = True, + conv_bn_act_pattern: bool = False, + norm_cfg: OptConfigType = dict( + type='BN', momentum=1e-2, eps=1e-3), + init_cfg: MultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.start_level = start_level + self.bifpn = nn.Sequential(*[ + BiFPNStage( + in_channels=in_channels, + out_channels=out_channels, + first_time=True if _ == 0 else False, + apply_bn_for_resampling=apply_bn_for_resampling, + conv_bn_act_pattern=conv_bn_act_pattern, + norm_cfg=norm_cfg, + epsilon=epsilon) for _ in range(num_stages) + ]) + + def forward(self, x): + x = x[self.start_level:] + x = self.bifpn(x) + + return x diff --git a/mmdetection/projects/EfficientDet/efficientdet/efficientdet.py b/mmdetection/projects/EfficientDet/efficientdet/efficientdet.py new file mode 100644 index 0000000..84e1778 --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/efficientdet.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
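+# EfficientDet is a thin SingleStageDetector subclass: the model-specific
+# behaviour lives in the EfficientNet backbone, BiFPN neck and
+# EfficientDetSepBNHead assembled by the configs, and registering the class
+# with MODELS is what makes `type='EfficientDet'` resolvable from them.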
+from mmdet.models.detectors.single_stage import SingleStageDetector +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class EfficientDet(SingleStageDetector): + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/mmdetection/projects/EfficientDet/efficientdet/efficientdet_head.py b/mmdetection/projects/EfficientDet/efficientdet/efficientdet_head.py new file mode 100644 index 0000000..ae3efbe --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/efficientdet_head.py @@ -0,0 +1,261 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn.bricks import Swish, build_norm_layer +from mmengine.model import bias_init_with_prob +from torch import Tensor + +from mmdet.models.dense_heads.anchor_head import AnchorHead +from mmdet.models.utils import images_to_levels, multi_apply +from mmdet.registry import MODELS +from mmdet.structures.bbox import cat_boxes, get_box_tensor +from mmdet.utils import (InstanceList, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from .utils import DepthWiseConvBlock + + +@MODELS.register_module() +class EfficientDetSepBNHead(AnchorHead): + """EfficientDetHead with separate BN. + + num_classes (int): Number of categories num_ins (int): Number of the input + feature map. in_channels (int): Number of channels in the input feature + map. feat_channels (int): Number of hidden channels. stacked_convs (int): + Number of repetitions of conv norm_cfg (dict): Config dict for + normalization layer. anchor_generator (dict): Config dict for anchor + generator bbox_coder (dict): Config of bounding box coder. loss_cls (dict): + Config of classification loss. loss_bbox (dict): Config of localization + loss. train_cfg (dict): Training config of anchor head. test_cfg (dict): + Testing config of anchor head. init_cfg (dict or list[dict], optional): + Initialization config dict. 
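+
+    Note:
+        The depthwise separable conv towers are shared across feature levels,
+        while BatchNorm statistics are kept per level in ``reg_bn_list`` and
+        ``cls_bn_list`` (hence "separate BN").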
+ """ + + def __init__(self, + num_classes: int, + num_ins: int, + in_channels: int, + feat_channels: int, + stacked_convs: int = 3, + norm_cfg: OptConfigType = dict( + type='BN', momentum=1e-2, eps=1e-3), + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + self.num_ins = num_ins + self.stacked_convs = stacked_convs + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + feat_channels=feat_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.reg_conv_list = nn.ModuleList() + self.cls_conv_list = nn.ModuleList() + for i in range(self.stacked_convs): + channels = self.in_channels if i == 0 else self.feat_channels + self.reg_conv_list.append( + DepthWiseConvBlock( + channels, self.feat_channels, apply_norm=False)) + self.cls_conv_list.append( + DepthWiseConvBlock( + channels, self.feat_channels, apply_norm=False)) + + self.reg_bn_list = nn.ModuleList([ + nn.ModuleList([ + build_norm_layer( + self.norm_cfg, num_features=self.feat_channels)[1] + for j in range(self.num_ins) + ]) for i in range(self.stacked_convs) + ]) + + self.cls_bn_list = nn.ModuleList([ + nn.ModuleList([ + build_norm_layer( + self.norm_cfg, num_features=self.feat_channels)[1] + for j in range(self.num_ins) + ]) for i in range(self.stacked_convs) + ]) + + self.cls_header = DepthWiseConvBlock( + self.in_channels, + self.num_base_priors * self.cls_out_channels, + apply_norm=False) + self.reg_header = DepthWiseConvBlock( + self.in_channels, self.num_base_priors * 4, apply_norm=False) + self.swish = Swish() + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.reg_conv_list: + nn.init.constant_(m.pointwise_conv.bias, 0.0) + for m in self.cls_conv_list: + nn.init.constant_(m.pointwise_conv.bias, 0.0) + bias_cls = bias_init_with_prob(0.01) + nn.init.constant_(self.cls_header.pointwise_conv.bias, bias_cls) + nn.init.constant_(self.reg_header.pointwise_conv.bias, 0.0) + + def forward_single_bbox(self, feat: Tensor, level_id: int, + i: int) -> Tensor: + conv_op = self.reg_conv_list[i] + bn = self.reg_bn_list[i][level_id] + + feat = conv_op(feat) + feat = bn(feat) + feat = self.swish(feat) + + return feat + + def forward_single_cls(self, feat: Tensor, level_id: int, + i: int) -> Tensor: + conv_op = self.cls_conv_list[i] + bn = self.cls_bn_list[i][level_id] + + feat = conv_op(feat) + feat = bn(feat) + feat = self.swish(feat) + + return feat + + def forward(self, feats: Tuple[Tensor]) -> tuple: + cls_scores = [] + bbox_preds = [] + for level_id in range(self.num_ins): + feat = feats[level_id] + for i in range(self.stacked_convs): + feat = self.forward_single_bbox(feat, level_id, i) + bbox_pred = self.reg_header(feat) + bbox_preds.append(bbox_pred) + for level_id in range(self.num_ins): + feat = feats[level_id] + for i in range(self.stacked_convs): + feat = self.forward_single_cls(feat, level_id, i) + cls_score = self.cls_header(feat) + cls_scores.append(cls_score) + + return cls_scores, bbox_preds + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor) = cls_reg_targets + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(cat_boxes(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + avg_factor = max(avg_factor, 1.0) + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchors: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, avg_factor: int) -> tuple: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor + weight shape (N, num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (N, num_total_anchors, 4). + avg_factor (int): Average factor that is used to average the loss. + + Returns: + tuple: loss components. 
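+
+        Note:
+            ``avg_factor`` is multiplied by 4 for the regression branch so
+            that the Huber loss, which is summed over the four box
+            coordinates, remains normalised per coordinate.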
+ """ + + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + # regression loss + target_dim = bbox_targets.size(-1) + bbox_targets = bbox_targets.reshape(-1, target_dim) + bbox_weights = bbox_weights.reshape(-1, target_dim) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, + self.bbox_coder.encode_size) + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + anchors = anchors.reshape(-1, anchors.size(-1)) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + bbox_pred = get_box_tensor(bbox_pred) + loss_bbox = self.loss_bbox( + bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor * 4) + return loss_cls, loss_bbox diff --git a/mmdetection/projects/EfficientDet/efficientdet/huber_loss.py b/mmdetection/projects/EfficientDet/efficientdet/huber_loss.py new file mode 100644 index 0000000..091963f --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/huber_loss.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.models.losses.utils import weighted_loss +from mmdet.registry import MODELS + + +@weighted_loss +def huber_loss(pred: Tensor, target: Tensor, beta: float = 1.0) -> Tensor: + """Huber loss. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + + Returns: + Tensor: Calculated loss + """ + assert beta > 0 + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + diff = torch.abs(pred - target) + loss = torch.where(diff < beta, 0.5 * diff * diff, + beta * diff - 0.5 * beta * beta) + return loss + + +@MODELS.register_module() +class HuberLoss(nn.Module): + """Huber loss. + + Args: + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, + beta: float = 1.0, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
+ + Returns: + Tensor: Calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * huber_loss( + pred, + target, + weight, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox diff --git a/mmdetection/projects/EfficientDet/efficientdet/tensorflow/anchor_generator.py b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/anchor_generator.py new file mode 100644 index 0000000..51936a3 --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/anchor_generator.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import torch +from torch import Tensor + +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import HorizontalBoxes + +DeviceType = Union[str, torch.device] + + +@TASK_UTILS.register_module() +class YXYXAnchorGenerator(AnchorGenerator): + + def gen_single_level_base_anchors(self, + base_size: Union[int, float], + scales: Tensor, + ratios: Tensor, + center: Optional[Tuple[float]] = None) \ + -> Tensor: + """Generate base anchors of a single level. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between the height + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * w + y_center = self.center_offset * h + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + y_center - 0.5 * hs, + x_center - 0.5 * ws, + y_center + 0.5 * hs, + x_center + 0.5 * ws, + ] + base_anchors = torch.stack(base_anchors, dim=-1) + + return base_anchors + + def single_level_grid_priors(self, + featmap_size: Tuple[int, int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int, int]): Size of the feature maps. + level_idx (int): The index of corresponding feature map level. + dtype (obj:`torch.dtype`): Date type of points.Defaults to + ``torch.float32``. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. + """ + base_anchors = self.base_anchors[level_idx].to(device).to(dtype) + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + # First create Range with the default dtype, than convert to + # target `dtype` for onnx exporting. 
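+        # The shifts are later stacked as (y, x, y, x), matching the yxyx
+        # base anchors from `gen_single_level_base_anchors` and the box
+        # ordering used by the TensorFlow EfficientDet weights.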
+ shift_x = torch.arange(0, feat_w, device=device).to(dtype) * stride_w + shift_y = torch.arange(0, feat_h, device=device).to(dtype) * stride_h + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_yy, shift_xx, shift_yy, shift_xx], dim=-1) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + if self.use_box_type: + all_anchors = HorizontalBoxes(all_anchors) + + return all_anchors diff --git a/mmdetection/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/__init__.py b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/__init__.py new file mode 100644 index 0000000..a27afc4 --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .coco_api import COCO, COCOeval, COCOPanoptic + +__all__ = ['COCO', 'COCOeval', 'COCOPanoptic'] diff --git a/mmdetection/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/coco_api.py b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/coco_api.py new file mode 100644 index 0000000..142f27d --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/api_wrappers/coco_api.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This file add snake case alias for coco api + +import warnings +from collections import defaultdict +from typing import List, Optional, Union + +import pycocotools +from pycocotools.coco import COCO as _COCO +from pycocotools.cocoeval import COCOeval as _COCOeval + + +class COCO(_COCO): + """This class is almost the same as official pycocotools package. + + It implements some snake case function aliases. So that the COCO class has + the same interface as LVIS class. + """ + + def __init__(self, annotation_file=None): + if getattr(pycocotools, '__version__', '0') >= '12.0.2': + warnings.warn( + 'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"', # noqa: E501 + UserWarning) + super().__init__(annotation_file=annotation_file) + self.img_ann_map = self.imgToAnns + self.cat_img_map = self.catToImgs + + def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None): + return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd) + + def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]): + cat_ids_coco = self.getCatIds(cat_names, sup_names, cat_ids) + if None in cat_names: + index = [i for i, v in enumerate(cat_names) if v is not None] + cat_ids = list(range(len(cat_names))) + for i in range(len(index)): + cat_ids[index[i]] = cat_ids_coco[i] + return cat_ids + else: + return cat_ids_coco + + def get_img_ids(self, img_ids=[], cat_ids=[]): + return self.getImgIds(img_ids, cat_ids) + + def load_anns(self, ids): + return self.loadAnns(ids) + + def load_cats(self, ids): + return self.loadCats(ids) + + def load_imgs(self, ids): + return self.loadImgs(ids) + + +# just for the ease of import +COCOeval = _COCOeval + + +class COCOPanoptic(COCO): + """This wrapper is for loading the panoptic style annotation file. + + The format is shown in the CocoPanopticDataset class. + + Args: + annotation_file (str, optional): Path of annotation file. 
+ Defaults to None. + """ + + def __init__(self, annotation_file: Optional[str] = None) -> None: + super(COCOPanoptic, self).__init__(annotation_file) + + def createIndex(self) -> None: + """Create index.""" + # create index + print('creating index...') + # anns stores 'segment_id -> annotation' + anns, cats, imgs = {}, {}, {} + img_to_anns, cat_to_imgs = defaultdict(list), defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + for seg_ann in ann['segments_info']: + # to match with instance.json + seg_ann['image_id'] = ann['image_id'] + img_to_anns[ann['image_id']].append(seg_ann) + # segment_id is not unique in coco dataset orz... + # annotations from different images but + # may have same segment_id + if seg_ann['id'] in anns.keys(): + anns[seg_ann['id']].append(seg_ann) + else: + anns[seg_ann['id']] = [seg_ann] + + # filter out annotations from other images + img_to_anns_ = defaultdict(list) + for k, v in img_to_anns.items(): + img_to_anns_[k] = [x for x in v if x['image_id'] == k] + img_to_anns = img_to_anns_ + + if 'images' in self.dataset: + for img_info in self.dataset['images']: + img_info['segm_file'] = img_info['file_name'].replace( + 'jpg', 'png') + imgs[img_info['id']] = img_info + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + for seg_ann in ann['segments_info']: + cat_to_imgs[seg_ann['category_id']].append(ann['image_id']) + + print('index created!') + + self.anns = anns + self.imgToAnns = img_to_anns + self.catToImgs = cat_to_imgs + self.imgs = imgs + self.cats = cats + + def load_anns(self, + ids: Union[List[int], int] = []) -> Optional[List[dict]]: + """Load anns with the specified ids. + + ``self.anns`` is a list of annotation lists instead of a + list of annotations. + + Args: + ids (Union[List[int], int]): Integer ids specifying anns. + + Returns: + anns (List[dict], optional): Loaded ann objects. + """ + anns = [] + + if hasattr(ids, '__iter__') and hasattr(ids, '__len__'): + # self.anns is a list of annotation lists instead of + # a list of annotations + for id in ids: + anns += self.anns[id] + return anns + elif type(ids) == int: + return self.anns[ids] diff --git a/mmdetection/projects/EfficientDet/efficientdet/tensorflow/coco_90class.py b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/coco_90class.py new file mode 100644 index 0000000..d2996cc --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/coco_90class.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
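+# A ``BaseDetDataset`` variant that keeps all 90 original COCO category
+# slots: ids without a class in the annotations are stored as ``None`` in
+# ``METAINFO['classes']``/``palette``, matching the 90-class label space
+# used by the TensorFlow EfficientDet implementation.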
+import copy +import os.path as osp +from typing import List, Union + +from mmengine.fileio import get_local_path + +from mmdet.datasets.base_det_dataset import BaseDetDataset +from mmdet.registry import DATASETS +from .api_wrappers import COCO + + +@DATASETS.register_module() +class Coco90Dataset(BaseDetDataset): + """Dataset for COCO.""" + + METAINFO = { + 'classes': + ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', None, 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', None, 'backpack', + 'umbrella', None, None, 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', None, 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', None, 'dining table', None, None, 'toilet', None, 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', None, 'book', 'clock', + 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'), + # palette is a list of color tuples, which is used for visualization. + 'palette': + [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228), + (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30), + (100, 170, 30), None, (220, 220, 0), (175, 116, 175), (250, 0, 30), + (165, 42, 42), (255, 77, 255), (0, 226, 252), (182, 182, 255), + (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255), + (199, 100, 0), (72, 0, 118), None, + (255, 179, 240), (0, 125, 92), None, None, (209, 0, 151), + (188, 208, 182), (0, 220, 176), (255, 99, 164), (92, 0, 73), + (133, 129, 255), (78, 180, 255), (0, 228, 0), (174, 255, 243), + (45, 89, 255), (134, 134, 103), (145, 148, 174), (255, 208, 186), + (197, 226, 255), None, (171, 134, 1), (109, 63, 54), (207, 138, 255), + (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105), + (166, 196, 102), (208, 195, 210), (255, 109, 65), (0, 143, 149), + (179, 0, 194), (209, 99, 106), (5, 121, 0), (227, 255, 205), + (147, 186, 208), (153, 69, 1), (3, 95, 161), (163, 255, 0), + (119, 0, 170), None, (0, 182, 199), None, None, (0, 165, 120), None, + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), None, (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208)] + } + COCOAPI = COCO + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. 
+ """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + + data_info = {} + + # TODO: need to change data_prefix['img'] to data_prefix['img_path'] + img_path = osp.join(self.data_prefix['img'], img_info['file_name']) + if self.data_prefix.get('seg', None): + seg_map_path = osp.join( + self.data_prefix['seg'], + img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) + else: + seg_map_path = None + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['seg_map_path'] = seg_map_path + data_info['height'] = img_info['height'] + data_info['width'] = img_info['width'] + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + + if ann.get('segmentation', None): + instance['mask'] = ann['segmentation'] + + instances.append(instance) + data_info['instances'] = instances + return data_info + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. 
+ """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and img_id not in ids_in_cat: + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos diff --git a/mmdetection/projects/EfficientDet/efficientdet/tensorflow/coco_90metric.py b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/coco_90metric.py new file mode 100644 index 0000000..eed6522 --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/coco_90metric.py @@ -0,0 +1,540 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import itertools +import os.path as osp +import tempfile +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.fileio import dump, get_local_path, load +from mmengine.logging import MMLogger +from terminaltables import AsciiTable + +from mmdet.evaluation.functional import eval_recalls +from mmdet.registry import METRICS +from mmdet.structures.mask import encode_mask_results +from .api_wrappers import COCO, COCOeval + + +@METRICS.register_module() +class Coco90Metric(BaseMetric): + """COCO evaluation metric. + + Evaluate AR, AP, and mAP for detection tasks including proposal/box + detection and instance segmentation. Please refer to + https://cocodataset.org/#detection-eval for more details. + + Args: + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'bbox', 'segm', 'proposal', and 'proposal_fast'. + Defaults to 'bbox'. + classwise (bool): Whether to evaluate the metric class-wise. + Defaults to False. + proposal_nums (Sequence[int]): Numbers of proposals to be evaluated. + Defaults to (100, 300, 1000). + iou_thrs (float | List[float], optional): IoU threshold to compute AP + and AR. If not specified, IoUs from 0.5 to 0.95 will be used. + Defaults to None. + metric_items (List[str], optional): Metric result names to be + recorded in the evaluation result. Defaults to None. + format_only (bool): Format the output results without perform + evaluation. It is useful when you want to format the result + to a specific format and submit it to the test server. + Defaults to False. + outfile_prefix (str, optional): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. 
+ collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + default_prefix: Optional[str] = 'coco' + + def __init__(self, + ann_file: Optional[str] = None, + metric: Union[str, List[str]] = 'bbox', + classwise: bool = False, + proposal_nums: Sequence[int] = (100, 300, 1000), + iou_thrs: Optional[Union[float, Sequence[float]]] = None, + metric_items: Optional[Sequence[str]] = None, + format_only: bool = False, + outfile_prefix: Optional[str] = None, + backend_args: dict = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + # coco evaluation metrics + self.metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast'] + for metric in self.metrics: + if metric not in allowed_metrics: + raise KeyError( + "metric should be one of 'bbox', 'segm', 'proposal', " + f"'proposal_fast', but got {metric}.") + + # do class wise evaluation, default False + self.classwise = classwise + + # proposal_nums used to compute recall or precision. + self.proposal_nums = list(proposal_nums) + + # iou_thrs used to compute recall or precision. + if iou_thrs is None: + iou_thrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.iou_thrs = iou_thrs + self.metric_items = metric_items + self.format_only = format_only + if self.format_only: + assert outfile_prefix is not None, 'outfile_prefix must be not' + 'None when format_only is True, otherwise the result files will' + 'be saved to a temp directory which will be cleaned up at the end.' + + self.outfile_prefix = outfile_prefix + + self.backend_args = backend_args + + # if ann_file is not specified, + # initialize coco api with the converted dataset + if ann_file is not None: + with get_local_path( + ann_file, backend_args=self.backend_args) as local_path: + self._coco_api = COCO(local_path) + else: + self._coco_api = None + + # handle dataset lazy init + self.cat_ids = None + self.img_ids = None + + def fast_eval_recall(self, + results: List[dict], + proposal_nums: Sequence[int], + iou_thrs: Sequence[float], + logger: Optional[MMLogger] = None) -> np.ndarray: + """Evaluate proposal recall with COCO's fast_eval_recall. + + Args: + results (List[dict]): Results of the dataset. + proposal_nums (Sequence[int]): Proposal numbers used for + evaluation. + iou_thrs (Sequence[float]): IoU thresholds used for evaluation. + logger (MMLogger, optional): Logger used for logging the recall + summary. + Returns: + np.ndarray: Averaged recall results. 
+ """ + gt_bboxes = [] + pred_bboxes = [result['bboxes'] for result in results] + for i in range(len(self.img_ids)): + ann_ids = self._coco_api.get_ann_ids(img_ids=self.img_ids[i]) + ann_info = self._coco_api.load_anns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + if ann.get('ignore', False) or ann['iscrowd']: + continue + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w, y1 + h]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, pred_bboxes, proposal_nums, iou_thrs, logger=logger) + ar = recalls.mean(axis=1) + return ar + + def xyxy2xywh(self, bbox: np.ndarray) -> list: + """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO + evaluation. + + Args: + bbox (numpy.ndarray): The bounding boxes, shape (4, ), in + ``xyxy`` order. + + Returns: + list[float]: The converted bounding boxes, in ``xywh`` order. + """ + + _bbox: List = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0], + _bbox[3] - _bbox[1], + ] + + def results2json(self, results: Sequence[dict], + outfile_prefix: str) -> dict: + """Dump the detection results to a COCO style json file. + + There are 3 types of results: proposals, bbox predictions, mask + predictions, and they have different data types. This method will + automatically recognize the type, and dump them to json files. + + Args: + results (Sequence[dict]): Testing results of the + dataset. + outfile_prefix (str): The filename prefix of the json files. If the + prefix is "somepath/xxx", the json files will be named + "somepath/xxx.bbox.json", "somepath/xxx.segm.json", + "somepath/xxx.proposal.json". + + Returns: + dict: Possible keys are "bbox", "segm", "proposal", and + values are corresponding filenames. + """ + bbox_json_results = [] + segm_json_results = [] if 'masks' in results[0] else None + for idx, result in enumerate(results): + image_id = result.get('img_id', idx) + labels = result['labels'] + bboxes = result['bboxes'] + scores = result['scores'] + # bbox results + for i, label in enumerate(labels): + data = dict() + data['image_id'] = image_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(scores[i]) + data['category_id'] = self.cat_ids[label] + bbox_json_results.append(data) + + if segm_json_results is None: + continue + + # segm results + masks = result['masks'] + mask_scores = result.get('mask_scores', scores) + for i, label in enumerate(labels): + data = dict() + data['image_id'] = image_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(mask_scores[i]) + data['category_id'] = self.cat_ids[label] + if isinstance(masks[i]['counts'], bytes): + masks[i]['counts'] = masks[i]['counts'].decode() + data['segmentation'] = masks[i] + segm_json_results.append(data) + + result_files = dict() + result_files['bbox'] = f'{outfile_prefix}.bbox.json' + result_files['proposal'] = f'{outfile_prefix}.bbox.json' + dump(bbox_json_results, result_files['bbox']) + + if segm_json_results is not None: + result_files['segm'] = f'{outfile_prefix}.segm.json' + dump(segm_json_results, result_files['segm']) + + return result_files + + def gt_to_coco_json(self, gt_dicts: Sequence[dict], + outfile_prefix: str) -> str: + """Convert ground truth to coco format json file. + + Args: + gt_dicts (Sequence[dict]): Ground truth of the dataset. + outfile_prefix (str): The filename prefix of the json files. 
If the + prefix is "somepath/xxx", the json file will be named + "somepath/xxx.gt.json". + Returns: + str: The filename of the json file. + """ + categories = [ + dict(id=id, name=name) + for id, name in enumerate(self.dataset_meta['classes']) + ] + image_infos = [] + annotations = [] + + for idx, gt_dict in enumerate(gt_dicts): + img_id = gt_dict.get('img_id', idx) + image_info = dict( + id=img_id, + width=gt_dict['width'], + height=gt_dict['height'], + file_name='') + image_infos.append(image_info) + for ann in gt_dict['anns']: + label = ann['bbox_label'] + bbox = ann['bbox'] + coco_bbox = [ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ] + + annotation = dict( + id=len(annotations) + + 1, # coco api requires id starts with 1 + image_id=img_id, + bbox=coco_bbox, + iscrowd=ann.get('ignore_flag', 0), + category_id=int(label), + area=coco_bbox[2] * coco_bbox[3]) + if ann.get('mask', None): + mask = ann['mask'] + # area = mask_util.area(mask) + if isinstance(mask, dict) and isinstance( + mask['counts'], bytes): + mask['counts'] = mask['counts'].decode() + annotation['segmentation'] = mask + # annotation['area'] = float(area) + annotations.append(annotation) + + info = dict( + date_created=str(datetime.datetime.now()), + description='Coco json file converted by mmdet CocoMetric.') + coco_json = dict( + info=info, + images=image_infos, + categories=categories, + licenses=None, + ) + if len(annotations) > 0: + coco_json['annotations'] = annotations + converted_json_path = f'{outfile_prefix}.gt.json' + dump(coco_json, converted_json_path) + return converted_json_path + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + # encode mask to RLE + if 'masks' in pred: + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) + # some detectors use different scores for bbox and mask + if 'mask_scores' in pred: + result['mask_scores'] = pred['mask_scores'].cpu().numpy() + + # parse gt + gt = dict() + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['img_id'] = data_sample['img_id'] + if self._coco_api is None: + # TODO: Need to refactor to support LoadAnnotations + assert 'instances' in data_sample, \ + 'ground truth is required for evaluation when ' \ + '`ann_file` is not provided' + gt['anns'] = data_sample['instances'] + # add converted result to the results list + self.results.append((gt, result)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
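+                For example, with the default ``bbox`` metric the returned
+                keys are ``bbox_mAP``, ``bbox_mAP_50``, ``bbox_mAP_75``,
+                ``bbox_mAP_s``, ``bbox_mAP_m`` and ``bbox_mAP_l``.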
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + # split gt and prediction list + gts, preds = zip(*results) + + tmp_dir = None + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + + if self._coco_api is None: + # use converted gt json file to initialize coco api + logger.info('Converting ground truth to coco format...') + coco_json_path = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=outfile_prefix) + self._coco_api = COCO(coco_json_path) + + # handle lazy init + if self.cat_ids is None: + self.cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['classes']) + if self.img_ids is None: + self.img_ids = self._coco_api.get_img_ids() + + # convert predictions to coco format and dump to json file + result_files = self.results2json(preds, outfile_prefix) + + eval_results = OrderedDict() + if self.format_only: + logger.info('results are saved in ' + f'{osp.dirname(outfile_prefix)}') + return eval_results + + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + + # TODO: May refactor fast_eval_recall to an independent metric? + # fast eval recall + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + preds, self.proposal_nums, self.iou_thrs, logger=logger) + log_msg = [] + for i, num in enumerate(self.proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') + log_msg = ''.join(log_msg) + logger.info(log_msg) + continue + + # evaluate proposal, bbox and segm + iou_type = 'bbox' if metric == 'proposal' else metric + if metric not in result_files: + raise KeyError(f'{metric} is not in results') + try: + predictions = load(result_files[metric]) + if iou_type == 'segm': + # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa + # When evaluating mask AP, if the results contain bbox, + # cocoapi will use the box area instead of the mask area + # for calculating the instance area. Though the overall AP + # is not affected, this leads to different + # small/medium/large mask AP results. 
+ for x in predictions: + x.pop('bbox') + coco_dt = self._coco_api.loadRes(predictions) + + except IndexError: + logger.error( + 'The testing results of the whole dataset is empty.') + break + + coco_eval = COCOeval(self._coco_api, coco_dt, iou_type) + + coco_eval.params.catIds = self.cat_ids + coco_eval.params.imgIds = self.img_ids + coco_eval.params.maxDets = list(self.proposal_nums) + coco_eval.params.iouThrs = self.iou_thrs + + # mapping of cocoEval.stats + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@100': 6, + 'AR@300': 7, + 'AR@1000': 8, + 'AR_s@1000': 9, + 'AR_m@1000': 10, + 'AR_l@1000': 11 + } + metric_items = self.metric_items + if metric_items is not None: + for metric_item in metric_items: + if metric_item not in coco_metric_names: + raise KeyError( + f'metric item "{metric_item}" is not supported') + + if metric == 'proposal': + coco_eval.params.useCats = 0 + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if metric_items is None: + metric_items = [ + 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000', + 'AR_m@1000', 'AR_l@1000' + ] + + for item in metric_items: + val = float( + f'{coco_eval.stats[coco_metric_names[item]]:.3f}') + eval_results[item] = val + else: + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if self.classwise: # Compute per-category AP + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/ + precisions = coco_eval.eval['precision'] + # precision: (iou, recall, cls, area range, max dets) + assert len(self.cat_ids) == precisions.shape[2] + + results_per_category = [] + for idx, cat_id in enumerate(self.cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = self._coco_api.loadCats(cat_id)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (f'{nm["name"]}', f'{round(ap, 3)}')) + eval_results[f'{nm["name"]}_precision'] = round(ap, 3) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list( + itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest(*[ + results_flatten[i::num_columns] + for i in range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('\n' + table.table) + + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = coco_eval.stats[coco_metric_names[metric_item]] + eval_results[key] = float(f'{round(val, 3)}') + + ap = coco_eval.stats[:6] + logger.info(f'{metric}_mAP_copypaste: {ap[0]:.3f} ' + f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + f'{ap[4]:.3f} {ap[5]:.3f}') + + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results diff --git a/mmdetection/projects/EfficientDet/efficientdet/tensorflow/trans_max_iou_assigner.py b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/trans_max_iou_assigner.py new file mode 100644 index 0000000..10fc45b --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/trans_max_iou_assigner.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
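+# ``MaxIoUAssigner`` variant for priors produced in (y1, x1, y2, x2) order:
+# before computing IoU, the priors are transposed back to (x1, y1, x2, y2)
+# so that they can be matched against the xyxy ground-truth boxes.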
+from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.models.task_modules.assigners.assign_result import AssignResult +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class TransMaxIoUAssigner(MaxIoUAssigner): + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. + + This method assign a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, or a semi-positive number. -1 means negative + sample, semi-positive number is the index (0-based) of assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to the background + 2. assign proposals whose iou with all gts < neg_iou_thr to 0 + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + + Example: + >>> from mmengine.structures import InstanceData + >>> self = MaxIoUAssigner(0.5, 0.5) + >>> pred_instances = InstanceData() + >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10], + ... 
[10, 10, 20, 20]]) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 9]]) + >>> gt_instances.labels = torch.Tensor([0]) + >>> assign_result = self.assign(pred_instances, gt_instances) + >>> expected_gt_inds = torch.LongTensor([1, 0]) + >>> assert torch.all(assign_result.gt_inds == expected_gt_inds) + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + if gt_instances_ignore is not None: + gt_bboxes_ignore = gt_instances_ignore.bboxes + else: + gt_bboxes_ignore = None + + assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( + gt_bboxes.shape[0] > self.gpu_assign_thr) else False + # compute overlap and assign gt on CPU when number of GT is large + if assign_on_cpu: + device = priors.device + priors = priors.cpu() + gt_bboxes = gt_bboxes.cpu() + gt_labels = gt_labels.cpu() + if gt_bboxes_ignore is not None: + gt_bboxes_ignore = gt_bboxes_ignore.cpu() + + trans_priors = torch.cat([ + priors[..., 1].view(-1, 1), priors[..., 0].view(-1, 1), + priors[..., 3].view(-1, 1), priors[..., 2].view(-1, 1) + ], + dim=-1) + overlaps = self.iou_calculator(gt_bboxes, trans_priors) + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and trans_priors.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = self.iou_calculator( + trans_priors, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = self.iou_calculator( + gt_bboxes_ignore, trans_priors, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + if assign_on_cpu: + assign_result.gt_inds = assign_result.gt_inds.to(device) + assign_result.max_overlaps = assign_result.max_overlaps.to(device) + if assign_result.labels is not None: + assign_result.labels = assign_result.labels.to(device) + return assign_result diff --git a/mmdetection/projects/EfficientDet/efficientdet/tensorflow/yxyx_bbox_coder.py b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/yxyx_bbox_coder.py new file mode 100644 index 0000000..63e2330 --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/tensorflow/yxyx_bbox_coder.py @@ -0,0 +1,369 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch + +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import HorizontalBoxes, get_box_tensor + + +@TASK_UTILS.register_module() +class YXYXDeltaXYWHBBoxCoder(DeltaXYWHBBoxCoder): + + def encode(self, bboxes, gt_bboxes): + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the + transformation, e.g., ground-truth boxes. 
+ + Returns: + torch.Tensor: Box transformation deltas + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = YXbbox2delta(bboxes, gt_bboxes, self.means, self.stds) + return encoded_bboxes + + def decode(self, + bboxes, + pred_bboxes, + max_shape=None, + wh_ratio_clip=16 / 1000): + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape + (B, N, 4) or (N, 4) + pred_bboxes (Tensor): Encoded offsets with respect to each roi. + Has shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H + when rois is a grid of anchors.Offset encoding follows [1]_. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + wh_ratio_clip (float, optional): The allowed ratio between + width and height. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. + """ + bboxes = get_box_tensor(bboxes) + assert pred_bboxes.size(0) == bboxes.size(0) + if pred_bboxes.ndim == 3: + assert pred_bboxes.size(1) == bboxes.size(1) + + if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export(): + # single image decode + decoded_bboxes = YXdelta2bbox(bboxes, pred_bboxes, self.means, + self.stds, max_shape, wh_ratio_clip, + self.clip_border, self.add_ctr_clamp, + self.ctr_clamp) + else: + if pred_bboxes.ndim == 3 and not torch.onnx.is_in_onnx_export(): + warnings.warn( + 'DeprecationWarning: onnx_delta2bbox is deprecated ' + 'in the case of batch decoding and non-ONNX, ' + 'please use “delta2bbox” instead. In order to improve ' + 'the decoding speed, the batch function will no ' + 'longer be supported. ') + decoded_bboxes = YXonnx_delta2bbox(bboxes, pred_bboxes, self.means, + self.stds, max_shape, + wh_ratio_clip, self.clip_border, + self.add_ctr_clamp, + self.ctr_clamp) + + if self.use_box_type: + assert decoded_bboxes.size(-1) == 4, \ + ('Cannot warp decoded boxes with box type when decoded boxes' + 'have shape of (N, num_classes * 4)') + decoded_bboxes = HorizontalBoxes(decoded_bboxes) + return decoded_bboxes + + +def YXdelta2bbox(rois, + deltas, + means=(0., 0., 0., 0.), + stds=(1., 1., 1., 1.), + max_shape=None, + hw_ratio_clip=1000 / 16, + clip_border=True, + add_ctr_clamp=False, + ctr_clamp=32): + """Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of :func:`bbox2delta`. + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4). + deltas (Tensor): Encoded offsets relative to each roi. + Has shape (N, num_classes * 4) or (N, 4). Note + N = num_base_anchors * W * H, when rois is a grid of + anchors. Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates. + Default (0., 0., 0., 0.). + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates. Default (1., 1., 1., 1.). + max_shape (tuple[int, int]): Maximum bounds for boxes, specifies + (H, W). Default None. + wh_ratio_clip (float): Maximum aspect ratio for boxes. Default + 16 / 1000. 
+ clip_border (bool, optional): Whether clip the objects outside the + border of the image. Default True. + add_ctr_clamp (bool): Whether to add center clamp. When set to True, + the center of the prediction bounding box will be clamped to + avoid being too far away from the center of the anchor. + Only used by YOLOF. Default False. + ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. + Default 32. + + Returns: + Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4 + represent tl_x, tl_y, br_x, br_y. + + References: + .. [1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3)) + tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.1409, 0.1409, 2.8591, 2.8591], + [0.0000, 0.3161, 4.1945, 0.6839], + [5.0000, 5.0000, 5.0000, 5.0000]]) + """ + num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4 + if num_bboxes == 0: + return deltas + + deltas = deltas.reshape(-1, 4) + + means = deltas.new_tensor(means).view(1, -1) + stds = deltas.new_tensor(stds).view(1, -1) + denorm_deltas = deltas * stds + means + + dyx = denorm_deltas[:, :2] + dhw = denorm_deltas[:, 2:] + + # Compute width/height of each roi + rois_ = rois.repeat(1, num_classes).reshape(-1, 4) + pyx = ((rois_[:, :2] + rois_[:, 2:]) * 0.5) + phw = (rois_[:, 2:] - rois_[:, :2]) + + dyx_hw = phw * dyx + + max_ratio = np.abs(np.log(hw_ratio_clip)) + if add_ctr_clamp: + dyx_hw = torch.clamp(dyx_hw, max=ctr_clamp, min=-ctr_clamp) + dhw = torch.clamp(dhw, max=max_ratio) + else: + dhw = dhw.clamp(min=-max_ratio, max=max_ratio) + + gyx = pyx + dyx_hw + ghw = phw * dhw.exp() + y1x1 = gyx - (ghw * 0.5) + y2x2 = gyx + (ghw * 0.5) + ymin, xmin = y1x1[:, 0].reshape(-1, 1), y1x1[:, 1].reshape(-1, 1) + ymax, xmax = y2x2[:, 0].reshape(-1, 1), y2x2[:, 1].reshape(-1, 1) + bboxes = torch.cat([xmin, ymin, xmax, ymax], dim=-1) + if clip_border and max_shape is not None: + bboxes[..., 0::2].clamp_(min=0, max=max_shape[1]) + bboxes[..., 1::2].clamp_(min=0, max=max_shape[0]) + bboxes = bboxes.reshape(num_bboxes, -1) + return bboxes + + +def YXbbox2delta(proposals, gt, means=(0., 0., 0., 0.), stds=(1., 1., 1., 1.)): + """Compute deltas of proposals w.r.t. gt. + + We usually compute the deltas of x, y, w, h of proposals w.r.t ground + truth bboxes to get regression target. + This is the inverse function of :func:`delta2bbox`. + + Args: + proposals (Tensor): Boxes to be transformed, shape (N, ..., 4) + gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4) + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + + Returns: + Tensor: deltas with shape (N, 4), where columns represent dx, dy, + dw, dh. 
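+
+    Note:
+        Following the yx-first convention of this coder, the deltas are
+        stacked as ``(dy, dx, dh, dw)`` along the last dimension.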
+ """ + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + py = (proposals[..., 0] + proposals[..., 2]) * 0.5 + px = (proposals[..., 1] + proposals[..., 3]) * 0.5 + ph = proposals[..., 2] - proposals[..., 0] + pw = proposals[..., 3] - proposals[..., 1] + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + gh = gt[..., 3] - gt[..., 1] + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dy, dx, dh, dw], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def YXonnx_delta2bbox(rois, + deltas, + means=(0., 0., 0., 0.), + stds=(1., 1., 1., 1.), + max_shape=None, + wh_ratio_clip=16 / 1000, + clip_border=True, + add_ctr_clamp=False, + ctr_clamp=32): + """Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of :func:`bbox2delta`. + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4) or (B, N, 4) + deltas (Tensor): Encoded offsets with respect to each roi. + Has shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H + when rois is a grid of anchors.Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates. + Default (0., 0., 0., 0.). + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates. Default (1., 1., 1., 1.). + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If rois shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. Default None. + wh_ratio_clip (float): Maximum aspect ratio for boxes. + Default 16 / 1000. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Default True. + add_ctr_clamp (bool): Whether to add center clamp, when added, the + predicted box is clamped is its center is too far away from + the original anchor's center. Only used by YOLOF. Default False. + ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. + Default 32. + + Returns: + Tensor: Boxes with shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4), where 4 represent + tl_x, tl_y, br_x, br_y. + + References: + .. 
[1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3)) + tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.1409, 0.1409, 2.8591, 2.8591], + [0.0000, 0.3161, 4.1945, 0.6839], + [5.0000, 5.0000, 5.0000, 5.0000]]) + """ + means = deltas.new_tensor(means).view(1, + -1).repeat(1, + deltas.size(-1) // 4) + stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(-1) // 4) + denorm_deltas = deltas * stds + means + dy = denorm_deltas[..., 0::4] + dx = denorm_deltas[..., 1::4] + dh = denorm_deltas[..., 2::4] + dw = denorm_deltas[..., 3::4] + + y1, x1 = rois[..., 0], rois[..., 1] + y2, x2 = rois[..., 2], rois[..., 3] + # Compute center of each roi + px = ((x1 + x2) * 0.5).unsqueeze(-1).expand_as(dx) + py = ((y1 + y2) * 0.5).unsqueeze(-1).expand_as(dy) + # Compute width/height of each roi + pw = (x2 - x1).unsqueeze(-1).expand_as(dw) + ph = (y2 - y1).unsqueeze(-1).expand_as(dh) + + dx_width = pw * dx + dy_height = ph * dy + + max_ratio = np.abs(np.log(wh_ratio_clip)) + if add_ctr_clamp: + dx_width = torch.clamp(dx_width, max=ctr_clamp, min=-ctr_clamp) + dy_height = torch.clamp(dy_height, max=ctr_clamp, min=-ctr_clamp) + dw = torch.clamp(dw, max=max_ratio) + dh = torch.clamp(dh, max=max_ratio) + else: + dw = dw.clamp(min=-max_ratio, max=max_ratio) + dh = dh.clamp(min=-max_ratio, max=max_ratio) + # Use exp(network energy) to enlarge/shrink each roi + gw = pw * dw.exp() + gh = ph * dh.exp() + # Use network energy to shift the center of each roi + gx = px + dx_width + gy = py + dy_height + # Convert center-xy/width/height to top-left, bottom-right + x1 = gx - gw * 0.5 + y1 = gy - gh * 0.5 + x2 = gx + gw * 0.5 + y2 = gy + gh * 0.5 + + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size()) + + if clip_border and max_shape is not None: + # clip bboxes with dynamic `min` and `max` for onnx + if torch.onnx.is_in_onnx_export(): + from mmdet.core.export import dynamic_clip_for_onnx + x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size()) + return bboxes + if not isinstance(max_shape, torch.Tensor): + max_shape = x1.new_tensor(max_shape) + max_shape = max_shape[..., :2].type_as(x1) + if max_shape.ndim == 2: + assert bboxes.ndim == 3 + assert max_shape.size(0) == bboxes.size(0) + + min_xy = x1.new_tensor(0) + max_xy = torch.cat( + [max_shape] * (deltas.size(-1) // 2), + dim=-1).flip(-1).unsqueeze(-2) + bboxes = torch.where(bboxes < min_xy, min_xy, bboxes) + bboxes = torch.where(bboxes > max_xy, max_xy, bboxes) + + return bboxes diff --git a/mmdetection/projects/EfficientDet/efficientdet/utils.py b/mmdetection/projects/EfficientDet/efficientdet/utils.py new file mode 100644 index 0000000..9c30a01 --- /dev/null +++ b/mmdetection/projects/EfficientDet/efficientdet/utils.py @@ -0,0 +1,154 @@ +import math +from typing import Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn.bricks import Swish, build_norm_layer +from torch.nn import functional as F +from torch.nn.init import _calculate_fan_in_and_fan_out, trunc_normal_ + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType + + +def variance_scaling_trunc(tensor, gain=1.): + fan_in, _ = _calculate_fan_in_and_fan_out(tensor) + gain /= max(1.0, 
fan_in) + std = math.sqrt(gain) / .87962566103423978 + return trunc_normal_(tensor, 0., std) + + +@MODELS.register_module() +class Conv2dSamePadding(nn.Conv2d): + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True): + super().__init__(in_channels, out_channels, kernel_size, stride, 0, + dilation, groups, bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + img_h, img_w = x.size()[-2:] + kernel_h, kernel_w = self.weight.size()[-2:] + extra_w = (math.ceil(img_w / self.stride[1]) - + 1) * self.stride[1] - img_w + kernel_w + extra_h = (math.ceil(img_h / self.stride[0]) - + 1) * self.stride[0] - img_h + kernel_h + + left = extra_w // 2 + right = extra_w - left + top = extra_h // 2 + bottom = extra_h - top + x = F.pad(x, [left, right, top, bottom]) + return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) + + +class MaxPool2dSamePadding(nn.Module): + + def __init__(self, + kernel_size: Union[int, Tuple[int, int]] = 3, + stride: Union[int, Tuple[int, int]] = 2, + **kwargs): + super().__init__() + self.pool = nn.MaxPool2d(kernel_size, stride, **kwargs) + self.stride = self.pool.stride + self.kernel_size = self.pool.kernel_size + + if isinstance(self.stride, int): + self.stride = [self.stride] * 2 + if isinstance(self.kernel_size, int): + self.kernel_size = [self.kernel_size] * 2 + + def forward(self, x): + h, w = x.shape[-2:] + + extra_h = (math.ceil(w / self.stride[1]) - + 1) * self.stride[1] - w + self.kernel_size[1] + extra_v = (math.ceil(h / self.stride[0]) - + 1) * self.stride[0] - h + self.kernel_size[0] + + left = extra_h // 2 + right = extra_h - left + top = extra_v // 2 + bottom = extra_v - top + + x = F.pad(x, [left, right, top, bottom]) + x = self.pool(x) + + return x + + +class DepthWiseConvBlock(nn.Module): + + def __init__( + self, + in_channels: int, + out_channels: int, + apply_norm: bool = True, + conv_bn_act_pattern: bool = False, + norm_cfg: OptConfigType = dict(type='BN', momentum=1e-2, eps=1e-3) + ) -> None: + super(DepthWiseConvBlock, self).__init__() + self.depthwise_conv = Conv2dSamePadding( + in_channels, + in_channels, + kernel_size=3, + stride=1, + groups=in_channels, + bias=False) + self.pointwise_conv = Conv2dSamePadding( + in_channels, out_channels, kernel_size=1, stride=1) + + self.apply_norm = apply_norm + if self.apply_norm: + self.bn = build_norm_layer(norm_cfg, num_features=out_channels)[1] + + self.apply_activation = conv_bn_act_pattern + if self.apply_activation: + self.swish = Swish() + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + if self.apply_norm: + x = self.bn(x) + if self.apply_activation: + x = self.swish(x) + + return x + + +class DownChannelBlock(nn.Module): + + def __init__( + self, + in_channels: int, + out_channels: int, + apply_norm: bool = True, + conv_bn_act_pattern: bool = False, + norm_cfg: OptConfigType = dict(type='BN', momentum=1e-2, eps=1e-3) + ) -> None: + super(DownChannelBlock, self).__init__() + self.down_conv = Conv2dSamePadding(in_channels, out_channels, 1) + self.apply_norm = apply_norm + if self.apply_norm: + self.bn = build_norm_layer(norm_cfg, num_features=out_channels)[1] + self.apply_activation = conv_bn_act_pattern + if self.apply_activation: + self.swish = Swish() + + def forward(self, x): + x = self.down_conv(x) 
+ if self.apply_norm: + x = self.bn(x) + if self.apply_activation: + x = self.swish(x) + + return x diff --git a/mmdetection/projects/HDINO/README.md b/mmdetection/projects/HDINO/README.md new file mode 100644 index 0000000..078ca42 --- /dev/null +++ b/mmdetection/projects/HDINO/README.md @@ -0,0 +1,35 @@ +# H-DETR + +> [DETRs with Hybrid Matching](https://arxiv.org/abs/2207.13080) + + + +## Abstract + +One-to-one set matching is a key design for DETR to establish its end-to-end capability, so that object detection does not require a hand-crafted NMS (non-maximum suppression) to remove duplicate detections. This end-to-end signature is important for the versatility of DETR, and it has been generalized to broader vision tasks. However, we note that there are few queries assigned as positive samples and the one-to-one set matching significantly reduces the training efficacy of positive samples. We propose a simple yet effective method based on a hybrid matching scheme that combines the original one-to-one matching branch with an auxiliary one-to-many matching branch during training. Our hybrid strategy has been shown to significantly improve accuracy. In inference, only the original one-to-one match branch is used, thus maintaining the end-to-end merit and the same inference efficiency of DETR. The method is named H-DETR, and it shows that a wide range of representative DETR methods can be consistently improved across a wide range of visual tasks, including DeformableDETR, PETRv2, PETR, and TransTrack, among others. + +
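A minimal sketch of the hybrid matching idea described in the abstract (a schematic, not the code added by this patch): during training the decoder queries are split into a one-to-one group and an auxiliary one-to-many group, and the auxiliary group is supervised against ground truth repeated `k` times; at inference only the one-to-one group is kept. The names `num_query_one2one` and `k_one2many` mirror this project's config.

```python
import torch

def split_hybrid_queries(decoder_out: torch.Tensor, num_query_one2one: int):
    """Split decoder outputs into one-to-one and one-to-many query groups."""
    one2one = decoder_out[:, :num_query_one2one]   # used at train and test time
    one2many = decoder_out[:, num_query_one2one:]  # auxiliary, training only
    return one2one, one2many

def repeat_targets(gt_bboxes: torch.Tensor, gt_labels: torch.Tensor, k: int):
    """One-to-many branch: each ground-truth box can match up to k queries."""
    return gt_bboxes.repeat(k, 1), gt_labels.repeat(k)

# With the values from this project's config (900 + 900 queries, k_one2many=2):
decoder_out = torch.randn(2, 1800, 256)            # (batch, num_queries, dims)
one2one, one2many = split_hybrid_queries(decoder_out, num_query_one2one=900)
gt_bboxes = torch.rand(5, 4)
gt_labels = torch.randint(0, 80, (5,))
aux_bboxes, aux_labels = repeat_targets(gt_bboxes, gt_labels, k=2)
assert one2one.shape[1] == 900 and aux_bboxes.shape[0] == 10
```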
    + +## Results and Models + +| Backbone | Model | Lr schd | box AP | Config | Download | +| :------: | :-----------: | :-----: | :----: | :--------------------------------------------: | :------: | +| R-50 | H-DINO-4scale | 12e | 48.0 | [config](./h-dino-4scale_r50_8xb2-12e_coco.py) | | + +### NOTE + +1. We are based on `DINO` rather than `Deformable DETR` to support the `Hybrid Matching` algorithm. +2. We found that directly applying Hybrid Matching to the DINO algorithm results in a significant decrease in performance. If you have any other insights or suggestions, please feel free to comment or submit a pull request (PR). + +## Citation + +```latex +@article{jia2022detrs, + title={DETRs with Hybrid Matching}, + author={Jia, Ding and Yuan, Yuhui and He, Haodi and Wu, Xiaopei and Yu, Haojun and Lin, Weihong and Sun, Lei and Zhang, Chao and Hu, Han}, + journal={arXiv preprint arXiv:2207.13080}, + year={2022} +} +``` diff --git a/mmdetection/projects/HDINO/__init__.py b/mmdetection/projects/HDINO/__init__.py new file mode 100644 index 0000000..f8c3478 --- /dev/null +++ b/mmdetection/projects/HDINO/__init__.py @@ -0,0 +1,4 @@ +from .h_dino import HDINO +from .h_dino_head import HybridDINOHead + +__all__ = ['HDINO', 'HybridDINOHead'] diff --git a/mmdetection/projects/HDINO/h-dino-4scale_r50_8xb2-12e_coco.py b/mmdetection/projects/HDINO/h-dino-4scale_r50_8xb2-12e_coco.py new file mode 100644 index 0000000..7b16b48 --- /dev/null +++ b/mmdetection/projects/HDINO/h-dino-4scale_r50_8xb2-12e_coco.py @@ -0,0 +1,168 @@ +_base_ = [ + '../../configs/_base_/datasets/coco_detection.py', + '../../configs/_base_/default_runtime.py' +] + +custom_imports = dict(imports=['projects.HDINO'], allow_failed_imports=False) + +model = dict( + type='HDINO', + num_queries=1800, # num_total_queries: 900+900 + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='ChannelMapper', + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0))), # 0.1 for DeformDETR + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, + dropout=0.0), # 0.1 for DeformDETR + cross_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0)), # 0.1 for DeformDETR + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, + normalize=True, + offset=0.0, # -0.5 for DeformDETR + temperature=20), # 10000 for DeformDETR + bbox_head=dict( + type='HybridDINOHead', + num_classes=80, + sync_cls_avg_factor=True, + num_query_one2one=900, + k_one2many=2, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), # 2.0 in DeformDETR + 
loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + dn_cfg=dict( + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='FocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) # 100 for DeformDETR + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='PackDetInputs') +] +train_dataloader = dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001, # 0.0002 for DeformDETR + weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) +) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa + +# learning policy +max_epochs = 12 +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/mmdetection/projects/HDINO/h_dino.py b/mmdetection/projects/HDINO/h_dino.py new file mode 100644 index 0000000..3f9d116 --- /dev/null +++ b/mmdetection/projects/HDINO/h_dino.py @@ -0,0 +1,149 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
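+# HDINO wraps DINO with the hybrid-matching changes used in this project:
+# `num_queries` covers both the one-to-one and the auxiliary one-to-many
+# queries, the decoder queries are rebuilt either from a linear map of the
+# detached encoder memory (`method == 1`) or from positional embeddings of
+# the top-k proposals, and `pre_decoder` adjusts the denoising attention mask
+# during training and keeps only the leading query block at test time.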
+from typing import Tuple + +import torch +from torch import Tensor, nn +from torch.nn.init import normal_ + +from mmdet.models.detectors import DINO, DeformableDETR +from mmdet.models.detectors.deformable_detr import \ + MultiScaleDeformableAttention +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList +from mmdet.utils import OptConfigType + + +@MODELS.register_module() +class HDINO(DINO): + + def __init__(self, + *args, + bbox_head: OptConfigType = None, + **kwargs) -> None: + self.method = 0 + self.num_query_one2one = bbox_head['num_query_one2one'] + super(HDINO, self).__init__(*args, bbox_head=bbox_head, **kwargs) + + def _init_layers(self) -> None: + super(HDINO, self)._init_layers() + self.query_embedding = None + if self.method == 1: + self.query_map = nn.Linear(self.embed_dims, self.embed_dims) + else: + self.pos_trans_fc = nn.Linear(self.embed_dims * 2, self.embed_dims) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims) + + def init_weights(self) -> None: + super(DeformableDETR, self).init_weights() + """Initialize weights for Transformer and other components.""" + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + nn.init.xavier_uniform_(self.memory_trans_fc.weight) + normal_(self.level_embed) + + if self.method == 1: + nn.init.xavier_uniform_(self.query_map.weight) + else: + nn.init.xavier_uniform_(self.pos_trans_fc.weight) + + def pre_decoder( + self, + memory: Tensor, + memory_mask: Tensor, + spatial_shapes: Tensor, + batch_data_samples: OptSampleList = None, + ) -> Tuple[dict, dict]: + + bs, _, c = memory.shape + cls_out_features = self.bbox_head.cls_branches[ + self.decoder.num_layers].out_features + + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + # NOTE The DINO selects top-k proposals according to scores of + # multi-class classification, while DeformDETR, where the input + # is `enc_outputs_class[..., 0]` selects according to scores of + # binary classification. + topk_indices = torch.topk( + enc_outputs_class.max(-1)[0], k=self.num_queries, dim=1)[1] + topk_score = torch.gather( + enc_outputs_class, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords = topk_coords_unact.sigmoid() + topk_coords_unact = topk_coords_unact.detach() + + # We only made changes here. 
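+        # Two ways of building the decoder queries are supported: when
+        # `method == 1`, a linear projection of the detached encoder memory
+        # is gathered at the top-k positions; otherwise (the default),
+        # queries come from positional embeddings of the top-k proposal
+        # coordinates, similar to two-stage Deformable DETR.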
+ # ------------------------------------- + if self.method == 1: + map_memory = self.query_map(memory.detach()) + query = torch.gather( + map_memory, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, self.embed_dims)) + else: + pos_trans_out = self.pos_trans_fc( + self.get_proposal_pos_embed(topk_coords_unact)) + query = self.pos_trans_norm(pos_trans_out) + # ------------------------------------- + + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = \ + self.dn_query_generator(batch_data_samples) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], + dim=1) + else: + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + reference_points = reference_points.sigmoid() + + decoder_inputs_dict = dict( + query=query, + memory=memory, + reference_points=reference_points, + dn_mask=dn_mask) + # NOTE DINO calculates encoder losses on scores and coordinates + # of selected top-k encoder queries, while DeformDETR is of all + # encoder queries. + head_inputs_dict = dict( + enc_outputs_class=topk_score, + enc_outputs_coord=topk_coords, + dn_meta=dn_meta) if self.training else dict() + + # We only made changes here. + # ------------------------------------- + if self.training: + # train: num_denoising_queries + num_query_one2one + # + num_query_one2many + dn_mask = decoder_inputs_dict['dn_mask'] + num_denoising_queries = head_inputs_dict['dn_meta'][ + 'num_denoising_queries'] + num_query_one2one = num_denoising_queries + self.num_query_one2one + # dn_mask[num_query_one2one:, :num_query_one2one] = True + dn_mask[num_denoising_queries:num_query_one2one, + num_query_one2one:] = True + decoder_inputs_dict['dn_mask'] = dn_mask + else: + # test: num_query_one2one + # + num_query_one2many + query = decoder_inputs_dict['query'] + reference_points = decoder_inputs_dict['reference_points'] + num_query_one2many = self.num_queries - self.num_query_one2one + decoder_inputs_dict['query'] = query[:num_query_one2many] + decoder_inputs_dict[ + 'reference_points'] = reference_points[:num_query_one2many] + # ------------------------------------- + return decoder_inputs_dict, head_inputs_dict diff --git a/mmdetection/projects/HDINO/h_dino_head.py b/mmdetection/projects/HDINO/h_dino_head.py new file mode 100644 index 0000000..aa1d086 --- /dev/null +++ b/mmdetection/projects/HDINO/h_dino_head.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List + +from torch import Tensor + +from mmdet.models.dense_heads.dino_head import DINOHead +from mmdet.models.utils import multi_apply +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptInstanceList + + +@MODELS.register_module() +class HybridDINOHead(DINOHead): + """Head of the Hybrid Matching.""" + + def __init__(self, + *args, + num_query_one2one: int = 900, + k_one2many: int = 2, + **kwargs) -> None: + self.num_query_one2one = num_query_one2one + self.k_one2many = k_one2many + super().__init__(*args, **kwargs) + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Loss function. 
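+
+        Besides the standard DINO losses on the denoising and one-to-one
+        queries, the decoder outputs are split at ``num_denoising_queries +
+        num_query_one2one`` and auxiliary one-to-many losses are computed
+        against ground truth repeated ``k_one2many`` times.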
+ + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels), where + `num_queries_total` is the sum of `num_denoising_queries` + and `num_matching_queries`. + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + enc_cls_scores (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + enc_bbox_preds (Tensor): The proposal generate from the encode + feature map, has shape (bs, num_feat_points, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + # train: num_denoising_queries + num_query_one2one + # + num_query_one2many + num_query_one2one = dn_meta[ + 'num_denoising_queries'] + self.num_query_one2one + outputs_classes_one2one = \ + all_layers_cls_scores[:, :, 0: num_query_one2one, :] + outputs_coords_one2one = \ + all_layers_bbox_preds[:, :, 0: num_query_one2one, :] + # hybrid-matching part + outputs_classes_one2many = \ + all_layers_cls_scores[:, :, num_query_one2one:, :] + outputs_coords_one2many = \ + all_layers_bbox_preds[:, :, num_query_one2one:, :] + + loss_dict = super(HybridDINOHead, self).loss_by_feat( + outputs_classes_one2one, outputs_coords_one2one, enc_cls_scores, + enc_bbox_preds, batch_gt_instances, batch_img_metas, dn_meta, + batch_gt_instances_ignore) + + o2m_batch_gt_instances = [] + for gt_instance in batch_gt_instances: + bboxes = gt_instance.bboxes.repeat(self.k_one2many, 1) + labels = gt_instance.labels.repeat(self.k_one2many) + new_gt_instance = gt_instance.new(bboxes=bboxes, labels=labels) + o2m_batch_gt_instances.append(new_gt_instance) + + losses_cls_o2m, losses_bbox_o2m, losses_iou_o2m = multi_apply( + self.loss_by_feat_single, + outputs_classes_one2many, + outputs_coords_one2many, + batch_gt_instances=o2m_batch_gt_instances, + batch_img_metas=batch_img_metas) + + loss_dict['loss_cls_o2m'] = losses_cls_o2m[-1] + loss_dict['loss_bbox_o2m'] = losses_bbox_o2m[-1] + loss_dict['loss_iou_o2m'] = losses_iou_o2m[-1] + for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \ + enumerate(zip(losses_cls_o2m[:-1], losses_bbox_o2m[:-1], + losses_iou_o2m[:-1])): + loss_dict[f'd{num_dec_layer}.loss_cls_o2m'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox_o2m'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou_o2m'] = loss_iou_i + return loss_dict diff --git a/mmdetection/projects/LabelStudio/backend_template/_wsgi.py b/mmdetection/projects/LabelStudio/backend_template/_wsgi.py new file mode 100644 index 0000000..1f8fb68 --- /dev/null +++ b/mmdetection/projects/LabelStudio/backend_template/_wsgi.py @@ 
-0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import logging +import logging.config +import os + +logging.config.dictConfig({ + 'version': 1, + 'formatters': { + 'standard': { + 'format': + '[%(asctime)s] [%(levelname)s] [%(name)s::%(funcName)s::%(lineno)d] %(message)s' # noqa E501 + } + }, + 'handlers': { + 'console': { + 'class': 'logging.StreamHandler', + 'level': 'DEBUG', + 'stream': 'ext://sys.stdout', + 'formatter': 'standard' + } + }, + 'root': { + 'level': 'ERROR', + 'handlers': ['console'], + 'propagate': True + } +}) + +_DEFAULT_CONFIG_PATH = os.path.join(os.path.dirname(__file__), 'config.json') + + +def get_kwargs_from_config(config_path=_DEFAULT_CONFIG_PATH): + if not os.path.exists(config_path): + return dict() + with open(config_path) as f: + config = json.load(f) + assert isinstance(config, dict) + return config + + +if __name__ == '__main__': + + from label_studio_ml.api import init_app + + from projects.LabelStudio.backend_template.mmdetection import MMDetection + + parser = argparse.ArgumentParser(description='Label studio') + parser.add_argument( + '-p', + '--port', + dest='port', + type=int, + default=9090, + help='Server port') + parser.add_argument( + '--host', dest='host', type=str, default='0.0.0.0', help='Server host') + parser.add_argument( + '--kwargs', + '--with', + dest='kwargs', + metavar='KEY=VAL', + nargs='+', + type=lambda kv: kv.split('='), + help='Additional LabelStudioMLBase model initialization kwargs') + parser.add_argument( + '-d', + '--debug', + dest='debug', + action='store_true', + help='Switch debug mode') + parser.add_argument( + '--log-level', + dest='log_level', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], + default=None, + help='Logging level') + parser.add_argument( + '--model-dir', + dest='model_dir', + default=os.path.dirname(__file__), + help='Directory models are store', + ) + parser.add_argument( + '--check', + dest='check', + action='store_true', + help='Validate model instance before launching server') + + args = parser.parse_args() + + # setup logging level + if args.log_level: + logging.root.setLevel(args.log_level) + + def isfloat(value): + try: + float(value) + return True + except ValueError: + return False + + def parse_kwargs(): + param = dict() + for k, v in args.kwargs: + if v.isdigit(): + param[k] = int(v) + elif v == 'True' or v == 'true': + param[k] = True + elif v == 'False' or v == 'False': + param[k] = False + elif isfloat(v): + param[k] = float(v) + else: + param[k] = v + return param + + kwargs = get_kwargs_from_config() + + if args.kwargs: + kwargs.update(parse_kwargs()) + + if args.check: + print('Check "' + MMDetection.__name__ + '" instance creation..') + model = MMDetection(**kwargs) + + app = init_app( + model_class=MMDetection, + model_dir=os.environ.get('MODEL_DIR', args.model_dir), + redis_queue=os.environ.get('RQ_QUEUE_NAME', 'default'), + redis_host=os.environ.get('REDIS_HOST', 'localhost'), + redis_port=os.environ.get('REDIS_PORT', 6379), + **kwargs) + + app.run(host=args.host, port=args.port, debug=args.debug) + +else: + # for uWSGI use + app = init_app( + model_class=MMDetection, + model_dir=os.environ.get('MODEL_DIR', os.path.dirname(__file__)), + redis_queue=os.environ.get('RQ_QUEUE_NAME', 'default'), + redis_host=os.environ.get('REDIS_HOST', 'localhost'), + redis_port=os.environ.get('REDIS_PORT', 6379)) diff --git a/mmdetection/projects/LabelStudio/backend_template/mmdetection.py b/mmdetection/projects/LabelStudio/backend_template/mmdetection.py new 
file mode 100644 index 0000000..f25e80e --- /dev/null +++ b/mmdetection/projects/LabelStudio/backend_template/mmdetection.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import json +import logging +import os +from urllib.parse import urlparse + +import boto3 +from botocore.exceptions import ClientError +from label_studio_ml.model import LabelStudioMLBase +from label_studio_ml.utils import (DATA_UNDEFINED_NAME, get_image_size, + get_single_tag_keys) +from label_studio_tools.core.utils.io import get_data_dir + +from mmdet.apis import inference_detector, init_detector + +logger = logging.getLogger(__name__) + + +class MMDetection(LabelStudioMLBase): + """Object detector based on https://github.com/open-mmlab/mmdetection.""" + + def __init__(self, + config_file=None, + checkpoint_file=None, + image_dir=None, + labels_file=None, + score_threshold=0.5, + device='cpu', + **kwargs): + + super(MMDetection, self).__init__(**kwargs) + config_file = config_file or os.environ['config_file'] + checkpoint_file = checkpoint_file or os.environ['checkpoint_file'] + self.config_file = config_file + self.checkpoint_file = checkpoint_file + self.labels_file = labels_file + # default Label Studio image upload folder + upload_dir = os.path.join(get_data_dir(), 'media', 'upload') + self.image_dir = image_dir or upload_dir + logger.debug( + f'{self.__class__.__name__} reads images from {self.image_dir}') + if self.labels_file and os.path.exists(self.labels_file): + self.label_map = json_load(self.labels_file) + else: + self.label_map = {} + + self.from_name, self.to_name, self.value, self.labels_in_config = get_single_tag_keys( # noqa E501 + self.parsed_label_config, 'RectangleLabels', 'Image') + schema = list(self.parsed_label_config.values())[0] + self.labels_in_config = set(self.labels_in_config) + + # Collect label maps from `predicted_values="airplane,car"` attribute in