forked from ashleve/lightning-hydra-template
Merge pull request #41 from cameron-a-johnson/dev/simple-tcn-input-vector
Adding simple 'locsAndConfs' TCN input vector
Showing 5 changed files with 311 additions and 18 deletions.
@@ -0,0 +1,121 @@
# @package _global_

# To execute this experiment run:
#   python train.py experiment=example
task: "m2"
# feature_version: 6
topic: "medical"

defaults:
  - override /data: ptg
  - override /model: ptg
  - override /callbacks: default
  - override /trainer: gpu
  - override /paths: default
  #- override /logger: aim
  - override /logger: csv

# All parameters below will be merged with parameters from the default
# configurations set above. This allows you to overwrite only specified
# parameters.

# Change this name to something descriptive and unique for this experiment.
# This will differentiate the run logs and outputs from other experiments
# that may have been run under the same configuration.
# Setting this value influences:
#   - the name of the directory under `${paths.root_dir}/logs/` in which
#     training run files are stored.
# Default is "train", set in the "configs/train.yaml" file.
#task_name:

# Simply provide a checkpoint path to resume training.
#ckpt_path: null

tags: ["m2", "ms_tcn", "debug"]

seed: 12345

trainer:
  min_epochs: 50
  max_epochs: 500
  log_every_n_steps: 1

model:
  compile: false
  net:
    # Length of the feature vector for a single frame.
    # Currently derived from the feature version and other hyperparameters.
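    # With the LocsAndConfs vectorizer configured below (22 pose joints with
    # per-joint confidences plus one overall pose score, 7 object classes,
    # top_k=1), this works out to 3*22 + 1 + 5*1*7 = 102.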
    dim: 102
    num_classes: 9

data:
  coco_train_activities: "${paths.coco_file_root}/TRAIN-activity_truth.coco.json"
  coco_train_objects: "${paths.coco_file_root}/TRAIN-object_detections.coco.json"
  coco_train_poses: "${paths.coco_file_root}/TRAIN-pose_estimates.coco.json"

  coco_validation_activities: "${paths.coco_file_root}/VALIDATION-activity_truth.coco.json"
  coco_validation_objects: "${paths.coco_file_root}/VALIDATION-object_detections.coco.json"
  coco_validation_poses: "${paths.coco_file_root}/VALIDATION-pose_estimates.coco.json"

  coco_test_activities: "${paths.coco_file_root}/TEST-activity_truth.coco.json"
  coco_test_objects: "${paths.coco_file_root}/TEST-object_detections.coco.json"
  coco_test_poses: "${paths.coco_file_root}/TEST-pose_estimates.coco.json"

  batch_size: 16384
  num_workers: 16
  target_framerate: 15  # BBN Hololens2 framerate
  epoch_length: 200000

  train_dataset:
    window_size: 25
    vectorizer:
      _target_: tcn_hpl.data.vectorize.locs_and_confs.LocsAndConfs
      top_k: 1
      num_classes: 7
      use_joint_confs: True
      use_pixel_norm: True
      use_joint_obj_offsets: False
      background_idx: 0
    transform:
      transforms: []  # no transforms
        # - _target_: tcn_hpl.data.components.augmentations.MoveCenterPts
        #   hand_dist_delta: 0.05
        #   obj_dist_delta: 0.05
        #   joint_dist_delta: 0.025
        #   im_w: 1280
        #   im_h: 720
        #   num_obj_classes: 42
        #   feat_version: 2
        #   top_k_objects: 1
        # - _target_: tcn_hpl.data.components.augmentations.NormalizePixelPts
        #   im_w: 1280
        #   im_h: 720
        #   num_obj_classes: 42
        #   feat_version: 2
        #   top_k_objects: 1
  val_dataset:
    transform:
      transforms: []  # no transforms
        # - _target_: tcn_hpl.data.components.augmentations.NormalizePixelPts
        #   im_w: 1280
        #   im_h: 720
        #   num_obj_classes: 42
        #   feat_version: 2
        #   top_k_objects: 1
  # The test dataset is usually configured the same as val, unless there is
  # some different set of transforms that should be used during
  # test/prediction.

paths:
  # root_dir: "/data/PTG/medical/training/activity_classifier/TCN_HPL/"
  # root_dir: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens/training_root"
  root_dir: "/home/local/KHQ/cameron.johnson/code/TCN_HPL/tcn_hpl/train-TCN-M2_bbn_hololens/training_root"

  # Convenience variable for where your train/val/test split COCO file
  # datasets are stored.
  # coco_file_root: "/home/local/KHQ/paul.tunison/data/darpa-ptg/train-TCN-M2_bbn_hololens"
  coco_file_root: "/home/local/KHQ/cameron.johnson/code/TCN_HPL/train-TCN-M2_bbn_hololens"

#exp_name: "tcn_training_revive"
#logger:
#  aim:
#    experiment: ${task_name}
#    capture_terminal_logs: true
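For reference, a config like this is launched through the template's Hydra entrypoint, as the comment at the top of the file indicates. A minimal sketch, assuming this file is saved under configs/experiment/ with a hypothetical name like m2_locs_and_confs.yaml:

python train.py experiment=m2_locs_and_confs
# Hydra also allows overriding individual values from the command line, e.g.:
python train.py experiment=m2_locs_and_confs trainer.max_epochs=100 data.batch_size=4096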
@@ -0,0 +1,161 @@
import numpy as np
from numpy import typing as npt

from tcn_hpl.data.vectorize._interface import Vectorize, FrameData

NUM_POSE_JOINTS = 22


class LocsAndConfs(Vectorize):
    """
    Previous manual approach to vectorization.

    Arguments:
        top_k: The number of top per-class examples to use in vector
            construction.
        num_classes: The number of classes in the object detector.
        use_joint_confs: Use the confidence of each pose joint.
            (Changes the length of the input vector, which needs to be
            manually updated if this flag changes.)
        use_pixel_norm: Normalize pixel coordinates by dividing by the
            frame width and height, respectively. Normalized values are
            between 0 and 1. Does not change the input vector length.
        use_joint_obj_offsets: Add abs(X and Y offsets) between joints and
            each object.
            (Changes the length of the input vector, which needs to be
            manually updated if this flag changes.)
        background_idx: Label index of the background class in the object
            detector.
    """

    def __init__(
        self,
        top_k: int = 1,
        num_classes: int = 7,
        use_joint_confs: bool = True,
        use_pixel_norm: bool = True,
        use_joint_obj_offsets: bool = False,
        background_idx: int = 0,
    ):
        super().__init__()

        self._top_k = top_k
        self._num_classes = num_classes
        self._use_joint_confs = use_joint_confs
        self._use_pixel_norm = use_pixel_norm
        self._use_joint_obj_offsets = use_joint_obj_offsets
        self._background_idx = background_idx

    @staticmethod
    def get_top_k_indexes_of_one_obj_type(f_dets, k, label_ind):
        """
        Find all instances of a label index in the object detections, sort
        them by score, and return the indices of the top k.

        Inputs:
            f_dets: Object detections for a frame, with parallel `labels`
                and `scores` sequences.
            k: Maximum number of detection indices to return.
            label_ind: Label index of the object class to filter for.

        Returns the top-k detection indices for the class, highest score
        first, or None if the class was not detected. For example,
        labels=[0, 2, 2] with scores=[0.9, 0.3, 0.8], k=1, and label_ind=2
        yields [2].
        """
        labels = f_dets.labels
        scores = f_dets.scores
        # Get the indices of all detections of this object type.
        filtered_idxs = [i for i, e in enumerate(labels) if e == label_ind]
        if not filtered_idxs:
            return None
        filtered_scores = [scores[i] for i in filtered_idxs]
        # Sort the filtered indices by descending score value.
        sorted_inds = [
            i for _, i in sorted(zip(filtered_scores, filtered_idxs), reverse=True)
        ]
        return sorted_inds[:k]

    @staticmethod
    def append_vector(frame_feat, i, number):
        # Write `number` at index `i` of the feature vector and return the
        # advanced write index.
        frame_feat[i] = number
        return frame_feat, i + 1

    def determine_vector_length(self, data: FrameData) -> int:
        #########################
        # Feature vector
        #########################
        # Length: pose confs * 22, pose X's * 22, pose Y's * 22,
        #         obj confs * num_objects (7 for M2),
        #         obj X * num_objects (7 for M2),
        #         obj Y * num_objects (7 for M2),
        #         obj W * num_objects (7 for M2),
        #         obj H * num_objects (7 for M2),
        #         casualty conf * 1
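        # With the defaults (use_joint_confs=True, 22 joints, top_k=1,
        # num_classes=7) this totals 3*22 + 1 + 5*1*7 = 102, matching
        # `model.net.dim: 102` in the experiment config.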
        vector_length = 0
        # Joint confidences
        if self._use_joint_confs:
            vector_length += NUM_POSE_JOINTS
        # X and Y for each joint
        vector_length += 2 * NUM_POSE_JOINTS
        # Overall pose (casualty) confidence score
        vector_length += 1
        # [Conf, X, Y, W, H] for k instances of each object class.
        vector_length += 5 * self._top_k * self._num_classes
        return vector_length

    def vectorize(self, data: FrameData) -> npt.NDArray[np.float32]:
        vector_len = self.determine_vector_length(data)
        frame_feat = np.zeros(vector_len, dtype=np.float32)
        # TODO: Instead of carrying around this vector_ind, we should
        # directly compute the offset of each feature we add to the TCN
        # input vector. That would be much easier to debug.
        vector_ind = 0
        if self._use_pixel_norm:
            W = data.size[0]
            H = data.size[1]
        else:
            W = 1
            H = 1
        f_dets = data.object_detections

        # Loop through all classes: populate obj conf, obj X, obj Y, obj W,
        # obj H.
        # Assumption: class labels are [0, 1, 2, ..., num_classes-1].
        for obj_ind in range(self._num_classes):
            top_k_idxs = self.get_top_k_indexes_of_one_obj_type(
                f_dets, self._top_k, obj_ind
            )
            # None is returned if there were no detections for this class.
            top_k_idxs = top_k_idxs or []
            for idx in top_k_idxs:
                # Conf
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_dets.scores[idx])
                # X
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_dets.boxes[idx][0] / W)
                # Y
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_dets.boxes[idx][1] / H)
                # W
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_dets.boxes[idx][2] / W)
                # H
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_dets.boxes[idx][3] / H)
            # Zero-pad [Conf, X, Y, W, H] for any of the top-k instances of
            # this class that were not detected.
            for _ in range((self._top_k - len(top_k_idxs)) * 5):
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, 0)

        f_poses = data.poses
        if f_poses:
            # Use the most confident body detection.
            confident_pose_idx = np.argmax(f_poses.scores)
            num_joints = f_poses.joint_positions.shape[1]
            frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_poses.scores[confident_pose_idx])

            for joint_ind in range(num_joints):
                # Conf
                if self._use_joint_confs:
                    frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_poses.joint_scores[confident_pose_idx][joint_ind])
                # X
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_poses.joint_positions[confident_pose_idx][joint_ind][0] / W)
                # Y
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, f_poses.joint_positions[confident_pose_idx][joint_ind][1] / H)
        else:
            # No pose for this frame: zero-pad the pose score plus the
            # per-joint features.
            if self._use_joint_confs:
                rows_per_joint = 3
            else:
                rows_per_joint = 2
            for _ in range(NUM_POSE_JOINTS * rows_per_joint + 1):
                frame_feat, vector_ind = self.append_vector(frame_feat, vector_ind, 0)

        assert vector_ind == vector_len

        return frame_feat
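A minimal usage sketch, under assumptions: the real FrameData, detection, and pose types live in tcn_hpl.data.vectorize._interface, so the SimpleNamespace stand-ins below only mimic the attributes this class actually reads (labels, scores, boxes in [X, Y, W, H] order, size, joint_positions, joint_scores), and it is assumed the Vectorize interface requires nothing beyond what is shown above.

import numpy as np
from types import SimpleNamespace

from tcn_hpl.data.vectorize.locs_and_confs import LocsAndConfs

# Two detections: one of class 0 and one of class 2; classes 1 and 3-6 absent.
dets = SimpleNamespace(
    labels=[0, 2],
    scores=[0.9, 0.75],
    boxes=[[100, 200, 50, 80], [400, 300, 60, 40]],
)
# One pose with 22 joints at placeholder positions.
poses = SimpleNamespace(
    scores=np.array([0.8]),
    joint_positions=np.zeros((1, 22, 2)),
    joint_scores=np.ones((1, 22)),
)
frame = SimpleNamespace(size=(1280, 720), object_detections=dets, poses=poses)

vec = LocsAndConfs(top_k=1, num_classes=7).vectorize(frame)
print(vec.shape)  # (102,) with these defaults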