Clean up video inference. Add individuals, bodyparts and unique bodyparts to metadata.

DeepLabCut · n-poulsen · Oct 18, 2024 · Sep 18, 2024 · Oct 1, 2024 · Oct 1, 2024
commit 3f87fee9a50adece4dccb45c9cb144a00764ddb1
diff --git a/deeplabcut/pose_estimation_pytorch/README.md b/deeplabcut/pose_estimation_pytorch/README.md
@@ -469,7 +469,7 @@ pose_runner, detector_runner = get_inference_runners(
 )
 
 predictions = video_inference(
-    video_path=video_path,
+    video=video_path,
     task=pose_task,
     pose_runner=pose_runner,
     detector_runner=detector_runner,

diff --git a/deeplabcut/pose_estimation_pytorch/apis/analyze_videos.py b/deeplabcut/pose_estimation_pytorch/apis/analyze_videos.py
@@ -91,31 +91,42 @@ def __next__(self) -> np.ndarray | tuple[str, dict[str, Any]]:
 
 
 def video_inference(
-    video_path: str | Path,
+    video: str | Path | VideoIterator,
     task: Task,
     pose_runner: InferenceRunner,
     detector_runner: InferenceRunner | None = None,
     with_identity: bool = False,
-    return_video_metadata: bool = False,
     cropping: list[int] | None = None,
 ) -> list[dict[str, np.ndarray]]:
-    """Runs inference on a video"""
-    video = VideoIterator(str(video_path), cropping=cropping)
+    """Runs inference on a video
+
+    Args:
+        video: The video to analyze, given as a path or as a ``VideoIterator``.
+        task: The pose task to run (bottom-up or top-down)
+        pose_runner: The pose runner to run inference with
+        detector_runner: When ``task==Task.TOP_DOWN``, the detector runner to obtain
+            bounding boxes for the video.
+        with_identity: Whether identity predictions should be made with the model.
+        cropping: Optionally, run inference on a cropped area of the video, given
+            as ``[xmin, xmax, ymin, ymax]``. Only used when ``video`` is a path; it
+            is ignored when a ``VideoIterator`` is passed, which is used as-is.
+
+    Returns:
+        Predictions for each frame in the video.
+    """
+    if not isinstance(video, VideoIterator):
+        video = VideoIterator(str(video), cropping=cropping)
+
     n_frames = video.get_n_frames()
     vid_w, vid_h = video.dimensions
-    print(f"Starting to analyze {video_path}")
+    print(f"Starting to analyze {video.video_path}")
     print(
         f"Video metadata: \n"
         f"  Overall # of frames:    {n_frames}\n"
         f"  Duration of video [s]:  {n_frames / max(1, video.fps):.2f}\n"
         f"  fps:                    {video.fps}\n"
         f"  resolution:             w={vid_w}, h={vid_h}\n"
     )
-    video_metadata = {
-        "n_frames": n_frames,
-        "fps": video.fps,
-        "resolution": (vid_w, vid_h),
-    }
 
     if task == Task.TOP_DOWN:
         # Get bounding boxes for context
@@ -147,9 +158,6 @@ def video_inference(
             f"video (tips on how to do that: {tip_url}{header})"
         )
 
-    if return_video_metadata:
-        return predictions, video_metadata
-
     return predictions
 
 
@@ -337,7 +345,7 @@ def analyze_videos(
         else:
             runtime = [time.time()]
             predictions = video_inference(
-                video_path=video,
+                video=video,
                 pose_runner=pose_runner,
                 task=pose_task,
                 detector_runner=detector_runner,
@@ -558,6 +566,9 @@ def _generate_metadata(
         "training set fraction": train_fraction,
         "cropping": cropping is not None,
         "cropping_parameters": cropping_parameters,
+        "individuals": pytorch_config["metadata"]["individuals"],
+        "bodyparts": pytorch_config["metadata"]["bodyparts"],
+        "unique_bodyparts": pytorch_config["metadata"]["unique_bodyparts"],
     }
     return {"data": metadata}
 

diff --git a/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py b/deeplabcut/pose_estimation_pytorch/modelzoo/inference.py
@@ -19,6 +19,7 @@
 from deeplabcut.pose_estimation_pytorch.apis.analyze_videos import (
     create_df_from_prediction,
     video_inference,
+    VideoIterator,
 )
 from deeplabcut.pose_estimation_pytorch.apis.utils import get_inference_runners
 from deeplabcut.pose_estimation_pytorch.modelzoo.utils import (
@@ -127,17 +128,19 @@ def _video_inference_superanimal(
             # str(output_h5).replace(".h5", "_after_adapt.json")
             output_json = output_json.with_stem(output_h5.stem + output_suffix)
 
-        predictions, video_metadata = video_inference(
-            video_path,
+        video = VideoIterator(video_path)
+        predictions = video_inference(
+            video,
             task=pose_task,
             pose_runner=pose_runner,
             detector_runner=detector_runner,
-            return_video_metadata=True,
         )
+
         pred_bodyparts = np.stack([p["bodyparts"][..., :3] for p in predictions])
         pred_unique_bodyparts = None
 
-        bbox = (0, video_metadata["resolution"][0], 0, video_metadata["resolution"][1])
+        vid_w, vid_h = video.dimensions
+        bbox = (0, vid_w, 0, vid_h)
         print(f"Saving results to {dest_folder}")
 
         df = create_df_from_prediction(
@@ -164,7 +167,7 @@ def _video_inference_superanimal(
             video_path,
             output_h5,
             pcutoff=pcutoff,
-            fps=video_metadata["fps"],
+            fps=video.fps,
             bbox=bbox,
             cmap=colormap,
             output_path=str(output_video),