Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
8526228
update vla ops
Cathy0908 Mar 6, 2026
d24d91a
add new ops and update havor op
Mar 11, 2026
b7f467e
fix op
Mar 11, 2026
fb52cec
add vis demo
Mar 11, 2026
e77d7d7
update vis demo tools
Mar 12, 2026
ba47fb6
update vis tools
Mar 12, 2026
570c791
update ops
Mar 12, 2026
8fe0698
add vla_pipeline demo
Cathy0908 Mar 13, 2026
27b9aa5
update vla demo
Cathy0908 Mar 13, 2026
f148ce7
update code
Cathy0908 Mar 13, 2026
f608d13
merge main
Cathy0908 Mar 13, 2026
17c707b
update
Cathy0908 Mar 13, 2026
7d94c72
add ut
Cathy0908 Mar 13, 2026
e02b585
rename demo name
Cathy0908 Mar 13, 2026
95ccec5
update dir name
Cathy0908 Mar 13, 2026
3b497cd
fix pre-commit check
Cathy0908 Mar 13, 2026
94efd7a
remove old ut
Cathy0908 Mar 13, 2026
01699d0
update operator.md
Cathy0908 Mar 13, 2026
db06f38
add action caption
Cathy0908 Mar 17, 2026
527f4d1
update readme
Cathy0908 Mar 17, 2026
6a22558
Fix merge conflicts: handle deleted files and manual merge for video_…
Cathy0908 Mar 18, 2026
5ef6256
fix ut
Cathy0908 Mar 18, 2026
51fd226
update video action caption
Cathy0908 Mar 18, 2026
628cc96
support output bytes for video_split_by_duration_mapper
Cathy0908 Mar 18, 2026
8579334
support video bytes for export_to_lerobot_mapper
Cathy0908 Mar 18, 2026
c384a6b
optimize video_hand_reconstruction_hawor_mapper
Cathy0908 Mar 19, 2026
c1a1d24
update vla_pipeline
Cathy0908 Mar 19, 2026
3c3d4f9
optimize performance of ops
Cathy0908 Mar 23, 2026
a007178
optimize performance: tolist -> numpy
Cathy0908 Mar 23, 2026
69989d0
remove comments
Cathy0908 Mar 23, 2026
030eacf
add joints_cam info, shape=(T, 21, 3)
Cathy0908 Mar 24, 2026
91d2a47
update ops
Cathy0908 Mar 25, 2026
36faca9
add droid_buffer param for megasam
Cathy0908 Mar 26, 2026
581cf45
support save depth, points, mask to file
Cathy0908 Mar 26, 2026
e58172e
add Dockerfile for ego pose demo
Cathy0908 Mar 27, 2026
380efac
update vla pipeline
Cathy0908 Mar 27, 2026
d4f72cc
update vis tools
Cathy0908 Mar 27, 2026
d586c10
update vla_pipleline.pty
Cathy0908 Mar 27, 2026
237340b
add new ops:VideoHandMotionSmoothMapper,VideoClipReassemblyMapper
Cathy0908 Mar 31, 2026
eb70c76
update operators.md
Cathy0908 Mar 31, 2026
b519a52
add new ops
Cathy0908 Apr 1, 2026
a0fcb17
update ops
Cathy0908 Apr 1, 2026
137ca1c
optimize ops
Cathy0908 Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions data_juicer/config/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -560,18 +560,18 @@ process:
if_output_point_maps_from_projection: True # Determines whether to output point maps directly inferred by VGGT.
if_output_point_maps_from_unprojection: True # Determines whether to output point maps constructed from depth maps and camera parameters.
if_output_point_tracks: True # Determines whether to output point tracks.
- video_camera_calibration_static_deepcalib_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib.
- video_camera_calibration_deepcalib_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib.
model_path: "weights_10_0.02.h5" # The path to the DeepCalib Regression model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'static_camera_calibration_deepcalib_tags' # the field name to store the tags. It's "static_camera_calibration_deepcalib_tags" in default.
tag_field_name: 'camera_calibration_deepcalib_tags' # the field name to store the tags. It's "camera_calibration_deepcalib_tags" in default.
frame_dir: None # Output directory to save extracted frames.
output_info_dir: None # Output directory for saving camera parameters.
- video_camera_calibration_static_moge_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib).
- video_camera_calibration_moge_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib).
model_path: "Ruicheng/moge-2-vitl" # The path to the Moge-2 model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'static_camera_calibration_moge_tags' # the field name to store the tags. It's "static_camera_calibration_moge_tags" in default.
tag_field_name: 'camera_calibration_moge_tags' # the field name to store the tags. It's "camera_calibration_moge_tags" in default.
frame_dir: None # Output directory to save extracted frames.
if_output_info: True # Whether to save the camera parameters results to a JSON file.
output_info_dir: None # Output directory for saving camera parameters.
Expand Down Expand Up @@ -972,6 +972,7 @@ process:
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the sampling rate in frames per second used to compute optical flow
original_fps: null # the original FPS of the video from which the frames were extracted, only used when frame_field is specified
size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w)
max_size: null # maximum allowed for the longer edge of resized frames
divisible: 1 # The number that the dimensions must be divisible by.
Expand All @@ -981,6 +982,7 @@ process:
min_score: 1.0 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the sampling rate in frames per second used to compute optical flow
original_fps: null # the original FPS of the video from which the frames were extracted, only used when frame_field is specified
size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w)
max_size: null # maximum allowed for the longer edge of resized frames
divisible: 8 # The number that the dimensions must be divisible by.
Expand Down
19 changes: 19 additions & 0 deletions data_juicer/ops/common/mano_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,25 @@ def forward(self, *args, **kwargs) -> smplx.utils.MANOOutput:
mano_output.joints = joints
return mano_output

@classmethod
def build_left(cls, model_path, fix_shapedirs=True):
    """Construct a MANO model configured for the LEFT hand.

    HaWoR relies on a dedicated MANO_LEFT.pkl loaded with
    ``is_rhand=False``; the official left-hand asset additionally needs a
    sign flip on the first shape-blend axis to work around a known smplx
    issue (https://github.com/vchoutas/smplx/issues/48).

    Args:
        model_path: Path to MANO_LEFT.pkl
        fix_shapedirs: Apply the left-hand shapedirs fix (default True).

    Returns:
        MANO model configured for left hand.
    """
    left_hand = cls(model_path=model_path, is_rhand=False)
    if fix_shapedirs:
        # negate the x-component of the shape blend shapes (left-hand fix)
        left_hand.shapedirs[:, 0, :] *= -1
    return left_hand

def query(self, hmr_output):
batch_size = hmr_output["pred_rotmat"].shape[0]
pred_rotmat = hmr_output["pred_rotmat"].reshape(batch_size, -1, 3, 3)
Expand Down
20 changes: 19 additions & 1 deletion data_juicer/ops/filter/video_motion_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
max_score: float = sys.float_info.max,
frame_field: Optional[str] = None,
sampling_fps: PositiveFloat = 2,
original_fps: Optional[PositiveFloat] = None,
size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None,
max_size: Optional[PositiveInt] = None,
divisible: PositiveInt = 1,
Expand All @@ -72,6 +73,11 @@ def __init__(
If frame_field is None, extract frames from the video field.
:param sampling_fps: The sampling rate in frames_per_second for
optical flow calculations.
:param original_fps: The original FPS of the video from which the
frames were extracted. Only used when `frame_field` is specified.
When provided, frames will be sampled at `sampling_fps` rate
by computing `sampling_step = round(original_fps / sampling_fps)`.
If None, all frames will be processed without sampling.
:param size: Resize frames before computing optical flow. If size is a
sequence like (h, w), frame size will be matched to this. If size
is an int, smaller edge of frames will be matched to this number.
Expand Down Expand Up @@ -101,6 +107,7 @@ def __init__(
self.min_score = min_score
self.max_score = max_score
self.sampling_fps = sampling_fps
self.original_fps = original_fps
self.frame_field = frame_field

if isinstance(size, (list, tuple)):
Expand Down Expand Up @@ -198,7 +205,18 @@ def _compute_motion_scores_from_frames(self, frames):
video_motion_scores = []
optical_flows = []
prev_frame = None
for frame in frames:

# compute sampling step if original_fps is provided
sampling_step = 1
if self.original_fps is not None and self.original_fps > 0:
effective_fps = min(self.sampling_fps, self.original_fps)
sampling_step = max(round(self.original_fps / effective_fps), 1)

for frame_idx, frame in enumerate(frames):
# skip frames according to sampling_step
if sampling_step > 1 and frame_idx % sampling_step != 0:
continue

if isinstance(frame, bytes):
image_array = np.frombuffer(frame, dtype=np.uint8)
frame = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
Expand Down
2 changes: 2 additions & 0 deletions data_juicer/ops/filter/video_motion_score_ptlflow_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def __init__(
ckpt_path: Optional[str] = "things",
get_model_args: Optional[dict] = None,
sampling_fps: PositiveFloat = 2,
original_fps: Optional[PositiveFloat] = None,
size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None,
max_size: Optional[PositiveInt] = None,
divisible: PositiveInt = 8,
Expand All @@ -70,6 +71,7 @@ def __init__(
max_score,
frame_field,
sampling_fps,
original_fps,
size,
max_size,
divisible,
Expand Down
2 changes: 2 additions & 0 deletions data_juicer/ops/filter/video_motion_score_raft_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __init__(
max_score: float = sys.float_info.max,
frame_field: Optional[str] = None,
sampling_fps: PositiveFloat = 2,
original_fps: Optional[PositiveFloat] = None,
size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None,
max_size: Optional[PositiveInt] = None,
divisible: PositiveInt = 8,
Expand All @@ -67,6 +68,7 @@ def __init__(
max_score,
frame_field,
sampling_fps,
original_fps,
size,
max_size,
divisible,
Expand Down
29 changes: 22 additions & 7 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from .dialog_topic_detection_mapper import DialogTopicDetectionMapper
from .download_file_mapper import DownloadFileMapper
from .expand_macro_mapper import ExpandMacroMapper
from .export_to_lerobot_mapper import ExportToLeRobotMapper
from .extract_entity_attribute_mapper import ExtractEntityAttributeMapper
from .extract_entity_relation_mapper import ExtractEntityRelationMapper
from .extract_event_mapper import ExtractEventMapper
Expand Down Expand Up @@ -84,22 +85,27 @@
from .text_chunk_mapper import TextChunkMapper
from .text_tagging_by_prompt_mapper import TextTaggingByPromptMapper
from .vggt_mapper import VggtMapper
from .video_camera_calibration_static_deepcalib_mapper import (
VideoCameraCalibrationStaticDeepcalibMapper,
from .video_atomic_action_segment_mapper import VideoAtomicActionSegmentMapper
from .video_camera_calibration_deepcalib_mapper import (
VideoCameraCalibrationDeepcalibMapper,
)
from .video_camera_calibration_static_moge_mapper import (
VideoCameraCalibrationStaticMogeMapper,
from .video_camera_calibration_droidcalib_mapper import (
VideoCameraCalibrationDroidCalibMapper,
)
from .video_camera_pose_mapper import VideoCameraPoseMapper
from .video_camera_calibration_moge_mapper import VideoCameraCalibrationMogeMapper
from .video_camera_pose_megasam_mapper import VideoCameraPoseMegaSaMMapper
from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper
from .video_captioning_from_frames_mapper import VideoCaptioningFromFramesMapper
from .video_captioning_from_summarizer_mapper import VideoCaptioningFromSummarizerMapper
from .video_captioning_from_video_mapper import VideoCaptioningFromVideoMapper
from .video_captioning_from_vlm_mapper import VideoCaptioningFromVLMMapper
from .video_clip_reassembly_mapper import VideoClipReassemblyMapper
from .video_depth_estimation_mapper import VideoDepthEstimationMapper
from .video_extract_frames_mapper import VideoExtractFramesMapper
from .video_face_blur_mapper import VideoFaceBlurMapper
from .video_ffmpeg_wrapped_mapper import VideoFFmpegWrappedMapper
from .video_hand_action_compute_mapper import VideoHandActionComputeMapper
from .video_hand_motion_smooth_mapper import VideoHandMotionSmoothMapper
from .video_hand_reconstruction_hawor_mapper import VideoHandReconstructionHaworMapper
from .video_hand_reconstruction_mapper import VideoHandReconstructionMapper
from .video_object_segmenting_mapper import VideoObjectSegmentingMapper
Expand All @@ -111,11 +117,14 @@
from .video_split_by_scene_mapper import VideoSplitBySceneMapper
from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .video_trajectory_overlay_mapper import VideoTrajectoryOverlayMapper
from .video_undistort_mapper import VideoUndistortMapper
from .video_whole_body_pose_estimation_mapper import VideoWholeBodyPoseEstimationMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper

__all__ = [
"VideoCameraCalibrationDroidCalibMapper",
"VideoCameraPoseMegaSaMMapper",
"AudioAddGaussianNoiseMapper",
"AudioFFmpegWrappedMapper",
"CalibrateQAMapper",
Expand All @@ -140,6 +149,7 @@
"ExpandMacroMapper",
"ExtractEntityAttributeMapper",
"ExtractEntityRelationMapper",
"ExportToLeRobotMapper",
"ExtractEventMapper",
"ExtractKeywordMapper",
"ExtractNicknameMapper",
Expand Down Expand Up @@ -196,8 +206,8 @@
"TextChunkMapper",
"TextTaggingByPromptMapper",
"VggtMapper",
"VideoCameraCalibrationStaticDeepcalibMapper",
"VideoCameraCalibrationStaticMogeMapper",
"VideoCameraCalibrationDeepcalibMapper",
"VideoCameraCalibrationMogeMapper",
"VideoCaptioningFromAudioMapper",
"VideoCaptioningFromFramesMapper",
"VideoCaptioningFromSummarizerMapper",
Expand All @@ -208,6 +218,11 @@
"VideoFFmpegWrappedMapper",
"VideoHandReconstructionHaworMapper",
"VideoHandReconstructionMapper",
"VideoHandActionComputeMapper",
"VideoHandMotionSmoothMapper",
"VideoClipReassemblyMapper",
"VideoAtomicActionSegmentMapper",
"VideoTrajectoryOverlayMapper",
"VideoFaceBlurMapper",
"VideoObjectSegmentingMapper",
"VideoRemoveWatermarkMapper",
Expand Down
Loading
Loading