diff --git a/data_juicer/config/config_all.yaml b/data_juicer/config/config_all.yaml index 30d6b5f71b..835cba532c 100644 --- a/data_juicer/config/config_all.yaml +++ b/data_juicer/config/config_all.yaml @@ -560,18 +560,18 @@ process: if_output_point_maps_from_projection: True # Determines whether to output point maps directly inferred by VGGT. if_output_point_maps_from_unprojection: True # Determines whether to output point maps constructed from depth maps and camera parameters. if_output_point_tracks: True # Determines whether to output point tracks. - - video_camera_calibration_static_deepcalib_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib. + - video_camera_calibration_deepcalib_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib. model_path: "weights_10_0.02.h5" # The path to the DeepCalib Regression model. frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment. - tag_field_name: 'static_camera_calibration_deepcalib_tags' # the field name to store the tags. It's "static_camera_calibration_deepcalib_tags" in default. + tag_field_name: 'camera_calibration_deepcalib_tags' # the field name to store the tags. It's "camera_calibration_deepcalib_tags" in default. frame_dir: None # Output directory to save extracted frames. output_info_dir: None # Output directory for saving camera parameters. 
- - video_camera_calibration_static_moge_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib). + - video_camera_calibration_moge_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib). model_path: "Ruicheng/moge-2-vitl" # The path to the Moge-2 model. frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment. - tag_field_name: 'static_camera_calibration_moge_tags' # the field name to store the tags. It's "static_camera_calibration_moge_tags" in default. + tag_field_name: 'camera_calibration_moge_tags' # the field name to store the tags. It's "camera_calibration_moge_tags" in default. frame_dir: None # Output directory to save extracted frames. if_output_info: True # Whether to save the camera parameters results to an JSON file. output_info_dir: None # Output directory for saving camera parameters. 
@@ -972,6 +972,7 @@ process: min_score: 0.25 # the minimum motion score to keep samples max_score: 10000.0 # the maximum motion score to keep samples sampling_fps: 2 # the samplig rate of frames_per_second to compute optical flow + original_fps: null # the original FPS of the video from which the frames were extracted, only used when frame_field is specified size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w) max_size: null # maximum allowed for the longer edge of resized frames divisible: 1 # The number that the dimensions must be divisible by. @@ -981,6 +982,7 @@ process: min_score: 1.0 # the minimum motion score to keep samples max_score: 10000.0 # the maximum motion score to keep samples sampling_fps: 2 # the samplig rate of frames_per_second to compute optical flow + original_fps: null # the original FPS of the video from which the frames were extracted, only used when frame_field is specified size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w) max_size: null # maximum allowed for the longer edge of resized frames divisible: 8 # The number that the dimensions must be divisible by. diff --git a/data_juicer/ops/common/mano_func.py b/data_juicer/ops/common/mano_func.py index b825197820..c9c0099e3f 100644 --- a/data_juicer/ops/common/mano_func.py +++ b/data_juicer/ops/common/mano_func.py @@ -45,6 +45,25 @@ def forward(self, *args, **kwargs) -> smplx.utils.MANOOutput: mano_output.joints = joints return mano_output + @classmethod + def build_left(cls, model_path, fix_shapedirs=True): + """Build a LEFT-hand MANO model. + + HaWoR uses a separate MANO_LEFT.pkl with is_rhand=False and a + shapedirs bug-fix (https://github.com/vchoutas/smplx/issues/48). + + Args: + model_path: Path to MANO_LEFT.pkl + fix_shapedirs: Apply the left-hand shapedirs fix (default True). + + Returns: + MANO model configured for left hand. 
+ """ + model = cls(model_path=model_path, is_rhand=False) + if fix_shapedirs: + model.shapedirs[:, 0, :] *= -1 + return model + def query(self, hmr_output): batch_size = hmr_output["pred_rotmat"].shape[0] pred_rotmat = hmr_output["pred_rotmat"].reshape(batch_size, -1, 3, 3) diff --git a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py index b91e689a4e..08a4f473f6 100644 --- a/data_juicer/ops/filter/video_motion_score_filter.py +++ b/data_juicer/ops/filter/video_motion_score_filter.py @@ -53,6 +53,7 @@ def __init__( max_score: float = sys.float_info.max, frame_field: Optional[str] = None, sampling_fps: PositiveFloat = 2, + original_fps: Optional[PositiveFloat] = None, size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None, max_size: Optional[PositiveInt] = None, divisible: PositiveInt = 1, @@ -72,6 +73,11 @@ def __init__( If frame_field is None, extract frames from the video field. :param sampling_fps: The sampling rate in frames_per_second for optical flow calculations. + :param original_fps: The original FPS of the video from which the + frames were extracted. Only used when `frame_field` is specified. + When provided, frames will be sampled at `sampling_fps` rate + by computing `sampling_step = round(original_fps / sampling_fps)`. + If None, all frames will be processed without sampling. :param size: Resize frames before computing optical flow. If size is a sequence like (h, w), frame size will be matched to this. If size is an int, smaller edge of frames will be matched to this number. 
@@ -101,6 +107,7 @@ def __init__( self.min_score = min_score self.max_score = max_score self.sampling_fps = sampling_fps + self.original_fps = original_fps self.frame_field = frame_field if isinstance(size, (list, tuple)): @@ -198,7 +205,18 @@ def _compute_motion_scores_from_frames(self, frames): video_motion_scores = [] optical_flows = [] prev_frame = None - for frame in frames: + + # compute sampling step if original_fps is provided + sampling_step = 1 + if self.original_fps is not None and self.original_fps > 0: + effective_fps = min(self.sampling_fps, self.original_fps) + sampling_step = max(round(self.original_fps / effective_fps), 1) + + for frame_idx, frame in enumerate(frames): + # skip frames according to sampling_step + if sampling_step > 1 and frame_idx % sampling_step != 0: + continue + if isinstance(frame, bytes): image_array = np.frombuffer(frame, dtype=np.uint8) frame = cv2.imdecode(image_array, cv2.IMREAD_COLOR) diff --git a/data_juicer/ops/filter/video_motion_score_ptlflow_filter.py b/data_juicer/ops/filter/video_motion_score_ptlflow_filter.py index 514cbb8e37..6df11e584e 100644 --- a/data_juicer/ops/filter/video_motion_score_ptlflow_filter.py +++ b/data_juicer/ops/filter/video_motion_score_ptlflow_filter.py @@ -55,6 +55,7 @@ def __init__( ckpt_path: Optional[str] = "things", get_model_args: Optional[dict] = None, sampling_fps: PositiveFloat = 2, + original_fps: Optional[PositiveFloat] = None, size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None, max_size: Optional[PositiveInt] = None, divisible: PositiveInt = 8, @@ -70,6 +71,7 @@ def __init__( max_score, frame_field, sampling_fps, + original_fps, size, max_size, divisible, diff --git a/data_juicer/ops/filter/video_motion_score_raft_filter.py b/data_juicer/ops/filter/video_motion_score_raft_filter.py index 1bfa6f9cc4..55b160c55c 100644 --- a/data_juicer/ops/filter/video_motion_score_raft_filter.py +++ b/data_juicer/ops/filter/video_motion_score_raft_filter.py 
@@ -52,6 +52,7 @@ def __init__( max_score: float = sys.float_info.max, frame_field: Optional[str] = None, sampling_fps: PositiveFloat = 2, + original_fps: Optional[PositiveFloat] = None, size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None, max_size: Optional[PositiveInt] = None, divisible: PositiveInt = 8, @@ -67,6 +68,7 @@ def __init__( max_score, frame_field, sampling_fps, + original_fps, size, max_size, divisible, diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index 9be5b8accd..617d8e9c60 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -21,6 +21,7 @@ from .dialog_topic_detection_mapper import DialogTopicDetectionMapper from .download_file_mapper import DownloadFileMapper from .expand_macro_mapper import ExpandMacroMapper +from .export_to_lerobot_mapper import ExportToLeRobotMapper from .extract_entity_attribute_mapper import ExtractEntityAttributeMapper from .extract_entity_relation_mapper import ExtractEntityRelationMapper from .extract_event_mapper import ExtractEventMapper @@ -84,22 +85,27 @@ from .text_chunk_mapper import TextChunkMapper from .text_tagging_by_prompt_mapper import TextTaggingByPromptMapper from .vggt_mapper import VggtMapper -from .video_camera_calibration_static_deepcalib_mapper import ( - VideoCameraCalibrationStaticDeepcalibMapper, +from .video_atomic_action_segment_mapper import VideoAtomicActionSegmentMapper +from .video_camera_calibration_deepcalib_mapper import ( + VideoCameraCalibrationDeepcalibMapper, ) -from .video_camera_calibration_static_moge_mapper import ( - VideoCameraCalibrationStaticMogeMapper, +from .video_camera_calibration_droidcalib_mapper import ( + VideoCameraCalibrationDroidCalibMapper, ) -from .video_camera_pose_mapper import VideoCameraPoseMapper +from .video_camera_calibration_moge_mapper import VideoCameraCalibrationMogeMapper +from .video_camera_pose_megasam_mapper import 
VideoCameraPoseMegaSaMMapper from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper from .video_captioning_from_frames_mapper import VideoCaptioningFromFramesMapper from .video_captioning_from_summarizer_mapper import VideoCaptioningFromSummarizerMapper from .video_captioning_from_video_mapper import VideoCaptioningFromVideoMapper from .video_captioning_from_vlm_mapper import VideoCaptioningFromVLMMapper +from .video_clip_reassembly_mapper import VideoClipReassemblyMapper from .video_depth_estimation_mapper import VideoDepthEstimationMapper from .video_extract_frames_mapper import VideoExtractFramesMapper from .video_face_blur_mapper import VideoFaceBlurMapper from .video_ffmpeg_wrapped_mapper import VideoFFmpegWrappedMapper +from .video_hand_action_compute_mapper import VideoHandActionComputeMapper +from .video_hand_motion_smooth_mapper import VideoHandMotionSmoothMapper from .video_hand_reconstruction_hawor_mapper import VideoHandReconstructionHaworMapper from .video_hand_reconstruction_mapper import VideoHandReconstructionMapper from .video_object_segmenting_mapper import VideoObjectSegmentingMapper @@ -111,11 +117,14 @@ from .video_split_by_scene_mapper import VideoSplitBySceneMapper from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper +from .video_trajectory_overlay_mapper import VideoTrajectoryOverlayMapper from .video_undistort_mapper import VideoUndistortMapper from .video_whole_body_pose_estimation_mapper import VideoWholeBodyPoseEstimationMapper from .whitespace_normalization_mapper import WhitespaceNormalizationMapper __all__ = [ + "VideoCameraCalibrationDroidCalibMapper", + "VideoCameraPoseMegaSaMMapper", "AudioAddGaussianNoiseMapper", "AudioFFmpegWrappedMapper", "CalibrateQAMapper", @@ -140,6 +149,7 @@ "ExpandMacroMapper", "ExtractEntityAttributeMapper", "ExtractEntityRelationMapper", + "ExportToLeRobotMapper", "ExtractEventMapper", 
"ExtractKeywordMapper", "ExtractNicknameMapper", @@ -196,8 +206,8 @@ "TextChunkMapper", "TextTaggingByPromptMapper", "VggtMapper", - "VideoCameraCalibrationStaticDeepcalibMapper", - "VideoCameraCalibrationStaticMogeMapper", + "VideoCameraCalibrationDeepcalibMapper", + "VideoCameraCalibrationMogeMapper", "VideoCaptioningFromAudioMapper", "VideoCaptioningFromFramesMapper", "VideoCaptioningFromSummarizerMapper", @@ -208,6 +218,11 @@ "VideoFFmpegWrappedMapper", "VideoHandReconstructionHaworMapper", "VideoHandReconstructionMapper", + "VideoHandActionComputeMapper", + "VideoHandMotionSmoothMapper", + "VideoClipReassemblyMapper", + "VideoAtomicActionSegmentMapper", + "VideoTrajectoryOverlayMapper", "VideoFaceBlurMapper", "VideoObjectSegmentingMapper", "VideoRemoveWatermarkMapper", diff --git a/data_juicer/ops/mapper/export_to_lerobot_mapper.py b/data_juicer/ops/mapper/export_to_lerobot_mapper.py new file mode 100644 index 0000000000..39186c1e51 --- /dev/null +++ b/data_juicer/ops/mapper/export_to_lerobot_mapper.py @@ -0,0 +1,730 @@ +import json +import os +import shutil +import subprocess +import uuid + +import cv2 +import numpy as np +from loguru import logger + +from data_juicer.utils.constant import Fields, MetaKeys +from data_juicer.utils.lazy_loader import LazyLoader + +from ..base_op import OPERATORS, Mapper + +OP_NAME = "export_to_lerobot_mapper" + +pa = LazyLoader("pyarrow", "pyarrow") +pd = LazyLoader("pandas", "pandas") + +DEFAULT_CHUNKS_SIZE = 1000 + + +@OPERATORS.register_module(OP_NAME) +class ExportToLeRobotMapper(Mapper): + """Export processed video data to LeRobot v2.0 dataset format (LIBERO-style). + + Designed for Ray distributed execution: each actor writes files + independently using UUID-based names (no cross-process coordination). + After all actors finish, call `finalize_dataset()` once to assign + sequential episode indices, rename files, and generate metadata. 
+ + Processing phase (parallel, per actor): + staging/ + ├── data/{uuid}.parquet + ├── videos/{uuid}.mp4 + └── meta/episodes_{uuid}.jsonl + + After finalize_dataset() (single-threaded): + dataset_dir/ + ├── data/chunk-{NNN}/episode_XXXXXX.parquet + ├── videos/chunk-{NNN}/observation.images.image/episode_XXXXXX.mp4 + └── meta/ + ├── info.json + ├── tasks.jsonl + ├── episodes.jsonl + └── modality.json + """ + + def __init__( + self, + output_dir: str = "./lerobot_output", + hand_action_field: str = "hand_action_tags", + fps: int = 10, + robot_type: str = "egodex_hand", + chunks_size: int = DEFAULT_CHUNKS_SIZE, + segment_field: str = None, + frame_field: str = MetaKeys.video_frames, + *args, + **kwargs, + ): + """ + Initialization method. + + :param output_dir: Root directory for the LeRobot dataset output. + :param hand_action_field: Meta field with action/state data. + Used in whole-video mode (segment_field=None). + :param fps: Frames per second for the dataset. + :param robot_type: Robot type identifier for info.json. + :param chunks_size: Max episodes per chunk directory (default 1000). + :param segment_field: Meta field storing atomic action segments. + When set, each segment becomes a separate episode with its + own caption as task description. When None (default), falls + back to whole-video export via hand_action_field. + :param frame_field: Sample field with extracted frame image paths. + Used in segment mode to create per-segment videos. 
+ """ + super().__init__(*args, **kwargs) + self.output_dir = output_dir + self.hand_action_field = hand_action_field + self.fps = fps + self.robot_type = robot_type + self.chunks_size = chunks_size + self.segment_field = segment_field + self.frame_field = frame_field + + # Staging directories for parallel-safe writes + self.staging_data_dir = os.path.join(output_dir, "staging", "data") + self.staging_video_dir = os.path.join(output_dir, "staging", "videos") + self.staging_meta_dir = os.path.join(output_dir, "staging", "meta") + self.meta_dir = os.path.join(output_dir, "meta") + os.makedirs(self.staging_data_dir, exist_ok=True) + os.makedirs(self.staging_video_dir, exist_ok=True) + os.makedirs(self.staging_meta_dir, exist_ok=True) + os.makedirs(self.meta_dir, exist_ok=True) + + def _stage_video(self, video_source, ep_uuid): + """Copy or write video to staging with UUID name. + + :param video_source: Video file path (str) or video bytes (bytes). + :param ep_uuid: Unique episode identifier. + """ + if isinstance(video_source, bytes): + dst = os.path.join(self.staging_video_dir, f"{ep_uuid}.mp4") + if not os.path.exists(dst): + with open(dst, "wb") as f: + f.write(video_source) + else: + ext = os.path.splitext(video_source)[1] or ".mp4" + dst = os.path.join(self.staging_video_dir, f"{ep_uuid}{ext}") + if not os.path.exists(dst): + shutil.copy2(video_source, dst) + return dst + + def _stage_parquet(self, states, actions, ep_uuid, valid_frame_ids=None): + """Write parquet to staging with UUID name. + + episode_index, index, task_index are placeholders — they will + be rewritten by finalize_dataset(). + + :param valid_frame_ids: Original video frame indices corresponding + to each state/action row. Used as frame_index so that LeRobot + can align parquet rows with video frames. Falls back to + sequential 0..T-1 when not provided. 
+ """ + T = len(states) + states_arr = np.array(states, dtype=np.float32) + actions_arr = np.array(actions, dtype=np.float32) + + rows = [] + for t in range(T): + frame_id = valid_frame_ids[t] if valid_frame_ids else t + rows.append( + { + "observation.state": states_arr[t].tolist(), + "action": actions_arr[t].tolist(), + "timestamp": float(frame_id) / self.fps, + "frame_index": frame_id, + "episode_index": 0, # placeholder + "index": t, # placeholder + "task_index": 0, # placeholder + "next.done": t == T - 1, + } + ) + + df = pd.DataFrame(rows) + path = os.path.join(self.staging_data_dir, f"{ep_uuid}.parquet") + table = pa.Table.from_pandas(df) + pa.parquet.write_table(table, path) + + return path, T + + def _stage_episode_meta(self, ep_uuid, num_frames, task_desc, video_path): + """Write per-episode metadata to a UUID-named jsonl fragment. + + Each actor writes its own file — no cross-process contention. + """ + meta_path = os.path.join(self.staging_meta_dir, f"{ep_uuid}.jsonl") + if video_path and isinstance(video_path, str): + video_ext = os.path.splitext(video_path)[1] or ".mp4" + else: + video_ext = ".mp4" + entry = { + "uuid": ep_uuid, + "length": num_frames, + "task": task_desc, + "video_ext": video_ext, + } + with open(meta_path, "w", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + + @staticmethod + def _encode_frames_to_video(frame_paths, output_path, fps=30): + """Encode a sequence of frame images into an H.264 mp4 video. + + :param frame_paths: List of image file paths. + :param output_path: Destination mp4 path. + :param fps: Output video frame rate. + :return: output_path on success, None on failure. 
+ """ + if not frame_paths: + return None + + # Read all frames and collect raw bytes upfront + first = cv2.imread(frame_paths[0]) + if first is None: + return None + h, w = first.shape[:2] + + raw_chunks = [first.tobytes()] + for p in frame_paths[1:]: + img = cv2.imread(p) + if img is None: + continue + if img.shape[:2] != (h, w): + img = cv2.resize(img, (w, h)) + raw_chunks.append(img.tobytes()) + raw_data = b"".join(raw_chunks) + + cmd = [ + "ffmpeg", + "-y", + "-loglevel", + "error", + "-f", + "rawvideo", + "-vcodec", + "rawvideo", + "-pix_fmt", + "bgr24", + "-s", + f"{w}x{h}", + "-r", + str(fps), + "-i", + "-", + "-c:v", + "libx264", + "-pix_fmt", + "yuv420p", + "-preset", + "fast", + "-crf", + "23", + "-movflags", + "frag_keyframe+empty_moov", + output_path, + ] + try: + proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + _, stderr = proc.communicate(input=raw_data) + if proc.returncode != 0: + logger.warning(f"ffmpeg encode failed (rc={proc.returncode}): " f"{stderr.decode()[-300:]}") + return None + except Exception as e: + logger.warning(f"ffmpeg encode error: {e}") + return None + + return output_path if os.path.exists(output_path) else None + + def _get_frame_paths(self, sample): + """Get flat list of frame image paths from sample.""" + frame_data = sample.get(self.frame_field, []) + if not frame_data: + # Also check inside meta + frame_data = sample.get(Fields.meta, {}).get(self.frame_field, []) + # Unwrap nested list from reassembly: [[frames]] → [frames] + if isinstance(frame_data, list) and frame_data and isinstance(frame_data[0], list): + frame_data = frame_data[0] + return frame_data + + def process_single(self, sample=None, rank=None): + if Fields.meta not in sample: + return sample + + if self.segment_field: + return self._process_segments(sample) + return self._process_whole_video(sample) + + def _process_segments(self, sample): + """Per-segment export: each atomic action segment → one episode. 
+ + Each segment's VLM caption becomes the episode's task description. + A segment video is created from the extracted frame images. + """ + meta = sample.get(Fields.meta, {}) + segments = meta.get(self.segment_field, []) + if not segments: + logger.warning("No segments found, skipping export.") + sample[Fields.meta]["lerobot_export"] = [] + return sample + + all_frames = self._get_frame_paths(sample) + exported_episodes = [] + + for seg in segments: + states = seg.get("states", []) + actions = seg.get("actions", []) + + if len(states) < 2: + continue + + # Pad actions with zeros if missing (e.g. last segment) + if not actions or len(actions) < len(states): + actions = [[0.0] * 7] * len(states) + + # Task description from VLM caption + caption = seg.get("caption", {}) + if isinstance(caption, dict): + task_desc = caption.get("action", "") + else: + task_desc = str(caption) if caption else "" + + # Skip segments explicitly marked as no action + if task_desc == "N/A": + continue + + # Fallback description when caption is empty + if not task_desc: + hand = seg.get("hand_type", "hand") + task_desc = f"{hand} hand action" + + # Frame range and valid IDs + start = seg.get("start_frame", 0) + end = seg.get("end_frame", len(all_frames) - 1) + valid_fids = seg.get("valid_frame_ids", list(range(start, end + 1))) + # Convert to segment-relative frame indices (0-based) + seg_relative_fids = [fid - start for fid in valid_fids] + + ep_uuid = uuid.uuid4().hex + + # Stage parquet + parquet_path, num_frames = self._stage_parquet(states, actions, ep_uuid, seg_relative_fids) + + # Create segment video from frame images + video_dst = None + if all_frames: + seg_frame_paths = [ + all_frames[fid] + for fid in range(start, min(end + 1, len(all_frames))) + if fid < len(all_frames) and all_frames[fid] + ] + if seg_frame_paths: + video_path = os.path.join(self.staging_video_dir, f"{ep_uuid}.mp4") + video_dst = self._encode_frames_to_video(seg_frame_paths, video_path, self.fps) + + # Stage 
metadata + self._stage_episode_meta(ep_uuid, num_frames, task_desc, video_dst) + + exported_episodes.append( + { + "uuid": ep_uuid, + "parquet_path": parquet_path, + "video_path": video_dst, + "num_frames": num_frames, + "segment_id": seg.get("segment_id", -1), + "hand_type": seg.get("hand_type", "unknown"), + } + ) + + sample[Fields.meta]["lerobot_export"] = exported_episodes + return sample + + def _process_whole_video(self, sample): + """Original whole-video export: one video → one episode.""" + action_data_list = sample[Fields.meta].get(self.hand_action_field, []) + if not action_data_list: + logger.warning("No hand action data found, skipping export.") + return sample + + # Get task description from text field + task_desc = sample.get(self.text_key, "") + if not task_desc: + task_desc = "manipulate object" + + # Get video sources (paths or bytes) + video_sources = sample.get(self.video_key, []) + + # Track export results + exported_episodes = [] + + for video_idx, video_action_data in enumerate(action_data_list): + # Support both old format (flat dict) and new format + # (dict keyed by hand_type). 
+ if "states" in video_action_data: + action_data = video_action_data + else: + action_data = {} + for ht in ["right", "left"]: + hand_entry = video_action_data.get(ht, {}) + if hand_entry.get("states", []): + action_data = hand_entry + break + + states = action_data.get("states", []) + actions = action_data.get("actions", []) + valid_frame_ids = action_data.get("valid_frame_ids", None) + + if len(states) < 2: + continue + + # Generate a unique ID for this episode — no coordination + ep_uuid = uuid.uuid4().hex + + # Write parquet to staging (use valid_frame_ids as frame_index) + parquet_path, num_frames = self._stage_parquet(states, actions, ep_uuid, valid_frame_ids) + + # Copy/write video to staging (supports both path and bytes) + video_dst = None + if video_idx < len(video_sources): + video_dst = self._stage_video(video_sources[video_idx], ep_uuid) + + # Write episode metadata fragment + self._stage_episode_meta(ep_uuid, num_frames, task_desc, video_dst) + + exported_episodes.append( + { + "uuid": ep_uuid, + "parquet_path": parquet_path, + "video_path": video_dst, + "num_frames": num_frames, + } + ) + + sample[Fields.meta]["lerobot_export"] = exported_episodes + return sample + + @staticmethod + def _write_modality_json(meta_dir): + """Write modality.json following StarVLA LIBERO convention.""" + modality = { + "state": { + "x": {"start": 0, "end": 1}, + "y": {"start": 1, "end": 2}, + "z": {"start": 2, "end": 3}, + "roll": {"start": 3, "end": 4}, + "pitch": {"start": 4, "end": 5}, + "yaw": {"start": 5, "end": 6}, + "pad": {"start": 6, "end": 7}, + "gripper": {"start": 7, "end": 8}, + }, + "action": { + "x": {"start": 0, "end": 1}, + "y": {"start": 1, "end": 2}, + "z": {"start": 2, "end": 3}, + "roll": {"start": 3, "end": 4}, + "pitch": {"start": 4, "end": 5}, + "yaw": {"start": 5, "end": 6}, + "gripper": {"start": 6, "end": 7}, + }, + "video": { + "primary_image": { + "original_key": "observation.images.image", + }, + }, + "annotation": { + 
"human.action.task_description": { + "original_key": "task_index", + }, + }, + } + path = os.path.join(meta_dir, "modality.json") + with open(path, "w", encoding="utf-8") as f: + json.dump(modality, f, indent=4) + + @staticmethod + def _probe_video_resolution(video_base_dir): + """Probe the first video file to get resolution and codec info.""" + if not os.path.exists(video_base_dir): + raise ValueError(f"Video directory {video_base_dir} does not exist.") + + # Find the first video file + video_path = None + for root, _dirs, files in os.walk(video_base_dir): + for f in sorted(files): + if f.endswith((".mp4", ".avi", ".mkv")): + video_path = os.path.join(root, f) + break + if video_path: + break + + if not video_path: + raise ValueError("No video files found.") + + defaults = { + "width": 0, + "height": 0, + "channels": 3, + "codec": "av1", + "pix_fmt": "yuv420p", + } + + try: + import cv2 + + cap = cv2.VideoCapture(video_path) + if cap.isOpened(): + defaults["width"] = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + defaults["height"] = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + ret, frame = cap.read() + if ret and frame is not None: + defaults["channels"] = frame.shape[2] if frame.ndim == 3 else 1 + cap.release() + except Exception: + pass + + try: + import subprocess + + result = subprocess.run( + [ + "ffprobe", + "-v", + "quiet", + "-print_format", + "json", + "-show_streams", + "-select_streams", + "v:0", + video_path, + ], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + import json as _json + + probe = _json.loads(result.stdout) + if probe.get("streams"): + stream = probe["streams"][0] + defaults["codec"] = stream.get("codec_name", defaults["codec"]) + pix_fmt = stream.get("pix_fmt", defaults["pix_fmt"]) + defaults["pix_fmt"] = pix_fmt + # Infer channels from pix_fmt + if "gray" in pix_fmt: + defaults["channels"] = 1 + elif "a" in pix_fmt and pix_fmt not in ("yuv420p", "yuvj420p"): + defaults["channels"] = 4 + else: + 
                defaults["channels"] = 3
        except Exception:
            pass

        return defaults

    @staticmethod
    def finalize_dataset(output_dir: str, fps: int = 10, robot_type: str = "egodex_hand", chunks_size: int = DEFAULT_CHUNKS_SIZE):
        """Merge staged files into final LeRobot dataset structure.

        Must be called ONCE after all Ray actors have finished.
        This is single-threaded — no concurrency issues.

        :param output_dir: Root directory of the dataset; must contain the
            ``staging/`` tree produced by the per-actor export phase.
        :param fps: Frame rate recorded into ``info.json``.
        :param robot_type: Robot identifier recorded into ``info.json``.
        :param chunks_size: Number of episodes per ``chunk-XXX`` directory.

        Steps:
        1. Collect all episode metadata fragments from staging
        2. Sort by UUID for deterministic ordering
        3. Assign sequential episode_index (0, 1, 2, ...)
        4. Rewrite parquet files with correct episode_index / index
        5. Move video files to chunk directories
        6. Write episodes.jsonl, tasks.jsonl, info.json
        7. Clean up staging directory
        """
        staging_dir = os.path.join(output_dir, "staging")
        staging_data = os.path.join(staging_dir, "data")
        staging_video = os.path.join(staging_dir, "videos")
        staging_meta = os.path.join(staging_dir, "meta")
        meta_dir = os.path.join(output_dir, "meta")

        # NOTE(review): assumes _write_modality_json creates meta_dir if
        # missing — confirm, since episodes.jsonl is written there below.
        ExportToLeRobotMapper._write_modality_json(meta_dir)

        # 1. Collect all episode metadata fragments.
        # sorted() makes the staging-file read order deterministic; the
        # authoritative ordering is the UUID sort in step 2.
        episodes = []
        if os.path.exists(staging_meta):
            for fname in sorted(os.listdir(staging_meta)):
                if not fname.endswith(".jsonl"):
                    continue
                fpath = os.path.join(staging_meta, fname)
                with open(fpath, "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            episodes.append(json.loads(line))

        if not episodes:
            logger.warning("No staged episodes found. Nothing to finalize.")
            return

        # 2. Sort for deterministic ordering
        episodes.sort(key=lambda e: e["uuid"])

        # 3. Assign sequential episode_index, build task index.
        # global_frame_offset gives each episode its starting position in
        # the dataset-wide flat "index" column.
        task_to_index = {}
        global_frame_offset = 0

        for ep_idx, ep in enumerate(episodes):
            ep["episode_index"] = ep_idx
            ep["global_frame_offset"] = global_frame_offset
            global_frame_offset += ep["length"]

            task = ep["task"]
            if task not in task_to_index:
                task_to_index[task] = len(task_to_index)
            ep["task_index"] = task_to_index[task]

        total_episodes = len(episodes)
        total_frames = global_frame_offset
        # Ceiling division; at least one chunk even for an empty remainder.
        total_chunks = max(1, (total_episodes + chunks_size - 1) // chunks_size)

        # 4. Create chunk directories
        for chunk_idx in range(total_chunks):
            chunk_name = f"chunk-{chunk_idx:03d}"
            os.makedirs(os.path.join(output_dir, "data", chunk_name), exist_ok=True)
            os.makedirs(os.path.join(output_dir, "videos", chunk_name, "observation.images.image"), exist_ok=True)

        # 5. Process each episode: rewrite parquet, move video
        total_videos = 0
        for ep in episodes:
            ep_uuid = ep["uuid"]
            ep_idx = ep["episode_index"]
            chunk_name = f"chunk-{ep_idx // chunks_size:03d}"

            # Rewrite parquet with correct indices.
            # NOTE(review): `pa.parquet` is only bound if `pyarrow.parquet`
            # was imported somewhere — confirm the module-level imports.
            src_parquet = os.path.join(staging_data, f"{ep_uuid}.parquet")
            if os.path.exists(src_parquet):
                table = pa.parquet.read_table(src_parquet)
                df = table.to_pandas()

                df["episode_index"] = ep_idx
                df["task_index"] = ep["task_index"]
                # Dataset-wide flat index = episode offset + per-episode frame index.
                df["index"] = ep["global_frame_offset"] + df["frame_index"].values

                dst_parquet = os.path.join(output_dir, "data", chunk_name, f"episode_{ep_idx:06d}.parquet")
                out_table = pa.Table.from_pandas(df)
                pa.parquet.write_table(out_table, dst_parquet)

            # Move video file
            video_ext = ep.get("video_ext", ".mp4")
            src_video = os.path.join(staging_video, f"{ep_uuid}{video_ext}")
            if os.path.exists(src_video):
                dst_video = os.path.join(
                    output_dir, "videos", chunk_name, "observation.images.image", f"episode_{ep_idx:06d}{video_ext}"
                )
                shutil.move(src_video, dst_video)
                total_videos += 1

        # 6. Write episodes.jsonl
        episodes_path = os.path.join(meta_dir, "episodes.jsonl")
        with open(episodes_path, "w", encoding="utf-8") as f:
            for ep in episodes:
                entry = {
                    "episode_index": ep["episode_index"],
                    "length": ep["length"],
                    "task": ep["task"],
                }
                f.write(json.dumps(entry, ensure_ascii=False) + "\n")

        # 7. Write tasks.jsonl (sorted by task_index so line order matches indices)
        tasks_path = os.path.join(meta_dir, "tasks.jsonl")
        with open(tasks_path, "w", encoding="utf-8") as f:
            for task, idx in sorted(task_to_index.items(), key=lambda x: x[1]):
                f.write(json.dumps({"task_index": idx, "task": task}, ensure_ascii=False) + "\n")

        # 8. Probe video resolution
        video_base_dir = os.path.join(output_dir, "videos")
        video_info = ExportToLeRobotMapper._probe_video_resolution(video_base_dir)

        # 9. Write info.json with features
        features = {
            "observation.state": {
                "dtype": "float32",
                "shape": [8],
            },
            "action": {
                "dtype": "float32",
                "shape": [7],
            },
            "observation.images.image": {
                "dtype": "video",
                "shape": [
                    video_info["height"],
                    video_info["width"],
                    video_info["channels"],
                ],
                "names": ["height", "width", "channels"],
                "info": {
                    "video.height": video_info["height"],
                    "video.width": video_info["width"],
                    "video.channels": video_info["channels"],
                    "video.codec": video_info["codec"],
                    "video.pix_fmt": video_info["pix_fmt"],
                    "video.is_depth_map": False,
                    "video.fps": fps,
                    "has_audio": False,
                },
            },
            "timestamp": {
                "dtype": "float32",
                "shape": [1],
                "names": None,
            },
            "frame_index": {
                "dtype": "int64",
                "shape": [1],
                "names": None,
            },
            "episode_index": {
                "dtype": "int64",
                "shape": [1],
                "names": None,
            },
            "index": {
                "dtype": "int64",
                "shape": [1],
                "names": None,
            },
            "task_index": {
                "dtype": "int64",
                "shape": [1],
                "names": None,
            },
        }

        info = {
            "codebase_version": "v2.0",
            "robot_type": robot_type,
            "total_episodes": total_episodes,
            "total_frames": total_frames,
            "total_tasks": len(task_to_index),
            "total_videos": total_videos,
            "total_chunks": total_chunks,
            "chunks_size": chunks_size,
            "fps": fps,
            "splits": {"train": f"0:{total_episodes}"},
            # NOTE(review): video_path hardcodes ".mp4" while per-episode
            # video_ext may differ — confirm all staged videos are mp4.
            "data_path": "data/chunk-{episode_chunk:03d}/" "episode_{episode_index:06d}.parquet",
            "video_path": "videos/chunk-{episode_chunk:03d}/" "{video_key}/episode_{episode_index:06d}.mp4",
            "features": features,
        }

        info_path = os.path.join(meta_dir, "info.json")
        with open(info_path, "w", encoding="utf-8") as f:
            json.dump(info, f, indent=2, ensure_ascii=False)

        # 10. Clean up staging directory
        shutil.rmtree(staging_dir, ignore_errors=True)

        logger.info(
            f"LeRobot dataset finalized: {total_episodes} episodes, "
            f"{total_frames} frames, {len(task_to_index)} tasks, "
            f"{total_chunks} chunks"
        )
import numpy as np
from loguru import logger

from data_juicer.utils.constant import Fields, MetaKeys

from ..base_op import OPERATORS, Mapper

OP_NAME = "video_atomic_action_segment_mapper"


@OPERATORS.register_module(OP_NAME)
class VideoAtomicActionSegmentMapper(Mapper):
    """Segment a unified hand trajectory into atomic action clips.

    Implements the algorithm from paper https://arxiv.org/pdf/2510.21571:

        "we detect speed minima of the 3D hand wrists in the world space
        and use them as cutting points. We smooth the hand trajectory and
        select points that are local speed minima within a fixed window
        centered on each point."

    The operator reads the merged hand_action_tags (output of
    ``VideoClipReassemblyMapper``) and produces a list of segments.
    Each segment contains the start and end frame indices, plus sliced
    states / actions / joints for that segment.

    Segmentation is applied **independently** for left and right hands.
    A frame is a cutting point if it is a speed local minimum within a
    window of ``min_window`` frames on each side.

    Output field (``segment_field``) structure::

        [
            {
                "hand_type": "right",
                "segment_id": 0,
                "start_frame": 10,
                "end_frame": 45,
                "states": [...],
                "actions": [...],
                "valid_frame_ids": [...],
                "joints_world": [...],
            },
            ...
        ]
    """

    def __init__(
        self,
        hand_action_field: str = MetaKeys.hand_action_tags,
        segment_field: str = "atomic_action_segments",
        speed_smooth_window: int = 5,
        min_window: int = 15,
        min_segment_frames: int = 8,
        max_segment_frames: int = 300,
        hand_type: str = "both",
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param hand_action_field: Meta field storing merged hand action
            results (output of VideoClipReassemblyMapper).
        :param segment_field: Output meta field for atomic segments.
        :param speed_smooth_window: Window size for Savitzky-Golay
            smoothing of the speed signal before minima detection.
            Must be odd.
        :param min_window: Half-window size for local minima detection.
            A frame is a local minimum only if it is the minimum
            within ``[t - min_window, t + min_window]``.
            Larger values → fewer, longer segments.
        :param min_segment_frames: Minimum frames per segment.
            Segments shorter than this are merged with neighbors.
        :param max_segment_frames: Maximum frames per segment.
            Segments longer than this are forcibly split at
            the deepest speed minimum.
        :param hand_type: Which hand(s) to segment: 'left', 'right',
            or 'both'.
        """
        super().__init__(*args, **kwargs)
        self.hand_action_field = hand_action_field
        self.segment_field = segment_field
        self.speed_smooth_window = speed_smooth_window
        self.min_window = min_window
        self.min_segment_frames = min_segment_frames
        self.max_segment_frames = max_segment_frames
        self.hand_type = hand_type

    # ------------------------------------------------------------------
    # Speed computation & smoothing
    # ------------------------------------------------------------------
    @staticmethod
    def _compute_speed(positions: np.ndarray) -> np.ndarray:
        """Compute per-frame wrist speed from world-space positions.

        Returns an array of length N where speed[0] = 0.
        """
        if len(positions) < 2:
            return np.zeros(len(positions))
        # Euclidean distance between consecutive positions (N-1 values);
        # prepend 0 so the output aligns one speed value per frame.
        vel = np.linalg.norm(np.diff(positions, axis=0), axis=1)
        return np.concatenate([[0.0], vel])

    @staticmethod
    def _smooth_speed(
        speed: np.ndarray,
        window: int,
    ) -> np.ndarray:
        """Smooth speed signal with Savitzky-Golay filter.

        Falls back to an unsmoothed copy when the signal is too short,
        the effective window degenerates (< 3), or scipy is unavailable.
        """
        n = len(speed)
        if n < 5:
            return speed.copy()

        try:
            from scipy.signal import savgol_filter

            # savgol_filter requires an odd window no larger than the signal.
            win = min(window, n)
            if win % 2 == 0:
                win -= 1
            if win < 3:
                return speed.copy()
            return savgol_filter(speed, win, polyorder=2)
        except Exception:
            # Best-effort: missing scipy or filter failure degrades to no smoothing.
            return speed.copy()

    # ------------------------------------------------------------------
    # Local minima detection
    # ------------------------------------------------------------------
    @staticmethod
    def _find_local_minima(
        speed: np.ndarray,
        half_window: int,
    ) -> list[int]:
        """Find indices that are local speed minima within a window.

        A frame t is a local minimum if speed[t] <= speed[k] for all k
        in [t - half_window, t + half_window].

        Note: the <= comparison means plateaus of equal speed can yield
        several adjacent minima; endpoints (t=0, t=n-1) are excluded.
        """
        n = len(speed)
        minima = []
        for t in range(1, n - 1):
            lo = max(0, t - half_window)
            hi = min(n, t + half_window + 1)
            if speed[t] <= np.min(speed[lo:hi]):
                minima.append(t)
        return minima

    # ------------------------------------------------------------------
    # Segment merging (too-short) and splitting (too-long)
    # ------------------------------------------------------------------
    def _merge_short_segments(
        self,
        cut_points: list[int],
        n_frames: int,
    ) -> list[int]:
        """Remove cut points that would produce segments shorter than
        ``min_segment_frames``.

        NOTE(review): the first cut point is kept unconditionally, so the
        leading segment [0, cut_points[0]) may still be shorter than
        min_segment_frames — confirm this is intended.
        """
        if not cut_points:
            return cut_points

        filtered = [cut_points[0]]
        for cp in cut_points[1:]:
            if cp - filtered[-1] >= self.min_segment_frames:
                filtered.append(cp)
        # Check last segment: drop the final cut if the trailing segment
        # would be too short (unless it is the only cut left).
        if n_frames - filtered[-1] < self.min_segment_frames and len(filtered) > 1:
            filtered.pop()
        return filtered

    def _split_long_segments(
        self,
        cut_points: list[int],
        speed: np.ndarray,
        n_frames: int,
    ) -> list[int]:
        """Split segments exceeding ``max_segment_frames`` at the
        deepest speed minimum within the segment.

        NOTE(review): each over-long segment is split at most once, so a
        segment longer than 2 * max_segment_frames can still exceed the
        maximum afterwards — confirm single-split is intended.
        """
        boundaries = [0] + cut_points + [n_frames]
        new_cuts = []

        for i in range(len(boundaries) - 1):
            start = boundaries[i]
            end = boundaries[i + 1]
            # Re-emit every interior boundary so existing cuts are preserved.
            if i > 0:
                new_cuts.append(start)

            seg_len = end - start
            if seg_len <= self.max_segment_frames:
                continue

            # Find the deepest minimum in this segment to split;
            # only accept it if both halves satisfy the minimum length.
            mid = start + np.argmin(speed[start:end])
            if mid > start + self.min_segment_frames and end - mid > self.min_segment_frames:
                new_cuts.append(mid)

        return sorted(set(new_cuts))

    # ------------------------------------------------------------------
    # Segment one hand
    # ------------------------------------------------------------------
    def _segment_hand(
        self,
        hand_data: dict,
        hand_type: str,
    ) -> list[dict]:
        """Segment a single hand's trajectory into atomic actions."""
        states = hand_data.get("states")
        if not states or len(states) < self.min_segment_frames:
            return []

        states_arr = np.asarray(states, dtype=np.float64)
        # assumes the first three state dims are the wrist xyz world
        # position (consistent with the class docstring) — TODO confirm
        # against the producer op's state layout.
        positions = states_arr[:, 0:3]
        n_frames = len(states_arr)

        # 1. Compute and smooth speed
        speed = self._compute_speed(positions)
        smooth_speed = self._smooth_speed(speed, self.speed_smooth_window)

        # 2. Detect local minima
        minima = self._find_local_minima(smooth_speed, self.min_window)

        # 3. Merge short segments, split long ones
        cut_points = self._merge_short_segments(minima, n_frames)
        cut_points = self._split_long_segments(
            cut_points,
            smooth_speed,
            n_frames,
        )

        # 4. Build segment boundaries
        boundaries = [0] + cut_points + [n_frames]

        valid_fids = hand_data.get("valid_frame_ids", list(range(n_frames)))
        actions = hand_data.get("actions", [])
        joints_world = hand_data.get("joints_world", [])
        joints_cam = hand_data.get("joints_cam", [])

        segments = []
        for seg_idx in range(len(boundaries) - 1):
            s = boundaries[seg_idx]
            e = boundaries[seg_idx + 1]
            # Degenerate (<2-frame) spans carry no motion; skip them.
            if e - s < 2:
                continue

            seg = {
                "hand_type": hand_type,
                "segment_id": seg_idx,
                # start/end are reported in original-video frame ids when
                # available, falling back to local indices.
                "start_frame": valid_fids[s] if s < len(valid_fids) else s,
                "end_frame": (valid_fids[e - 1] if e - 1 < len(valid_fids) else e - 1),
                "states": states[s:e],
                "actions": actions[s:e] if actions else [],
                "valid_frame_ids": valid_fids[s:e],
            }
            if joints_world:
                seg["joints_world"] = joints_world[s:e]
            if joints_cam:
                seg["joints_cam"] = joints_cam[s:e]

            segments.append(seg)

        logger.debug(
            f"Segmented {hand_type} hand: {len(segments)} atomic actions "
            f"from {n_frames} frames, cut_points={cut_points}",
        )
        return segments

    # ------------------------------------------------------------------
    # Main entry
    # ------------------------------------------------------------------
    def process_single(self, sample=None, rank=None):
        """Read merged hand-action results from meta and write segments
        to ``self.segment_field``; samples without meta pass through."""
        if Fields.meta not in sample:
            return sample

        meta = sample[Fields.meta]
        hand_action_list = meta.get(self.hand_action_field)
        if not hand_action_list:
            return sample

        # After reassembly, hand_action_list is [merged_result]
        # merged_result is a dict: {"right": {...}, "left": {...}}
        hand_types = ["right", "left"] if self.hand_type == "both" else [self.hand_type]

        all_segments = []
        for clip_result in hand_action_list:
            if not clip_result or not isinstance(clip_result, dict):
                continue
            for ht in hand_types:
                hand_data = clip_result.get(ht)
                if not hand_data or not hand_data.get("states"):
                    continue
                segs = self._segment_hand(hand_data, ht)
                all_segments.extend(segs)

        # Sort segments by start_frame for consistent ordering
        all_segments.sort(key=lambda s: (s["start_frame"], s["hand_type"]))

        meta[self.segment_field] = all_segments
        logger.info(
            f"Atomic action segmentation: {len(all_segments)} segments",
        )
        return sample
import numpy as np

from data_juicer.utils.constant import CameraCalibrationKeys, Fields, MetaKeys
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

OP_NAME = "video_camera_calibration_deepcalib_mapper"

cv2 = LazyLoader("cv2", "opencv-python")


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoCameraCalibrationDeepcalibMapper(Mapper):
    """Compute the camera intrinsics and field of view (FOV)
    for a static camera using DeepCalib."""

    _accelerator = "cuda"

    def __init__(
        self,
        model_path: str = "weights_10_0.02.h5",
        frame_field: str = MetaKeys.video_frames,
        tag_field_name: str = MetaKeys.camera_calibration_deepcalib_tags,
        frame_batch_size: int = 8,
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param model_path: The path to the DeepCalib Regression model.
        :param frame_field: The field name where the video frames are stored.
        :param tag_field_name: The field name to store the tags. It's
            "camera_calibration_deepcalib_tags" in default.
        :param frame_batch_size: Number of frames to batch together for GPU
            inference. Larger values improve throughput but require more VRAM.
            Default: 8.
        :param args: extra args
        :param kwargs: extra args

        """

        super().__init__(*args, **kwargs)

        LazyLoader.check_packages(["tensorflow"])
        import keras
        from keras.applications.imagenet_utils import preprocess_input

        self.keras = keras
        self.preprocess_input = preprocess_input

        self.model_key = prepare_model(model_type="deepcalib", model_path=model_path)
        self.frame_field = frame_field
        self.tag_field_name = tag_field_name
        self.frame_batch_size = frame_batch_size

        # Network input resolution and focal-length regression range (in
        # pixels at network-input scale), matching the official DeepCalib code.
        self.INPUT_SIZE = 299
        self.focal_start = 40
        self.focal_end = 500

    def _decode_and_preprocess_frame(self, frame):
        """Decode a single frame, preprocess it for DeepCalib, and return
        (preprocessed_image, original_height, original_width).

        ``frame`` may be encoded bytes or a file path.
        """
        if isinstance(frame, bytes):
            image_array = np.frombuffer(frame, dtype=np.uint8)
            image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
        else:
            image = cv2.imread(frame)

        # NOTE(review): cv2.imdecode/imread return None on decode failure;
        # that would raise AttributeError below — confirm frames are
        # guaranteed decodable upstream.
        height, width, channels = image.shape

        # Resize to the network input size, convert BGR->RGB and scale
        # pixel values into [-1, 1].
        image = cv2.resize(image, (self.INPUT_SIZE, self.INPUT_SIZE))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image / 255.0
        image = image - 0.5
        image = image * 2.0

        return image, height, width

    def process_single(self, sample=None, rank=None):
        """Estimate per-frame intrinsics / distortion / FOV for each video
        in the sample and append them to ``sample[Fields.meta]``."""

        # Ensure the meta field exists before probing it (consistent with
        # the droidcalib mapper, which creates it explicitly).
        if Fields.meta not in sample:
            sample[Fields.meta] = {}

        # check if it's generated already
        if self.tag_field_name in sample[Fields.meta]:
            return sample

        # there is no video in this sample: pass the sample through
        # unchanged. (Fixed: previously returned [], which is not a valid
        # sample dict and silently dropped the sample downstream.)
        if self.video_key not in sample or not sample[self.video_key]:
            return sample

        # load videos
        videos_frames = sample[self.frame_field]
        model = get_model(self.model_key, rank, self.use_cuda())

        sample[Fields.meta][self.tag_field_name] = []

        for video_idx in range(len(videos_frames)):
            # Step 1: Decode and preprocess all frames, record original dimensions
            preprocessed_images = []
            heights = []
            widths = []

            for frame in videos_frames[video_idx]:
                image, h, w = self._decode_and_preprocess_frame(frame)
                preprocessed_images.append(image)
                heights.append(h)
                widths.append(w)

            num_frames = len(preprocessed_images)

            final_k_list = []
            final_xi_list = []
            final_hfov_list = []
            final_vfov_list = []

            # Step 2: Batch inference
            # All frames are resized to INPUT_SIZE x INPUT_SIZE, so they can
            # always be stacked into batches regardless of original resolution.
            for batch_start in range(0, num_frames, self.frame_batch_size):
                batch_end = min(batch_start + self.frame_batch_size, num_frames)
                batch_images = np.array(preprocessed_images[batch_start:batch_end])  # (B, H, W, C)
                batch_images = self.preprocess_input(batch_images)

                prediction = model.predict(batch_images)
                prediction_focal = prediction[0]  # (B, 1)
                prediction_dist = prediction[1]  # (B, 1)

                for i in range(batch_end - batch_start):
                    idx = batch_start + i
                    orig_w = widths[idx]
                    orig_h = heights[idx]

                    # Map the normalized regression output back to a focal
                    # length in pixels, then rescale from network-input
                    # width to the original image width.
                    curr_focal_pred = (
                        (
                            prediction_focal[i][0] * (self.focal_end + 1.0 - self.focal_start * 1.0)
                            + self.focal_start * 1.0
                        )
                        * (orig_w * 1.0)
                        / (self.INPUT_SIZE * 1.0)
                    )
                    curr_focal_pred = curr_focal_pred.item()

                    # Following DeepCalib's official codes
                    curr_dist_pred = prediction_dist[i][0] * 1.2
                    curr_dist_pred = curr_dist_pred.item()

                    # Pinhole intrinsics with principal point at image center.
                    temp_k = [[curr_focal_pred, 0, orig_w / 2], [0, curr_focal_pred, orig_h / 2], [0, 0, 1]]
                    temp_xi = curr_dist_pred

                    temp_hfov = 2 * np.arctan(orig_w / 2 / curr_focal_pred)  # rad
                    temp_vfov = 2 * np.arctan(orig_h / 2 / curr_focal_pred)

                    temp_hfov = temp_hfov.item()
                    temp_vfov = temp_vfov.item()

                    final_k_list.append(temp_k)
                    final_xi_list.append(temp_xi)
                    final_hfov_list.append(temp_hfov)
                    final_vfov_list.append(temp_vfov)

            sample[Fields.meta][self.tag_field_name].append(
                {
                    CameraCalibrationKeys.intrinsics: final_k_list,
                    CameraCalibrationKeys.xi: final_xi_list,
                    CameraCalibrationKeys.hfov: final_hfov_list,
                    CameraCalibrationKeys.vfov: final_vfov_list,
                }
            )

        return sample
import argparse
import importlib.util
import os
import subprocess
import sys
from typing import Optional

import numpy as np
from loguru import logger

from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
from data_juicer.utils.constant import CameraCalibrationKeys, Fields, MetaKeys
from data_juicer.utils.lazy_loader import LazyLoader

from ..base_op import OPERATORS, Mapper

torch = LazyLoader("torch")
cv2 = LazyLoader("cv2", "opencv-python")

OP_NAME = "video_camera_calibration_droidcalib_mapper"


@OPERATORS.register_module(OP_NAME)
class VideoCameraCalibrationDroidCalibMapper(Mapper):
    """
    Extract camera intrinsics from videos using DroidCalib.

    **Notice**: This operator will download the DroidCalib component from
    GitHub at runtime. This component follows the AGPL-3.0 license, please
    be aware for commercial use.
    """

    _accelerator = "cuda"

    def __init__(
        self,
        weights_path: Optional[str] = None,
        image_size: list = [384, 512],
        stride: int = 2,
        max_frames: int = 300,
        buffer: int = 1024,
        beta: float = 0.3,
        filter_thresh: float = 2.4,
        warmup: int = 8,
        keyframe_thresh: float = 4.0,
        frontend_thresh: float = 16.0,
        frontend_window: int = 25,
        frontend_radius: int = 2,
        frontend_nms: int = 1,
        backend_thresh: float = 22.0,
        backend_radius: int = 2,
        backend_nms: int = 3,
        upsample: bool = False,
        disable_vis: bool = True,
        verbose: bool = False,
        tag_field_name: str = MetaKeys.camera_calibration_droidcalib_tags,
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param weights_path: Path to the model weights. If None, defaults to
            ``droidcalib.pth`` inside the cloned DroidCalib repository.
        :param image_size: Target image size [height, width].
        :param stride: Frame stride.
        :param max_frames: Maximum number of frames to process.
        :param buffer: Buffer size for Droid.
        :param beta: Weight for translation / rotation components of flow.
        :param filter_thresh: Motion threshold before considering new keyframe.
        :param warmup: Number of warmup frames.
        :param keyframe_thresh: Threshold to create a new keyframe.
        :param frontend_thresh: Add edges between frames within this distance.
        :param frontend_window: Frontend optimization window.
        :param frontend_radius: Force edges between frames within radius.
        :param frontend_nms: Non-maximal suppression of edges.
        :param backend_thresh: Backend threshold.
        :param backend_radius: Backend radius.
        :param backend_nms: Backend NMS.
        :param upsample: Whether to upsample.
        :param disable_vis: Whether to disable visualization.
        :param verbose: Whether to enable verbose logging.
        :param tag_field_name: The field name to store the tags.
        """
        super().__init__(*args, **kwargs)

        self.verbose = verbose
        self._deps_ready = False

        self.droid_calib_home = os.path.join(DATA_JUICER_ASSETS_CACHE, "DroidCalib")
        self.droid_slam_path = os.path.join(self.droid_calib_home, "droid_slam")

        # Clone/build DroidCalib once at construction time. (Fixed: this
        # was previously called a second time at the end of __init__;
        # the duplicate call was redundant since _deps_ready short-circuits.)
        self._ensure_droidcalib_ready()

        self.weights_path = weights_path
        if self.weights_path is None:
            self.weights_path = os.path.join(self.droid_calib_home, "droidcalib.pth")

        self.image_size = image_size
        self.stride = stride
        self.max_frames = max_frames

        # Droid args, mirroring DroidCalib's demo.py CLI namespace.
        self.droid_args = argparse.Namespace()
        self.droid_args.weights = self.weights_path
        self.droid_args.buffer = buffer
        self.droid_args.image_size = image_size
        self.droid_args.beta = beta
        self.droid_args.filter_thresh = filter_thresh
        self.droid_args.warmup = warmup
        self.droid_args.keyframe_thresh = keyframe_thresh
        self.droid_args.frontend_thresh = frontend_thresh
        self.droid_args.frontend_window = frontend_window
        self.droid_args.frontend_radius = frontend_radius
        self.droid_args.frontend_nms = frontend_nms
        self.droid_args.backend_thresh = backend_thresh
        self.droid_args.backend_radius = backend_radius
        self.droid_args.backend_nms = backend_nms
        self.droid_args.upsample = upsample
        self.droid_args.disable_vis = disable_vis
        self.droid_args.stereo = False
        self.droid_args.camera_model = "pinhole"  # Default to pinhole
        self.droid_args.opt_intr = True
        self.tag_field_name = tag_field_name

    def _ensure_droidcalib_ready(self) -> bool:
        """Ensure DroidCalib is importable in the *current process*.

        This matters because `Dataset.map(num_proc>1)` may execute in child
        processes where `sys.path` changes from `__init__` are not present.
        """

        if not os.path.exists(self.droid_calib_home):
            logger.info("Clone DroidCalib...")
            try:
                subprocess.run(
                    [
                        "git",
                        "clone",
                        "--recursive",
                        # official repo: "https://github.com/boschresearch/DroidCalib.git",
                        "https://github.com/1van2ha0/DroidCalib.git",
                        f"{self.droid_calib_home}",
                    ],
                    check=True,
                )
            except Exception:
                raise ValueError(
                    "Failed to clone DroidCalib repository. Please ensure you "
                    "have git installed and an internet connection, or "
                    f"manually clone the repository to {self.droid_calib_home}"
                )

        if self._deps_ready:
            return True

        try:
            import torch_scatter  # noqa: F401
        except ImportError:
            # Please refer to https://github.com/rusty1s/pytorch_scatter to locate the
            # installation link that is compatible with your PyTorch and CUDA versions.
            # For example:
            # torch_version = "2.6.0"
            # cuda_version = "cu124"
            # Use the current interpreter's pip so the package lands in the
            # environment this process actually runs in.
            subprocess.run(
                [
                    sys.executable,
                    "-m",
                    "pip",
                    "install",
                    "torch-scatter",
                    # "-f",
                    # f"https://data.pyg.org/whl/torch-{torch_version}+{cuda_version}.html",
                ],
                check=True,
            )

        try:
            self._load_droid_module()
        except ImportError:
            # Rebuild the native extension; the module is (re)loaded lazily
            # in _process_video_file, so no re-import is needed here.
            subprocess.run([sys.executable, "-m", "pip", "uninstall", "droid_backends", "-y"])
            subprocess.run([sys.executable, "setup.py", "install"], cwd=self.droid_calib_home, check=True)

        self._deps_ready = True
        return True

    def _load_droid_module(self):
        """Load DroidCalib's ``droid.py`` as a module object and return it."""
        if self.droid_slam_path not in sys.path:
            sys.path.insert(1, self.droid_slam_path)

        droid_module_path = f"{self.droid_slam_path}/droid.py"
        spec = importlib.util.spec_from_file_location("droid", droid_module_path)
        if spec is None:
            raise ImportError(f"Could not load spec from {droid_module_path}")
        droid_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(droid_module)

        return droid_module

    def _image_stream(self, video_path):
        """
        Generator that yields (t, image, intrinsics, size_factor)
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return

        # Initial calibration guess (center of image)
        w0 = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h0 = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # fx, fy, cx, cy
        calib = np.array([(w0 + h0) / 2, (w0 + h0) / 2, w0 / 2, h0 / 2])
        fx, fy, cx, cy = calib

        ht, wd = self.image_size  # Target size [h, w]

        t = 0
        frame_idx = 0

        while cap.isOpened():
            ret, image = cap.read()
            if not ret:
                break

            if frame_idx % self.stride != 0:
                frame_idx += 1
                continue

            if self.max_frames and t >= self.max_frames:
                break

            h0, w0, _ = image.shape

            # Resize logic from DroidCalib's demo.py: preserve aspect ratio
            # while matching the target pixel area (ht * wd).
            ratio = np.sqrt((ht * wd) / (h0 * w0))
            h1 = int(h0 * ratio)
            w1 = int(w0 * ratio)

            image = cv2.resize(image, (w1, h1))
            image = image[: h1 - h1 % 8, : w1 - w1 % 8]  # Crop to be divisible by 8

            image_tensor = torch.as_tensor(image).permute(2, 0, 1)

            intrinsics = torch.as_tensor([fx, fy, cx, cy])

            # Adjust intrinsics for resize: fx/cx scale with width,
            # fy/cy scale with height.
            h_final, w_final = image.shape[:2]
            size_factor = [(w_final / w0), (h_final / h0)]
            intrinsics[0::2] *= size_factor[0]
            intrinsics[1::2] *= size_factor[1]

            yield t, image_tensor[None], intrinsics, size_factor

            t += 1
            frame_idx += 1

        cap.release()

    def _process_video_file(self, video_path):
        """Run DroidCalib on one video; return [fx, fy, cx, cy] at the
        original resolution, or None if the video is missing/unreadable."""
        droid_module = self._load_droid_module()
        Droid = droid_module.Droid

        if not os.path.exists(video_path):
            return None

        stream = self._image_stream(video_path)

        droid = None
        sf = None  # size factor
        intr_est_list = None

        for t, image, intrinsics, size_factor in stream:
            if droid is None:
                # Update args with actual image size
                self.droid_args.image_size = [image.shape[2], image.shape[3]]
                droid = Droid(self.droid_args)

            droid.track(t, image, intrinsics=intrinsics)
            sf = size_factor

        if droid is not None:
            # terminate() runs a final optimization pass over the frames,
            # mirroring demo.py's droid.terminate(image_stream(...)), so we
            # feed it a fresh stream over the same video.
            stream_second_pass = self._image_stream(video_path)
            traj_est, intr_est = droid.terminate(stream_second_pass)

            # Rescale intrinsics back to original resolution
            if sf:
                intr_est = intr_est.copy()
                intr_est[0:4:2] /= sf[0]
                intr_est[1:4:2] /= sf[1]

            intr_est_list = intr_est.tolist()

        if droid:
            del droid
            torch.cuda.empty_cache()

        return intr_est_list

    def process_single(self, sample, rank=None):
        """Estimate intrinsics for each video in the sample and append a
        3x3 K matrix per video to ``sample[Fields.meta]``."""
        video_paths = sample[self.video_key]
        if isinstance(video_paths, str):
            video_paths = [video_paths]

        if Fields.meta not in sample:
            sample[Fields.meta] = {}

        if not sample[Fields.meta].get(self.tag_field_name, None):
            sample[Fields.meta][self.tag_field_name] = []

        for video_path in video_paths:
            res = self._process_video_file(video_path)
            if res is not None:
                fx, fy, cx, cy = res
                res = [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]
                sample[Fields.meta][self.tag_field_name].append({CameraCalibrationKeys.intrinsics: res})

        return sample
+@OPERATORS.register_module(OP_NAME) +@LOADED_VIDEOS.register_module(OP_NAME) +class VideoCameraCalibrationMogeMapper(Mapper): + """Compute the camera intrinsics and field of view (FOV) + for a static camera using Moge-2 (more accurate + than DeepCalib).""" + + _accelerator = "cuda" + + def __init__( + self, + model_path: str = "Ruicheng/moge-2-vitl", + tag_field_name: str = MetaKeys.camera_calibration_moge_tags, + frame_field: str = MetaKeys.video_frames, + output_intrinsics: bool = True, + output_hfov: bool = True, + output_vfov: bool = True, + output_points: bool = True, + output_depth: bool = True, + output_mask: bool = True, + frame_batch_size: int = 8, + save_dir: str = None, + *args, + **kwargs, + ): + """ + Initialization method. + + :param model_path: The path to the Moge-2 model. + :param tag_field_name: The field name to store the tags. It's + "camera_calibration_moge_tags" in default. + :param frame_field: The field name where the video frames are stored. + :param output_intrinsics: Determines whether to output camera intrinsics. + :param output_hfov: Determines whether to output horizontal field of view. + :param output_vfov: Determines whether to output vertical field of view. + :param output_points: Determines whether to output point map + in OpenCV camera coordinate system (x right, y down, z forward). + For MoGe-2, the point map is in metric scale. + :param output_depth: Determines whether to output depth maps. + :param output_mask: Determines whether to output a binary mask for valid pixels. + :param frame_batch_size: Number of frames to batch together for GPU + inference. Larger values improve throughput but require more VRAM. + Default: 8. + :param save_dir: Directory to save large numpy arrays (depth, mask, + points) as .npy files instead of storing them inline. When set, + tag_dict stores file paths (strings) instead of numpy arrays, + which avoids memory limit. 
+ :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + + self.model_key = prepare_model(model_type="moge", model_path=model_path) + self.tag_field_name = tag_field_name + self.frame_field = frame_field + self.output_points = output_points + self.output_depth = output_depth + self.output_mask = output_mask + self.output_intrinsics = output_intrinsics + self.output_hfov = output_hfov + self.output_vfov = output_vfov + self.frame_batch_size = frame_batch_size + self.save_dir = save_dir + if save_dir is not None: + os.makedirs(save_dir, exist_ok=True) + assert ( + self.output_points + or self.output_depth + or self.output_mask + or self.output_intrinsics + or self.output_hfov + or self.output_vfov + ), "At least one type of output info must be True." + + def _need_anything(self, sample) -> bool: + """Whether this video still needs any requested outputs.""" + + existing_tags = sample[Fields.meta].get(self.tag_field_name) + if not existing_tags: + return True + + if not isinstance(existing_tags[0], dict): + raise ValueError( + f"The existing field {self.tag_field_name} in sample[Fields.meta] should be a sequence of dict, but get {existing_tags}." 
+ ) + + # Map: instance flag -> corresponding tag key + requirements = { + "output_intrinsics": CameraCalibrationKeys.intrinsics, + "output_hfov": CameraCalibrationKeys.hfov, + "output_vfov": CameraCalibrationKeys.vfov, + "output_points": CameraCalibrationKeys.points, + "output_depth": CameraCalibrationKeys.depth, + "output_mask": CameraCalibrationKeys.mask, + } + + for tag_dict in existing_tags: + missing_any = any(getattr(self, flag, False) and key not in tag_dict for flag, key in requirements.items()) + if missing_any: + return True + + return False + + def _save_numpy(self, arr: np.ndarray, prefix: str) -> str: + """Save a numpy array to a .npy file and return the path.""" + filename = f"{prefix}_{uuid.uuid4().hex[:12]}.npy" + path = os.path.join(self.save_dir, filename) + np.save(path, arr) + return path + + def _decode_frame(self, frame, device): + """Decode a single frame to a (3, H, W) float32 tensor and return (tensor, H, W).""" + if isinstance(frame, bytes): + image_array = np.frombuffer(frame, dtype=np.uint8) + image = cv2.imdecode(image_array, cv2.IMREAD_COLOR) + else: + image = cv2.imread(frame) + + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + h, w = image.shape[:2] + tensor = torch.tensor(image / 255, dtype=torch.float32, device=device).permute(2, 0, 1) + return tensor, h, w + + def _process_video_frames_batched(self, frames, model, device, tag_dict): + """Process all frames of one video using batched MoGe inference. + + MoGe v2 infer() natively supports (B, 3, H, W) batch input. + Same-resolution frames (within a single clip) are stacked and + inferred together for significantly better GPU utilization. 
+ """ + need_K = self.output_intrinsics and CameraCalibrationKeys.intrinsics not in tag_dict + need_hfov = self.output_hfov and CameraCalibrationKeys.hfov not in tag_dict + need_vfov = self.output_vfov and CameraCalibrationKeys.vfov not in tag_dict + need_points = self.output_points and CameraCalibrationKeys.points not in tag_dict + need_depth = self.output_depth and CameraCalibrationKeys.depth not in tag_dict + need_mask = self.output_mask and CameraCalibrationKeys.mask not in tag_dict + need_intrinsics_related = need_K or need_hfov or need_vfov + + # Step 1: Decode all frames and record their dimensions + tensors = [] + heights = [] + widths = [] + for frame in frames: + t, h, w = self._decode_frame(frame, device) + tensors.append(t) + heights.append(h) + widths.append(w) + + num_frames = len(tensors) + if num_frames == 0: + return + + # Step 2: Check if all frames share the same resolution (typical for a single clip) + all_same_size = all(h == heights[0] and w == widths[0] for h, w in zip(heights, widths)) + + final_k_list = [] + final_hfov_list = [] + final_vfov_list = [] + final_points_list = [] + final_depth_list = [] + final_mask_list = [] + + if all_same_size: + # Batched inference path: stack frames and process in chunks + height, width = heights[0], widths[0] + for batch_start in range(0, num_frames, self.frame_batch_size): + batch_end = min(batch_start + self.frame_batch_size, num_frames) + batch_tensor = torch.stack(tensors[batch_start:batch_end], dim=0) # (B, 3, H, W) + + output = model.infer(batch_tensor) + + batch_len = batch_end - batch_start + for i in range(batch_len): + if need_intrinsics_related: + intr_np = output["intrinsics"][i].cpu().numpy() + if need_K: + final_k_list.append( + [ + [float(intr_np[0][0]) * width, 0, float(intr_np[0][2]) * width], + [0, float(intr_np[1][1]) * height, float(intr_np[1][2]) * height], + [0, 0, 1], + ] + ) + if need_hfov: + final_hfov_list.append(float(2 * np.arctan(1 / 2 / intr_np[0][0]))) + if need_vfov: + 
final_vfov_list.append(float(2 * np.arctan(1 / 2 / intr_np[1][1]))) + if need_points: + final_points_list.append(output["points"][i].cpu().numpy()) + if need_depth: + final_depth_list.append(output["depth"][i].cpu().numpy()) + if need_mask: + final_mask_list.append(output["mask"][i].cpu().numpy()) + else: + # Fallback: per-frame inference when frames have different sizes + logger.debug("Frames have mixed resolutions, falling back to per-frame inference.") + for i in range(num_frames): + output = model.infer(tensors[i]) + height, width = heights[i], widths[i] + + if need_intrinsics_related: + intr_np = output["intrinsics"].cpu().numpy() + if need_K: + final_k_list.append( + [ + [float(intr_np[0][0]) * width, 0, float(intr_np[0][2]) * width], + [0, float(intr_np[1][1]) * height, float(intr_np[1][2]) * height], + [0, 0, 1], + ] + ) + if need_hfov: + final_hfov_list.append(float(2 * np.arctan(1 / 2 / intr_np[0][0]))) + if need_vfov: + final_vfov_list.append(float(2 * np.arctan(1 / 2 / intr_np[1][1]))) + if need_points: + final_points_list.append(output["points"].cpu().numpy()) + if need_depth: + final_depth_list.append(output["depth"].cpu().numpy()) + if need_mask: + final_mask_list.append(output["mask"].cpu().numpy()) + + # Step 3: Write results to tag_dict + # For large numpy arrays (depth, mask, points), save to .npy files + # when save_dir is configured, to avoid memory limit. 
+ if need_K: + tag_dict[CameraCalibrationKeys.intrinsics] = final_k_list + if need_hfov: + tag_dict[CameraCalibrationKeys.hfov] = final_hfov_list + if need_vfov: + tag_dict[CameraCalibrationKeys.vfov] = final_vfov_list + if need_points: + if self.save_dir is not None: + tag_dict[CameraCalibrationKeys.points] = [self._save_numpy(arr, "points") for arr in final_points_list] + else: + tag_dict[CameraCalibrationKeys.points] = final_points_list + if need_depth: + if self.save_dir is not None: + tag_dict[CameraCalibrationKeys.depth] = [self._save_numpy(arr, "depth") for arr in final_depth_list] + else: + tag_dict[CameraCalibrationKeys.depth] = final_depth_list + if need_mask: + if self.save_dir is not None: + tag_dict[CameraCalibrationKeys.mask] = [self._save_numpy(arr, "mask") for arr in final_mask_list] + else: + tag_dict[CameraCalibrationKeys.mask] = final_mask_list + + def process_single(self, sample=None, rank=None): + # there is no video in this sample + if self.video_key not in sample or not sample[self.video_key]: + return sample + + if sample.get(self.frame_field) is None: + return sample + + if not self._need_anything(sample): + return sample + + model = get_model(self.model_key, rank, self.use_cuda()) + + videos_frames = sample[self.frame_field] + num_videos = len(videos_frames) + + if self.tag_field_name not in sample[Fields.meta]: + sample[Fields.meta][self.tag_field_name] = [{} for _ in range(num_videos)] + + tags_list = sample[Fields.meta][self.tag_field_name] + + if len(tags_list) != num_videos: + raise ValueError( + f"The field {self.tag_field_name} in sample[Fields.meta] " + "should be a list of dict with the same length as the number of videos." 
+ ) + + if rank is not None: + device = f"cuda:{rank}" if self.use_cuda() else "cpu" + else: + device = "cuda" if self.use_cuda() else "cpu" + + for video_idx in range(num_videos): + tag_dict = tags_list[video_idx] + self._process_video_frames_batched(videos_frames[video_idx], model, device, tag_dict) + + return sample diff --git a/data_juicer/ops/mapper/video_camera_calibration_static_deepcalib_mapper.py b/data_juicer/ops/mapper/video_camera_calibration_static_deepcalib_mapper.py deleted file mode 100644 index 568336786e..0000000000 --- a/data_juicer/ops/mapper/video_camera_calibration_static_deepcalib_mapper.py +++ /dev/null @@ -1,185 +0,0 @@ -import json -import os - -import numpy as np -from pydantic import PositiveInt - -import data_juicer -from data_juicer.ops.load import load_ops -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE -from data_juicer.utils.constant import Fields, MetaKeys -from data_juicer.utils.lazy_loader import LazyLoader -from data_juicer.utils.mm_utils import SpecialTokens -from data_juicer.utils.model_utils import get_model, prepare_model - -from ..base_op import OPERATORS, Mapper -from ..op_fusion import LOADED_VIDEOS - -OP_NAME = "video_camera_calibration_static_deepcalib_mapper" - -cv2 = LazyLoader("cv2", "opencv-contrib-python") - - -@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoCameraCalibrationStaticDeepcalibMapper(Mapper): - """Compute the camera intrinsics and field of view (FOV) - for a static camera using DeepCalib.""" - - _accelerator = "cuda" - - def __init__( - self, - model_path: str = "weights_10_0.02.h5", - frame_num: PositiveInt = 3, - duration: float = 0, - tag_field_name: str = MetaKeys.static_camera_calibration_deepcalib_tags, - frame_dir: str = DATA_JUICER_ASSETS_CACHE, - if_output_info: bool = True, - output_info_dir: str = DATA_JUICER_ASSETS_CACHE, - *args, - **kwargs, - ): - """ - Initialization method. 
- - :param model_path: The path to the DeepCalib Regression model. - :param frame_num: The number of frames to be extracted uniformly from - the video. If it's 1, only the middle frame will be extracted. If - it's 2, only the first and the last frames will be extracted. If - it's larger than 2, in addition to the first and the last frames, - other frames will be extracted uniformly within the video duration. - If "duration" > 0, frame_num is the number of frames per segment. - :param duration: The duration of each segment in seconds. - If 0, frames are extracted from the entire video. - If duration > 0, the video is segmented into multiple segments - based on duration, and frames are extracted from each segment. - :param tag_field_name: The field name to store the tags. It's - "static_camera_calibration_deepcalib_tags" in default. - :param frame_dir: Output directory to save extracted frames. - :param if_output_info: Whether to save the camera parameters results - to an JSON file. - :param output_info_dir: Output directory for saving camera parameters. 
- :param args: extra args - :param kwargs: extra args - - """ - - super().__init__(*args, **kwargs) - - LazyLoader.check_packages(["tensorflow==2.20.0"]) - import keras - from keras.applications.imagenet_utils import preprocess_input - - self.keras = keras - self.preprocess_input = preprocess_input - - self.video_extract_frames_mapper_args = { - "frame_sampling_method": "uniform", - "frame_num": frame_num, - "duration": duration, - "frame_dir": frame_dir, - "frame_key": MetaKeys.video_frames, - } - self.fused_ops = load_ops([{"video_extract_frames_mapper": self.video_extract_frames_mapper_args}]) - self.model_key = prepare_model(model_type="deepcalib", model_path=model_path) - - self.frame_num = frame_num - self.duration = duration - self.tag_field_name = tag_field_name - self.frame_dir = frame_dir - self.if_output_info = if_output_info - self.output_info_dir = output_info_dir - self.INPUT_SIZE = 299 - self.focal_start = 40 - self.focal_end = 500 - - def process_single(self, sample=None, rank=None): - - # check if it's generated already - if self.tag_field_name in sample[Fields.meta]: - return sample - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - return [] - - # load videos - ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}] - - dataset = data_juicer.core.data.NestedDataset.from_list(ds_list) - dataset = self.fused_ops[0].run(dataset) - - frames_root = os.path.join(self.frame_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]) - frame_names = os.listdir(frames_root) - frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names]) - model = get_model(self.model_key, rank, self.use_cuda()) - - final_k_list = [] - final_xi_list = [] - final_hfov_list = [] - final_vfov_list = [] - - for i, path in enumerate(frames_path): - image = cv2.imread(path) - height, width, channels = image.shape - - image = cv2.resize(image, (self.INPUT_SIZE, 
self.INPUT_SIZE)) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - image = image / 255.0 - image = image - 0.5 - image = image * 2.0 - image = np.expand_dims(image, 0) - - image = self.preprocess_input(image) - - prediction = model.predict(image) - prediction_focal = prediction[0] - prediction_dist = prediction[1] - - # Scale the focal length based on the original width of the image. - curr_focal_pred = ( - (prediction_focal[0][0] * (self.focal_end + 1.0 - self.focal_start * 1.0) + self.focal_start * 1.0) - * (width * 1.0) - / (self.INPUT_SIZE * 1.0) - ) - curr_focal_pred = curr_focal_pred.item() - - # Following DeepCalib's official codes - curr_dist_pred = prediction_dist[0][0] * 1.2 - curr_dist_pred = curr_dist_pred.item() - - temp_k = [[curr_focal_pred, 0, width / 2], [0, curr_focal_pred, height / 2], [0, 0, 1]] - temp_xi = curr_dist_pred - - temp_hfov = 2 * np.arctan(width / 2 / curr_focal_pred) # rad - temp_vfov = 2 * np.arctan(height / 2 / curr_focal_pred) - - temp_hfov = temp_hfov.item() - temp_vfov = temp_vfov.item() - - final_k_list.append(temp_k) - final_xi_list.append(temp_xi) - final_hfov_list.append(temp_hfov) - final_vfov_list.append(temp_vfov) - - sample[Fields.meta][self.tag_field_name] = { - "frames_folder": frames_root, - "frame_names": frame_names, - "intrinsics_list": final_k_list, - "xi_list": final_xi_list, - "hfov_list": final_hfov_list, - "vfov_list": final_vfov_list, - } - - if self.if_output_info: - os.makedirs(self.output_info_dir, exist_ok=True) - with open( - os.path.join( - self.output_info_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0] + ".json" - ), - "w", - ) as f: - json.dump(sample[Fields.meta][self.tag_field_name], f) - - return sample diff --git a/data_juicer/ops/mapper/video_camera_calibration_static_moge_mapper.py b/data_juicer/ops/mapper/video_camera_calibration_static_moge_mapper.py deleted file mode 100644 index 7a9860b01a..0000000000 --- 
a/data_juicer/ops/mapper/video_camera_calibration_static_moge_mapper.py +++ /dev/null @@ -1,191 +0,0 @@ -import json -import os - -import numpy as np -from pydantic import PositiveInt - -import data_juicer -from data_juicer.ops.load import load_ops -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE -from data_juicer.utils.constant import Fields, MetaKeys -from data_juicer.utils.lazy_loader import LazyLoader -from data_juicer.utils.mm_utils import SpecialTokens -from data_juicer.utils.model_utils import get_model, prepare_model - -from ..base_op import OPERATORS, Mapper -from ..op_fusion import LOADED_VIDEOS - -OP_NAME = "video_camera_calibration_static_moge_mapper" - -cv2 = LazyLoader("cv2", "opencv-contrib-python") -torch = LazyLoader("torch") - - -@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoCameraCalibrationStaticMogeMapper(Mapper): - """Compute the camera intrinsics and field of view (FOV) - for a static camera using Moge-2 (more accurate - than DeepCalib).""" - - _accelerator = "cuda" - - def __init__( - self, - model_path: str = "Ruicheng/moge-2-vitl", - frame_num: PositiveInt = 3, - duration: float = 0, - tag_field_name: str = MetaKeys.static_camera_calibration_moge_tags, - frame_dir: str = DATA_JUICER_ASSETS_CACHE, - if_output_info: bool = True, - output_info_dir: str = DATA_JUICER_ASSETS_CACHE, - if_output_points_info: bool = True, - if_output_depth_info: bool = True, - if_output_mask_info: bool = True, - *args, - **kwargs, - ): - """ - Initialization method. - - :param model_path: The path to the Moge-2 model. - :param frame_num: The number of frames to be extracted uniformly from - the video. If it's 1, only the middle frame will be extracted. If - it's 2, only the first and the last frames will be extracted. If - it's larger than 2, in addition to the first and the last frames, - other frames will be extracted uniformly within the video duration. 
- If "duration" > 0, frame_num is the number of frames per segment. - :param duration: The duration of each segment in seconds. - If 0, frames are extracted from the entire video. - If duration > 0, the video is segmented into multiple segments - based on duration, and frames are extracted from each segment. - :param tag_field_name: The field name to store the tags. It's - "static_camera_calibration_moge_tags" in default. - :param frame_dir: Output directory to save extracted frames. - :param if_output_info: Whether to save the camera parameters results - to an JSON file. - :param output_info_dir: Output directory for saving camera parameters. - :param if_output_points_info: Determines whether to output point map - in OpenCV camera coordinate system (x right, y down, z forward). - For MoGe-2, the point map is in metric scale. - :param if_output_depth_info: Determines whether to output - depth maps. - :param if_output_mask_info: Determines whether to output a - binary mask for valid pixels. 
- :param args: extra args - :param kwargs: extra args - - """ - - super().__init__(*args, **kwargs) - - self.video_extract_frames_mapper_args = { - "frame_sampling_method": "uniform", - "frame_num": frame_num, - "duration": duration, - "frame_dir": frame_dir, - "frame_key": MetaKeys.video_frames, - } - self.fused_ops = load_ops([{"video_extract_frames_mapper": self.video_extract_frames_mapper_args}]) - self.model_key = prepare_model(model_type="moge", model_path=model_path) - - self.frame_num = frame_num - self.duration = duration - self.tag_field_name = tag_field_name - self.frame_dir = frame_dir - self.output_info_dir = output_info_dir - self.if_output_points_info = if_output_points_info - self.if_output_depth_info = if_output_depth_info - self.if_output_mask_info = if_output_mask_info - self.if_output_info = if_output_info - - def process_single(self, sample=None, rank=None): - - # check if it's generated already - if self.tag_field_name in sample[Fields.meta]: - return sample - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - return [] - - # load videos - ds_list = [{"text": SpecialTokens.video, "videos": sample[self.video_key]}] - - dataset = data_juicer.core.data.NestedDataset.from_list(ds_list) - dataset = self.fused_ops[0].run(dataset) - - frames_root = os.path.join(self.frame_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0]) - frame_names = os.listdir(frames_root) - frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names]) - model = get_model(self.model_key, rank, self.use_cuda()) - - final_k_list = [] - final_hfov_list = [] - final_vfov_list = [] - final_points_list = [] - final_depth_list = [] - final_mask_list = [] - - if rank is not None: - device = f"cuda:{rank}" if self.use_cuda() else "cpu" - else: - device = "cuda" if self.use_cuda() else "cpu" - - for i, path in enumerate(frames_path): - - input_image = cv2.cvtColor(cv2.imread(path), 
cv2.COLOR_BGR2RGB) - height, width, channels = input_image.shape - input_image = torch.tensor(input_image / 255, dtype=torch.float32, device=device).permute(2, 0, 1) - - output = model.infer(input_image) - - points = output["points"].cpu().tolist() - depth = output["depth"].cpu().tolist() - mask = output["mask"].cpu().tolist() - intrinsics = output["intrinsics"].cpu().tolist() - - temp_k = [ - [intrinsics[0][0] * width, 0, intrinsics[0][2] * width], - [0, intrinsics[1][1] * height, intrinsics[1][2] * height], - [0, 0, 1], - ] - - temp_hfov = 2 * np.arctan(1 / 2 / intrinsics[0][0]) # rad - temp_vfov = 2 * np.arctan(1 / 2 / intrinsics[1][1]) - - final_k_list.append(temp_k) - final_hfov_list.append(temp_hfov) - final_vfov_list.append(temp_vfov) - - if self.if_output_points_info: - final_points_list.append(points) - - if self.if_output_depth_info: - final_depth_list.append(depth) - - if self.if_output_mask_info: - final_mask_list.append(mask) - - sample[Fields.meta][self.tag_field_name] = { - "frames_folder": frames_root, - "frame_names": frame_names, - "intrinsics_list": final_k_list, - "hfov_list": final_hfov_list, - "vfov_list": final_vfov_list, - "points_list": final_points_list, - "depth_list": final_depth_list, - "mask_list": final_mask_list, - } - - if self.if_output_info: - os.makedirs(self.output_info_dir, exist_ok=True) - with open( - os.path.join( - self.output_info_dir, os.path.splitext(os.path.basename(sample[self.video_key][0]))[0] + ".json" - ), - "w", - ) as f: - json.dump(sample[Fields.meta][self.tag_field_name], f) - - return sample diff --git a/data_juicer/ops/mapper/video_camera_pose_mapper.py b/data_juicer/ops/mapper/video_camera_pose_mapper.py deleted file mode 100644 index 9b517fcd20..0000000000 --- a/data_juicer/ops/mapper/video_camera_pose_mapper.py +++ /dev/null @@ -1,337 +0,0 @@ -import os -import subprocess - -import numpy as np -from pydantic import PositiveInt - -import data_juicer -from data_juicer.ops.load import load_ops -from 
data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE -from data_juicer.utils.constant import Fields, MetaKeys -from data_juicer.utils.lazy_loader import LazyLoader - -from ..base_op import OPERATORS, Mapper -from ..op_fusion import LOADED_VIDEOS - -OP_NAME = "video_camera_pose_mapper" - -cv2 = LazyLoader("cv2", "opencv-contrib-python") -torch = LazyLoader("torch") - - -@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoCameraPoseMapper(Mapper): - """Extract camera poses by leveraging MegaSaM and MoGe-2.""" - - _accelerator = "cuda" - - def __init__( - self, - moge_model_path: str = "Ruicheng/moge-2-vitl", - frame_num: PositiveInt = 3, - duration: float = 0, - tag_field_name: str = MetaKeys.video_camera_pose_tags, - frame_dir: str = DATA_JUICER_ASSETS_CACHE, - if_output_moge_info: bool = False, - moge_output_info_dir: str = DATA_JUICER_ASSETS_CACHE, - if_save_info: bool = True, - output_info_dir: str = DATA_JUICER_ASSETS_CACHE, - max_frames: int = 1000, - *args, - **kwargs, - ): - """ - Initialization method. - - :param moge_model_path: The path to the Moge-2 model. - :param frame_num: The number of frames to be extracted uniformly from - the video. If it's 1, only the middle frame will be extracted. If - it's 2, only the first and the last frames will be extracted. If - it's larger than 2, in addition to the first and the last frames, - other frames will be extracted uniformly within the video duration. - If "duration" > 0, frame_num is the number of frames per segment. - :param duration: The duration of each segment in seconds. - If 0, frames are extracted from the entire video. - If duration > 0, the video is segmented into multiple segments - based on duration, and frames are extracted from each segment. - :param tag_field_name: The field name to store the tags. It's - "video_camera_pose_tags" in default. - :param frame_dir: Output directory to save extracted frames. 
- :param if_output_moge_info: Whether to save the results from MoGe-2 - to an JSON file. - :param moge_output_info_dir: Output directory for saving camera - parameters. - :param if_save_info: Whether to save the results to an npz file. - :param output_info_dir: Path for saving the results. - :param max_frames: Maximum number of frames to save. - :param args: extra args - :param kwargs: extra args - - """ - - super().__init__(*args, **kwargs) - - self.video_camera_calibration_static_moge_mapper_args = { - "model_path": moge_model_path, - "frame_num": frame_num, - "duration": duration, - "frame_dir": frame_dir, - "if_output_points_info": False, - "if_output_depth_info": True, - "if_output_mask_info": True, - "if_output_info": if_output_moge_info, - "output_info_dir": moge_output_info_dir, - } - self.fused_ops = load_ops( - [{"video_camera_calibration_static_moge_mapper": self.video_camera_calibration_static_moge_mapper_args}] - ) - - megasam_repo_path = os.path.join(DATA_JUICER_ASSETS_CACHE, "mega-sam") - if not os.path.exists(megasam_repo_path): - subprocess.run(["git", "clone", "https://github.com/mega-sam/mega-sam.git", megasam_repo_path], check=True) - subprocess.run( - ["git", "submodule", "update", "--init", "--recursive"], cwd=os.path.join(megasam_repo_path, "base") - ) - - with open(os.path.join(megasam_repo_path, "base", "src", "altcorr_kernel.cu"), "r") as f: - temp_file_content = f.read() - temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") - - with open(os.path.join(megasam_repo_path, "base", "src", "altcorr_kernel.cu"), "w") as f: - f.write(temp_file_content) - - with open(os.path.join(megasam_repo_path, "base", "src", "correlation_kernels.cu"), "r") as f: - temp_file_content = f.read() - temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") - - with open(os.path.join(megasam_repo_path, "base", "src", "correlation_kernels.cu"), "w") as f: - f.write(temp_file_content) - - with 
open(os.path.join(megasam_repo_path, "base", "src", "droid_kernels.cu"), "r") as f: - temp_file_content = f.read() - temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") - - with open(os.path.join(megasam_repo_path, "base", "src", "droid_kernels.cu"), "w") as f: - f.write(temp_file_content) - - with open( - os.path.join(megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_gpu.cu"), - "r", - ) as f: - temp_file_content = f.read() - temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") - - with open( - os.path.join(megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_gpu.cu"), - "w", - ) as f: - f.write(temp_file_content) - - with open( - os.path.join( - megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_cpu.cpp" - ), - "r", - ) as f: - temp_file_content = f.read() - temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") - - with open( - os.path.join( - megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_cpu.cpp" - ), - "w", - ) as f: - f.write(temp_file_content) - - try: - import droid_backends - import lietorch - - self.droid_backends = droid_backends - self.lietorch = lietorch - except ImportError: - subprocess.run(["python", "setup.py", "install"], cwd=os.path.join(megasam_repo_path, "base")) - - try: - import torch_scatter - - self.torch_scatter = torch_scatter - except ImportError: - """ "Please refer to https://github.com/rusty1s/pytorch_scatter to locate the - installation link that is compatible with your PyTorch and CUDA versions.""" - torch_version = "2.8.0" - cuda_version = "cu128" - subprocess.run( - [ - "pip", - "install", - "torch-scatter", - "-f", - f"https://data.pyg.org/whl/torch-{torch_version}+{cuda_version}.html", - ], - cwd=os.path.join(megasam_repo_path, "base"), - ) - - import sys - - sys.path.append(os.path.join(megasam_repo_path, "base", "droid_slam")) - from 
droid import Droid - from lietorch import SE3 - - self.SE3 = SE3 - self.Droid = Droid - - self.tag_field_name = tag_field_name - self.if_save_info = if_save_info - self.output_info_dir = output_info_dir - self.max_frames = max_frames - self.frame_dir = frame_dir - - def image_stream(self, frames_path, depth_list, intrinsics_list): - - for t, (image_path, depth, intrinsics) in enumerate(zip(frames_path, depth_list, intrinsics_list)): - image = cv2.imread(image_path) - h0, w0, _ = image.shape - h1 = int(h0 * np.sqrt((384 * 512) / (h0 * w0))) - w1 = int(w0 * np.sqrt((384 * 512) / (h0 * w0))) - - image = cv2.resize(image, (w1, h1), interpolation=cv2.INTER_AREA) - image = image[: h1 - h1 % 8, : w1 - w1 % 8] - image = torch.as_tensor(image).permute(2, 0, 1) - image = image[None] - - depth = torch.as_tensor(depth) - depth = torch.nn.functional.interpolate(depth[None, None], (h1, w1), mode="nearest-exact").squeeze() - depth = depth[: h1 - h1 % 8, : w1 - w1 % 8] - - mask = torch.ones_like(depth) - - intrinsics = torch.as_tensor([intrinsics[0][0], intrinsics[1][1], intrinsics[0][2], intrinsics[1][2]]) - intrinsics[0::2] *= w1 / w0 - intrinsics[1::2] *= h1 / h0 - - yield t, image, depth, intrinsics, mask - - def process_single(self, sample=None, rank=None): - # check if it's generated already - if self.tag_field_name in sample[Fields.meta]: - return sample - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - return [] - - ds_list = [{"videos": sample[self.video_key]}] - - dataset = data_juicer.core.data.NestedDataset.from_list(ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows) - dataset = dataset.map(self.fused_ops[0].process, num_proc=1, with_rank=True) - res_list = dataset.to_list() - - temp_frame_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0] - frames_root = os.path.join(self.frame_dir, temp_frame_name) - frame_names 
= os.listdir(frames_root) - frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names]) - - depth_list = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["depth_list"] - intrinsics_list = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["intrinsics_list"] - - valid_image_list = [] - valid_depth_list = [] - valid_intrinsics_list = [] - valid_mask_list = [] - - # for t, (image_path, depth, intrinsics) in enumerate(zip(frames_path, depth_list, intrinsics_list)): - - for t, image, depth, intrinsics, mask in self.image_stream(frames_path, depth_list, intrinsics_list): - - valid_image_list.append(image[0]) - valid_depth_list.append(depth) - valid_mask_list.append(mask) - valid_intrinsics_list.append(intrinsics) - - if t == 0: - args = droid_args(image_size=[image.shape[2], image.shape[3]]) - droid = self.Droid(args) - - droid.track(t, image, depth, intrinsics=intrinsics, mask=mask) - - droid.track_final(t, image, depth, intrinsics=intrinsics, mask=mask) - - traj_est, depth_est, motion_prob = droid.terminate( - self.image_stream(frames_path, depth_list, intrinsics_list), - _opt_intr=True, - full_ba=True, - scene_name=temp_frame_name, - ) - - t = traj_est.shape[0] - images = np.array(valid_image_list[:t]) - disps = 1.0 / (np.array(valid_depth_list[:t]) + 1e-6) - - poses = traj_est - intrinsics = droid.video.intrinsics[:t].cpu().numpy() - - intrinsics = intrinsics[0] * 8.0 - poses_th = torch.as_tensor(poses, device="cpu") - cam_c2w = self.SE3(poses_th).inv().matrix().numpy() - - K = np.eye(3) - K[0, 0] = intrinsics[0] - K[1, 1] = intrinsics[1] - K[0, 2] = intrinsics[2] - K[1, 2] = intrinsics[3] - - max_frames = min(self.max_frames, images.shape[0]) - - return_images = np.uint8(images[:max_frames, ::-1, ...].transpose(0, 2, 3, 1)) - return_depths = np.float32(1.0 / disps[:max_frames, ...]) - return_cam_c2w = cam_c2w[:max_frames] - - if self.if_save_info: - os.makedirs(self.output_info_dir, 
exist_ok=True) - - np.savez( - os.path.join(self.output_info_dir, "%s_droid.npz" % temp_frame_name), - images=return_images, - depths=return_depths, - intrinsic=K, - cam_c2w=return_cam_c2w, - ) - - sample[Fields.meta][self.tag_field_name] = { - "frames_folder": frames_root, - "frame_names": frame_names, - "images": return_images, - "depths": return_depths, - "intrinsic": K, - "cam_c2w": return_cam_c2w, - } - - return sample - - -class droid_args: - def __init__(self, image_size): - self.weights = os.path.join(DATA_JUICER_ASSETS_CACHE, "mega-sam", "checkpoints", "megasam_final.pth") - self.disable_vis = True - self.image_size = image_size - self.buffer = 1024 - self.stereo = False - self.filter_thresh = 2.0 - - self.warmup = 8 - self.beta = 0.3 - self.frontend_nms = 1 - self.keyframe_thresh = 2.0 - self.frontend_window = 25 - self.frontend_thresh = 12.0 - self.frontend_radius = 2 - - self.upsample = False - self.backend_thresh = 16.0 - self.backend_radius = 2 - self.backend_nms = 3 diff --git a/data_juicer/ops/mapper/video_camera_pose_megasam_mapper.py b/data_juicer/ops/mapper/video_camera_pose_megasam_mapper.py new file mode 100644 index 0000000000..0ebdd3f27b --- /dev/null +++ b/data_juicer/ops/mapper/video_camera_pose_megasam_mapper.py @@ -0,0 +1,405 @@ +import importlib +import os +import subprocess +import sys +import uuid + +import numpy as np +from loguru import logger + +from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE +from data_juicer.utils.constant import CameraCalibrationKeys, Fields, MetaKeys +from data_juicer.utils.file_utils import load_numpy +from data_juicer.utils.lazy_loader import LazyLoader + +from ..base_op import OPERATORS, Mapper +from ..op_fusion import LOADED_VIDEOS + +OP_NAME = "video_camera_pose_megasam_mapper" + +cv2 = LazyLoader("cv2", "opencv-python") +torch = LazyLoader("torch") + + +def to_standard_list(obj): + if isinstance(obj, np.ndarray): + return to_standard_list(obj.tolist()) + elif isinstance(obj, list): + 
return [to_standard_list(item) for item in obj] + elif isinstance(obj, tuple): + return tuple(to_standard_list(item) for item in obj) + else: + return obj + + +@OPERATORS.register_module(OP_NAME) +@LOADED_VIDEOS.register_module(OP_NAME) +class VideoCameraPoseMegaSaMMapper(Mapper): + """Extract camera poses by leveraging MegaSaM and MoGe-2.""" + + _accelerator = "cuda" + + def __init__( + self, + tag_field_name: str = MetaKeys.video_camera_pose_tags, + frame_field: str = MetaKeys.video_frames, + camera_calibration_field: str = "camera_calibration", + max_frames: int = 1000, + droid_buffer: int = 1024, + save_dir: str = None, + *args, + **kwargs, + ): + """ + Initialization method. + :param tag_field_name: The field name to store the tags. It's "video_camera_pose_tags" in default. + :param frame_field: The field name where the video frames are stored. + :param camera_calibration_field: The field name where the camera calibration info is stored. + :param max_frames: Maximum number of frames to save. + :param droid_buffer: DROID SLAM pre-allocated frame buffer size. + Controls GPU memory usage — each buffer slot pre-allocates + correlation volumes on GPU. Default 1024, sufficient for + clips up to ~100 frames. Reduce for shorter clips to save + VRAM, increase for longer videos. + :param save_dir: Directory to save large numpy arrays (depth, + cam_c2w) as .npy files instead of storing them inline. + When set, tag_dict stores file paths (strings) instead of + numpy arrays, which avoids memory limit. 
+ :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + self.droid_buffer = droid_buffer + self.save_dir = save_dir + if save_dir is not None: + os.makedirs(save_dir, exist_ok=True) + + megasam_repo_path = os.path.join(DATA_JUICER_ASSETS_CACHE, "mega-sam") + # droid_slam conflict with the VideoCalibrationMapper + droid_slam_home = os.path.join(megasam_repo_path, "base", "droid_slam") + + self._prepare_env(megasam_repo_path, droid_slam_home) + + droid_module_path = f"{droid_slam_home}/droid.py" + spec = importlib.util.spec_from_file_location("droid", droid_module_path) + if spec is None: + raise ImportError(f"Could not load spec from {droid_module_path}") + droid = importlib.util.module_from_spec(spec) + spec.loader.exec_module(droid) + + from lietorch import SE3 + + self.SE3 = SE3 + self.Droid = droid.Droid + + self.tag_field_name = tag_field_name + self.max_frames = max_frames + self.frame_field = frame_field + self.camera_calibration_field = camera_calibration_field + + def _prepare_env(self, megasam_repo_path, droid_slam_home): + for i in range(len(sys.path)): + if "DroidCalib/droid_slam" in sys.path[i]: + logger.warning("Removing DroidCalib/droid_slam from sys.path, it maybe conflicting with mega-sam.") + sys.path.pop(i) + break + + if droid_slam_home not in sys.path: + sys.path.insert(1, droid_slam_home) + + if not os.path.exists(megasam_repo_path): + subprocess.run( + ["git", "clone", "--recursive", "https://github.com/mega-sam/mega-sam.git", megasam_repo_path], + check=True, + ) + + with open(os.path.join(megasam_repo_path, "base", "src", "altcorr_kernel.cu"), "r") as f: + temp_file_content = f.read() + temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") + + with open(os.path.join(megasam_repo_path, "base", "src", "altcorr_kernel.cu"), "w") as f: + f.write(temp_file_content) + + with open(os.path.join(megasam_repo_path, "base", "src", "correlation_kernels.cu"), "r") as f: + temp_file_content = 
f.read() + temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") + + with open(os.path.join(megasam_repo_path, "base", "src", "correlation_kernels.cu"), "w") as f: + f.write(temp_file_content) + + with open(os.path.join(megasam_repo_path, "base", "src", "droid_kernels.cu"), "r") as f: + temp_file_content = f.read() + temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") + + with open(os.path.join(megasam_repo_path, "base", "src", "droid_kernels.cu"), "w") as f: + f.write(temp_file_content) + + with open( + os.path.join(megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_gpu.cu"), + "r", + ) as f: + temp_file_content = f.read() + temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") + + with open( + os.path.join(megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_gpu.cu"), + "w", + ) as f: + f.write(temp_file_content) + + with open( + os.path.join( + megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_cpu.cpp" + ), + "r", + ) as f: + temp_file_content = f.read() + temp_file_content = temp_file_content.replace(".type()", ".scalar_type()") + + with open( + os.path.join( + megasam_repo_path, "base", "thirdparty", "lietorch", "lietorch", "src", "lietorch_cpu.cpp" + ), + "w", + ) as f: + f.write(temp_file_content) + + try: + import torch_scatter # noqa F401 + except ImportError: + """ "Please refer to https://github.com/rusty1s/pytorch_scatter to locate the + installation link that is compatible with your PyTorch and CUDA versions.""" + # torch_version = "2.6.0" + # cuda_version = "cu124" + subprocess.run( + [ + "pip", + "install", + "torch-scatter", + # "-f", + # f"https://data.pyg.org/whl/torch-{torch_version}+{cuda_version}.html", + ], + cwd=os.path.join(megasam_repo_path, "base"), + ) + + try: + import droid_backends # noqa F401 + import lietorch # noqa F401 + except ImportError: + subprocess.run(["pip", "uninstall", 
"droid_backends", "-y"]) + subprocess.run(["python", "setup.py", "install"], cwd=os.path.join(megasam_repo_path, "base")) + + def _preprocess_stream(self, frames, depth_list, intrinsics_list): + """Pre-process all frames once and cache the results. + + Returns a list of (t, image, depth, intrinsics, mask) tuples. + Avoids the cost of repeated image decoding / resize when + image_stream is consumed multiple times (tracking + terminate). + """ + from loguru import logger as _logger + + cached = [] + for t, (raw_image, raw_depth, raw_intr) in enumerate(zip(frames, depth_list, intrinsics_list)): + if isinstance(raw_image, bytes): + image_array = np.frombuffer(raw_image, dtype=np.uint8) + image = cv2.imdecode(image_array, cv2.IMREAD_COLOR) + else: + image = cv2.imread(raw_image) + if image is None: + _logger.warning(f"MegaSaM: frame {t} decode failed, skipping.") + continue + h0, w0, _ = image.shape + h1 = int(h0 * np.sqrt((384 * 512) / (h0 * w0))) + w1 = int(w0 * np.sqrt((384 * 512) / (h0 * w0))) + + image = cv2.resize(image, (w1, h1), interpolation=cv2.INTER_AREA) + image = image[: h1 - h1 % 8, : w1 - w1 % 8] + image = torch.as_tensor(image).permute(2, 0, 1) + image = image[None] + + raw_depth = load_numpy(raw_depth) + if isinstance(raw_depth, np.ndarray): + depth = torch.from_numpy(raw_depth.astype(np.float32)) + else: + depth = torch.as_tensor(raw_depth, dtype=torch.float32) + depth = torch.nn.functional.interpolate(depth[None, None], (h1, w1), mode="nearest-exact").squeeze() + depth = depth[: h1 - h1 % 8, : w1 - w1 % 8] + + mask = torch.ones_like(depth) + + if isinstance(raw_intr, np.ndarray): + intr_arr = raw_intr.astype(np.float32) + intrinsics = torch.tensor([intr_arr[0, 0], intr_arr[1, 1], intr_arr[0, 2], intr_arr[1, 2]]) + else: + intrinsics = torch.as_tensor([raw_intr[0][0], raw_intr[1][1], raw_intr[0][2], raw_intr[1][2]]) + intrinsics[0::2] *= w1 / w0 + intrinsics[1::2] *= h1 / h0 + + cached.append((t, image, depth, intrinsics, mask)) + return cached + + 
@staticmethod + def _iter_cached(cached): + """Yield from a cached preprocessed list (same interface as image_stream).""" + for item in cached: + yield item + + def process_single(self, sample=None, rank=None): + # check if it's generated already + if self.tag_field_name in sample[Fields.meta]: + return sample + + # there is no video in this sample + if self.video_key not in sample or not sample[self.video_key]: + return sample + + videos_frames = sample[self.frame_field] + + sample[Fields.meta][self.tag_field_name] = [] + + # DROID SLAM requires at least this many frames for warmup + MIN_FRAMES_FOR_SLAM = 8 + + for video_idx in range(len(videos_frames)): + frames = videos_frames[video_idx] + + if len(frames) < MIN_FRAMES_FOR_SLAM: + from loguru import logger as _logger + + _logger.warning( + f"Video {video_idx}: only {len(frames)} frames, " + f"need >= {MIN_FRAMES_FOR_SLAM} for SLAM. " + f"Producing empty camera pose." + ) + sample[Fields.meta][self.tag_field_name].append({}) + continue + + cur_video_calibration = sample[Fields.meta][self.camera_calibration_field][video_idx] + depth_list = cur_video_calibration[CameraCalibrationKeys.depth] + intrinsics = cur_video_calibration[CameraCalibrationKeys.intrinsics] + + if isinstance(intrinsics, np.ndarray): + intrinsics = intrinsics.astype(np.float32) + else: + intrinsics = np.array(to_standard_list(intrinsics), dtype=np.float32) + + # (3, 3) -> (N, 3, 3) + if intrinsics.ndim == 2: + assert intrinsics.shape == (3, 3) + intrinsics = np.tile(intrinsics[np.newaxis, :, :], (len(frames), 1, 1)) + elif intrinsics.ndim == 3: + assert len(intrinsics) == len(frames), f"Expected {len(frames)}, got {len(intrinsics)}" + else: + raise ValueError(f"Invalid intrinsics shape: {intrinsics.shape}, expected (N, 3, 3) or (3, 3)") + + intrinsics_list = intrinsics.tolist() + + # Pre-process all frames once (avoids double decode + resize) + cached_stream = self._preprocess_stream(frames, depth_list, intrinsics_list) + + valid_image_list = [] 
+ valid_depth_list = [] + valid_intrinsics_list = [] + valid_mask_list = [] + + for t, image, depth, intr, mask in cached_stream: + + valid_image_list.append(image[0]) + valid_depth_list.append(depth) + valid_mask_list.append(mask) + valid_intrinsics_list.append(intr) + + if t == 0: + args = droid_args(image_size=[image.shape[2], image.shape[3]], buffer=self.droid_buffer) + droid = self.Droid(args) + + droid.track(t, image, depth, intrinsics=intr, mask=mask) + + droid.track_final(t, image, depth, intrinsics=intr, mask=mask) + + # Reuse cached stream for terminate (no re-decode) + traj_est, depth_est, motion_prob = droid.terminate( + self._iter_cached(cached_stream), + _opt_intr=True, + full_ba=True, + ) + + t = traj_est.shape[0] + images = np.array(valid_image_list[:t]) + disps = 1.0 / (np.array(valid_depth_list[:t]) + 1e-6) + + poses = traj_est + intrinsics = droid.video.intrinsics[:t].cpu().numpy() + + # release droid slam instance and its pre-allocated GPU buffer + # to avoid cumulative GPU memory in subsequent clip processing or other actors + del droid + torch.cuda.empty_cache() + + intrinsics = intrinsics[0] * 8.0 + poses_th = torch.as_tensor(poses, device="cpu") + cam_c2w = self.SE3(poses_th).inv().matrix().numpy() + + K = np.eye(3) + K[0, 0] = intrinsics[0] + K[1, 1] = intrinsics[1] + K[0, 2] = intrinsics[2] + K[1, 2] = intrinsics[3] + + max_frames = min(self.max_frames, images.shape[0]) + + # return_images = np.uint8(images[:max_frames, ::-1, ...].transpose(0, 2, 3, 1)) + return_depths = np.float32(1.0 / disps[:max_frames, ...]) + return_cam_c2w = cam_c2w[:max_frames] + + if self.save_dir is not None: + depth_path = os.path.join( + self.save_dir, + f"megasam_depth_{uuid.uuid4().hex[:12]}.npy", + ) + np.save(depth_path, return_depths) + c2w_path = os.path.join( + self.save_dir, + f"megasam_c2w_{uuid.uuid4().hex[:12]}.npy", + ) + np.save(c2w_path, return_cam_c2w) + sample[Fields.meta][self.tag_field_name].append( + { + CameraCalibrationKeys.depth: 
depth_path, + CameraCalibrationKeys.intrinsics: K, + CameraCalibrationKeys.cam_c2w: c2w_path, + } + ) + else: + sample[Fields.meta][self.tag_field_name].append( + { + CameraCalibrationKeys.depth: return_depths, + CameraCalibrationKeys.intrinsics: K, + CameraCalibrationKeys.cam_c2w: return_cam_c2w, + } + ) + + return sample + + +class droid_args: + def __init__(self, image_size, buffer=1024): + self.weights = os.path.join(DATA_JUICER_ASSETS_CACHE, "mega-sam", "checkpoints", "megasam_final.pth") + self.disable_vis = True + self.image_size = image_size + self.buffer = buffer + self.stereo = False + self.filter_thresh = 2.0 + + self.warmup = 8 + self.beta = 0.3 + self.frontend_nms = 1 + self.keyframe_thresh = 2.0 + self.frontend_window = 25 + self.frontend_thresh = 12.0 + self.frontend_radius = 2 + + self.upsample = False + self.backend_thresh = 16.0 + self.backend_radius = 2 + self.backend_nms = 3 diff --git a/data_juicer/ops/mapper/video_captioning_from_vlm_mapper.py b/data_juicer/ops/mapper/video_captioning_from_vlm_mapper.py index 7984c63493..85b2bde912 100644 --- a/data_juicer/ops/mapper/video_captioning_from_vlm_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_vlm_mapper.py @@ -64,6 +64,7 @@ def __init__( keep_original_sample: bool = True, prompt: Optional[str] = None, prompt_key: Optional[str] = None, + system_prompt: Optional[str] = None, model_params: Dict = None, sampling_params: Dict = None, *args, @@ -109,6 +110,9 @@ def __init__( for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default. + :param system_prompt: a system prompt string used to set the context + of the conversation and provide global guidance for the VLM model. + If it's None, no system prompt will be used. It's None in default. :param model_params: Parameters for initializing the model. :param sampling_params: Extra parameters passed to the model calling. 
e.g {'temperature': 0.9, 'top_p': 0.95} @@ -148,6 +152,7 @@ def __init__( self.keep_original_sample = keep_original_sample self.prompt = prompt self.prompt_key = prompt_key + self.system_prompt = system_prompt self.extra_args = kwargs self.enable_vllm = enable_vllm @@ -244,7 +249,13 @@ def _process_single_sample(self, ori_sample, rank=None, context=False): else: prompt_text = DEFAULT_PROMPT - messages = [ + messages = [] + if self.system_prompt: + messages.append({ + "role": "system", + "content": self.system_prompt, + }) + messages.append( { "role": "user", "content": [ @@ -258,7 +269,7 @@ def _process_single_sample(self, ori_sample, rank=None, context=False): } ] } - ] + ) if self.enable_vllm: inputs = [prepare_qwen_vl_inputs_for_vllm(messages, processor)] diff --git a/data_juicer/ops/mapper/video_clip_reassembly_mapper.py b/data_juicer/ops/mapper/video_clip_reassembly_mapper.py new file mode 100644 index 0000000000..e14081f4ba --- /dev/null +++ b/data_juicer/ops/mapper/video_clip_reassembly_mapper.py @@ -0,0 +1,934 @@ +import hashlib + +import cv2 +import numpy as np +from loguru import logger + +from data_juicer.utils.constant import CameraCalibrationKeys, Fields, MetaKeys +from data_juicer.utils.file_utils import load_numpy + +from ..base_op import OPERATORS, Mapper + +OP_NAME = "video_clip_reassembly_mapper" + + +@OPERATORS.register_module(OP_NAME) +class VideoClipReassemblyMapper(Mapper): + """Reassemble hand-action results from overlapping video clips. + + When long videos are chopped into overlapping clips (e.g. 5 s with 2 s + overlap via ``VideoSplitByDurationMapper``), each clip is processed + independently through the 3-D motion labelling pipeline. 
This operator + merges the per-clip results back into **one unified result** per original + video, including: + + * ``hand_action_tags`` — states, actions, valid_frame_ids, joints + * ``video_camera_pose_tags`` — ``cam_c2w`` array + * ``hand_reconstruction_hawor_tags`` — frame_ids converted to global + * ``video_frames`` — per-clip frame path lists merged into one global list + * ``camera_calibration_moge_tags`` — per-clip depth/intrinsics merged + * ``clips`` — replaced with the original video path + + Clip global offsets are determined automatically by **pixel-matching** + overlapping frames between consecutive clips, rather than assuming an + ideal step size. This handles ffmpeg keyframe-alignment drift that + causes actual clip boundaries to differ from the nominal + ``(split_duration - overlap_duration) * fps`` calculation. + + Reference (paper §3.1): + "To enhance efficiency, we chop long videos into overlapping + 20-second clips in this stage and recompose their results." + """ + + def __init__( + self, + hand_action_field: str = MetaKeys.hand_action_tags, + camera_pose_field: str = MetaKeys.video_camera_pose_tags, + hand_reconstruction_field: str = (MetaKeys.hand_reconstruction_hawor_tags), + frame_field: str = MetaKeys.video_frames, + moge_field: str = MetaKeys.camera_calibration_moge_tags, + clip_field: str = "clips", + video_key: str = "videos", + split_duration: float = None, + overlap_duration: float = None, + fps: float = None, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.hand_action_field = hand_action_field + self.camera_pose_field = camera_pose_field + self.hand_reconstruction_field = hand_reconstruction_field + self.frame_field = frame_field + self.moge_field = moge_field + self.clip_field = clip_field + self.video_key = video_key + self.split_duration = split_duration + self.overlap_duration = overlap_duration + self.fps = fps + + # ------------------------------------------------------------------ + # Detect actual clip 
offsets via frame content matching + # ------------------------------------------------------------------ + @staticmethod + def _frame_hash(path: str) -> str: + """Compute a fast content hash for a frame image file.""" + img = cv2.imread(path) + if img is None: + return None + return hashlib.md5(img.tobytes()).hexdigest() + + @classmethod + def _detect_clip_offsets( + cls, + per_clip_frames: list[list[str]], + nominal_step: int = None, + ) -> list[int]: + """Determine the global frame offset for each clip. + + Compares the first frame of clip[i] against frames of clip[i-1] + to find the actual overlap point. Falls back to the nominal step + if pixel matching fails. + + Returns: + List of global offsets, one per clip. offsets[0] is always 0. + """ + n_clips = len(per_clip_frames) + offsets = [0] + + for ci in range(1, n_clips): + prev_frames = per_clip_frames[ci - 1] + curr_frames = per_clip_frames[ci] + + if not curr_frames or not prev_frames: + step = nominal_step or len(prev_frames) + offsets.append(offsets[-1] + step) + continue + + # Hash the first frame of the current clip + h_curr_0 = cls._frame_hash(curr_frames[0]) + if h_curr_0 is None: + step = nominal_step or len(prev_frames) + offsets.append(offsets[-1] + step) + continue + + # Search for a match in the previous clip + # Start from a reasonable range around the nominal step + search_start = max(0, (nominal_step or len(prev_frames)) - 30) + search_end = min(len(prev_frames), (nominal_step or len(prev_frames)) + 30) + + found = False + for j in range(search_start, search_end): + h_prev = cls._frame_hash(prev_frames[j]) + if h_prev == h_curr_0: + # Verify with a second frame to avoid hash collision + if len(curr_frames) > 1 and j + 1 < len(prev_frames): + h_c1 = cls._frame_hash(curr_frames[1]) + h_p1 = cls._frame_hash(prev_frames[j + 1]) + if h_c1 != h_p1: + continue + offsets.append(offsets[-1] + j) + found = True + logger.debug( + f"Clip {ci}: detected offset {j} from clip {ci-1} " f"(global offset 
{offsets[-1]})", + ) + break + + if not found: + step = nominal_step or len(prev_frames) + offsets.append(offsets[-1] + step) + logger.warning( + f"Clip {ci}: frame matching failed, using nominal " f"step {step} (global offset {offsets[-1]})", + ) + + return offsets + + # ------------------------------------------------------------------ + # World-frame alignment between clips + # ------------------------------------------------------------------ + @staticmethod + def _compute_alignment_transforms( + cam_pose_list: list[dict], + offsets: list[int], + clip_lengths: list[int], + ) -> list[np.ndarray]: + """Compute 4x4 transforms to align each clip's world frame to clip 0. + + Uses cam_c2w matrices from the overlap region: + T_0i = c2w_0[g] @ inv(c2w_i[local]) + where g is a global frame index present in both clips. + + Returns: + List of (4, 4) transforms. transforms[0] = identity. + """ + from scipy.spatial.transform import Rotation + + n_clips = len(cam_pose_list) + transforms = [np.eye(4, dtype=np.float64)] + + for ci in range(1, n_clips): + cp_prev = cam_pose_list[ci - 1] + cp_curr = cam_pose_list[ci] + + if not cp_prev or not isinstance(cp_prev, dict) or not cp_curr or not isinstance(cp_curr, dict): + transforms.append(transforms[-1].copy()) + continue + + raw_prev = cp_prev.get(CameraCalibrationKeys.cam_c2w) + raw_curr = cp_curr.get(CameraCalibrationKeys.cam_c2w) + if raw_prev is None or raw_curr is None: + transforms.append(transforms[-1].copy()) + continue + + c2w_prev = np.asarray(load_numpy(raw_prev), dtype=np.float64) + c2w_curr = np.asarray(load_numpy(raw_curr), dtype=np.float64) + + # Overlap: clip_curr[k] corresponds to clip_prev[offsets[ci] - offsets[ci-1] + k] + step_in_prev = offsets[ci] - offsets[ci - 1] + overlap_len = clip_lengths[ci - 1] - step_in_prev + + if overlap_len <= 0: + transforms.append(transforms[-1].copy()) + continue + + # Compute T for each overlap frame, then average + Rs = [] + ts = [] + for k in range(min(overlap_len, 
len(c2w_curr))): + prev_idx = step_in_prev + k + if prev_idx >= len(c2w_prev): + break + T_local = c2w_prev[prev_idx] @ np.linalg.inv(c2w_curr[k]) + Rs.append(T_local[:3, :3]) + ts.append(T_local[:3, 3]) + + if not Rs: + transforms.append(transforms[-1].copy()) + continue + + # Robust average: median translation, mean quaternion rotation + t_median = np.median(np.array(ts), axis=0) + + quats = Rotation.from_matrix(np.array(Rs)).as_quat() + for j in range(1, len(quats)): + if np.dot(quats[j], quats[j - 1]) < 0: + quats[j] = -quats[j] + mean_quat = np.mean(quats, axis=0) + mean_quat /= np.linalg.norm(mean_quat) + R_mean = Rotation.from_quat(mean_quat).as_matrix() + + # This gives T: prev_world -> curr_world + # Chain with the accumulated transform to get clip_0 world + T_prev_curr = np.eye(4, dtype=np.float64) + T_prev_curr[:3, :3] = R_mean + T_prev_curr[:3, 3] = t_median + transforms.append(transforms[ci - 1] @ T_prev_curr) + + logger.debug( + f"Clip {ci} alignment: rotation " + f"{np.degrees(Rotation.from_matrix(R_mean).magnitude()):.1f}°, " + f"translation {np.linalg.norm(t_median):.4f}m", + ) + + return transforms + + @staticmethod + def _apply_transform_to_hand_data( + hand_data: dict, + T: np.ndarray, + ) -> dict: + """Transform a clip's hand states/joints from its local world frame + to the target world frame using rigid transform T (4x4). 
+ """ + from scipy.spatial.transform import Rotation + + if not hand_data or not hand_data.get("states"): + return hand_data + + R = T[:3, :3] + t = T[:3, 3] + R_rot = Rotation.from_matrix(R) + + states = np.asarray(hand_data["states"], dtype=np.float64) + # Transform positions + states[:, 0:3] = (R @ states[:, 0:3].T).T + t + # Transform orientations + orig_rots = Rotation.from_euler("xyz", states[:, 3:6], degrees=False) + new_rots = R_rot * orig_rots + states[:, 3:6] = new_rots.as_euler("xyz", degrees=False) + + result = dict(hand_data) + result["states"] = states.tolist() + + # Transform joints_world + jw = hand_data.get("joints_world") + if jw and len(jw) > 0: + jw_arr = np.asarray(jw, dtype=np.float64) + # (T, 21, 3) -> transform each joint + orig_shape = jw_arr.shape + flat = jw_arr.reshape(-1, 3) + flat_aligned = (R @ flat.T).T + t + result["joints_world"] = flat_aligned.reshape( + orig_shape, + ).tolist() + + # Recompute actions from transformed states + from data_juicer.ops.mapper.video_hand_motion_smooth_mapper import ( + _recompute_actions, + ) + + result["actions"] = _recompute_actions(states).tolist() + + return result + + @staticmethod + def _apply_transform_to_c2w( + c2w: np.ndarray, + T: np.ndarray, + ) -> np.ndarray: + """Transform cam_c2w array from local world to target world frame.""" + # c2w maps camera -> local_world + # T maps local_world -> target_world + # new_c2w = T @ c2w + return np.einsum("ij,njk->nik", T, c2w) + + # ------------------------------------------------------------------ + # helpers + # ------------------------------------------------------------------ + @staticmethod + def _empty_hand_result(hand_type: str) -> dict: + return { + "hand_type": hand_type, + "states": [], + "actions": [], + "valid_frame_ids": [], + "joints_cam": [], + "joints_world": [], + } + + @staticmethod + def _recompute_actions(states: np.ndarray) -> np.ndarray: + """Recompute 7-DoF actions from 8-dim states.""" + from scipy.spatial.transform import 
Rotation + + T = len(states) + actions = np.zeros((T, 7), dtype=np.float32) + for t in range(T - 1): + actions[t, 0:3] = states[t + 1, 0:3] - states[t, 0:3] + R_prev = Rotation.from_euler( + "xyz", + states[t, 3:6], + degrees=False, + ) + R_next = Rotation.from_euler( + "xyz", + states[t + 1, 3:6], + degrees=False, + ) + R_delta = R_next * R_prev.inv() + actions[t, 3:6] = R_delta.as_euler("xyz", degrees=False) + actions[t, 6] = states[t + 1, 7] + if T > 0: + actions[T - 1, 6] = states[T - 1, 7] + return actions + + def _compute_nominal_step(self) -> int: + """Compute the nominal step from constructor params (fallback).""" + if self.split_duration and self.overlap_duration and self.fps: + return int( + (self.split_duration - self.overlap_duration) * self.fps, + ) + return None + + def _blend_weight( + self, + clip_idx: int, + local_fid: int, + n_clips: int, + clip_len: int, + overlap_prev: int, + overlap_next: int, + ) -> float: + """Compute the blending weight for a frame given its clip position. + + Args: + overlap_prev: number of frames this clip overlaps with the + previous clip (ramp-up at start). + overlap_next: number of frames this clip overlaps with the + next clip (ramp-down at end). 
+ """ + w = 1.0 + if clip_idx > 0 and overlap_prev > 0 and local_fid < overlap_prev: + w = (local_fid + 1) / (overlap_prev + 1) + if clip_idx < n_clips - 1 and overlap_next > 0 and local_fid >= clip_len - overlap_next: + frames_from_end = clip_len - local_fid + w_end = frames_from_end / (overlap_next + 1) + w = min(w, w_end) + return w + + # ------------------------------------------------------------------ + # video_frames merge + # ------------------------------------------------------------------ + @staticmethod + def _merge_video_frames( + per_clip_frames: list[list[str]], + offsets: list[int], + ) -> list[str]: + """Merge per-clip frame path lists into one global ordered list.""" + total_frames = 0 + for ci, clip_frames in enumerate(per_clip_frames): + end = offsets[ci] + len(clip_frames) + if end > total_frames: + total_frames = end + + merged = [None] * total_frames + for ci, clip_frames in enumerate(per_clip_frames): + offset = offsets[ci] + for local_fid, frame_path in enumerate(clip_frames): + gfid = offset + local_fid + if gfid < total_frames and merged[gfid] is None: + merged[gfid] = frame_path + + # Fill any remaining None slots + for i in range(len(merged)): + if merged[i] is None: + for delta in range(1, len(merged)): + if i - delta >= 0 and merged[i - delta] is not None: + merged[i] = merged[i - delta] + break + if i + delta < len(merged) and merged[i + delta] is not None: + merged[i] = merged[i + delta] + break + + return merged + + # ------------------------------------------------------------------ + # moge calibration merge + # ------------------------------------------------------------------ + @staticmethod + def _merge_moge( + moge_list: list[dict], + offsets: list[int], + ) -> dict: + """Merge per-clip MoGe calibration results into one global result.""" + total_frames = 0 + for ci, m in enumerate(moge_list): + if not m or not isinstance(m, dict): + continue + for k in ("depth", "hfov", "intrinsics", "vfov"): + v = m.get(k) + if isinstance(v, 
list) and len(v) > 0: + end = offsets[ci] + len(v) + if end > total_frames: + total_frames = end + break + + if total_frames == 0: + return moge_list[0] if moge_list else {} + + per_frame_keys = set() + scalar_fields = {} + for m in moge_list: + if not m or not isinstance(m, dict): + continue + for k, v in m.items(): + if isinstance(v, list) and len(v) > 1: + per_frame_keys.add(k) + elif k not in scalar_fields: + scalar_fields[k] = v + + merged = dict(scalar_fields) + for key in per_frame_keys: + arr = [None] * total_frames + for ci, m in enumerate(moge_list): + if not m or not isinstance(m, dict): + continue + vals = m.get(key) + if not isinstance(vals, list): + continue + offset = offsets[ci] + for local_fid, val in enumerate(vals): + gfid = offset + local_fid + if gfid < total_frames and arr[gfid] is None: + arr[gfid] = val + for i in range(len(arr)): + if arr[i] is None: + for delta in range(1, len(arr)): + if i - delta >= 0 and arr[i - delta] is not None: + arr[i] = arr[i - delta] + break + if i + delta < len(arr) and arr[i + delta] is not None: + arr[i] = arr[i + delta] + break + merged[key] = arr + + return merged + + # ------------------------------------------------------------------ + # hand action merge + # ------------------------------------------------------------------ + def _merge_hand_across_clips( + self, + clips_hand_data: list, + hand_type: str, + n_clips: int, + offsets: list[int], + clip_lengths: list[int], + ) -> dict: + """Merge one hand's data across all clips into a single trajectory.""" + clip_entries = [] + for clip_idx, hand_data in enumerate(clips_hand_data): + if not hand_data or not hand_data.get("states"): + continue + global_offset = offsets[clip_idx] + local_ids = hand_data["valid_frame_ids"] + global_ids = [fid + global_offset for fid in local_ids] + + jw = hand_data.get("joints_world") + jc = hand_data.get("joints_cam") + clip_entries.append( + { + "clip_idx": clip_idx, + "local_ids": local_ids, + "global_ids": global_ids, + 
"states": np.asarray(hand_data["states"], dtype=np.float64), + "joints_world": (np.asarray(jw, dtype=np.float64) if jw and len(jw) > 0 else None), + "joints_cam": (np.asarray(jc, dtype=np.float64) if jc and len(jc) > 0 else None), + } + ) + + if not clip_entries: + return self._empty_hand_result(hand_type) + + if len(clip_entries) == 1: + e = clip_entries[0] + src = clips_hand_data[e["clip_idx"]] + return { + "hand_type": hand_type, + "states": src["states"], + "actions": src["actions"], + "valid_frame_ids": e["global_ids"], + "joints_cam": src.get("joints_cam", []), + "joints_world": src.get("joints_world", []), + } + + # Global frame range + all_gids = [] + for e in clip_entries: + all_gids.extend(e["global_ids"]) + min_fid = min(all_gids) + max_fid = max(all_gids) + n_total = max_fid - min_fid + 1 + + state_sum = np.zeros((n_total, 8), dtype=np.float64) + weight_sum = np.zeros(n_total, dtype=np.float64) + has_jw = any(e["joints_world"] is not None for e in clip_entries) + has_jc = any(e["joints_cam"] is not None for e in clip_entries) + jw_sum = np.zeros((n_total, 21, 3), dtype=np.float64) if has_jw else None + jc_sum = np.zeros((n_total, 21, 3), dtype=np.float64) if has_jc else None + + for entry in clip_entries: + ci = entry["clip_idx"] + clip_len = clip_lengths[ci] + # Compute overlap with previous clip + if ci > 0: + prev_end = offsets[ci - 1] + clip_lengths[ci - 1] + overlap_prev = max(0, prev_end - offsets[ci]) + else: + overlap_prev = 0 + # Compute overlap with next clip + if ci < n_clips - 1: + next_offset = offsets[ci + 1] + this_end = offsets[ci] + clip_len + overlap_next = max(0, this_end - next_offset) + else: + overlap_next = 0 + + for i, gfid in enumerate(entry["global_ids"]): + local_fid = entry["local_ids"][i] + idx = gfid - min_fid + w = self._blend_weight( + ci, + local_fid, + n_clips, + clip_len, + overlap_prev, + overlap_next, + ) + state_sum[idx] += entry["states"][i] * w + weight_sum[idx] += w + if has_jw and entry["joints_world"] is not 
None and i < len(entry["joints_world"]): + jw_sum[idx] += entry["joints_world"][i] * w + if has_jc and entry["joints_cam"] is not None and i < len(entry["joints_cam"]): + jc_sum[idx] += entry["joints_cam"][i] * w + + valid_mask = weight_sum > 1e-8 + valid_idx = np.where(valid_mask)[0] + if len(valid_idx) == 0: + return self._empty_hand_result(hand_type) + + w_col = weight_sum[valid_idx, np.newaxis] + merged_states = state_sum[valid_idx] / w_col + merged_fids = (valid_idx + min_fid).tolist() + + merged_jw = None + if has_jw: + merged_jw = (jw_sum[valid_idx] / weight_sum[valid_idx, np.newaxis, np.newaxis]).tolist() + + merged_jc = None + if has_jc: + merged_jc = (jc_sum[valid_idx] / weight_sum[valid_idx, np.newaxis, np.newaxis]).tolist() + + actions = self._recompute_actions(merged_states) + + return { + "hand_type": hand_type, + "states": merged_states.astype(np.float32).tolist(), + "actions": actions.tolist(), + "valid_frame_ids": merged_fids, + "joints_cam": merged_jc if merged_jc else [], + "joints_world": merged_jw if merged_jw else [], + } + + # ------------------------------------------------------------------ + # camera pose (cam_c2w) merge + # ------------------------------------------------------------------ + def _merge_cam_c2w( + self, + cam_pose_list: list[dict], + offsets: list[int], + clip_lengths: list[int], + ) -> dict: + """Merge per-clip cam_c2w (N,4,4) arrays into a single global array.""" + n_clips = len(cam_pose_list) + + clip_c2ws: list[tuple[int, np.ndarray]] = [] + for ci, cp in enumerate(cam_pose_list): + if not cp or not isinstance(cp, dict): + continue + raw = cp.get(CameraCalibrationKeys.cam_c2w) + if raw is None: + continue + arr = np.asarray(load_numpy(raw), dtype=np.float64) + clip_c2ws.append((ci, arr)) + + if not clip_c2ws: + return cam_pose_list[0] if cam_pose_list else {} + + max_global = 0 + for ci, arr in clip_c2ws: + end = offsets[ci] + len(arr) + if end > max_global: + max_global = end + + c2w_sum = np.zeros((max_global, 4, 4), 
dtype=np.float64) + w_sum = np.zeros(max_global, dtype=np.float64) + + for ci, arr in clip_c2ws: + offset = offsets[ci] + clip_len = clip_lengths[ci] + # Compute overlap with previous clip + if ci > 0: + prev_end = offsets[ci - 1] + clip_lengths[ci - 1] + overlap_prev = max(0, prev_end - offset) + else: + overlap_prev = 0 + # Compute overlap with next clip + if ci < n_clips - 1: + next_offset = offsets[ci + 1] + this_end = offset + clip_len + overlap_next = max(0, this_end - next_offset) + else: + overlap_next = 0 + + for local_fid in range(len(arr)): + gfid = offset + local_fid + w = self._blend_weight( + ci, + local_fid, + n_clips, + clip_len, + overlap_prev, + overlap_next, + ) + c2w_sum[gfid] += arr[local_fid] * w + w_sum[gfid] += w + + valid = w_sum > 1e-8 + for gfid in range(max_global): + if valid[gfid]: + c2w_sum[gfid] /= w_sum[gfid] + else: + c2w_sum[gfid] = np.eye(4) + + merged: dict = {} + for cp in cam_pose_list: + if cp and isinstance(cp, dict): + for k, v in cp.items(): + if k != CameraCalibrationKeys.cam_c2w and k not in merged: + merged[k] = v + break + + merged[CameraCalibrationKeys.cam_c2w] = c2w_sum.tolist() + return merged + + # ------------------------------------------------------------------ + # hawor reconstruction merge + # ------------------------------------------------------------------ + def _merge_hawor( + self, + hawor_list: list[dict], + offsets: list[int], + ) -> dict: + """Merge per-clip HaWoR results: convert local frame_ids to global.""" + n_clips = len(hawor_list) + + merged: dict = {} + for hw in hawor_list: + if hw and isinstance(hw, dict): + for k in ("fov_x", "img_focal"): + if k in hw and k not in merged: + merged[k] = hw[k] + break + + hand_types: set[str] = set() + for hw in hawor_list: + if hw and isinstance(hw, dict): + for k in ("left", "right"): + if k in hw and isinstance(hw[k], dict): + hand_types.add(k) + + for ht in sorted(hand_types): + seen_global: set[int] = set() + merged_fids: list[int] = [] + merged_transl: 
list = [] + merged_orient: list = [] + merged_pose: list = [] + merged_betas: list = [] + merged_joints_cam: list = [] + + for ci in range(n_clips): + hw = hawor_list[ci] + if not hw or not isinstance(hw, dict): + continue + hand = hw.get(ht, {}) + if not hand or not isinstance(hand, dict): + continue + + fids = hand.get("frame_ids", []) + offset = offsets[ci] + transl = hand.get("transl", []) + orient = hand.get("global_orient", []) + pose = hand.get("hand_pose", []) + betas = hand.get("betas", []) + jc = hand.get("joints_cam", None) + + for i, local_fid in enumerate(fids): + gfid = local_fid + offset + if gfid in seen_global: + continue + seen_global.add(gfid) + merged_fids.append(gfid) + if i < len(transl): + merged_transl.append(transl[i]) + if i < len(orient): + merged_orient.append(orient[i]) + if i < len(pose): + merged_pose.append(pose[i]) + if i < len(betas): + merged_betas.append(betas[i]) + if jc is not None and i < len(jc): + merged_joints_cam.append(jc[i]) + + if merged_fids: + order = np.argsort(merged_fids).tolist() + merged_fids = [merged_fids[j] for j in order] + merged_transl = [merged_transl[j] for j in order] if merged_transl else [] + merged_orient = [merged_orient[j] for j in order] if merged_orient else [] + merged_pose = [merged_pose[j] for j in order] if merged_pose else [] + merged_betas = [merged_betas[j] for j in order] if merged_betas else [] + merged_joints_cam = [merged_joints_cam[j] for j in order] if merged_joints_cam else None + + merged[ht] = { + "frame_ids": merged_fids, + "transl": merged_transl, + "global_orient": merged_orient, + "hand_pose": merged_pose, + "betas": merged_betas, + } + if merged_joints_cam is not None: + merged[ht]["joints_cam"] = merged_joints_cam + + return merged + + # ------------------------------------------------------------------ + # main entry + # ------------------------------------------------------------------ + def process_single(self, sample=None, rank=None): + if Fields.meta not in sample: + 
return sample + + meta = sample[Fields.meta] + hand_action_list = meta.get(self.hand_action_field) + + # --- detect actual clip offsets from frame content --- + per_clip_frames = sample.get(self.frame_field) + has_multi_clips = ( + per_clip_frames + and isinstance(per_clip_frames, list) + and len(per_clip_frames) > 1 + and isinstance(per_clip_frames[0], list) + ) + + if not has_multi_clips: + return sample + + n_clips = len(per_clip_frames) + clip_lengths = [len(cf) for cf in per_clip_frames] + nominal_step = self._compute_nominal_step() + + offsets = self._detect_clip_offsets(per_clip_frames, nominal_step) + total_frames = max(off + clen for off, clen in zip(offsets, clip_lengths)) + logger.info( + f"Clip offsets: {offsets}, clip_lengths: {clip_lengths}, " f"total_frames: {total_frames}", + ) + + # --- merge video_frames --- + try: + merged_frames = self._merge_video_frames( + per_clip_frames, + offsets, + ) + sample[self.frame_field] = [merged_frames] + logger.debug( + f"Merged {n_clips} clip frame lists into " f"{len(merged_frames)} global frames", + ) + except Exception as e: + logger.warning(f"video_frames reassembly failed: {e}") + + # --- merge moge --- + moge_list = meta.get(self.moge_field) + if moge_list and isinstance(moge_list, list) and len(moge_list) > 1: + try: + merged_moge = self._merge_moge(moge_list, offsets) + meta[self.moge_field] = [merged_moge] + except Exception as e: + logger.warning(f"MoGe reassembly failed: {e}") + + # --- compute world-frame alignment transforms --- + cam_pose_list = meta.get(self.camera_pose_field) + align_transforms = None + if cam_pose_list and len(cam_pose_list) > 1: + try: + align_transforms = self._compute_alignment_transforms( + cam_pose_list, + offsets, + clip_lengths, + ) + except Exception as e: + logger.warning(f"Alignment transform computation failed: {e}") + + # --- align hand actions to clip 0's world frame, then merge --- + if hand_action_list and len(hand_action_list) > 1: + if align_transforms: + for ci 
in range(1, len(hand_action_list)): + T = align_transforms[ci] + if hand_action_list[ci] and not np.allclose(T, np.eye(4)): + for ht in hand_action_list[ci]: + try: + hand_action_list[ci][ht] = self._apply_transform_to_hand_data( + hand_action_list[ci][ht], + T, + ) + except Exception as e: + logger.warning( + f"Alignment failed clip {ci} {ht}: {e}", + ) + + # --- merge hand actions --- + if hand_action_list and len(hand_action_list) > 1: + hand_types: set[str] = set() + for clip_result in hand_action_list: + if clip_result: + hand_types.update(clip_result.keys()) + + merged_result: dict = {} + for ht in sorted(hand_types): + per_clip = [(cr.get(ht) if cr else None) for cr in hand_action_list] + try: + merged_result[ht] = self._merge_hand_across_clips( + per_clip, + ht, + n_clips, + offsets, + clip_lengths, + ) + except Exception as e: + logger.warning( + f"Hand '{ht}' reassembly failed: {e}. " f"Falling back to first clip.", + ) + first_valid = next( + (d for d in per_clip if d and d.get("states")), + None, + ) + merged_result[ht] = first_valid if first_valid else self._empty_hand_result(ht) + + meta[self.hand_action_field] = [merged_result] + + # --- align cam_c2w to clip 0's world frame, then merge --- + cam_pose_list = meta.get(self.camera_pose_field) + if cam_pose_list and len(cam_pose_list) > 1: + # Apply alignment transforms to each clip's c2w before merging + if align_transforms: + for ci in range(1, len(cam_pose_list)): + cp = cam_pose_list[ci] + if not cp or not isinstance(cp, dict): + continue + raw = cp.get(CameraCalibrationKeys.cam_c2w) + if raw is None: + continue + T = align_transforms[ci] + if np.allclose(T, np.eye(4)): + continue + try: + c2w_arr = np.asarray( + load_numpy(raw), + dtype=np.float64, + ) + aligned = self._apply_transform_to_c2w(c2w_arr, T) + cam_pose_list[ci] = dict(cp) + cam_pose_list[ci][CameraCalibrationKeys.cam_c2w] = aligned.tolist() + except Exception as e: + logger.warning( + f"cam_c2w alignment failed clip {ci}: {e}", + ) + + 
try: + merged_cam = self._merge_cam_c2w( + cam_pose_list, + offsets, + clip_lengths, + ) + meta[self.camera_pose_field] = [merged_cam] + except Exception as e: + logger.warning(f"cam_c2w reassembly failed: {e}") + + # --- merge hawor --- + hawor_list = meta.get(self.hand_reconstruction_field) + if hawor_list and len(hawor_list) > 1: + try: + merged_hawor = self._merge_hawor(hawor_list, offsets) + meta[self.hand_reconstruction_field] = [merged_hawor] + except Exception as e: + logger.warning(f"HaWoR reassembly failed: {e}") + + # --- merge clips → original video --- + clips = sample.get(self.clip_field) + if clips and isinstance(clips, list) and len(clips) > 1: + videos = sample.get(self.video_key) + if videos and isinstance(videos, list) and len(videos) > 0: + sample[self.clip_field] = videos + else: + sample[self.clip_field] = [clips[0]] + + return sample diff --git a/data_juicer/ops/mapper/video_extract_frames_mapper.py b/data_juicer/ops/mapper/video_extract_frames_mapper.py index 3dae518a44..a2ad33c359 100644 --- a/data_juicer/ops/mapper/video_extract_frames_mapper.py +++ b/data_juicer/ops/mapper/video_extract_frames_mapper.py @@ -54,6 +54,7 @@ class VideoExtractFramesMapper(Mapper): - **Frame Sampling Methods**: - "all_keyframes": Extracts all keyframes from the video. + - "all_frames": Extracts all frames from the video. - "uniform": Extracts a specified number of frames uniformly from the video. - If `duration` is set, the video is segmented into multiple segments based on the duration, and frames are extracted from each segment. @@ -81,10 +82,12 @@ def __init__( Initialization method. :param frame_sampling_method: sampling method of extracting frame videos from the videos. Should be one of - ["all_keyframes", "uniform"]. - The former one extracts all key frames (the number - of which depends on the duration of the video) and the latter - one extract specified number of frames uniformly from the video. + ["all_keyframes", "all_frames", "uniform"]. 
+ "all_keyframes" extracts all key frames (the number + of which depends on the duration of the video). + "all_frames" extracts every frame of the video. + "uniform" extracts a specified number of frames uniformly + from the video. If "duration" > 0, frame_sampling_method acts on every segment. Default: "all_keyframes". :param output_format: The output format of the frame videos. @@ -133,11 +136,11 @@ def __init__( "bytes", ], f"output_format '{output_format}' is not supported. Can only be one of ['path', 'bytes']." - if frame_sampling_method not in ["all_keyframes", "uniform"]: + if frame_sampling_method not in ["all_keyframes", "all_frames", "uniform"]: raise ValueError( f"Frame sampling method " f"[{frame_sampling_method}] is not supported. " - f'Can only be one of ["all_keyframes", "uniform"].' + f'Can only be one of ["all_keyframes", "all_frames", "uniform"].' ) self.frame_dir = frame_dir @@ -167,10 +170,16 @@ def __init__( self.video_backend = video_backend assert self.video_backend in ["ffmpeg", "av"] - if self.frame_sampling_method == "uniform": - assert self.video_backend == "av", "Only 'av' backend is supported for 'uniform' frame sampling method." + if self.frame_sampling_method in ["uniform"]: + assert self.video_backend in [ + "av", + "ffmpeg", + ], f"Only 'av' and 'ffmpeg' backends are supported for '{self.frame_sampling_method}' frame sampling method." if self.duration > 0: - assert self.video_backend == "av", "Only 'av' backend is supported when duration > 0." + assert self.video_backend in [ + "av", + "ffmpeg", + ], f"Only 'av' and 'ffmpeg' backends are supported when duration > 0." 
def _get_default_frame_dir(self, original_filepath): original_dir = os.path.dirname(original_filepath) @@ -187,21 +196,42 @@ def extract_frames(self, video): # extract frame videos if self.frame_sampling_method == "all_keyframes": if self.duration: - # only support av backend when duration > 0 - frames = extract_key_frames_by_seconds(video.container, self.duration) - frames = [frame.to_image() for frame in frames] + if self.video_backend == "av": + frames = extract_key_frames_by_seconds(video.container, self.duration) + frames = [frame.to_image() for frame in frames] + else: + # For non-av backends, extract keyframes from each segment + video_duration = video.metadata.duration + import numpy as _np + + timestamps = _np.arange(0, video_duration, self.duration).tolist() + timestamps.append(video_duration) + frames = [] + for i in range(len(timestamps) - 1): + kf = video.extract_keyframes(start_time=timestamps[i], end_time=timestamps[i + 1]) + frames.extend([Image.fromarray(img) for img in kf.frames]) else: frames = video.extract_keyframes().frames frames = [Image.fromarray(img) for img in frames] + elif self.frame_sampling_method == "all_frames": + frames = [Image.fromarray(img) for img in video.extract_frames()] elif self.frame_sampling_method == "uniform": - # only support av backend if using uniform sampling - if self.duration: - frames = extract_video_frames_uniformly_by_seconds( - video.container, self.frame_num, duration=self.duration - ) + if self.video_backend == "av": + # Use legacy av-specific functions + if self.duration: + frames = extract_video_frames_uniformly_by_seconds( + video.container, self.frame_num, duration=self.duration + ) + else: + frames = extract_video_frames_uniformly(video.container, self.frame_num) + frames = [frame.to_image() for frame in frames] else: - frames = extract_video_frames_uniformly(video.container, self.frame_num) - frames = [frame.to_image() for frame in frames] + # Use VideoReader interface (works for ffmpeg and other 
backends) + if self.duration: + frames = video.extract_frames_uniformly_by_seconds(self.frame_num, duration=self.duration) + else: + frames = video.extract_frames_uniformly(self.frame_num) + frames = [Image.fromarray(img) for img in frames] else: raise ValueError( f"Not support sampling method \ @@ -234,7 +264,7 @@ def _process_video(self, video, video_key): def process_single(self, sample, context=False): # check if it's generated already - if self.frame_field in sample: + if self.frame_field in sample and sample[self.frame_field]: return sample # there is no videos in this sample diff --git a/data_juicer/ops/mapper/video_hand_action_compute_mapper.py b/data_juicer/ops/mapper/video_hand_action_compute_mapper.py new file mode 100644 index 0000000000..e9d2df2caf --- /dev/null +++ b/data_juicer/ops/mapper/video_hand_action_compute_mapper.py @@ -0,0 +1,393 @@ +import numpy as np +from loguru import logger + +from data_juicer.utils.constant import CameraCalibrationKeys, Fields, MetaKeys +from data_juicer.utils.file_utils import load_numpy +from data_juicer.utils.lazy_loader import LazyLoader + +from ..base_op import OPERATORS, Mapper + +OP_NAME = "video_hand_action_compute_mapper" + +scipy_rotation = LazyLoader("scipy.spatial.transform", "scipy") + + +def _rotation_matrix_to_euler(R): + """Convert 3x3 rotation matrix to Euler angles (roll, pitch, yaw). + + Uses scipy Rotation with 'xyz' extrinsic convention, consistent with + LIBERO / Open X-Embodiment action space. 
+ """ + from scipy.spatial.transform import Rotation + + rot = Rotation.from_matrix(R) + return rot.as_euler("xyz", degrees=False) # (3,) [roll, pitch, yaw] + + +def _euler_to_rotation_matrix(euler): + """Convert Euler angles (roll, pitch, yaw) to 3x3 rotation matrix.""" + from scipy.spatial.transform import Rotation + + return Rotation.from_euler("xyz", euler, degrees=False).as_matrix() + + +def _delta_rotation_euler(euler_prev, euler_next): + """Compute relative rotation as Euler angles: R_delta = R_next @ R_prev^T.""" + from scipy.spatial.transform import Rotation + + R_prev = Rotation.from_euler("xyz", euler_prev, degrees=False) + R_next = Rotation.from_euler("xyz", euler_next, degrees=False) + R_delta = R_next * R_prev.inv() + return R_delta.as_euler("xyz", degrees=False) + + +def _estimate_gripper_from_hand_pose(hand_pose): + """Estimate gripper state from MANO hand_pose parameters. + + Uses the average rotation angle of all 15 finger joints to estimate + whether the hand is open or closed. + + Accepts either: + - (15, 3, 3) rotation matrices + - (45,) axis-angle (3 values per joint) + + Returns: + float: gripper state in [-1, 1]. 1 = open, -1 = closed. 
+ """ + if hand_pose is None or len(hand_pose) == 0: + return 1.0 # default: open + + hand_pose = np.asarray(hand_pose, dtype=np.float64) + + # Convert axis-angle (45,) to per-joint angles + if hand_pose.ndim == 1 and hand_pose.shape[0] == 45: + # axis-angle: angle = norm of each 3-vector + hand_pose = hand_pose.reshape(15, 3) + angles = [np.linalg.norm(hand_pose[j]) for j in range(15)] + elif hand_pose.ndim == 2 and hand_pose.shape == (15, 3): + # Already (15, 3) axis-angle + angles = [np.linalg.norm(hand_pose[j]) for j in range(15)] + else: + # (15, 3, 3) rotation matrices + angles = [] + for j in range(hand_pose.shape[0]): + R = hand_pose[j] + trace_val = np.clip((np.trace(R) - 1.0) / 2.0, -1.0, 1.0) + angle = np.arccos(trace_val) + angles.append(angle) + + avg_angle = np.mean(angles) + + # Thresholds calibrated from typical MANO hand poses: + # - Fully open hand: avg angle ~0.1 rad + # - Fully closed fist: avg angle ~0.8 rad + open_threshold = 0.15 + close_threshold = 0.6 + + if avg_angle <= open_threshold: + return 1.0 + elif avg_angle >= close_threshold: + return -1.0 + else: + # Linear interpolation between open and closed + t = (avg_angle - open_threshold) / (close_threshold - open_threshold) + return 1.0 - 2.0 * t + + +@OPERATORS.register_module(OP_NAME) +class VideoHandActionComputeMapper(Mapper): + """Compute 7-DoF actions and 8-dim states from hand reconstruction + and camera pose results. + + Reads hand MANO parameters (from VideoHandReconstructionHaworMapper) + and camera-to-world transforms (from VideoCameraPoseMegaSaMMapper), + then produces per-frame state [x,y,z,roll,pitch,yaw,pad,gripper] + and per-frame action [dx,dy,dz,droll,dpitch,dyaw,gripper] compatible + with LIBERO / StarVLA LeRobot format. 
+ """ + + def __init__( + self, + hand_reconstruction_field: str = MetaKeys.hand_reconstruction_hawor_tags, + camera_pose_field: str = MetaKeys.video_camera_pose_tags, + tag_field_name: str = MetaKeys.hand_action_tags, + hand_type: str = "both", + *args, + **kwargs, + ): + """ + Initialization method. + + :param hand_reconstruction_field: Meta field storing HaWoR hand + reconstruction results. + :param camera_pose_field: Meta field storing camera pose + (cam_c2w) results. + :param tag_field_name: Output field name in Fields.meta. + :param hand_type: Which hand to compute actions for. + 'right', 'left', or 'both'. Default is 'both'. + """ + super().__init__(*args, **kwargs) + self.hand_reconstruction_field = hand_reconstruction_field + self.camera_pose_field = camera_pose_field + self.tag_field_name = tag_field_name + self.hand_type = hand_type + + def _get_hand_data(self, hand_recon, hand_type): + """Extract frame-indexed hand data for the specified hand type.""" + # Support both new structured format and legacy flat format + hand = hand_recon.get(hand_type, {}) if isinstance(hand_recon, dict) else {} + if not hand: + return [], [], [], [] + frame_ids = hand.get("frame_ids", []) + transl_list = hand.get("transl", []) + orient_list = hand.get("global_orient", []) + hand_pose_list = hand.get("hand_pose", []) + + return frame_ids, transl_list, orient_list, hand_pose_list + + def _compute_state_for_frame(self, transl, global_orient, hand_pose, cam_c2w): + """Compute 8-dim state for a single frame. + + Transforms hand pose from camera space to world space. 
+ + Args: + transl: (3,) translation in camera space + global_orient: (3,3) rotation matrix OR (3,) axis-angle + hand_pose: hand pose parameters (rotation matrices or axis-angle) + cam_c2w: (4,4) camera-to-world transform + + Returns: + np.ndarray: (8,) [x, y, z, roll, pitch, yaw, pad, gripper] + """ + from scipy.spatial.transform import Rotation + + transl = np.asarray(transl, dtype=np.float64) + global_orient = np.asarray(global_orient, dtype=np.float64) + cam_c2w = np.asarray(cam_c2w, dtype=np.float64) + + # Convert axis-angle (3,) to rotation matrix (3,3) if needed + if global_orient.shape == (3,): + global_orient = Rotation.from_rotvec(global_orient).as_matrix() + + # Transform position: camera → world + R_c2w = cam_c2w[:3, :3] + t_c2w = cam_c2w[:3, 3] + pos_world = R_c2w @ transl + t_c2w + + # Transform orientation: camera → world + orient_world = R_c2w @ global_orient + euler = _rotation_matrix_to_euler(orient_world) # [roll, pitch, yaw] + + # Estimate gripper state from finger articulation + gripper = _estimate_gripper_from_hand_pose(hand_pose) + + state = np.array( + [ + pos_world[0], + pos_world[1], + pos_world[2], + euler[0], + euler[1], + euler[2], + 0.0, # pad (consistent with LIBERO 8-dim state) + gripper, + ], + dtype=np.float32, + ) + return state + + def _compute_actions(self, states): + """Compute 7-dim delta actions from consecutive states. + + action[t] = state[t+1] - state[t] for position, + delta_rotation for orientation, + gripper from state[t]. + + The last frame's action is set to zeros with current gripper. 
+ + Returns: + np.ndarray: (T, 7) actions + """ + T = len(states) + actions = np.zeros((T, 7), dtype=np.float32) + + for t in range(T - 1): + # Position delta + actions[t, 0:3] = states[t + 1, 0:3] - states[t, 0:3] + + # Rotation delta (in Euler angles) + euler_cur = states[t, 3:6] + euler_next = states[t + 1, 3:6] + actions[t, 3:6] = _delta_rotation_euler(euler_cur, euler_next) + + # Gripper: use next frame's gripper state + actions[t, 6] = states[t + 1, 7] # index 7 is gripper in state + + # Last frame: zero action with current gripper + if T > 0: + actions[T - 1, 6] = states[T - 1, 7] + + return actions + + @staticmethod + def _transform_joints_cam_to_world(joints_cam, cam_c2w_all, frame_ids): + """Transform MANO 21-joint positions from camera space to world space. + + Args: + joints_cam: numpy array (T_all, 21, 3) or None — all valid frames' + joints in camera space (from HaWoR). + cam_c2w_all: numpy array (N, 4, 4) — camera-to-world transforms. + frame_ids: list of frame indices that have valid hand data. + + Returns: + joints_world: list of (21, 3) arrays for valid frames, or None. + """ + if joints_cam is None: + return None + + joints_cam = np.asarray(joints_cam, dtype=np.float64) + joints_world_list = [] + + for i, fid in enumerate(frame_ids): + if fid >= len(cam_c2w_all) or i >= len(joints_cam): + continue + R_c2w = cam_c2w_all[fid, :3, :3] # (3, 3) + t_c2w = cam_c2w_all[fid, :3, 3] # (3,) + # joints_cam[i]: (21, 3) in camera space + # world = R @ cam + t (applied per joint) + j_world = (R_c2w @ joints_cam[i].T).T + t_c2w # (21, 3) + joints_world_list.append(j_world.tolist()) + + return joints_world_list + + def _compute_hand_actions(self, hand_recon, cam_c2w_all, hand_type, video_idx): + """Compute states and actions for a single hand in a single video. + + Returns: + dict with 'states', 'actions', 'valid_frame_ids', 'hand_type', + 'joints_cam', 'joints_world', or None if insufficient data. 
+ """ + frame_ids, transl_list, orient_list, hp_list = self._get_hand_data(hand_recon, hand_type) + + if len(frame_ids) < 2: + logger.debug(f"Video {video_idx}: insufficient {hand_type} hand " f"frames ({len(frame_ids)}), skipping.") + return None + + states = [] + valid_frame_ids = [] + for i, fid in enumerate(frame_ids): + if fid >= len(cam_c2w_all): + logger.debug(f"Frame {fid} exceeds cam_c2w length " f"{len(cam_c2w_all)}, skipping.") + continue + + state = self._compute_state_for_frame(transl_list[i], orient_list[i], hp_list[i], cam_c2w_all[fid]) + states.append(state) + valid_frame_ids.append(fid) + + if len(states) < 2: + return None + + states = np.stack(states, axis=0) # (T, 8) + actions = self._compute_actions(states) # (T, 7) + + # Transform MANO joints from camera space to world space + hand_data = hand_recon[hand_type] + joints_cam = hand_data.get("joints_cam", None) + joints_world = self._transform_joints_cam_to_world(joints_cam, cam_c2w_all, frame_ids) + + # Also pass through joints_cam for valid frames only + joints_cam_valid = None + if joints_cam is not None: + joints_cam = np.asarray(joints_cam, dtype=np.float64) + joints_cam_valid = [ + joints_cam[i].tolist() + for i, fid in enumerate(frame_ids) + if fid < len(cam_c2w_all) and i < len(joints_cam) + ] + + return { + "states": states.tolist(), + "actions": actions.tolist(), + "valid_frame_ids": valid_frame_ids, + "hand_type": hand_type, + "joints_cam": joints_cam_valid, # (T, 21, 3) camera space + "joints_world": joints_world, # (T, 21, 3) world space + } + + def process_single(self, sample=None, rank=None): + # Check if already processed + if self.tag_field_name in sample.get(Fields.meta, {}): + return sample + + if Fields.meta not in sample: + sample[Fields.meta] = {} + + hand_recon_list = sample[Fields.meta].get(self.hand_reconstruction_field, []) + camera_pose_list = sample[Fields.meta].get(self.camera_pose_field, []) + + if not hand_recon_list or not camera_pose_list: + logger.warning( + 
f"Missing hand reconstruction or camera pose data. " + f"hand_recon={len(hand_recon_list)}, " + f"camera_pose={len(camera_pose_list)}" + ) + sample[Fields.meta][self.tag_field_name] = [] + return sample + + # Determine which hands to process + if self.hand_type == "both": + hand_types = ["right", "left"] + else: + hand_types = [self.hand_type] + + all_video_results = [] + + if len(hand_recon_list) != len(camera_pose_list): + logger.warning( + f"hand_recon ({len(hand_recon_list)}) and camera_pose " + f"({len(camera_pose_list)}) list length mismatch. " + f"Processing min of both." + ) + + for video_idx in range(min(len(hand_recon_list), len(camera_pose_list))): + hand_recon = hand_recon_list[video_idx] + camera_pose = camera_pose_list[video_idx] + + cam_c2w_raw = camera_pose.get(CameraCalibrationKeys.cam_c2w, None) if camera_pose else None + if cam_c2w_raw is None: + logger.warning(f"Video {video_idx}: missing cam_c2w, skipping.") + empty = { + ht: { + "states": [], + "actions": [], + "valid_frame_ids": [], + "hand_type": ht, + "joints_cam": [], + "joints_world": [], + } + for ht in hand_types + } + all_video_results.append(empty) + continue + + cam_c2w_all = np.asarray(load_numpy(cam_c2w_raw), dtype=np.float64) + + video_result = {} + for ht in hand_types: + result = self._compute_hand_actions(hand_recon, cam_c2w_all, ht, video_idx) + if result is None: + video_result[ht] = { + "states": [], + "actions": [], + "valid_frame_ids": [], + "hand_type": ht, + "joints_cam": [], + "joints_world": [], + } + else: + video_result[ht] = result + + all_video_results.append(video_result) + + sample[Fields.meta][self.tag_field_name] = all_video_results + return sample diff --git a/data_juicer/ops/mapper/video_hand_motion_smooth_mapper.py b/data_juicer/ops/mapper/video_hand_motion_smooth_mapper.py new file mode 100644 index 0000000000..840ce9245c --- /dev/null +++ b/data_juicer/ops/mapper/video_hand_motion_smooth_mapper.py @@ -0,0 +1,356 @@ +import numpy as np +from loguru import 
logger + +from data_juicer.utils.constant import Fields, MetaKeys +from data_juicer.utils.lazy_loader import LazyLoader + +from ..base_op import OPERATORS, Mapper + +OP_NAME = "video_hand_motion_smooth_mapper" + +scipy_interpolate = LazyLoader("scipy.interpolate", "scipy") + + +def _recompute_actions(states: np.ndarray) -> np.ndarray: + """Compute 7-dim delta actions from consecutive 8-dim states. + + Args: + states: (T, 8) array — [x, y, z, roll, pitch, yaw, pad, gripper]. + + Returns: + (T, 7) actions — [dx, dy, dz, droll, dpitch, dyaw, gripper]. + """ + from scipy.spatial.transform import Rotation + + T = len(states) + actions = np.zeros((T, 7), dtype=np.float32) + + for t in range(T - 1): + actions[t, 0:3] = states[t + 1, 0:3] - states[t, 0:3] + R_prev = Rotation.from_euler("xyz", states[t, 3:6], degrees=False) + R_next = Rotation.from_euler("xyz", states[t + 1, 3:6], degrees=False) + R_delta = R_next * R_prev.inv() + actions[t, 3:6] = R_delta.as_euler("xyz", degrees=False) + actions[t, 6] = states[t + 1, 7] + + if T > 0: + actions[T - 1, 6] = states[T - 1, 7] + + return actions + + +@OPERATORS.register_module(OP_NAME) +class VideoHandMotionSmoothMapper(Mapper): + """Apply smoothing to world-space hand motions and remove outliers. + + Reads hand action results (states, actions, joints_world) produced by + ``VideoHandActionComputeMapper`` and applies: + + 1. **Extreme outlier replacement** — frames whose instantaneous wrist + speed exceeds ``median + outlier_velocity_threshold * MAD`` are + replaced by linear interpolation from neighbors (not deleted). + 2. **Savitzky-Golay smoothing** — positions are smoothed with a + Savitzky-Golay filter that preserves motion peaks while removing + high-frequency jitter. + 3. **Quaternion smoothing** — orientations are smoothed in quaternion + space to avoid gimbal lock and discontinuities. + 4. **Action recomputation** — 7-DoF actions are re-derived from the + smoothed states so they stay consistent. 
+ + Reference (paper §3.1): + "we apply spline smoothing to the world-space hand motions and remove + outliers" + """ + + def __init__( + self, + hand_action_field: str = MetaKeys.hand_action_tags, + savgol_window: int = 11, + savgol_polyorder: int = 3, + outlier_velocity_threshold: float = 5.0, + min_frames_for_smoothing: int = 5, + smooth_joints: bool = True, + *args, + **kwargs, + ): + """ + Initialization method. + + :param hand_action_field: Meta field storing hand action results + (output of VideoHandActionComputeMapper). + :param savgol_window: Window length for Savitzky-Golay filter. + Must be odd. Larger = smoother but may lose fast motions. + :param savgol_polyorder: Polynomial order for Savitzky-Golay filter. + Must be less than savgol_window. + :param outlier_velocity_threshold: Frames whose wrist speed exceeds + ``median + threshold * MAD`` are replaced by interpolation. + Higher = more conservative (fewer replacements). + :param min_frames_for_smoothing: Minimum number of valid frames + required to apply smoothing. + :param smooth_joints: Whether to also smooth ``joints_world`` + (21-joint MANO skeleton in world space). + """ + super().__init__(*args, **kwargs) + self.hand_action_field = hand_action_field + self.savgol_window = savgol_window + self.savgol_polyorder = savgol_polyorder + self.outlier_velocity_threshold = outlier_velocity_threshold + self.min_frames_for_smoothing = min_frames_for_smoothing + self.smooth_joints = smooth_joints + + # ------------------------------------------------------------------ + # Outlier replacement (interpolate, don't delete) + # ------------------------------------------------------------------ + @staticmethod + def _replace_outliers( + positions: np.ndarray, + threshold_mad: float, + ) -> np.ndarray: + """Replace extreme outlier frames by linear interpolation. + + Uses median + MAD (median absolute deviation) which is more robust + than mean + std for heavy-tailed distributions. 
+ + Returns a copy with outliers replaced — no frames are deleted. + """ + n = len(positions) + if n < 4: + return positions.copy() + + result = positions.copy() + velocities = np.diff(positions, axis=0) + speed = np.linalg.norm(velocities, axis=1) + + median_speed = np.median(speed) + mad = np.median(np.abs(speed - median_speed)) + if mad < 1e-8: + return result + + limit = median_speed + threshold_mad * mad * 1.4826 # MAD→σ scale + outlier_mask = speed > limit + + n_outliers = int(np.sum(outlier_mask)) + if n_outliers == 0: + return result + + # Replace outlier destination frames by linear interpolation + for i in range(len(outlier_mask)): + if not outlier_mask[i]: + continue + target = i + 1 # destination frame of the jump + + # Find nearest good frames before and after + prev_good = i + while prev_good > 0 and ( + prev_good > i - 1 + and prev_good - 1 >= 0 + and prev_good - 1 < len(outlier_mask) + and outlier_mask[prev_good - 1] + ): + prev_good -= 1 + + next_good = target + 1 + while next_good < n and next_good - 1 < len(outlier_mask) and outlier_mask[next_good - 1]: + next_good += 1 + + if next_good >= n: + next_good = n - 1 + if prev_good == target or next_good == target: + continue + + # Linear interpolation + alpha = (target - prev_good) / max(next_good - prev_good, 1) + result[target] = (1 - alpha) * result[prev_good] + alpha * result[next_good] + + return result + + # ------------------------------------------------------------------ + # Savitzky-Golay smoothing + # ------------------------------------------------------------------ + @staticmethod + def _savgol_smooth( + data: np.ndarray, + window: int, + polyorder: int, + ) -> np.ndarray: + """Apply Savitzky-Golay filter to each column of data.""" + from scipy.signal import savgol_filter + + n = len(data) + # Ensure window is valid + win = min(window, n) + if win % 2 == 0: + win -= 1 + if win < polyorder + 2: + return data.copy() + + result = np.empty_like(data) + if data.ndim == 1: + result = 
savgol_filter(data, win, polyorder) + else: + for d in range(data.shape[1]): + result[:, d] = savgol_filter(data[:, d], win, polyorder) + return result + + # ------------------------------------------------------------------ + # Quaternion orientation smoothing + # ------------------------------------------------------------------ + @staticmethod + def _smooth_orientations( + eulers: np.ndarray, + window: int, + polyorder: int, + ) -> np.ndarray: + """Smooth orientations in quaternion space with Savitzky-Golay.""" + from scipy.signal import savgol_filter + from scipy.spatial.transform import Rotation + + n = len(eulers) + win = min(window, n) + if win % 2 == 0: + win -= 1 + if win < polyorder + 2: + return eulers.copy() + + try: + rots = Rotation.from_euler("xyz", eulers, degrees=False) + quats = rots.as_quat() # (N, 4) — [x, y, z, w] + + # Ensure quaternion hemisphere continuity + for i in range(1, len(quats)): + if np.dot(quats[i], quats[i - 1]) < 0: + quats[i] = -quats[i] + + # Savgol on each component + smoothed_quats = np.empty_like(quats) + for d in range(4): + smoothed_quats[:, d] = savgol_filter( + quats[:, d], + win, + polyorder, + ) + + # Re-normalize + norms = np.linalg.norm(smoothed_quats, axis=1, keepdims=True) + norms = np.clip(norms, 1e-8, None) + smoothed_quats = smoothed_quats / norms + + return Rotation.from_quat(smoothed_quats).as_euler( + "xyz", + degrees=False, + ) + except Exception: + # Fallback: unwrap + savgol on Euler angles + smoothed = np.empty_like(eulers) + for d in range(3): + unwrapped = np.unwrap(eulers[:, d]) + smoothed[:, d] = savgol_filter(unwrapped, win, polyorder) + return smoothed + + # ------------------------------------------------------------------ + # Per-hand smoothing + # ------------------------------------------------------------------ + def _smooth_hand_result(self, hand_result: dict) -> dict: + """Smooth a single hand's trajectory and recompute actions.""" + states = np.asarray(hand_result["states"], 
dtype=np.float64) + valid_frame_ids = hand_result["valid_frame_ids"] + + if len(states) < self.min_frames_for_smoothing: + return hand_result + + positions = states[:, 0:3] + eulers = states[:, 3:6] + grippers = states[:, 7].copy() + + # --- Step 1: replace extreme outliers (don't delete!) --- + positions = self._replace_outliers( + positions, + self.outlier_velocity_threshold, + ) + + # --- Step 2: Savitzky-Golay smoothing --- + smoothed_pos = self._savgol_smooth( + positions, + self.savgol_window, + self.savgol_polyorder, + ) + smoothed_euler = self._smooth_orientations( + eulers, + self.savgol_window, + self.savgol_polyorder, + ) + + # Reconstruct state matrix (same frame count as original) + smoothed_states = np.zeros_like(states) + smoothed_states[:, 0:3] = smoothed_pos + smoothed_states[:, 3:6] = smoothed_euler + smoothed_states[:, 6] = 0.0 # pad + smoothed_states[:, 7] = grippers + + # --- Step 3: recompute actions --- + actions = _recompute_actions(smoothed_states) + + result = { + "hand_type": hand_result["hand_type"], + "states": smoothed_states.astype(np.float32).tolist(), + "actions": actions.tolist(), + "valid_frame_ids": valid_frame_ids, # unchanged! 
no frames removed + } + + # --- Step 4: optionally smooth joints_world --- + joints_world = hand_result.get("joints_world") + if self.smooth_joints and joints_world and len(joints_world) > 0: + joints_arr = np.asarray(joints_world, dtype=np.float64) + if len(joints_arr) == len(states): + smoothed_joints = np.empty_like(joints_arr) + for j_idx in range(joints_arr.shape[1]): # 21 joints + smoothed_joints[:, j_idx, :] = self._savgol_smooth( + joints_arr[:, j_idx, :], + self.savgol_window, + self.savgol_polyorder, + ) + result["joints_world"] = smoothed_joints.tolist() + else: + result["joints_world"] = joints_world + else: + result["joints_world"] = joints_world if joints_world else [] + + # Pass through joints_cam unchanged + result["joints_cam"] = hand_result.get("joints_cam", []) + + return result + + # ------------------------------------------------------------------ + # Main entry + # ------------------------------------------------------------------ + def process_single(self, sample=None, rank=None): + if Fields.meta not in sample: + return sample + + hand_action_list = sample[Fields.meta].get(self.hand_action_field) + if not hand_action_list: + return sample + + smoothed_results = [] + for clip_result in hand_action_list: + if not clip_result: + smoothed_results.append(clip_result) + continue + smoothed_clip = {} + for hand_type, hand_data in clip_result.items(): + if not hand_data or not hand_data.get("states"): + smoothed_clip[hand_type] = hand_data + continue + try: + smoothed_clip[hand_type] = self._smooth_hand_result( + hand_data, + ) + except Exception as e: + logger.warning( + f"Smoothing failed for hand '{hand_type}': {e}. 
" f"Keeping original data.", + ) + smoothed_clip[hand_type] = hand_data + smoothed_results.append(smoothed_clip) + + sample[Fields.meta][self.hand_action_field] = smoothed_results + return sample diff --git a/data_juicer/ops/mapper/video_hand_reconstruction_hawor_mapper.py b/data_juicer/ops/mapper/video_hand_reconstruction_hawor_mapper.py index 764cf75b65..7dbb9d7cd4 100644 --- a/data_juicer/ops/mapper/video_hand_reconstruction_hawor_mapper.py +++ b/data_juicer/ops/mapper/video_hand_reconstruction_hawor_mapper.py @@ -1,18 +1,15 @@ -import copy import os import subprocess import sys +import tempfile import numpy as np -from pydantic import PositiveInt -import data_juicer -from data_juicer.ops.load import load_ops from data_juicer.utils.cache_utils import ( DATA_JUICER_ASSETS_CACHE, DATA_JUICER_MODELS_CACHE, ) -from data_juicer.utils.constant import Fields, MetaKeys +from data_juicer.utils.constant import CameraCalibrationKeys, Fields, MetaKeys from data_juicer.utils.lazy_loader import LazyLoader from data_juicer.utils.model_utils import get_model, prepare_model @@ -21,7 +18,7 @@ OP_NAME = "video_hand_reconstruction_hawor_mapper" -cv2 = LazyLoader("cv2", "opencv-contrib-python") +cv2 = LazyLoader("cv2", "opencv-python") ultralytics = LazyLoader("ultralytics") torch = LazyLoader("torch") @@ -38,15 +35,12 @@ def __init__( hawor_model_path: str = "hawor.ckpt", hawor_config_path: str = "model_config.yaml", hawor_detector_path: str = "detector.pt", - moge_model_path: str = "Ruicheng/moge-2-vitl", mano_right_path: str = "path_to_mano_right_pkl", - frame_num: PositiveInt = 3, - duration: float = 0, - thresh: float = 0.2, + mano_left_path: str = "path_to_mano_left_pkl", + frame_field: str = MetaKeys.video_frames, + camera_calibration_field: str = "camera_calibration", tag_field_name: str = MetaKeys.hand_reconstruction_hawor_tags, - frame_dir: str = DATA_JUICER_ASSETS_CACHE, - if_output_moge_info: bool = False, - moge_output_info_dir: str = DATA_JUICER_ASSETS_CACHE, + thresh: 
float = 0.2, *args, **kwargs, ): @@ -59,28 +53,18 @@ def __init__( HaWoR model. :param hawor_detector_path: The path to 'detector.pt' for the HaWoR model. - :param moge_model_path: The path to the Moge-2 model. :param mano_right_path: The path to 'MANO_RIGHT.pkl'. Users need to download this file from https://mano.is.tue.mpg.de/ and comply with the MANO license. - :param frame_num: The number of frames to be extracted uniformly from - the video. If it's 1, only the middle frame will be extracted. If - it's 2, only the first and the last frames will be extracted. If - it's larger than 2, in addition to the first and the last frames, - other frames will be extracted uniformly within the video duration. - If "duration" > 0, frame_num is the number of frames per segment. - :param duration: The duration of each segment in seconds. - If 0, frames are extracted from the entire video. - If duration > 0, the video is segmented into multiple segments - based on duration, and frames are extracted from each segment. - :param thresh: Confidence threshold for hand detection. + :param mano_left_path: The path to 'MANO_LEFT.pkl'. Users need to + download this file from https://mano.is.tue.mpg.de/ and comply + with the MANO license. Used for accurate left-hand wrist + offset computation (with shapedirs bug-fix). + :param frame_field: The field name where the video frames are stored. + :param camera_calibration_field: The field name where the camera calibration info is stored. :param tag_field_name: The field name to store the tags. It's "hand_reconstruction_hawor_tags" in default. - :param frame_dir: Output directory to save extracted frames. - :param if_output_moge_info: Whether to save the results from MoGe-2 - to an JSON file. - :param moge_output_info_dir: Output directory for saving camera - parameters. + :param thresh: The confidence threshold for hand detection. Default is 0.2. 
:param args: extra args :param kwargs: extra args @@ -93,43 +77,30 @@ def __init__( super().__init__(*args, **kwargs) - self.video_camera_calibration_static_moge_mapper_args = { - "model_path": moge_model_path, - "frame_num": frame_num, - "duration": duration, - "frame_dir": frame_dir, - "if_output_points_info": False, - "if_output_depth_info": False, - "if_output_mask_info": False, - "if_output_info": if_output_moge_info, - "output_info_dir": moge_output_info_dir, - } - self.fused_ops = load_ops( - [{"video_camera_calibration_static_moge_mapper": self.video_camera_calibration_static_moge_mapper_args}] - ) - hawor_repo_path = os.path.join(DATA_JUICER_ASSETS_CACHE, "HaWoR") if not os.path.exists(hawor_repo_path): subprocess.run(["git", "clone", "https://github.com/ThunderVVV/HaWoR.git", hawor_repo_path], check=True) sys.path.append(hawor_repo_path) - from hawor.utils.rotation import ( - angle_axis_to_rotation_matrix, - rotation_matrix_to_angle_axis, - ) + from hawor.utils.rotation import rotation_matrix_to_angle_axis from lib.eval_utils.custom_utils import interpolate_bboxes from lib.pipeline.tools import parse_chunks self.interpolate_bboxes = interpolate_bboxes self.parse_chunks = parse_chunks - self.angle_axis_to_rotation_matrix = angle_axis_to_rotation_matrix self.rotation_matrix_to_angle_axis = rotation_matrix_to_angle_axis + self.frame_field = frame_field + self.hawor_detector_path = hawor_detector_path + self.tag_field_name = tag_field_name + self.thresh = thresh + self.camera_calibration_field = camera_calibration_field self.model_key = prepare_model( model_type="hawor", hawor_model_path=hawor_model_path, hawor_config_path=hawor_config_path, mano_right_path=mano_right_path, + mano_left_path=mano_left_path, ) if not os.path.exists(hawor_detector_path): @@ -140,17 +111,13 @@ def __init__( [ "wget", "https://huggingface.co/ThunderVVV/HaWoR/resolve/main/external/detector.pt", + "-O", hawor_detector_path, ], check=True, ) - self.hawor_detector_path = 
hawor_detector_path - self.frame_num = frame_num - self.duration = duration - self.tag_field_name = tag_field_name - self.frame_dir = frame_dir - self.thresh = thresh + self.det_model_key = prepare_model(model_type="yolo", model_path=self.hawor_detector_path) def detect_track(self, imgfiles: list, hand_det_model, thresh: float = 0.5) -> tuple: """ @@ -167,49 +134,47 @@ def detect_track(self, imgfiles: list, hand_det_model, thresh: float = 0.5) -> t boxes_ = [] tracks = {} - for t, img_cv2 in enumerate(imgfiles): + with torch.no_grad(), torch.amp.autocast("cuda"): + for t, img_cv2 in enumerate(imgfiles): + results = hand_det_model.track(img_cv2, conf=thresh, persist=True, verbose=False) - with torch.no_grad(): - with torch.amp.autocast("cuda"): - results = hand_det_model.track(img_cv2, conf=thresh, persist=True, verbose=False) - - boxes = results[0].boxes.xyxy.cpu().numpy() - confs = results[0].boxes.conf.cpu().numpy() - handedness = results[0].boxes.cls.cpu().numpy() - if not results[0].boxes.id is None: - track_id = results[0].boxes.id.cpu().numpy() - else: - track_id = [-1] * len(boxes) + boxes = results[0].boxes.xyxy.cpu().numpy() + confs = results[0].boxes.conf.cpu().numpy() + handedness = results[0].boxes.cls.cpu().numpy() + if results[0].boxes.id is not None: + track_id = results[0].boxes.id.cpu().numpy() + else: + track_id = [-1] * len(boxes) - boxes = np.hstack([boxes, confs[:, None]]) + boxes = np.hstack([boxes, confs[:, None]]) - find_right = False - find_left = False + find_right = False + find_left = False - for idx, box in enumerate(boxes): - if track_id[idx] == -1: - if handedness[[idx]] > 0: - id = int(10000) - else: - id = int(5000) + for idx, box in enumerate(boxes): + if track_id[idx] == -1: + if handedness[[idx]] > 0: + id = int(10000) + else: + id = int(5000) + else: + id = track_id[idx] + subj = dict() + subj["frame"] = t + subj["det"] = True + subj["det_box"] = boxes[[idx]] + subj["det_handedness"] = handedness[[idx]] + + if (not find_right 
and handedness[[idx]] > 0) or (not find_left and handedness[[idx]] == 0): + if id in tracks: + tracks[id].append(subj) else: - id = track_id[idx] - subj = dict() - subj["frame"] = t - subj["det"] = True - subj["det_box"] = boxes[[idx]] - subj["det_handedness"] = handedness[[idx]] - - if (not find_right and handedness[[idx]] > 0) or (not find_left and handedness[[idx]] == 0): - if id in tracks: - tracks[id].append(subj) - else: - tracks[id] = [subj] - - if handedness[[idx]] > 0: - find_right = True - elif handedness[[idx]] == 0: - find_left = True + tracks[id] = [subj] + + if handedness[[idx]] > 0: + find_right = True + elif handedness[[idx]] == 0: + find_left = True return boxes_, tracks @@ -219,18 +184,19 @@ def hawor_motion_estimation( tracks: dict, model, img_focal: float, - img_paths: list, + frame_file_paths: list, single_image: bool = False, ) -> dict: """ Performs HAWOR 3D hand reconstruction on detected and tracked hand regions. Args: - imgfiles (list): List of image frames. + imgfiles (list): List of decoded image frames (numpy arrays). tracks (dict): Dictionary mapping track ID to a list of detection objects. model (HAWOR): The initialized HAWOR model. img_focal (float): Camera focal length. - img_paths (list): List of images paths. + frame_file_paths (list): List of file paths readable by HaWoR + (pre-materialized on disk if input was bytes). single_image (bool): Flag for single-image processing mode. 
Returns: @@ -250,7 +216,7 @@ def hawor_motion_estimation( valid = np.array([t["det"] for t in trk]) is_right = np.concatenate([t["det_handedness"] for t in trk])[valid] - if is_right.sum() / len(is_right) < 0.5: + if len(is_right) == 0 or is_right.sum() / len(is_right) < 0.5: left_trk.extend(trk) else: right_trk.extend(trk) @@ -287,7 +253,7 @@ def hawor_motion_estimation( is_right = np.concatenate([t["det_handedness"] for t in trk])[valid] frame = np.array([t["frame"] for t in trk])[valid] - if is_right.sum() / len(is_right) < 0.5: + if len(is_right) == 0 or is_right.sum() / len(is_right) < 0.5: is_right = np.zeros((len(boxes), 1)) else: is_right = np.ones((len(boxes), 1)) @@ -298,7 +264,7 @@ def hawor_motion_estimation( continue for frame_ck, boxes_ck in zip(frame_chunks, boxes_chunks): - img_ck = [img_paths[i] for i in frame_ck] + img_ck = [frame_file_paths[i] for i in frame_ck] if is_right[0] > 0: do_flip = False else: @@ -313,24 +279,35 @@ def hawor_motion_estimation( "init_betas": results["pred_shape"][None, :], # (B, T, 10) } - # flip left hand - init_root = self.rotation_matrix_to_angle_axis(data_out["init_root_orient"]) - init_hand_pose = self.rotation_matrix_to_angle_axis(data_out["init_hand_pose"]) + # Convert to axis-angle (HaWoR native format) + init_root = self.rotation_matrix_to_angle_axis(data_out["init_root_orient"]) # (B, T, 3) + init_hand_pose = self.rotation_matrix_to_angle_axis(data_out["init_hand_pose"]) # (B, T, 15, 3) + + # Flip Y/Z axis-angle components for left hand + # (this is HaWoR's convention for run_mano_left) if do_flip: init_root[..., 1] *= -1 init_root[..., 2] *= -1 - data_out["init_root_orient"] = self.angle_axis_to_rotation_matrix(init_root) - data_out["init_hand_pose"] = self.angle_axis_to_rotation_matrix(init_hand_pose) + init_hand_pose[..., 1] *= -1 + init_hand_pose[..., 2] *= -1 + + T = data_out["init_betas"].shape[1] + betas_all = data_out["init_betas"][0, :T].cpu().numpy() # (T, 10) + orient_all = init_root[0, 
:T].cpu().numpy() # (T, 3) + hand_pose_all = init_hand_pose[0, :T].reshape(T, -1).cpu().numpy() # (T, 45) + transl_all = data_out["init_trans"][0, :T].cpu().numpy() # (T, 3) s_frame = frame_ck[0] e_frame = frame_ck[-1] for frame_id in range(s_frame, e_frame + 1): - result = {} - result["beta"] = data_out["init_betas"][0, frame_id - s_frame].cpu().numpy() - result["hand_pose"] = data_out["init_hand_pose"][0, frame_id - s_frame].cpu().numpy() - result["global_orient"] = data_out["init_root_orient"][0, frame_id - s_frame].cpu().numpy() - result["transl"] = data_out["init_trans"][0, frame_id - s_frame].cpu().numpy() + fi = frame_id - s_frame + result = { + "betas": betas_all[fi], + "global_orient": orient_all[fi], + "hand_pose": hand_pose_all[fi], + "transl": transl_all[fi], + } if idx == 0: left_results[frame_id] = result @@ -341,6 +318,93 @@ def hawor_motion_estimation( return reformat_results + @staticmethod + def _compute_mano_joints(mano_model, global_orient_list, hand_pose_list, betas_list, transl_list): + """Compute MANO 21-joint positions in camera space via forward kinematics. + + Args: + mano_model: MANO model (right or left hand), on GPU. + global_orient_list: List of (3,) axis-angle per frame. + hand_pose_list: List of (45,) axis-angle per frame. + betas_list: List of (10,) shape params per frame. + transl_list: List of (3,) translation per frame. + + Returns: + numpy array of shape (T, 21, 3) — 21 joint positions in camera space. + """ + import torch as _torch + from hawor.utils.geometry import aa_to_rotmat + + T = len(global_orient_list) + device = next(mano_model.parameters()).device + + # Stack into tensors: (T, ...) 
+ orient_aa = _torch.tensor(global_orient_list, dtype=_torch.float32) # (T, 3) + hand_aa = _torch.tensor(hand_pose_list, dtype=_torch.float32) # (T, 45) + betas = _torch.tensor(betas_list, dtype=_torch.float32) # (T, 10) + transl = _torch.tensor(transl_list, dtype=_torch.float32) # (T, 3) + + # Convert axis-angle to rotation matrices + orient_rotmat = aa_to_rotmat(orient_aa).view(T, 1, 3, 3) # (T, 1, 3, 3) + hand_rotmat = aa_to_rotmat(hand_aa.reshape(T * 15, 3)).view(T, 15, 3, 3) # (T, 15, 3, 3) + + # MANO forward pass on GPU + with _torch.no_grad(): + mano_out = mano_model( + global_orient=orient_rotmat.to(device), + hand_pose=hand_rotmat.to(device), + betas=betas.to(device), + transl=transl.to(device), + pose2rot=False, + ) + + # mano_out.joints: (T, 21, 3) in camera space + joints_cam = mano_out.joints.cpu().numpy() # (T, 21, 3) + return joints_cam + + @staticmethod + def _decode_frames(raw_frames): + """Decode raw frames (bytes or paths) to numpy arrays. + + Returns: + images: list of decoded BGR numpy arrays (None entries skipped) + """ + from loguru import logger as _logger + + images = [] + for i, frame in enumerate(raw_frames): + if isinstance(frame, bytes): + image_array = np.frombuffer(frame, dtype=np.uint8) + image = cv2.imdecode(image_array, cv2.IMREAD_COLOR) + else: + image = cv2.imread(frame) + + if image is None: + _logger.warning(f"Frame {i} decode failed, skipping.") + continue + images.append(image) + return images + + @staticmethod + def _materialize_bytes_to_files(raw_frames, temp_dir): + """Write byte-frames to disk once, return file paths. + + If frames are already file paths, returns them directly. + This avoids repeated per-chunk temp dir creation and disk I/O. 
+ """ + if not raw_frames: + return [] + if not isinstance(raw_frames[0], bytes): + return raw_frames + + file_paths = [] + for i, frame_bytes in enumerate(raw_frames): + file_path = os.path.join(temp_dir, f"frame_{i}.jpg") + with open(file_path, "wb") as f: + f.write(frame_bytes) + file_paths.append(file_path) + return file_paths + def process_single(self, sample=None, rank=None): # check if it's generated already @@ -349,125 +413,125 @@ def process_single(self, sample=None, rank=None): # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: - return [] + return sample - # --- 1. FoV Estimation (MoGe) --- - ds_list = [{"videos": sample[self.video_key]}] + hawor_model, model_cfg, mano_right, mano_left = get_model(self.model_key, rank, self.use_cuda()) + hand_det_model = get_model(self.det_model_key, rank, self.use_cuda()) - dataset = data_juicer.core.data.NestedDataset.from_list(ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, column=[{}] * dataset.num_rows) - dataset = dataset.map(self.fused_ops[0].process, num_proc=1, with_rank=True) - res_list = dataset.to_list() + videos_frames = sample[self.frame_field] - all_fov_x = res_list[0][Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["hfov_list"] + sample[Fields.meta][self.tag_field_name] = [] - temp_frame_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0] - frames_root = os.path.join(self.frame_dir, temp_frame_name) - frame_names = os.listdir(frames_root) - frames_path = sorted([os.path.join(frames_root, frame_name) for frame_name in frame_names]) + for video_idx in range(len(videos_frames)): + cur_camera_calibration = sample[Fields.meta][self.camera_calibration_field][video_idx] + all_fov_x = cur_camera_calibration.get(CameraCalibrationKeys.hfov, None) - images = [] - for temp_frame_path in frames_path: - images.append(cv2.imread(temp_frame_path)) + # If horizontal FoV is not directly 
available, compute from intrinsics. + # K is in pixel coordinates: hfov = 2 * arctan(cx / fx), + # where cx ≈ width/2 (principal point convention). + if all_fov_x is None: + intrinsics = cur_camera_calibration.get(CameraCalibrationKeys.intrinsics, None) + if intrinsics is not None: + all_fov_x = [2 * np.arctan(k[0][2] / k[0][0]) for k in intrinsics] + else: + raise ValueError( + f"The sample must include an '{CameraCalibrationKeys.hfov}' field or an '{CameraCalibrationKeys.intrinsics}' field in the camera calibration info to store the horizontal FoV for hand reconstruction." + ) + + frames = videos_frames[video_idx] + images = self._decode_frames(frames) + + N = len(images) + if N == 0: + from loguru import logger as _logger + + _logger.warning(f"Video {video_idx}: all frames decode failed, " "producing empty hand output.") + empty_hand = { + "frame_ids": [], + "global_orient": [], + "hand_pose": [], + "betas": [], + "transl": [], + "joints_cam": None, + } + sample[Fields.meta][self.tag_field_name].append( + { + "fov_x": 0.0, + "img_focal": 0.0, + "left": dict(empty_hand), + "right": dict(empty_hand), + } + ) + continue + H, W = images[0].shape[:2] - N = len(images) - H, W = images[0].shape[:2] + # Use median FoV across all frames + fov_x = np.median(np.array(all_fov_x)) + img_focal = 0.5 * W / np.tan(0.5 * fov_x) - # Use median FoV across all frames - fov_x = np.median(np.array(all_fov_x)) - img_focal = 0.5 * W / np.tan(0.5 * fov_x) + _, tracks = self.detect_track(images, hand_det_model, thresh=self.thresh) - # --- 2. 
Hand Pose and Translation Estimation (HaWoR) --- - if rank is not None: - torch.cuda.set_device(rank) - device = f"cuda:{rank}" if self.use_cuda() else "cpu" - else: - device = "cuda" if self.use_cuda() else "cpu" + with tempfile.TemporaryDirectory() as temp_dir: + frame_file_paths = self._materialize_bytes_to_files(frames, temp_dir) - hawor_model, model_cfg, mano_model = get_model(self.model_key, rank, self.use_cuda()) - hand_det_model = ultralytics.YOLO(self.hawor_detector_path).to(device) - _, tracks = self.detect_track(images, hand_det_model, thresh=self.thresh) + recon_results = self.hawor_motion_estimation( + images, + tracks, + hawor_model, + img_focal, + frame_file_paths=frame_file_paths, + single_image=(N == 1), + ) - recon_results = self.hawor_motion_estimation( - images, tracks, hawor_model, img_focal, single_image=(N == 1), img_paths=frames_path - ) - del hand_det_model - - # --- 3. Re-calculate Global Translation (MANO Alignment) --- - left_frame_id_list = [] - left_beta_list = [] - left_hand_pose_list = [] - left_global_orient_list = [] - left_transl_list = [] - - right_frame_id_list = [] - right_beta_list = [] - right_hand_pose_list = [] - right_global_orient_list = [] - right_transl_list = [] - - for img_idx in range(N): + # Collect per-hand results in structured format + hand_output = {} for hand_type in ["left", "right"]: - if hand_type == "left": - if img_idx not in recon_results["left"]: - continue - result = recon_results["left"][img_idx] - else: - if img_idx not in recon_results["right"]: + frame_ids = [] + global_orient_list = [] + hand_pose_list = [] + betas_list = [] + transl_list = [] + + for img_idx in range(N): + if img_idx not in recon_results[hand_type]: continue - result = recon_results["right"][img_idx] - - # Convert results to tensors - betas = torch.from_numpy(result["beta"]).unsqueeze(0).to(device) - hand_pose = torch.from_numpy(result["hand_pose"]).unsqueeze(0).to(device) - transl = 
torch.from_numpy(result["transl"]).unsqueeze(0).to(device) - - # Forward pass through MANO model - model_output = mano_model(betas=betas, hand_pose=hand_pose) - verts_m = model_output.vertices[0] - joints_m = model_output.joints[0] - - # Flip x-axis for left hand consistency - if hand_type == "left": - verts_m[:, 0] = -1 * verts_m[:, 0] - joints_m[:, 0] = -1 * joints_m[:, 0] - - wrist = joints_m[0] - - # Calculate new translation - transl_new = wrist + transl - - # Store results with the new translation - result_new_transl = copy.deepcopy(result) - result_new_transl["transl"] = transl_new[0].cpu().numpy() - - if hand_type == "left": - left_frame_id_list.append(img_idx) - left_beta_list.append(result_new_transl["beta"]) - left_hand_pose_list.append(result_new_transl["hand_pose"]) - left_global_orient_list.append(result_new_transl["global_orient"]) - left_transl_list.append(result_new_transl["transl"]) + result = recon_results[hand_type][img_idx] + frame_ids.append(img_idx) + global_orient_list.append(result["global_orient"].tolist()) # (3,) + hand_pose_list.append(result["hand_pose"].tolist()) # (45,) + betas_list.append(result["betas"].tolist()) # (10,) + transl_list.append(result["transl"].tolist()) # (3,) + + # Compute MANO 21-joint positions in camera space + joints_cam = None + T_valid = len(frame_ids) + if T_valid > 0: + mano_model = mano_left if hand_type == "left" else mano_right + if mano_model is not None: + joints_cam = self._compute_mano_joints( + mano_model, + global_orient_list, + hand_pose_list, + betas_list, + transl_list, + ) # (T, 21, 3) numpy + + hand_output[hand_type] = { + "frame_ids": frame_ids, + "global_orient": global_orient_list, + "hand_pose": hand_pose_list, + "betas": betas_list, + "transl": transl_list, + "joints_cam": joints_cam, # (T, 21, 3) or None + } - else: - right_frame_id_list.append(img_idx) - right_beta_list.append(result_new_transl["beta"]) - right_hand_pose_list.append(result_new_transl["hand_pose"]) - 
right_global_orient_list.append(result_new_transl["global_orient"]) - right_transl_list.append(result_new_transl["transl"]) - - sample[Fields.meta][self.tag_field_name] = { - "fov_x": fov_x, - "left_frame_id_list": left_frame_id_list, - "left_beta_list": left_beta_list, - "left_hand_pose_list": left_hand_pose_list, - "left_global_orient_list": left_global_orient_list, - "left_transl_list": left_transl_list, - "right_frame_id_list": right_frame_id_list, - "right_beta_list": right_beta_list, - "right_hand_pose_list": right_hand_pose_list, - "right_global_orient_list": right_global_orient_list, - "right_transl_list": right_transl_list, - } + sample[Fields.meta][self.tag_field_name].append( + { + "fov_x": float(fov_x), + "img_focal": float(img_focal), + "left": hand_output["left"], + "right": hand_output["right"], + } + ) return sample diff --git a/data_juicer/ops/mapper/video_split_by_duration_mapper.py b/data_juicer/ops/mapper/video_split_by_duration_mapper.py index 83b260feb8..1ab43b566c 100644 --- a/data_juicer/ops/mapper/video_split_by_duration_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_duration_mapper.py @@ -1,7 +1,10 @@ import copy +import os import re +import uuid import numpy as np +from loguru import logger from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import add_suffix_to_filename, transfer_filename @@ -46,11 +49,15 @@ class VideoSplitByDurationMapper(Mapper): def __init__( self, split_duration: float = 10, + overlap_duration: float = 0, min_last_split_duration: float = 0, keep_original_sample: bool = True, save_dir: str = None, video_backend: str = "ffmpeg", ffmpeg_extra_args: str = "", + output_format: str = "path", + save_field: str = None, + legacy_split_by_text_token: bool = True, *args, **kwargs, ): @@ -58,6 +65,11 @@ def __init__( Initialization method. :param split_duration: duration of each video split in seconds. + :param overlap_duration: overlap duration in seconds between + consecutive splits. 
For example, with split_duration=20 and + overlap_duration=5, clips will be [0-20, 15-35, 30-50, ...]. + Must be non-negative and less than split_duration. Default: 0 + (no overlap). :param min_last_split_duration: The minimum allowable duration in seconds for the last video split. If the duration of the last split is less than this value, it will be discarded. @@ -70,6 +82,16 @@ def __init__( This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable. :param video_backend: video backend, can be `ffmpeg`, `av`. :param ffmpeg_extra_args: Extra ffmpeg args for splitting video, only valid when `video_backend` is `ffmpeg`. + :param output_format: The output format of the videos. + Supported formats are: ["path", "bytes"]. + If format is "path", the output is a list of lists, where each inner + list contains the path of the split videos. + If format is "bytes", the output is a list of lists, where each inner + list contains the bytes of the split videos. + :param save_field: The new field name to save generated video files path. + If not specified, will overwrite the original video field. + :param legacy_split_by_text_token: Whether to split by special tokens (e.g. <__dj__video>) + in the text field and read videos in order, or use the 'videos' or 'frames' field directly. 
:param args: extra args :param kwargs: extra args """ @@ -78,6 +100,11 @@ def __init__( self._init_parameters.pop("save_dir", None) self.split_duration = split_duration + self.overlap_duration = overlap_duration + assert self.overlap_duration >= 0, f"overlap_duration must be >= 0, got {overlap_duration}" + assert self.overlap_duration < self.split_duration, ( + f"overlap_duration ({overlap_duration}) must be less than " f"split_duration ({split_duration})" + ) self.min_last_split_duration = min_last_split_duration self.keep_original_sample = keep_original_sample self.extra_args = kwargs @@ -85,28 +112,66 @@ def __init__( self.video_backend = video_backend assert self.video_backend in ["ffmpeg", "av"] self.ffmpeg_extra_args = ffmpeg_extra_args + self.output_format = output_format.lower() + assert self.output_format in [ + "path", + "bytes", + ], f"output_format '{output_format}' is not supported. Can only be one of ['path', 'bytes']." + self.save_field = save_field + self.legacy_split_by_text_token = legacy_split_by_text_token + if self.legacy_split_by_text_token: + logger.warning( + "`legacy_split_by_text_token` is set to true, " + "spliting the text field by special tokens " + "(e.g. <__dj__video>) to read videos in order. " + "This behavior will be deprecated in future versions. " + "Please set `legacy_split_by_text_token` to False, " + 'and use the "videos" or "frames" field directly.' + ) - def split_videos_by_duration(self, video_key, container): + def split_videos_by_duration(self, container, video_key: str = None): video_duration = container.metadata.duration - timestamps = np.arange(0, video_duration, self.split_duration).tolist() + if video_duration <= self.split_duration: + if video_key: + return [video_key] + return [] + + # Step size: split_duration - overlap_duration + # e.g. split=20, overlap=5 → step=15 → starts=[0, 15, 30, ...] 
+ step = self.split_duration - self.overlap_duration + start_times = np.arange(0, video_duration, step).tolist() + count = 0 split_video_keys = [] - unique_video_key = transfer_filename(video_key, OP_NAME, self.save_dir, **self._init_parameters) + + if video_key: + unique_video_key = transfer_filename(video_key, OP_NAME, self.save_dir, **self._init_parameters) + else: + unique_video_key = os.path.join(self.save_dir, f"{uuid.uuid4().hex}.mp4") + if self.video_backend == "ffmpeg" and self.ffmpeg_extra_args: kwargs = {"ffmpeg_extra_args": self.ffmpeg_extra_args} else: kwargs = {} - for i in range(1, len(timestamps)): - split_video_key = add_suffix_to_filename(unique_video_key, f"_{count}") - if container.extract_clip(timestamps[i - 1], timestamps[i], split_video_key, **kwargs): - split_video_keys.append(split_video_key) - count += 1 - if video_duration - timestamps[-1] >= self.min_last_split_duration: - split_video_key = add_suffix_to_filename(unique_video_key, f"_{count}") + for start in start_times: + end = start + self.split_duration + + if end >= video_duration: + # Last segment: check minimum duration + remaining = video_duration - start + if remaining >= self.min_last_split_duration: + split_video_key = add_suffix_to_filename(unique_video_key, f"_{count}") + if container.extract_clip(start, None, split_video_key, **kwargs): + split_video_keys.append(split_video_key) + count += 1 + break + else: + split_video_key = add_suffix_to_filename(unique_video_key, f"_{count}") + if container.extract_clip(start, end, split_video_key, **kwargs): + split_video_keys.append(split_video_key) + count += 1 - if container.extract_clip(timestamps[-1], None, split_video_key, **kwargs): - split_video_keys.append(split_video_key) return split_video_keys def _process_single_sample(self, sample): @@ -115,8 +180,10 @@ def _process_single_sample(self, sample): sample[Fields.source_file] = [] return [] + is_video_path = isinstance(sample[self.video_key][0], str) if Fields.source_file not 
in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.video_key] + if is_video_path: + sample[Fields.source_file] = sample[self.video_key] # the split results split_sample = copy.deepcopy(sample) @@ -124,39 +191,68 @@ def _process_single_sample(self, sample): split_sample[Fields.source_file] = [] # load all video(s) - loaded_video_keys = sample[self.video_key] + loaded_videos = sample[self.video_key] videos = {} - for loaded_video_key in loaded_video_keys: - if loaded_video_key not in videos: + for video_idx, loaded_video in enumerate(loaded_videos): + if video_idx not in videos: # avoid loading the same videos - video = create_video_reader(loaded_video_key, backend=self.video_backend) - videos[loaded_video_key] = video + video = create_video_reader(loaded_video, backend=self.video_backend) + videos[video_idx] = video split_video_keys = [] - offset = 0 - # split each video chunk by chunk - for chunk in sample[self.text_key].split(SpecialTokens.eoc): - # skip empty chunks or contents after the last eoc token - if not chunk.strip(): - continue - else: - video_count = chunk.count(SpecialTokens.video) - place_holders = [] - for video_key in loaded_video_keys[offset : offset + video_count]: - video = videos[video_key] - new_video_keys = self.split_videos_by_duration(video_key, video) - video.close() - split_video_keys.extend(new_video_keys) - place_holders.append(SpecialTokens.video * len(new_video_keys)) - split_sample[Fields.source_file].extend([video_key] * len(new_video_keys)) - - # insert the generated text according to given mode - replacer_function = create_replacer(place_holders) - new_split_text_per_chunk = re.sub(SpecialTokens.video, replacer_function, chunk) - split_sample[self.text_key] += f"{new_split_text_per_chunk}{SpecialTokens.eoc}" # noqa: E501 - offset += video_count - - split_sample[self.video_key] = split_video_keys + + if self.legacy_split_by_text_token: + offset = 0 + # split each video chunk by chunk + for chunk 
in sample[self.text_key].split(SpecialTokens.eoc): + # skip empty chunks or contents after the last eoc token + if not chunk.strip(): + continue + else: + video_count = chunk.count(SpecialTokens.video) + place_holders = [] + for idx in range(offset, offset + video_count): + video = videos[idx] + if is_video_path: + video_path = loaded_videos[idx] + new_video_keys = self.split_videos_by_duration(video, video_path) + split_sample[Fields.source_file].extend([video_path] * len(new_video_keys)) + else: + new_video_keys = self.split_videos_by_duration(video, None) + split_sample[Fields.source_file].extend(new_video_keys) + video.close() + split_video_keys.extend(new_video_keys) + place_holders.append(SpecialTokens.video * len(new_video_keys)) + + # insert the generated text according to given mode + replacer_function = create_replacer(place_holders) + new_split_text_per_chunk = re.sub(SpecialTokens.video, replacer_function, chunk) + split_sample[self.text_key] += f"{new_split_text_per_chunk}{SpecialTokens.eoc}" # noqa: E501 + offset += video_count + else: + # TODO: handle the text field update + for video_idx, video in videos.items(): + if is_video_path: + video_path = loaded_videos[video_idx] + new_video_keys = self.split_videos_by_duration(video, video_path) + split_sample[Fields.source_file].extend([video_path] * len(new_video_keys)) + else: + new_video_keys = self.split_videos_by_duration(video, None) + split_sample[Fields.source_file].extend(new_video_keys) + video.close() + split_video_keys.extend(new_video_keys) + + if self.output_format == "bytes": + from data_juicer.utils.mm_utils import load_file_byte + + split_videos = [load_file_byte(f) for f in split_video_keys] + else: + split_videos = split_video_keys + + if self.save_field: + split_sample[self.save_field] = split_videos + else: + split_sample[self.video_key] = split_videos return [split_sample] def process_batched(self, samples): diff --git a/data_juicer/ops/mapper/video_trajectory_overlay_mapper.py 
b/data_juicer/ops/mapper/video_trajectory_overlay_mapper.py new file mode 100644 index 0000000000..70fbb7d2f6 --- /dev/null +++ b/data_juicer/ops/mapper/video_trajectory_overlay_mapper.py @@ -0,0 +1,517 @@ +import os + +import cv2 +import numpy as np +from loguru import logger + +from data_juicer.utils.constant import Fields, MetaKeys + +from ..base_op import OPERATORS, Mapper + +OP_NAME = "video_trajectory_overlay_mapper" + + +@OPERATORS.register_module(OP_NAME) +class VideoTrajectoryOverlayMapper(Mapper): + """Prepare VLM-ready frames by sampling and overlaying hand trajectories. + + Implements the visualization step from paper https://arxiv.org/pdf/2510.21571: + + "From each segment, we evenly sample 8 frames and highlight hand + trajectories on each frame by projecting the world-space trajectory + of the hand palm from the current frame to the end of the clip." + + For each atomic action segment (output of + ``VideoAtomicActionSegmentMapper``), this operator: + + 1. Evenly samples ``n_sample_frames`` frames from the segment. + 2. For each sampled frame, projects the **future** world-space wrist + trajectory (from the current frame to the end of the segment) onto + the image using camera intrinsics and cam_c2w. + 3. Draws the trajectory as a colored line with a dot at the current + wrist position. + 4. Saves the overlay images and stores their paths in the segment. + + The output is written back into each segment dict under + ``"overlay_frames"``, ready to be consumed by the VLM captioning + operator. + """ + + # MANO joint index for palm center (middle finger MCP). 
+ # Paper §3.3: "trajectory of the hand palm" + PALM_JOINT_INDEX = 9 + + def __init__( + self, + segment_field: str = "atomic_action_segments", + camera_pose_field: str = MetaKeys.video_camera_pose_tags, + moge_field: str = MetaKeys.camera_calibration_moge_tags, + frame_field: str = MetaKeys.video_frames, + save_dir: str = None, + n_sample_frames: int = 8, + palm_joint_index: int = 9, + dot_radius: int = 10, + line_thickness: int = 4, + trajectory_alpha: float = 0.7, + *args, + **kwargs, + ): + """ + Initialization method. + + :param segment_field: Meta field storing atomic action segments. + :param camera_pose_field: Meta field storing camera pose (cam_c2w). + :param moge_field: Meta field storing MoGe calibration (for fov_x). + :param frame_field: Field storing frame image paths. + :param save_dir: Directory to save overlay images. If None, uses + a temp directory derived from the first frame path. + :param n_sample_frames: Number of frames to evenly sample from + each segment. + :param palm_joint_index: MANO joint index for the palm position. + Default 9 = middle finger MCP (palm center proxy). + Joint 0 = wrist root. + :param dot_radius: Radius of the dot at the current wrist position. + :param line_thickness: Thickness of the trajectory line. + :param trajectory_alpha: Alpha blending for the trajectory overlay. 
+ """ + super().__init__(*args, **kwargs) + self.segment_field = segment_field + self.camera_pose_field = camera_pose_field + self.moge_field = moge_field + self.frame_field = frame_field + self.save_dir = save_dir + self.n_sample_frames = n_sample_frames + self.palm_joint_index = palm_joint_index + self.dot_radius = dot_radius + self.line_thickness = line_thickness + self.trajectory_alpha = trajectory_alpha + + # ------------------------------------------------------------------ + # Projection helpers + # ------------------------------------------------------------------ + @staticmethod + def _world_to_camera( + pos_world: np.ndarray, + cam_c2w: np.ndarray, + ) -> np.ndarray: + """Convert world position(s) to camera space. + + Args: + pos_world: (..., 3) world positions. + cam_c2w: (4, 4) camera-to-world transform. + + Returns: + (..., 3) camera-space positions. + """ + R = cam_c2w[:3, :3] + t = cam_c2w[:3, 3] + # cam = R^T @ (world - t) + return (pos_world - t) @ R # equivalent to (R.T @ (p - t).T).T + + @staticmethod + def _project_to_2d( + pos_cam: np.ndarray, + width: int, + height: int, + K: np.ndarray = None, + fov_x: float = None, + ) -> np.ndarray: + """Project camera-space positions to 2D pixel coords. + + Args: + pos_cam: (..., 3) camera-space positions. + width: image width. + height: image height. + K: (3, 3) intrinsics matrix (preferred). If provided, fov_x + is ignored and fx, fy, cx, cy are taken from K directly. + fov_x: horizontal field of view in radians (fallback when K + is not available). + + Returns: + (..., 2) pixel coordinates (u, v). 
+ """ + if K is not None: + K = np.asarray(K, dtype=np.float64) + fx = K[0, 0] + fy = K[1, 1] + cx = K[0, 2] + cy = K[1, 2] + elif fov_x is not None: + fx = width / (2.0 * np.tan(fov_x / 2.0)) + fy = fx + cx, cy = width / 2.0, height / 2.0 + else: + raise ValueError("Either K or fov_x must be provided") + + z = pos_cam[..., 2] + z_safe = np.where(np.abs(z) < 1e-6, 1e-6, z) + u = fx * pos_cam[..., 0] / z_safe + cx + v = fy * pos_cam[..., 1] / z_safe + cy + return np.stack([u, v], axis=-1) + + # ------------------------------------------------------------------ + # Drawing helpers + # ------------------------------------------------------------------ + @staticmethod + def _temporal_color(t: float) -> tuple: + """Map normalized time t ∈ [0, 1] to BGR color along + blue → green → red gradient. + + t=0 (current) → blue, t=0.5 → green, t=1 (future end) → red. + """ + # BGR format + if t < 0.5: + # blue → green + ratio = t / 0.5 + b = int(255 * (1 - ratio)) + g = int(255 * ratio) + r = 0 + else: + # green → red + ratio = (t - 0.5) / 0.5 + b = 0 + g = int(255 * (1 - ratio)) + r = int(255 * ratio) + return (b, g, r) + + def _draw_trajectory( + self, + frame: np.ndarray, + points_2d: np.ndarray, + current_idx: int = 0, + ) -> np.ndarray: + """Draw trajectory line with blue→green→red temporal gradient + and a blue dot at the current palm position. + + Args: + frame: BGR image to draw on (modified in place). + points_2d: (N, 2) pixel coordinates of trajectory points. + current_idx: index of the current frame's position in + points_2d (drawn as a blue dot). 
+ """ + h, w = frame.shape[:2] + overlay = frame.copy() + + # Filter out-of-frame and behind-camera points, keep original index + valid = [] + valid_indices = [] + for i, pt in enumerate(points_2d): + if 0 <= pt[0] < w and 0 <= pt[1] < h: + valid.append((int(pt[0]), int(pt[1]))) + valid_indices.append(i) + elif valid: + # Keep trajectory continuous by clamping + valid.append( + ( + int(np.clip(pt[0], 0, w - 1)), + int(np.clip(pt[1], 0, h - 1)), + ) + ) + valid_indices.append(i) + + # Draw trajectory line with temporal color gradient + n_pts = len(points_2d) + if len(valid) >= 2: + for i in range(len(valid) - 1): + t = valid_indices[i] / max(n_pts - 1, 1) + line_color = self._temporal_color(t) + cv2.line( + overlay, + valid[i], + valid[i + 1], + line_color, + self.line_thickness, + lineType=cv2.LINE_AA, + ) + + # Draw current position as a blue dot + if current_idx < len(points_2d): + pt = points_2d[current_idx] + if 0 <= pt[0] < w and 0 <= pt[1] < h: + blue_bgr = (255, 100, 0) # blue in BGR + cv2.circle( + overlay, + (int(pt[0]), int(pt[1])), + self.dot_radius, + blue_bgr, + -1, + lineType=cv2.LINE_AA, + ) + # White border for visibility + cv2.circle( + overlay, + (int(pt[0]), int(pt[1])), + self.dot_radius + 1, + (255, 255, 255), + 1, + lineType=cv2.LINE_AA, + ) + + # Alpha blend + cv2.addWeighted( + overlay, + self.trajectory_alpha, + frame, + 1 - self.trajectory_alpha, + 0, + frame, + ) + return frame + + # ------------------------------------------------------------------ + # Process one segment + # ------------------------------------------------------------------ + def _process_segment( + self, + segment: dict, + all_frames: list[str], + cam_c2w_all: np.ndarray, + save_dir: str, + intrinsics_list: list = None, + fov_x: float = None, + file_prefix: str = "", + ) -> dict: + """Process a single segment: sample frames, overlay trajectory. + + Args: + intrinsics_list: per-frame (3,3) intrinsics matrices from MoGe. 
+ If provided, used for accurate projection (preferred). + fov_x: fallback horizontal FOV in radians when intrinsics_list + is not available. + file_prefix: prefix added to overlay filenames to avoid + collisions when multiple videos share the same save_dir. + + Returns the segment dict with ``overlay_frames`` added. + """ + hand_type = segment["hand_type"] + valid_fids = segment["valid_frame_ids"] + + # Use joints_world for the palm trajectory (paper §3.3). + # states[:, 0:3] is MANO's root transl, NOT the actual palm/wrist + # position — there is a significant offset (~10cm) between them. + joints_world = segment.get("joints_world") + if joints_world and len(joints_world) > 0: + jw_arr = np.asarray(joints_world, dtype=np.float64) + palm_positions = jw_arr[:, self.palm_joint_index, :] + else: + # Fallback to states (less accurate) + states = np.asarray(segment["states"], dtype=np.float64) + palm_positions = states[:, 0:3] + logger.debug( + f"No joints_world for {hand_type} segment, " f"falling back to states[:, 0:3]", + ) + + n = len(palm_positions) + + if n < 2: + segment["overlay_frames"] = [] + segment["sampled_frame_indices"] = [] + return segment + + # Evenly sample frame indices within the segment + if n <= self.n_sample_frames: + sample_indices = list(range(n)) + else: + sample_indices = np.linspace( + 0, + n - 1, + self.n_sample_frames, + dtype=int, + ).tolist() + + seg_id = segment.get("segment_id", 0) + overlay_paths = [] + + for local_idx in sample_indices: + fid = valid_fids[local_idx] + if fid >= len(all_frames) or not all_frames[fid]: + continue + + frame_path = all_frames[fid] + frame = cv2.imread(frame_path) + if frame is None: + continue + + h, w = frame.shape[:2] + + # Get future trajectory: from current frame to end of segment + future_positions = palm_positions[local_idx:] + + if fid >= len(cam_c2w_all): + continue + + # Project all future world positions using the CURRENT frame's + # camera (we are drawing on the current frame's image). 
+ cam = cam_c2w_all[fid] + + # Determine per-frame intrinsics for projection + frame_K = None + if intrinsics_list is not None and fid < len(intrinsics_list): + frame_K = np.asarray(intrinsics_list[fid], dtype=np.float64) + + points_2d_list = [] + for j in range(len(future_positions)): + pos_cam = self._world_to_camera(future_positions[j], cam) + pt_2d = self._project_to_2d( + pos_cam, + w, + h, + K=frame_K, + fov_x=fov_x, + ) + points_2d_list.append(pt_2d) + + if not points_2d_list: + continue + + points_2d = np.array(points_2d_list) + frame = self._draw_trajectory(frame, points_2d, 0) + + # Save overlay frame (prefix avoids collisions across videos) + fname = (f"{file_prefix}_" if file_prefix else "") + f"seg{seg_id}_{hand_type}_f{fid:06d}_overlay.jpg" + out_path = os.path.join(save_dir, fname) + cv2.imwrite(out_path, frame) + overlay_paths.append(out_path) + + segment["overlay_frames"] = overlay_paths + segment["sampled_frame_indices"] = [valid_fids[i] for i in sample_indices if i < len(valid_fids)] + return segment + + # ------------------------------------------------------------------ + # Main entry + # ------------------------------------------------------------------ + def _sample_prefix(self, sample: dict) -> str: + """Derive a short unique prefix from the sample's video path. + + Used to namespace overlay files so different videos sharing + the same save_dir do not overwrite each other. 
+ """ + videos = sample.get(self.video_key, []) + if videos: + v = videos[0] if isinstance(videos, list) else videos + return os.path.splitext(os.path.basename(v))[0] + + return "unknown" + + def process_single(self, sample=None, rank=None): + if Fields.meta not in sample: + return sample + + meta = sample[Fields.meta] + segments = meta.get(self.segment_field) + if not segments: + return sample + + # Get frame paths + frame_data = sample.get(self.frame_field, []) + if not frame_data: + return sample + all_frames = ( + frame_data[0] + if isinstance(frame_data, list) and frame_data and isinstance(frame_data[0], list) + else frame_data + ) + + # Get cam_c2w + cam_pose_list = meta.get(self.camera_pose_field, []) + if not cam_pose_list: + logger.warning("No camera pose data for trajectory overlay.") + return sample + + from data_juicer.utils.constant import CameraCalibrationKeys + from data_juicer.utils.file_utils import load_numpy + + cam_pose = cam_pose_list[0] if isinstance(cam_pose_list, list) else cam_pose_list + raw_c2w = cam_pose.get(CameraCalibrationKeys.cam_c2w) + if raw_c2w is None: + logger.warning("No cam_c2w for trajectory overlay.") + return sample + cam_c2w_all = np.asarray(load_numpy(raw_c2w), dtype=np.float64) + + # Get camera intrinsics (prefer full K matrix, fallback to fov_x) + intrinsics_list, fov_x = self._get_intrinsics(meta) + if intrinsics_list is None and fov_x is None: + logger.warning( + "Cannot determine camera intrinsics, skipping overlay.", + ) + return sample + + # Determine save directory + save_dir = self.save_dir + if save_dir is None and all_frames: + save_dir = os.path.join( + os.path.dirname(all_frames[0]), + "trajectory_overlays", + ) + if save_dir: + os.makedirs(save_dir, exist_ok=True) + + # Unique prefix to avoid filename collisions across videos + prefix = self._sample_prefix(sample) + + # Process each segment + for i, seg in enumerate(segments): + try: + segments[i] = self._process_segment( + seg, + all_frames, + cam_c2w_all, 
+ save_dir, + intrinsics_list=intrinsics_list, + fov_x=fov_x, + file_prefix=prefix, + ) + except Exception as e: + logger.warning( + f"Trajectory overlay failed for segment {i}: {e}", + ) + seg["overlay_frames"] = [] + seg["sampled_frame_indices"] = [] + + meta[self.segment_field] = segments + return sample + + def _get_intrinsics(self, meta: dict) -> tuple: + """Extract camera intrinsics for projection. + + Returns: + (intrinsics_list, fov_x): intrinsics_list is a per-frame list + of (3,3) K matrices if available (preferred), otherwise None. + fov_x is a scalar fallback FOV in radians. + At least one of them will be non-None if calibration data exists. + """ + from data_juicer.utils.constant import CameraCalibrationKeys + + intrinsics_list = None + fov_x = None + + # Try MoGe calibration — prefer full intrinsics matrix K + moge_list = meta.get(self.moge_field, []) + if moge_list: + moge = moge_list[0] if isinstance(moge_list, list) else moge_list + if isinstance(moge, dict): + # Prefer per-frame intrinsics K matrix + K_list = moge.get(CameraCalibrationKeys.intrinsics) + if K_list and isinstance(K_list, list) and len(K_list) > 0: + intrinsics_list = K_list + + # Also get hfov as fallback + hfov = moge.get(CameraCalibrationKeys.hfov) + if hfov is not None: + if isinstance(hfov, list) and hfov: + fov_x = float(np.median(hfov)) + else: + fov_x = float(hfov) + + # Try HaWoR fov_x (HaWoR uses median of MoGe hfov, most consistent) + if fov_x is None: + hawor_field = MetaKeys.hand_reconstruction_hawor_tags + hawor_list = meta.get(hawor_field, []) + if hawor_list: + hawor = hawor_list[0] if isinstance(hawor_list, list) else hawor_list + if isinstance(hawor, dict): + hawor_fov = hawor.get("fov_x") + if hawor_fov is not None: + fov_x = float(hawor_fov) + + return intrinsics_list, fov_x diff --git a/data_juicer/ops/mapper/video_undistort_mapper.py b/data_juicer/ops/mapper/video_undistort_mapper.py index 0a297a2d0a..83e08cc056 100644 --- 
a/data_juicer/ops/mapper/video_undistort_mapper.py +++ b/data_juicer/ops/mapper/video_undistort_mapper.py @@ -1,10 +1,8 @@ import os -import subprocess import numpy as np -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE -from data_juicer.utils.constant import Fields, MetaKeys +from data_juicer.utils.constant import CameraCalibrationKeys, Fields, MetaKeys from data_juicer.utils.lazy_loader import LazyLoader from ..base_op import OPERATORS, Mapper @@ -15,6 +13,21 @@ ffmpeg = LazyLoader("ffmpeg", "ffmpeg-python") +def get_global_intrinsics(ks): + fx = ks[:, 0, 0] + fy = ks[:, 1, 1] + cx = ks[:, 0, 2] + cy = ks[:, 1, 2] + + global_k = np.eye(3) + global_k[0, 0] = np.median(fx) + global_k[1, 1] = np.median(fy) + global_k[0, 2] = np.median(cx) + global_k[1, 2] = np.median(cy) + + return global_k + + @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoUndistortMapper(Mapper): @@ -23,8 +36,9 @@ class VideoUndistortMapper(Mapper): def __init__( self, - output_video_dir: str = DATA_JUICER_ASSETS_CACHE, - tag_field_name: str = MetaKeys.video_undistortion_tags, + output_video_dir: str = None, + undistorted_video_field: str = MetaKeys.undistorted_video, + camera_calibration_field: str = "camera_calibration", batch_size_each_video: int = 1000, crf: int = 22, *args, @@ -34,47 +48,18 @@ def __init__( Initialization method. :param output_video_dir: Output directory to save undistorted videos. - :param tag_field_name: The field name to store the tags. It's - "video_undistortion_tags" in default. + :param undistorted_video_field: The field name to store the undistorted video paths. It's + "undistorted_video" in default. + :param camera_calibration_field: The field name where the camera calibration info is stored. :param batch_size_each_video: Number of frames to process and save per temporary TS file batch. :param crf: Constant Rate Factor (CRF) for FFmpeg encoding quality. 
:param args: extra args :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - # check if only opencv-contrib-python is installed - opencv_packages = subprocess.run(["pip", "list"], capture_output=True, text=True, check=True) - installed_opencv = [ - line.split()[0] for line in opencv_packages.stdout.splitlines() if line.startswith("opencv") - ] - - # if not, uninstall all opencv-related modules and reinstall opencv-contrib-python - if set(installed_opencv) != {"opencv-contrib-python"}: - # uninstall all opencv-related modules - subprocess.run( - [ - "pip", - "uninstall", - "-y", - "opencv-python", - "opencv-python-headless", - "opencv-contrib-python", - "opencv-contrib-python-headless", - ], - check=False, - ) - - # reinstall opencv-contrib-python - LazyLoader.check_packages(["opencv-contrib-python"]) - - # fix the version of numpy - subprocess.run(["pip", "install", "numpy==1.26.4"], check=True) - - cv2 = LazyLoader("cv2", "opencv-contrib-python") + import cv2 self.VideoCapture = cv2.VideoCapture self.CAP_PROP_FRAME_HEIGHT = cv2.CAP_PROP_FRAME_HEIGHT @@ -89,9 +74,13 @@ def __init__( self.COLOR_BGR2RGB = cv2.COLOR_BGR2RGB self.output_video_dir = output_video_dir - self.tag_field_name = tag_field_name + assert self.output_video_dir is not None, "output_video_dir must be specified to save the undistorted videos." 
+ os.makedirs(self.output_video_dir, exist_ok=True) + + self.undistorted_video_field = undistorted_video_field self.batch_size_each_video = batch_size_each_video self.crf = crf + self.camera_calibration_field = camera_calibration_field def concatenate_ts_files(self, folder, video_name, batch_counts): """Concatenate batch TS files into final mp4.""" @@ -104,8 +93,10 @@ def concatenate_ts_files(self, folder, video_name, batch_counts): # Merge using ffmpeg concat demuxer ffmpeg.input(inputs_path, format="concat", safe=0).output( - os.path.join(folder, f"{video_name}.mp4"), c="copy" - ).run() + os.path.join(folder, f"{video_name}.mp4"), + c="copy", + movflags="frag_keyframe+empty_moov", + ).run(overwrite_output=True) # Cleanup temporary TS files and list file for i in range(batch_counts): @@ -131,6 +122,7 @@ def create_ffmpeg_writer(self, output_path, width, height, fps, crf): "c:v": "libx264", "crf": str(crf), "r": fps, + "movflags": "frag_keyframe+empty_moov", }, ) .overwrite_output() @@ -138,116 +130,143 @@ def create_ffmpeg_writer(self, output_path, width, height, fps, crf): ) def process_single(self, sample, context=False): - # check if it's generated already - if self.tag_field_name in sample[Fields.meta]: + if self.undistorted_video_field in sample: return sample # there is no videos in this sample if self.video_key not in sample or not sample[self.video_key]: return [] - cap = self.VideoCapture(sample[self.video_key][0]) - video_name = os.path.splitext(os.path.basename(sample[self.video_key][0]))[0] - - # Get video properties - height = int(cap.get(self.CAP_PROP_FRAME_HEIGHT)) - width = int(cap.get(self.CAP_PROP_FRAME_WIDTH)) - fps = cap.get(self.CAP_PROP_FPS) - - if "intrinsics" not in sample or sample["intrinsics"] is None: - raise ValueError("The sample must include an 'intrinsics' field to store the 3x3 camera intrinsics matrix.") - - if "xi" not in sample or sample["xi"] is None: - raise ValueError("The sample must include an 'xi' field to store the 
parameter xi in CMei's model.") - - K = sample["intrinsics"] # 3x3 camera intrinsics. - D = sample.get( - "distortion_coefficients", None - ) # Distortion coefficients (k1,k2,p1,p2). If D is None then zero distortion is used. - xi = sample["xi"] # The parameter xi for CMei's model. - R = sample.get( - "rotation_matrix", None - ) # Rotation transform between the original and object space. If it is None, there is no rotation. - new_K = sample.get( - "intrinsics_new", None - ) # New camera intrinsics. if new_K is empty then identity intrinsics are used. - - K = np.array(K, dtype=np.float32) - xi = np.array(xi, dtype=np.float32) - - if D is None: - D = np.array([0, 0, 0, 0], dtype=np.float32) - else: - D = np.array(D, dtype=np.float32) - - if R is None: - R = np.eye(3) - else: - R = np.array(R, dtype=np.float32) - - if new_K is None: - new_K = K - else: - new_K = np.array(new_K, dtype=np.float32) - - map1, map2 = self.omnidir.initUndistortRectifyMap( - K, D, xi, R, new_K, (width, height), self.CV_16SC2, self.omnidir.RECTIFY_PERSPECTIVE - ) + camera_calibration_field = self.camera_calibration_field + intrinsics_field = CameraCalibrationKeys.intrinsics + xi_field = CameraCalibrationKeys.xi + dist_coeffs_field = CameraCalibrationKeys.dist_coeffs + rotation_matrix_field = CameraCalibrationKeys.rectify_R + new_intrinsics_field = CameraCalibrationKeys.new_intrinsics - # Initialize the first batch ffmpeg writer - os.makedirs(self.output_video_dir, exist_ok=True) - batch_number = 0 - writer = self.create_ffmpeg_writer( - os.path.join(self.output_video_dir, f"{video_name}_b{batch_number:04d}.ts"), width, height, fps, self.crf - ) + sample[self.undistorted_video_field] = [] + + for video_idx in range(len(sample[self.video_key])): + cur_video_calibration = sample[Fields.meta][camera_calibration_field][video_idx] + if not cur_video_calibration.get(intrinsics_field): + raise ValueError( + f"The sample must include an '{intrinsics_field}' field to store the 3x3 camera intrinsics 
matrix." + ) + + if not cur_video_calibration.get(xi_field): + raise ValueError( + f"The sample must include an '{xi_field}' field to store the parameter xi in CMei's model." + ) - idx = 0 - # Read and process frames - while True: - ret, frame = cap.read() - if not ret: - # End of video stream: close the last writer - writer.stdin.close() - writer.wait() - break - - # Undistort the frame - undistorted_frame = self.remap( - frame, map1, map2, interpolation=self.INTER_CUBIC, borderMode=self.BORDER_CONSTANT + video_path = sample[self.video_key][video_idx] + cap = self.VideoCapture(video_path) + video_name = os.path.splitext(os.path.basename(video_path))[0] + + # Get video properties + height = int(cap.get(self.CAP_PROP_FRAME_HEIGHT)) + width = int(cap.get(self.CAP_PROP_FRAME_WIDTH)) + fps = cap.get(self.CAP_PROP_FPS) + + K = cur_video_calibration.get(intrinsics_field) # 3x3 camera intrinsics. + xi = cur_video_calibration.get(xi_field) # The parameter xi for CMei's model. + + D = cur_video_calibration.get( + dist_coeffs_field, None + ) # Distortion coefficients (k1,k2,p1,p2). If D is None then zero distortion is used. + R = cur_video_calibration.get( + rotation_matrix_field, None + ) # Rotation transform between the original and object space. If it is None, there is no rotation. + new_K = cur_video_calibration.get( + new_intrinsics_field, None + ) # New camera intrinsics. if new_K is empty then identity intrinsics are used. 
+ + K = np.array(K, dtype=np.float32) + xi = np.array(xi, dtype=np.float32) + + # frames k + if len(K.shape) == 3: + K = get_global_intrinsics(K) + if len(xi) > 1: + + xi = np.median(xi) + + xi = np.array([xi], dtype=np.float64) + + if D is None: + D = np.array([0, 0, 0, 0], dtype=np.float32) + else: + D = np.array(D, dtype=np.float32) + + if R is None: + R = np.eye(3) + else: + R = np.array(R, dtype=np.float32) + + if new_K is None: + new_K = K + else: + new_K = np.array(new_K, dtype=np.float32) + + map1, map2 = self.omnidir.initUndistortRectifyMap( + K, D, xi, R, new_K, (width, height), self.CV_16SC2, self.omnidir.RECTIFY_PERSPECTIVE ) - # Convert BGR to RGB before writing to ffmpeg (FFmpeg expects RGB) - undistorted_frame = self.cvtColor(undistorted_frame, self.COLOR_BGR2RGB) - - # Write to ffmpeg stdin - writer.stdin.write(undistorted_frame.tobytes()) - - # Check if the current batch is complete (for idx + 1) - if (idx + 1) % self.batch_size_each_video == 0: - # Finalize the current batch writer - writer.stdin.close() - writer.wait() - - # Start the next batch writer - batch_number += 1 - writer = self.create_ffmpeg_writer( - os.path.join(self.output_video_dir, f"{video_name}_b{batch_number:04d}.ts"), - width, - height, - fps, - self.crf, + # Initialize the first batch ffmpeg writer + batch_number = 0 + writer = self.create_ffmpeg_writer( + os.path.join(self.output_video_dir, f"{video_name}_b{batch_number:04d}.ts"), + width, + height, + fps, + self.crf, + ) + + idx = 0 + # Read and process frames + while True: + ret, frame = cap.read() + if not ret: + # End of video stream: close the last writer + writer.stdin.close() + writer.wait() + break + + # Undistort the frame + undistorted_frame = self.remap( + frame, map1, map2, interpolation=self.INTER_CUBIC, borderMode=self.BORDER_CONSTANT ) - idx += 1 + # Convert BGR to RGB before writing to ffmpeg (FFmpeg expects RGB) + undistorted_frame = self.cvtColor(undistorted_frame, self.COLOR_BGR2RGB) + + # Write to ffmpeg 
stdin + writer.stdin.write(undistorted_frame.tobytes()) + + # Check if the current batch is complete (for idx + 1) + if (idx + 1) % self.batch_size_each_video == 0: + # Finalize the current batch writer + writer.stdin.close() + writer.wait() + + # Start the next batch writer + batch_number += 1 + writer = self.create_ffmpeg_writer( + os.path.join(self.output_video_dir, f"{video_name}_b{batch_number:04d}.ts"), + width, + height, + fps, + self.crf, + ) + + idx += 1 - cap.release() + cap.release() - # Merge all temporary TS chunks into the final MP4 file - self.concatenate_ts_files(self.output_video_dir, video_name, batch_number + 1) + # Merge all temporary TS chunks into the final MP4 file + self.concatenate_ts_files(self.output_video_dir, video_name, batch_number + 1) + out_video = os.path.join(self.output_video_dir, f"{video_name}.mp4") - sample[Fields.meta][self.tag_field_name] = { - "new_video_path": os.path.join(self.output_video_dir, f"{video_name}.mp4") - } + sample[self.undistorted_video_field].append(out_video) return sample diff --git a/data_juicer/utils/constant.py b/data_juicer/utils/constant.py index 73fd3c93e3..2e54c0a92e 100644 --- a/data_juicer/utils/constant.py +++ b/data_juicer/utils/constant.py @@ -86,14 +86,18 @@ class MetaKeys(object): # # pose information pose_info = "pose_info" - # # Static Camera Calibration Info (for DeepCalib) - static_camera_calibration_deepcalib_tags = "static_camera_calibration_deepcalib_tags" - # # Static Camera Calibration Info (for Moge-2) - static_camera_calibration_moge_tags = "static_camera_calibration_moge_tags" + # # Camera Calibration Info (for DeepCalib) + camera_calibration_deepcalib_tags = "camera_calibration_deepcalib_tags" + # # Camera Calibration Info (for Moge-2) + camera_calibration_moge_tags = "camera_calibration_moge_tags" # # Video Undistortion Info - video_undistortion_tags = "video_undistortion_tags" + undistorted_video = "undistorted_video" # # Camera Pose Info video_camera_pose_tags = 
"video_camera_pose_tags" + # # Camera Calibration Info (for DroidCalib) + camera_calibration_droidcalib_tags = "camera_calibration_droidcalib_tags" + # # Hand action (state + action) computed from hand recon + camera pose + hand_action_tags = "hand_action_tags" # === info extraction related tags === # # for event extraction @@ -365,3 +369,17 @@ class JobRequiredKeys(Enum): local = "local" dj_configs = "dj_configs" extra_configs = "extra_configs" + + +class CameraCalibrationKeys(object): + intrinsics = "intrinsics" + xi = "xi" + hfov = "hfov" + vfov = "vfov" + points = "points" + mask = "mask" + depth = "depth" + cam_c2w = "cam_c2w" + dist_coeffs = "dist_coeffs" + rectify_R = "rectify_R" # or "rotation_matrix" + new_intrinsics = "new_intrinsics" diff --git a/data_juicer/utils/file_utils.py b/data_juicer/utils/file_utils.py index 625eb0f631..a040f6c408 100644 --- a/data_juicer/utils/file_utils.py +++ b/data_juicer/utils/file_utils.py @@ -18,6 +18,32 @@ from data_juicer.utils.constant import DEFAULT_PREFIX, Fields +def load_numpy(value): + """Load a numpy array that may be stored inline or as a .npy file path. + + :param value: Either a numpy ndarray (returned as-is) or a string + path to a ``.npy`` file. + :returns: numpy ndarray + """ + import numpy as np + + if isinstance(value, np.ndarray): + return value + if isinstance(value, str): + return np.load(value) + # Fallback: convert lists / other array-like to ndarray + return np.asarray(value) + + +def load_numpy_list(values): + """Load a list of numpy fields (each may be an array or a path). + + :param values: list of numpy arrays or string paths to .npy files. 
+ :returns: list of numpy ndarrays + """ + return [load_numpy(v) for v in values] + + class Sizes: KiB = 2**10 # 1024 MiB = 2**20 # 1024*1024 diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index 6a58058c5a..6d413c64b0 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -425,8 +425,9 @@ def get_processor(): def prepare_deepcalib_model(model_path, **model_params): device = model_params.pop("device", None) - if device is None: - raise ValueError("video_camera_calibration_static_deepcalib_mapper currently supports GPU usage only.") + if device is None or device == "cpu": + raise ValueError("CUDA device must be specified for deepcalib model.") + device = device.replace("cuda", "/gpu") if not os.path.exists(model_path): @@ -629,7 +630,7 @@ def prepare_huggingface_model( return (model, processor) if return_model else processor -def prepare_hawor_model(hawor_model_path, hawor_config_path, mano_right_path, **model_params): +def prepare_hawor_model(hawor_model_path, hawor_config_path, mano_right_path, mano_left_path=None, **model_params): device = model_params.pop("device", "cpu") @@ -652,13 +653,13 @@ def prepare_hawor_model(hawor_model_path, hawor_config_path, mano_right_path, ** hawor_model_dir = os.path.join(DJMC, "HaWor") os.makedirs(hawor_model_dir, exist_ok=True) hawor_model_path = os.path.join(hawor_model_dir, "hawor.ckpt") - subprocess.run(["wget", BACKUP_MODEL_LINKS["hawor_model_path"], hawor_model_path], check=True) + subprocess.run(["wget", BACKUP_MODEL_LINKS["hawor_model_path"], "-O", hawor_model_path], check=True) if not os.path.exists(hawor_config_path): hawor_model_dir = os.path.join(DJMC, "HaWor") os.makedirs(hawor_model_dir, exist_ok=True) hawor_config_path = os.path.join(hawor_model_dir, "model_config.yaml") - subprocess.run(["wget", BACKUP_MODEL_LINKS["hawor_config_path"], hawor_config_path], check=True) + subprocess.run(["wget", BACKUP_MODEL_LINKS["hawor_config_path"], "-O", 
hawor_config_path], check=True) model_cfg = get_config(hawor_config_path, update_cachedir=True) @@ -683,9 +684,13 @@ def prepare_hawor_model(hawor_model_path, hawor_config_path, mano_right_path, ** from data_juicer.ops.common.mano_func import MANO - mano_model = MANO(model_path=mano_right_path).to(device) + mano_right_model = MANO(model_path=mano_right_path).to(device) + + mano_left_model = None + if mano_left_path and os.path.exists(mano_left_path): + mano_left_model = MANO.build_left(model_path=mano_left_path).to(device) - return hawor_model, model_cfg, mano_model + return hawor_model, model_cfg, mano_right_model, mano_left_model def prepare_kenlm_model(lang, name_pattern="{}.arpa.bin", **model_params): diff --git a/data_juicer/utils/video_utils.py b/data_juicer/utils/video_utils.py index 2218787f8e..1ff1d32649 100644 --- a/data_juicer/utils/video_utils.py +++ b/data_juicer/utils/video_utils.py @@ -126,6 +126,80 @@ def extract_clip( """ raise NotImplementedError + def extract_frames_uniformly(self, frame_num: int) -> List[np.ndarray]: + """Extract a number of frames uniformly distributed across the video. + + :param frame_num: Number of frames to extract. If 1, the middle frame + is extracted. If 2, the first and last frames. If >2, frames are + spaced uniformly including the first and last. + :return: A list of numpy arrays (RGB frames). + """ + from loguru import logger + + duration = self.metadata.duration + total_frames = self.metadata.num_frames + + if total_frames < frame_num: + logger.warning( + "Number of frames to be extracted is larger than the " + "total number of frames in this video. Set it to the " + "total number of frames." 
+ ) + frame_num = total_frames + + if frame_num == 1: + extract_seconds = [duration / 2] + else: + step = duration / (frame_num - 1) + extract_seconds = [step * i for i in range(frame_num)] + + frames = [] + for ts in extract_seconds: + for frame in self.extract_frames(start_time=ts): + frames.append(frame) + break + return frames + + def extract_frames_uniformly_by_seconds(self, frame_num: int, duration: float) -> List[np.ndarray]: + """Extract frames uniformly from each segment of given duration. + + The video is split into segments of the specified duration, and + ``frame_num`` frames are extracted uniformly from each segment. + Only full-length segments are processed (matching AV behavior); + a trailing partial segment shorter than ``duration`` is skipped. + + :param frame_num: Number of frames per segment. + :param duration: Duration of each segment in seconds. + :return: A list of numpy arrays (RGB frames). + """ + video_duration = self.metadata.duration + # np.arange excludes the endpoint, so partial trailing segments + # shorter than `duration` are naturally skipped. 
+ timestamps = np.arange(0, video_duration, duration).tolist() + + all_frames = [] + for i in range(1, len(timestamps)): + seg_start = timestamps[i - 1] + seg_end = timestamps[i] + seg_duration = seg_end - seg_start + + if seg_duration <= 0: + continue + + # Calculate uniform timestamps within this segment + if frame_num == 1: + seg_seconds = [seg_start + seg_duration / 2] + else: + seg_step = seg_duration / (frame_num - 1) + seg_seconds = [seg_start + seg_step * j for j in range(frame_num)] + + for ts in seg_seconds: + for frame in self.extract_frames(start_time=ts): + all_frames.append(frame) + break + + return all_frames + def check_time_span( self, start_time: Optional[float] = 0.0, @@ -652,6 +726,77 @@ def read_stderr(): return Frames(frames=key_frames, indices=list(frame_indices), pts_time=list(pts_time)) + def extract_frames_uniformly(self, frame_num: int) -> List[np.ndarray]: + """Extract frames uniformly using ffmpeg select filter (single pass). + + Overrides the base class to use a single ffmpeg process with the + ``select`` filter, which is much more efficient than seeking for each + frame individually. + """ + from loguru import logger + + total_frames = self.metadata.num_frames + duration = self.metadata.duration + fps = self.metadata.fps + + if total_frames < frame_num: + logger.warning( + "Number of frames to be extracted is larger than the " + "total number of frames in this video. Set it to the " + "total number of frames." 
+ ) + frame_num = total_frames + + if frame_num == 1: + extract_seconds = [duration / 2] + else: + step = duration / (frame_num - 1) + extract_seconds = [step * i for i in range(frame_num)] + + # Convert timestamps to frame indices + frame_indices = [] + for ts in extract_seconds: + idx = min(int(ts * fps), max(total_frames - 1, 0)) + frame_indices.append(idx) + + # Build select filter expression + select_expr = "+".join(f"eq(n\\,{idx})" for idx in frame_indices) + + cmd = [ + "ffmpeg", + "-v", + "quiet", + "-i", + self.video_path, + "-vf", + f"select='{select_expr}'", + "-vsync", + "vfr", + "-f", + "rawvideo", + "-pix_fmt", + self.frame_format, + "-", + ] + + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + w, h = self.metadata.width, self.metadata.height + frame_size = w * h * 3 + frames = [] + + try: + while len(frames) < frame_num: + raw = process.stdout.read(frame_size) + if len(raw) < frame_size: + break + frame = np.frombuffer(raw, dtype=np.uint8).reshape((h, w, 3)) + frames.append(frame) + finally: + self._kill_process(process) + + return frames + def extract_clip(self, start_time, end_time, output_path: str = None, to_numpy=True, **kwargs): """ Extract a clip from the video based on the start and end time. 
diff --git a/demos/ego_hand_action_annotation/Dockerfile b/demos/ego_hand_action_annotation/Dockerfile new file mode 100644 index 0000000000..e59c7a6d43 --- /dev/null +++ b/demos/ego_hand_action_annotation/Dockerfile @@ -0,0 +1,90 @@ +FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-devel + +LABEL maintainer="data-juicer" +LABEL description="Data-Juicer VLA Pipeline pre-built image with MoGe-2, HaWoR, MegaSaM, PyTorch3D support" + +ENV DEBIAN_FRONTEND=noninteractive +ENV TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" +ENV FORCE_CUDA=1 + +# init conda for non-interactive use +RUN conda init bash +SHELL ["/bin/bash", "-c"] + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + wget \ + curl \ + ffmpeg \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + build-essential \ + ninja-build \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir py-data-juicer +RUN pip install --no-cache-dir -U "ray[default]" + +RUN git clone --depth 1 https://github.com/ThunderVVV/HaWoR.git /root/.cache/data_juicer/assets/HaWoR + +# modify requirements.txt: +# - remove chumpy (need --no-build-isolation, install separately) +# - remove aitviewer / moderngl-window / pyrender (OpenGL rendering, no head environment needed) +# - relax torch-scatter version locking (2.1.2 may be incompatible with PyTorch 2.4.1) +# - relax numpy version locking (to avoid conflict with PyTorch 2.4.1) +RUN cd /root/.cache/data_juicer/assets/HaWoR \ + && sed -i '/^chumpy/d' requirements.txt \ + && sed -i '/^aitviewer/d' requirements.txt \ + && sed -i '/^moderngl-window/d' requirements.txt \ + && sed -i '/^pyrender/d' requirements.txt \ + && sed -i 's/^torch-scatter==.*/# &/' requirements.txt \ + && sed -i 's/^numpy==.*/numpy/' requirements.txt + +RUN pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-2.4.1+cu124.html + +# MoGe-2 camera calibration +RUN pip install --no-cache-dir "moge @ git+https://github.com/microsoft/MoGe.git" + 
+# HaWoR dependencies (install from requirements.txt) +RUN pip install --no-cache-dir -r /root/.cache/data_juicer/assets/HaWoR/requirements.txt +# chumpy needs --no-build-isolation, install separately +RUN pip install --no-cache-dir --no-build-isolation "chumpy @ git+https://github.com/mattloper/chumpy" + +RUN pip install --no-cache-dir scipy pyarrow av opencv-contrib-python Pillow +RUN pip install --no-cache-dir openai +RUN pip install numpy==1.26.4 +RUN pip install opencv-python==4.10.0.84 opencv-contrib-python==4.10.0.84 +RUN pip install --upgrade --force-reinstall click + +# ===============create mega-sam conda environment================ +RUN conda create -n mega-sam --clone base -y + +# clone mega-sam repository and patch source code +RUN git clone --recursive https://github.com/mega-sam/mega-sam.git /root/.cache/data_juicer/assets/mega-sam \ + && cd /root/.cache/data_juicer/assets/mega-sam \ + # patch .type() -> .scalar_type() to be compatible with new PyTorch + && sed -i 's/\.type()/\.scalar_type()/g' \ + base/src/altcorr_kernel.cu \ + base/src/correlation_kernels.cu \ + base/src/droid_kernels.cu \ + base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu \ + base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp \ + # patch setup.py: add L20(8.9) and H20(9.0) CUDA arch support + && sed -i "/compute_86,code=sm_86/a\ '-gencode=arch=compute_89,code=sm_89',\n '-gencode=arch=compute_90,code=sm_90'," base/setup.py + +# build droid_backends + lietorch (CUDA compile, time-consuming) +RUN cd /root/.cache/data_juicer/assets/mega-sam/base && conda run -n mega-sam python setup.py install + +RUN conda run -n mega-sam pip install --force-reinstall pydantic pydantic-core typing-extensions +RUN conda run -n mega-sam pip install numpy==1.26.4 +RUN conda run -n mega-sam pip install opencv-python==4.10.0.84 opencv-contrib-python==4.10.0.84 +RUN conda run -n mega-sam pip install --upgrade --force-reinstall click + +RUN conda clean -afy && pip cache purge 2>/dev/null || true + 
+WORKDIR /workspace + +ENTRYPOINT ["python"] diff --git a/demos/ego_hand_action_annotation/Dockerfile.uv b/demos/ego_hand_action_annotation/Dockerfile.uv new file mode 100644 index 0000000000..65b7bdf134 --- /dev/null +++ b/demos/ego_hand_action_annotation/Dockerfile.uv @@ -0,0 +1,135 @@ +# ============================================================ +# Environment: +# /opt/venv/ - main environment, install all shared dependencies (excluding modified droid_backends) +# /opt/megasam-ext/ - MegaSaM modified droid_backends + lietorch +# ============================================================ +FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + +LABEL maintainer="data-juicer" +LABEL description="Data-Juicer VLA Pipeline" + +ENV DEBIAN_FRONTEND=noninteractive PYTHONUNBUFFERED=1 PYTHONIOENCODING=utf-8 LANG=C.UTF-8 + +# public +# ENV PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ PIP_TRUSTED_HOST=mirrors.aliyun.com UV_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ +# internal +ENV PIP_INDEX_URL=http://mirrors.cloud.aliyuncs.com/pypi/simple/ PIP_TRUSTED_HOST=mirrors.cloud.aliyuncs.com UV_INDEX_URL=http://mirrors.cloud.aliyuncs.com/pypi/simple/ + +ENV PIP_DEFAULT_TIMEOUT=1800 HTTP_TIMEOUT=1800 UV_HTTP_TIMEOUT=1800 +ENV TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" +ENV FORCE_CUDA=1 + +# ------------------------------------------------------------------ +# System packages + Python 3.11 +# ------------------------------------------------------------------ +RUN mv /etc/apt/sources.list /etc/apt/sources.list.bak \ + && cat > /etc/apt/sources.list << 'EOF' +deb http://mirrors.cloud.aliyuncs.com/ubuntu/ jammy main restricted universe multiverse +deb http://mirrors.cloud.aliyuncs.com/ubuntu/ jammy-updates main restricted universe multiverse +deb http://mirrors.cloud.aliyuncs.com/ubuntu/ jammy-backports main restricted universe multiverse +deb http://mirrors.cloud.aliyuncs.com/ubuntu/ jammy-security main restricted universe multiverse +EOF + +RUN apt-get update && 
apt-get install -y --no-install-recommends \ + python3.11 \ + python3.11-venv \ + python3.11-dev \ + git \ + wget \ + curl \ + ffmpeg \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + build-essential \ + ninja-build \ + && rm -rf /var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y python3-pip \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install uv + +# Create base virtual environment +RUN uv venv /opt/venv --python python3.11 +ENV VIRTUAL_ENV=/opt/venv +ENV PATH="/opt/venv/bin:${PATH}" + +RUN uv pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124 + +# TF 2.20.0 was compiled with cuDNN 9.3.0, but the environment has cuDNN 9.1.0 bundled with PyTorch. +# This version mismatch causes a "DNN library initialization failed" error. +RUN uv pip install "nvidia-cudnn-cu12>=9.3.0" + +RUN uv pip install "py-data-juicer @ git+https://github.com/datajuicer/data-juicer.git@vla" +RUN uv pip install -U "ray[default]" + +# torch-scatter (needs matching PyTorch/CUDA wheels) +RUN uv pip install torch-scatter -f https://data.pyg.org/whl/torch-2.6.0+cu124.html + +# MoGe-2 camera calibration +RUN uv pip install "moge @ git+https://github.com/microsoft/MoGe.git" + +# HaWoR hand reconstruction +RUN git clone --depth 1 https://github.com/ThunderVVV/HaWoR.git /root/.cache/data_juicer/assets/HaWoR + +# Patch requirements.txt: +# - remove chumpy (needs --no-build-isolation, install separately) +# - remove aitviewer / moderngl-window / pyrender (OpenGL, headless not needed) +# - relax torch-scatter version lock +# - relax numpy version lock +RUN cd /root/.cache/data_juicer/assets/HaWoR \ + && sed -i '/^chumpy/d' requirements.txt \ + && sed -i '/^aitviewer/d' requirements.txt \ + && sed -i '/^moderngl-window/d' requirements.txt \ + && sed -i '/^pyrender/d' requirements.txt \ + && sed -i '/pytorch3d/s/^/# /' requirements.txt \ + && sed -i 's/^mmcv==.*/# &/' requirements.txt \ + && sed -i 
's/^torch-scatter==.*/# &/' requirements.txt \ + && sed -i 's/^numpy==.*/numpy/' requirements.txt + +RUN pip install --no-build-isolation mmcv==1.3.9 +RUN uv pip install --no-build-isolation "pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git" +RUN uv pip install -r /root/.cache/data_juicer/assets/HaWoR/requirements.txt +RUN uv pip install pip && uv pip install --no-build-isolation "chumpy @ git+https://github.com/mattloper/chumpy" +RUN uv pip install pytorch-lightning==2.6.1 + +# DroidCalib +RUN git clone --recursive https://github.com/1van2ha0/DroidCalib.git /root/.cache/data_juicer/assets/DroidCalib +RUN cd /root/.cache/data_juicer/assets/DroidCalib && python setup.py install + +# compile MegaSaM into /opt/megasam-ext (conflict with DroidCalib), load at runtime via PYTHONPATH +# ------------------------------------------------------------------ +RUN git clone --recursive https://github.com/mega-sam/mega-sam.git /root/.cache/data_juicer/assets/mega-sam \ + && cd /root/.cache/data_juicer/assets/mega-sam \ + # patch .type() -> .scalar_type() for PyTorch 2.6 compatibility + && sed -i 's/\.type()/\.scalar_type()/g' \ + base/src/altcorr_kernel.cu \ + base/src/correlation_kernels.cu \ + base/src/droid_kernels.cu \ + base/thirdparty/lietorch/lietorch/src/lietorch_gpu.cu \ + base/thirdparty/lietorch/lietorch/src/lietorch_cpu.cpp \ + # add L20 (8.9) and H20 (9.0) CUDA arch support + && sed -i "/compute_86,code=sm_86/a\ '-gencode=arch=compute_89,code=sm_89',\n '-gencode=arch=compute_90,code=sm_90'," base/setup.py + +# Build droid_backends + lietorch into /opt/megasam-ext (isolated) +RUN cd /root/.cache/data_juicer/assets/mega-sam/base \ + && python setup.py build_ext --inplace \ + && python setup.py install --install-lib=/opt/megasam-ext --no-compile +# ------------------------------------------------------------------ + +# Pin shared dependencies to compatible versions +RUN uv pip install scipy pyarrow av Pillow openai +RUN uv pip install numpy==1.26.4 +RUN 
uv pip install opencv-python==4.10.0.84 opencv-contrib-python==4.10.0.84 +RUN uv pip install --force-reinstall pydantic pydantic-core typing-extensions +RUN uv pip install --upgrade --force-reinstall click +RUN uv pip install "pandas>=2.0,<2.2" + +RUN uv cache clean 2>/dev/null || true && rm -rf /tmp/* /root/.cache/pip + +WORKDIR /workspace + +ENTRYPOINT ["python"] diff --git a/demos/ego_hand_action_annotation/README.md b/demos/ego_hand_action_annotation/README.md new file mode 100644 index 0000000000..a36d75d954 --- /dev/null +++ b/demos/ego_hand_action_annotation/README.md @@ -0,0 +1,132 @@ +# VLA Visualization Demo + +This demo provides a complete pipeline for **egocentric video hand action recognition and LeRobot dataset export**. It extracts frames from ego-view videos, estimates camera intrinsics and poses, reconstructs 3D hands, computes hand actions, and exports the results as a [LeRobot v2.0](https://github.com/huggingface/lerobot) dataset. + +## Pipeline Overview + +``` +Video Input + │ + ▼ +VideoExtractFramesMapper # Extract video keyframes + │ + ▼ +VideoCameraCalibrationMogeMapper # MoGe-2 camera calibration + depth estimation + │ + ▼ +VideoHandReconstructionHaworMapper # HaWoR 3D hand reconstruction + │ + ▼ +VideoCameraPoseMegaSaMMapper # MegaSaM camera pose estimation (⚠️ requires separate conda env) + │ + ▼ +VideoHandActionComputeMapper # Compute 7-DoF actions + 8-dim states + │ + ▼ +VideoActionCaptioningMapper # action instruction captioning + │ + ▼ +ExportToLeRobotMapper # Export to LeRobot v2.0 dataset +``` + +## Output Format + +- **Action**: 7-dim `[dx, dy, dz, droll, dpitch, dyaw, gripper]` +- **State**: 8-dim `[x, y, z, roll, pitch, yaw, pad, gripper]` +- **Gripper**: 1.0 (open) to -1.0 (closed), estimated from finger joint angles + +## Prerequisites + +### 1. Base Environment + +Create an image based on the Dockerfile. + +The `VideoCameraPoseMegaSaMMapper` operator depends on MegaSaM (based on DROID-SLAM). 
Its CUDA compiled components (`droid_backends`, `lietorch`, `torch-scatter`) **conflict with the main environment** and must run in a separate conda environment. + +> **Note**: This environment is automatically activated at runtime via Ray's `runtime_env={"conda": "mega-sam"}` mechanism. You do not need to manually switch environments. All other operators run in the default environment. + + +### 2. Ray Cluster + +The pipeline runs on Ray. You need to start a Ray cluster. + +### 3. MANO Hand Model + +Download MANO v1.2 from the [MANO website](https://mano.is.tue.mpg.de/). Update the `mano_right_path` and `mano_left_path` in the config or script to point to your `MANO_RIGHT.pkl` and `MANO_LEFT.pkl` files. + + +## Running the Demo + +### Option 1: Python Script (Recommended) + +```bash +cd demos/ego_hand_action_annotation +python vla_pipeline.py +``` + +### Option 2: YAML Config + +```bash +python tools/process_data.py --config demos/ego_hand_action_annotation/configs/vla_pipeline.yaml +``` + +## Input Data Format + +Each sample is a JSON object containing a video path list: + +```json +{ + "videos": ["./data/1018.mp4"], + "text": "", + "__dj__meta__": {} +} +``` + +The demo includes two sample videos: `data/1018.mp4` and `data/1034.mp4`. + +## Output Structure + +``` +output/ +├── frames/ # Extracted video frames +├── lerobot_dataset/ # LeRobot v2.0 dataset +│ ├── data/ +│ │ └── chunk-000/ +│ │ ├── episode_000000.parquet +│ │ └── ... +│ ├── videos/ +│ │ └── chunk-000/ +│ │ ├── observation.images.main/ +│ │ │ ├── episode_000000.mp4 +│ │ │ └── ... 
+│ ├── meta/ +│ │ ├── info.json +│ │ ├── episodes.jsonl +│ │ ├── stats.json +│ │ └── tasks.jsonl +│ └── modality.json +└── *.parquet # Ray output results +``` + +## Visualization Tools + +Two visualization scripts are provided for inspecting processing results: + +### Action Annotation Verification (vis_hand_action_demo.py) + +Verify hand action annotations with hand trajectory, state, and action value overlays: + +```bash +python vis_hand_action_demo.py --data_path output/xxx.parquet +``` + +## Troubleshooting + +### MegaSaM compilation fails +Ensure the `mega-sam` conda environment has a CUDA toolkit matching your PyTorch version. Verify with `nvcc --version`. + +### MANO model loading fails +Check that `mano_right_path` and `mano_left_path` point to valid files. MANO models must be downloaded separately from the official website. + +### Ray GPU resource exhaustion +Multiple operators require GPU. By default each uses 0.1 GPU (10 operators can share 1 GPU). Adjust `num_gpus` or add more GPUs if needed. 
diff --git a/demos/ego_hand_action_annotation/configs/vla_pipeline.yaml b/demos/ego_hand_action_annotation/configs/vla_pipeline.yaml new file mode 100644 index 0000000000..873e47811a --- /dev/null +++ b/demos/ego_hand_action_annotation/configs/vla_pipeline.yaml @@ -0,0 +1,51 @@ +# VLA Hand Action Pipeline + +project_name: 'vla-hand-action-pipeline' +executor_type: 'ray' +dataset_path: './demos/ego_hand_action_annotation/data/demo-dataset.jsonl' +export_path: './demos/ego_hand_action_annotation/output/processed' +ray_address: 'auto' + +process: + - video_extract_frames_mapper: + frame_sampling_method: 'all_keyframes' + output_format: 'path' + frame_dir: './demos/ego_hand_action_annotation/output/frames' + frame_field: 'video_frames' + legacy_split_by_text_token: false + video_backend: 'ffmpeg' + batch_mode: true + - video_camera_calibration_moge_mapper: + model_path: 'moge-2-vitl-normal/model.pt' + tag_field_name: 'camera_calibration_moge_tags' + frame_field: 'video_frames' + output_depth: true + output_points: false + output_mask: false + batch_mode: true + - video_hand_reconstruction_hawor_mapper: + camera_calibration_field: 'camera_calibration_moge_tags' + tag_field_name: 'hand_reconstruction_hawor_tags' + mano_right_path: '/path/to/MANO_RIGHT.pkl' + mano_left_path: '/path/to/MANO_LEFT.pkl' + frame_field: 'video_frames' + batch_mode: true + # ⚠️ the operator requires a separate conda environment "mega-sam" + - video_camera_pose_megasam_mapper: + tag_field_name: 'video_camera_pose_tags' + camera_calibration_field: 'camera_calibration_moge_tags' + batch_mode: true + runtime_env: {'conda': 'mega-sam'} + - video_hand_action_compute_mapper: + hand_reconstruction_field: 'hand_reconstruction_hawor_tags' + camera_pose_field: 'video_camera_pose_tags' + tag_field_name: 'hand_action_tags' + hand_type: 'both' + batch_mode: true + - export_to_lerobot_mapper: + output_dir: './demos/ego_hand_action_annotation/output/lerobot_dataset' + hand_action_field: 'hand_action_tags' + 
frame_field: 'video_frames' + fps: 10 + robot_type: 'egodex_hand' + batch_mode: true diff --git a/demos/ego_hand_action_annotation/custom_ops/__init__.py b/demos/ego_hand_action_annotation/custom_ops/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/demos/ego_hand_action_annotation/custom_ops/video_action_captioning_mapper.py b/demos/ego_hand_action_annotation/custom_ops/video_action_captioning_mapper.py new file mode 100644 index 0000000000..c32da1ff22 --- /dev/null +++ b/demos/ego_hand_action_annotation/custom_ops/video_action_captioning_mapper.py @@ -0,0 +1,308 @@ +# yapf: disable +import json +import re +from typing import Dict, List, Optional + +from loguru import logger +from pydantic import PositiveInt + +from data_juicer.utils.constant import Fields, MetaKeys +from data_juicer.utils.lazy_loader import LazyLoader +from data_juicer.utils.mm_utils import image_path_to_base64, image_byte_to_base64 +from data_juicer.utils.model_utils import ( + get_model, + prepare_model, + update_sampling_params, +) + +from data_juicer.ops.base_op import OPERATORS, TAGGING_OPS, Mapper + +vllm = LazyLoader("vllm") + +OP_NAME = 'video_action_captioning_mapper' + +DEFAULT_SYSTEM_PROMPT = ( + 'You are a multimodal expert specializing in video captioning ' + 'for egocentric human-object interaction (HOI) clips.' +) + +DEFAULT_USER_PROMPT_TEMPLATE = """Below are video frames sampled from an egocentric video containing a single atomic hand-object interaction. Describe the specific {hand_type}-hand action shown in these frames. + +The {hand_type}-hand palm position is marked with a blue dot. Do not confuse it with the {opposite_hand_type} hand. Respect the temporal order of frames (Frame 1 is earliest, last frame is latest). Consider the hand status in each frame, whether there is an interacted object, and the temporal progression. + +Rules for describing the {hand_type}-hand action: +- Only describe {hand_type}-hand actions. 
Ignore the {opposite_hand_type} hand completely. +- Write in imperative form (e.g., "Insert the key," not "The hand is inserting..."). Do not use personal pronouns. +- Use specific, descriptive verbs. Prefer verbs like "pick" and "place" when applicable. Avoid vague terms like "clean", "spray", or "fix". +- Describe the interacted object only if: (1) the {hand_type} hand clearly interacts with it, or (2) the hand is purposefully moving toward it with clear intent. Otherwise, return "N/A" as the action. +- Do not hallucinate: if no clear hand action or object is present, return "N/A" as the action. +- Do not guess the action based on context. + +Return your answer in JSON format: +{{"think": "", "action": ""}} + +Here are the frames: +""" # noqa: E501 + + +@TAGGING_OPS.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class VideoActionCaptioningMapper(Mapper): + """Generates hand action captions from pre-extracted video frames + using a VLM model (via API or vLLM). + + This operator reads frames from a specified field (e.g., video_frames), + sends them along with a configurable prompt to a VLM, and stores the + structured JSON response (think + action) in a meta field. + The action description is also written to the text field. + + Supports annotating 'left', 'right', or 'both' hands. When hand_type + is 'both', the operator runs two separate VLM calls (one per hand) + and joins the action descriptions with '; ' in the text field. 
+ """ + + _accelerator = 'cuda' + + def __init__( + self, + api_or_hf_model: str = 'Qwen/Qwen2.5-VL-7B-Instruct', + is_api_model: bool = False, + *, + hand_type: str = 'right', + frame_field: str = MetaKeys.video_frames, + tag_field_name: str = 'hand_action_caption', + api_endpoint: Optional[str] = None, + response_path: Optional[str] = None, + system_prompt: Optional[str] = None, + user_prompt_template: Optional[str] = None, + model_params: Dict = {}, + sampling_params: Dict = {}, + try_num: PositiveInt = 3, + **kwargs, + ): + """ + Initialization method. + + :param api_or_hf_model: API model name or HuggingFace model name. + :param is_api_model: Whether the model is an API model. + If true, use OpenAI-compatible API; otherwise use vLLM. + :param hand_type: Which hand to describe: 'left', 'right', or + 'both'. When 'both', two separate calls are made and actions + are joined with '; ' in the text field. + :param frame_field: The field name where pre-extracted frames + are stored. Each element is a list of frame paths (one list + per video). + :param tag_field_name: The meta field name to store the generated + caption result (JSON with 'think' and 'action'). + :param api_endpoint: URL endpoint for the API. + :param response_path: Path to extract content from the API response. + Defaults to 'choices.0.message.content'. + :param system_prompt: System prompt for the VLM. If None, uses the + default egocentric HOI system prompt. + :param user_prompt_template: User prompt template string. Supports + {hand_type} and {opposite_hand_type} placeholders. + If None, uses the default template. + :param model_params: Parameters for initializing the model. + :param sampling_params: Extra parameters passed to the model. + e.g {'temperature': 0.9, 'top_p': 0.95} + :param try_num: The number of retry attempts when there is an API + call error or output parsing error. + :param kwargs: Extra keyword arguments. 
+ """ + super().__init__(**kwargs) + self.is_api_model = is_api_model + + if hand_type not in ('left', 'right', 'both'): + raise ValueError( + f"hand_type must be 'left', 'right', or 'both', " + f"got '{hand_type}'") + self.hand_type = hand_type + + self.frame_field = frame_field + self.tag_field_name = tag_field_name + self.try_num = try_num + + self.system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT + self.user_prompt_template = ( + user_prompt_template or DEFAULT_USER_PROMPT_TEMPLATE + ) + + sampling_params = update_sampling_params( + sampling_params, api_or_hf_model, not self.is_api_model) + + if self.is_api_model: + self.sampling_params = sampling_params + self.model_key = prepare_model( + model_type='api', + model=api_or_hf_model, + endpoint=api_endpoint, + response_path=response_path, + **model_params, + ) + else: + self.num_proc = 1 + self.model_key = prepare_model( + model_type='vllm', + pretrained_model_name_or_path=api_or_hf_model, + **model_params, + ) + self.sampling_params = vllm.SamplingParams(**sampling_params) + + def _build_messages(self, frames, hand_type, opposite_hand_type): + """Build the chat messages with frames embedded as images.""" + user_text = self.user_prompt_template.format( + hand_type=hand_type, + opposite_hand_type=opposite_hand_type, + ) + + # Build multimodal content: prompt text + Frame N: [image] ... 
+ user_content = [{'type': 'text', 'text': user_text}] + for i, frame in enumerate(frames): + image_data = image_byte_to_base64(frame) if isinstance(frame, bytes) else image_path_to_base64(frame) + user_content.append({ + 'type': 'text', + 'text': f'Frame {i + 1}:', + }) + user_content.append({ + 'type': 'image_url', + 'image_url': { + 'url': f'data:image/jpeg;base64,' + f'{image_data}', + }, + }) + user_content.append({ + 'type': 'text', + 'text': '\nAnalyze the frames above and return the JSON result.', + }) + + messages = [] + if self.system_prompt: + messages.append({ + 'role': 'system', + 'content': self.system_prompt, + }) + messages.append({ + 'role': 'user', + 'content': user_content, + }) + return messages + + def _call_model(self, messages, rank=None): + """Call the model and return raw text output.""" + if self.is_api_model: + output = '' + for attempt in range(self.try_num): + try: + client = get_model(self.model_key, rank=rank) + output = client(messages, **self.sampling_params) + break + except Exception as e: + logger.warning( + f'API call failed (attempt {attempt + 1}' + f'/{self.try_num}): {e}') + else: + model, _ = get_model(self.model_key, rank, self.use_cuda()) + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text + return output + + @staticmethod + def _parse_output(raw_output): + """Parse the JSON output from the model. + + Handles cases where the model wraps JSON in ```json...``` fences + and/or appends extra commentary after the JSON block. 
+ """ + text = raw_output.strip() + + # Try to extract JSON from markdown code fences first + fence_match = re.search( + r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL) + if fence_match: + text = fence_match.group(1) + else: + # Try to extract the first {...} block + brace_match = re.search(r'\{.*\}', text, re.DOTALL) + if brace_match: + text = brace_match.group(0) + + try: + result = json.loads(text, strict=False) + except json.JSONDecodeError: + try: + result = json.loads(text.replace("'", '"'), strict=False) + except Exception: + logger.warning( + f'Failed to parse model output as JSON: {raw_output}') + return {'think': '', 'action': ''} + + if not isinstance(result, dict): + return {'think': '', 'action': str(result)} + + return { + 'think': result.get('think', ''), + 'action': result.get('action', ''), + } + + def _caption_single_hand(self, frames, hand_type, rank=None): + """Run captioning for a single hand and return parsed result.""" + opposite = 'left' if hand_type == 'right' else 'right' + messages = self._build_messages(frames, hand_type, opposite) + output = self._call_model(messages, rank=rank) + return self._parse_output(output) + + def process_single(self, sample, rank=None, context=False): + # check if it's generated already + if self.tag_field_name in sample.get(Fields.meta, {}): + return sample + + if Fields.meta not in sample: + sample[Fields.meta] = {} + + # get frames from the frame_field + frame_data = sample.get(self.frame_field, []) + if not frame_data: + sample[Fields.meta][self.tag_field_name] = { + 'think': '', 'action': 'N/A'} + return sample + + # frame_data is a list of lists (one per video), flatten if needed + if isinstance(frame_data[0], list): + frames = frame_data[0] + else: + frames = frame_data + + if not frames: + sample[Fields.meta][self.tag_field_name] = { + 'think': '', 'action': 'N/A'} + return sample + + if self.hand_type == 'both': + right_result = self._caption_single_hand( + frames, 'right', rank=rank) + left_result 
= self._caption_single_hand( + frames, 'left', rank=rank) + + sample[Fields.meta][self.tag_field_name] = { + 'right': right_result, + 'left': left_result, + } + + # join non-N/A actions into text + actions = [] + for side, result in [('right', right_result), + ('left', left_result)]: + action = result.get('action', '') + if action and action != 'N/A': + actions.append(f'{side} hand: {action}') + if actions: + sample[self.text_key] = '; '.join(actions) + else: + result = self._caption_single_hand( + frames, self.hand_type, rank=rank) + sample[Fields.meta][self.tag_field_name] = result + action = result.get('action', '') + if action and action != 'N/A': + sample[self.text_key] = action + + return sample diff --git a/demos/ego_hand_action_annotation/custom_ops/video_action_captioning_mapper2.py b/demos/ego_hand_action_annotation/custom_ops/video_action_captioning_mapper2.py new file mode 100644 index 0000000000..0870246871 --- /dev/null +++ b/demos/ego_hand_action_annotation/custom_ops/video_action_captioning_mapper2.py @@ -0,0 +1,359 @@ +# yapf: disable +import json +import re +from typing import Dict, List, Optional + +from loguru import logger +from pydantic import PositiveInt + +from data_juicer.utils.constant import Fields, MetaKeys +from data_juicer.utils.lazy_loader import LazyLoader +from data_juicer.utils.mm_utils import image_path_to_base64, image_byte_to_base64 +from data_juicer.utils.model_utils import ( + get_model, + prepare_model, + update_sampling_params, +) + +from data_juicer.ops.base_op import OPERATORS, TAGGING_OPS, Mapper + +vllm = LazyLoader("vllm") + +OP_NAME = 'video_action_captioning_mapper' + +DEFAULT_SYSTEM_PROMPT = ( + 'You are a multimodal expert specializing in video captioning ' + 'for egocentric human-object interaction (HOI) clips.' +) + +DEFAULT_USER_PROMPT_TEMPLATE = """I will send you a set of video frames. Your goal is to describe the specific {hand_type}-hand action shown in the provided video frames below. 
These frames are sampled from an egocentric video and contain a single atomic hand-object interaction. A projected 2D hand trajectory is overlaid \u2013 this path represents the 3D palm center over time, with color gradually transitioning from blue to green to red to indicate temporal progression. The {hand_type}-hand palm position is marked with a blue dot. Do not confuse it with the {opposite_hand_type} hand. Respect the temporal order of frames. Each one is labeled by number (e.g., "Frame 1", "Frame 2", etc.), indicating its place in the time sequence. Please analyze the action step by step. Consider the hand status in each frame, whether there is an interacted object in each frame, and the temporal order of the frames. + +Generate a one-sentence description of the {hand_type}-hand action shown in the entire sequence. When describing the {hand_type}-hand action, please follow these rules: + +Only describe {hand_type}-hand actions. Ignore the {opposite_hand_type} hand completely. +Write in imperative form (e.g., "Insert the key," not "The hand is inserting..."). Do not use personal pronouns. +Use specific, descriptive verbs. If the action clearly involves picking up or placing an object, prefer verbs like "pick" and "place" to highlight the action intent. Avoid vague or generic terms like "clean", "spray", or "fix". +Describe the interacted object only if: +(1) the {hand_type} hand clearly interacts with it, +(2) or, if not, the hand is purposefully moving toward it with clear intent. +(3) If neither applies, return "N/A". +Be careful not to misidentify objects or their colors due to the trajectory overlay. +Do not hallucinate: if no clear or meaningful hand action, or object is present, return: "N/A". +Do not guess the action based on context. For example, do not assume someone is brushing something just because there's a sink. 
+Return your answer in JSON format with two fields: +(1) "think": a brief, step-by-step reasoning process (no longer than 3-4 sentences) explaining how the {hand_type}-hand action was determined from the hand motion trajectory and visual content. +(2) "action": the final one-sentence description of the {hand_type}-hand action, following all the rules above. +Please prepare to receive the frames. +""" # noqa: E501 + + +@TAGGING_OPS.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class VideoActionCaptioningMapper(Mapper): + """Generate per-segment hand action captions using a VLM. + + This operator iterates over atomic action segments produced by + ``VideoAtomicActionSegmentMapper`` and, for each segment, sends its + ``overlay_frames`` (trajectory-overlaid images from + ``VideoTrajectoryOverlayMapper``) to a VLM to obtain a structured + JSON caption (``{"think": "...", "action": "..."}``). + + Pipeline position: must run **after** both + ``VideoAtomicActionSegmentMapper`` (stage 7) and + ``VideoTrajectoryOverlayMapper`` (stage 8). + + Supports filtering by ``hand_type`` ('left', 'right', or 'both'). + The per-segment caption is stored inside each segment dict, and all + non-N/A actions are joined into the sample's ``text`` field. + """ + + _accelerator = 'cuda' + + def __init__( + self, + api_or_hf_model: str = 'Qwen/Qwen2.5-VL-7B-Instruct', + is_api_model: bool = False, + *, + hand_type: str = 'right', + segment_field: str = 'atomic_action_segments', + frame_field: str = MetaKeys.video_frames, + tag_field_name: str = 'hand_action_caption', + api_endpoint: Optional[str] = None, + response_path: Optional[str] = None, + system_prompt: Optional[str] = None, + user_prompt_template: Optional[str] = None, + model_params: Dict = {}, + sampling_params: Dict = {}, + try_num: PositiveInt = 3, + **kwargs, + ): + """ + Initialization method. + + :param api_or_hf_model: API model name or HuggingFace model name. 
+ :param is_api_model: Whether the model is an API model. + If true, use OpenAI-compatible API; otherwise use vLLM. + :param hand_type: Which hand to caption: 'left', 'right', or + 'both'. Only segments matching the specified hand_type(s) + are sent to the VLM. + :param segment_field: Meta field storing atomic action segments + (output of VideoAtomicActionSegmentMapper). Each segment + must contain an ``overlay_frames`` list (output of + VideoTrajectoryOverlayMapper). + :param frame_field: Fallback field for raw frame paths. Only + used when a segment has no ``overlay_frames``. + :param tag_field_name: The meta field name to store the generated + caption result (JSON with 'think' and 'action'). + :param api_endpoint: URL endpoint for the API. + :param response_path: Path to extract content from the API response. + Defaults to 'choices.0.message.content'. + :param system_prompt: System prompt for the VLM. If None, uses the + default egocentric HOI system prompt. + :param user_prompt_template: User prompt template string. Supports + {hand_type} and {opposite_hand_type} placeholders. + If None, uses the default template. + :param model_params: Parameters for initializing the model. + :param sampling_params: Extra parameters passed to the model. + e.g {'temperature': 0.9, 'top_p': 0.95} + :param try_num: The number of retry attempts when there is an API + call error or output parsing error. + :param kwargs: Extra keyword arguments. 
+ """ + super().__init__(**kwargs) + self.is_api_model = is_api_model + + if hand_type not in ('left', 'right', 'both'): + raise ValueError( + f"hand_type must be 'left', 'right', or 'both', " + f"got '{hand_type}'") + self.hand_type = hand_type + + self.segment_field = segment_field + self.frame_field = frame_field + self.tag_field_name = tag_field_name + self.try_num = try_num + + self.system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT + self.user_prompt_template = ( + user_prompt_template or DEFAULT_USER_PROMPT_TEMPLATE + ) + + sampling_params = update_sampling_params( + sampling_params, api_or_hf_model, not self.is_api_model) + + if self.is_api_model: + self.sampling_params = sampling_params + self.model_key = prepare_model( + model_type='api', + model=api_or_hf_model, + endpoint=api_endpoint, + response_path=response_path, + **model_params, + ) + else: + self.num_proc = 1 + self.model_key = prepare_model( + model_type='vllm', + pretrained_model_name_or_path=api_or_hf_model, + **model_params, + ) + self.sampling_params = vllm.SamplingParams(**sampling_params) + + def _build_messages(self, frames, hand_type, opposite_hand_type): + """Build the chat messages with frames embedded as images. + + Matches the paper prompt format: + + Frame 1: [image] Frame 2: [image] ... + Please now analyze and generate the results. + """ + user_text = self.user_prompt_template.format( + hand_type=hand_type, + opposite_hand_type=opposite_hand_type, + ) + + # Build multimodal content: prompt text + Frame N: [image] ... 
+ user_content = [{'type': 'text', 'text': user_text}] + for i, frame in enumerate(frames): + image_data = image_byte_to_base64(frame) if isinstance(frame, bytes) else image_path_to_base64(frame) + user_content.append({ + 'type': 'text', + 'text': f'Frame {i + 1}:', + }) + user_content.append({ + 'type': 'image_url', + 'image_url': { + 'url': f'data:image/jpeg;base64,' + f'{image_data}', + }, + }) + user_content.append({ + 'type': 'text', + 'text': '\nPlease now analyze and generate the results.', + }) + + messages = [] + if self.system_prompt: + messages.append({ + 'role': 'system', + 'content': self.system_prompt, + }) + messages.append({ + 'role': 'user', + 'content': user_content, + }) + return messages + + def _call_model(self, messages, rank=None): + """Call the model and return raw text output.""" + if self.is_api_model: + output = '' + for attempt in range(self.try_num): + try: + client = get_model(self.model_key, rank=rank) + output = client(messages, **self.sampling_params) + break + except Exception as e: + logger.warning( + f'API call failed (attempt {attempt + 1}' + f'/{self.try_num}): {e}') + else: + model, _ = get_model(self.model_key, rank, self.use_cuda()) + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text + return output + + @staticmethod + def _parse_output(raw_output): + """Parse the JSON output from the model. + + Handles cases where the model wraps JSON in ```json...``` fences + and/or appends extra commentary after the JSON block. 
+ """ + text = raw_output.strip() + + # Try to extract JSON from markdown code fences first + fence_match = re.search( + r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL) + if fence_match: + text = fence_match.group(1) + else: + # Try to extract the first {...} block + brace_match = re.search(r'\{.*\}', text, re.DOTALL) + if brace_match: + text = brace_match.group(0) + + try: + result = json.loads(text, strict=False) + except json.JSONDecodeError: + try: + result = json.loads(text.replace("'", '"'), strict=False) + except Exception: + logger.warning( + f'Failed to parse model output as JSON: {raw_output}') + return {'think': '', 'action': ''} + + if not isinstance(result, dict): + return {'think': '', 'action': str(result)} + + return { + 'think': result.get('think', ''), + 'action': result.get('action', ''), + } + + def _caption_single_hand(self, frames, hand_type, rank=None): + """Run captioning for a single hand and return parsed result.""" + opposite = 'left' if hand_type == 'right' else 'right' + messages = self._build_messages(frames, hand_type, opposite) + output = self._call_model(messages, rank=rank) + return self._parse_output(output) + + def process_single(self, sample, rank=None, context=False): + if Fields.meta not in sample: + sample[Fields.meta] = {} + + meta = sample[Fields.meta] + segments = meta.get(self.segment_field) + if not segments: + return sample + + # Determine which hand types to caption + target_hands = ( + {'left', 'right'} + if self.hand_type == 'both' + else {self.hand_type} + ) + + all_actions = [] # collect non-N/A actions for the text field + + for seg in segments: + hand_type = seg.get('hand_type', 'right') + if hand_type not in target_hands: + continue + + # Skip if already captioned + if 'caption' in seg: + action = seg['caption'].get('action', '') + if action and action != 'N/A': + all_actions.append( + f"{hand_type} hand seg{seg.get('segment_id', '?')}: " + f"{action}") + continue + + # Use overlay_frames 
(trajectory-overlaid images) if available, + # otherwise fall back to raw frames via sampled_frame_indices. + frames = seg.get('overlay_frames', []) + if not frames: + frames = self._fallback_frames(sample, seg) + + if not frames: + seg['caption'] = {'think': '', 'action': 'N/A'} + continue + + result = self._caption_single_hand(frames, hand_type, rank=rank) + seg['caption'] = result + + action = result.get('action', '') + if action and action != 'N/A': + all_actions.append( + f"{hand_type} hand seg{seg.get('segment_id', '?')}: " + f"{action}") + + # Store the updated segments back + meta[self.segment_field] = segments + + # Join all non-N/A actions into the text field + if all_actions: + sample[self.text_key] = '; '.join(all_actions) + + return sample + + def _fallback_frames(self, sample, seg): + """Extract raw frames for a segment when overlay_frames is empty.""" + frame_data = sample.get(self.frame_field, []) + if not frame_data: + return [] + + # Unwrap nested list from reassembly: [[frames]] → [frames] + if (isinstance(frame_data, list) and frame_data + and isinstance(frame_data[0], list)): + all_frames = frame_data[0] + else: + all_frames = frame_data + + # Use sampled_frame_indices if available, else evenly sample + indices = seg.get('sampled_frame_indices', []) + if not indices: + start = seg.get('start_frame', 0) + end = seg.get('end_frame', len(all_frames) - 1) + n = min(8, end - start + 1) + if n <= 0: + return [] + import numpy as np + indices = np.linspace(start, end, n, dtype=int).tolist() + + frames = [] + for idx in indices: + if 0 <= idx < len(all_frames) and all_frames[idx]: + frames.append(all_frames[idx]) + return frames diff --git a/demos/ego_hand_action_annotation/data/1018.mp4 b/demos/ego_hand_action_annotation/data/1018.mp4 new file mode 100644 index 0000000000..a516aec179 Binary files /dev/null and b/demos/ego_hand_action_annotation/data/1018.mp4 differ diff --git a/demos/ego_hand_action_annotation/data/1034.mp4 
b/demos/ego_hand_action_annotation/data/1034.mp4 new file mode 100644 index 0000000000..a76890b892 Binary files /dev/null and b/demos/ego_hand_action_annotation/data/1034.mp4 differ diff --git a/demos/ego_hand_action_annotation/data/demo-dataset.jsonl b/demos/ego_hand_action_annotation/data/demo-dataset.jsonl new file mode 100644 index 0000000000..69fb992f55 --- /dev/null +++ b/demos/ego_hand_action_annotation/data/demo-dataset.jsonl @@ -0,0 +1,2 @@ +{"videos": ["./demos/ego_hand_action_annotation/data/1018.mp4"], "text": "", "__dj__meta__": {}} +{"videos": ["./demos/ego_hand_action_annotation/data/1034.mp4"], "text": "", "__dj__meta__": {}} \ No newline at end of file diff --git a/demos/ego_hand_action_annotation/hawor_utils/__init__.py b/demos/ego_hand_action_annotation/hawor_utils/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/demos/ego_hand_action_annotation/hawor_utils/common_utils.py b/demos/ego_hand_action_annotation/hawor_utils/common_utils.py new file mode 100644 index 0000000000..22ab982bfe --- /dev/null +++ b/demos/ego_hand_action_annotation/hawor_utils/common_utils.py @@ -0,0 +1,22 @@ +import os +import sys +import subprocess + +from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE + + +def prepare_hawor_repo(): + hawor_home = os.path.join(DATA_JUICER_ASSETS_CACHE, "HaWoR") + if not os.path.exists(hawor_home): + subprocess.run(["git", "clone", "https://github.com/ThunderVVV/HaWoR.git", hawor_home], check=True) + + return hawor_home + + +def prepare_hawor_and_add_to_path(): + hawor_home = prepare_hawor_repo() + + if hawor_home not in sys.path: + sys.path.insert(0, hawor_home) + + return hawor_home diff --git a/demos/ego_hand_action_annotation/hawor_utils/patches/__init__.py b/demos/ego_hand_action_annotation/hawor_utils/patches/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/demos/ego_hand_action_annotation/hawor_utils/patches/process.py 
b/demos/ego_hand_action_annotation/hawor_utils/patches/process.py new file mode 100644 index 0000000000..a70680de8f --- /dev/null +++ b/demos/ego_hand_action_annotation/hawor_utils/patches/process.py @@ -0,0 +1,214 @@ +""" +Modified from https://github.com/ThunderVVV/HaWoR/blob/main/hawor/utils/process.py. +Update fixed `DATA_DIR` and `MODEL_PATH` in MANO_cfg. +""" +from ..common_utils import prepare_hawor_repo, prepare_hawor_and_add_to_path +prepare_hawor_and_add_to_path() + +import torch +from lib.models.mano_wrapper import MANO +from hawor.utils.geometry import aa_to_rotmat +import numpy as np +import sys +import os + +def block_print(): + sys.stdout = open(os.devnull, 'w') + +def enable_print(): + sys.stdout = sys.__stdout__ + +def get_mano_faces(): + block_print() + + hawor_home = prepare_hawor_repo() + + MANO_cfg = { + 'DATA_DIR': os.path.join(hawor_home, '_DATA/data/'), + 'MODEL_PATH': os.path.join(hawor_home, '_DATA/data/mano'), + 'GENDER': 'neutral', + 'NUM_HAND_JOINTS': 15, + 'CREATE_BODY_POSE': False + } + mano_cfg = {k.lower(): v for k,v in MANO_cfg.items()} + mano = MANO(**mano_cfg) + enable_print() + return mano.faces + + +def run_mano(trans, root_orient, hand_pose, is_right=None, betas=None, use_cuda=True): + """ + Forward pass of the SMPL model and populates pred_data accordingly with + joints3d, verts3d, points3d. 
+ + trans : B x T x 3 + root_orient : B x T x 3 + body_pose : B x T x J*3 + betas : (optional) B x D + """ + block_print() + + hawor_home = prepare_hawor_repo() + + MANO_cfg = { + 'DATA_DIR': os.path.join(hawor_home, '_DATA/data/'), + 'MODEL_PATH': os.path.join(hawor_home, '_DATA/data/mano'), + 'GENDER': 'neutral', + 'NUM_HAND_JOINTS': 15, + 'CREATE_BODY_POSE': False + } + mano_cfg = {k.lower(): v for k,v in MANO_cfg.items()} + mano = MANO(**mano_cfg) + if use_cuda: + mano = mano.cuda() + + B, T, _ = root_orient.shape + NUM_JOINTS = 15 + mano_params = { + 'global_orient': root_orient.reshape(B*T, -1), + 'hand_pose': hand_pose.reshape(B*T*NUM_JOINTS, 3), + 'betas': betas.reshape(B*T, -1), + } + rotmat_mano_params = mano_params + rotmat_mano_params['global_orient'] = aa_to_rotmat(mano_params['global_orient']).view(B*T, 1, 3, 3) + rotmat_mano_params['hand_pose'] = aa_to_rotmat(mano_params['hand_pose']).view(B*T, NUM_JOINTS, 3, 3) + rotmat_mano_params['transl'] = trans.reshape(B*T, 3) + + if use_cuda: + mano_output = mano(**{k: v.float().cuda() for k,v in rotmat_mano_params.items()}, pose2rot=False) + else: + mano_output = mano(**{k: v.float() for k,v in rotmat_mano_params.items()}, pose2rot=False) + + faces_right = mano.faces + faces_new = np.array([[92, 38, 234], + [234, 38, 239], + [38, 122, 239], + [239, 122, 279], + [122, 118, 279], + [279, 118, 215], + [118, 117, 215], + [215, 117, 214], + [117, 119, 214], + [214, 119, 121], + [119, 120, 121], + [121, 120, 78], + [120, 108, 78], + [78, 108, 79]]) + faces_right = np.concatenate([faces_right, faces_new], axis=0) + faces_n = len(faces_right) + faces_left = faces_right[:,[0,2,1]] + + outputs = { + "joints": mano_output.joints.reshape(B, T, -1, 3), + "vertices": mano_output.vertices.reshape(B, T, -1, 3), + } + + if not is_right is None: + # outputs["vertices"][..., 0] = (2*is_right-1)*outputs["vertices"][..., 0] + # outputs["joints"][..., 0] = (2*is_right-1)*outputs["joints"][..., 0] + is_right = (is_right[:, :, 
0].cpu().numpy() > 0) + faces_result = np.zeros((B, T, faces_n, 3)) + faces_right_expanded = np.expand_dims(np.expand_dims(faces_right, axis=0), axis=0) + faces_left_expanded = np.expand_dims(np.expand_dims(faces_left, axis=0), axis=0) + faces_result = np.where(is_right[..., np.newaxis, np.newaxis], faces_right_expanded, faces_left_expanded) + outputs["faces"] = torch.from_numpy(faces_result.astype(np.int32)) + + + enable_print() + return outputs + +def run_mano_left(trans, root_orient, hand_pose, is_right=None, betas=None, use_cuda=True, fix_shapedirs=True): + """ + Forward pass of the SMPL model and populates pred_data accordingly with + joints3d, verts3d, points3d. + + trans : B x T x 3 + root_orient : B x T x 3 + body_pose : B x T x J*3 + betas : (optional) B x D + """ + block_print() + + hawor_home = prepare_hawor_repo() + + MANO_cfg = { + 'DATA_DIR': os.path.join(hawor_home, '_DATA/data_left/'), + 'MODEL_PATH': os.path.join(hawor_home, '_DATA/data_left/mano_left'), + 'GENDER': 'neutral', + 'NUM_HAND_JOINTS': 15, + 'CREATE_BODY_POSE': False, + 'is_rhand': False + } + mano_cfg = {k.lower(): v for k,v in MANO_cfg.items()} + mano = MANO(**mano_cfg) + if use_cuda: + mano = mano.cuda() + + # fix MANO shapedirs of the left hand bug (https://github.com/vchoutas/smplx/issues/48) + if fix_shapedirs: + mano.shapedirs[:, 0, :] *= -1 + + B, T, _ = root_orient.shape + NUM_JOINTS = 15 + mano_params = { + 'global_orient': root_orient.reshape(B*T, -1), + 'hand_pose': hand_pose.reshape(B*T*NUM_JOINTS, 3), + 'betas': betas.reshape(B*T, -1), + } + rotmat_mano_params = mano_params + rotmat_mano_params['global_orient'] = aa_to_rotmat(mano_params['global_orient']).view(B*T, 1, 3, 3) + rotmat_mano_params['hand_pose'] = aa_to_rotmat(mano_params['hand_pose']).view(B*T, NUM_JOINTS, 3, 3) + rotmat_mano_params['transl'] = trans.reshape(B*T, 3) + + if use_cuda: + mano_output = mano(**{k: v.float().cuda() for k,v in rotmat_mano_params.items()}, pose2rot=False) + else: + mano_output = 
mano(**{k: v.float() for k,v in rotmat_mano_params.items()}, pose2rot=False) + + faces_right = mano.faces + faces_new = np.array([[92, 38, 234], + [234, 38, 239], + [38, 122, 239], + [239, 122, 279], + [122, 118, 279], + [279, 118, 215], + [118, 117, 215], + [215, 117, 214], + [117, 119, 214], + [214, 119, 121], + [119, 120, 121], + [121, 120, 78], + [120, 108, 78], + [78, 108, 79]]) + faces_right = np.concatenate([faces_right, faces_new], axis=0) + faces_n = len(faces_right) + faces_left = faces_right[:,[0,2,1]] + + outputs = { + "joints": mano_output.joints.reshape(B, T, -1, 3), + "vertices": mano_output.vertices.reshape(B, T, -1, 3), + } + + if not is_right is None: + # outputs["vertices"][..., 0] = (2*is_right-1)*outputs["vertices"][..., 0] + # outputs["joints"][..., 0] = (2*is_right-1)*outputs["joints"][..., 0] + is_right = (is_right[:, :, 0].cpu().numpy() > 0) + faces_result = np.zeros((B, T, faces_n, 3)) + faces_right_expanded = np.expand_dims(np.expand_dims(faces_right, axis=0), axis=0) + faces_left_expanded = np.expand_dims(np.expand_dims(faces_left, axis=0), axis=0) + faces_result = np.where(is_right[..., np.newaxis, np.newaxis], faces_right_expanded, faces_left_expanded) + outputs["faces"] = torch.from_numpy(faces_result.astype(np.int32)) + + + enable_print() + return outputs + +def run_mano_twohands(init_trans, init_rot, init_hand_pose, is_right, init_betas, use_cuda=True, fix_shapedirs=True): + outputs_left = run_mano_left(init_trans[0:1], init_rot[0:1], init_hand_pose[0:1], None, init_betas[0:1], use_cuda=use_cuda, fix_shapedirs=fix_shapedirs) + outputs_right = run_mano(init_trans[1:2], init_rot[1:2], init_hand_pose[1:2], None, init_betas[1:2], use_cuda=use_cuda) + outputs_two = { + "vertices": torch.cat((outputs_left["vertices"], outputs_right["vertices"]), dim=0), + "joints": torch.cat((outputs_left["joints"], outputs_right["joints"]), dim=0) + + } + return outputs_two \ No newline at end of file diff --git 
a/demos/ego_hand_action_annotation/hawor_utils/patches/run_vis2.py b/demos/ego_hand_action_annotation/hawor_utils/patches/run_vis2.py new file mode 100644 index 0000000000..4cfcb5c9f4 --- /dev/null +++ b/demos/ego_hand_action_annotation/hawor_utils/patches/run_vis2.py @@ -0,0 +1,90 @@ +""" +Modified from https://github.com/ThunderVVV/HaWoR/blob/main/lib/vis/run_vis2.py. +""" +from ..common_utils import prepare_hawor_and_add_to_path +prepare_hawor_and_add_to_path() + +import os +import cv2 +import numpy as np + +import lib.vis.viewer as viewer_utils + + +def run_vis2_on_video_cam(res_dict, res_dict2, output_pth, focal_length, image_names, R_w2c=None, t_w2c=None, interactive=True): + + img0 = cv2.imread(image_names[0]) + height, width, _ = img0.shape + + world_mano = {} + world_mano['vertices'] = res_dict['vertices'] + world_mano['faces'] = res_dict['faces'] + + world_mano2 = {} + world_mano2['vertices'] = res_dict2['vertices'] + world_mano2['faces'] = res_dict2['faces'] + + vis_dict = {} + color_idx = 0 + world_mano['vertices'] = world_mano['vertices'] + for _id, _verts in enumerate(world_mano['vertices']): + verts = _verts.cpu().numpy() # T, N, 3 + body_faces = world_mano['faces'] + body_meshes = { + "v3d": verts, + "f3d": body_faces, + "vc": None, + "name": f"hand_{_id}", + # "color": "pace-green", + "color": "director-purple", + } + vis_dict[f"hand_{_id}"] = body_meshes + color_idx += 1 + + world_mano2['vertices'] = world_mano2['vertices'] + for _id, _verts in enumerate(world_mano2['vertices']): + verts = _verts.cpu().numpy() # T, N, 3 + body_faces = world_mano2['faces'] + body_meshes = { + "v3d": verts, + "f3d": body_faces, + "vc": None, + "name": f"hand2_{_id}", + # "color": "pace-blue", + "color": "director-blue", + } + vis_dict[f"hand2_{_id}"] = body_meshes + color_idx += 1 + + meshes = viewer_utils.construct_viewer_meshes( + vis_dict, draw_edges=False, flat_shading=False + ) + + num_frames = len(world_mano['vertices'][_id]) + Rt = np.zeros((num_frames, 3, 
4)) + Rt[:, :3, :3] = R_w2c[:num_frames] + Rt[:, :3, 3] = t_w2c[:num_frames] + + cols, rows = (width, height) + K = np.array( + [ + [focal_length, 0, width / 2], + [0, focal_length, height / 2], + [0, 0, 1] + ] + ) + vis_h = height + vis_w = width + + data = viewer_utils.ViewerData(Rt, K, cols, rows, imgnames=image_names) + batch = (meshes, data) + + if interactive: + viewer = viewer_utils.ARCTICViewer(interactive=True, size=(vis_w, vis_h)) + viewer.render_seq(batch, out_folder=os.path.join(output_pth, 'aitviewer')) + else: + viewer = viewer_utils.ARCTICViewer(interactive=False, size=(vis_w, vis_h), render_types=['video']) + if os.path.exists(os.path.join(output_pth, 'aitviewer', "video_0.mp4")): + os.remove(os.path.join(output_pth, 'aitviewer', "video_0.mp4")) + viewer.render_seq(batch, out_folder=os.path.join(output_pth, 'aitviewer')) + return os.path.join(output_pth, 'aitviewer', "video_0.mp4") diff --git a/demos/ego_hand_action_annotation/vis_hand_action_demo.py b/demos/ego_hand_action_annotation/vis_hand_action_demo.py new file mode 100644 index 0000000000..b9939d55c4 --- /dev/null +++ b/demos/ego_hand_action_annotation/vis_hand_action_demo.py @@ -0,0 +1,578 @@ +#!/usr/bin/env python +"""Demo: Verify action annotations with 3D hand mesh + trajectory overlay. 
+ +Combines: + - 3D MANO hand mesh wireframe rendering + - Action trajectory verification + +Usage: + python vis_hand_action_demo.py \ + --data_path data.pkl \ + --save_dir ./vis_action_verify +""" + +import argparse +import fractions +import os +import pickle +import json +import sys + +import cv2 +import numpy as np +import pyarrow.parquet as pq + +import torch + +from hawor_utils.common_utils import prepare_hawor_and_add_to_path +prepare_hawor_and_add_to_path() + +from hawor_utils.patches.process import get_mano_faces, run_mano, run_mano_left + +from data_juicer.utils.constant import Fields, MetaKeys + + +def load_image(image_input): + if isinstance(image_input, (str, bytes)): + if isinstance(image_input, str): + if not os.path.exists(image_input): + raise ValueError(f"Error: File not found at {image_input}") + img = cv2.imread(image_input) + else: + nparr = np.frombuffer(image_input, np.uint8) + img = cv2.imdecode(nparr, cv2.IMREAD_COLOR) + + if img is None: + raise ValueError("Failed to decode image.") + + return img + + else: + raise TypeError("Input must be a file path (str) or image bytes (bytes).") + + +# --------------------------------------------------------------- +# MANO hand mesh (using HaWoR's run_mano / run_mano_left) +# --------------------------------------------------------------- + +def compute_hand_mesh(hand_transl, hand_orient, hand_pose, hand_betas, + is_left=False): + """Compute MANO hand mesh vertices, joints and faces. + + Uses HaWoR's run_mano (right) / run_mano_left (left) to ensure + correct coordinate conventions. 
def compute_hand_mesh(hand_transl, hand_orient, hand_pose, hand_betas,
                      is_left=False):
    """Compute MANO hand mesh vertices, joints and faces.

    Delegates to HaWoR's run_mano (right) / run_mano_left (left) so the
    coordinate conventions match the rest of the pipeline.

    Args:
        hand_transl: list of (3,) translations.
        hand_orient: list of (3,) axis-angle global orientations.
        hand_pose: list of (45,) axis-angle hand poses.
        hand_betas: list of (10,) shape parameters.
        is_left: whether this is the left hand.

    Returns:
        vertices: (T, V, 3) numpy array in camera space.
        joints: (T, J, 3) numpy array in camera space.
        faces: (F, 3) numpy array of face indices.
    """
    def _batched(seq):
        # Add the leading batch dimension that run_mano* expects.
        return torch.tensor(seq, dtype=torch.float32).unsqueeze(0)

    mano_fn = run_mano_left if is_left else run_mano
    mano_out = mano_fn(_batched(hand_transl), _batched(hand_orient),
                       _batched(hand_pose), betas=_batched(hand_betas))

    vertices = mano_out['vertices'][0].cpu().numpy()  # (T, V, 3)
    joints = mano_out['joints'][0].cpu().numpy()      # (T, J, 3)

    # Extra triangles appended to the base MANO topology
    # (presumably closing the wrist seam; indices taken from HaWoR).
    extra_faces = np.array([
        [92, 38, 234], [234, 38, 239], [38, 122, 239], [239, 122, 279],
        [122, 118, 279], [279, 118, 215], [118, 117, 215], [215, 117, 214],
        [117, 119, 214], [214, 119, 121], [119, 120, 121], [121, 120, 78],
        [120, 108, 78], [78, 108, 79],
    ])
    faces = np.concatenate([get_mano_faces(), extra_faces], axis=0)
    if is_left:
        # Flip winding order for the mirrored (left) hand.
        faces = faces[:, [0, 2, 1]]

    return vertices, joints, faces


def project_points_to_2d(points_3d, fov_x, width, height):
    """Pinhole-project a batch of camera-space 3D points to 2D pixels.

    A single focal length derived from the horizontal FOV is used for
    both axes (square pixels assumed), matching the rest of this demo.

    Args:
        points_3d: (..., 3) array of camera-space points.
        fov_x: horizontal field of view in radians.
        width, height: image dimensions in pixels.

    Returns:
        (..., 2) array of pixel coordinates.
    """
    focal = width / (2.0 * np.tan(fov_x / 2.0))
    depth = points_3d[..., 2]
    # Clamp near-zero depths to avoid division blow-ups.
    depth = np.where(np.abs(depth) < 1e-6, 1e-6, depth)
    u = focal * points_3d[..., 0] / depth + width / 2.0
    v = focal * points_3d[..., 1] / depth + height / 2.0
    return np.stack([u, v], axis=-1)
def _face_far_outside(pts, w, h):
    """Return True when a projected triangle lies far outside the frame bounds."""
    if np.any(pts[:, 0] < -w) or np.any(pts[:, 0] > 2 * w):
        return True
    return bool(np.any(pts[:, 1] < -h) or np.any(pts[:, 1] > 2 * h))


def draw_mesh_filled(frame, verts_2d, faces, color, alpha=0.3):
    """Draw a filled semi-transparent mesh on frame (modified in-place)."""
    overlay = frame.copy()
    h, w = frame.shape[:2]
    for face in faces:
        pts = verts_2d[face].astype(np.int32)
        if _face_far_outside(pts, w, h):
            continue
        cv2.fillPoly(overlay, [pts], color=color, lineType=cv2.LINE_AA)
    # Blend once so alpha is applied uniformly over all faces.
    cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)


def draw_mesh_wireframe(frame, verts_2d, faces, color, alpha=0.4, thickness=1):
    """Draw a mesh wireframe on frame (modified in-place)."""
    overlay = frame.copy()
    h, w = frame.shape[:2]
    for face in faces:
        pts = verts_2d[face].astype(np.int32)
        if _face_far_outside(pts, w, h):
            continue
        cv2.polylines(overlay, [pts], isClosed=True, color=color,
                      thickness=thickness, lineType=cv2.LINE_AA)
    cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)


# ---------------------------------------------------------------
# MANO joint skeleton visualization
# ---------------------------------------------------------------

# MANO 21-joint skeleton: wrist(0), index(1-4), middle(5-8),
# ring(9-12), pinky(13-16), thumb(17-20)
MANO_SKELETON_BONES = [
    # Index finger
    (0, 1), (1, 2), (2, 3), (3, 4),
    # Middle finger
    (0, 5), (5, 6), (6, 7), (7, 8),
    # Ring finger
    (0, 9), (9, 10), (10, 11), (11, 12),
    # Pinky finger
    (0, 13), (13, 14), (14, 15), (15, 16),
    # Thumb
    (0, 17), (17, 18), (18, 19), (19, 20),
]

# Per-finger colors (BGR) for distinct visualization
FINGER_COLORS = {
    "thumb": (0, 255, 255),   # yellow
    "index": (0, 0, 255),     # red
    "middle": (0, 255, 0),    # green
    "ring": (255, 165, 0),    # orange-ish
    "pinky": (255, 0, 255),   # magenta
}

# Maps a bone's index in MANO_SKELETON_BONES to its finger name.
BONE_FINGER_MAP = {
    0: "index", 1: "index", 2: "index", 3: "index",
    4: "middle", 5: "middle", 6: "middle", 7: "middle",
    8: "ring", 9: "ring", 10: "ring", 11: "ring",
    12: "pinky", 13: "pinky", 14: "pinky", 15: "pinky",
    16: "thumb", 17: "thumb", 18: "thumb", 19: "thumb",
}
def _point_off_canvas(pt, w, h):
    """True when a 2D point lies far outside the frame (beyond a 1x margin)."""
    return pt[0] < -w or pt[0] > 2 * w or pt[1] < -h or pt[1] > 2 * h


def draw_joints(frame, joints_2d, bone_color_override=None,
                joint_radius=4, bone_thickness=2, alpha=0.8):
    """Draw MANO hand joints and skeleton bones on frame.

    Args:
        frame: BGR image (modified in-place).
        joints_2d: (J, 2) array of 2D joint positions.
        bone_color_override: if set, use this single color for all bones
            instead of per-finger colors.
        joint_radius: radius of joint circles.
        bone_thickness: line thickness for bones.
        alpha: blending alpha for the overlay.
    """
    overlay = frame.copy()
    h, w = frame.shape[:2]

    # Bones first so joint circles are drawn on top.
    for bone_idx, (j_a, j_b) in enumerate(MANO_SKELETON_BONES):
        start = joints_2d[j_a].astype(np.int32)
        end = joints_2d[j_b].astype(np.int32)
        # Skip bones whose either endpoint is far out of frame.
        if _point_off_canvas(start, w, h) or _point_off_canvas(end, w, h):
            continue
        if bone_color_override is not None:
            color = bone_color_override
        else:
            color = FINGER_COLORS[BONE_FINGER_MAP[bone_idx]]
        cv2.line(overlay, tuple(start), tuple(end), color,
                 bone_thickness, cv2.LINE_AA)

    # Joint circles: white fill with a thin black outline.
    for j in range(joints_2d.shape[0]):
        pt = joints_2d[j].astype(np.int32)
        if _point_off_canvas(pt, w, h):
            continue
        # The wrist (joint 0) gets a slightly larger marker.
        radius = joint_radius + 2 if j == 0 else joint_radius
        cv2.circle(overlay, tuple(pt), radius, (255, 255, 255), -1, cv2.LINE_AA)
        cv2.circle(overlay, tuple(pt), radius, (0, 0, 0), 1, cv2.LINE_AA)

    cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)


# ---------------------------------------------------------------
# Trajectory reconstruction from actions
# ---------------------------------------------------------------


def world_to_camera(pos_world, cam_c2w):
    """Map a world-space point into camera space given a camera-to-world 4x4.

    Inverse of the rigid transform [R|t]: x_cam = R^T (x_world - t).
    """
    mat = np.asarray(cam_c2w, dtype=np.float64)
    rot = mat[:3, :3]
    trans = mat[:3, 3]
    return rot.T @ (pos_world - trans)
def project_to_2d(pos_cam, fov_x, w, h):
    """Project a single camera-space 3D point to 2D pixel coordinates.

    Uses one focal length (from horizontal FOV) for both axes.
    """
    fx = w / (2.0 * np.tan(fov_x / 2.0))
    # Clamp near-zero depth to avoid division blow-up.
    depth = pos_cam[2] if abs(pos_cam[2]) > 1e-6 else 1e-6
    return np.array([fx * pos_cam[0] / depth + w / 2.0,
                     fx * pos_cam[1] / depth + h / 2.0])


# ---------------------------------------------------------------
# Drawing
# ---------------------------------------------------------------

def draw_trajectory(frame, points_2d, color, thickness=2, dot_radius=4):
    """Draw a trajectory as connected line segments with a dot per point."""
    for prev_pt, cur_pt in zip(points_2d[:-1], points_2d[1:]):
        cv2.line(frame, tuple(prev_pt.astype(int)), tuple(cur_pt.astype(int)),
                 color, thickness, cv2.LINE_AA)
    for pt in points_2d:
        cv2.circle(frame, tuple(pt.astype(int)), dot_radius, color, -1,
                   cv2.LINE_AA)


def draw_current_marker(frame, pt, color, label=""):
    """Draw a highlighted marker (and optional label) for the current position."""
    center = tuple(pt.astype(int))
    cv2.circle(frame, center, 10, color, -1, cv2.LINE_AA)
    cv2.circle(frame, center, 12, (255, 255, 255), 2, cv2.LINE_AA)
    if label:
        cv2.putText(frame, label, (center[0] + 15, center[1] - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)


def draw_action_info(frame, hand_infos, frame_idx):
    """Draw per-hand state and action values on frame.

    Renders a block of text lines anchored near the bottom-left corner,
    three lines per hand plus one header line.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    h, w = frame.shape[:2]
    x = 10
    # Anchor so the whole text block (22 px per line) fits above the bottom.
    y0 = h - 22 * (len(hand_infos) * 3 + 1) - 10

    lines = [(f"Frame {frame_idx}", (255, 255, 255))]
    for info in hand_infos:
        hand_label = "R" if info["hand"] == "right" else "L"
        state = info["state"]
        action = info["action"]
        # state[7] > 0 encodes an open gripper; <= 0 closed.
        grip_str = "OPEN" if state[7] > 0 else "CLOSED"
        grip_color = (0, 255, 0) if state[7] > 0 else (0, 0, 255)

        lines.append((
            f"[{hand_label}] Pos: [{state[0]:+.3f},{state[1]:+.3f},{state[2]:+.3f}]"
            f" Rot: [{state[3]:+.2f},{state[4]:+.2f},{state[5]:+.2f}]",
            (255, 255, 255)))
        lines.append((
            f"[{hand_label}] dPos: [{action[0]:+.4f},{action[1]:+.4f},{action[2]:+.4f}]"
            f" dRot: [{action[3]:+.3f},{action[4]:+.3f},{action[5]:+.3f}]",
            (200, 200, 200)))
        lines.append((f"[{hand_label}] Grip: {grip_str} ({state[7]:+.2f})", grip_color))

    yy = y0
    for text, color in lines:
        (tw, th), _ = cv2.getTextSize(text, font, 0.45, 1)
        # Black backing rectangle keeps text readable on busy frames.
        cv2.rectangle(frame, (x - 2, yy - th - 2), (x + tw + 4, yy + 4),
                      (0, 0, 0), -1)
        cv2.putText(frame, text, (x, yy), font, 0.45, color, 1, cv2.LINE_AA)
        yy += 22
def main():
    """Render per-frame verification overlays (hand mesh + MANO joints +
    wrist trajectory) for both hands and export them as an H.264 video.

    Fixes over the original:
      * an unsupported --data_path extension now raises a clear ValueError
        (previously ``samples`` was left unbound, causing a NameError later);
      * the fallback joint-drawing branch guards ``fid in res["mesh_id_map"]``
        (previously it could read an unbound or stale ``midx``).
    """
    parser = argparse.ArgumentParser(
        description="Verify action annotations with 3D hand mesh + trajectory (both hands)",
    )
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to data (must have hand_action_tags)")
    parser.add_argument("--save_dir", type=str, default="./vis_action_verify")
    parser.add_argument("--sample_idx", type=int, default=0)
    parser.add_argument("--video_idx", type=int, default=0)
    parser.add_argument("--fps", type=float, default=30)
    args = parser.parse_args()

    os.makedirs(args.save_dir, exist_ok=True)

    # Load data
    print(f"Loading: {args.data_path}")
    with open(args.data_path, "rb") as f:
        if args.data_path.endswith(('.json', '.jsonl')):
            samples = [json.loads(line) for line in f.readlines()]
        elif args.data_path.endswith('.pkl'):
            samples = pickle.load(f)
        elif args.data_path.endswith('.parquet'):
            table = pq.read_table(f)
            samples = table.to_pylist()
        else:
            # FIX: previously fell through with `samples` unbound (NameError).
            raise ValueError(f"Unsupported data format: {args.data_path}")

    tgt_sample = samples[args.sample_idx]
    meta = tgt_sample[Fields.meta]
    if isinstance(meta, bytes):
        meta = pickle.loads(meta)

    frames = tgt_sample[MetaKeys.video_frames][args.video_idx]

    assert MetaKeys.hand_action_tags in meta, "Need hand_action_tags"
    assert MetaKeys.hand_reconstruction_hawor_tags in meta, "Need hawor tags"
    assert MetaKeys.video_camera_pose_tags in meta, "Need camera pose tags"

    action_tags = meta[MetaKeys.hand_action_tags][args.video_idx]
    hawor = meta[MetaKeys.hand_reconstruction_hawor_tags][args.video_idx]
    cam_pose = meta[MetaKeys.video_camera_pose_tags][args.video_idx]

    from data_juicer.utils.file_utils import load_numpy
    cam_c2w_all = load_numpy(cam_pose["cam_c2w"])

    fov_x = hawor["fov_x"]

    # Detect data format: new {"right":{...}, "left":{...}} or old flat
    if "states" in action_tags:
        # Old flat format — wrap as single hand (assume right)
        ht = action_tags.get("hand_type", "right")
        action_tags = {ht: action_tags}

    img = load_image(frames[0])
    img_h, img_w = img.shape[:2]

    # ---- Process BOTH hands ----
    hand_colors = {
        "right": {"gt": (0, 255, 0), "recon": (0, 165, 255), "mesh": (180, 120, 200)},
        "left": {"gt": (255, 255, 0), "recon": (255, 0, 255), "mesh": (200, 150, 50)},
    }

    hand_results = {}

    for hand_type in ["right", "left"]:
        # Read action data from pipeline output
        hand_action = action_tags.get(hand_type, {})
        states_raw = hand_action.get("states", [])
        actions_raw = hand_action.get("actions", [])
        valid_ids = hand_action.get("valid_frame_ids", [])

        if len(states_raw) < 2:
            print(f"\n {hand_type} hand: no action data from pipeline, skipping")
            continue

        states = np.array(states_raw, dtype=np.float64)
        actions = np.array(actions_raw, dtype=np.float64)

        print(f"\n === {hand_type.upper()} hand ===")
        is_left = (hand_type == "left")

        # MANO mesh from hawor data (support both new and legacy format)
        if hand_type in hawor and isinstance(hawor[hand_type], dict):
            hand = hawor[hand_type]
            frame_ids = hand.get("frame_ids", [])
            hand_transl = hand.get("transl", [])
            hand_orient = hand.get("global_orient", [])
            hand_pose = hand.get("hand_pose", [])
            hand_betas = hand.get("betas", [])
        else:
            prefix = f"{hand_type}_"
            frame_ids = hawor.get(f"{prefix}frame_id_list", [])
            hand_transl = hawor.get(f"{prefix}transl_list", [])
            hand_orient = hawor.get(f"{prefix}global_orient_list", [])
            hand_pose = hawor.get(f"{prefix}hand_pose_list", [])
            hand_betas = hawor.get(f"{prefix}beta_list", [])

        # Read pipeline's joints_cam from action_tags (21 MANO joints)
        joints_cam_raw = hand_action.get("joints_cam", None)
        if joints_cam_raw is not None and len(joints_cam_raw) > 0:
            joints_cam = np.array(joints_cam_raw, dtype=np.float64)
            joints_cam_id_map = {fid: i for i, fid in enumerate(valid_ids)}
            print(f" joints_cam (from action_tags): {joints_cam.shape}")
        else:
            joints_cam = None
            joints_cam_id_map = {}

        # Backup mesh_joints, used when joints_cam is not provided in the data.
        if len(frame_ids) >= 2:
            mesh_verts, mesh_joints, mesh_faces = compute_hand_mesh(
                hand_transl, hand_orient, hand_pose, hand_betas,
                is_left=is_left,
            )
            print(f" Mesh: vertices {mesh_verts.shape}")
        else:
            mesh_verts, mesh_joints, mesh_faces, frame_ids = (
                None, None, None, [])

        T = len(states)
        print(f" States: {states.shape}, Actions: {actions.shape}, Valid: {len(valid_ids)}")

        mesh_id_map = {fid: i for i, fid in enumerate(frame_ids)}

        # Project action states to 2D at actual wrist joint position.
        # Convert world-space states back to camera space, then shift
        # from MANO transl to wrist joint so trajectory sits on the hand.
        wrist_2d = []
        for t in range(T):
            fid = valid_ids[t]
            pos_cam = world_to_camera(states[t, :3], cam_c2w_all[fid])
            if joints_cam is not None and fid in joints_cam_id_map:
                jidx = joints_cam_id_map[fid]
                wrist_cam = joints_cam[jidx, 0, :]
                # Find matching hawor transl for this frame
                if fid in mesh_id_map:
                    midx = mesh_id_map[fid]
                    pos_cam = pos_cam + (wrist_cam - np.asarray(hand_transl[midx]))
            # Backup mesh_joints, used when joints_cam is not provided in the data.
            elif mesh_joints is not None and fid in mesh_id_map:
                midx = mesh_id_map[fid]
                wrist_cam = mesh_joints[midx, 0, :]
                pos_cam = pos_cam + (wrist_cam - np.asarray(hand_transl[midx]))
            wrist_2d.append(project_to_2d(pos_cam, fov_x, img_w, img_h))
        wrist_2d = np.array(wrist_2d)

        hand_results[hand_type] = {
            "states": states,
            "actions": actions,
            "valid_ids": valid_ids,
            "wrist_2d": wrist_2d,
            "mesh_verts": mesh_verts,
            "mesh_joints": mesh_joints,
            "mesh_faces": mesh_faces,
            "mesh_id_map": mesh_id_map,
            "frame_ids": frame_ids,
            "joints_cam": joints_cam,
            "joints_cam_id_map": joints_cam_id_map,
        }

    if not hand_results:
        print("No valid hand data found!")
        return

    # Collect all valid frame ids across both hands
    all_valid_fids = sorted(set().union(
        *(set(r["valid_ids"]) for r in hand_results.values())))

    # ---- Render video frames ----
    print(f"\nRendering {len(all_valid_fids)} frames...")
    frames_dir = os.path.join(args.save_dir, "frames")
    os.makedirs(frames_dir, exist_ok=True)
    output_frames = []

    for frame_seq, fid in enumerate(all_valid_fids):
        frame = load_image(frames[fid])
        if frame is None:
            continue

        canvas = frame.copy()
        hand_infos = []

        for hand_type, res in hand_results.items():
            colors = hand_colors[hand_type]

            # -- Draw 3D hand mesh --
            if res["mesh_verts"] is not None and fid in res["mesh_id_map"]:
                midx = res["mesh_id_map"][fid]
                verts_2d = project_points_to_2d(
                    res["mesh_verts"][midx], fov_x, img_w, img_h)
                draw_mesh_filled(canvas, verts_2d, res["mesh_faces"],
                                 colors["mesh"], alpha=0.25)
                draw_mesh_wireframe(canvas, verts_2d, res["mesh_faces"],
                                    colors["mesh"], alpha=0.5, thickness=1)

            # -- Draw 21 MANO joints from pipeline's joints_cam --
            if res["joints_cam"] is not None and fid in res["joints_cam_id_map"]:
                jidx = res["joints_cam_id_map"][fid]
                joints_2d = project_points_to_2d(
                    res["joints_cam"][jidx], fov_x, img_w, img_h)
                draw_joints(canvas, joints_2d,
                            joint_radius=4, bone_thickness=2,
                            alpha=0.85)
            # Backup mesh_joints, used when joints_cam is not provided.
            # FIX: guard on mesh_id_map so `midx` is never unbound/stale.
            elif res["mesh_joints"] is not None and fid in res["mesh_id_map"]:
                joints_2d = project_points_to_2d(
                    res["mesh_joints"][res["mesh_id_map"][fid]],
                    fov_x, img_w, img_h)
                draw_joints(canvas, joints_2d,
                            joint_radius=4, bone_thickness=2,
                            alpha=0.85)

            # -- Draw wrist trajectory --
            if fid not in res["valid_ids"]:
                continue

            t_idx = res["valid_ids"].index(fid)
            trail = res["wrist_2d"][:t_idx + 1]

            if len(trail) >= 2:
                draw_trajectory(canvas, trail, colors["gt"],
                                thickness=3, dot_radius=4)

            label_prefix = "R" if hand_type == "right" else "L"
            draw_current_marker(canvas, res["wrist_2d"][t_idx],
                                colors["gt"], label_prefix)

            hand_infos.append({
                "hand": hand_type,
                "state": res["states"][t_idx],
                "action": res["actions"][t_idx],
            })

        # Draw action state info
        if hand_infos:
            draw_action_info(canvas, hand_infos, fid)

        out_path = os.path.join(frames_dir, f"verify_{frame_seq:04d}.jpg")
        cv2.imwrite(out_path, canvas)
        output_frames.append(canvas)

    # ---- Save video ----
    if output_frames:
        import av
        video_path = os.path.join(args.save_dir, "action_verify.mp4")
        out_h, out_w = output_frames[0].shape[:2]
        # PyAV expects a rational frame rate.
        fps_frac = fractions.Fraction(args.fps).limit_denominator(10000)

        container = av.open(video_path, mode="w")
        stream = container.add_stream("libx264", rate=fps_frac)
        stream.width = out_w
        stream.height = out_h
        stream.pix_fmt = "yuv420p"
        stream.options = {"crf": "18", "preset": "medium"}

        for f_bgr in output_frames:
            f_rgb = cv2.cvtColor(f_bgr, cv2.COLOR_BGR2RGB)
            av_frame = av.VideoFrame.from_ndarray(f_rgb, format="rgb24")
            for pkt in stream.encode(av_frame):
                container.mux(pkt)
        # Flush any buffered packets before closing.
        for pkt in stream.encode():
            container.mux(pkt)
        container.close()

        size_mb = os.path.getsize(video_path) / (1024 * 1024)
        print(f"\nVideo: {video_path} ({size_mb:.1f} MB)")

    print("Done!")


if __name__ == "__main__":
    main()
def write_video_ffmpeg(frames_bgr: list[np.ndarray], out_path: str, fps: float = 30.0):
    """Encode BGR frames into an H.264 mp4 via an ffmpeg rawvideo pipe.

    Writes to a local temp file first, then moves it into place
    (avoids faststart seek issues on NFS). No-op on an empty frame list.
    """
    if not frames_bgr:
        return
    import shutil
    import tempfile

    h, w = frames_bgr[0].shape[:2]

    # write to local /tmp (to avoid NFS not supporting faststart seek)
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".mp4")
    os.close(tmp_fd)

    ffmpeg_cmd = [
        "ffmpeg", "-y",
        "-f", "rawvideo",
        "-vcodec", "rawvideo",
        "-pix_fmt", "bgr24",
        "-s", f"{w}x{h}",
        "-r", str(fps),
        "-i", "-",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        "-preset", "fast",
        "-crf", "23",
        "-movflags", "+faststart",
        tmp_path,
    ]
    payload = b"".join(frame.tobytes() for frame in frames_bgr)
    proc = subprocess.Popen(
        ffmpeg_cmd, stdin=subprocess.PIPE,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
    )
    _, stderr = proc.communicate(input=payload)
    if proc.returncode != 0:
        print(f" ffmpeg 写入失败 (rc={proc.returncode}): {out_path}")
        print(f" stderr: {stderr.decode()[-200:]}")
        os.unlink(tmp_path)
        return

    shutil.move(tmp_path, out_path)


def load_result(json_path: str) -> dict:
    """Load a JSON result file as a dict."""
    with open(json_path) as f:
        return json.load(f)
def export_atomic_segments_as_videos(
    data: dict,
    output_dir: str,
    fps: float = 30.0,
):
    """Save each atomic action segment as an independent video.

    Output directory structure:
        output_dir/
            left/
                seg0_f000-f040.mp4
                ...
            right/
                seg0_f000-f028.mp4
                ...
    """
    meta = data.get("__dj__meta__", {})
    segments = meta.get("atomic_action_segments", [])
    if not segments:
        print("Not found atomic_action_segments")
        return

    # get frame paths
    frames = data.get("video_frames", [])
    if frames and isinstance(frames[0], list):
        frames = frames[0]
    if not frames:
        print("Not found video_frames")
        return

    # separate by hand type
    for hand_type in ("left", "right"):
        segs_for_hand = [s for s in segments if s["hand_type"] == hand_type]
        if not segs_for_hand:
            continue

        hand_dir = os.path.join(output_dir, hand_type)
        os.makedirs(hand_dir, exist_ok=True)

        for seg in segs_for_hand:
            seg_id = seg["segment_id"]
            start = seg["start_frame"]
            end = seg["end_frame"]
            # Fall back to the full [start, end] range when the segment
            # carries no explicit valid frame ids.
            valid_fids = seg.get("valid_frame_ids", list(range(start, end + 1)))

            out_path = os.path.join(
                hand_dir, f"seg{seg_id}_f{start:03d}-f{end:03d}.mp4")

            collected_frames = []
            for fid in valid_fids:
                if fid >= len(frames):
                    continue
                img = cv2.imread(frames[fid])
                if img is None:
                    continue

                # put text label on the frame
                label = f"{hand_type.upper()} seg{seg_id} frame={fid}"
                cv2.putText(
                    img, label, (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2,
                )
                collected_frames.append(img)

            if not collected_frames:
                print(f" Skip {hand_type} seg{seg_id}: no valid frames")
                continue

            write_video_ffmpeg(collected_frames, out_path, fps)
            n_written = len(collected_frames)
            print(f" [{hand_type}] seg{seg_id}: frames {start}-{end} "
                  f"({n_written} frames) → {out_path}")
def make_overlay_grid(
    data: dict,
    output_dir: str,
    grid_cols: int = 4,
    thumb_w: int = 480,
):
    """Make a grid image out of each segment's trajectory-overlay frames.

    One JPEG per segment, thumbnails laid out `grid_cols` per row with a
    title bar on top.
    """
    meta = data.get("__dj__meta__", {})
    segments = meta.get("atomic_action_segments", [])
    if not segments:
        print("Not found atomic_action_segments")
        return

    os.makedirs(output_dir, exist_ok=True)

    for seg in segments:
        hand_type = seg["hand_type"]
        seg_id = seg["segment_id"]
        overlay_paths = seg.get("overlay_frames", [])
        if not overlay_paths:
            continue

        # read all overlay frames, dropping unreadable ones
        imgs = [im for im in (cv2.imread(p) for p in overlay_paths)
                if im is not None]
        if not imgs:
            continue

        # resize to uniform thumbnail size (preserve aspect ratio)
        thumb_h = int(thumb_w * imgs[0].shape[0] / imgs[0].shape[1])
        thumbs = [cv2.resize(im, (thumb_w, thumb_h)) for im in imgs]

        # add frame index label on each thumbnail
        sampled_indices = seg.get("sampled_frame_indices", [])
        for i, th in enumerate(thumbs):
            fid_label = f"f{sampled_indices[i]}" if i < len(sampled_indices) else f"#{i}"
            cv2.putText(
                th, fid_label, (5, 25),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2,
            )

        # make grid, padding the last row with blank tiles
        n = len(thumbs)
        rows = (n + grid_cols - 1) // grid_cols
        while len(thumbs) < rows * grid_cols:
            thumbs.append(np.zeros_like(thumbs[0]))

        grid = np.vstack([
            np.hstack(thumbs[r * grid_cols: (r + 1) * grid_cols])
            for r in range(rows)
        ])

        # add title bar on top
        title_h = 40
        title_bar = np.zeros((title_h, grid.shape[1], 3), dtype=np.uint8)
        title = (f"{hand_type.upper()} seg{seg_id} "
                 f"frames {seg['start_frame']}-{seg['end_frame']}")
        cv2.putText(
            title_bar, title, (10, 28),
            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2,
        )
        grid = np.vstack([title_bar, grid])

        out_path = os.path.join(
            output_dir,
            f"grid_{hand_type}_seg{seg_id}_f{seg['start_frame']:03d}-f{seg['end_frame']:03d}.jpg",
        )
        cv2.imwrite(out_path, grid)
        print(f" Grid: {out_path}")
def make_overlay_video(
    data: dict,
    output_path: str,
    fps: float = 2.0,
):
    """Concatenate all segments' overlay frames into one video.

    Each segment is preceded by a black separator frame carrying the
    segment's hand type, id and frame range.
    """
    meta = data.get("__dj__meta__", {})
    segments = meta.get("atomic_action_segments", [])
    if not segments:
        return

    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # determine video size from the first readable overlay frame
    ref_img = next(
        (img
         for seg in segments
         for p in seg.get("overlay_frames", [])
         if (img := cv2.imread(p)) is not None),
        None,
    )
    if ref_img is None:
        return

    h, w = ref_img.shape[:2]
    collected_frames = []

    for seg in segments:
        hand_type = seg["hand_type"]
        seg_id = seg["segment_id"]
        overlay_paths = seg.get("overlay_frames", [])
        sampled_indices = seg.get("sampled_frame_indices", [])

        # make a separator frame
        sep = np.zeros((h, w, 3), dtype=np.uint8)
        text = (f"{hand_type.upper()} Seg {seg_id} "
                f"Frames {seg['start_frame']}-{seg['end_frame']}")
        cv2.putText(
            sep, text, (w // 6, h // 2),
            cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 255, 255), 3,
        )
        collected_frames.append(sep)

        for i, p in enumerate(overlay_paths):
            img = cv2.imread(p)
            if img is None:
                continue
            img = cv2.resize(img, (w, h))
            fid_label = f"f{sampled_indices[i]}" if i < len(sampled_indices) else f"#{i}"
            label = f"{hand_type} seg{seg_id} {fid_label}"
            cv2.putText(
                img, label, (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2,
            )
            collected_frames.append(img)

    write_video_ffmpeg(collected_frames, output_path, fps)
    print(f" Overlay video: {output_path}")
# ─────────────────────────────────────────────────────────────
# 3. Segmentation timeline plot: plot speed curve + segmentation points
# ─────────────────────────────────────────────────────────────
def plot_segmentation_timeline(
    data: dict,
    output_path: str,
):
    """Plot left and right hand speed curves + atomic action segmentation points."""
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        import matplotlib.patches as mpatches
    except ImportError:
        print("Matplotlib is not installed, skipping timeline plot")
        return

    meta = data.get("__dj__meta__", {})
    hand_action = meta.get("hand_action_tags", [])
    segments = meta.get("atomic_action_segments", [])

    if not hand_action:
        return

    merged = hand_action[0] if isinstance(hand_action, list) else hand_action

    fig, axes = plt.subplots(2, 1, figsize=(18, 8), sharex=True)

    colors_seg = plt.cm.tab10.colors

    for ax_idx, hand_type in enumerate(("left", "right")):
        ax = axes[ax_idx]
        hdata = merged.get(hand_type, {})
        states = hdata.get("states", [])
        if not states:
            ax.set_title(f"{hand_type.upper()} hand - no data")
            continue

        states_arr = np.array(states, dtype=np.float64)
        positions = states_arr[:, 0:3]
        # Per-frame speed = norm of position delta; prepend 0 so the
        # curve has one value per frame.
        speed = np.linalg.norm(np.diff(positions, axis=0), axis=1)
        speed = np.concatenate([[0.0], speed])

        ax.plot(speed, color="gray", alpha=0.5, label="raw speed")

        # Savitzky-Golay smoothing (optional, needs scipy)
        try:
            from scipy.signal import savgol_filter
            win = min(11, len(speed))
            if win % 2 == 0:
                win -= 1
            if win >= 3:
                smooth = savgol_filter(speed, win, polyorder=2)
                ax.plot(smooth, color="blue", linewidth=1.5, label="smooth speed")
        except Exception:
            pass

        # Shade each segment's frame interval
        hand_segs = [s for s in segments if s["hand_type"] == hand_type]
        for i, seg in enumerate(hand_segs):
            c = colors_seg[i % len(colors_seg)]
            ax.axvspan(
                seg["start_frame"], seg["end_frame"],
                alpha=0.15, color=c,
            )
            mid = (seg["start_frame"] + seg["end_frame"]) / 2
            ax.annotate(
                f"seg{seg['segment_id']}",
                (mid, ax.get_ylim()[1] * 0.9 if ax.get_ylim()[1] > 0 else 0.1),
                ha="center", fontsize=9, color=c, fontweight="bold",
            )
            # Segment boundary lines
            ax.axvline(seg["start_frame"], color=c, linestyle="--", alpha=0.5)
            ax.axvline(seg["end_frame"], color=c, linestyle="--", alpha=0.5)

        ax.set_title(f"{hand_type.upper()} Hand — Speed & Segments")
        ax.set_ylabel("Speed (m/frame)")
        ax.legend(loc="upper right")
        ax.grid(True, alpha=0.3)

    axes[1].set_xlabel("Frame")
    plt.tight_layout()
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f" Timeline: {output_path}")
# ─────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────
def main():
    """CLI entry: run all visualizations for every JSON result in a directory.

    Fix over the original: validate ``len(sys.argv)`` BEFORE indexing
    ``sys.argv[1]`` — previously running with no arguments raised an
    IndexError instead of printing the usage message.
    """
    if len(sys.argv) < 2:
        print("Usage: python visualize_segments.py ")
        return

    data_dir = sys.argv[1]

    vis_dir = os.path.join(data_dir, "visualization")
    os.makedirs(vis_dir, exist_ok=True)

    # find all json results
    json_files = sorted(Path(data_dir).glob("*.json"))
    if not json_files:
        print(f"No json files found in {data_dir}")
        return

    for jf in json_files:
        print(f"\n{'='*60}")
        print(f"Processing: {jf.name}")
        print(f"{'='*60}")
        data = load_result(str(jf))

        sample_id = jf.stem
        sample_vis = os.path.join(vis_dir, sample_id)

        # 1. Atomic action segments → short videos
        print("\n[1] Export atomic action segments as videos (left and right hand separated)...")
        seg_video_dir = os.path.join(sample_vis, "atomic_segments")
        export_atomic_segments_as_videos(data, seg_video_dir, fps=30.0)

        # 2. Trajectory overlay grids
        print("\n[2] Generate trajectory overlay grids...")
        grid_dir = os.path.join(sample_vis, "overlay_grids")
        make_overlay_grid(data, grid_dir)

        # 3. Trajectory overlay video
        print("\n[3] Generate trajectory overlay continuous video...")
        overlay_vid = os.path.join(sample_vis, "overlay_all.mp4")
        make_overlay_video(data, overlay_vid, fps=2.0)

        # 4. Segmentation timeline plot
        print("\n[4] Generate segmentation timeline plot...")
        timeline_path = os.path.join(sample_vis, "segmentation_timeline.png")
        plot_segmentation_timeline(data, timeline_path)

    print(f"\nAll visualization results saved in: {vis_dir}")


if __name__ == "__main__":
    main()
@OPERATORS.register_module("video_hawor_megasam_combined_mapper")
class VideoHaWorMegaSaMCombinedMapper(Mapper):
    """Combined HaWoR hand reconstruction + MegaSaM camera pose estimation.

    Runs both GPU models sequentially on the same actor to:
    1. Avoid inter-stage data serialization overhead
    2. Simplify Ray scheduling (fewer GPU stages = less resource contention)
    3. Share GPU memory efficiently
    """

    _accelerator = "cuda"

    def __init__(
        self,
        # HaWoR params
        hawor_model_path: str = "hawor.ckpt",
        hawor_config_path: str = "model_config.yaml",
        hawor_detector_path: str = "detector.pt",
        mano_right_path: str = "MANO_RIGHT.pkl",
        mano_left_path: str = "MANO_LEFT.pkl",
        camera_calibration_field: str = MetaKeys.camera_calibration_moge_tags,
        hawor_tag_field: str = MetaKeys.hand_reconstruction_hawor_tags,
        frame_field: str = MetaKeys.video_frames,
        hawor_thresh: float = 0.2,
        # MegaSaM params
        megasam_tag_field: str = MetaKeys.video_camera_pose_tags,
        megasam_max_frames: int = 1000,
        megasam_droid_buffer: int = 1024,
        megasam_save_dir: str = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        # Both sub-ops share batch/skip behavior; constructor kwargs are
        # stored and the heavy ops are built lazily on first use.
        skip_errors = kwargs.get("skip_op_error", False)
        self._hawor_kwargs = dict(
            hawor_model_path=hawor_model_path,
            hawor_config_path=hawor_config_path,
            hawor_detector_path=hawor_detector_path,
            mano_right_path=mano_right_path,
            mano_left_path=mano_left_path,
            camera_calibration_field=camera_calibration_field,
            tag_field_name=hawor_tag_field,
            frame_field=frame_field,
            thresh=hawor_thresh,
            batch_mode=True,
            skip_op_error=skip_errors,
        )
        self._megasam_kwargs = dict(
            tag_field_name=megasam_tag_field,
            camera_calibration_field=camera_calibration_field,
            frame_field=frame_field,
            max_frames=megasam_max_frames,
            droid_buffer=megasam_droid_buffer,
            save_dir=megasam_save_dir,
            batch_mode=True,
            skip_op_error=skip_errors,
        )

        self._hawor_op = None
        self._megasam_op = None

    def _ensure_ops(self):
        # Lazily instantiate the wrapped ops (they load GPU models).
        if self._hawor_op is None:
            self._hawor_op = VideoHandReconstructionHaworMapper(**self._hawor_kwargs)
        if self._megasam_op is None:
            self._megasam_op = VideoCameraPoseMegaSaMMapper(**self._megasam_kwargs)

    def process_single(self, sample=None, rank=None):
        from loguru import logger as _logger

        self._ensure_ops()

        sample = self._hawor_op.process_single(sample, rank=rank)

        try:
            sample = self._megasam_op.process_single(sample, rank=rank)
        except Exception as e:
            # MegaSaM failure should not discard HaWoR results.
            # Write empty camera pose so downstream can still detect
            # the missing data gracefully instead of crashing.
            import traceback
            _logger.error(f"MegaSaM failed (HaWoR result preserved): "
                          f"{e} -- {traceback.format_exc()}")
            pose_field = self._megasam_kwargs.get(
                "tag_field_name", MetaKeys.video_camera_pose_tags)
            if Fields.meta not in sample:
                sample[Fields.meta] = {}
            # One empty entry per video clip
            frame_field = self._hawor_kwargs.get(
                "frame_field", MetaKeys.video_frames)
            clip_count = len(sample.get(frame_field, []))
            sample[Fields.meta][pose_field] = [
                {} for _ in range(max(1, clip_count))
            ]

        return sample
tag_field_name=MetaKeys.camera_calibration_moge_tags, + frame_field=MetaKeys.video_frames, + output_depth=True, + output_points=False, + output_mask=False, + save_dir=os.path.join(output_dir, 'moge_arrays'), + batch_mode=True, + skip_op_error=skip_op_error, + ), + batch_size=1, + num_gpus=0.15, # adjust the ratio based on the gpu type + batch_format="pyarrow", + compute=ActorPoolStrategy(min_size=1, max_size=2), # adjust the scope based on available resources + runtime_env=None, + ) + + ds = ds.map_batches( + VideoHaWorMegaSaMCombinedMapper, + fn_constructor_kwargs=dict( + camera_calibration_field=MetaKeys.camera_calibration_moge_tags, + hawor_tag_field=MetaKeys.hand_reconstruction_hawor_tags, + megasam_tag_field=MetaKeys.video_camera_pose_tags, + mano_right_path='/path/to/MANO_RIGHT.pkl', + mano_left_path='/path/to/MANO_LEFT.pkl', + frame_field=MetaKeys.video_frames, + megasam_max_frames=1000, + megasam_save_dir=os.path.join(output_dir, 'megasam_arrays'), + batch_mode=True, + skip_op_error=skip_op_error, + ), + batch_size=1, + num_gpus=0.25, # adjust the ratio based on the gpu type + batch_format="pyarrow", + runtime_env={"conda": "mega-sam"}, + compute=ActorPoolStrategy(min_size=1, max_size=2), # adjust the scope based on available resources + ) + + ds = ds.map_batches( + VideoHandActionComputeMapper, + fn_constructor_kwargs=dict( + hand_reconstruction_field=MetaKeys.hand_reconstruction_hawor_tags, # outputs of VideoHandReconstructionHaworMapper + camera_pose_field=MetaKeys.video_camera_pose_tags, # outputs of VideoCameraPoseMegaSaMMapper + tag_field_name=MetaKeys.hand_action_tags, + hand_type="both", + batch_mode=True, + skip_op_error=skip_op_error, + ), + batch_size=1, + num_cpus=1, + batch_format="pyarrow", + ) + + ds = ds.map_batches( + VideoActionCaptioningMapper, + fn_constructor_kwargs=dict( + api_or_hf_model='qwen-vl-max', + is_api_model=True, + hand_type='both', + frame_field=MetaKeys.video_frames, + tag_field_name="hand_action_caption", + 
batch_mode=True, + skip_op_error=skip_op_error, + ), + batch_size=1, + num_cpus=1, + batch_format="pyarrow", + ) + + ds = ds.map_batches( + ExportToLeRobotMapper, + fn_constructor_kwargs=dict( + output_dir=lerobot_output_dir, + hand_action_field=MetaKeys.hand_action_tags, # outputs of VideoHandActionComputeMapper + frame_field=MetaKeys.video_frames, + video_key=video_key, + task_description_key="text", + fps=10, + robot_type="egodex_hand", + batch_mode=True, + skip_op_error=skip_op_error, + ), + batch_size=1, + num_cpus=1, + batch_format="pyarrow", + runtime_env=None, + ) + + ds.write_parquet(output_dir) + # ds.write_json(output_dir, force_ascii=False) + + ExportToLeRobotMapper.finalize_dataset( + output_dir=lerobot_output_dir, + fps=10, + robot_type="egodex_hand", + ) + + print(f"LeRobot exported to: {lerobot_output_dir}") + print(f'>>>>total cost time: {time.time() - s_time}') diff --git a/docs/Operators.md b/docs/Operators.md index 2418e8e0af..27c7dd73ff 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -46,7 +46,7 @@ Data-Juicer 中的算子分为以下 8 种类型。 | [filter](#filter) | 56 | Filters out low-quality samples. 过滤低质量样本。 | | [formatter](#formatter) | 8 | Discovers, loads, and canonicalizes source data. 发现、加载、规范化原始数据。 | | [grouper](#grouper) | 3 | Group samples to batched samples. 将样本分组,每一组组成一个批量样本。 | -| [mapper](#mapper) | 105 | Edits and transforms samples. 对数据样本进行编辑和转换。 | +| [mapper](#mapper) | 112 | Edits and transforms samples. 对数据样本进行编辑和转换。 | | [pipeline](#pipeline) | 3 | Applies dataset-level processing; both input and output are datasets. 执行数据集级别的操作,输入和输出均为完整数据集。 | | [selector](#selector) | 5 | Selects top samples based on ranking. 基于排序选取高质量样本。 | @@ -200,6 +200,7 @@ All the specific operators are listed below, each featured with several capabili | dialog_topic_detection_mapper | 💻CPU 🔗API 🟢Stable | Generates user's topic labels and analysis in a dialog. 
在对话框中生成用户的主题标签和分析。 | [info](operators/mapper/dialog_topic_detection_mapper.md) | - | | download_file_mapper | 💻CPU 🟡Beta | Mapper to download URL files to local files or load them into memory. 映射器将URL文件下载到本地文件或将其加载到内存中。 | [info](operators/mapper/download_file_mapper.md) | - | | expand_macro_mapper | 🔤Text 💻CPU 🟢Stable | Expands macro definitions in the document body of LaTeX samples. 展开LaTeX示例文档主体中的宏定义。 | [info](operators/mapper/expand_macro_mapper.md) | - | +| export_to_lerobot_mapper | 🔮Multimodal 💻CPU 🟡Beta | Export processed video data to LeRobot v2.0 dataset format (LIBERO-style). 将处理后的视频数据导出为LeRobot v2.0数据集格式 (LIBERO风格)。 | - | - | | extract_entity_attribute_mapper | 🔤Text 💻CPU 🔗API 🟢Stable | Extracts attributes for given entities from the text and stores them in the sample's metadata. 从文本中提取给定实体的属性,并将其存储在示例的元数据中。 | [info](operators/mapper/extract_entity_attribute_mapper.md) | - | | extract_entity_relation_mapper | 🔤Text 💻CPU 🔗API 🟢Stable | Extracts entities and relations from text to build a knowledge graph. 从文本中提取实体和关系以构建知识图谱。 | [info](operators/mapper/extract_entity_relation_mapper.md) | - | | extract_event_mapper | 🔤Text 💻CPU 🔗API 🟢Stable | Extracts events and relevant characters from the text. 从文本中提取事件和相关字符。 | [info](operators/mapper/extract_event_mapper.md) | - | @@ -259,20 +260,25 @@ All the specific operators are listed below, each featured with several capabili | text_chunk_mapper | 🔤Text 💻CPU 🔗API 🟢Stable | Split input text into chunks based on specified criteria. 根据指定的条件将输入文本拆分为块。 | [info](operators/mapper/text_chunk_mapper.md) | - | | text_tagging_by_prompt_mapper | 🔤Text 🚀GPU 🌊vLLM 🧩HF 🟡Beta | Mapper to generate text tags using prompt with LLM. Mapper使用带有LLM的prompt生成文本标记。 | [info](operators/mapper/text_tagging_by_prompt_mapper.md) | - | | vggt_mapper | 🎬Video 🚀GPU 🟡Beta | Input a video of a single scene, and use VGGT to extract information including Camera Pose, Depth Maps, Point Maps, and 3D Point Tracks. 
输入单个场景的视频,并使用VGGT提取包括相机姿态、深度图、点图和3D点轨迹的信息。 | [info](operators/mapper/vggt_mapper.md) | - | -| video_camera_calibration_static_deepcalib_mapper | 🎬Video 🚀GPU 🟡Beta | Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib. 使用DeepCalib计算静态摄像机的摄像机内部和视场 (FOV)。 | [info](operators/mapper/video_camera_calibration_static_deepcalib_mapper.md) | - | -| video_camera_calibration_static_moge_mapper | 🎬Video 🚀GPU 🟡Beta | Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib). 使用Moge-2 (比DeepCalib更准确) 计算静态摄像机的摄像机内部函数和视场 (FOV)。 | [info](operators/mapper/video_camera_calibration_static_moge_mapper.md) | - | -| video_camera_pose_mapper | 🎬Video 🚀GPU 🟡Beta | Extract camera poses by leveraging MegaSaM and MoGe-2. 通过利用MegaSaM和MoGe-2提取相机姿势。 | - | - | +| video_atomic_action_segment_mapper | 🎬Video 💻CPU 🟡Beta | Segment a unified hand trajectory into atomic action clips. 将统一的手轨迹分割成原子动作片段。 | - | - | +| video_camera_calibration_deepcalib_mapper | 🎬Video 🚀GPU 🟡Beta | Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib. 使用DeepCalib计算静态摄像机的摄像机内部和视场 (FOV)。 | [info](operators/mapper/video_camera_calibration_deepcalib_mapper.md) | - | +| video_camera_calibration_droidcalib_mapper | 🎬Video 🚀GPU 🟡Beta | Extract camera intrinsics from videos using DroidCalib. 使用DroidCalib从视频中提取相机内部函数。 | - | - | +| video_camera_calibration_moge_mapper | 🎬Video 🚀GPU 🟡Beta | Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib). 使用Moge-2 (比DeepCalib更准确) 计算静态摄像机的摄像机内部函数和视场 (FOV)。 | [info](operators/mapper/video_camera_calibration_moge_mapper.md) | - | +| video_camera_pose_megasam_mapper | 🎬Video 🚀GPU 🟡Beta | Extract camera poses by leveraging MegaSaM and MoGe-2. 
通过利用MegaSaM和MoGe-2提取相机姿势。 | - | - | | video_captioning_from_audio_mapper | 🔮Multimodal 🚀GPU 🧩HF 🟢Stable | Mapper to caption a video according to its audio streams based on Qwen-Audio model. 映射器根据基于qwen-audio模型的音频流为视频添加字幕。 | [info](operators/mapper/video_captioning_from_audio_mapper.md) | - | | video_captioning_from_frames_mapper | 🔮Multimodal 🚀GPU 🧩HF 🟢Stable | Generates video captions from sampled frames using an image-to-text model. 使用图像到文本模型从采样帧生成视频字幕。 | [info](operators/mapper/video_captioning_from_frames_mapper.md) | - | | video_captioning_from_summarizer_mapper | 🔮Multimodal 🚀GPU 🧩HF 🟢Stable | Mapper to generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...). 映射器通过总结几种生成的文本 (来自视频/音频/帧的字幕,来自音频/帧的标签,...) 来生成视频字幕。 | [info](operators/mapper/video_captioning_from_summarizer_mapper.md) | - | | video_captioning_from_video_mapper | 🔮Multimodal 🚀GPU 🧩HF 🟢Stable | Generates video captions using a Hugging Face video-to-text model and sampled video frames. 使用拥抱面部视频到文本模型和采样视频帧生成视频字幕。 | [info](operators/mapper/video_captioning_from_video_mapper.md) | - | | video_captioning_from_vlm_mapper | 🔮Multimodal 🚀GPU 🌊vLLM 🧩HF 🟡Beta | Generates video captions using a VLM that accepts videos as inputs. 使用接受视频作为输入的VLM生成视频字幕。 | [info](operators/mapper/video_captioning_from_vlm_mapper.md) | - | +| video_clip_reassembly_mapper | 🎬Video 💻CPU 🟡Beta | Reassemble hand-action results from overlapping video clips. 重新组合重叠视频剪辑的手部动作结果。 | - | - | | video_depth_estimation_mapper | 🎬Video 🚀GPU 🟡Beta | Perform depth estimation on the video. 对视频进行深度估计。 | [info](operators/mapper/video_depth_estimation_mapper.md) | - | | video_extract_frames_mapper | 🔮Multimodal 💻CPU 🟢Stable | Mapper to extract frames from video files according to specified methods. 映射器根据指定的方法从视频文件中提取帧。 | [info](operators/mapper/video_extract_frames_mapper.md) | - | | video_face_blur_mapper | 🎬Video 💻CPU 🟢Stable | Mapper to blur faces detected in videos.
映射器模糊在视频中检测到的人脸。 | [info](operators/mapper/video_face_blur_mapper.md) | - | | video_ffmpeg_wrapped_mapper | 🎬Video 💻CPU 🟢Stable | Wraps FFmpeg video filters for processing video files in a dataset. 包装FFmpeg视频过滤器,用于处理数据集中的视频文件。 | [info](operators/mapper/video_ffmpeg_wrapped_mapper.md) | - | +| video_hand_action_compute_mapper | 🎬Video 💻CPU 🟡Beta | Compute 7-DoF actions and 8-dim states from hand reconstruction and camera pose results. 根据手重建和相机姿态结果计算7自由度动作和8维状态。 | - | - | +| video_hand_motion_smooth_mapper | 🎬Video 💻CPU 🟡Beta | Apply smoothing to world-space hand motions and remove outliers. 对世界空间手部运动应用平滑并移除异常值。 | - | - | | video_hand_reconstruction_hawor_mapper | 🎬Video 🚀GPU 🟡Beta | Use HaWoR and MoGe-2 for hand reconstruction. 使用HaWoR和MoGe-2进行手部重建。 | - | - | -| video_hand_reconstruction_mapper | 🎬Video 🚀GPU 🟡Beta | Use the WiLoR model for hand localization and reconstruction. 使用WiLoR模型进行手部定位和重建。 | [info](operators/mapper/video_hand_reconstruction_mapper.md) | - | +| video_hand_reconstruction_mapper | 🎬Video 🚀GPU 🔴Alpha | Use the WiLoR model for hand localization and reconstruction. 使用WiLoR模型进行手部定位和重建。 | [info](operators/mapper/video_hand_reconstruction_mapper.md) | - | | video_object_segmenting_mapper | 🎬Video 🚀GPU 🧩HF 🟡Beta | Text-guided semantic segmentation of valid objects throughout the video (YOLOE + SAM2). 在整个视频中对有效对象进行文本引导的语义分割 (YOLOE SAM2)。 | [info](operators/mapper/video_object_segmenting_mapper.md) | - | | video_remove_watermark_mapper | 🎬Video 💻CPU 🟢Stable | Remove watermarks from videos based on specified regions. 根据指定区域从视频中删除水印。 | [info](operators/mapper/video_remove_watermark_mapper.md) | - | | video_resize_aspect_ratio_mapper | 🎬Video 💻CPU 🟢Stable | Resizes videos to fit within a specified aspect ratio range.
调整视频大小以适应指定的宽高比范围。 | [info](operators/mapper/video_resize_aspect_ratio_mapper.md) | - | @@ -282,6 +288,7 @@ All the specific operators are listed below, each featured with several capabili | video_split_by_scene_mapper | 🔮Multimodal 💻CPU 🟢Stable | Splits videos into scene clips based on detected scene changes. 根据检测到的场景变化将视频拆分为场景剪辑。 | [info](operators/mapper/video_split_by_scene_mapper.md) | - | | video_tagging_from_audio_mapper | 🎬Video 🚀GPU 🧩HF 🟢Stable | Generates video tags from audio streams using the Audio Spectrogram Transformer. 使用音频频谱图转换器从音频流生成视频标签。 | [info](operators/mapper/video_tagging_from_audio_mapper.md) | - | | video_tagging_from_frames_mapper | 🎬Video 🚀GPU 🟢Stable | Generates video tags from frames extracted from videos. 从视频中提取的帧生成视频标签。 | [info](operators/mapper/video_tagging_from_frames_mapper.md) | - | +| video_trajectory_overlay_mapper | 🎬Video 💻CPU 🟡Beta | Prepare VLM-ready frames by sampling and overlaying hand trajectories. 通过采样和覆盖手部轨迹来准备VLM就绪帧。 | - | - | | video_undistort_mapper | 🎬Video 💻CPU 🟡Beta | Undistort raw videos with corresponding camera intrinsics and distortion coefficients. 使用相应的相机固有特性和失真系数对原始视频进行失真。 | [info](operators/mapper/video_undistort_mapper.md) | - | | video_whole_body_pose_estimation_mapper | 🎬Video 🚀GPU 🟡Beta | Input a video containing people, and use the DWPose model to extract the body, hand, feet, and face keypoints of the human subjects in the video, i.e., 2D Whole-body Pose Estimation. 输入包含人的视频,并使用DWPose模型来提取视频中人类主体的身体、手、脚和面部关键点,即2D全身姿态估计。 | [info](operators/mapper/video_whole_body_pose_estimation_mapper.md) | - | | whitespace_normalization_mapper | 🔤Text 💻CPU 🟢Stable | Normalizes various types of whitespace characters to standard spaces in text samples. 
将文本样本中各种类型的空白字符规范化为标准空格。 | [info](operators/mapper/whitespace_normalization_mapper.md) | - | diff --git a/docs/op_doc_enhance_workflow/examples.json b/docs/op_doc_enhance_workflow/examples.json index 475deeb0c0..92cfa1cf5a 100644 --- a/docs/op_doc_enhance_workflow/examples.json +++ b/docs/op_doc_enhance_workflow/examples.json @@ -5078,29 +5078,29 @@ "samples": null } }, - "video_camera_calibration_static_deepcalib_mapper": { + "video_camera_calibration_deepcalib_mapper": { "test": { - "op_code": "VideoCameraCalibrationStaticDeepcalibMapper(model_path='weights_10_0.02.h5', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE)", + "op_code": "VideoCameraCalibrationDeepcalibMapper(model_path='weights_10_0.02.h5', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE)", "ds": "[{'videos': ['{PROJECT_ROOT}/tests/ops/data/video3.mp4']}, {'videos': ['{PROJECT_ROOT}/tests/ops/data/video4.mp4']}, {'videos': ['{PROJECT_ROOT}/tests/ops/data/video12.mp4']}]", "tgt": "[{'frame_names_shape': [49], 'intrinsics_list_shape': [49, 3, 3], 'xi_list_shape': [49], 'hfov_list_shape': [49], 'vfov_list_shape': [49]}, {'frame_names_shape': [22], 'intrinsics_list_shape': [22, 3, 3], 'xi_list_shape': [22], 'hfov_list_shape': [22], 'vfov_list_shape': [22]}, {'frame_names_shape': [3], 'intrinsics_list_shape': [3, 3, 3], 'xi_list_shape': [3], 'hfov_list_shape': [3], 'vfov_list_shape': [3]}]", "samples": null }, "test_mul_proc": { - "op_code": "VideoCameraCalibrationStaticDeepcalibMapper(model_path='weights_10_0.02.h5', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE)", + "op_code": "VideoCameraCalibrationDeepcalibMapper(model_path='weights_10_0.02.h5', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE)", "ds": 
"[{'videos': ['{PROJECT_ROOT}/tests/ops/data/video3.mp4']}, {'videos': ['{PROJECT_ROOT}/tests/ops/data/video4.mp4']}, {'videos': ['{PROJECT_ROOT}/tests/ops/data/video12.mp4']}]", "tgt": "[{'frame_names_shape': [49], 'intrinsics_list_shape': [49, 3, 3], 'xi_list_shape': [49], 'hfov_list_shape': [49], 'vfov_list_shape': [49]}, {'frame_names_shape': [22], 'intrinsics_list_shape': [22, 3, 3], 'xi_list_shape': [22], 'hfov_list_shape': [22], 'vfov_list_shape': [22]}, {'frame_names_shape': [3], 'intrinsics_list_shape': [3, 3, 3], 'xi_list_shape': [3], 'hfov_list_shape': [3], 'vfov_list_shape': [3]}]", "samples": null } }, - "video_camera_calibration_static_moge_mapper": { + "video_camera_calibration_moge_mapper": { "test": { - "op_code": "VideoCameraCalibrationStaticMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE, if_output_points_info=True, if_output_depth_info=True, if_output_mask_info=True)", + "op_code": "VideoCameraCalibrationMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE, if_output_points_info=True, if_output_depth_info=True, if_output_mask_info=True)", "ds": "[{'videos': ['{PROJECT_ROOT}/tests/ops/data/video3.mp4']}, {'videos': ['{PROJECT_ROOT}/tests/ops/data/video4.mp4']}, {'videos': ['{PROJECT_ROOT}/tests/ops/data/video12.mp4']}]", "tgt": "[{'frame_names_shape': [49], 'intrinsics_list_shape': [49, 3, 3], 'hfov_list_shape': [49], 'vfov_list_shape': [49], 'points_list_shape': [49, 640, 362, 3], 'depth_list_shape': [49, 640, 362], 'mask_list_shape': [49, 640, 362]}, {'frame_names_shape': [22], 'intrinsics_list_shape': [22, 3, 3], 'hfov_list_shape': [22], 'vfov_list_shape': [22], 'points_list_shape': [22, 360, 480, 3], 'depth_list_shape': [22, 360, 480], 'mask_list_shape': [22, 360, 480]}, {'frame_names_shape': [3], 
'intrinsics_list_shape': [3, 3, 3], 'hfov_list_shape': [3], 'vfov_list_shape': [3], 'points_list_shape': [3, 1080, 1920, 3], 'depth_list_shape': [3, 1080, 1920], 'mask_list_shape': [3, 1080, 1920]}]", "samples": null }, "test_mul_proc": { - "op_code": "VideoCameraCalibrationStaticMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE, if_output_points_info=True, if_output_depth_info=True, if_output_mask_info=True)", + "op_code": "VideoCameraCalibrationMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE, if_output_points_info=True, if_output_depth_info=True, if_output_mask_info=True)", "ds": "[{'videos': ['{PROJECT_ROOT}/tests/ops/data/video3.mp4']}, {'videos': ['{PROJECT_ROOT}/tests/ops/data/video4.mp4']}, {'videos': ['{PROJECT_ROOT}/tests/ops/data/video12.mp4']}]", "tgt": "[{'frame_names_shape': [49], 'intrinsics_list_shape': [49, 3, 3], 'hfov_list_shape': [49], 'vfov_list_shape': [49], 'points_list_shape': [49, 640, 362, 3], 'depth_list_shape': [49, 640, 362], 'mask_list_shape': [49, 640, 362]}, {'frame_names_shape': [22], 'intrinsics_list_shape': [22, 3, 3], 'hfov_list_shape': [22], 'vfov_list_shape': [22], 'points_list_shape': [22, 360, 480, 3], 'depth_list_shape': [22, 360, 480], 'mask_list_shape': [22, 360, 480]}, {'frame_names_shape': [3], 'intrinsics_list_shape': [3, 3, 3], 'hfov_list_shape': [3], 'vfov_list_shape': [3], 'points_list_shape': [3, 1080, 1920, 3], 'depth_list_shape': [3, 1080, 1920], 'mask_list_shape': [3, 1080, 1920]}]", "samples": null diff --git a/docs/operators/mapper/video_camera_calibration_static_deepcalib_mapper.md b/docs/operators/mapper/video_camera_calibration_deepcalib_mapper.md similarity index 96% rename from docs/operators/mapper/video_camera_calibration_static_deepcalib_mapper.md rename to 
docs/operators/mapper/video_camera_calibration_deepcalib_mapper.md index 0910e5d7bc..e309d272d1 100644 --- a/docs/operators/mapper/video_camera_calibration_static_deepcalib_mapper.md +++ b/docs/operators/mapper/video_camera_calibration_deepcalib_mapper.md @@ -1,4 +1,4 @@ -# video_camera_calibration_static_deepcalib_mapper +# video_camera_calibration_deepcalib_mapper Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib. @@ -14,7 +14,7 @@ Tags 标签: gpu, video | `model_path` | | `'weights_10_0.02.h5'` | The path to the DeepCalib Regression model. | | `frame_num` | typing.Annotated[int, Gt(gt=0)] | `3` | The number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. If "duration" > 0, frame_num is the number of frames per segment. | | `duration` | | `0` | The duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment. | -| `tag_field_name` | | `'static_camera_calibration_deepcalib_tags'` | The field name to store the tags. It's "static_camera_calibration_deepcalib_tags" in default. | +| `tag_field_name` | | `'camera_calibration_deepcalib_tags'` | The field name to store the tags. It's "camera_calibration_deepcalib_tags" in default. | | `frame_dir` | | `DATA_JUICER_ASSETS_CACHE` | Output directory to save extracted frames. | | `if_output_info` | | `True` | Whether to save the camera parameters results to an JSON file. | | `output_info_dir` | | `DATA_JUICER_ASSETS_CACHE` | Output directory for saving camera parameters. 
| @@ -24,7 +24,7 @@ Tags 标签: gpu, video ## 📊 Effect demonstration 效果演示 ### test ```python -VideoCameraCalibrationStaticDeepcalibMapper(model_path='weights_10_0.02.h5', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE) +VideoCameraCalibrationDeepcalibMapper(model_path='weights_10_0.02.h5', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE) ``` #### 📥 input data 输入数据 @@ -36,7 +36,7 @@ VideoCameraCalibrationStaticDeepcalibMapper(model_path='weights_10_0.02.h5', fra ### test_mul_proc ```python -VideoCameraCalibrationStaticDeepcalibMapper(model_path='weights_10_0.02.h5', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE) +VideoCameraCalibrationDeepcalibMapper(model_path='weights_10_0.02.h5', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE) ``` #### 📥 input data 输入数据 @@ -48,6 +48,6 @@ VideoCameraCalibrationStaticDeepcalibMapper(model_path='weights_10_0.02.h5', fra ## 🔗 related links 相关链接 -- [source code 源代码](../../../data_juicer/ops/mapper/video_camera_calibration_static_deepcalib_mapper.py) -- [unit test 单元测试](../../../tests/ops/mapper/test_video_camera_calibration_static_deepcalib_mapper.py) +- [source code 源代码](../../../data_juicer/ops/mapper/video_camera_calibration_deepcalib_mapper.py) +- [unit test 单元测试](../../../tests/ops/mapper/test_video_camera_calibration_deepcalib_mapper.py) - [Return operator list 返回算子列表](../../Operators.md) \ No newline at end of file diff --git a/docs/operators/mapper/video_camera_calibration_static_moge_mapper.md b/docs/operators/mapper/video_camera_calibration_moge_mapper.md similarity index 96% rename from docs/operators/mapper/video_camera_calibration_static_moge_mapper.md rename to docs/operators/mapper/video_camera_calibration_moge_mapper.md 
index c74ae1c861..836ebbf198 100644 --- a/docs/operators/mapper/video_camera_calibration_static_moge_mapper.md +++ b/docs/operators/mapper/video_camera_calibration_moge_mapper.md @@ -1,4 +1,4 @@ -# video_camera_calibration_static_moge_mapper +# video_camera_calibration_moge_mapper Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib). @@ -14,7 +14,7 @@ Tags 标签: gpu, video | `model_path` | | `'Ruicheng/moge-2-vitl'` | The path to the Moge-2 model. | | `frame_num` | typing.Annotated[int, Gt(gt=0)] | `3` | The number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. If "duration" > 0, frame_num is the number of frames per segment. | | `duration` | | `0` | The duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment. | -| `tag_field_name` | | `'static_camera_calibration_moge_tags'` | The field name to store the tags. It's "static_camera_calibration_moge_tags" in default. | +| `tag_field_name` | | `'camera_calibration_moge_tags'` | The field name to store the tags. It's "camera_calibration_moge_tags" in default. | | `frame_dir` | | `DATA_JUICER_ASSETS_CACHE` | Output directory to save extracted frames. | | `if_output_info` | | `True` | Whether to save the camera parameters results to an JSON file. | | `output_info_dir` | | `DATA_JUICER_ASSETS_CACHE` | Output directory for saving camera parameters. 
| @@ -27,7 +27,7 @@ Tags 标签: gpu, video ## 📊 Effect demonstration 效果演示 ### test ```python -VideoCameraCalibrationStaticMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE, if_output_points_info=True, if_output_depth_info=True, if_output_mask_info=True) +VideoCameraCalibrationMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE, if_output_points_info=True, if_output_depth_info=True, if_output_mask_info=True) ``` #### 📥 input data 输入数据 @@ -39,7 +39,7 @@ VideoCameraCalibrationStaticMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_ ### test_mul_proc ```python -VideoCameraCalibrationStaticMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE, if_output_points_info=True, if_output_depth_info=True, if_output_mask_info=True) +VideoCameraCalibrationMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_num=1, duration=1, frame_dir=DATA_JUICER_ASSETS_CACHE, if_output_info=True, output_info_dir=DATA_JUICER_ASSETS_CACHE, if_output_points_info=True, if_output_depth_info=True, if_output_mask_info=True) ``` #### 📥 input data 输入数据 @@ -51,6 +51,6 @@ VideoCameraCalibrationStaticMogeMapper(model_path='Ruicheng/moge-2-vitl', frame_ ## 🔗 related links 相关链接 -- [source code 源代码](../../../data_juicer/ops/mapper/video_camera_calibration_static_moge_mapper.py) -- [unit test 单元测试](../../../tests/ops/mapper/test_video_camera_calibration_static_moge_mapper.py) +- [source code 源代码](../../../data_juicer/ops/mapper/video_camera_calibration_moge_mapper.py) +- [unit test 单元测试](../../../tests/ops/mapper/test_video_camera_calibration_moge_mapper.py) - [Return operator list 返回算子列表](../../Operators.md) \ No newline at end of file diff --git 
a/tests/ops/filter/test_video_motion_score_filter.py b/tests/ops/filter/test_video_motion_score_filter.py index 69d4833d73..55baf078f2 100644 --- a/tests/ops/filter/test_video_motion_score_filter.py +++ b/tests/ops/filter/test_video_motion_score_filter.py @@ -229,6 +229,96 @@ def test_frame_field(self): op = VideoMotionScoreFilter(min_score=0, max_score=3.0, frame_field='frames', num_proc=2) self._run_helper(op, ds_list, tgt_list, select_field=['frames']) + def test_frame_field_without_original_fps(self): + """When original_fps is not specified, all frames are processed + (backward compatible behavior).""" + ds_list = [{ + 'frames': [[self.img1_path, self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path]], + }] + tgt_list = [{ + 'frames': [[self.img1_path, self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path, self.img1_path]], + }] + op = VideoMotionScoreFilter( + min_score=0, max_score=3.0, + frame_field='frames', sampling_fps=2, + ) + self._run_helper(op, ds_list, tgt_list, select_field=['frames']) + + def test_frame_field_with_original_fps(self): + """When original_fps is specified, frames are sampled at sampling_fps + rate. With original_fps=30 and sampling_fps=2, sampling_step=15. 
+ For 3 identical frames (idx 0,1,2), only idx 0 is selected + (0 % 15 == 0), resulting in no optical flow pairs -> score=-1.""" + ds_list = [{ + 'frames': [[self.img1_path, self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path]], + }] + # With original_fps=30 and sampling_fps=2, step=15, + # only frame 0 is selected from each video -> only 1 frame -> + # no optical flow -> score=-1, which is < min_score=0 -> filtered out + tgt_list = [] + op = VideoMotionScoreFilter( + min_score=0, max_score=3.0, + frame_field='frames', sampling_fps=2, original_fps=30, + ) + self._run_helper(op, ds_list, tgt_list, select_field=['frames']) + + def test_frame_field_with_original_fps_small_step(self): + """When original_fps is close to sampling_fps, sampling_step is small. + With original_fps=4 and sampling_fps=2, sampling_step=2. + For 3 identical frames (idx 0,1,2), frames 0 and 2 are selected + (0%2==0, 2%2==0), resulting in 1 optical flow pair. 
Since frames + are identical, motion score is 0, which is within [0, 3.0].""" + ds_list = [{ + 'frames': [[self.img1_path, self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path]], + }] + # step=2: for 3 frames, idx 0,2 selected -> 1 flow pair -> score=0 + # for 2 frames, idx 0 selected -> no flow pair -> score=-1 + # for 1 frame, idx 0 selected -> no flow pair -> score=-1 + # score=0 is in [0, 3.0] -> kept; score=-1 is not -> filtered + tgt_list = [{ + 'frames': [[self.img1_path, self.img1_path, self.img1_path]], + }] + op = VideoMotionScoreFilter( + min_score=0, max_score=3.0, + frame_field='frames', sampling_fps=2, original_fps=4, + ) + self._run_helper(op, ds_list, tgt_list, select_field=['frames']) + + def test_frame_field_with_original_fps_equal_sampling_fps(self): + """When original_fps equals sampling_fps, sampling_step=1, all frames + are processed (same as not specifying original_fps).""" + ds_list = [{ + 'frames': [[self.img1_path, self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path]], + }] + tgt_list = [{ + 'frames': [[self.img1_path, self.img1_path, self.img1_path]], + }, { + 'frames': [[self.img1_path, self.img1_path]], + }] + op = VideoMotionScoreFilter( + min_score=0, max_score=3.0, + frame_field='frames', sampling_fps=2, original_fps=2, + ) + self._run_helper(op, ds_list, tgt_list, select_field=['frames']) + if __name__ == '__main__': unittest.main() diff --git a/tests/ops/mapper/test_export_to_lerobot_mapper.py b/tests/ops/mapper/test_export_to_lerobot_mapper.py new file mode 100644 index 0000000000..f82105fa72 --- /dev/null +++ b/tests/ops/mapper/test_export_to_lerobot_mapper.py @@ -0,0 +1,478 @@ +import json +import os +import shutil +import tempfile +import unittest +import numpy as np + +from data_juicer.core.data import NestedDataset as Dataset +from data_juicer.ops.mapper.export_to_lerobot_mapper 
import ExportToLeRobotMapper +from data_juicer.utils.constant import Fields +from data_juicer.utils.mm_utils import load_file_byte +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +class ExportToLeRobotMapperTest(DataJuicerTestCaseBase): + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', + 'data') + vid3_path = os.path.join(data_path, 'video3.mp4') + vid4_path = os.path.join(data_path, 'video4.mp4') + + def setUp(self): + self.output_dir = tempfile.mkdtemp(prefix='lerobot_test_') + + def tearDown(self): + if os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + def _make_sample(self, video_source, num_frames=10, task_desc="pick up the cup", + valid_frame_ids=None): + """Create a synthetic sample with hand action data.""" + states = (np.random.randn(num_frames, 8).astype(np.float32)).tolist() + actions = (np.random.randn(num_frames, 7).astype(np.float32)).tolist() + if valid_frame_ids is None: + valid_frame_ids = list(range(num_frames)) + + sample = { + 'videos': [video_source], + 'text': task_desc, + Fields.meta: { + 'hand_action_tags': [{ + 'right': { + 'states': states, + 'actions': actions, + 'valid_frame_ids': valid_frame_ids, + 'hand_type': 'right', + }, + 'left': { + 'states': [], + 'actions': [], + 'valid_frame_ids': [], + 'hand_type': 'left', + } + }], + } + } + return sample + + def test_process_single(self): + """Test processing a single sample.""" + sample = self._make_sample(self.vid3_path, num_frames=10) + ds_list = [sample] + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + hand_action_field='hand_action_tags', + fps=10, + robot_type='egodex_hand', + ) + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + self.assertEqual(len(res_list), 1) + export_info = res_list[0][Fields.meta].get('lerobot_export', []) + self.assertGreater(len(export_info), 0) + + ep = export_info[0] + 
self.assertIn('uuid', ep) + self.assertIn('parquet_path', ep) + self.assertEqual(ep['num_frames'], 10) + + # Verify staging files exist + self.assertTrue(os.path.exists(ep['parquet_path'])) + + def test_finalize_dataset(self): + """Test the full pipeline: process + finalize.""" + samples = [ + self._make_sample(self.vid3_path, num_frames=10, task_desc="pick up cup"), + self._make_sample(self.vid4_path, num_frames=8, task_desc="place cup"), + ] + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + hand_action_field='hand_action_tags', + fps=10, + robot_type='egodex_hand', + ) + + dataset = Dataset.from_list(samples) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + + # Finalize + ExportToLeRobotMapper.finalize_dataset( + output_dir=self.output_dir, + fps=10, + robot_type='egodex_hand', + ) + + meta_dir = os.path.join(self.output_dir, 'meta') + + # Check info.json + info_path = os.path.join(meta_dir, 'info.json') + self.assertTrue(os.path.exists(info_path)) + with open(info_path, 'r') as f: + info = json.load(f) + self.assertEqual(info['codebase_version'], 'v2.0') + self.assertEqual(info['robot_type'], 'egodex_hand') + self.assertEqual(info['total_episodes'], 2) + self.assertEqual(info['total_frames'], 18) # 10 + 8 + self.assertEqual(info['fps'], 10) + self.assertEqual(info['total_tasks'], 2) + + # Check features + self.assertIn('observation.state', info['features']) + self.assertEqual(info['features']['observation.state']['shape'], [8]) + self.assertIn('action', info['features']) + self.assertEqual(info['features']['action']['shape'], [7]) + + # Check episodes.jsonl + episodes_path = os.path.join(meta_dir, 'episodes.jsonl') + self.assertTrue(os.path.exists(episodes_path)) + with open(episodes_path, 'r') as f: + episodes = [json.loads(line) for line in f if line.strip()] + self.assertEqual(len(episodes), 2) + + # Check tasks.jsonl + tasks_path = os.path.join(meta_dir, 'tasks.jsonl') + self.assertTrue(os.path.exists(tasks_path)) + with 
open(tasks_path, 'r') as f: + tasks = [json.loads(line) for line in f if line.strip()] + self.assertEqual(len(tasks), 2) + + # Check modality.json + modality_path = os.path.join(meta_dir, 'modality.json') + self.assertTrue(os.path.exists(modality_path)) + with open(modality_path, 'r') as f: + modality = json.load(f) + self.assertIn('state', modality) + self.assertIn('action', modality) + + # Check data directory + data_dir = os.path.join(self.output_dir, 'data', 'chunk-000') + self.assertTrue(os.path.exists(data_dir)) + parquet_files = [f for f in os.listdir(data_dir) if f.endswith('.parquet')] + self.assertEqual(len(parquet_files), 2) + + # Check staging is cleaned up + staging_dir = os.path.join(self.output_dir, 'staging') + self.assertFalse(os.path.exists(staging_dir)) + + def test_empty_action_data(self): + """Test with empty action data - should not export anything.""" + sample = { + 'videos': [self.vid3_path], + 'text': 'test', + Fields.meta: { + 'hand_action_tags': [], + } + } + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + hand_action_field='hand_action_tags', + ) + + dataset = Dataset.from_list([sample]) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + export_info = res_list[0][Fields.meta].get('lerobot_export', []) + self.assertEqual(len(export_info), 0) + + def test_same_task_deduplication(self): + """Test that episodes with the same task share a task_index.""" + samples = [ + self._make_sample(self.vid3_path, num_frames=5, task_desc="pick up cup"), + self._make_sample(self.vid4_path, num_frames=5, task_desc="pick up cup"), + ] + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + hand_action_field='hand_action_tags', + fps=10, + ) + + dataset = Dataset.from_list(samples) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + + ExportToLeRobotMapper.finalize_dataset( + output_dir=self.output_dir, fps=10, + ) + + with open(os.path.join(self.output_dir, 'meta', 
'info.json'), 'r') as f: + info = json.load(f) + self.assertEqual(info['total_tasks'], 1) # same task + + def test_mul_proc(self): + """Test with multiple processes.""" + samples = [ + self._make_sample(self.vid3_path, num_frames=5), + self._make_sample(self.vid4_path, num_frames=5), + ] + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + hand_action_field='hand_action_tags', + fps=10, + ) + + dataset = Dataset.from_list(samples) + dataset = dataset.map(op.process, num_proc=2, with_rank=True) + res_list = dataset.to_list() + + for sample in res_list: + export_info = sample[Fields.meta].get('lerobot_export', []) + self.assertGreater(len(export_info), 0) + + def test_video_bytes_input(self): + """Test processing with video bytes input instead of file paths.""" + video_bytes = load_file_byte(self.vid3_path) + sample = self._make_sample(video_bytes, num_frames=10) + ds_list = [sample] + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + hand_action_field='hand_action_tags', + fps=10, + robot_type='egodex_hand', + ) + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + self.assertEqual(len(res_list), 1) + export_info = res_list[0][Fields.meta].get('lerobot_export', []) + self.assertGreater(len(export_info), 0) + + ep = export_info[0] + self.assertEqual(ep['num_frames'], 10) + self.assertTrue(os.path.exists(ep['parquet_path'])) + # Verify the video was written to staging + self.assertTrue(os.path.exists(ep['video_path'])) + self.assertTrue(ep['video_path'].endswith('.mp4')) + + def test_video_bytes_finalize(self): + """Test full pipeline with video bytes: process + finalize.""" + vid3_bytes = load_file_byte(self.vid3_path) + vid4_bytes = load_file_byte(self.vid4_path) + samples = [ + self._make_sample(vid3_bytes, num_frames=10, task_desc="pick up cup"), + self._make_sample(vid4_bytes, num_frames=8, task_desc="place cup"), + ] + + op = ExportToLeRobotMapper( + 
output_dir=self.output_dir, + hand_action_field='hand_action_tags', + fps=10, + robot_type='egodex_hand', + ) + + dataset = Dataset.from_list(samples) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + + ExportToLeRobotMapper.finalize_dataset( + output_dir=self.output_dir, fps=10, robot_type='egodex_hand', + ) + + with open(os.path.join(self.output_dir, 'meta', 'info.json'), 'r') as f: + info = json.load(f) + self.assertEqual(info['total_episodes'], 2) + self.assertEqual(info['total_frames'], 18) + self.assertEqual(info['total_videos'], 2) + + # Check video files in final directory + video_dir = os.path.join(self.output_dir, 'videos', 'chunk-000', + 'observation.images.image') + video_files = [f for f in os.listdir(video_dir) if f.endswith('.mp4')] + self.assertEqual(len(video_files), 2) + + # ------------------------------------------------------------------ + # Segment-based export tests + # ------------------------------------------------------------------ + def _make_segment_sample(self, n_frames=30, n_segments=2): + """Create a sample with atomic action segments for segment export.""" + # Create dummy frame images + frame_dir = os.path.join(self.output_dir, 'frames') + os.makedirs(frame_dir, exist_ok=True) + frame_paths = [] + for i in range(n_frames): + img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8) + path = os.path.join(frame_dir, f'frame_{i:04d}.jpg') + import cv2 + cv2.imwrite(path, img) + frame_paths.append(path) + + frames_per_seg = n_frames // n_segments + segments = [] + for s in range(n_segments): + start = s * frames_per_seg + end = min((s + 1) * frames_per_seg - 1, n_frames - 1) + n = end - start + 1 + states = np.random.randn(n, 8).astype(np.float32).tolist() + actions = np.random.randn(n, 7).astype(np.float32).tolist() + segments.append({ + "hand_type": "right" if s % 2 == 0 else "left", + "segment_id": s, + "start_frame": start, + "end_frame": end, + "states": states, + "actions": actions, + "valid_frame_ids": 
list(range(start, end + 1)), + "caption": { + "think": "test reasoning", + "action": f"Pick up object {s}", + }, + }) + + return { + "video_frames": frame_paths, + "text": "", + Fields.meta: { + "atomic_action_segments": segments, + }, + } + + def test_segment_export_creates_episodes(self): + """Each segment should become a separate episode.""" + sample = self._make_segment_sample(n_frames=30, n_segments=3) + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + segment_field='atomic_action_segments', + fps=10, + ) + + dataset = Dataset.from_list([sample]) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + export_info = res_list[0][Fields.meta].get('lerobot_export', []) + self.assertEqual(len(export_info), 3) + + for ep in export_info: + self.assertIn('segment_id', ep) + self.assertIn('hand_type', ep) + self.assertGreater(ep['num_frames'], 0) + self.assertTrue(os.path.exists(ep['parquet_path'])) + + def test_segment_export_finalize(self): + """Full segment pipeline: process + finalize.""" + sample = self._make_segment_sample(n_frames=30, n_segments=2) + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + segment_field='atomic_action_segments', + fps=10, + robot_type='egodex_hand', + ) + + dataset = Dataset.from_list([sample]) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + + ExportToLeRobotMapper.finalize_dataset( + output_dir=self.output_dir, fps=10, robot_type='egodex_hand', + ) + + with open(os.path.join(self.output_dir, 'meta', 'info.json')) as f: + info = json.load(f) + self.assertEqual(info['total_episodes'], 2) + # Each segment has its own task description + self.assertEqual(info['total_tasks'], 2) + + # Check episode parquet files + data_dir = os.path.join(self.output_dir, 'data', 'chunk-000') + parquets = [f for f in os.listdir(data_dir) if f.endswith('.parquet')] + self.assertEqual(len(parquets), 2) + + # Check tasks.jsonl has per-segment captions + with 
open(os.path.join(self.output_dir, 'meta', 'tasks.jsonl')) as f: + tasks = [json.loads(line) for line in f if line.strip()] + task_names = {t['task'] for t in tasks} + self.assertIn('Pick up object 0', task_names) + self.assertIn('Pick up object 1', task_names) + + def test_segment_export_skips_na(self): + """Segments with N/A caption should be skipped.""" + sample = self._make_segment_sample(n_frames=20, n_segments=2) + # Mark first segment as N/A + sample[Fields.meta]['atomic_action_segments'][0]['caption'] = { + 'think': '', 'action': 'N/A'} + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + segment_field='atomic_action_segments', + fps=10, + ) + + dataset = Dataset.from_list([sample]) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + export_info = res_list[0][Fields.meta].get('lerobot_export', []) + self.assertEqual(len(export_info), 1) # Only seg1 exported + self.assertEqual(export_info[0]['segment_id'], 1) + + def test_segment_export_frame_index_relative(self): + """Frame indices in parquet should be segment-relative (0-based).""" + import pyarrow.parquet as pq + + sample = self._make_segment_sample(n_frames=20, n_segments=1) + seg = sample[Fields.meta]['atomic_action_segments'][0] + # Segment covers frames 0-19 + self.assertEqual(seg['start_frame'], 0) + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + segment_field='atomic_action_segments', + fps=10, + ) + + dataset = Dataset.from_list([sample]) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + ep = res_list[0][Fields.meta]['lerobot_export'][0] + table = pq.read_table(ep['parquet_path']) + df = table.to_pandas() + + # frame_index should be 0-based (segment-relative) + self.assertEqual(df['frame_index'].tolist()[0], 0) + self.assertEqual(df['frame_index'].tolist()[-1], + len(seg['states']) - 1) + + def test_valid_frame_ids_in_parquet(self): + """Test that valid_frame_ids are used as 
frame_index in parquet.""" + import pyarrow.parquet as pq + + # Sparse frame IDs: hand detected at frames 0, 3, 7, 12, 15 + valid_frame_ids = [0, 3, 7, 12, 15] + sample = self._make_sample( + self.vid3_path, num_frames=5, + valid_frame_ids=valid_frame_ids, + ) + + op = ExportToLeRobotMapper( + output_dir=self.output_dir, + hand_action_field='hand_action_tags', + fps=10, + ) + + dataset = Dataset.from_list([sample]) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + ep = res_list[0][Fields.meta]['lerobot_export'][0] + table = pq.read_table(ep['parquet_path']) + df = table.to_pandas() + + # frame_index should match valid_frame_ids + self.assertEqual(df['frame_index'].tolist(), valid_frame_ids) + # timestamp should be frame_id / fps + expected_timestamps = [float(fid) / 10 for fid in valid_frame_ids] + for actual, expected in zip(df['timestamp'].tolist(), expected_timestamps): + self.assertAlmostEqual(actual, expected, places=5) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/ops/mapper/test_video_atomic_action_segment_mapper.py b/tests/ops/mapper/test_video_atomic_action_segment_mapper.py new file mode 100644 index 0000000000..a5087a2bb9 --- /dev/null +++ b/tests/ops/mapper/test_video_atomic_action_segment_mapper.py @@ -0,0 +1,269 @@ +import unittest + +import numpy as np + +from data_juicer.ops.base_op import Fields +from data_juicer.ops.mapper.video_atomic_action_segment_mapper import \ + VideoAtomicActionSegmentMapper +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +class VideoAtomicActionSegmentMapperTest(DataJuicerTestCaseBase): + """Tests for VideoAtomicActionSegmentMapper.""" + + # ------------------------------------------------------------------ + # helpers + # ------------------------------------------------------------------ + @staticmethod + def _make_states(positions, n_dims=8): + """Build an (N, 8) state array from (N, 3) positions.""" + n = len(positions) + 
states = np.zeros((n, n_dims), dtype=np.float32) + states[:, 0:3] = positions + return states.tolist() + + def _make_sample(self, right_states=None, left_states=None): + hand_data = {} + if right_states is not None: + hand_data["right"] = { + "hand_type": "right", + "states": right_states, + "actions": [], + "valid_frame_ids": list(range(len(right_states))), + "joints_world": [], + "joints_cam": [], + } + if left_states is not None: + hand_data["left"] = { + "hand_type": "left", + "states": left_states, + "actions": [], + "valid_frame_ids": list(range(len(left_states))), + "joints_world": [], + "joints_cam": [], + } + return { + Fields.meta: { + "hand_action_tags": [hand_data], + } + } + + # ------------------------------------------------------------------ + # speed / minima helpers + # ------------------------------------------------------------------ + def test_compute_speed_basic(self): + """Speed should be the norm of consecutive position differences.""" + positions = np.array([ + [0, 0, 0], + [1, 0, 0], + [1, 1, 0], + [1, 1, 1], + ], dtype=np.float64) + speed = VideoAtomicActionSegmentMapper._compute_speed(positions) + self.assertEqual(len(speed), 4) + self.assertAlmostEqual(speed[0], 0.0) + self.assertAlmostEqual(speed[1], 1.0) + self.assertAlmostEqual(speed[2], 1.0) + self.assertAlmostEqual(speed[3], 1.0) + + def test_compute_speed_single_frame(self): + positions = np.array([[1, 2, 3]], dtype=np.float64) + speed = VideoAtomicActionSegmentMapper._compute_speed(positions) + self.assertEqual(len(speed), 1) + self.assertAlmostEqual(speed[0], 0.0) + + def test_find_local_minima(self): + # Create a speed profile with clear minima + # Pattern: high - low - high - low - high + speed = np.array([5, 2, 1, 2, 5, 3, 0.5, 3, 5], dtype=np.float64) + minima = VideoAtomicActionSegmentMapper._find_local_minima( + speed, half_window=2) + # Frame 2 (speed=1) and frame 6 (speed=0.5) should be minima + self.assertIn(2, minima) + self.assertIn(6, minima) + + def 
test_find_local_minima_flat(self): + """Flat speed profile → every frame is a local minimum.""" + speed = np.array([1.0] * 10, dtype=np.float64) + minima = VideoAtomicActionSegmentMapper._find_local_minima( + speed, half_window=3) + # All interior frames should be minima (<=) + self.assertTrue(len(minima) > 0) + + # ------------------------------------------------------------------ + # segment merging / splitting + # ------------------------------------------------------------------ + def test_merge_short_segments(self): + op = VideoAtomicActionSegmentMapper(min_segment_frames=10) + # Cut points produce segments of length 5, 5, 90 → second cut + # should be merged because 10-5=5 < 10 + cut_points = [5, 10] + result = op._merge_short_segments(cut_points, n_frames=100) + # The second cut point should be removed (too close to first) + self.assertNotIn(10, result) + # Result should have fewer cut points than original + self.assertLess(len(result), len(cut_points)) + + def test_split_long_segments(self): + op = VideoAtomicActionSegmentMapper( + min_segment_frames=5, + max_segment_frames=20, + ) + # One segment of 50 frames should be split + speed = np.concatenate([ + np.linspace(5, 1, 25), + np.linspace(1, 5, 25), + ]) + cut_points = op._split_long_segments([], speed, n_frames=50) + self.assertTrue(len(cut_points) > 0) + + # ------------------------------------------------------------------ + # end-to-end process_single + # ------------------------------------------------------------------ + def test_simple_segmentation(self): + """Two distinct motion bursts should produce at least 2 segments.""" + n = 60 + positions = np.zeros((n, 3), dtype=np.float64) + # First motion: frames 0-20 move right + for i in range(20): + positions[i] = [i * 0.05, 0, 0] + # Pause: frames 20-40 stay still + for i in range(20, 40): + positions[i] = [1.0, 0, 0] + # Second motion: frames 40-60 move up + for i in range(40, 60): + positions[i] = [1.0, (i - 40) * 0.05, 0] + + states = 
self._make_states(positions) + sample = self._make_sample(right_states=states) + + op = VideoAtomicActionSegmentMapper( + hand_action_field="hand_action_tags", + segment_field="atomic_action_segments", + speed_smooth_window=5, + min_window=5, + min_segment_frames=5, + max_segment_frames=300, + hand_type="right", + ) + result = op.process_single(sample) + segments = result[Fields.meta]["atomic_action_segments"] + + self.assertGreaterEqual(len(segments), 2) + # All segments should be for right hand + for seg in segments: + self.assertEqual(seg["hand_type"], "right") + self.assertIn("start_frame", seg) + self.assertIn("end_frame", seg) + self.assertIn("states", seg) + self.assertIn("valid_frame_ids", seg) + self.assertGreater(len(seg["states"]), 1) + + def test_both_hands(self): + """With hand_type='both', both hands should be segmented.""" + n = 40 + pos_r = np.column_stack([np.linspace(0, 1, n), + np.zeros(n), np.zeros(n)]) + pos_l = np.column_stack([np.zeros(n), + np.linspace(0, 1, n), np.zeros(n)]) + sample = self._make_sample( + right_states=self._make_states(pos_r), + left_states=self._make_states(pos_l), + ) + op = VideoAtomicActionSegmentMapper( + hand_action_field="hand_action_tags", + segment_field="segs", + hand_type="both", + min_segment_frames=5, + ) + result = op.process_single(sample) + segments = result[Fields.meta]["segs"] + + hand_types_present = {s["hand_type"] for s in segments} + self.assertIn("right", hand_types_present) + self.assertIn("left", hand_types_present) + + def test_too_few_frames_skip(self): + """Fewer frames than min_segment_frames → no segments.""" + states = self._make_states(np.zeros((3, 3))) + sample = self._make_sample(right_states=states) + + op = VideoAtomicActionSegmentMapper( + hand_action_field="hand_action_tags", + segment_field="segs", + min_segment_frames=8, + hand_type="right", + ) + result = op.process_single(sample) + segments = result[Fields.meta]["segs"] + self.assertEqual(len(segments), 0) + + def 
test_no_meta(self): + """Sample without meta should pass through unchanged.""" + sample = {"text": "hello"} + op = VideoAtomicActionSegmentMapper() + result = op.process_single(sample) + self.assertEqual(result, sample) + + def test_empty_hand_data(self): + """Empty hand_action_tags → no segments.""" + sample = {Fields.meta: {"hand_action_tags": []}} + op = VideoAtomicActionSegmentMapper( + hand_action_field="hand_action_tags", + segment_field="segs", + ) + result = op.process_single(sample) + self.assertNotIn("segs", result[Fields.meta]) + + def test_segments_sorted_by_start_frame(self): + """All segments should be sorted by start_frame.""" + n = 100 + positions = np.zeros((n, 3), dtype=np.float64) + for i in range(n): + positions[i] = [np.sin(i * 0.2) * 0.5, np.cos(i * 0.3) * 0.3, 0] + sample = self._make_sample( + right_states=self._make_states(positions), + ) + op = VideoAtomicActionSegmentMapper( + hand_action_field="hand_action_tags", + segment_field="segs", + hand_type="right", + min_segment_frames=5, + min_window=5, + ) + result = op.process_single(sample) + segments = result[Fields.meta]["segs"] + starts = [s["start_frame"] for s in segments] + self.assertEqual(starts, sorted(starts)) + + def test_segment_coverage(self): + """Segments should collectively cover all frames without gaps.""" + n = 50 + positions = np.zeros((n, 3), dtype=np.float64) + for i in range(n): + positions[i] = [i * 0.02, 0, 0] + sample = self._make_sample( + right_states=self._make_states(positions), + ) + op = VideoAtomicActionSegmentMapper( + hand_action_field="hand_action_tags", + segment_field="segs", + hand_type="right", + min_segment_frames=3, + min_window=3, + ) + result = op.process_single(sample) + segments = result[Fields.meta]["segs"] + + if len(segments) >= 2: + # Verify no gaps between consecutive segments + for i in range(len(segments) - 1): + self.assertEqual( + segments[i]["end_frame"], + segments[i + 1]["start_frame"] - 1, + "Gap between consecutive segments", + ) + + 
+if __name__ == "__main__": + unittest.main() diff --git a/tests/ops/mapper/test_video_camera_calibration_deepcalib_mapper.py b/tests/ops/mapper/test_video_camera_calibration_deepcalib_mapper.py new file mode 100644 index 0000000000..9fa9e12a4f --- /dev/null +++ b/tests/ops/mapper/test_video_camera_calibration_deepcalib_mapper.py @@ -0,0 +1,87 @@ +import os +import unittest +import numpy as np + +from data_juicer.core.data import NestedDataset as Dataset +from data_juicer.ops.mapper.video_camera_calibration_deepcalib_mapper import VideoCameraCalibrationDeepcalibMapper +from data_juicer.ops.mapper.video_extract_frames_mapper import VideoExtractFramesMapper +from data_juicer.utils.mm_utils import SpecialTokens +from data_juicer.utils.constant import Fields, MetaKeys, CameraCalibrationKeys +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase +from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE + + +class VideoCameraCalibrationDeepcalibMapperTest(DataJuicerTestCaseBase): + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', + 'data') + vid3_path = os.path.join(data_path, 'video3.mp4') + vid4_path = os.path.join(data_path, 'video4.mp4') + vid12_path = os.path.join(data_path, 'video12.mp4') + + def _run_and_assert(self, num_proc): + ds_list = [{ + 'videos': [self.vid3_path] + }, { + 'videos': [self.vid4_path] + }, { + 'videos': [self.vid12_path] + }] + + tgt_list = [{"frame_names_shape": [6], + "intrinsics_list_shape": [6, 3, 3], + "xi_list_shape": [6], + "hfov_list_shape": [6], + "vfov_list_shape": [6]}, + {"frame_names_shape": [5], + "intrinsics_list_shape": [5, 3, 3], + "xi_list_shape": [5], + "hfov_list_shape": [5], + "vfov_list_shape": [5]}, + {"frame_names_shape": [1], + "intrinsics_list_shape": [1, 3, 3], + "xi_list_shape": [1], + "hfov_list_shape": [1], + "vfov_list_shape": [1]}] + + # Step 1: Extract frames from videos + extract_op = VideoExtractFramesMapper( + frame_sampling_method='all_keyframes', + 
output_format='bytes', + legacy_split_by_text_token=False, + ) + dataset = Dataset.from_list(ds_list) + if Fields.meta not in dataset.features: + dataset = dataset.add_column(name=Fields.meta, + column=[{}] * dataset.num_rows) + dataset = dataset.map(extract_op.process, num_proc=num_proc, batched=True, batch_size=1) + + # Step 2: Run camera calibration + op = VideoCameraCalibrationDeepcalibMapper( + model_path="weights_10_0.02.h5", + ) + dataset = dataset.map(op.process, num_proc=num_proc) + res_list = dataset.to_list() + + for sample, target in zip(res_list, tgt_list): + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_deepcalib_tags][0][CameraCalibrationKeys.intrinsics]).shape), + target["intrinsics_list_shape"]) + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_deepcalib_tags][0][CameraCalibrationKeys.xi]).shape), + target["xi_list_shape"]) + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_deepcalib_tags][0][CameraCalibrationKeys.hfov]).shape), + target["hfov_list_shape"]) + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_deepcalib_tags][0][CameraCalibrationKeys.vfov]).shape), + target["vfov_list_shape"]) + + def test(self): + self._run_and_assert(num_proc=1) + + def test_mul_proc(self): + self._run_and_assert(num_proc=2) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/ops/mapper/test_video_camera_calibration_droidcalib_mapper.py b/tests/ops/mapper/test_video_camera_calibration_droidcalib_mapper.py new file mode 100644 index 0000000000..d7849f0082 --- /dev/null +++ b/tests/ops/mapper/test_video_camera_calibration_droidcalib_mapper.py @@ -0,0 +1,57 @@ +import os +import unittest +import numpy as np + +from data_juicer.core.data import NestedDataset as Dataset +from data_juicer.ops.mapper.video_camera_calibration_droidcalib_mapper import VideoCameraCalibrationDroidCalibMapper +from data_juicer.utils.constant 
import Fields, MetaKeys, CameraCalibrationKeys +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase +from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE + + +@unittest.skip( + 'Requires CUDA and DroidCalib compiled extensions. ' + 'Run manually with GPU available.' +) +class VideoCameraCalibrationDroidCalibMapperTest(DataJuicerTestCaseBase): + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', + 'data') + vid3_path = os.path.join(data_path, 'video3.mp4') + vid12_path = os.path.join(data_path, 'video12.mp4') + + def _run_and_assert(self, num_proc): + ds_list = [{ + 'videos': [self.vid3_path] + }, { + 'videos': [self.vid12_path] + }] + + op = VideoCameraCalibrationDroidCalibMapper( + tag_field_name=MetaKeys.camera_calibration_droidcalib_tags, + ) + dataset = Dataset.from_list(ds_list) + if Fields.meta not in dataset.features: + dataset = dataset.add_column(name=Fields.meta, + column=[{}] * dataset.num_rows) + dataset = dataset.map(op.process, num_proc=num_proc, with_rank=True) + res_list = dataset.to_list() + + for sample in res_list: + tag_list = sample[Fields.meta][MetaKeys.camera_calibration_droidcalib_tags] + self.assertIsInstance(tag_list, list) + self.assertGreater(len(tag_list), 0) + + for video_result in tag_list: + self.assertIn(CameraCalibrationKeys.intrinsics, video_result) + intrinsics = np.array(video_result[CameraCalibrationKeys.intrinsics]) + self.assertEqual(intrinsics.shape, (3, 3)) + + def test(self): + self._run_and_assert(num_proc=1) + + def test_mul_proc(self): + self._run_and_assert(num_proc=2) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/ops/mapper/test_video_camera_calibration_moge_mapper.py b/tests/ops/mapper/test_video_camera_calibration_moge_mapper.py new file mode 100644 index 0000000000..eb7aa286e7 --- /dev/null +++ b/tests/ops/mapper/test_video_camera_calibration_moge_mapper.py @@ -0,0 +1,100 @@ +import os +import unittest +import numpy as np + +from 
data_juicer.core.data import NestedDataset as Dataset +from data_juicer.ops.mapper import VideoExtractFramesMapper, VideoCameraCalibrationMogeMapper +from data_juicer.utils.mm_utils import SpecialTokens +from data_juicer.utils.constant import Fields, MetaKeys, CameraCalibrationKeys +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase +from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE + + +class VideoCameraCalibrationMogeMapperTest(DataJuicerTestCaseBase): + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', + 'data') + vid3_path = os.path.join(data_path, 'video3.mp4') + vid4_path = os.path.join(data_path, 'video4.mp4') + vid12_path = os.path.join(data_path, 'video12.mp4') + + def test_default(self): + ds_list = [{ + 'videos': [self.vid3_path] + }, { + 'videos': [self.vid4_path] + }, { + 'videos': [self.vid12_path] + }] + + num_frames_vid3 = 6 + num_frames_vid4 = 5 + num_frames_vid12 = 1 + tgt_list = [{"frame_names_shape": [num_frames_vid3], + "intrinsics_list_shape": [num_frames_vid3, 3, 3], + "hfov_list_shape": [num_frames_vid3], + "vfov_list_shape": [num_frames_vid3], + "points_list_shape": [num_frames_vid3, 640, 362, 3], + "depth_list_shape": [num_frames_vid3, 640, 362], + "mask_list_shape": [num_frames_vid3, 640, 362]}, + {"frame_names_shape": [num_frames_vid4], + "intrinsics_list_shape": [num_frames_vid4, 3, 3], + "hfov_list_shape": [num_frames_vid4], + "vfov_list_shape": [num_frames_vid4], + "points_list_shape": [num_frames_vid4, 360, 480, 3], + "depth_list_shape": [num_frames_vid4, 360, 480], + "mask_list_shape": [num_frames_vid4, 360, 480]}, + {"frame_names_shape": [num_frames_vid12], + "intrinsics_list_shape": [num_frames_vid12, 3, 3], + "hfov_list_shape": [num_frames_vid12], + "vfov_list_shape": [num_frames_vid12], + "points_list_shape": [num_frames_vid12, 1080, 1920, 3], + "depth_list_shape": [num_frames_vid12, 1080, 1920], + "mask_list_shape": [num_frames_vid12, 1080, 1920]}] + + + # Step 1: 
Extract frames from videos + extract_op = VideoExtractFramesMapper( + frame_sampling_method='all_keyframes', + output_format='bytes', + legacy_split_by_text_token=False, + frame_field=MetaKeys.video_frames, + ) + dataset = Dataset.from_list(ds_list) + if Fields.meta not in dataset.features: + dataset = dataset.add_column(name=Fields.meta, + column=[{}] * dataset.num_rows) + dataset = dataset.map(extract_op.process, batched=True, batch_size=1) + + # Step 2: Run camera calibration + op = VideoCameraCalibrationMogeMapper( + model_path="Ruicheng/moge-2-vitl", + frame_field=MetaKeys.video_frames, + tag_field_name=MetaKeys.camera_calibration_moge_tags, + ) + + dataset = dataset.map(op.process) + res_list = dataset.to_list() + + for sample, target in zip(res_list, tgt_list): + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_moge_tags][0][CameraCalibrationKeys.intrinsics]).shape), + target["intrinsics_list_shape"]) + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_moge_tags][0][CameraCalibrationKeys.hfov]).shape), + target["hfov_list_shape"]) + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_moge_tags][0][CameraCalibrationKeys.vfov]).shape), + target["vfov_list_shape"]) + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_moge_tags][0][CameraCalibrationKeys.points]).shape), + target["points_list_shape"]) + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_moge_tags][0][CameraCalibrationKeys.depth]).shape), + target["depth_list_shape"]) + self.assertEqual( + list(np.array(sample[Fields.meta][MetaKeys.camera_calibration_moge_tags][0][CameraCalibrationKeys.mask]).shape), + target["mask_list_shape"]) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/ops/mapper/test_video_camera_calibration_static_deepcalib_mapper.py 
b/tests/ops/mapper/test_video_camera_calibration_static_deepcalib_mapper.py deleted file mode 100644 index ab816e6709..0000000000 --- a/tests/ops/mapper/test_video_camera_calibration_static_deepcalib_mapper.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -import unittest -import numpy as np - -from data_juicer.core.data import NestedDataset as Dataset -from data_juicer.ops.mapper.video_camera_calibration_static_deepcalib_mapper import VideoCameraCalibrationStaticDeepcalibMapper -from data_juicer.utils.mm_utils import SpecialTokens -from data_juicer.utils.constant import Fields, MetaKeys -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE - - -class VideoCameraCalibrationStaticDeepcalibMapperTest(DataJuicerTestCaseBase): - data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', - 'data') - vid3_path = os.path.join(data_path, 'video3.mp4') - vid4_path = os.path.join(data_path, 'video4.mp4') - vid12_path = os.path.join(data_path, 'video12.mp4') - - def _run_and_assert(self, num_proc): - ds_list = [{ - 'videos': [self.vid3_path] - }, { - 'videos': [self.vid4_path] - }, { - 'videos': [self.vid12_path] - }] - - tgt_list = [{"frame_names_shape": [49], - "intrinsics_list_shape": [49, 3, 3], - "xi_list_shape": [49], - "hfov_list_shape": [49], - "vfov_list_shape": [49]}, - {"frame_names_shape": [22], - "intrinsics_list_shape": [22, 3, 3], - "xi_list_shape": [22], - "hfov_list_shape": [22], - "vfov_list_shape": [22]}, - {"frame_names_shape": [3], - "intrinsics_list_shape": [3, 3, 3], - "xi_list_shape": [3], - "hfov_list_shape": [3], - "vfov_list_shape": [3]}] - - op = VideoCameraCalibrationStaticDeepcalibMapper( - model_path="weights_10_0.02.h5", - frame_num=1, - duration=1, - frame_dir=DATA_JUICER_ASSETS_CACHE, - if_output_info=True, - output_info_dir=DATA_JUICER_ASSETS_CACHE, - ) - dataset = Dataset.from_list(ds_list) - if Fields.meta not in dataset.features: - dataset = 
dataset.add_column(name=Fields.meta, - column=[{}] * dataset.num_rows) - dataset = dataset.map(op.process, num_proc=num_proc, with_rank=True) - res_list = dataset.to_list() - - for sample, target in zip(res_list, tgt_list): - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_deepcalib_tags]["frame_names"]).shape), target["frame_names_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_deepcalib_tags]["intrinsics_list"]).shape), target["intrinsics_list_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_deepcalib_tags]["xi_list"]).shape), target["xi_list_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_deepcalib_tags]["hfov_list"]).shape), target["hfov_list_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_deepcalib_tags]["vfov_list"]).shape), target["vfov_list_shape"]) - - - def test(self): - self._run_and_assert(num_proc=1) - - def test_mul_proc(self): - self._run_and_assert(num_proc=2) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/ops/mapper/test_video_camera_calibration_static_moge_mapper.py b/tests/ops/mapper/test_video_camera_calibration_static_moge_mapper.py deleted file mode 100644 index 1aa03a5fbd..0000000000 --- a/tests/ops/mapper/test_video_camera_calibration_static_moge_mapper.py +++ /dev/null @@ -1,89 +0,0 @@ -import os -import unittest -import numpy as np - -from data_juicer.core.data import NestedDataset as Dataset -from data_juicer.ops.mapper.video_camera_calibration_static_moge_mapper import VideoCameraCalibrationStaticMogeMapper -from data_juicer.utils.mm_utils import SpecialTokens -from data_juicer.utils.constant import Fields, MetaKeys -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE - - 
-class VideoCameraCalibrationStaticMogeMapperTest(DataJuicerTestCaseBase): - data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', - 'data') - vid3_path = os.path.join(data_path, 'video3.mp4') - vid4_path = os.path.join(data_path, 'video4.mp4') - vid12_path = os.path.join(data_path, 'video12.mp4') - - def _run_and_assert(self, num_proc): - ds_list = [{ - 'videos': [self.vid3_path] - }, { - 'videos': [self.vid4_path] - }, { - 'videos': [self.vid12_path] - }] - - tgt_list = [{"frame_names_shape": [49], - "intrinsics_list_shape": [49, 3, 3], - "hfov_list_shape": [49], - "vfov_list_shape": [49], - "points_list_shape": [49, 640, 362, 3], - "depth_list_shape": [49, 640, 362], - "mask_list_shape": [49, 640, 362]}, - {"frame_names_shape": [22], - "intrinsics_list_shape": [22, 3, 3], - "hfov_list_shape": [22], - "vfov_list_shape": [22], - "points_list_shape": [22, 360, 480, 3], - "depth_list_shape": [22, 360, 480], - "mask_list_shape": [22, 360, 480]}, - {"frame_names_shape": [3], - "intrinsics_list_shape": [3, 3, 3], - "hfov_list_shape": [3], - "vfov_list_shape": [3], - "points_list_shape": [3, 1080, 1920, 3], - "depth_list_shape": [3, 1080, 1920], - "mask_list_shape": [3, 1080, 1920]}] - - op = VideoCameraCalibrationStaticMogeMapper( - model_path="Ruicheng/moge-2-vitl", - frame_num=1, - duration=1, - frame_dir=DATA_JUICER_ASSETS_CACHE, - if_output_info=True, - output_info_dir=DATA_JUICER_ASSETS_CACHE, - if_output_points_info=True, - if_output_depth_info=True, - if_output_mask_info=True, - ) - - dataset = Dataset.from_list(ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, - column=[{}] * dataset.num_rows) - dataset = dataset.map(op.process, num_proc=num_proc, with_rank=True) - res_list = dataset.to_list() - - - for sample, target in zip(res_list, tgt_list): - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["frame_names"]).shape), 
target["frame_names_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["intrinsics_list"]).shape), target["intrinsics_list_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["hfov_list"]).shape), target["hfov_list_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["vfov_list"]).shape), target["vfov_list_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["points_list"]).shape), target["points_list_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["depth_list"]).shape), target["depth_list_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.static_camera_calibration_moge_tags]["mask_list"]).shape), target["mask_list_shape"]) - - - def test(self): - self._run_and_assert(num_proc=1) - - def test_mul_proc(self): - self._run_and_assert(num_proc=2) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/ops/mapper/test_video_camera_pose_mapper.py b/tests/ops/mapper/test_video_camera_pose_mapper.py deleted file mode 100644 index 2ef303d7d5..0000000000 --- a/tests/ops/mapper/test_video_camera_pose_mapper.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -import unittest -import numpy as np - -from data_juicer.core.data import NestedDataset as Dataset -from data_juicer.ops.mapper.video_camera_pose_mapper import VideoCameraPoseMapper -from data_juicer.utils.mm_utils import SpecialTokens -from data_juicer.utils.constant import Fields, MetaKeys -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE - - - -class VideoCameraPoseMapperTest(DataJuicerTestCaseBase): - data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', - 'data') - vid3_path = 
os.path.join(data_path, 'video3.mp4') - vid11_path = os.path.join(data_path, 'video11.mp4') - vid12_path = os.path.join(data_path, 'video12.mp4') - - - def _run_and_assert(self, num_proc): - ds_list = [{ - 'videos': [self.vid3_path] - }, { - 'videos': [self.vid11_path] - }, { - 'videos': [self.vid12_path] - }] - - tgt_list = [{"images_shape": [49, 584, 328, 3], - "depths_shape": [49, 584, 328], - "intrinsic_shape": [3, 3], - "cam_c2w_shape": [49, 4, 4]}, - {"images_shape": [11, 328, 584, 3], - "depths_shape": [11, 328, 584], - "intrinsic_shape": [3, 3], - "cam_c2w_shape": [11, 4, 4]}, - {"images_shape": [3, 328, 584, 3], - "depths_shape": [3, 328, 584], - "intrinsic_shape": [3, 3], - "cam_c2w_shape": [3, 4, 4]}] - - op = VideoCameraPoseMapper( - moge_model_path="Ruicheng/moge-2-vitl", - frame_num=1, - duration=1, - frame_dir=DATA_JUICER_ASSETS_CACHE, - if_output_moge_info=False, - moge_output_info_dir=DATA_JUICER_ASSETS_CACHE, - if_save_info=True, - output_info_dir=DATA_JUICER_ASSETS_CACHE, - ) - - dataset = Dataset.from_list(ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, - column=[{}] * dataset.num_rows) - dataset = dataset.map(op.process, num_proc=num_proc, with_rank=True) - res_list = dataset.to_list() - - for sample, target in zip(res_list, tgt_list): - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_camera_pose_tags]["images"]).shape), target["images_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_camera_pose_tags]["depths"]).shape), target["depths_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_camera_pose_tags]["intrinsic"]).shape), target["intrinsic_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.video_camera_pose_tags]["cam_c2w"]).shape), target["cam_c2w_shape"]) - - - def test(self): - self._run_and_assert(num_proc=1) - - def test_mul_proc(self): - self._run_and_assert(num_proc=2) - - -if __name__ == 
'__main__': - unittest.main() \ No newline at end of file diff --git a/tests/ops/mapper/test_video_camera_pose_megasam_mapper.py b/tests/ops/mapper/test_video_camera_pose_megasam_mapper.py new file mode 100644 index 0000000000..075c0839ea --- /dev/null +++ b/tests/ops/mapper/test_video_camera_pose_megasam_mapper.py @@ -0,0 +1,125 @@ +import os +import unittest +import numpy as np +import cv2 + +from data_juicer.core.data import NestedDataset as Dataset +from data_juicer.ops.mapper.video_camera_pose_megasam_mapper import VideoCameraPoseMegaSaMMapper +from data_juicer.ops.mapper.video_extract_frames_mapper import VideoExtractFramesMapper +from data_juicer.utils.constant import Fields, MetaKeys, CameraCalibrationKeys +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +@unittest.skip( + 'Requires mega-sam conda environment with CUDA compiled extensions ' + '(droid_backends, lietorch).' +) +class VideoCameraPoseMegaSaMMapperTest(DataJuicerTestCaseBase): + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', + 'data') + vid3_path = os.path.join(data_path, 'video3.mp4') + vid12_path = os.path.join(data_path, 'video12.mp4') + + def _extract_frames_and_build_dataset(self): + """Extract real frames from videos, then build dataset with + dummy camera calibration (depth + intrinsics) to simulate + output from VideoCameraCalibrationMogeMapper.""" + ds_list = [{ + 'videos': [self.vid3_path] + }, { + 'videos': [self.vid12_path] + }] + + # Step 1: Extract real frames from videos + # Use 'uniform' sampling to ensure enough frames for DROID-SLAM + extract_op = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=8, + output_format='bytes', + legacy_split_by_text_token=False, + ) + dataset = Dataset.from_list(ds_list) + if Fields.meta not in dataset.features: + dataset = dataset.add_column(name=Fields.meta, + column=[{}] * dataset.num_rows) + dataset = dataset.map(extract_op.process, batched=True, batch_size=1) + + # 
Step 2: Add dummy camera calibration data matching real frame dims + res_list = dataset.to_list() + for sample in res_list: + video_frames = sample[MetaKeys.video_frames] + calibration_list = [] + for frames_per_video in video_frames: + num_frames = len(frames_per_video) + # Read the first frame to get dimensions + first_frame = frames_per_video[0] + if isinstance(first_frame, bytes): + image_array = np.frombuffer(first_frame, dtype=np.uint8) + first_frame = cv2.imdecode(image_array, cv2.IMREAD_COLOR) + else: + first_frame = cv2.imread(first_frame) + h, w, _ = first_frame.shape + # Dummy intrinsics (3x3) + focal = 500.0 + K = [[focal, 0, w / 2.0], + [0, focal, h / 2.0], + [0, 0, 1]] + # Structured depth maps (N, H, W) simulating a scene + # with smooth depth gradient (near=1m to far=5m) + # Random noise causes DROID-SLAM factor graph to be empty + base_depth = np.linspace(1.0, 5.0, h).reshape(h, 1) + base_depth = np.broadcast_to(base_depth, (h, w)) + depth = np.stack([ + base_depth + 0.1 * i for i in range(num_frames) + ]).tolist() + calibration_list.append({ + CameraCalibrationKeys.depth: depth, + CameraCalibrationKeys.intrinsics: K, + }) + sample[Fields.meta]['camera_calibration'] = calibration_list + + dataset = Dataset.from_list(res_list) + return dataset + + def test_default(self): + dataset = self._extract_frames_and_build_dataset() + + op = VideoCameraPoseMegaSaMMapper( + tag_field_name=MetaKeys.video_camera_pose_tags, + frame_field=MetaKeys.video_frames, + camera_calibration_field='camera_calibration', + max_frames=1000, + ) + dataset = dataset.map(op.process) + res_list = dataset.to_list() + + tgt = { + "depths_ndim": 3, + "intrinsic_shape": [3, 3], + "cam_c2w_last_dim": 4, + } + + for sample in res_list: + tag_list = sample[Fields.meta][MetaKeys.video_camera_pose_tags] + self.assertIsInstance(tag_list, list) + self.assertGreater(len(tag_list), 0) + + for video_result in tag_list: + # Check output keys + self.assertIn(CameraCalibrationKeys.depth, 
video_result) + self.assertIn(CameraCalibrationKeys.intrinsics, video_result) + self.assertIn(CameraCalibrationKeys.cam_c2w, video_result) + + # Check shapes + depths = np.array(video_result[CameraCalibrationKeys.depth]) + intrinsic = np.array(video_result[CameraCalibrationKeys.intrinsics]) + cam_c2w = np.array(video_result[CameraCalibrationKeys.cam_c2w]) + + self.assertEqual(depths.ndim, tgt["depths_ndim"]) + self.assertEqual(list(intrinsic.shape), tgt["intrinsic_shape"]) + self.assertEqual(cam_c2w.shape[-1], tgt["cam_c2w_last_dim"]) + self.assertEqual(cam_c2w.shape[-2], 4) # (N, 4, 4) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/ops/mapper/test_video_clip_reassembly_mapper.py b/tests/ops/mapper/test_video_clip_reassembly_mapper.py new file mode 100644 index 0000000000..1170a85319 --- /dev/null +++ b/tests/ops/mapper/test_video_clip_reassembly_mapper.py @@ -0,0 +1,284 @@ +import os +import shutil +import tempfile +import unittest + +import cv2 +import numpy as np + +from data_juicer.ops.base_op import Fields +from data_juicer.ops.mapper.video_clip_reassembly_mapper import \ + VideoClipReassemblyMapper +from data_juicer.utils.constant import CameraCalibrationKeys, MetaKeys +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +class VideoClipReassemblyMapperTest(DataJuicerTestCaseBase): + """Tests for VideoClipReassemblyMapper.""" + + def setUp(self): + super().setUp() + self.tmp_dir = tempfile.mkdtemp() + + def tearDown(self): + super().tearDown() + if os.path.exists(self.tmp_dir): + shutil.rmtree(self.tmp_dir) + + # ------------------------------------------------------------------ + # helpers + # ------------------------------------------------------------------ + def _create_frames(self, n_frames, prefix="clip0"): + """Create unique dummy frames with reproducible content.""" + clip_dir = os.path.join(self.tmp_dir, prefix) + os.makedirs(clip_dir, exist_ok=True) + paths = [] + for i in range(n_frames): + # Deterministic 
content based on global frame id + img = np.full((100, 100, 3), fill_value=(i * 7) % 256, + dtype=np.uint8) + path = os.path.join(clip_dir, f"frame_{i:04d}.jpg") + cv2.imwrite(path, img) + paths.append(path) + return paths + + def _create_overlapping_clips( + self, total_frames=30, clip_len=15, overlap=5, + ): + """Create overlapping clip frame lists with shared frame images. + + Returns (per_clip_frames, all_frame_paths). + """ + step = clip_len - overlap + all_dir = os.path.join(self.tmp_dir, "all_frames") + os.makedirs(all_dir, exist_ok=True) + + # Create all unique frames + all_paths = [] + for i in range(total_frames): + img = np.full((100, 100, 3), fill_value=(i * 7) % 256, + dtype=np.uint8) + path = os.path.join(all_dir, f"frame_{i:04d}.jpg") + cv2.imwrite(path, img) + all_paths.append(path) + + # Build per-clip frame lists (with real overlapping files) + per_clip = [] + offset = 0 + while offset < total_frames: + end = min(offset + clip_len, total_frames) + clip_frames = [] + clip_dir = os.path.join( + self.tmp_dir, f"clip_{len(per_clip)}") + os.makedirs(clip_dir, exist_ok=True) + for local_i, global_i in enumerate(range(offset, end)): + # Copy the global frame so pixel matching works + src = all_paths[global_i] + dst = os.path.join(clip_dir, f"frame_{local_i:04d}.jpg") + img = cv2.imread(src) + cv2.imwrite(dst, img) + clip_frames.append(dst) + per_clip.append(clip_frames) + offset += step + if end >= total_frames: + break + + return per_clip, all_paths + + @staticmethod + def _make_hand_data(n_frames, hand_type="right", offset=0.0): + states = np.zeros((n_frames, 8), dtype=np.float32) + for i in range(n_frames): + states[i, 0] = offset + i * 0.01 + return { + "hand_type": hand_type, + "states": states.tolist(), + "actions": np.zeros((n_frames, 7)).tolist(), + "valid_frame_ids": list(range(n_frames)), + "joints_world": [], + "joints_cam": [], + } + + @staticmethod + def _make_cam_pose(n_frames): + c2w = np.array([np.eye(4) for _ in range(n_frames)]) + 
return {CameraCalibrationKeys.cam_c2w: c2w.tolist()} + + # ------------------------------------------------------------------ + # _merge_video_frames + # ------------------------------------------------------------------ + def test_merge_video_frames_no_overlap(self): + frames_a = ["a0", "a1", "a2"] + frames_b = ["b0", "b1", "b2"] + merged = VideoClipReassemblyMapper._merge_video_frames( + [frames_a, frames_b], [0, 3]) + self.assertEqual(merged, ["a0", "a1", "a2", "b0", "b1", "b2"]) + + def test_merge_video_frames_with_overlap(self): + frames_a = ["a0", "a1", "a2", "a3", "a4"] + frames_b = ["b0", "b1", "b2", "b3", "b4"] + # Clip B starts at global offset 3 → overlaps with a3, a4 + merged = VideoClipReassemblyMapper._merge_video_frames( + [frames_a, frames_b], [0, 3]) + self.assertEqual(len(merged), 8) # 0..7 + # First clip fills 0-4, second fills 5-7 (3+0=3 already filled) + self.assertEqual(merged[0], "a0") + self.assertEqual(merged[3], "a3") # first clip wins + self.assertEqual(merged[5], "b2") # only from clip B + + # ------------------------------------------------------------------ + # _detect_clip_offsets + # ------------------------------------------------------------------ + def test_detect_clip_offsets_with_matching(self): + """Pixel matching should detect the correct overlap offset.""" + per_clip, _ = self._create_overlapping_clips( + total_frames=30, clip_len=15, overlap=5) + offsets = VideoClipReassemblyMapper._detect_clip_offsets( + per_clip, nominal_step=10) + # First offset is always 0 + self.assertEqual(offsets[0], 0) + # With clip_len=15, overlap=5 → step=10 + if len(offsets) > 1: + self.assertEqual(offsets[1], 10) + + def test_detect_clip_offsets_single_clip(self): + frames = self._create_frames(10, "only_clip") + offsets = VideoClipReassemblyMapper._detect_clip_offsets( + [frames], nominal_step=10) + self.assertEqual(offsets, [0]) + + # ------------------------------------------------------------------ + # _blend_weight + # 
------------------------------------------------------------------ + def test_blend_weight_no_overlap(self): + op = VideoClipReassemblyMapper() + w = op._blend_weight( + clip_idx=0, local_fid=5, n_clips=1, + clip_len=10, overlap_prev=0, overlap_next=0) + self.assertAlmostEqual(w, 1.0) + + def test_blend_weight_overlap_ramp(self): + op = VideoClipReassemblyMapper() + # First frame of overlap region with previous clip + w = op._blend_weight( + clip_idx=1, local_fid=0, n_clips=3, + clip_len=20, overlap_prev=5, overlap_next=5) + self.assertLess(w, 1.0) + self.assertGreater(w, 0.0) + + # ------------------------------------------------------------------ + # _merge_moge + # ------------------------------------------------------------------ + def test_merge_moge_basic(self): + moge_a = {"depth": ["d0", "d1", "d2"], "hfov": [1.0, 1.0, 1.0]} + moge_b = {"depth": ["d3", "d4", "d5"], "hfov": [1.0, 1.0, 1.0]} + merged = VideoClipReassemblyMapper._merge_moge( + [moge_a, moge_b], [0, 3]) + self.assertEqual(len(merged["depth"]), 6) + + # ------------------------------------------------------------------ + # end-to-end with single clip → passthrough + # ------------------------------------------------------------------ + def test_single_clip_passthrough(self): + """Single clip (non-nested frames) → no reassembly needed.""" + frames = self._create_frames(10, "single") + sample = { + "videos": ["video.mp4"], + "clips": ["video.mp4"], + "video_frames": frames, # not nested + Fields.meta: { + MetaKeys.hand_action_tags: [ + {"right": self._make_hand_data(10)}, + ], + MetaKeys.video_camera_pose_tags: [ + self._make_cam_pose(10), + ], + MetaKeys.camera_calibration_moge_tags: [ + {"hfov": [1.0] * 10}, + ], + }, + } + op = VideoClipReassemblyMapper( + split_duration=5.0, overlap_duration=2.0, fps=10.0) + result = op.process_single(sample) + # Should be unchanged + self.assertEqual(len(result["video_frames"]), 10) + + # ------------------------------------------------------------------ + # 
end-to-end with multiple clips + # ------------------------------------------------------------------ + def test_multi_clip_reassembly(self): + """Two overlapping clips should be merged correctly.""" + per_clip, _ = self._create_overlapping_clips( + total_frames=25, clip_len=15, overlap=5) + n_clips = len(per_clip) + + # Build per-clip hand action data + hand_actions = [] + cam_poses = [] + moge_list = [] + for ci in range(n_clips): + n = len(per_clip[ci]) + hand_actions.append({"right": self._make_hand_data(n)}) + cam_poses.append(self._make_cam_pose(n)) + moge_list.append({"depth": [f"d{ci}_{j}" for j in range(n)], + "hfov": [1.0] * n}) + + sample = { + "videos": ["video.mp4"], + "clips": ["clip0.mp4", "clip1.mp4"], + "video_frames": per_clip, + Fields.meta: { + MetaKeys.hand_action_tags: hand_actions, + MetaKeys.video_camera_pose_tags: cam_poses, + MetaKeys.camera_calibration_moge_tags: moge_list, + }, + } + + op = VideoClipReassemblyMapper( + split_duration=1.5, overlap_duration=0.5, fps=10.0) + result = op.process_single(sample) + + meta = result[Fields.meta] + # hand_action_tags should be merged into single entry + ha = meta[MetaKeys.hand_action_tags] + self.assertEqual(len(ha), 1) + merged_right = ha[0]["right"] + # Merged trajectory should cover more frames than a single clip + self.assertGreater(len(merged_right["states"]), + len(per_clip[0])) + + # video_frames should be merged + merged_frames = result["video_frames"] + self.assertIsInstance(merged_frames, list) + + # ------------------------------------------------------------------ + # _empty_hand_result + # ------------------------------------------------------------------ + def test_empty_hand_result(self): + r = VideoClipReassemblyMapper._empty_hand_result("left") + self.assertEqual(r["hand_type"], "left") + self.assertEqual(r["states"], []) + self.assertEqual(r["valid_frame_ids"], []) + + # ------------------------------------------------------------------ + # nominal step computation + # 
------------------------------------------------------------------ + def test_compute_nominal_step(self): + op = VideoClipReassemblyMapper( + split_duration=5.0, overlap_duration=2.0, fps=30.0) + self.assertEqual(op._compute_nominal_step(), 90) + + def test_compute_nominal_step_none(self): + op = VideoClipReassemblyMapper() + self.assertIsNone(op._compute_nominal_step()) + + def test_no_meta_passthrough(self): + sample = {"text": "hello"} + op = VideoClipReassemblyMapper() + result = op.process_single(sample) + self.assertEqual(result, sample) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/ops/mapper/test_video_extract_frames_mapper.py b/tests/ops/mapper/test_video_extract_frames_mapper.py index 8acfb43c29..7626a350b7 100644 --- a/tests/ops/mapper/test_video_extract_frames_mapper.py +++ b/tests/ops/mapper/test_video_extract_frames_mapper.py @@ -306,5 +306,340 @@ def test_legacy_split_by_text_token_false(self): self.assertTrue(isinstance(res_list[sample_i][frame_key][video_idx][0], bytes)) + # ---------------------------------------------------------------- + # FFmpeg backend: uniform sampling tests + # ---------------------------------------------------------------- + + def test_ffmpeg_uniform_no_duration_path(self): + """FFmpeg uniform sampling without duration, output as path.""" + ds_list = [{ + 'text': f'{SpecialTokens.video} 白色的小羊站在一旁讲话。旁边还有两只灰色猫咪和一只拉着灰狼的猫咪。', + 'videos': [self.vid1_path] + }, { + 'text': + f'{SpecialTokens.video} 身穿白色上衣的男子,拿着一个东西,拍打自己的胃部。{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }, { + 'text': + f'{SpecialTokens.video} 两个长头发的女子正坐在一张圆桌前讲话互动。 {SpecialTokens.eoc}', + 'videos': [self.vid3_path] + }] + + frame_num = 2 + frame_dir = os.path.join(self.tmp_dir, 'test_ffmpeg_uniform_no_dur') + vid1_frame_dir = self._get_frames_dir(self.vid1_path, frame_dir) + vid2_frame_dir = self._get_frames_dir(self.vid2_path, frame_dir) + vid3_frame_dir = self._get_frames_dir(self.vid3_path, frame_dir) + + op = 
VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + output_format='path', + duration=0, + frame_dir=frame_dir, + batch_size=2, + num_proc=1, + video_backend='ffmpeg') + + dataset = Dataset.from_list(ds_list) + dataset = op.run(dataset) + res_list = dataset.to_list() + + tgt_frames_num = [[frame_num], [frame_num], [frame_num]] + tgt_frames_dir = [[vid1_frame_dir], [vid2_frame_dir], [vid3_frame_dir]] + for sample_i in range(len(res_list)): + num_videos = len(ds_list[sample_i]['videos']) + for video_idx in range(num_videos): + self.assertEqual( + res_list[sample_i][MetaKeys.video_frames][video_idx], + [osp.join( + tgt_frames_dir[sample_i][video_idx], + f'frame_{f_i}.jpg') for f_i in range(tgt_frames_num[sample_i][video_idx]) + ]) + # verify frame files actually exist + for frame_path in res_list[sample_i][MetaKeys.video_frames][video_idx]: + self.assertTrue(osp.exists(frame_path)) + + def test_ffmpeg_uniform_with_duration_path(self): + """FFmpeg uniform sampling with duration segmentation, output as path. + Should produce same frame counts as the av backend test_uniform_sampling. 
+ """ + ds_list = [{ + 'text': f'{SpecialTokens.video} 白色的小羊站在一旁讲话。旁边还有两只灰色猫咪和一只拉着灰狼的猫咪。', + 'videos': [self.vid1_path] + }, { + 'text': + f'{SpecialTokens.video} 身穿白色上衣的男子,拿着一个东西,拍打自己的胃部。{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }, { + 'text': + f'{SpecialTokens.video} 两个长头发的女子正坐在一张圆桌前讲话互动。 {SpecialTokens.eoc}', + 'videos': [self.vid3_path] + }] + + frame_num = 3 + frame_dir = os.path.join(self.tmp_dir, 'test_ffmpeg_uniform_dur') + vid1_frame_dir = self._get_frames_dir(self.vid1_path, frame_dir) + vid2_frame_dir = self._get_frames_dir(self.vid2_path, frame_dir) + vid3_frame_dir = self._get_frames_dir(self.vid3_path, frame_dir) + + op = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + duration=10, + frame_dir=frame_dir, + batch_size=2, + num_proc=1, + video_backend='ffmpeg') + + dataset = Dataset.from_list(ds_list) + dataset = op.run(dataset) + res_list = dataset.to_list() + + # Same expected counts as AV backend: + # video1 (11.75s): 1 full segment [0,10) -> 3 frames + # video2 (23.17s): 2 full segments [0,10),[10,20) -> 6 frames + # video3 (49.58s): 4 full segments [0,10),[10,20),[20,30),[30,40) -> 12 frames + tgt_frames_num = [[3], [6], [12]] + tgt_frames_dir = [[vid1_frame_dir], [vid2_frame_dir], [vid3_frame_dir]] + for sample_i in range(len(res_list)): + num_videos = len(ds_list[sample_i]['videos']) + for video_idx in range(num_videos): + self.assertEqual( + res_list[sample_i][MetaKeys.video_frames][video_idx], + [osp.join( + tgt_frames_dir[sample_i][video_idx], + f'frame_{f_i}.jpg') for f_i in range(tgt_frames_num[sample_i][video_idx]) + ]) + + def test_ffmpeg_uniform_bytes_format(self): + """FFmpeg uniform sampling with bytes output format.""" + ds_list = [{ + 'text': f'{SpecialTokens.video} 白色的小羊站在一旁讲话。旁边还有两只灰色猫咪和一只拉着灰狼的猫咪。', + 'videos': [self.vid1_path] + }, { + 'text': + f'{SpecialTokens.video} 身穿白色上衣的男子,拿着一个东西,拍打自己的胃部。{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }, { + 'text': + 
f'{SpecialTokens.video} 两个长头发的女子正坐在一张圆桌前讲话互动。 {SpecialTokens.eoc}', + 'videos': [self.vid3_path] + }] + + frame_num = 3 + frame_field = 'frames_bytes' + op = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + output_format='bytes', + duration=0, + frame_field=frame_field, + batch_size=2, + num_proc=1, + video_backend='ffmpeg') + + dataset = Dataset.from_list(ds_list) + dataset = op.run(dataset) + res_list = dataset.to_list() + + for sample_i in range(len(res_list)): + num_videos = len(ds_list[sample_i]['videos']) + for video_idx in range(num_videos): + self.assertEqual( + len(res_list[sample_i][frame_field][video_idx]), + frame_num) + self.assertTrue( + isinstance(res_list[sample_i][frame_field][video_idx][0], bytes)) + + def test_ffmpeg_uniform_bytes_with_duration(self): + """FFmpeg uniform sampling with duration segmentation, bytes output.""" + ds_list = [{ + 'text': f'{SpecialTokens.video} 白色的小羊站在一旁讲话。旁边还有两只灰色猫咪和一只拉着灰狼的猫咪。', + 'videos': [self.vid1_path] + }, { + 'text': + f'{SpecialTokens.video} 身穿白色上衣的男子,拿着一个东西,拍打自己的胃部。{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }, { + 'text': + f'{SpecialTokens.video} 两个长头发的女子正坐在一张圆桌前讲话互动。 {SpecialTokens.eoc}', + 'videos': [self.vid3_path] + }] + + frame_num = 3 + frame_field = 'frames_bytes' + op = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + output_format='bytes', + duration=10, + frame_field=frame_field, + batch_size=2, + num_proc=1, + video_backend='ffmpeg') + + dataset = Dataset.from_list(ds_list) + dataset = op.run(dataset) + res_list = dataset.to_list() + + tgt_frames_num = [[3], [6], [12]] + for sample_i in range(len(res_list)): + num_videos = len(ds_list[sample_i]['videos']) + for video_idx in range(num_videos): + self.assertEqual( + len(res_list[sample_i][frame_field][video_idx]), + tgt_frames_num[sample_i][video_idx]) + self.assertTrue( + isinstance(res_list[sample_i][frame_field][video_idx][0], bytes)) + + def 
test_ffmpeg_uniform_single_frame(self): + """FFmpeg uniform sampling with frame_num=1 extracts middle frame.""" + ds_list = [{ + 'text': f'{SpecialTokens.video} 白色的小羊站在一旁讲话。', + 'videos': [self.vid1_path] + }] + + frame_num = 1 + frame_dir = os.path.join(self.tmp_dir, 'test_ffmpeg_single') + vid1_frame_dir = self._get_frames_dir(self.vid1_path, frame_dir) + + op = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + output_format='path', + duration=0, + frame_dir=frame_dir, + batch_size=1, + num_proc=1, + video_backend='ffmpeg') + + dataset = Dataset.from_list(ds_list) + dataset = op.run(dataset) + res_list = dataset.to_list() + + self.assertEqual( + res_list[0][MetaKeys.video_frames][0], + [osp.join(vid1_frame_dir, 'frame_0.jpg')]) + self.assertTrue(osp.exists(res_list[0][MetaKeys.video_frames][0][0])) + + def test_ffmpeg_uniform_multi_video_per_sample(self): + """FFmpeg uniform sampling with multiple videos in a single sample.""" + ds_list = [{ + 'text': + f'{SpecialTokens.video} 身穿白色上衣的男子。{SpecialTokens.eoc}' + + f'{SpecialTokens.video} 两个长头发的女子。{SpecialTokens.eoc}', + 'videos': [self.vid2_path, self.vid3_path] + }] + + frame_num = 4 + frame_dir = os.path.join(self.tmp_dir, 'test_ffmpeg_multi') + vid2_frame_dir = self._get_frames_dir(self.vid2_path, frame_dir) + vid3_frame_dir = self._get_frames_dir(self.vid3_path, frame_dir) + + op = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + output_format='path', + duration=0, + frame_dir=frame_dir, + batch_size=1, + num_proc=1, + video_backend='ffmpeg') + + dataset = Dataset.from_list(ds_list) + dataset = op.run(dataset) + res_list = dataset.to_list() + + self.assertEqual(len(res_list[0][MetaKeys.video_frames]), 2) + self.assertEqual( + res_list[0][MetaKeys.video_frames][0], + [osp.join(vid2_frame_dir, f'frame_{i}.jpg') for i in range(frame_num)]) + self.assertEqual( + res_list[0][MetaKeys.video_frames][1], + [osp.join(vid3_frame_dir, 
f'frame_{i}.jpg') for i in range(frame_num)]) + + def test_ffmpeg_uniform_legacy_split_false(self): + """FFmpeg uniform sampling with legacy_split_by_text_token=False.""" + ds_list = [{ + 'text': '', + 'videos': [self.vid1_path] + }, { + 'text': '', + 'videos': [self.vid2_path, self.vid3_path] + }] + + frame_num = 3 + frame_field = 'frames' + op = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + output_format='bytes', + duration=0, + frame_field=frame_field, + batch_size=2, + num_proc=1, + video_backend='ffmpeg', + legacy_split_by_text_token=False) + + dataset = Dataset.from_list(ds_list) + dataset = op.run(dataset) + res_list = dataset.to_list() + + # Each video should have exactly frame_num frames + self.assertEqual(len(res_list[0][frame_field][0]), frame_num) + self.assertEqual(len(res_list[1][frame_field][0]), frame_num) + self.assertEqual(len(res_list[1][frame_field][1]), frame_num) + self.assertTrue(isinstance(res_list[0][frame_field][0][0], bytes)) + + def test_ffmpeg_uniform_frame_count_matches_av(self): + """Verify that ffmpeg uniform sampling produces the same frame count + as the av backend for both with and without duration.""" + ds_list = [{ + 'text': f'{SpecialTokens.video} test', + 'videos': [self.vid3_path] + }] + + for frame_num in [1, 2, 3, 5]: + for duration in [0, 5, 10, 15]: + frame_field_av = f'frames_av_{frame_num}_{duration}' + frame_field_ff = f'frames_ff_{frame_num}_{duration}' + + op_av = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + output_format='bytes', + duration=duration, + frame_field=frame_field_av, + batch_size=1, + num_proc=1, + video_backend='av') + + op_ff = VideoExtractFramesMapper( + frame_sampling_method='uniform', + frame_num=frame_num, + output_format='bytes', + duration=duration, + frame_field=frame_field_ff, + batch_size=1, + num_proc=1, + video_backend='ffmpeg') + + dataset_av = Dataset.from_list(ds_list) + dataset_av = op_av.run(dataset_av) + 
res_av = dataset_av.to_list() + + dataset_ff = Dataset.from_list(ds_list) + dataset_ff = op_ff.run(dataset_ff) + res_ff = dataset_ff.to_list() + + av_count = len(res_av[0][frame_field_av][0]) + ff_count = len(res_ff[0][frame_field_ff][0]) + self.assertEqual( + av_count, ff_count, + f'Frame count mismatch for frame_num={frame_num}, ' + f'duration={duration}: av={av_count}, ffmpeg={ff_count}') + + if __name__ == '__main__': unittest.main() diff --git a/tests/ops/mapper/test_video_hand_action_compute_mapper.py b/tests/ops/mapper/test_video_hand_action_compute_mapper.py new file mode 100644 index 0000000000..d293eb5fb5 --- /dev/null +++ b/tests/ops/mapper/test_video_hand_action_compute_mapper.py @@ -0,0 +1,212 @@ +import os +import unittest +import numpy as np + +from data_juicer.core.data import NestedDataset as Dataset +from data_juicer.ops.mapper.video_hand_action_compute_mapper import VideoHandActionComputeMapper +from data_juicer.utils.constant import Fields, MetaKeys, CameraCalibrationKeys +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +class VideoHandActionComputeMapperTest(DataJuicerTestCaseBase): + + def _make_sample(self, num_frames=10, with_left=True): + """Create a synthetic sample with hand reconstruction and camera pose data.""" + # Generate dummy camera poses: identity with small translations + cam_c2w = [] + for i in range(num_frames): + mat = np.eye(4) + mat[0, 3] = i * 0.01 # small x translation + cam_c2w.append(mat.tolist()) + + # Generate dummy right hand reconstruction data + right_frame_ids = list(range(num_frames)) + right_transl = (np.random.randn(num_frames, 3) * 0.1).tolist() + right_global_orient = (np.random.randn(num_frames, 3) * 0.1).tolist() # axis-angle + right_hand_pose = (np.random.randn(num_frames, 45) * 0.1).tolist() # axis-angle + + hand_recon = { + "fov_x": 0.75, + "img_focal": 500.0, + "right": { + "frame_ids": right_frame_ids, + "transl": right_transl, + "global_orient": right_global_orient, + 
"hand_pose": right_hand_pose, + "betas": (np.zeros((num_frames, 10))).tolist(), + }, + "left": { + "frame_ids": [], + "transl": [], + "global_orient": [], + "hand_pose": [], + "betas": [], + } + } + + if with_left: + left_frame_ids = list(range(0, num_frames, 2)) # every other frame + n_left = len(left_frame_ids) + hand_recon["left"] = { + "frame_ids": left_frame_ids, + "transl": (np.random.randn(n_left, 3) * 0.1).tolist(), + "global_orient": (np.random.randn(n_left, 3) * 0.1).tolist(), + "hand_pose": (np.random.randn(n_left, 45) * 0.1).tolist(), + "betas": (np.zeros((n_left, 10))).tolist(), + } + + camera_pose = { + CameraCalibrationKeys.cam_c2w: cam_c2w, + } + + sample = { + 'videos': ['dummy_video.mp4'], + 'text': 'pick up the cup', + Fields.meta: { + MetaKeys.hand_reconstruction_hawor_tags: [hand_recon], + MetaKeys.video_camera_pose_tags: [camera_pose], + } + } + return sample + + def test_both_hands(self): + """Test computing actions for both hands.""" + sample = self._make_sample(num_frames=10, with_left=True) + ds_list = [sample] + + op = VideoHandActionComputeMapper( + hand_reconstruction_field=MetaKeys.hand_reconstruction_hawor_tags, + camera_pose_field=MetaKeys.video_camera_pose_tags, + tag_field_name=MetaKeys.hand_action_tags, + hand_type="both", + ) + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + self.assertEqual(len(res_list), 1) + tag = res_list[0][Fields.meta][MetaKeys.hand_action_tags] + self.assertIsInstance(tag, list) + self.assertEqual(len(tag), 1) # one video + + video_result = tag[0] + # Should have both hand results + self.assertIn('right', video_result) + self.assertIn('left', video_result) + + # Right hand + right = video_result['right'] + self.assertIn('states', right) + self.assertIn('actions', right) + self.assertIn('valid_frame_ids', right) + + states = np.array(right['states']) + actions = np.array(right['actions']) + 
self.assertEqual(states.shape[1], 8) # 8-dim state + self.assertEqual(actions.shape[1], 7) # 7-dim action + self.assertEqual(states.shape[0], actions.shape[0]) + + # Left hand + left = video_result['left'] + left_states = np.array(left['states']) + left_actions = np.array(left['actions']) + if len(left_states) > 0: + self.assertEqual(left_states.shape[1], 8) + self.assertEqual(left_actions.shape[1], 7) + + def test_right_hand_only(self): + """Test computing actions for right hand only.""" + sample = self._make_sample(num_frames=10, with_left=False) + ds_list = [sample] + + op = VideoHandActionComputeMapper( + hand_type="right", + ) + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + video_result = res_list[0][Fields.meta][MetaKeys.hand_action_tags][0] + self.assertIn('right', video_result) + self.assertNotIn('left', video_result) + + right = video_result['right'] + states = np.array(right['states']) + self.assertEqual(states.shape[1], 8) + + def test_insufficient_frames(self): + """Test with only 1 frame (needs at least 2 for actions).""" + sample = self._make_sample(num_frames=1, with_left=False) + ds_list = [sample] + + op = VideoHandActionComputeMapper(hand_type="right") + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + video_result = res_list[0][Fields.meta][MetaKeys.hand_action_tags][0] + right = video_result['right'] + # Should be empty when < 2 frames + self.assertEqual(len(right['states']), 0) + self.assertEqual(len(right['actions']), 0) + + def test_empty_hand_recon(self): + """Test with empty hand reconstruction data.""" + sample = { + 'videos': ['dummy_video.mp4'], + 'text': 'test', + Fields.meta: { + MetaKeys.hand_reconstruction_hawor_tags: [], + MetaKeys.video_camera_pose_tags: [], + } + } + + op = VideoHandActionComputeMapper(hand_type="both") + + dataset = 
Dataset.from_list([sample]) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + tag = res_list[0][Fields.meta][MetaKeys.hand_action_tags] + self.assertEqual(len(tag), 0) + + def test_last_action_is_zero(self): + """Test that the last frame's positional action is zero.""" + sample = self._make_sample(num_frames=5, with_left=False) + ds_list = [sample] + + op = VideoHandActionComputeMapper(hand_type="right") + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1, with_rank=True) + res_list = dataset.to_list() + + video_result = res_list[0][Fields.meta][MetaKeys.hand_action_tags][0] + actions = np.array(video_result['right']['actions']) + # Last action: position deltas should be zero + np.testing.assert_array_almost_equal(actions[-1, :6], 0.0, decimal=5) + + def test_mul_proc(self): + """Test with multiple processes.""" + samples = [ + self._make_sample(num_frames=8, with_left=True), + self._make_sample(num_frames=6, with_left=True), + ] + + op = VideoHandActionComputeMapper(hand_type="both") + + dataset = Dataset.from_list(samples) + dataset = dataset.map(op.process, num_proc=2, with_rank=True) + res_list = dataset.to_list() + + self.assertEqual(len(res_list), 2) + for sample in res_list: + tag = sample[Fields.meta][MetaKeys.hand_action_tags] + self.assertGreater(len(tag), 0) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/ops/mapper/test_video_hand_motion_smooth_mapper.py b/tests/ops/mapper/test_video_hand_motion_smooth_mapper.py new file mode 100644 index 0000000000..1647899c4d --- /dev/null +++ b/tests/ops/mapper/test_video_hand_motion_smooth_mapper.py @@ -0,0 +1,266 @@ +import unittest + +import numpy as np + +from data_juicer.ops.base_op import Fields +from data_juicer.ops.mapper.video_hand_motion_smooth_mapper import ( + VideoHandMotionSmoothMapper, + _recompute_actions, +) +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +class 
RecomputeActionsTest(DataJuicerTestCaseBase): + """Tests for the _recompute_actions helper.""" + + def test_linear_motion(self): + """Linear translation should produce constant dx.""" + n = 10 + states = np.zeros((n, 8), dtype=np.float64) + for i in range(n): + states[i, 0] = i * 0.1 # x increases linearly + actions = _recompute_actions(states) + self.assertEqual(actions.shape, (n, 7)) + for t in range(n - 1): + self.assertAlmostEqual(actions[t, 0], 0.1, places=5) + self.assertAlmostEqual(actions[t, 1], 0.0, places=5) + self.assertAlmostEqual(actions[t, 2], 0.0, places=5) + + def test_gripper_passthrough(self): + """Gripper value from next state should appear in action.""" + states = np.zeros((5, 8), dtype=np.float64) + states[1, 7] = 0.5 + states[2, 7] = 1.0 + actions = _recompute_actions(states) + self.assertAlmostEqual(actions[0, 6], 0.5) + self.assertAlmostEqual(actions[1, 6], 1.0) + + def test_single_frame(self): + states = np.zeros((1, 8), dtype=np.float64) + states[0, 7] = 0.3 + actions = _recompute_actions(states) + self.assertEqual(actions.shape, (1, 7)) + self.assertAlmostEqual(actions[0, 6], 0.3) + + +class VideoHandMotionSmoothMapperTest(DataJuicerTestCaseBase): + """Tests for VideoHandMotionSmoothMapper.""" + + # ------------------------------------------------------------------ + # helpers + # ------------------------------------------------------------------ + @staticmethod + def _make_hand_data(positions, hand_type="right"): + """Build hand data dict from (N, 3) positions.""" + n = len(positions) + states = np.zeros((n, 8), dtype=np.float32) + states[:, 0:3] = positions + return { + "hand_type": hand_type, + "states": states.tolist(), + "actions": np.zeros((n, 7), dtype=np.float32).tolist(), + "valid_frame_ids": list(range(n)), + "joints_world": [], + "joints_cam": [], + } + + def _make_sample(self, right_data=None, left_data=None): + clip = {} + if right_data is not None: + clip["right"] = right_data + if left_data is not None: + clip["left"] = 
left_data + return { + Fields.meta: { + "hand_action_tags": [clip], + } + } + + # ------------------------------------------------------------------ + # outlier replacement + # ------------------------------------------------------------------ + def test_replace_outliers_no_outliers(self): + """Smooth trajectory should be unchanged.""" + positions = np.column_stack([ + np.linspace(0, 1, 20), + np.zeros(20), + np.zeros(20), + ]) + result = VideoHandMotionSmoothMapper._replace_outliers( + positions, threshold_mad=5.0) + np.testing.assert_allclose(result, positions, atol=1e-10) + + def test_replace_outliers_with_spike(self): + """A single spike should be interpolated away.""" + np.random.seed(42) + n = 20 + positions = np.column_stack([ + np.linspace(0, 1, n) + np.random.normal(0, 0.02, n), + np.random.normal(0, 0.02, n), + np.random.normal(0, 0.02, n), + ]) + # Insert a huge spike at frame 10 + positions[10] = [100.0, 100.0, 100.0] + result = VideoHandMotionSmoothMapper._replace_outliers( + positions, threshold_mad=3.0) + # The spike should be reduced from the original (100, 100, 100) + original_dist = np.linalg.norm(positions[10] - positions[9]) + result_dist = np.linalg.norm(result[10] - result[9]) + self.assertLess(result_dist, original_dist) + + def test_replace_outliers_short_trajectory(self): + """Very short trajectories should be returned as-is.""" + positions = np.array([[0, 0, 0], [1, 0, 0], [2, 0, 0]], + dtype=np.float64) + result = VideoHandMotionSmoothMapper._replace_outliers( + positions, threshold_mad=3.0) + np.testing.assert_array_equal(result, positions) + + # ------------------------------------------------------------------ + # Savitzky-Golay smoothing + # ------------------------------------------------------------------ + def test_savgol_smooth_reduces_noise(self): + """Smoothed noisy signal should have lower variance than raw.""" + np.random.seed(42) + n = 50 + clean = np.linspace(0, 1, n) + noisy = clean + np.random.normal(0, 0.1, n) + smoothed 
= VideoHandMotionSmoothMapper._savgol_smooth( + noisy, window=11, polyorder=3) + residual_raw = np.std(noisy - clean) + residual_smooth = np.std(smoothed - clean) + self.assertLess(residual_smooth, residual_raw) + + def test_savgol_smooth_short(self): + """Data shorter than the window should not crash.""" + data = np.array([1.0, 2.0, 3.0]) + result = VideoHandMotionSmoothMapper._savgol_smooth( + data, window=11, polyorder=3) + self.assertEqual(len(result), 3) + + def test_savgol_smooth_2d(self): + """Should smooth each column independently.""" + np.random.seed(0) + data = np.random.randn(30, 3) + result = VideoHandMotionSmoothMapper._savgol_smooth( + data, window=7, polyorder=2) + self.assertEqual(result.shape, data.shape) + + # ------------------------------------------------------------------ + # orientation smoothing + # ------------------------------------------------------------------ + def test_smooth_orientations_preserves_shape(self): + n = 30 + eulers = np.random.randn(n, 3) * 0.5 + result = VideoHandMotionSmoothMapper._smooth_orientations( + eulers, window=7, polyorder=2) + self.assertEqual(result.shape, (n, 3)) + + # ------------------------------------------------------------------ + # end-to-end process_single + # ------------------------------------------------------------------ + def test_process_single_smooths(self): + """Smoothing should modify states while preserving structure.""" + np.random.seed(42) + n = 30 + positions = np.column_stack([ + np.linspace(0, 1, n) + np.random.normal(0, 0.05, n), + np.zeros(n), + np.zeros(n), + ]) + hand_data = self._make_hand_data(positions, "right") + sample = self._make_sample(right_data=hand_data) + + op = VideoHandMotionSmoothMapper( + hand_action_field="hand_action_tags", + savgol_window=7, + savgol_polyorder=2, + min_frames_for_smoothing=5, + ) + result = op.process_single(sample) + + smoothed = result[Fields.meta]["hand_action_tags"][0]["right"] + self.assertEqual(len(smoothed["states"]), n) + 
self.assertEqual(len(smoothed["actions"]), n) + self.assertEqual(len(smoothed["valid_frame_ids"]), n) + # States should be different from original (smoothed) + orig_states = np.array(hand_data["states"]) + new_states = np.array(smoothed["states"]) + self.assertFalse(np.allclose(orig_states, new_states, atol=1e-6)) + + def test_process_single_preserves_frame_count(self): + """Smoothing should NOT change the number of frames.""" + n = 25 + positions = np.column_stack([ + np.linspace(0, 1, n), + np.zeros(n), + np.zeros(n), + ]) + hand_data = self._make_hand_data(positions, "right") + sample = self._make_sample(right_data=hand_data) + + op = VideoHandMotionSmoothMapper(min_frames_for_smoothing=5) + result = op.process_single(sample) + + smoothed = result[Fields.meta]["hand_action_tags"][0]["right"] + self.assertEqual(len(smoothed["states"]), n) + self.assertEqual(len(smoothed["valid_frame_ids"]), n) + + def test_process_single_too_few_frames(self): + """Fewer frames than threshold → keep original.""" + positions = np.array([[0, 0, 0], [1, 0, 0]], dtype=np.float64) + hand_data = self._make_hand_data(positions, "right") + sample = self._make_sample(right_data=hand_data) + + op = VideoHandMotionSmoothMapper(min_frames_for_smoothing=10) + result = op.process_single(sample) + + smoothed = result[Fields.meta]["hand_action_tags"][0]["right"] + np.testing.assert_array_equal( + np.array(smoothed["states"]), + np.array(hand_data["states"]), + ) + + def test_no_meta_passthrough(self): + sample = {"text": "hello"} + op = VideoHandMotionSmoothMapper() + result = op.process_single(sample) + self.assertEqual(result, sample) + + def test_empty_hand_action(self): + sample = {Fields.meta: {"hand_action_tags": []}} + op = VideoHandMotionSmoothMapper() + result = op.process_single(sample) + self.assertEqual( + result[Fields.meta]["hand_action_tags"], []) + + def test_smooth_joints_world(self): + """joints_world should also be smoothed when enabled.""" + np.random.seed(0) + n = 30 + 
positions = np.column_stack([ + np.linspace(0, 1, n), + np.zeros(n), np.zeros(n), + ]) + hand_data = self._make_hand_data(positions, "right") + # Add noisy joints_world: (n, 21, 3) + joints = np.random.randn(n, 21, 3) * 0.1 + hand_data["joints_world"] = joints.tolist() + sample = self._make_sample(right_data=hand_data) + + op = VideoHandMotionSmoothMapper( + smooth_joints=True, + min_frames_for_smoothing=5, + ) + result = op.process_single(sample) + smoothed = result[Fields.meta]["hand_action_tags"][0]["right"] + self.assertEqual(len(smoothed["joints_world"]), n) + # Should be different (smoothed) + self.assertFalse(np.allclose( + np.array(smoothed["joints_world"]), + joints, atol=1e-6, + )) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/ops/mapper/test_video_hand_reconstruction_hawor_mapper.py b/tests/ops/mapper/test_video_hand_reconstruction_hawor_mapper.py index 4a9cd07890..9142bfef9a 100644 --- a/tests/ops/mapper/test_video_hand_reconstruction_hawor_mapper.py +++ b/tests/ops/mapper/test_video_hand_reconstruction_hawor_mapper.py @@ -3,117 +3,107 @@ import numpy as np from data_juicer.core.data import NestedDataset as Dataset -from data_juicer.ops.mapper.video_hand_reconstruction_hawor_mapper import VideoHandReconstructionHaworMapper -from data_juicer.utils.mm_utils import SpecialTokens -from data_juicer.utils.constant import Fields, MetaKeys +from data_juicer.ops.mapper import VideoExtractFramesMapper, VideoHandReconstructionHaworMapper +from data_juicer.utils.constant import Fields, MetaKeys, CameraCalibrationKeys from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE -@unittest.skip('Users need to download MANO_RIGHT.pkl.') +@unittest.skip('Users need to download MANO_RIGHT.pkl and MANO_LEFT.pkl.') class VideoHandReconstructionHaworMapperTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'data') vid3_path 
= os.path.join(data_path, 'video3.mp4') vid4_path = os.path.join(data_path, 'video4.mp4') - ds_list = [{ - 'videos': [vid3_path] - }, { - 'videos': [vid4_path] - }] + def _build_ds_list(self): + """Build dataset with pre-extracted frames and camera calibration in meta.""" + ds_list = [{ + 'videos': [self.vid3_path], + Fields.meta: { + 'camera_calibration': [{CameraCalibrationKeys.hfov: [0.76] * 6,}], + } + }, { + 'videos': [self.vid4_path], + Fields.meta: { + 'camera_calibration': [{ + CameraCalibrationKeys.hfov: [0.66] * 5, + }], + } + }] - tgt_list = [{ - "fov_x": 0.7572688730116571, - "left_frame_id_list": [2, 7, 8, 9, 10, 28, 33, 34, 36, 38, 39, 43, 44, 45, 46, 47, 48], - "left_beta_list_shape": (17, 10), - "left_hand_pose_list_shape": (17, 15, 3, 3), - "left_global_orient_list_shape": (17, 3, 3), - "left_transl_list_shape": (17, 3), - "right_frame_id_list": [1, 2, 3, 4, 8, 9, 11, 12, 13, 14, 16, 17, 19, 20, 22, 23, 24, 29, 30, 33, 34, 36, 37, 40, 41, 42, 43, 44, 45, 47, 48], - "right_beta_list_shape": (31, 10), - "right_hand_pose_list_shape": (31, 15, 3, 3), - "right_global_orient_list_shape": (31, 3, 3), - "right_transl_list_shape": (31, 3), - }, { - "fov_x": 0.6575318204118722, - "left_frame_id_list": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 17, 18, 19, 20, 21], - "left_beta_list_shape": (17, 10), - "left_hand_pose_list_shape": (17, 15, 3, 3), - "left_global_orient_list_shape": (17, 3, 3), - "left_transl_list_shape": (17, 3), - "right_frame_id_list": [0, 3, 8, 16], - "right_beta_list_shape": (4, 10), - "right_hand_pose_list_shape": (4, 15, 3, 3), - "right_global_orient_list_shape": (4, 3, 3), - "right_transl_list_shape": (4, 3), - }] + extract_op = VideoExtractFramesMapper( + frame_sampling_method='all_keyframes', + output_format='bytes', + legacy_split_by_text_token=False, + ) + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(extract_op.process, batched=True, batch_size=1) + + return dataset.to_list() - def test(self): + def test_default(self): 
+ ds_list = self._build_ds_list() op = VideoHandReconstructionHaworMapper( hawor_model_path="hawor.ckpt", hawor_config_path="model_config.yaml", hawor_detector_path="detector.pt", - moge_model_path="Ruicheng/moge-2-vitl", - mano_right_path="path_to_mano_right_pkl", - frame_num=1, - duration=1, + tag_field_name=MetaKeys.hand_reconstruction_hawor_tags, + mano_right_path='MANO_RIGHT.pkl', + mano_left_path='MANO_LEFT.pkl', + frame_field=MetaKeys.video_frames, + camera_calibration_field='camera_calibration', thresh=0.2, - frame_dir=DATA_JUICER_ASSETS_CACHE, - moge_output_info_dir=DATA_JUICER_ASSETS_CACHE, ) - dataset = Dataset.from_list(self.ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, - column=[{}] * dataset.num_rows) - dataset = dataset.map(op.process, num_proc=1, with_rank=True) - res_list = dataset.to_list() - for sample, target in zip(res_list, self.tgt_list): - self.assertEqual(abs(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["fov_x"] - target["fov_x"]) < 0.01, True) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["left_beta_list"]).shape[1:], target["left_beta_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["left_hand_pose_list"]).shape[1:], target["left_hand_pose_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["left_global_orient_list"]).shape[1:], target["left_global_orient_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["left_transl_list"]).shape[1:], target["left_transl_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_beta_list"]).shape[1:], target["right_beta_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_hand_pose_list"]).shape[1:], 
target["right_hand_pose_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_global_orient_list"]).shape[1:], target["right_global_orient_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_transl_list"]).shape[1:], target["right_transl_list_shape"][1:]) + # Process each sample directly to avoid Arrow type inference + # conflicts when hand detection results vary across samples + # (empty list [] inferred as null vs list>). + res_list = [] + for sample in ds_list: + result = op.process_single(sample) + res_list.append(result) - - def test_mul_proc(self): + for sample in res_list: + tag = sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags] + self.assertIsInstance(tag, list) + self.assertGreater(len(tag), 0) - op = VideoHandReconstructionHaworMapper( - hawor_model_path="hawor.ckpt", - hawor_config_path="model_config.yaml", - hawor_detector_path="detector.pt", - moge_model_path="Ruicheng/moge-2-vitl", - mano_right_path="path_to_mano_right_pkl", - frame_num=1, - duration=1, - thresh=0.2, - frame_dir=DATA_JUICER_ASSETS_CACHE, - moge_output_info_dir=DATA_JUICER_ASSETS_CACHE, - ) - dataset = Dataset.from_list(self.ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, - column=[{}] * dataset.num_rows) - dataset = dataset.map(op.process, num_proc=2, with_rank=True) - res_list = dataset.to_list() + for video_result in tag: + # Check top-level keys + self.assertIn('fov_x', video_result) + self.assertIn('img_focal', video_result) + self.assertIn('left', video_result) + self.assertIn('right', video_result) + + # Check hand output structure (axis-angle format) + for hand_type in ['left', 'right']: + hand = video_result[hand_type] + self.assertIn('frame_ids', hand) + self.assertIn('global_orient', hand) + self.assertIn('hand_pose', hand) + self.assertIn('betas', hand) + self.assertIn('transl', hand) - for 
sample, target in zip(res_list, self.tgt_list): - self.assertEqual(abs(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["fov_x"] - target["fov_x"]) < 0.01, True) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["left_beta_list"]).shape[1:], target["left_beta_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["left_hand_pose_list"]).shape[1:], target["left_hand_pose_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["left_global_orient_list"]).shape[1:], target["left_global_orient_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["left_transl_list"]).shape[1:], target["left_transl_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_beta_list"]).shape[1:], target["right_beta_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_hand_pose_list"]).shape[1:], target["right_hand_pose_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_global_orient_list"]).shape[1:], target["right_global_orient_list_shape"][1:]) - self.assertEqual(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_hawor_tags]["right_transl_list"]).shape[1:], target["right_transl_list_shape"][1:]) + n_frames = len(hand['frame_ids']) + if n_frames > 0: + # global_orient: list of (3,) axis-angle + self.assertEqual( + np.array(hand['global_orient']).shape, + (n_frames, 3)) + # hand_pose: list of (45,) axis-angle + self.assertEqual( + np.array(hand['hand_pose']).shape, + (n_frames, 45)) + # betas: list of (10,) + self.assertEqual( + np.array(hand['betas']).shape, + (n_frames, 10)) + # transl: list of (3,) + self.assertEqual( + np.array(hand['transl']).shape, + (n_frames, 3)) if __name__ == '__main__': - 
unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/ops/mapper/test_video_hand_reconstruction_mapper.py b/tests/ops/mapper/test_video_hand_reconstruction_mapper.py deleted file mode 100644 index a22f0b36d9..0000000000 --- a/tests/ops/mapper/test_video_hand_reconstruction_mapper.py +++ /dev/null @@ -1,106 +0,0 @@ -import os -import unittest -import numpy as np - -from data_juicer.core.data import NestedDataset as Dataset -from data_juicer.ops.mapper.video_hand_reconstruction_mapper import VideoHandReconstructionMapper -from data_juicer.utils.mm_utils import SpecialTokens -from data_juicer.utils.constant import Fields, MetaKeys -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE - - -@unittest.skip('Users need to download MANO_RIGHT.pkl.') -class VideoHandReconstructionMapperTest(DataJuicerTestCaseBase): - data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', - 'data') - vid3_path = os.path.join(data_path, 'video3.mp4') - vid4_path = os.path.join(data_path, 'video4.mp4') - - ds_list = [{ - 'videos': [vid3_path] - }, { - 'videos': [vid4_path] - }] - - tgt_list = [{ - "frame_nums": 49, - "vertices_shape": [2, 778, 3], - "camera_translation_shape": [2, 3], - "if_right_hand_shape": [2], - "joints_shape": [2, 21, 3], - "keypoints_shape": [2, 778, 2] - }, { - "frame_nums": 22, - "vertices_shape": [1, 778, 3], - "camera_translation_shape": [1, 3], - "if_right_hand_shape": [1], - "joints_shape": [1, 21, 3], - "keypoints_shape": [1, 778, 2] - }] - - - def test(self): - - op = VideoHandReconstructionMapper( - wilor_model_path="wilor_final.ckpt", - wilor_model_config="model_config.yaml", - detector_model_path="detector.pt", - mano_right_path="path_to_mano_right_pkl", - frame_num=1, - duration=1, - frame_dir=DATA_JUICER_ASSETS_CACHE, - if_save_visualization=True, - save_visualization_dir=DATA_JUICER_ASSETS_CACHE, - if_save_mesh=True, - 
save_mesh_dir=DATA_JUICER_ASSETS_CACHE, - ) - dataset = Dataset.from_list(self.ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, - column=[{}] * dataset.num_rows) - dataset = dataset.map(op.process, num_proc=1, with_rank=True) - res_list = dataset.to_list() - - for sample, target in zip(res_list, self.tgt_list): - self.assertEqual(len(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["vertices"]), target["frame_nums"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["vertices"][10]).shape), target["vertices_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["camera_translation"][10]).shape), target["camera_translation_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["if_right_hand"][10]).shape), target["if_right_hand_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["joints"][10]).shape), target["joints_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["keypoints"][10]).shape), target["keypoints_shape"]) - - - def test_mul_proc(self): - - op = VideoHandReconstructionMapper( - wilor_model_path="wilor_final.ckpt", - wilor_model_config="model_config.yaml", - detector_model_path="detector.pt", - mano_right_path="path_to_mano_right_pkl", - frame_num=1, - duration=1, - frame_dir=DATA_JUICER_ASSETS_CACHE, - if_save_visualization=True, - save_visualization_dir=DATA_JUICER_ASSETS_CACHE, - if_save_mesh=True, - save_mesh_dir=DATA_JUICER_ASSETS_CACHE, - ) - dataset = Dataset.from_list(self.ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, - column=[{}] * dataset.num_rows) - dataset = dataset.map(op.process, num_proc=2, with_rank=True) - res_list = dataset.to_list() - - for sample, target in zip(res_list, self.tgt_list): - 
self.assertEqual(len(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["vertices"]), target["frame_nums"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["vertices"][10]).shape), target["vertices_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["camera_translation"][10]).shape), target["camera_translation_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["if_right_hand"][10]).shape), target["if_right_hand_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["joints"][10]).shape), target["joints_shape"]) - self.assertEqual(list(np.array(sample[Fields.meta][MetaKeys.hand_reconstruction_tags]["keypoints"][10]).shape), target["keypoints_shape"]) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/ops/mapper/test_video_split_by_duration_mapper.py b/tests/ops/mapper/test_video_split_by_duration_mapper.py index 42b5dceed5..53ae0452f6 100644 --- a/tests/ops/mapper/test_video_split_by_duration_mapper.py +++ b/tests/ops/mapper/test_video_split_by_duration_mapper.py @@ -1,13 +1,16 @@ # flake8: noqa: E501 import os +import shutil +import tempfile import unittest from data_juicer.core.data import NestedDataset as Dataset +from data_juicer.ops.base_op import Fields from data_juicer.ops.mapper.video_split_by_duration_mapper import \ VideoSplitByDurationMapper from data_juicer.utils.file_utils import add_suffix_to_filename -from data_juicer.utils.mm_utils import SpecialTokens +from data_juicer.utils.mm_utils import SpecialTokens, load_file_byte from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase @@ -18,6 +21,12 @@ class VideoSplitByDurationMapperTest(DataJuicerTestCaseBase): vid1_path = os.path.join(data_path, 'video1.mp4') vid2_path = os.path.join(data_path, 'video2.mp4') vid3_path = os.path.join(data_path, 'video3.mp4') + tmp_dir = 
tempfile.TemporaryDirectory().name + + def tearDown(self): + super().tearDown() + if os.path.exists(self.tmp_dir): + shutil.rmtree(self.tmp_dir) def _get_res_list(self, dataset, source_list): res_list = [] @@ -230,6 +239,331 @@ def test_min_last_split_duration(self): keep_original_sample=False) self._run_video_split_by_duration_mapper(op, ds_list, tgt_list) + def test_output_format_bytes(self, save_field=None): + ds_list = [{ + 'text': f'{SpecialTokens.video} 白色的小羊站在一旁讲话。旁边还有两只灰色猫咪和一只拉着灰狼的猫咪。', + 'videos': [self.vid1_path] + }, { + 'text': + f'{SpecialTokens.video} 身穿白色上衣的男子,拿着一个东西,拍打自己的胃部。{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }, { + 'text': + f'{SpecialTokens.video} 两个长头发的女子正坐在一张圆桌前讲话互动。 {SpecialTokens.eoc}', + 'videos': [self.vid3_path] + }] + tgt_list = [{ + 'text': + f'{SpecialTokens.video}{SpecialTokens.video} 白色的小羊站在一旁讲话。旁边还有两只灰色猫咪和一只拉着灰狼的猫咪。{SpecialTokens.eoc}', + 'split_frames_num': 2 + }, { + 'text': + f'{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video} 身穿白色上衣的男子,拿着一个东西,拍打自己的胃部。{SpecialTokens.eoc}', + 'split_frames_num': 3 + }, { + 'text': + f'{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video} 两个长头发的女子正坐在一张圆桌前讲话互动。 {SpecialTokens.eoc}', + 'split_frames_num': 5 + }] + op = VideoSplitByDurationMapper( + split_duration=10, + keep_original_sample=False, + output_format="bytes", + save_field=save_field, + save_dir=self.tmp_dir, + legacy_split_by_text_token=True) + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1) + res_list = dataset.to_list() + + save_field = save_field or "videos" + for i in range(len(ds_list)): + res = res_list[i] + tgt = tgt_list[i] + self.assertEqual(res['text'], tgt['text']) + self.assertEqual(len(res[Fields.source_file]), tgt['split_frames_num']) + for clip_path in res[Fields.source_file]: + self.assertTrue(os.path.exists(clip_path)) + self.assertEqual(len(res[save_field]), tgt['split_frames_num']) + 
self.assertTrue(all(isinstance(v, bytes) for v in res[save_field])) + + def test_output_format_bytes_save_field(self): + self.test_output_format_bytes(save_field="clips") + + def test_input_video_bytes(self): + ds_list = [{ + 'text': f'{SpecialTokens.video} 白色的小羊站在一旁讲话。旁边还有两只灰色猫咪和一只拉着灰狼的猫咪。', + 'videos': [load_file_byte(self.vid1_path)] + }, { + 'text': + f'{SpecialTokens.video} 身穿白色上衣的男子,拿着一个东西,拍打自己的胃部。{SpecialTokens.eoc}', + 'videos': [load_file_byte(self.vid2_path)] + }, { + 'text': + f'{SpecialTokens.video} 两个长头发的女子正坐在一张圆桌前讲话互动。 {SpecialTokens.eoc}', + 'videos': [load_file_byte(self.vid3_path)] + }] + tgt_list = [{ + 'text': + f'{SpecialTokens.video}{SpecialTokens.video} 白色的小羊站在一旁讲话。旁边还有两只灰色猫咪和一只拉着灰狼的猫咪。{SpecialTokens.eoc}', + 'split_frames_num': 2 + }, { + 'text': + f'{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video} 身穿白色上衣的男子,拿着一个东西,拍打自己的胃部。{SpecialTokens.eoc}', + 'split_frames_num': 3 + }, { + 'text': + f'{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video} 两个长头发的女子正坐在一张圆桌前讲话互动。 {SpecialTokens.eoc}', + 'split_frames_num': 5 + }] + + save_field = "clips" + op = VideoSplitByDurationMapper( + split_duration=10, + keep_original_sample=False, + output_format="bytes", + save_field=save_field, + save_dir=self.tmp_dir, + legacy_split_by_text_token=True, + video_backend="ffmpeg") + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1) + res_list = dataset.to_list() + + for i in range(len(ds_list)): + res = res_list[i] + tgt = tgt_list[i] + self.assertEqual(res['text'], tgt['text']) + self.assertEqual(len(res[Fields.source_file]), tgt['split_frames_num']) + for clip_path in res[Fields.source_file]: + self.assertTrue(os.path.exists(clip_path)) + self.assertEqual(len(res[save_field]), tgt['split_frames_num']) + self.assertTrue(all(isinstance(v, bytes) for v in res[save_field])) + + + # ─── overlap_duration tests ─────────────────────────────────── + + def 
test_overlap_basic(self): + """split=10, overlap=5 → step=5. + video1(11.76s): [0-10],[5-11.76] → 2 clips + video2(23.17s): [0-10],[5-15],[10-20],[15-23.17] → 4 clips + video3(49.58s): 9 clips + """ + ds_list = [{ + 'text': f'{SpecialTokens.video} vid1.', + 'videos': [self.vid1_path] + }, { + 'text': f'{SpecialTokens.video} vid2.{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }, { + 'text': f'{SpecialTokens.video} vid3.{SpecialTokens.eoc}', + 'videos': [self.vid3_path] + }] + tgt_list = [{ + 'text': f'{SpecialTokens.video}{SpecialTokens.video} vid1.{SpecialTokens.eoc}', + 'split_frames_num': [2] + }, { + 'text': f'{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video} vid2.{SpecialTokens.eoc}', + 'split_frames_num': [4] + }, { + 'text': f'{"".join([SpecialTokens.video] * 9)} vid3.{SpecialTokens.eoc}', + 'split_frames_num': [9] + }] + op = VideoSplitByDurationMapper( + split_duration=10, + overlap_duration=5, + keep_original_sample=False) + self._run_video_split_by_duration_mapper(op, ds_list, tgt_list) + + def test_overlap_more_clips_than_no_overlap(self): + """Overlap produces more clips than no-overlap for the same video. 
+ video2(23.17s): + no-overlap split=10: [0-10],[10-20],[20-23.17] → 3 clips + overlap=5 split=10: [0-10],[5-15],[10-20],[15-23.17] → 4 clips + """ + ds_list = [{ + 'text': f'{SpecialTokens.video} vid2.{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }] + op_no_overlap = VideoSplitByDurationMapper( + split_duration=10, + overlap_duration=0, + keep_original_sample=False) + op_overlap = VideoSplitByDurationMapper( + split_duration=10, + overlap_duration=5, + keep_original_sample=False) + + dataset_no = Dataset.from_list(ds_list) + dataset_no = dataset_no.map(op_no_overlap.process, num_proc=1) + no_clips = len(dataset_no.to_list()[0]['videos']) + + dataset_ov = Dataset.from_list(ds_list) + dataset_ov = dataset_ov.map(op_overlap.process, num_proc=1) + ov_clips = len(dataset_ov.to_list()[0]['videos']) + + self.assertEqual(no_clips, 3) + self.assertEqual(ov_clips, 4) + self.assertGreater(ov_clips, no_clips) + + def test_overlap_short_video_no_split(self): + """Video shorter than split_duration → return original, overlap irrelevant. + video1(11.76s) with split=20, overlap=5: no split. + """ + ds_list = [{ + 'text': f'{SpecialTokens.video} vid1.', + 'videos': [self.vid1_path] + }] + op = VideoSplitByDurationMapper( + split_duration=20, + overlap_duration=5, + keep_original_sample=False) + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1) + res = dataset.to_list()[0] + # Original path returned as-is + self.assertEqual(res['videos'], [self.vid1_path]) + + def test_overlap_with_min_last_split_duration(self): + """split=10, overlap=5, min_last=10 → drop short last segments. 
+ video1(11.76s): [0-10], then start=5 remaining=6.76<10 → 1 clip + video2(23.17s): [0-10],[5-15],[10-20], then start=15 remaining=8.17<10 → 3 clips + video3(49.58s): 8 clips, then start=40 remaining=9.58<10 → 8 clips + """ + ds_list = [{ + 'text': f'{SpecialTokens.video} vid1.', + 'videos': [self.vid1_path] + }, { + 'text': f'{SpecialTokens.video} vid2.{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }, { + 'text': f'{SpecialTokens.video} vid3.{SpecialTokens.eoc}', + 'videos': [self.vid3_path] + }] + tgt_list = [{ + 'text': f'{SpecialTokens.video} vid1.{SpecialTokens.eoc}', + 'split_frames_num': [1] + }, { + 'text': f'{SpecialTokens.video}{SpecialTokens.video}{SpecialTokens.video} vid2.{SpecialTokens.eoc}', + 'split_frames_num': [3] + }, { + 'text': f'{"".join([SpecialTokens.video] * 8)} vid3.{SpecialTokens.eoc}', + 'split_frames_num': [8] + }] + op = VideoSplitByDurationMapper( + split_duration=10, + overlap_duration=5, + min_last_split_duration=10, + keep_original_sample=False) + self._run_video_split_by_duration_mapper(op, ds_list, tgt_list) + + def test_overlap_bytes_output(self): + """Overlap with bytes output format.""" + ds_list = [{ + 'text': f'{SpecialTokens.video} vid2.{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }] + save_field = "clips" + op = VideoSplitByDurationMapper( + split_duration=10, + overlap_duration=5, + keep_original_sample=False, + output_format="bytes", + save_field=save_field, + save_dir=self.tmp_dir, + legacy_split_by_text_token=True) + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1) + res = dataset.to_list()[0] + + # video2(23.17s), split=10, overlap=5 → 4 clips + self.assertEqual(len(res[save_field]), 4) + self.assertTrue(all(isinstance(v, bytes) for v in res[save_field])) + self.assertEqual(len(res[Fields.source_file]), 4) + + def test_overlap_non_legacy_save_field(self): + """Overlap with non-legacy mode (save_field, no text token update).""" + ds_list = [{ + 'text': '', + 'videos': 
[self.vid3_path] + }] + save_field = "clips" + op = VideoSplitByDurationMapper( + split_duration=10, + overlap_duration=5, + keep_original_sample=False, + save_field=save_field, + save_dir=self.tmp_dir, + legacy_split_by_text_token=False) + + dataset = Dataset.from_list(ds_list) + dataset = dataset.map(op.process, num_proc=1) + res = dataset.to_list()[0] + + # video3(49.58s), split=10, overlap=5 → 9 clips + self.assertEqual(len(res[save_field]), 9) + # Original videos field untouched + self.assertEqual(res['videos'], [self.vid3_path]) + + def test_overlap_zero_same_as_default(self): + """overlap_duration=0 produces identical results to the default.""" + ds_list = [{ + 'text': f'{SpecialTokens.video} vid2.{SpecialTokens.eoc}', + 'videos': [self.vid2_path] + }] + op_default = VideoSplitByDurationMapper( + split_duration=10, + keep_original_sample=False) + op_zero = VideoSplitByDurationMapper( + split_duration=10, + overlap_duration=0, + keep_original_sample=False) + + dataset_d = Dataset.from_list(ds_list) + dataset_d = dataset_d.map(op_default.process, num_proc=1) + res_d = dataset_d.to_list()[0] + + dataset_z = Dataset.from_list(ds_list) + dataset_z = dataset_z.map(op_zero.process, num_proc=1) + res_z = dataset_z.to_list()[0] + + self.assertEqual(len(res_d['videos']), len(res_z['videos'])) + self.assertEqual(res_d['text'], res_z['text']) + + def test_overlap_validation_negative(self): + """overlap_duration < 0 should raise AssertionError.""" + with self.assertRaises(AssertionError): + VideoSplitByDurationMapper(split_duration=10, overlap_duration=-1) + + def test_overlap_validation_exceeds_split(self): + """overlap_duration >= split_duration should raise AssertionError.""" + with self.assertRaises(AssertionError): + VideoSplitByDurationMapper(split_duration=10, overlap_duration=10) + with self.assertRaises(AssertionError): + VideoSplitByDurationMapper(split_duration=10, overlap_duration=15) + + def test_overlap_multi_chunk(self): + """Overlap with multi-video 
sample. + video1(11.76s) split=10,overlap=5: 2 clips + video3(49.58s) split=10,overlap=5: 9 clips + """ + ds_list = [{ + 'text': f'{SpecialTokens.video} v1.{SpecialTokens.eoc}{SpecialTokens.video} v3.{SpecialTokens.eoc}', + 'videos': [self.vid1_path, self.vid3_path] + }] + tgt_list = [{ + 'text': f'{SpecialTokens.video}{SpecialTokens.video} v1.{SpecialTokens.eoc}{"".join([SpecialTokens.video] * 9)} v3.{SpecialTokens.eoc}', + 'split_frames_num': [2, 9] + }] + op = VideoSplitByDurationMapper( + split_duration=10, + overlap_duration=5, + keep_original_sample=False) + self._run_video_split_by_duration_mapper(op, ds_list, tgt_list) + if __name__ == '__main__': unittest.main() diff --git a/tests/ops/mapper/test_video_trajectory_overlay_mapper.py b/tests/ops/mapper/test_video_trajectory_overlay_mapper.py new file mode 100644 index 0000000000..2f3fcce5ca --- /dev/null +++ b/tests/ops/mapper/test_video_trajectory_overlay_mapper.py @@ -0,0 +1,309 @@ +import os +import shutil +import tempfile +import unittest + +import cv2 +import numpy as np + +from data_juicer.ops.base_op import Fields +from data_juicer.ops.mapper.video_trajectory_overlay_mapper import \ + VideoTrajectoryOverlayMapper +from data_juicer.utils.constant import CameraCalibrationKeys, MetaKeys +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + + +class VideoTrajectoryOverlayMapperTest(DataJuicerTestCaseBase): + """Tests for VideoTrajectoryOverlayMapper.""" + + def setUp(self): + super().setUp() + self.tmp_dir = tempfile.mkdtemp() + self.frames_dir = os.path.join(self.tmp_dir, "frames") + self.overlay_dir = os.path.join(self.tmp_dir, "overlays") + os.makedirs(self.frames_dir, exist_ok=True) + + def tearDown(self): + super().tearDown() + if os.path.exists(self.tmp_dir): + shutil.rmtree(self.tmp_dir) + + # ------------------------------------------------------------------ + # helpers + # ------------------------------------------------------------------ + def _create_dummy_frames(self, 
n_frames=30, width=640, height=480): + """Create dummy frame images and return their paths.""" + paths = [] + for i in range(n_frames): + img = np.random.randint(0, 255, (height, width, 3), + dtype=np.uint8) + path = os.path.join(self.frames_dir, f"frame_{i:04d}.jpg") + cv2.imwrite(path, img) + paths.append(path) + return paths + + def _make_sample(self, n_frames=30, n_segments=2): + """Build a sample with dummy segments and camera data.""" + frame_paths = self._create_dummy_frames(n_frames) + + # Identity camera poses + cam_c2w = [np.eye(4).tolist() for _ in range(n_frames)] + + # Simple intrinsics K matrix (fx=320, fy=320, cx=320, cy=240) + K = [[320.0, 0, 320.0], [0, 320.0, 240.0], [0, 0, 1]] + intrinsics_list = [K for _ in range(n_frames)] + + # Simple segments + frames_per_seg = n_frames // n_segments + segments = [] + for s in range(n_segments): + start = s * frames_per_seg + end = min((s + 1) * frames_per_seg - 1, n_frames - 1) + n = end - start + 1 + states = np.zeros((n, 8), dtype=np.float32) + # Build joints_world: (n, 21, 3) — use joint 9 as palm + joints_world = np.zeros((n, 21, 3), dtype=np.float32) + # Linear motion for palm (joint 9) + for i in range(n): + palm_pos = np.array([ + (start + i) * 0.01, + (start + i) * 0.005, + 1.0, # z=1 so projection works + ]) + joints_world[i, 9] = palm_pos + # Wrist (joint 0) slightly offset from palm + joints_world[i, 0] = palm_pos + np.array([-0.02, 0.01, 0]) + states[i, 0:3] = palm_pos # states for fallback + segments.append({ + "hand_type": "right" if s % 2 == 0 else "left", + "segment_id": s, + "start_frame": start, + "end_frame": end, + "states": states.tolist(), + "actions": [], + "valid_frame_ids": list(range(start, end + 1)), + "joints_world": joints_world.tolist(), + }) + + return { + "video_frames": frame_paths, + Fields.meta: { + "atomic_action_segments": segments, + MetaKeys.video_camera_pose_tags: [{ + CameraCalibrationKeys.cam_c2w: cam_c2w, + }], + MetaKeys.camera_calibration_moge_tags: [{ + 
CameraCalibrationKeys.hfov: [1.0], + CameraCalibrationKeys.intrinsics: intrinsics_list, + }], + }, + } + + # ------------------------------------------------------------------ + # projection helpers + # ------------------------------------------------------------------ + def test_world_to_camera_identity(self): + """Identity c2w should return the same position.""" + pos = np.array([1.0, 2.0, 3.0]) + c2w = np.eye(4) + result = VideoTrajectoryOverlayMapper._world_to_camera(pos, c2w) + np.testing.assert_allclose(result, pos, atol=1e-10) + + def test_world_to_camera_translation(self): + """Translation-only c2w should shift the position.""" + pos = np.array([5.0, 5.0, 5.0]) + c2w = np.eye(4) + c2w[:3, 3] = [1.0, 2.0, 3.0] + result = VideoTrajectoryOverlayMapper._world_to_camera(pos, c2w) + # cam = (world - t) @ R = (pos - t) + expected = np.array([4.0, 3.0, 2.0]) + np.testing.assert_allclose(result, expected, atol=1e-10) + + def test_project_to_2d_center_fov(self): + """A point on the optical axis should project to the image center + when using fov_x fallback.""" + pos_cam = np.array([0.0, 0.0, 1.0]) + fov_x = np.pi / 2 # 90 degrees + w, h = 640, 480 + result = VideoTrajectoryOverlayMapper._project_to_2d( + pos_cam, w, h, fov_x=fov_x) + np.testing.assert_allclose(result, [320.0, 240.0], atol=1e-10) + + def test_project_to_2d_center_K(self): + """A point on the optical axis should project to (cx, cy) + when using a K intrinsics matrix.""" + pos_cam = np.array([0.0, 0.0, 1.0]) + K = np.array([[320.0, 0, 320.0], + [0, 320.0, 240.0], + [0, 0, 1.0]]) + w, h = 640, 480 + result = VideoTrajectoryOverlayMapper._project_to_2d( + pos_cam, w, h, K=K) + np.testing.assert_allclose(result, [320.0, 240.0], atol=1e-10) + + def test_project_to_2d_batch(self): + """Batch projection should work for multiple points.""" + pos_cam = np.array([ + [0.0, 0.0, 1.0], + [0.1, 0.0, 1.0], + ]) + result = VideoTrajectoryOverlayMapper._project_to_2d( + pos_cam, 640, 480, fov_x=np.pi / 2) + 
self.assertEqual(result.shape, (2, 2)) + + def test_project_to_2d_batch_K(self): + """Batch projection with K matrix should work for multiple points.""" + pos_cam = np.array([ + [0.0, 0.0, 1.0], + [0.1, 0.0, 1.0], + ]) + K = np.array([[320.0, 0, 320.0], + [0, 320.0, 240.0], + [0, 0, 1.0]]) + result = VideoTrajectoryOverlayMapper._project_to_2d( + pos_cam, 640, 480, K=K) + self.assertEqual(result.shape, (2, 2)) + # First point on optical axis → center + np.testing.assert_allclose(result[0], [320.0, 240.0], atol=1e-10) + # Second point offset in x → u > cx + self.assertGreater(result[1, 0], 320.0) + + # ------------------------------------------------------------------ + # temporal color gradient + # ------------------------------------------------------------------ + def test_temporal_color_blue_at_start(self): + """t=0 should give blue (BGR: high B, low G, low R).""" + b, g, r = VideoTrajectoryOverlayMapper._temporal_color(0.0) + self.assertEqual(b, 255) + self.assertEqual(g, 0) + self.assertEqual(r, 0) + + def test_temporal_color_green_at_mid(self): + """t=0.5 should give green (BGR: low B, high G, low R).""" + b, g, r = VideoTrajectoryOverlayMapper._temporal_color(0.5) + self.assertEqual(b, 0) + self.assertEqual(g, 255) + self.assertEqual(r, 0) + + def test_temporal_color_red_at_end(self): + """t=1.0 should give red (BGR: low B, low G, high R).""" + b, g, r = VideoTrajectoryOverlayMapper._temporal_color(1.0) + self.assertEqual(b, 0) + self.assertEqual(g, 0) + self.assertEqual(r, 255) + + def test_temporal_color_gradient_monotonic(self): + """Blue should decrease and red should increase over time.""" + colors = [VideoTrajectoryOverlayMapper._temporal_color(t) + for t in np.linspace(0, 1, 11)] + blues = [c[0] for c in colors] + reds = [c[2] for c in colors] + # Blue starts high and ends low + self.assertGreater(blues[0], blues[-1]) + # Red starts low and ends high + self.assertLess(reds[0], reds[-1]) + + # 
------------------------------------------------------------------ + # draw trajectory + # ------------------------------------------------------------------ + def test_draw_trajectory_returns_image(self): + """_draw_trajectory should return an image of the same shape.""" + frame = np.zeros((480, 640, 3), dtype=np.uint8) + points = np.array([[100, 200], [150, 250], [200, 300]], + dtype=np.float64) + op = VideoTrajectoryOverlayMapper() + result = op._draw_trajectory(frame, points, current_idx=0) + self.assertEqual(result.shape, frame.shape) + # Should have drawn something (not all zeros) + self.assertGreater(np.sum(result), 0) + + def test_draw_trajectory_out_of_bounds(self): + """Points outside frame should not crash.""" + frame = np.zeros((480, 640, 3), dtype=np.uint8) + points = np.array([[-100, -100], [700, 500], [320, 240]], + dtype=np.float64) + op = VideoTrajectoryOverlayMapper() + result = op._draw_trajectory(frame, points, current_idx=2) + self.assertEqual(result.shape, frame.shape) + + # ------------------------------------------------------------------ + # end-to-end process_single + # ------------------------------------------------------------------ + def test_process_single_creates_overlays(self): + """process_single should create overlay images for each segment.""" + sample = self._make_sample(n_frames=30, n_segments=2) + op = VideoTrajectoryOverlayMapper( + segment_field="atomic_action_segments", + save_dir=self.overlay_dir, + n_sample_frames=4, + ) + result = op.process_single(sample) + segments = result[Fields.meta]["atomic_action_segments"] + + for seg in segments: + overlay_paths = seg.get("overlay_frames", []) + self.assertGreater(len(overlay_paths), 0, + f"No overlays for {seg['hand_type']} " + f"seg{seg['segment_id']}") + for p in overlay_paths: + self.assertTrue(os.path.exists(p), + f"Overlay file not found: {p}") + + sampled = seg.get("sampled_frame_indices", []) + self.assertEqual(len(sampled), len(overlay_paths)) + + def 
test_process_single_no_segments(self): + """No segments → sample unchanged.""" + sample = { + "video_frames": [], + Fields.meta: {"atomic_action_segments": []}, + } + op = VideoTrajectoryOverlayMapper() + result = op.process_single(sample) + self.assertEqual(result[Fields.meta]["atomic_action_segments"], []) + + def test_process_single_no_camera(self): + """Missing camera data → segments get empty overlay_frames.""" + frame_paths = self._create_dummy_frames(10) + sample = { + "video_frames": frame_paths, + Fields.meta: { + "atomic_action_segments": [{ + "hand_type": "right", + "segment_id": 0, + "start_frame": 0, + "end_frame": 9, + "states": np.zeros((10, 8)).tolist(), + "valid_frame_ids": list(range(10)), + }], + MetaKeys.video_camera_pose_tags: [], + }, + } + op = VideoTrajectoryOverlayMapper(save_dir=self.overlay_dir) + result = op.process_single(sample) + # Should not crash, just skip + self.assertIn("atomic_action_segments", result[Fields.meta]) + + def test_n_sample_frames_respected(self): + """Number of overlay frames should match n_sample_frames.""" + sample = self._make_sample(n_frames=30, n_segments=1) + n_sample = 4 + op = VideoTrajectoryOverlayMapper( + segment_field="atomic_action_segments", + save_dir=self.overlay_dir, + n_sample_frames=n_sample, + ) + result = op.process_single(sample) + seg = result[Fields.meta]["atomic_action_segments"][0] + self.assertEqual(len(seg["overlay_frames"]), n_sample) + + def test_no_meta_passthrough(self): + """Sample without meta should pass through.""" + sample = {"text": "test"} + op = VideoTrajectoryOverlayMapper() + result = op.process_single(sample) + self.assertEqual(result, sample) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/ops/mapper/test_video_undistort_mapper.py b/tests/ops/mapper/test_video_undistort_mapper.py index 7ab3c2b753..0a1168007d 100644 --- a/tests/ops/mapper/test_video_undistort_mapper.py +++ b/tests/ops/mapper/test_video_undistort_mapper.py @@ -1,59 +1,73 @@ import os 
import unittest +import numpy as np +import shutil +import tempfile from data_juicer.core.data import NestedDataset as Dataset from data_juicer.ops.mapper.video_undistort_mapper import VideoUndistortMapper -from data_juicer.utils.constant import Fields, MetaKeys +from data_juicer.utils.constant import Fields, MetaKeys, CameraCalibrationKeys from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase -from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE -@unittest.skip("Due to strange AttributeError: module 'cv2.omnidir' has no attribute 'initUndistortRectifyMap', " - "which won't happen when running this test independently.") + class VideoUndistortMapperTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'data') vid3_path = os.path.join(data_path, 'video3.mp4') vid12_path = os.path.join(data_path, 'video12.mp4') + temp_dir = tempfile.TemporaryDirectory().name + + def tearDown(self) -> None: + super().tearDown() + + if os.path.exists(self.temp_dir): + shutil.rmtree(self.temp_dir) def _run_and_assert(self, output_video_dir, num_proc): ds_list = [{ 'videos': [self.vid3_path], - 'intrinsics': [[465.4728460758426, 0, 181.0], [0, 465.4728460758426, 320.0], [0, 0, 1]], - 'distortion_coefficients': None, - 'xi': 0.203957462310791, - 'rotation_matrix': None, - 'intrinsics_new': None + Fields.meta: { + 'camera_calibration': [{ + CameraCalibrationKeys.intrinsics: [[465.4728460758426, 0, 181.0], [0, 465.4728460758426, 320.0], [0, 0, 1]], + CameraCalibrationKeys.xi: [0.203957462310791], + CameraCalibrationKeys.dist_coeffs: None, + CameraCalibrationKeys.rectify_R: None, + CameraCalibrationKeys.new_intrinsics: None, + }], + } }, { 'videos': [self.vid12_path], - 'intrinsics': [[1227.3657989501953, 0, 960.0], [0, 1227.3657989501953, 540.0], [0, 0, 1]], - 'distortion_coefficients': None, - 'xi': 0.33518279, - 'rotation_matrix': None, - 'intrinsics_new': None + Fields.meta: { + 'camera_calibration': [{ + 
CameraCalibrationKeys.intrinsics: [[1227.3657989501953, 0, 960.0], [0, 1227.3657989501953, 540.0], [0, 0, 1]], + CameraCalibrationKeys.xi: [0.33518279], + CameraCalibrationKeys.dist_coeffs: None, + CameraCalibrationKeys.rectify_R: None, + CameraCalibrationKeys.new_intrinsics: None, + }], + } }] - tgt_key_names = ["new_video_path"] - op = VideoUndistortMapper( - output_video_dir=output_video_dir + output_video_dir=output_video_dir, + camera_calibration_field='camera_calibration', ) dataset = Dataset.from_list(ds_list) - if Fields.meta not in dataset.features: - dataset = dataset.add_column(name=Fields.meta, - column=[{}] * dataset.num_rows) dataset = dataset.map(op.process, num_proc=num_proc, with_rank=True) res_list = dataset.to_list() for sample in res_list: - self.assertEqual(list(sample[Fields.meta][MetaKeys.video_undistortion_tags].keys()), tgt_key_names) + tag_list = sample[MetaKeys.undistorted_video] + self.assertIsInstance(tag_list, list) + self.assertGreater(len(tag_list), 0) def test(self): - self._run_and_assert(output_video_dir=os.path.join(DATA_JUICER_ASSETS_CACHE, "output_video1"), num_proc=1) + self._run_and_assert(output_video_dir=os.path.join(self.temp_dir, "output_video1"), num_proc=1) def test_mul_proc(self): - self._run_and_assert(output_video_dir=os.path.join(DATA_JUICER_ASSETS_CACHE, "output_video2"), num_proc=2) + self._run_and_assert(output_video_dir=os.path.join(self.temp_dir, "output_video2"), num_proc=2) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main()