Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
8526228
update vla ops
Cathy0908 Mar 6, 2026
d24d91a
add new ops and update havor op
Mar 11, 2026
b7f467e
fix op
Mar 11, 2026
fb52cec
add vis demo
Mar 11, 2026
e77d7d7
update vis demo tools
Mar 12, 2026
ba47fb6
update vis tools
Mar 12, 2026
570c791
update ops
Mar 12, 2026
8fe0698
add vla_pipeline demo
Cathy0908 Mar 13, 2026
27b9aa5
update vla demo
Cathy0908 Mar 13, 2026
f148ce7
update code
Cathy0908 Mar 13, 2026
f608d13
merge main
Cathy0908 Mar 13, 2026
17c707b
update
Cathy0908 Mar 13, 2026
7d94c72
add ut
Cathy0908 Mar 13, 2026
e02b585
rename demo name
Cathy0908 Mar 13, 2026
95ccec5
update dir name
Cathy0908 Mar 13, 2026
3b497cd
fix pre-commit check
Cathy0908 Mar 13, 2026
94efd7a
remove old ut
Cathy0908 Mar 13, 2026
01699d0
update operator.md
Cathy0908 Mar 13, 2026
db06f38
add action caption
Cathy0908 Mar 17, 2026
527f4d1
update readme
Cathy0908 Mar 17, 2026
6a22558
Fix merge conflicts: handle deleted files and manual merge for video_…
Cathy0908 Mar 18, 2026
5ef6256
fix ut
Cathy0908 Mar 18, 2026
51fd226
update video action caption
Cathy0908 Mar 18, 2026
628cc96
support output bytes for video_split_by_duration_mapper
Cathy0908 Mar 18, 2026
8579334
support video bytes for export_to_lerobot_mapper
Cathy0908 Mar 18, 2026
c384a6b
optimize video_hand_reconstruction_hawor_mapper
Cathy0908 Mar 19, 2026
c1a1d24
update vla_pipeline
Cathy0908 Mar 19, 2026
3c3d4f9
optimize performance of ops
Cathy0908 Mar 23, 2026
a007178
optimize performance: tolist -> numpy
Cathy0908 Mar 23, 2026
69989d0
remove comments
Cathy0908 Mar 23, 2026
030eacf
add joints_cam info, shape=(T, 21, 3)
Cathy0908 Mar 24, 2026
91d2a47
update ops
Cathy0908 Mar 25, 2026
36faca9
add droid_buffer param for megasam
Cathy0908 Mar 26, 2026
581cf45
support save depth, points, mask to file
Cathy0908 Mar 26, 2026
e58172e
add Dockerfile for ego pose demo
Cathy0908 Mar 27, 2026
380efac
update vla pipeline
Cathy0908 Mar 27, 2026
d4f72cc
update vis tools
Cathy0908 Mar 27, 2026
d586c10
update vla_pipleline.pty
Cathy0908 Mar 27, 2026
237340b
add new ops:VideoHandMotionSmoothMapper,VideoClipReassemblyMapper
Cathy0908 Mar 31, 2026
eb70c76
update operators.md
Cathy0908 Mar 31, 2026
b519a52
add new ops
Cathy0908 Apr 1, 2026
a0fcb17
update ops
Cathy0908 Apr 1, 2026
137ca1c
optimize ops
Cathy0908 Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions data_juicer/config/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -560,18 +560,18 @@ process:
if_output_point_maps_from_projection: True # Determines whether to output point maps directly inferred by VGGT.
if_output_point_maps_from_unprojection: True # Determines whether to output point maps constructed from depth maps and camera parameters.
if_output_point_tracks: True # Determines whether to output point tracks.
- video_camera_calibration_static_deepcalib_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib.
- video_camera_calibration_deepcalib_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using DeepCalib.
model_path: "weights_10_0.02.h5" # The path to the DeepCalib Regression model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'static_camera_calibration_deepcalib_tags' # the field name to store the tags. It's "static_camera_calibration_deepcalib_tags" in default.
tag_field_name: 'camera_calibration_deepcalib_tags' # the field name to store the tags. It's "camera_calibration_deepcalib_tags" in default.
frame_dir: None # Output directory to save extracted frames.
output_info_dir: None # Output directory for saving camera parameters.
- video_camera_calibration_static_moge_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib).
- video_camera_calibration_moge_mapper: # Compute the camera intrinsics and field of view (FOV) for a static camera using Moge-2 (more accurate than DeepCalib).
model_path: "Ruicheng/moge-2-vitl" # The path to the Moge-2 model.
frame_num: 3 # the number of frames to be extracted uniformly from the video. If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
duration: 0 # the duration of each segment in seconds. If 0, frames are extracted from the entire video. If duration > 0, the video is segmented into multiple segments based on duration, and frames are extracted from each segment.
tag_field_name: 'static_camera_calibration_moge_tags' # the field name to store the tags. It's "static_camera_calibration_moge_tags" in default.
tag_field_name: 'camera_calibration_moge_tags' # the field name to store the tags. It's "camera_calibration_moge_tags" in default.
frame_dir: None # Output directory to save extracted frames.
if_output_info: True # Whether to save the camera parameters results to a JSON file.
output_info_dir: None # Output directory for saving camera parameters.
Expand Down Expand Up @@ -972,6 +972,7 @@ process:
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the sampling rate in frames per second used to compute optical flow
original_fps: null # the original FPS of the video from which the frames were extracted, only used when frame_field is specified
size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w)
max_size: null # maximum allowed for the longer edge of resized frames
divisible: 1 # The number that the dimensions must be divisible by.
Expand All @@ -981,6 +982,7 @@ process:
min_score: 1.0 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the sampling rate in frames per second used to compute optical flow
original_fps: null # the original FPS of the video from which the frames were extracted, only used when frame_field is specified
size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w)
max_size: null # maximum allowed for the longer edge of resized frames
divisible: 8 # The number that the dimensions must be divisible by.
Expand Down
19 changes: 19 additions & 0 deletions data_juicer/ops/common/mano_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,25 @@ def forward(self, *args, **kwargs) -> smplx.utils.MANOOutput:
mano_output.joints = joints
return mano_output

@classmethod
def build_left(cls, model_path, fix_shapedirs=True):
    """Construct a MANO model configured for the LEFT hand.

    HaWoR relies on a dedicated MANO_LEFT.pkl loaded with
    ``is_rhand=False``; the official left-hand asset additionally needs a
    sign flip on the first shape-blend axis to work around a known smplx
    issue (https://github.com/vchoutas/smplx/issues/48).

    Args:
        model_path: Path to MANO_LEFT.pkl
        fix_shapedirs: Apply the left-hand shapedirs fix (default True).

    Returns:
        MANO model configured for left hand.
    """
    left_hand = cls(model_path=model_path, is_rhand=False)
    if fix_shapedirs:
        # negate the x-component of the shape blend shapes (left-hand fix)
        left_hand.shapedirs[:, 0, :] *= -1
    return left_hand

def query(self, hmr_output):
batch_size = hmr_output["pred_rotmat"].shape[0]
pred_rotmat = hmr_output["pred_rotmat"].reshape(batch_size, -1, 3, 3)
Expand Down
20 changes: 19 additions & 1 deletion data_juicer/ops/filter/video_motion_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
max_score: float = sys.float_info.max,
frame_field: Optional[str] = None,
sampling_fps: PositiveFloat = 2,
original_fps: Optional[PositiveFloat] = None,
size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None,
max_size: Optional[PositiveInt] = None,
divisible: PositiveInt = 1,
Expand All @@ -72,6 +73,11 @@ def __init__(
If frame_field is None, extract frames from the video field.
:param sampling_fps: The sampling rate in frames_per_second for
optical flow calculations.
:param original_fps: The original FPS of the video from which the
frames were extracted. Only used when `frame_field` is specified.
When provided, frames will be sampled at `sampling_fps` rate
by computing `sampling_step = round(original_fps / sampling_fps)`.
If None, all frames will be processed without sampling.
:param size: Resize frames before computing optical flow. If size is a
sequence like (h, w), frame size will be matched to this. If size
is an int, smaller edge of frames will be matched to this number.
Expand Down Expand Up @@ -101,6 +107,7 @@ def __init__(
self.min_score = min_score
self.max_score = max_score
self.sampling_fps = sampling_fps
self.original_fps = original_fps
self.frame_field = frame_field

if isinstance(size, (list, tuple)):
Expand Down Expand Up @@ -198,7 +205,18 @@ def _compute_motion_scores_from_frames(self, frames):
video_motion_scores = []
optical_flows = []
prev_frame = None
for frame in frames:

# compute sampling step if original_fps is provided
sampling_step = 1
if self.original_fps is not None and self.original_fps > 0:
effective_fps = min(self.sampling_fps, self.original_fps)
sampling_step = max(round(self.original_fps / effective_fps), 1)

for frame_idx, frame in enumerate(frames):
# skip frames according to sampling_step
if sampling_step > 1 and frame_idx % sampling_step != 0:
continue

if isinstance(frame, bytes):
image_array = np.frombuffer(frame, dtype=np.uint8)
frame = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
Expand Down
2 changes: 2 additions & 0 deletions data_juicer/ops/filter/video_motion_score_ptlflow_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def __init__(
ckpt_path: Optional[str] = "things",
get_model_args: Optional[dict] = None,
sampling_fps: PositiveFloat = 2,
original_fps: Optional[PositiveFloat] = None,
size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None,
max_size: Optional[PositiveInt] = None,
divisible: PositiveInt = 8,
Expand All @@ -70,6 +71,7 @@ def __init__(
max_score,
frame_field,
sampling_fps,
original_fps,
size,
max_size,
divisible,
Expand Down
2 changes: 2 additions & 0 deletions data_juicer/ops/filter/video_motion_score_raft_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __init__(
max_score: float = sys.float_info.max,
frame_field: Optional[str] = None,
sampling_fps: PositiveFloat = 2,
original_fps: Optional[PositiveFloat] = None,
size: Union[PositiveInt, Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt], None] = None,
max_size: Optional[PositiveInt] = None,
divisible: PositiveInt = 8,
Expand All @@ -67,6 +68,7 @@ def __init__(
max_score,
frame_field,
sampling_fps,
original_fps,
size,
max_size,
divisible,
Expand Down
29 changes: 22 additions & 7 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from .dialog_topic_detection_mapper import DialogTopicDetectionMapper
from .download_file_mapper import DownloadFileMapper
from .expand_macro_mapper import ExpandMacroMapper
from .export_to_lerobot_mapper import ExportToLeRobotMapper
from .extract_entity_attribute_mapper import ExtractEntityAttributeMapper
from .extract_entity_relation_mapper import ExtractEntityRelationMapper
from .extract_event_mapper import ExtractEventMapper
Expand Down Expand Up @@ -84,22 +85,27 @@
from .text_chunk_mapper import TextChunkMapper
from .text_tagging_by_prompt_mapper import TextTaggingByPromptMapper
from .vggt_mapper import VggtMapper
from .video_camera_calibration_static_deepcalib_mapper import (
VideoCameraCalibrationStaticDeepcalibMapper,
from .video_atomic_action_segment_mapper import VideoAtomicActionSegmentMapper
from .video_camera_calibration_deepcalib_mapper import (
VideoCameraCalibrationDeepcalibMapper,
)
from .video_camera_calibration_static_moge_mapper import (
VideoCameraCalibrationStaticMogeMapper,
from .video_camera_calibration_droidcalib_mapper import (
VideoCameraCalibrationDroidCalibMapper,
)
from .video_camera_pose_mapper import VideoCameraPoseMapper
from .video_camera_calibration_moge_mapper import VideoCameraCalibrationMogeMapper
from .video_camera_pose_megasam_mapper import VideoCameraPoseMegaSaMMapper
from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper
from .video_captioning_from_frames_mapper import VideoCaptioningFromFramesMapper
from .video_captioning_from_summarizer_mapper import VideoCaptioningFromSummarizerMapper
from .video_captioning_from_video_mapper import VideoCaptioningFromVideoMapper
from .video_captioning_from_vlm_mapper import VideoCaptioningFromVLMMapper
from .video_clip_reassembly_mapper import VideoClipReassemblyMapper
from .video_depth_estimation_mapper import VideoDepthEstimationMapper
from .video_extract_frames_mapper import VideoExtractFramesMapper
from .video_face_blur_mapper import VideoFaceBlurMapper
from .video_ffmpeg_wrapped_mapper import VideoFFmpegWrappedMapper
from .video_hand_action_compute_mapper import VideoHandActionComputeMapper
from .video_hand_motion_smooth_mapper import VideoHandMotionSmoothMapper
from .video_hand_reconstruction_hawor_mapper import VideoHandReconstructionHaworMapper
from .video_hand_reconstruction_mapper import VideoHandReconstructionMapper
from .video_object_segmenting_mapper import VideoObjectSegmentingMapper
Expand All @@ -111,11 +117,14 @@
from .video_split_by_scene_mapper import VideoSplitBySceneMapper
from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .video_trajectory_overlay_mapper import VideoTrajectoryOverlayMapper
from .video_undistort_mapper import VideoUndistortMapper
from .video_whole_body_pose_estimation_mapper import VideoWholeBodyPoseEstimationMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper

__all__ = [
"VideoCameraCalibrationDroidCalibMapper",
"VideoCameraPoseMegaSaMMapper",
"AudioAddGaussianNoiseMapper",
"AudioFFmpegWrappedMapper",
"CalibrateQAMapper",
Expand All @@ -140,6 +149,7 @@
"ExpandMacroMapper",
"ExtractEntityAttributeMapper",
"ExtractEntityRelationMapper",
"ExportToLeRobotMapper",
"ExtractEventMapper",
"ExtractKeywordMapper",
"ExtractNicknameMapper",
Expand Down Expand Up @@ -196,8 +206,8 @@
"TextChunkMapper",
"TextTaggingByPromptMapper",
"VggtMapper",
"VideoCameraCalibrationStaticDeepcalibMapper",
"VideoCameraCalibrationStaticMogeMapper",
"VideoCameraCalibrationDeepcalibMapper",
"VideoCameraCalibrationMogeMapper",
"VideoCaptioningFromAudioMapper",
"VideoCaptioningFromFramesMapper",
"VideoCaptioningFromSummarizerMapper",
Expand All @@ -208,6 +218,11 @@
"VideoFFmpegWrappedMapper",
"VideoHandReconstructionHaworMapper",
"VideoHandReconstructionMapper",
"VideoHandActionComputeMapper",
"VideoHandMotionSmoothMapper",
"VideoClipReassemblyMapper",
"VideoAtomicActionSegmentMapper",
"VideoTrajectoryOverlayMapper",
"VideoFaceBlurMapper",
"VideoObjectSegmentingMapper",
"VideoRemoveWatermarkMapper",
Expand Down
Loading
Loading