
Commit 4472789 — authored by yxdycHYLcoolcmgzn
feat(agent): interaction quality ops & recipe, bad-case HTML report, and robust JSONL / HF meta loading (#957)
This squashed merge collects the following work (deduplicated from the per-commit log; each sub-commit carried a "Made-with: Cursor" trailer):

* poc of agent-related ops
* preliminary test of minimal_configs (01 to 05) (#944)
* test of minimal_configs (06 to 08); optimize some agent ops (#947)
* conflicts resolved and Gemini's suggestions adopted
* feat(agent ops): end-to-end YAML, analysis toolchain, and UI developed; tested on small-scale samples (#949)
* fix(agent): multi-turn tool dialog, bad-case gating, report zh-tier + evidence (#950)
* fix(agent): llm_quality record schema + dialog history caps & prompt truncation
  - Normalize LLM recommendation to list[str] in parse_output (fixes HF datasets shard alignment)
  - agent_dialog_normalize_mapper: configurable history caps, head+tail write-back, meta flag
  - dialog_* mappers: shared max_*_chars_for_prompt via dialog_llm_input_utils
  - Recipe/docs: agent_interaction_quality_analysis, PERFORMANCE_LLM, BAD_CASE_INSIGHTS
  - Tests: agent_dialog_normalize_mapper, llm_analysis_filter parse_output
  - build_op_doc: exclude dialog_llm_input_utils helper; video_camera_pose droid_args docstring
* feat(agent): bad-case HTML report UX + HF meta stability for normalize (#951)
  - Bad-case report (generate_bad_case_report.py): CJK fonts for matplotlib/body; bar labels; section order (charts → insights → cases)
  - LLM page-top summary: compact digest, shorter prompt/tokens/timeout; default qwen3.5-plus
  - Drilldown: page cap + sidecar *_drilldown_full.jsonl; copy and nav tweaks
  - Richer agent_insight_llm cards; rule-based fallback summary
  - agent_dialog_normalize_mapper: stable HF Arrow meta — always emit agent_dialog_history_compressed bool; list[str] placeholders for empty tool/skill types; filter falsy in tool_type_mapper and skill_insight_mapper
  - Pipeline: run_bad_case_pipeline report uses an argv array safe under set -u; BAD_CASE_REPORT_LLM=1; tests + recipe YAML aligned
* fix: optional stdlib json for HF datasets JSONL (ujson "Value is too big!") (#952)
  - Add DATA_JUICER_USE_STDLIB_JSON env patch in init_configs
  - Document the workaround in config_all.yaml and the DatasetCfg guides
* feat: lenient JSONL load (stdlib json, skip bad lines) (#953)
  - Add load_jsonl_lenient config and DATA_JUICER_JSONL_LENIENT env
  - Stream jsonl-only inputs via Dataset.from_generator; document in DatasetCfg
  - Add unit tests for jsonl_lenient_loader
* fix(lenient jsonl): do not fall back to HF when a folder mixes .json (#954)
  - Mixed extensions previously forced the HuggingFace JSON loader and ujson ("Value is too big!"). Now only jsonl* shards are read; others are skipped with warnings. The log line "[lenient jsonl] ACTIVE" confirms the path.
* no need to convert to str; use list type for the arg to avoid checkpoint failure
* feat(agent): dialog quality mappers, bad-case report UX, pipeline & tests
  - Add dialog_* LLM axis mappers, trace coherence, tool relevance, PII suspect
  - agent_output_locale; extend bad-case signals, insight & usage/tool mappers
  - generate_bad_case_report: TOC/sidebar, insight↔drill links, snapshot operator row
  - Recipe/docs/Operators.md/pyproject; mapper & locale tests
  - build_op_doc: exclude dialog_quality_llm_utils (helper, not an OP)
* feat(demos/agent): enrich bad-case report and agent skill insights
  - HTML report: macro distributions (tools, skills, intent/topic/sentiment) with bar charts and optional word clouds; TOC and chart section wiring
  - Omit PII audit / redaction-related samples from high_precision and watchlist insight excerpts (drilldown/export unchanged)
  - agent_skill_insight_mapper: prompt asks for concrete ~10-char (zh) / 4–8-word (en) capability phrases; forbid vague read/write-style tags
  - Docs: root README link to demos/agent; maintainer checklist in demos/agent README; YAML/minimal_configs notes
  - Tests: generate_bad_case_report smoke (PII omission); agent_skill_insight prompt assertions
* fix some problems; update OP doc building logic to drop base classes not registered into OPERATORS; fix accelerator assignment; check it is a dict before nested query
* feat(agent): enrich bad-case report and skill insight parsing
  - agent_skill_insight_mapper: split labels on ",", "，", "、", ";", "；" for CN/EN separators
  - generate_bad_case_report: mirror the split in macro stats (no re-run required)
  - Optional semantic clustering for insight headlines/audit (scikit-learn)
  - Insight model tabs: default full model id, family-mode flag; order by batch volume
  - Stacked request_model chart: Top 5 by requests + merged remainder bar
  - Extend model family hints (Kimi, GLM, MiniMax); tab/chart copy updates
  - Smoke tests: PII omission, skill-insight macro split; --no-insight-semantic-cluster in PII run
* feat(agent): PII mappers, recipe notes, and bad-case report PII grouping
  - Expand pii_redaction_mapper (PEM/JWT/URL/IP/MAC ordering, optional extended PII) with tests
  - pii_llm_suspect_mapper: spaCy install/locks, safer logging; prompts mention URL/IP/MAC/JWT/PEM leaks
  - generate_bad_case_report: non-PII vs PII-flagged insight subsections; minimal PII cards; headline clusters use non-PII rows only; align redaction placeholders for grouping
  - agent_interaction_quality_analysis.yaml: pii_redaction indentation and default-behavior comment
* feat(agent): dual PII-safe/audit reports and diversified drilldown
  - Default writes safe HTML plus *_pii_audit.html; --report-pii-variants safe|audit|both
  - Case study: ~half high_precision / half watchlist quota with spillover
  - Reuse char TF-IDF + MiniBatchKMeans round-robin for insight cards and the case-study page
  - Remove the in-page PII minimal-card split; safe variant omits PII rows from insight + drilldown
  - run_bad_case_pipeline.sh echoes the audit path; smoke tests updated
* feat(agent): dialog shape and token/latency metrics in bad-case report
  - New section #sec-dialog-metrics: messages length, user turns, agent_turn_count, text chars, choices length, tool-touch message count, tokens (meta then stats), latency
  - Optional matplotlib histograms when --no-charts is off; TOC and charts intro link to the section
  - Smoke test asserts the sec-dialog-metrics anchor
* feat(agent): multi-jsonl report input and provenance fold
  - Repeatable --input in generate_bad_case_report and verify; load_merged_rows reads paths in order
  - run_bad_case_pipeline report: multiple JSONL, optional trailing OUT.html
  - Multi-input: compact page meta and bottom #sec-data-provenance details for audit
* add dataset encryption and decryption
* feat(agent): streaming bad-case report scan and message in case study
  - Default one-pass stream: exact tier/signal/cohort/macro counts without loading all rows
  - Bounded drill/insight candidate pools (--drill-retention-cap, --insight-retention-cap); dialog histograms use per-metric reservoir sampling (--dialog-metric-samples)
  - --eager-load-all-rows restores the legacy full in-memory load (small data / debug)
  - Case study block shows message (fallback: response); JSONL export includes both; PII/signal snippets and suspect_empty evidence prefer message text
* make json compat work; use DataJuicerTestCaseBase instead of unittest.TestCase; add test cases for datasets_json_compat
* feat(agent): polish bad-case report case study and refresh agent docs
  - Case study: format messages[] as role-labeled turns; show response in its own panel
  - Support User:/Assistant:-style plain-text threads for readability
  - Update BAD_CASE_INSIGHTS.md and QUICKSTART_BAD_CASE.md
* remove redundant docs and contents; add a demo video for the bad-case report; add a script to download the demo dataset
* limit the CPU count of a single container to 128
* add ray tag for ray executor encrypt test cases
* update datasets to >= 4.7.0 to support JSON type and ujson loading; fix test case bugs; update cudf-cu12 to follow newer pyarrow
* refactor: enhance signature annotation resolution for ops search functionality
* use num_proc=None to disable multiprocessing for get_access_log
* fix test cases; remove image_captioning_from_gpt4v_mapper; add normalization to llm_analysis results
* loosen the assertion condition; check num_proc before comparison; normalize the records as well
* use dumped string for LLM tags; fix test cases
* update model dist

---------

Co-authored-by: 烈霖 <lielin.hyl@alibaba-inc.com>
Co-authored-by: cmgzn <zdongs@outlook.com>
Parent: 003e2a8 · Commit: 4472789

File tree

139 files changed: +15739 −1047 lines


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -39,6 +39,9 @@ tmp/
 # perf bench data
 perf_bench_data/
 
+# some local demo data
+demos/local/*
+
 # env file
 .env
 

.pre-commit-hooks/build_op_doc.py

Lines changed: 56 additions & 1 deletion
@@ -76,7 +76,14 @@
 # >>> OP code/test paths and exclusive files/dirs
 OP_CODE_PREFIX = "data_juicer/ops/"
 OP_TEST_PREFIX = "tests/ops/"
-OP_EXCLUDE = {"__init__.py", "common", "__pycache__"}
+OP_EXCLUDE = {
+    "__init__.py",
+    "common",
+    "__pycache__",
+    # Helper module under mapper/ (not a registered OP)
+    "dialog_llm_input_utils.py",
+    "dialog_quality_llm_utils.py",
+}
 
 FORMATTER_CODE_PREFIX = "data_juicer/format/"
 FORMATTER_TEST_PREFIX = "tests/format/"
@@ -273,6 +280,16 @@ def pick_doc_for_op(docstrings: List[tuple], op_stem: str) -> str:
     return docstrings[-1][1]
 
 
+def is_registered_op(code_path):
+    """
+    Return True only if the file contains an OPERATORS.register_module call,
+    indicating it defines a concrete registered OP rather than a base class.
+    """
+    with open(code_path, "r", encoding="utf-8") as fin:
+        content = fin.read()
+    return "OPERATORS.register_module" in content
+
+
 def get_class_and_docstring(code_path):
     """
     Get (class_name, first-sentence doc) for each ClassDef in the file that has a class docstring.
@@ -375,6 +392,8 @@ def get_op_list_from_code():
             continue
         if not code_path.endswith(".py") or "_cpp" in code_path:
             continue
+        if not is_registered_op(code_path):
+            continue
         docstrings = get_class_and_docstring(code_path)
         stem = op.replace(".py", "")
         doc = pick_doc_for_op(docstrings, stem)
@@ -658,6 +677,41 @@ def check_and_update_op_record(old_op_record_list, new_op_record_list):
     return updated_op_record_list
 
 
+def print_op_doc_diff(old_op_num_dict, new_op_num_dict, old_op_record_list, updated_op_record_list):
+    """
+    Print the difference between the old and new op_num_dict and op_record_list.
+    """
+    all_types = set(old_op_num_dict) | set(new_op_num_dict)
+    for t in sorted(all_types):
+        old_cnt = old_op_num_dict.get(t)
+        new_cnt = new_op_num_dict.get(t)
+        if old_cnt != new_cnt:
+            print(f"  [op_num] type={t}: {old_cnt} -> {new_cnt}")
+
+    old_record_dict = {r.name: r for r in old_op_record_list}
+    new_record_dict = {r.name: r for r in updated_op_record_list}
+    old_names = set(old_record_dict)
+    new_names = set(new_record_dict)
+    for name in sorted(new_names - old_names):
+        print(f"  [op_record] ADDED: {new_record_dict[name]}")
+    for name in sorted(old_names - new_names):
+        print(f"  [op_record] REMOVED: {old_record_dict[name]}")
+    for name in sorted(old_names & new_names):
+        old_r, new_r = old_record_dict[name], new_record_dict[name]
+        if old_r != new_r:
+            print(f"  [op_record] CHANGED: {name}")
+            if old_r.type != new_r.type:
+                print(f"    type: {old_r.type!r} -> {new_r.type!r}")
+            if set(old_r.tags) != set(new_r.tags):
+                print(f"    tags: {old_r.tags} -> {new_r.tags}")
+            if old_r.desc != new_r.desc:
+                print(f"    desc: {old_r.desc!r} -> {new_r.desc!r}")
+            if old_r.info != new_r.info:
+                print(f"    info: {old_r.info!r} -> {new_r.info!r}")
+            if old_r.ref != new_r.ref:
+                print(f"    ref: {old_r.ref!r} -> {new_r.ref!r}")
+
+
 def main():
     old_op_record_list, old_op_num_dict = parse_op_record_from_current_doc()
     new_op_record_list, new_op_num_dict = get_op_list_from_code()
@@ -666,6 +720,7 @@ def main():
     if new_op_num_dict == old_op_num_dict and old_op_record_list == updated_op_record_list:
         exit(0)
     else:
+        print_op_doc_diff(old_op_num_dict, new_op_num_dict, old_op_record_list, updated_op_record_list)
         generate_new_doc(updated_op_record_list, old_op_record_list)
         print("Operator document is updated.")
         exit(1)
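The `is_registered_op` gate above is a plain text scan, not an AST check: a module only contributes to the OP doc if it literally contains an `OPERATORS.register_module` call. A self-contained sketch of the heuristic, run against hypothetical file contents (the `Mapper`/`DemoMapper` sources below are illustrative, not repo code):

```python
import os
import tempfile


def is_registered_op(code_path):
    # Same heuristic as in build_op_doc.py: a file counts as a concrete,
    # registered OP only if it contains an OPERATORS.register_module call.
    with open(code_path, "r", encoding="utf-8") as fin:
        return "OPERATORS.register_module" in fin.read()


# Hypothetical module contents: an abstract base class vs. a registered OP.
BASE_CLASS_SRC = "class Mapper:\n    def process(self, sample):\n        ...\n"
CONCRETE_OP_SRC = (
    "@OPERATORS.register_module('demo_mapper')\n"
    "class DemoMapper(Mapper):\n"
    "    ...\n"
)
```

The text-scan approach is cheap and robust to import errors, but it would also match the string inside a comment; the helper-module `OP_EXCLUDE` entries above cover the remaining cases.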

README.md

Lines changed: 1 addition & 0 deletions
@@ -187,6 +187,7 @@ For detailed documentation, please see [here](https://datajuicer.github.io/data-
 
 **Quick Links:**
 - **[operator zoo](https://datajuicer.github.io/data-juicer/en/main/docs/Operators.html)** — Browse 200+ operators with examples
+- **[Agent interaction quality & bad-case](demos/agent/README.md)** — In-repo recipe, JSONL pipeline, HTML report (`demos/agent/`; operators such as `agent_bad_case_signal_mapper` are also listed in [docs/Operators.md](docs/Operators.md))
 - **[data-juicer-hub](https://github.com/datajuicer/data-juicer-hub)** — Community-driven recipes and best practices
 - **[developer guide](https://datajuicer.github.io/data-juicer/en/main/docs/DeveloperGuide.html)** — Build your own code and contribute to DJ
 - **[data-juicer-cookbook](https://datajuicer.github.io/data-juicer/en/main/docs/tutorial/DJ-Cookbook.html)** — resource archive

data_juicer/config/config.py

Lines changed: 89 additions & 7 deletions
@@ -269,6 +269,33 @@ def build_base_parser() -> ArgumentParser:
         "an error will be raised. Should contain aws_access_key_id, aws_secret_access_key, aws_region, "
         "and optionally aws_session_token and endpoint_url.",
     )
+    parser.add_argument(
+        "--decrypt_after_reading",
+        type=bool,
+        default=False,
+        help="Whether to decrypt input dataset files after reading. When True, "
+        "each input file is decrypted in memory using a Fernet key before "
+        "being loaded by HuggingFace datasets. No plaintext file is written "
+        "to disk. HuggingFace cache is automatically disabled to prevent "
+        "plaintext Arrow files from being persisted. Default: False.",
+    )
+    parser.add_argument(
+        "--encrypt_before_export",
+        type=bool,
+        default=False,
+        help="Whether to encrypt output dataset files before writing to disk. "
+        "When True, each exported file is encrypted in-place with a Fernet "
+        "key immediately after being written. Default: False.",
+    )
+    parser.add_argument(
+        "--encryption_key_path",
+        type=Optional[str],
+        default=None,
+        help="Path to a file containing the Fernet encryption key (base64 "
+        "url-safe string). If not provided, the key is read from the "
+        "environment variable DJ_ENCRYPTION_KEY. Required when either "
+        "decrypt_after_reading or encrypt_before_export is True.",
+    )
     parser.add_argument(
         "--keep_stats_in_res_ds",
         type=bool,
@@ -764,6 +791,13 @@ def init_configs(args: Optional[List[str]] = None, which_entry: object = None, l
     setting up logger.
     :return: a global cfg object used by the DefaultExecutor or Analyzer
     """
+    # Optional: stdlib json for HF datasets JSONL (avoids ujson "Value is too big!")
+    from data_juicer.utils.datasets_json_compat import (
+        apply_stdlib_json_patch_for_datasets,
+    )
+
+    apply_stdlib_json_patch_for_datasets()
+
     if args is None:
         args = sys.argv[1:]
     with timing_context("Total config initialization time"):
@@ -985,6 +1019,37 @@ def init_setup_from_cfg(cfg: Namespace, load_configs_only=False):
         os.makedirs(cfg.temp_dir, exist_ok=True)
         tempfile.tempdir = cfg.temp_dir
 
+    # encryption mode: force disable HF cache to prevent plaintext Arrow files
+    # from being persisted to disk, and validate the key early.
+    if cfg.get("decrypt_after_reading", False) or cfg.get("encrypt_before_export", False):
+        if cfg.get("use_cache", True):
+            logger.warning(
+                "Encryption mode is enabled: forcing use_cache=False to "
+                "prevent plaintext Arrow cache files from being written to disk."
+            )
+            from datasets import disable_caching
+
+            disable_caching()
+            cfg.use_cache = False
+        if cfg.cache_compress:
+            logger.warning("Disable cache compression due to disabled cache.")
+            cfg.cache_compress = None
+        import tempfile
+
+        logger.warning(
+            f"Set temp directory to store temp files to [{cfg.temp_dir}]. "
+            "For maximum security, set temp_dir to a memory-backed "
+            "filesystem such as /dev/shm."
+        )
+        if cfg.temp_dir is not None and not os.path.exists(cfg.temp_dir):
+            os.makedirs(cfg.temp_dir, exist_ok=True)
+        tempfile.tempdir = cfg.temp_dir
+        # Validate key availability early so the job fails fast on
+        # misconfiguration rather than deep into processing.
+        from data_juicer.utils.encryption_utils import load_fernet_key
+
+        load_fernet_key(cfg.get("encryption_key_path", None))
+
     # The checkpoint mode is not compatible with op fusion for now.
     if cfg.get("op_fusion", False):
         cfg.use_checkpoint = False
@@ -1203,15 +1268,20 @@ def update_op_process(cfg, parser, used_ops=None):
             # Add new operator
             cfg.process.append({op_name: None if internal_op_para is None else namespace_to_dict(internal_op_para)})
 
-    # Optimize type checking
+    # Optimize type checking: deepcopy(parser) does not replicate nested add_class_arguments,
+    # so only pass global args to temp_parser to avoid "Unrecognized arguments" for op.* keys.
    recognized_args = {
        action.dest for action in parser._actions if hasattr(action, "dest") and isinstance(action, ActionTypeHint)
    }
+    exclude_prefixes = tuple(used_ops) + tuple(f"{op_name}." for op_name in (used_ops or ()))
 
-    # check the op params via type hint
    temp_parser = copy.deepcopy(parser)
-
-    temp_args = namespace_to_arg_list(temp_cfg, includes=recognized_args, excludes=["config"])
+    temp_args = namespace_to_arg_list(
+        temp_cfg,
+        includes=recognized_args,
+        excludes=["config"],
+        exclude_prefixes=exclude_prefixes,
+    )
 
    if temp_cfg.config:
        temp_args.extend(["--config", os.path.abspath(temp_cfg.config[0])])
@@ -1224,15 +1294,27 @@ def update_op_process(cfg, parser, used_ops=None):
     return cfg
 
 
-def namespace_to_arg_list(namespace, prefix="", includes=None, excludes=None):
+def namespace_to_arg_list(namespace, prefix="", includes=None, excludes=None, exclude_prefixes=None):
     arg_list = []
+    exclude_prefixes = exclude_prefixes or ()
 
     for key, value in vars(namespace).items():
+        concat_key = f"{prefix}{key}"
+        if exclude_prefixes and (
+            concat_key in exclude_prefixes
+            or any(concat_key.startswith(p + ".") for p in exclude_prefixes if "." not in p)
+        ):
+            continue
         if issubclass(type(value), Namespace):
-            nested_args = namespace_to_arg_list(value, f"{prefix}{key}.")
+            nested_args = namespace_to_arg_list(
+                value,
+                f"{prefix}{key}.",
+                includes=includes,
+                excludes=excludes,
+                exclude_prefixes=exclude_prefixes,
+            )
             arg_list.extend(nested_args)
         elif value is not None:
-            concat_key = f"{prefix}{key}"
             if includes is not None and concat_key not in includes:
                 continue
             if excludes is not None and concat_key in excludes:
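The `exclude_prefixes` parameter threaded through `namespace_to_arg_list` exists so the temporary type-check parser never sees arguments owned by per-OP namespaces. A simplified standalone sketch of that filtering (it drops the `includes`/`excludes` handling of the real helper, and the config keys in the example are hypothetical):

```python
from argparse import Namespace


def namespace_to_arg_list(namespace, prefix="", exclude_prefixes=()):
    """Flatten a (possibly nested) Namespace into --key value pairs,
    skipping any key that belongs to an excluded per-OP prefix."""
    arg_list = []
    for key, value in vars(namespace).items():
        concat_key = f"{prefix}{key}"
        # Skip the op's own namespace and any dotted key underneath it.
        if concat_key in exclude_prefixes or any(
            concat_key.startswith(p + ".") for p in exclude_prefixes if "." not in p
        ):
            continue
        if isinstance(value, Namespace):
            arg_list.extend(
                namespace_to_arg_list(value, f"{concat_key}.", exclude_prefixes)
            )
        elif value is not None:
            arg_list.extend([f"--{concat_key}", str(value)])
    return arg_list
```

With `exclude_prefixes=("text_length_filter",)`, global keys such as `np` survive while every `text_length_filter.*` key is dropped, which is exactly what keeps the deep-copied parser from reporting "Unrecognized arguments" for op-scoped settings.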

data_juicer/config/config_all.yaml

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,13 @@ export_in_parallel: false # whether to export
 keep_stats_in_res_ds: false  # whether to keep the computed stats in the result dataset. The intermediate fields storing the stats computed by Filters will be removed if it's False. It's False by default.
 keep_hashes_in_res_ds: false  # whether to keep the computed hashes in the result dataset. The intermediate fields storing the hashes computed by Deduplicators will be removed if it's False. It's False by default.
 export_extra_args: {}  # other optional arguments for exporting, as a dict, e.g. the key-mapping info for exporting the WebDataset format.
+# If loading local JSONL fails with ujson "ValueError: Value is too big!", run with:
+#   DATA_JUICER_USE_STDLIB_JSON=1 dj-process --config your.yaml
+# For per-line tolerance (stdlib json, skip bad lines; jsonl-only inputs), see DatasetCfg:
+#   load_jsonl_lenient: true
+#   # or: DATA_JUICER_JSONL_LENIENT=1
 load_dataset_kwargs: {}  # extra kwargs passed to datasets.load_dataset(). Useful for format-specific options, e.g. chunksize (JSON), columns (Parquet), delimiter (CSV).
+load_jsonl_lenient: false  # if true, stream jsonl* shards with stdlib json and skip bad lines; other suffixes in the same folder are ignored (no HF fallback). Confirm the logs contain "[lenient jsonl] ACTIVE".

 auto_op_parallelism: true  # whether to automatically set num_proc according to system resources. It's true by default.
 np: 4  # number of subprocesses to process your dataset
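The lenient mode described above amounts to streaming each jsonl shard line by line with stdlib `json` and dropping undecodable lines instead of aborting the load. A minimal sketch of that behavior, with a hypothetical helper name (not the actual DatasetCfg loader):

```python
import io
import json


def iter_jsonl_lenient(stream):
    """Yield one record per parseable JSONL line; skip blank or malformed lines."""
    for lineno, line in enumerate(stream, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            yield json.loads(line)
        except json.JSONDecodeError:
            # a real loader would log the shard name and lineno of the skipped line
            continue


shard = io.StringIO('{"text": "ok"}\n{broken\n\n{"text": "also ok"}\n')
rows = list(iter_jsonl_lenient(shard))
print(len(rows))  # -> 2
```

The trade-off is silent data loss on bad lines, which is why the option is off by default and why the log line mentioned above should be checked to confirm the mode is actually active.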
@@ -56,6 +62,11 @@ fusion_strategy: 'probe' # OP fusion strategy
 cache_compress: null  # the compression method for the cache file; one of ['gzip', 'zstd', 'lz4']. If None, the cache file will not be compressed. We recommend turning this on when your input dataset is larger than tens of GB and disk space is tight.
 adaptive_batch_size: false  # whether to use adaptive batch sizes for each OP according to the probed results. It's False by default.

+# for data encryption / decryption
+decrypt_after_reading: false  # whether to decrypt input files after reading. Each file is decrypted in memory (plaintext is never written to disk). The HF cache is automatically disabled. Requires encryption_key_path or the DJ_ENCRYPTION_KEY env var.
+encrypt_before_export: false  # whether to encrypt output files before writing to disk. Each exported file is encrypted in place immediately after being written. Requires encryption_key_path or the DJ_ENCRYPTION_KEY env var.
+encryption_key_path: null  # path to a file containing the Fernet key (base64 url-safe string). Falls back to the DJ_ENCRYPTION_KEY environment variable if not set.

 # for multimodal data processing
 image_key: 'images'  # key name of the field storing the list of sample image paths.
 image_bytes_key: 'image_bytes'  # key name of the field storing the list of sample image bytes.

@@ -338,16 +349,6 @@ process:
     blur_type: 'gaussian'  # type of blur kernel, one of ['mean', 'box', 'gaussian']
     radius: 2  # radius of blur kernel
     save_dir: null  # the directory where generated files will be stored. If not specified, outputs will be saved in the same directory as their corresponding input files. This path can alternatively be defined via the DJ_PRODUCED_DATA_DIR environment variable.
-  - image_captioning_from_gpt4v_mapper:  # generate samples whose texts are generated by gpt-4-vision from the image
-      mode: 'description'  # mode of the text generated from images, one of ['reasoning', 'description', 'conversation', 'custom']
-      api_key: ''  # the API key to authenticate the request
-      max_token: 500  # the maximum number of tokens to generate. Default is 500.
-      temperature: 1.0  # controls the randomness of the output (range from 0 to 1). Default is 0.
-      system_prompt: ''  # a string prompt used to set the context of the conversation and provide global guidance or rules for gpt-4-vision so that it generates responses in the expected way. Only used when `mode` is set to `custom`.
-      user_prompt: ''  # a string prompt to guide the generation of gpt-4-vision for each sample. It's "" by default, meaning no prompt is provided.
-      user_prompt_key: null  # the key name of the field in samples that stores a per-sample prompt, used to set different prompts for different samples. If None, the prompt in the `user_prompt` parameter is used. It's None by default.
-      keep_original_sample: true  # whether to keep the original sample. If set to False, only the generated text remains in the final dataset and the original text is removed. It's True by default.
-      any_or_all: 'any'  # keep this sample with the 'any' or 'all' strategy over all images. 'any': keep the sample if any image meets the condition; 'all': keep the sample only if all images meet the condition.
   - image_captioning_mapper:  # generate captions for images to augment datasets
       hf_img2seq: 'Salesforce/blip2-opt-2.7b'  # model name on huggingface to generate captions
       caption_num: 1  # how many candidate captions to generate for each image

data_juicer/core/data/dj_dataset.py (1 addition, 1 deletion)

@@ -152,7 +152,7 @@ def __init__(self, *args, **kargs):
         # batched sample: (k, v) pairs are organized in list manner
         for k, v in self.items():
             if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
-                self[k] = [NestedQueryDict(item) for item in v]
+                self[k] = [NestedQueryDict(item) if isinstance(item, dict) else item for item in v]

     def __getitem__(self, key):
         return nested_query(self, key)
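The one-line guard matters when a batched column mixes dicts with other values, e.g. `None` placeholders introduced by schema alignment: the old comprehension only checked the first element, so `NestedQueryDict(None)` on a later element would raise a `TypeError`. A simplified, hypothetical stand-in for `NestedQueryDict` demonstrates the guarded behavior:

```python
class NestedDict(dict):
    """Minimal stand-in for NestedQueryDict: recursively wraps nested dicts."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        for k, v in self.items():
            if isinstance(v, dict):
                self[k] = NestedDict(v)
            elif isinstance(v, list) and v and isinstance(v[0], dict):
                # the fix: only the first element was checked, so the list may
                # still mix dicts with other types -- guard each element
                self[k] = [NestedDict(i) if isinstance(i, dict) else i for i in v]


sample = NestedDict({"dialog": [{"role": "user"}, None, {"role": "assistant"}]})
print(type(sample["dialog"][0]).__name__, sample["dialog"][1])  # -> NestedDict None
```

Without the `isinstance(item, dict)` check, constructing this sample would fail on the `None` element while dict elements were being wrapped.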

data_juicer/core/executor/default_executor.py (2 additions)

@@ -113,6 +113,8 @@ def __init__(self, cfg: Optional[Namespace] = None):
             self.np,
             keep_stats_in_res_ds=self.cfg.keep_stats_in_res_ds,
             keep_hashes_in_res_ds=self.cfg.keep_hashes_in_res_ds,
+            encrypt_before_export=getattr(self.cfg, "encrypt_before_export", False),
+            encryption_key_path=getattr(self.cfg, "encryption_key_path", None),
             **export_extra_args,
         )

data_juicer/core/executor/ray_executor.py (2 additions)

@@ -114,6 +114,8 @@ def __init__(self, cfg: Optional[Namespace] = None):
             self.cfg.export_shard_size,
             keep_stats_in_res_ds=self.cfg.keep_stats_in_res_ds,
             keep_hashes_in_res_ds=self.cfg.keep_hashes_in_res_ds,
+            encrypt_before_export=getattr(self.cfg, "encrypt_before_export", False),
+            encryption_key_path=getattr(self.cfg, "encryption_key_path", None),
             **export_extra_args,
         )

data_juicer/core/executor/ray_executor_partitioned.py (2 additions)

@@ -244,6 +244,8 @@ def __init__(self, cfg: Optional[Namespace] = None):
             getattr(self.cfg, "export_shard_size", 0),
             keep_stats_in_res_ds=getattr(self.cfg, "keep_stats_in_res_ds", True),
             keep_hashes_in_res_ds=getattr(self.cfg, "keep_hashes_in_res_ds", False),
+            encrypt_before_export=getattr(self.cfg, "encrypt_before_export", False),
+            encryption_key_path=getattr(self.cfg, "encryption_key_path", None),
             **export_extra_args,
         )
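All three executors read the new fields with `getattr(..., default)` rather than attribute access, so config objects produced before this change (which lack the encryption fields) keep working. The pattern in isolation; `export_kwargs_from` is a hypothetical helper, not code from the repository:

```python
from types import SimpleNamespace


def export_kwargs_from(cfg):
    """Collect exporter kwargs, defaulting fields absent from older configs."""
    return {
        "encrypt_before_export": getattr(cfg, "encrypt_before_export", False),
        "encryption_key_path": getattr(cfg, "encryption_key_path", None),
    }


old_cfg = SimpleNamespace(np=4)  # predates the encryption options
new_cfg = SimpleNamespace(np=4, encrypt_before_export=True, encryption_key_path="key.txt")
print(export_kwargs_from(old_cfg))
# -> {'encrypt_before_export': False, 'encryption_key_path': None}
```

Direct attribute access (`cfg.encrypt_before_export`) would instead raise `AttributeError` on the old config.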
