fix: cache fingerprint stability for wrapped methods and FusedFilter

cyruszhang · cyruszhang · commit c31c619dc4e9 · 2026-04-16T14:11:48.000-07:00
The Hasher only walked one level of __wrapped__ when resolving bound
  method owners, but wrap_func_with_nested_access adds multiple decorator
  layers.  This caused every NestedDataset.map/filter call to miss the
  cache despite OP hashes matching, because the actual function passed to
  HF datasets was the deeply wrapped variant.

  Additionally, FusedFilter.fused_filters (a list of child OPs) was
  serialized via dill.dumps, which included each child's work_dir and
  defeated caching for fused pipelines.

  - Walk the full __wrapped__ chain (up to 10 levels) in Hasher._find_op_owner
  - Recursively sanitize nested OP instances (direct attributes and
    list/tuple members) in _fingerprint_bytes
  - Add tests for FusedFilter, wrapped methods, and multi-step pipeline cache hits
diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py
@@ -329,10 +329,27 @@ def _fingerprint_bytes(self):
         poison the cache.  Callable attributes (bound/wrapped methods like
         ``process``, ``compute_stats``) are also excluded because they
         close over ``self`` and would re-introduce the excluded attrs.
+
+        Nested OP instances (e.g. ``FusedFilter.fused_filters``) are
+        recursively fingerprinted via their own ``_fingerprint_bytes``
+        so that their ``work_dir`` is also excluded.
         """
         import dill
 
-        state = {k: v for k, v in self.__dict__.items() if k not in self._NON_FINGERPRINT_ATTRS and not callable(v)}
+        def _sanitize(v):
+            """Recursively replace OP instances with their fingerprint bytes."""
+            if isinstance(v, OP) and hasattr(v, "_fingerprint_bytes"):
+                return v._fingerprint_bytes()
+            if isinstance(v, (list, tuple)):
+                converted = [_sanitize(item) for item in v]
+                return type(v)(converted)
+            return v
+
+        state = {}
+        for k, v in self.__dict__.items():
+            if k in self._NON_FINGERPRINT_ATTRS or callable(v):
+                continue
+            state[k] = _sanitize(v)
         return dill.dumps(state)
 
     def __init__(self, *args, **kwargs):
diff --git a/data_juicer/utils/fingerprint_utils.py b/data_juicer/utils/fingerprint_utils.py
@@ -29,6 +29,31 @@ def hash_bytes(cls, value: Union[bytes, List[bytes]]) -> str:
             m.update(x)
         return m.hexdigest()
 
+    @classmethod
+    def _find_op_owner(cls, value):
+        """Walk the ``__self__`` / ``__wrapped__`` chain to find an object
+        that exposes ``_fingerprint_bytes``.  Returns ``(obj, func_name)``
+        or ``(None, None)``."""
+        # Direct bound method
+        obj = getattr(value, "__self__", None)
+        if obj is not None:
+            if callable(getattr(obj, "_fingerprint_bytes", None)):
+                func_name = getattr(value, "__name__", getattr(value, "__qualname__", ""))
+                return obj, func_name
+        # Walk the full __wrapped__ chain (handles multiple decorator
+        # layers such as wrap_func_with_nested_access → @wraps → bound
+        # method).
+        cur = value
+        for _ in range(10):  # guard against infinite loops
+            cur = getattr(cur, "__wrapped__", None)
+            if cur is None:
+                break
+            obj = getattr(cur, "__self__", None)
+            if obj is not None and callable(getattr(obj, "_fingerprint_bytes", None)):
+                func_name = getattr(cur, "__name__", getattr(cur, "__qualname__", ""))
+                return obj, func_name
+        return None, None
+
     @classmethod
     def hash_default(cls, value: Any) -> str:
         """
@@ -45,21 +70,9 @@ def hash_default(cls, value: Any) -> str:
         # _fingerprint_bytes, hash the (fingerprint, method_name) pair
         # instead of dill-dumping the bound method (which would
         # re-serialize the full object including excluded attrs).
-        obj = getattr(value, "__self__", None)
+        obj, func_name = cls._find_op_owner(value)
         if obj is not None:
-            obj_fp = getattr(obj, "_fingerprint_bytes", None)
-            if callable(obj_fp):
-                func_name = getattr(value, "__name__", getattr(value, "__qualname__", ""))
-                return cls.hash_bytes(obj_fp() + dill.dumps(func_name))
-        # functools.wraps closures: check __wrapped__.__self__
-        wrapped = getattr(value, "__wrapped__", None)
-        if wrapped is not None:
-            obj = getattr(wrapped, "__self__", None)
-            if obj is not None:
-                obj_fp = getattr(obj, "_fingerprint_bytes", None)
-                if callable(obj_fp):
-                    func_name = getattr(wrapped, "__name__", getattr(wrapped, "__qualname__", ""))
-                    return cls.hash_bytes(obj_fp() + dill.dumps(func_name))
+            return cls.hash_bytes(obj._fingerprint_bytes() + dill.dumps(func_name))
         return cls.hash_bytes(dill.dumps(value))
 
     @classmethod
diff --git a/tests/utils/test_fingerprint_utils.py b/tests/utils/test_fingerprint_utils.py
@@ -85,5 +85,106 @@ def test_serialization_round_trip_preserves_all_attrs(self):
         self.assertTrue(restored.skip_op_error)
 
 
+class FusedFilterFingerprintTest(DataJuicerTestCaseBase):
+    """Tests that FusedFilter fingerprints exclude child OP work_dirs."""
+
+    def test_fused_filter_stable_across_work_dirs(self):
+        from data_juicer.ops.filter.words_num_filter import WordsNumFilter
+        from data_juicer.ops.op_fusion import FusedFilter
+
+        f1a = TextLengthFilter(min_len=5, max_len=10000, work_dir='/tmp/a')
+        f2a = WordsNumFilter(min_num=2, max_num=1000, work_dir='/tmp/a')
+        fused_a = FusedFilter('fused', [f1a, f2a])
+
+        f1b = TextLengthFilter(min_len=5, max_len=10000, work_dir='/tmp/b')
+        f2b = WordsNumFilter(min_num=2, max_num=1000, work_dir='/tmp/b')
+        fused_b = FusedFilter('fused', [f1b, f2b])
+
+        self.assertEqual(Hasher.hash(fused_a), Hasher.hash(fused_b))
+
+    def test_fused_filter_differs_when_child_params_change(self):
+        from data_juicer.ops.filter.words_num_filter import WordsNumFilter
+        from data_juicer.ops.op_fusion import FusedFilter
+
+        f1a = TextLengthFilter(min_len=5, max_len=10000, work_dir='/tmp/a')
+        f2a = WordsNumFilter(min_num=2, max_num=1000, work_dir='/tmp/a')
+        fused_a = FusedFilter('fused', [f1a, f2a])
+
+        f1b = TextLengthFilter(min_len=50, max_len=10000, work_dir='/tmp/a')
+        f2b = WordsNumFilter(min_num=2, max_num=1000, work_dir='/tmp/a')
+        fused_b = FusedFilter('fused', [f1b, f2b])
+
+        self.assertNotEqual(Hasher.hash(fused_a), Hasher.hash(fused_b))
+
+
+class WrappedFunctionFingerprintTest(DataJuicerTestCaseBase):
+    """Tests that wrapped bound methods (via wrap_func_with_nested_access)
+    produce stable fingerprints across work_dirs."""
+
+    def test_wrapped_compute_stats_stable(self):
+        from data_juicer.core.data.dj_dataset import wrap_func_with_nested_access
+
+        op_a = TextLengthFilter(min_len=5, max_len=10000, work_dir='/tmp/a')
+        op_b = TextLengthFilter(min_len=5, max_len=10000, work_dir='/tmp/b')
+        wa = wrap_func_with_nested_access(op_a.compute_stats)
+        wb = wrap_func_with_nested_access(op_b.compute_stats)
+        self.assertEqual(Hasher.hash(wa), Hasher.hash(wb))
+
+    def test_wrapped_differs_when_params_change(self):
+        from data_juicer.core.data.dj_dataset import wrap_func_with_nested_access
+
+        op_a = TextLengthFilter(min_len=5, max_len=10000, work_dir='/tmp/a')
+        op_b = TextLengthFilter(min_len=50, max_len=10000, work_dir='/tmp/a')
+        wa = wrap_func_with_nested_access(op_a.compute_stats)
+        wb = wrap_func_with_nested_access(op_b.compute_stats)
+        self.assertNotEqual(Hasher.hash(wa), Hasher.hash(wb))
+
+    def test_multistep_pipeline_cache_hit(self):
+        """Full pipeline with multiple OPs: second run with different
+        work_dir must produce zero new cache files."""
+        import glob
+        import os
+
+        from datasets import load_dataset, enable_caching
+
+        from data_juicer.ops.filter.alphanumeric_filter import AlphanumericFilter
+        from data_juicer.ops.filter.words_num_filter import WordsNumFilter
+        from data_juicer.utils.constant import Fields
+
+        enable_caching()
+        ds = NestedDataset(load_dataset(
+            'json',
+            data_files='demos/data/demo-dataset.jsonl',
+            split='train',
+        ))
+        if Fields.stats not in ds.features:
+            ds = ds.map(lambda x: {Fields.stats: {}})
+        cache_dir = os.path.dirname(ds.cache_files[0]['filename'])
+
+        def run_pipeline(dataset, work_dir):
+            ops = [
+                TextLengthFilter(min_len=5, max_len=10000, work_dir=work_dir),
+                WordsNumFilter(min_num=2, max_num=1000, work_dir=work_dir),
+                AlphanumericFilter(min_ratio=0.0, max_ratio=1.0,
+                                   work_dir=work_dir),
+            ]
+            cur = dataset
+            for op in ops:
+                cur = cur.map(op.compute_stats, num_proc=1)
+                cur = cur.filter(op.process, num_proc=1)
+            return cur
+
+        run_pipeline(ds, '/tmp/pipeline_test_A')
+        cache_after_a = set(glob.glob(os.path.join(cache_dir, '*.arrow')))
+
+        run_pipeline(ds, '/tmp/pipeline_test_B')
+        cache_after_b = set(glob.glob(os.path.join(cache_dir, '*.arrow')))
+
+        new_files = cache_after_b - cache_after_a
+        self.assertEqual(len(new_files), 0,
+                         f'Pipeline B created {len(new_files)} new cache '
+                         f'files; expected 0 (full cache hit)')
+
+
 if __name__ == '__main__':
     unittest.main()