Skip to content

Commit f242ee8

Browse files
authored
Merge branch 'main' into op/group_diversity
2 parents 216bf50 + 6e6e4ab commit f242ee8

21 files changed

+1323
-48
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ Data-Juicer is being actively updated and maintained. We will periodically enhan
3838

3939
[Demo Video] DataJuicer-Agent: Quick start your data processing journey!
4040

41-
https://github.com/user-attachments/assets/58aea900-e51f-4ec2-b1c0-eead97967893
41+
https://github.com/user-attachments/assets/6eb726b7-6054-4b0c-905e-506b2b9c7927
4242

4343
[Demo Video] DataJuicer-Sandbox: Better data-model co-dev at a lower cost!
4444

README_ZH.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ Data-Juicer正在积极更新和维护中,我们将定期强化和新增更多
3232

3333
[Demo Video] DataJuicer-Agent:数据处理,即刻启程!
3434

35-
https://github.com/user-attachments/assets/58aea900-e51f-4ec2-b1c0-eead97967893
35+
https://github.com/user-attachments/assets/6eb726b7-6054-4b0c-905e-506b2b9c7927
3636

3737
[Demo Video] DataJuicer-Sandbox: 降本增效,优化数据-模型协同开发!
3838

configs/config_all.yaml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,24 @@ process:
721721
prob_threshold: 0.8 # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
722722
any_or_all: any # keep this sample when any/all images meet the filter condition
723723
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
724+
- in_context_influence_filter: # filter to keep texts whose in-context influence upon validation set within a specific range.
725+
hf_model: 'Qwen/Qwen2.5-0.5B' # Huggingface model name.
726+
model_params: null # Parameters for initializing the API model.
727+
min_score: 1.0 # Minimum perplexity score.
728+
max_score: 100.0 # Maximum perplexity score.
729+
query_template: null # Template for building the query string.
730+
response_template: null # Template for building the response string.
731+
valid_dataset: null # The dataset to use for validation
732+
task_desc: null # The description of the validation task.
733+
valid_as_demo: True # If true, score = L(A|Q) / L(A|task_desc, Q_v, A_v, Q); If false, score = L(A_v|Q_v) / L(A_v|task_desc, Q, A, Q_v).
734+
n_shot: null # The number of shots in validation.
735+
- instruction_following_difficulty_filter: # filter to keep texts whose instruction-following difficulty (IFD, https://arxiv.org/abs/2308.12032) falls within a specific range.
736+
hf_model: 'Qwen/Qwen2.5-0.5B' # Huggingface model name.
737+
model_params: null # Parameters for initializing the API model.
738+
min_score: 1.0 # Minimum perplexity score.
739+
max_score: 100.0 # Maximum perplexity score.
740+
query_template: null # Template for building the query string.
741+
response_template: null # Template for building the response string.
724742
- language_id_score_filter: # filter text in specific language with language scores larger than a specific max value
725743
lang: en # keep text in what language
726744
min_score: 0.8 # the min language scores to filter text
@@ -739,6 +757,13 @@ process:
739757
enable_vllm: false # If true, use VLLM for loading hugging face or local llm. Otherwise, use API for reference.
740758
model_params: {} # Parameters for initializing the API model.
741759
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
760+
- llm_perplexity_filter: # filter to keep samples with perplexity score, computed using a specified llm, within a specific range.
761+
hf_model: 'Qwen/Qwen2.5-0.5B' # Huggingface model name.
762+
model_params: null # Parameters for initializing the API model.
763+
min_score: 1.0 # Minimum perplexity score.
764+
max_score: 100.0 # Maximum perplexity score.
765+
query_template: null # Template for building the query string.
766+
response_template: null # Template for building the response string.
742767
- llm_quality_score_filter: # filter to keep sample with high quality score estimated by LLM.
743768
api_or_hf_model: 'gpt-4o' # API or huggingface model name.
744769
min_score: 0.5 # The lowest quality score threshold to keep the sample.
@@ -754,6 +779,23 @@ process:
754779
enable_vllm: false # If true, use VLLM for loading hugging face or local llm. Otherwise, use API for reference.
755780
model_params: {} # Parameters for initializing the API model.
756781
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
782+
- llm_task_relevance_filter: # filter to keep sample with high relevance score to validation tasks estimated by LLM.
783+
api_or_hf_model: 'gpt-4o' # API or huggingface model name.
784+
min_score: 0.5 # The lowest quality score threshold to keep the sample.
785+
api_endpoint: null # URL endpoint for the API.
786+
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
787+
input_keys: ['text'] # Subset of keys in the sample. Support data with multi fields such as 'query', 'analysis' and 'answer' in RFT data.
788+
field_names: ['text'] # Corresponding field names for input keys.
789+
system_prompt: null # System prompt for the task.
790+
input_template: null # The input template.
791+
field_template: null # Template for each field in the prompt.
792+
try_num: 3 # The number of retry attempts when there is an API call error or outputs parsing error.
793+
enable_vllm: false # If true, use VLLM for loading hugging face or local llm. Otherwise, use API for reference.
794+
model_params: {} # Parameters for initializing the API model.
795+
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
796+
valid_dataset: null # The dataset to use for validation
797+
task_desc: null # The description of the validation task.
798+
n_shot: null # The number of shots in validation.
757799
- maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
758800
min_len: 10 # the min length of filter range
759801
max_len: 10000 # the max length of filter range
@@ -795,6 +837,18 @@ process:
795837
- text_action_filter: # filter text according the number of action verb
796838
lang: en # consider the words in what language
797839
min_action_num: 1 # text will be filtered whose verbs less the min action number
840+
- text_embd_similarity_filter: # Filter to keep texts whose average embedding similarity to a set of given validation texts falls within a specific range.
841+
api_or_hf_model: text-embedding-v4 # API or huggingface embedding model name
842+
is_hf_model: false # indicates if the model is from HuggingFace
843+
api_endpoint: embeddings # embedding URL endpoint for the API
844+
response_path: data.0.embedding # path to extract content from the API response
845+
model_params: null # parameters for initializing the API model
846+
min_score: 0.1 # the min average similarity to keep samples
847+
max_score: 1.0 # the max average similarity to keep samples
848+
valid_dataset: null # the dataset to use for validation
849+
ebd_dim: 1024 # the embedding's dimension via API
850+
pooling: null # strategy to extract embedding from the hidden states
851+
input_template: null # template for building the model input.
798852
- text_entity_dependency_filter: # filter text without non-independent entity nouns
799853
lang: en # consider the words in what language
800854
min_dependency_num: 1 # the min number of adjacent edges of a non-independent noun in dependency tree

data_juicer/ops/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def timing_context(description):
1616
with timing_context('Importing operator modules'):
1717
from . import aggregator, deduplicator, filter, grouper, mapper, selector
1818
from .base_op import (
19+
ATTRIBUTION_FILTERS,
1920
NON_STATS_FILTERS,
2021
OPERATORS,
2122
TAGGING_OPS,

data_juicer/ops/base_op.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
UNFORKABLE = Registry("Unforkable")
1717
NON_STATS_FILTERS = Registry("Non-stats Filters")
1818
TAGGING_OPS = Registry("Tagging Operators")
19+
ATTRIBUTION_FILTERS = Registry("Attribution Filters")
1920

2021

2122
def convert_list_dict_to_dict_list(samples):

data_juicer/ops/filter/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,16 @@
1818
from .image_text_matching_filter import ImageTextMatchingFilter
1919
from .image_text_similarity_filter import ImageTextSimilarityFilter
2020
from .image_watermark_filter import ImageWatermarkFilter
21+
from .in_context_influence_filter import InContextInfluenceFilter
22+
from .instruction_following_difficulty_filter import (
23+
InstructionFollowingDifficultyFilter,
24+
)
2125
from .language_id_score_filter import LanguageIDScoreFilter
2226
from .llm_analysis_filter import LLMAnalysisFilter
2327
from .llm_difficulty_score_filter import LLMDifficultyScoreFilter
28+
from .llm_perplexity_filter import LLMPerplexityFilter
2429
from .llm_quality_score_filter import LLMQualityScoreFilter
30+
from .llm_task_relevance_filter import LLMTaskRelevanceFilter
2531
from .maximum_line_length_filter import MaximumLineLengthFilter
2632
from .perplexity_filter import PerplexityFilter
2733
from .phrase_grounding_recall_filter import PhraseGroundingRecallFilter
@@ -31,6 +37,7 @@
3137
from .stopwords_filter import StopWordsFilter
3238
from .suffix_filter import SuffixFilter
3339
from .text_action_filter import TextActionFilter
40+
from .text_embd_similarity_filter import TextEmbdSimilarityFilter
3441
from .text_entity_dependency_filter import TextEntityDependencyFilter
3542
from .text_length_filter import TextLengthFilter
3643
from .text_pair_similarity_filter import TextPairSimilarityFilter
@@ -71,9 +78,13 @@
7178
"ImageTextSimilarityFilter",
7279
"ImageWatermarkFilter",
7380
"LanguageIDScoreFilter",
81+
"InContextInfluenceFilter",
82+
"InstructionFollowingDifficultyFilter",
7483
"LLMAnalysisFilter",
7584
"LLMQualityScoreFilter",
85+
"LLMPerplexityFilter",
7686
"LLMDifficultyScoreFilter",
87+
"LLMTaskRelevanceFilter",
7788
"MaximumLineLengthFilter",
7889
"PerplexityFilter",
7990
"PhraseGroundingRecallFilter",
@@ -83,6 +94,7 @@
8394
"StopWordsFilter",
8495
"SuffixFilter",
8596
"TextActionFilter",
97+
"TextEmbdSimilarityFilter",
8698
"TextEntityDependencyFilter",
8799
"TextLengthFilter",
88100
"TextPairSimilarityFilter",
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from typing import Dict, List, Optional
2+
3+
from datasets import Dataset
4+
from loguru import logger
5+
6+
from data_juicer.ops.base_op import ATTRIBUTION_FILTERS, OPERATORS
7+
from data_juicer.ops.filter.llm_perplexity_filter import LLMPerplexityFilter
8+
from data_juicer.utils.constant import Fields, StatsKeys
9+
from data_juicer.utils.lazy_loader import LazyLoader
10+
11+
torch = LazyLoader("torch")
12+
transformers = LazyLoader("transformers")
13+
14+
OP_NAME = "in_context_influence_filter"
15+
16+
17+
@OPERATORS.register_module(OP_NAME)
@ATTRIBUTION_FILTERS.register_module(OP_NAME)
class InContextInfluenceFilter(LLMPerplexityFilter):
    """Filter to keep texts whose in-context influence upon a validation set falls
    within a specific range.

    The influence of a training sample is measured as an average, over the cached
    validation samples, of a ratio of LM losses computed with and without an
    in-context demonstration (see :meth:`compute_stats_single` for both directions).
    """

    # This operator is currently under development and evaluation as part of an ongoing research project.
    # The Data-Juicer team retains full copyright over this operator.

    # Model-based loss computation; prefer GPU execution.
    _accelerator = "cuda"

    def __init__(
        self,
        valid_dataset: Optional[List[Dict]] = None,
        task_desc: Optional[str] = None,
        valid_as_demo: bool = False,
        n_shot: Optional[int] = None,
        *args,
        **kwargs,
    ):
        """
        Initialization method.

        :param valid_dataset: The dataset (a list of sample dicts) to use for validation.
            If None, 'self.prepare_valid_feature' should be manually called before applying the filter.
        :param task_desc: The description of the validation task; forwarded as the
            system prompt when building validation messages.
        :param valid_as_demo: If true, score = L(A|Q) / L(A|task_desc, Q_v, A_v, Q);
            If false, score = L(A_v|Q_v) / L(A_v|task_desc, Q, A, Q_v).
        :param n_shot: The number of shots in validation.
        """
        super().__init__(*args, **kwargs)
        self.valid_as_demo = valid_as_demo
        self.task_desc = task_desc
        # Cache of validation samples (with chat messages) and their baseline losses.
        self.valid_feature = {}
        if valid_dataset is not None:
            self.prepare_valid_feature(Dataset.from_list(valid_dataset), task_desc, n_shot)
        else:
            logger.warning(
                f"valid_dataset and task_desc are both None when initializing {OP_NAME}. \
                'prepare_valid_feature' method should be manually called before applying the filter."
            )

    @property
    def valid_feature_ready(self):
        # True once both validation samples and their baseline losses are cached.
        return "valid_samples" in self.valid_feature and "valid_losses" in self.valid_feature

    def prepare_valid_feature(self, dataset=None, task_desc=None, n_shot=None, *args, **kwargs):
        """Cache up to `n_shot` validation samples (with messages built from
        `task_desc`) and their demonstration-free losses.

        NOTE(review): assumes `dataset` is iterable and non-None — `len(dataset)`
        raises otherwise; confirm callers always supply a dataset.
        """
        # Default to using the whole validation set as shots.
        n_shot = n_shot or len(dataset)
        self.valid_feature["valid_samples"] = []
        self.valid_feature["valid_losses"] = []
        for i, sample in enumerate(dataset):
            if i >= n_shot:
                break
            sample_w_msgs = self.sample_with_messages(sample, system_prompt=task_desc)
            self.valid_feature["valid_samples"].append(sample_w_msgs)
            loss = self._loss(sample_w_msgs)
            self.valid_feature["valid_losses"].append(loss)

    def compute_stats_single(self, sample, rank=None):
        """Compute the in-context influence score for one sample and store it
        under `StatsKeys.in_context_influence` in the sample's stats."""
        # check if it's computed already
        if StatsKeys.in_context_influence in sample[Fields.stats]:
            return sample

        assert self.valid_feature_ready, "Validation feature not ready yet. Call prepare_valid_feature first."

        sample_w_msgs = self.sample_with_messages(sample)

        scores = []
        if self.valid_as_demo:
            # L(A|Q) / L(A|Q_v, A_v, Q)
            loss_wo_demo = self._loss(sample_w_msgs, rank=rank)
            for valid_sample in self.valid_feature["valid_samples"]:
                loss_w_demo = self._loss(sample_w_msgs, pre_example=valid_sample, rank=rank)
                scores.append(loss_wo_demo / loss_w_demo)
        else:
            # L(A_v|Q_v) / L(A_v|Q, A, Q_v)
            for valid_sample, loss_wo_demo in zip(
                self.valid_feature["valid_samples"], self.valid_feature["valid_losses"]
            ):
                loss_w_demo = self._loss(valid_sample, pre_example=sample_w_msgs, rank=rank)
                scores.append(loss_wo_demo / loss_w_demo)

        # TODO: aggregation strategies
        in_context_influence = sum(scores) / len(scores)
        sample[Fields.stats][StatsKeys.in_context_influence] = in_context_influence

        return sample

    def process_single(self, sample):
        """Keep the sample iff its score lies in [min_score, max_score];
        samples with no computed score (None) are kept."""
        score = sample[Fields.stats][StatsKeys.in_context_influence]
        if score is None:
            return True

        return self.min_score <= score <= self.max_score
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import logging
2+
3+
from data_juicer.ops.base_op import OPERATORS
4+
from data_juicer.ops.filter.llm_perplexity_filter import LLMPerplexityFilter
5+
from data_juicer.utils.constant import Fields, StatsKeys
6+
from data_juicer.utils.lazy_loader import LazyLoader
7+
8+
torch = LazyLoader("torch")
9+
transformers = LazyLoader("transformers")
10+
11+
logger = logging.getLogger(__name__)
12+
logging.basicConfig(level=logging.INFO)
13+
14+
OP_NAME = "instruction_following_difficulty_filter"
15+
16+
17+
@OPERATORS.register_module(OP_NAME)
class InstructionFollowingDifficultyFilter(LLMPerplexityFilter):
    """Filter to keep texts whose instruction-following difficulty (IFD,
    https://arxiv.org/abs/2308.12032) falls within a specific range.

    IFD is the ratio L(A|Q) / L(A): the loss of the response conditioned on the
    query, divided by the loss of the response alone.
    """

    # Model-based loss computation; prefer GPU execution.
    _accelerator = "cuda"

    def compute_stats_single(self, sample, rank=None):
        """Compute the IFD score for one sample and store it under
        `StatsKeys.ifd_score` in the sample's stats."""

        # check if it's computed already
        if StatsKeys.ifd_score in sample[Fields.stats]:
            return sample

        sample_w_msgs = self.sample_with_messages(sample)
        # Keep only the final (response) message to score the answer without its query.
        msgs_wo_query = sample_w_msgs["messages"][-1:]
        sample_w_msg_wo_query = dict(**sample_w_msgs)
        sample_w_msg_wo_query.update({"messages": msgs_wo_query})

        # Pass `rank` by keyword: sibling filters call `_loss` with an optional
        # `pre_example` argument besides the sample, so a positional `rank` risks
        # binding to the wrong parameter of the inherited `_loss`.
        loss_w_query = self._loss(sample_w_msgs, rank=rank)
        loss_wo_query = self._loss(sample_w_msg_wo_query, rank=rank)
        sample[Fields.stats][StatsKeys.ifd_score] = loss_w_query / loss_wo_query

        return sample

    def process_single(self, sample):
        """Keep the sample iff its IFD score lies in [min_score, max_score]."""
        score = sample[Fields.stats][StatsKeys.ifd_score]

        return self.min_score <= score <= self.max_score

0 commit comments

Comments
 (0)