You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: configs/config_all.yaml
+54Lines changed: 54 additions & 0 deletions
Original file line number
Diff line number
Diff line change
@@ -721,6 +721,24 @@ process:
721
721
prob_threshold: 0.8# the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
722
722
any_or_all: any # keep this sample when any/all images meet the filter condition
723
723
mem_required: '500MB'# This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
724
+
- in_context_influence_filter: # filter to keep texts whose in-context influence upon the validation set falls within a specific range.
725
+
hf_model: 'Qwen/Qwen2.5-0.5B'# Huggingface embedding model name.
726
+
model_params: null # Parameters for initializing the API model.
727
+
min_score: 1.0 # Minimum in-context influence score.
728
+
max_score: 100.0 # Maximum in-context influence score.
729
+
query_template: null # Template for building the query string.
730
+
response_template: null # Template for building the response string.
731
+
valid_dataset: null # The dataset to use for validation
732
+
task_desc: null # The description of the validation task.
733
+
valid_as_demo: True # If true, score = L(A|Q) / L(A|task_desc, Q_v, A_v, Q); if false, score = L(A_v|Q) / L(A_v|task_desc, Q, A, Q_v).
734
+
n_shot: null # The number of shots in validation.
735
+
- instruction_following_difficulty_filter: # filter to keep texts whose instruction-following difficulty (IFD, https://arxiv.org/abs/2308.12032) falls within a specific range.
736
+
hf_model: 'Qwen/Qwen2.5-0.5B'# Huggingface embedding model name.
737
+
model_params: null # Parameters for initializing the API model.
738
+
min_score: 1.0 # Minimum IFD score.
739
+
max_score: 100.0 # Maximum IFD score.
740
+
query_template: null # Template for building the query string.
741
+
response_template: null # Template for building the response string.
724
742
- language_id_score_filter: # filter text in specific language with language scores larger than a specific max value
725
743
lang: en # keep text in what language
726
744
min_score: 0.8# the min language scores to filter text
@@ -739,6 +757,13 @@ process:
739
757
enable_vllm: false # If true, use VLLM for loading hugging face or local llm. Otherwise, use API for reference.
740
758
model_params: {} # Parameters for initializing the API model.
741
759
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
760
+
- llm_perplexity_filter: # filter to keep samples with perplexity score, computed using a specified llm, within a specific range.
761
+
hf_model: 'Qwen/Qwen2.5-0.5B'# Huggingface embedding model name.
762
+
model_params: null # Parameters for initializing the API model.
763
+
min_score: 1.0# Minimum perplexity score.
764
+
max_score: 100.0# Maximum perplexity score.
765
+
query_template: null # Template for building the query string.
766
+
response_template: null # Template for building the response string.
742
767
- llm_quality_score_filter: # filter to keep sample with high quality score estimated by LLM.
743
768
api_or_hf_model: 'gpt-4o'# API or huggingface model name.
744
769
min_score: 0.5# The lowest quality score threshold to keep the sample.
@@ -754,6 +779,23 @@ process:
754
779
enable_vllm: false # If true, use VLLM for loading hugging face or local llm. Otherwise, use API for reference.
755
780
model_params: {} # Parameters for initializing the API model.
756
781
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
782
+
- llm_task_relevance_filter: # filter to keep sample with high relevance score to validation tasks estimated by LLM.
783
+
api_or_hf_model: 'gpt-4o'# API or huggingface model name.
784
+
min_score: 0.5 # The lowest relevance score threshold to keep the sample.
785
+
api_endpoint: null # URL endpoint for the API.
786
+
response_path: null # Path to extract content from the API response. Defaults to 'choices.0.message.content'.
787
+
input_keys: ['text'] # Subset of keys in the sample. Supports data with multiple fields such as 'query', 'analysis' and 'answer' in RFT data.
788
+
field_names: ['text'] # Corresponding field names for input keys.
789
+
system_prompt: null # System prompt for the task.
790
+
input_template: null # The input template.
791
+
field_template: null # Template for each field in the prompt.
792
+
try_num: 3# The number of retry attempts when there is an API call error or outputs parsing error.
793
+
enable_vllm: false # If true, use VLLM for loading hugging face or local llm. Otherwise, use API for reference.
794
+
model_params: {} # Parameters for initializing the API model.
795
+
sampling_params: {} # Extra parameters passed to the API call. e.g {'temperature': 0.9, 'top_p': 0.95}
796
+
valid_dataset: null # The dataset to use for validation
797
+
task_desc: null # The description of the validation task.
798
+
n_shot: null # The number of shots in validation.
757
799
- maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
758
800
min_len: 10# the min length of filter range
759
801
max_len: 10000# the max length of filter range
@@ -795,6 +837,18 @@ process:
795
837
- text_action_filter: # filter text according to the number of action verbs
796
838
lang: en # consider the words in what language
797
839
min_action_num: 1 # text with fewer action verbs than this minimum will be filtered out
840
+
- text_embd_similarity_filter: # Filter to keep texts whose average embedding similarity to a set of given validation texts falls within a specific range.
841
+
api_or_hf_model: text-embedding-v4 # API or huggingface embedding model name
842
+
is_hf_model: false # indicates if the model is from HuggingFace
843
+
api_endpoint: embeddings # embedding URL endpoint for the API
844
+
response_path: data.0.embedding # path to extract content from the API response
845
+
model_params: null # parameters for initializing the API model
846
+
min_score: 0.1# the min average similarity to keep samples
847
+
max_score: 1.0# the max average similarity to keep samples
848
+
valid_dataset: null # the dataset to use for validation
849
+
ebd_dim: 1024# the embedding's dimension via API
850
+
pooling: null # strategy to extract embedding from the hidden states
851
+
input_template: null # template for building the model input.
798
852
- text_entity_dependency_filter: # filter text without non-independent entity nouns
799
853
lang: en # consider the words in what language
800
854
min_dependency_num: 1# the min number of adjacent edges of a non-independent noun in dependency tree
0 commit comments