-
Notifications
You must be signed in to change notification settings - Fork 211
Expand file tree
/
Copy pathconfig.yaml
More file actions
65 lines (64 loc) · 4.5 KB
/
config.yaml
File metadata and controls
65 lines (64 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# config.yaml
env:
  OPENAI_API_KEY: "YOUR_OPENAI_API_KEY" # Not strictly necessary.
  CUDA_VISIBLE_DEVICES: '0,1,2,3,4,5,6,7'
  HYDRA_FULL_ERROR: 1
  N_GPUS: 8
  BASE_MODEL: 'Qwen/Qwen2.5-7B-Instruct' # This model will be served in vllm, used by `agentflow.port`, and serve as the base model of rollout in the training process.
  ROLLOUT_TP_SIZE: 1
  EXPERIMENT_NAME: 'AgentFlow_pro'
  PROJECT_NAME: 'AgentFlow_pro'
  BASE_DATA_DIR: 'data' # This is where to find training and eval data.
  VERBOSITY: 'DEBUG'
  N_WORKERS: 16
  ENABLE_TOOLS: ["Base_Generator_Tool","Python_Coder_Tool","Google_Search_Tool","Wikipedia_Search_Tool"] # If OpenAI API is on, then it can add tools.
  # TOOL_ENGINE: ["gpt-4o-mini","dashscope-qwen2.5-7b-coder-instruct","Default","Default"] # Default means use tool.py's default params. You can set "dashscope" as qwen7B, "gpt-4o-mini" for gpt, or "self" as the training BASE_MODEL.
  # TOOL_ENGINE: ["vllm-Qwen/Qwen2.5-7B-Instruct","vllm-Qwen/Qwen2.5-7B-Instruct","Default","Default"] # If you are not using the Dashscope API, you can use VLLM to serve qwen2.5-7b-instruct on your own server and redesign the llm_engine port.
  MODEL_ENGINE: ["trainable","gpt-4o-mini","gpt-4o-mini","gpt-4o-mini"]
  # MODEL_ENGINE: ["trainable","dashscope","dashscope","dashscope"] # [planner_main, planner_fixed, verifier, executor]. "trainable" means use the BASE_MODEL with different temperatures for train/test. You can also specify specific models like "dashscope", "gpt-4o-mini", etc.
  TOOL_STEPS: 3 # Do not increase beyond 5. It may overflow context.
  TEST_TEMPERATURE: 0.0
  TRAIN_TEMPERATURE: 0.7 # 0.7 - 0.5 is good. Please check that every planner and the executor have this parameter correctly transferred.
  OUTPUT_TYPE: "direct" # Different output mode in the rollout's last output; not strictly necessary for search and math reasoning because the answer should be short.
  AGENT_MAX_TIMEOUT: 500 # 300-500 is good. When steps extend beyond this limit, there may be errors.
python_args:
  agentflow.port: 9999 # This will be sent to agent serving and training.
  algorithm.adv_estimator: 'grpo'
  data.train_files: '${BASE_DATA_DIR}/train/combined_train.parquet' # Mixed NQ search and MathHard, shuffled.
  data.val_files: '${BASE_DATA_DIR}/val/aime24.parquet' # AIME24 for a fast check; the first epoch may fail due to the async start and file lock.
  actor_rollout_ref.rollout.tensor_model_parallel_size: '${ROLLOUT_TP_SIZE}'
  trainer.n_gpus_per_node: '${N_GPUS}'
  data.train_batch_size: 32
  actor_rollout_ref.rollout.n: 8
  actor_rollout_ref.actor.ppo_mini_batch_size: 8
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu: 4
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu: 4
  actor_rollout_ref.rollout.multi_turn.format: 'hermes'
  actor_rollout_ref.model.path: '${BASE_MODEL}'
  data.max_prompt_length: 18432 # This is safe here because the qwen planner & executor and dashscope/4o-mini's tools will never prematurely end the output at this length.
  data.max_response_length: 2048 # Make sure this is transferred all the way down.
  data.truncation: 'truncate' # If this is set to "error", then the process will be shut down when the input length exceeds max_prompt_length + max_response_length.
  trainer.val_before_train: true # If you restart from a CKPT, you can disable this to save time.
  actor_rollout_ref.actor.optim.lr: 1.0e-6 # Decimal point required: YAML 1.1 loaders (e.g. PyYAML) parse bare "1e-6" as a string, not a float.
  actor_rollout_ref.model.use_remove_padding: true
  actor_rollout_ref.actor.use_kl_loss: true
  actor_rollout_ref.actor.kl_loss_coef: 0.001
  actor_rollout_ref.actor.entropy_coeff: 0.0 # Maybe this can somehow prevent model repetition?
  actor_rollout_ref.actor.clip_ratio_low: 0.2
  actor_rollout_ref.actor.clip_ratio_high: 0.3
  actor_rollout_ref.model.enable_gradient_checkpointing: true
  actor_rollout_ref.actor.fsdp_config.param_offload: false
  actor_rollout_ref.actor.fsdp_config.optimizer_offload: false
  actor_rollout_ref.rollout.name: 'vllm'
  actor_rollout_ref.rollout.gpu_memory_utilization: 0.6 # 0.55-0.65 is fine. If it's too small, the BASE_MODEL inference will be slow. If it's too large, the KV cache and other extra saved logic will cause OOM.
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu: 4
  actor_rollout_ref.ref.fsdp_config.param_offload: false
  algorithm.use_kl_in_reward: false
  trainer.critic_warmup: 0
  trainer.logger: ['console', 'wandb']
  trainer.project_name: '${PROJECT_NAME}'
  trainer.experiment_name: '${EXPERIMENT_NAME}'
  trainer.nnodes: 1
  trainer.save_freq: 2 # This is to ensure the CKPT exists.
  trainer.test_freq: 2
  trainer.total_epochs: 5