File tree Expand file tree Collapse file tree
specforge/modeling/target Expand file tree Collapse file tree Original file line number Diff line number Diff line change 22 "architectures" : [
33 "LlamaForCausalLMEagle3"
44 ],
5- "eagle_config" : {
6- "eagle_aux_hidden_state_layer_ids" : [
7- 1 ,
8- 23 ,
9- 45
10- ]
11- },
125 "attention_dropout" : 0.0 ,
136 "bos_token_id" : 151643 ,
147 "decoder_sparse_step" : 1 ,
Original file line number Diff line number Diff line change 1+ #! /bin/bash
2+
3+ SCRIPT_DIR=$( cd -- " $( dirname -- " ${BASH_SOURCE[0]} " ) " & > /dev/null && pwd )
4+ ROOT_DIR=$( dirname $SCRIPT_DIR )
5+ export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR /cache/compiled_kernels
6+
7+ NUM_GPUS=${1:- 8}
8+ TP_SIZE=4
9+ BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:- 64}
10+
11+ torchrun \
12+ --standalone \
13+ --nproc_per_node $NUM_GPUS \
14+ $ROOT_DIR /scripts/train_eagle3.py \
15+ --target-model-path $ROOT_DIR //Qwen/Qwen3-Next-80B-A3B-Instruct-FP8/\
16+ --draft-model-config $ROOT_DIR /configs/qwen3-next-80b-a3b-eagle3.json \
17+ --train-data-path $ROOT_DIR /data_qwen80b/qwen3_80b_perfectblend_train_regen.jsonl \
18+ --output-dir $ROOT_DIR /qwen3-80b-regen-blend \
19+ --num-epochs 2 \
20+ --batch-size 2 \
21+ --learning-rate 1e-4 \
22+ --max-length 4096 \
23+ --chat-template qwen \
24+ --cache-dir $ROOT_DIR /cache \
25+ --embedding-key model.embed_tokens.weight \
26+ --tp-size $TP_SIZE \
27+ --sglang-mem-fraction-static 0.5 \
28+ --build-dataset-num-proc $BUILD_DATASET_NUM_PROC \
29+ --target-model-backend sglang
Original file line number Diff line number Diff line change 11pre-commit
2- torch==2.8.0
3- torchaudio==2.8.0
4- torchvision==0.23.0
2+ torch==2.9.1
3+ torchaudio==2.9.1
4+ torchvision==0.24.1
55transformers == 4.57.1
66qwen-vl-utils == 0.0.11
77datasets
1212numpy
1313accelerate
1414pydantic
15- sglang[all]==0.5.5
15+ sglang[all]==0.5.6
1616openai-harmony
Original file line number Diff line number Diff line change 88from sglang .srt .configs .model_config import ModelConfig
99from sglang .srt .managers .schedule_batch import Req , ScheduleBatch
1010from sglang .srt .managers .scheduler import Scheduler
11+ from sglang .srt .mem_cache .cache_init_params import CacheInitParams
1112from sglang .srt .mem_cache .radix_cache import RadixCache
1213from sglang .srt .model_executor .forward_batch_info import CaptureHiddenMode , ForwardBatch
1314from sglang .srt .sampling .sampling_params import SamplingParams
@@ -306,11 +307,13 @@ def _extend(
306307 module .return_last_hidden_states = return_last_hidden_states
307308 module .return_logits = return_logits
308309
309- tree_cache = RadixCache (
310- None ,
310+ cache_params = CacheInitParams (
311+ disable = False ,
312+ req_to_token_pool = self .model_runner .req_to_token_pool ,
311313 token_to_kv_pool_allocator = self .model_runner .token_to_kv_pool_allocator ,
312314 page_size = self .model_runner .server_args .page_size ,
313315 )
316+ tree_cache = RadixCache (cache_params )
314317
315318 batch = ScheduleBatch .init_new (
316319 reqs = reqs ,
You can’t perform that action at this time.
0 commit comments