Skip to content

Commit 381476b

Browse files
authored
update sglang && support qwen3 next (#355)
* support qwen3 next * fix bug * fix bug * update sglang
1 parent e012016 commit 381476b

4 files changed

Lines changed: 38 additions & 13 deletions

File tree

configs/qwen3-next-80b-a3b-eagle3.json

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,6 @@
22
"architectures": [
33
"LlamaForCausalLMEagle3"
44
],
5-
"eagle_config": {
6-
"eagle_aux_hidden_state_layer_ids": [
7-
1,
8-
23,
9-
45
10-
]
11-
},
125
"attention_dropout": 0.0,
136
"bos_token_id": 151643,
147
"decoder_sparse_step": 1,
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
#!/bin/bash
# Launch scripts/train_eagle3.py under torchrun with the qwen3-next-80b-a3b
# EAGLE3 draft-model config and the sglang target-model backend.
#
# Usage: <this script> [NUM_GPUS]
#   NUM_GPUS                 positional arg 1; value for --nproc_per_node (default: 8)
#   BUILD_DATASET_NUM_PROC   env var; value for --build-dataset-num-proc (default: 64)

# Resolve the directory containing this script, then its parent (ROOT_DIR).
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"

NUM_GPUS=${1:-8}
TP_SIZE=4
BUILD_DATASET_NUM_PROC=${BUILD_DATASET_NUM_PROC:-64}

# Every expansion is quoted so a checkout path containing spaces stays one
# argument. Each continuation backslash is preceded by a space: a backslash
# glued to the previous word joins that word with the start of the next line.
torchrun \
    --standalone \
    --nproc_per_node "$NUM_GPUS" \
    "$ROOT_DIR/scripts/train_eagle3.py" \
    --target-model-path "$ROOT_DIR/Qwen/Qwen3-Next-80B-A3B-Instruct-FP8/" \
    --draft-model-config "$ROOT_DIR/configs/qwen3-next-80b-a3b-eagle3.json" \
    --train-data-path "$ROOT_DIR/data_qwen80b/qwen3_80b_perfectblend_train_regen.jsonl" \
    --output-dir "$ROOT_DIR/qwen3-80b-regen-blend" \
    --num-epochs 2 \
    --batch-size 2 \
    --learning-rate 1e-4 \
    --max-length 4096 \
    --chat-template qwen \
    --cache-dir "$ROOT_DIR/cache" \
    --embedding-key model.embed_tokens.weight \
    --tp-size "$TP_SIZE" \
    --sglang-mem-fraction-static 0.5 \
    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
    --target-model-backend sglang

requirements.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
pre-commit
2-
torch==2.8.0
3-
torchaudio==2.8.0
4-
torchvision==0.23.0
2+
torch==2.9.1
3+
torchaudio==2.9.1
4+
torchvision==0.24.1
55
transformers==4.57.1
66
qwen-vl-utils==0.0.11
77
datasets
@@ -12,5 +12,5 @@ psutil
1212
numpy
1313
accelerate
1414
pydantic
15-
sglang[all]==0.5.5
15+
sglang[all]==0.5.6
1616
openai-harmony

specforge/modeling/target/eagle3_target_model.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from sglang.srt.configs.model_config import ModelConfig
99
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
1010
from sglang.srt.managers.scheduler import Scheduler
11+
from sglang.srt.mem_cache.cache_init_params import CacheInitParams
1112
from sglang.srt.mem_cache.radix_cache import RadixCache
1213
from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardBatch
1314
from sglang.srt.sampling.sampling_params import SamplingParams
@@ -306,11 +307,13 @@ def _extend(
306307
module.return_last_hidden_states = return_last_hidden_states
307308
module.return_logits = return_logits
308309

309-
tree_cache = RadixCache(
310-
None,
310+
cache_params = CacheInitParams(
311+
disable=False,
312+
req_to_token_pool=self.model_runner.req_to_token_pool,
311313
token_to_kv_pool_allocator=self.model_runner.token_to_kv_pool_allocator,
312314
page_size=self.model_runner.server_args.page_size,
313315
)
316+
tree_cache = RadixCache(cache_params)
314317

315318
batch = ScheduleBatch.init_new(
316319
reqs=reqs,

0 commit comments

Comments
 (0)