Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def evaluate(models: list[str],
for idx, ori_model in enumerate(models):
print()
print(50 * '==')
print(f'Start evaluating {idx+1}/{num_model} {ori_model} ...')
print(f'Start evaluating {idx + 1}/{num_model} {ori_model} ...')
model = ori_model.lower()

lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])
Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/eval_chat_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
dict(role='HUMAN', begin='[INST] ', end=' [/INST]'),
dict(role='BOT', begin='', end='', generate=True),
],
eos_token_id=2)
eos_token_id=2)

MAX_SESSION_LEN = 2048
MAX_NEW_TOKENS = 1024
Expand Down
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
repos:
- repo: https://github.com/hhatto/autopep8
rev: v2.3.2
hooks:
- id: autopep8
Comment thread
windreamer marked this conversation as resolved.
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.4
hooks:
Expand Down
16 changes: 8 additions & 8 deletions autotest/interface/restful/test_restful_chat_completions_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,14 @@ def test_array_stopwords_streaming(self, backend, model_case):
@pytest.mark.internlm2_5
def test_special_words(self, backend, model_case):
message = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' \
'<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
'能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
'发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \
'如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \
'难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \
'机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \
'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \
'计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
'<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
'能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
'发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \
'如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \
'难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \
'机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \
'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \
'计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
api_client = APIClient(BASE_URL)
model_name = api_client.available_models[0]
for output in api_client.chat_completions_v1(model=model_name,
Expand Down
2 changes: 1 addition & 1 deletion autotest/interface/restful/test_restful_generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,7 +748,7 @@ def single_request(idx):

success_rate = success_count / 20
assert success_rate == 1.0, \
f'Stress test failed: success rate {success_rate*100}% < 80%'
f'Stress test failed: success rate {success_rate * 100}% < 80%'

if success_count > 0:
avg_latency = total_latency / success_count
Expand Down
2 changes: 1 addition & 1 deletion autotest/tools/pipeline/mllm_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def load_video(video_path, bound=None, num_segments=32):

question = ''
for i in range(len(imgs)):
question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n'
question = question + f'Frame{i + 1}: {IMAGE_TOKEN}\n'

if lang == 'cn':
question += '视频里有什么动物,它在做什么?'
Expand Down
2 changes: 1 addition & 1 deletion lmdeploy/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def serve(model_path: str,
This function has been removed. Please use alternative methods.

This will run the api_server in a subprocess.
""" # noqa E501
""" # noqa E501
raise NotImplementedError("The 'serve' function is no longer available. "
'This function has been deprecated and removed.')

Expand Down
11 changes: 4 additions & 7 deletions lmdeploy/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def get_lora_adapters(adapters: list[str]):
else:
for pair in adapters:
assert '=' in pair, f'Multiple lora paths must in format of ' \
f'xxx=yyy. But given: {pair}'
f'xxx=yyy. But given: {pair}'
name, path = pair.strip().split('=', 1)
assert name not in output, f'Multiple lora paths with repeated lora name: {name}'
output[name] = path
Expand Down Expand Up @@ -420,8 +420,7 @@ def calib_batchsize(parser):
'--batch-size',
type=int,
default=1,
help=\
'The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM' # noqa
help='The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM' # noqa
)

@staticmethod
Expand All @@ -432,8 +431,7 @@ def calib_search_scale(parser):
'--search-scale',
action='store_true',
default=False,
help=\
'Whether search scale ratio. Default to be disabled, which means only smooth quant with 0.5 ratio will be applied' # noqa
help='Whether search scale ratio. Default to be disabled, which means only smooth quant with 0.5 ratio will be applied' # noqa
)

@staticmethod
Expand All @@ -454,8 +452,7 @@ def chat_template(parser):
'--chat-template',
type=str,
default=None,
help=\
'A JSON file or string that specifies the chat template configuration. ' # noqa
help='A JSON file or string that specifies the chat template configuration. ' # noqa
'Please refer to https://lmdeploy.readthedocs.io/en/latest/advance/chat_template.html for the specification' # noqa
)

Expand Down
16 changes: 8 additions & 8 deletions lmdeploy/lite/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@

from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs
from .cal_qparams import (
QParams,
cal_qparams_per_channel_absmax,
cal_qparams_per_channel_minmax,
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
cal_qparams_per_tensor_minmax,
precise_round,
QParams,
cal_qparams_per_channel_absmax,
cal_qparams_per_channel_minmax,
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
cal_qparams_per_tensor_minmax,
precise_round,
)
from .calib_dataloader import get_calib_loaders
from .collect import bimap_name_mod, collect_target_modules, collect_target_weights
Expand Down
14 changes: 7 additions & 7 deletions lmdeploy/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,16 +451,16 @@ def __post_init__(self):
assert self.quant_policy in (0, 4, 8), 'invalid quant_policy'
assert self.device_type in ['cuda', 'ascend', 'maca', 'camb'], (f'invalid device_type: {self.device_type}')
assert self.kernel_block_size >= 16 and \
(self.kernel_block_size & (self.kernel_block_size - 1)) == 0, \
f'kernel_block_size must be >= 16 and a power of 2, but got {self.kernel_block_size}'
(self.kernel_block_size & (self.kernel_block_size - 1)) == 0, \
f'kernel_block_size must be >= 16 and a power of 2, but got {self.kernel_block_size}'
assert self.block_size >= self.kernel_block_size and \
self.block_size % self.kernel_block_size == 0, \
(f'block_size must be >= kernel_block_size and an integer multiple '
f'of kernel_block_size, but got block_size {self.block_size} '
f'and kernel_block_size {self.kernel_block_size}')
self.block_size % self.kernel_block_size == 0, \
(f'block_size must be >= kernel_block_size and an integer multiple '
f'of kernel_block_size, but got block_size {self.block_size} '
f'and kernel_block_size {self.kernel_block_size}')
if self.quant_policy > 0 and self.device_type not in ['cuda', 'ascend']:
assert False, \
'kv cache quantization only works for CUDA and ASCEND.'
'kv cache quantization only works for CUDA and ASCEND.'
if self.device_type == 'camb' and self.block_size != 16:
self.block_size = 16
logger.warning('Currently, camb device requires block size to be 16, \
Expand Down
4 changes: 2 additions & 2 deletions lmdeploy/metrics/loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,10 @@ def log(self):
f'{scheduler_stats.num_api_routed_reqs} / {scheduler_stats.num_api_waiting_reqs}, '
f'Engine (running/waiting): '
f'{scheduler_stats.num_running_reqs} / {scheduler_stats.num_waiting_reqs}, '
f'KV cache: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, ')
f'KV cache: {scheduler_stats.gpu_cache_usage * 100:.1f}%, ')

if scheduler_stats.prefix_cache_hit_rate != 0:
log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%, '
log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100:.1f}%, '

if spec_msg is not None:
log_msg += spec_msg
Expand Down
4 changes: 2 additions & 2 deletions lmdeploy/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,10 @@ def get_prompt(self, prompt, sequence_start=True):
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
f'{self.assistant}'
else:
return f'{self.separator}{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
f'{self.assistant}'

def messages2prompt(self, messages, sequence_start=True, **kwargs):
"""Return the prompt that is concatenated with other elements in the
Expand Down
3 changes: 2 additions & 1 deletion lmdeploy/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,8 @@ def infer(self,
res = res.extend(out) if res else out
outputs.append(res)
finally:
if pbar: pbar.close() # noqa
if pbar:
pbar.close() # noqa
if is_single:
return outputs[0]
return outputs
Expand Down
8 changes: 4 additions & 4 deletions lmdeploy/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ def save_csv(self, csv_file: str, hyperparams):
f'{self.rps:.3f}',
f'{(self.input_throughput):.3f}',
f'{self.output_throughput:.3f}',
f'{self.e2e_mean*1000:.3f}',
f'{self.ttft_mean*1000:.3f}' if self.stream_output else '-',
f'{self.tpot_mean*1000:.3f}',
f'{self.itls_mean*1000:.3f}' if self.stream_output else '-',
f'{self.e2e_mean * 1000:.3f}',
f'{self.ttft_mean * 1000:.3f}' if self.stream_output else '-',
f'{self.tpot_mean * 1000:.3f}',
f'{self.itls_mean * 1000:.3f}' if self.stream_output else '-',
])
2 changes: 1 addition & 1 deletion lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def get_total_slots():
is_prefill_no_cache = False
if not step_context.is_decoding:
is_prefill_no_cache = \
all((step_context.q_seqlens ==
all((step_context.q_seqlens ==
step_context.kv_seqlens).tolist())
q_start_loc = step_context.q_start_loc
cu_seqlens = torch.cat((q_start_loc, step_context.q_seqlens.sum().unsqueeze(0))).int()
Expand Down
2 changes: 1 addition & 1 deletion lmdeploy/pytorch/engine/model_agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,7 +1180,7 @@ def wakeup(self, tags: list[str] | None = None):
if 'weights' in tags:
device = next(self.patched_model.get_model().parameters()).device
assert device.type in ['cpu', 'meta']
spec_model = self.spec_agent.get_model()
spec_model = self.spec_agent.get_model()

if device.type == 'cpu':
self.patched_model.get_model().to(torch.cuda.current_device())
Expand Down
8 changes: 4 additions & 4 deletions lmdeploy/pytorch/kernels/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.

from .w8a8_triton_kernels import (
matmul_kernel_dynamic_quant,
per_channel_quant,
per_token_quant_int8,
rms_norm_dynamic_quant,
matmul_kernel_dynamic_quant,
per_channel_quant,
per_token_quant_int8,
rms_norm_dynamic_quant,
)

__all__ = [
Expand Down
4 changes: 2 additions & 2 deletions lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def _gemm_fp8_tma_pre_hook(nargs):
'BLOCK_N': 64,
}, num_stages=3, num_warps=4, pre_hook=_gemm_fp8_tma_pre_hook)
],
key=['N', 'K'])
key=['N', 'K'])
@triton.jit
def _gemm_fp8_tma_kernel(
desc_a,
Expand Down Expand Up @@ -296,7 +296,7 @@ def _gemm_fp8_tma_kernel(
'BLOCK_N': 64,
}, num_stages=3, num_warps=4)
],
key=['N', 'K'])
key=['N', 'K'])
@triton.jit
def _gemm_fp8_kernel(
A,
Expand Down
2 changes: 1 addition & 1 deletion lmdeploy/pytorch/kernels/cuda/causal_conv1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def causal_conv1d_fwd_main(
init_col = k_val + w
if init_col < width - 1:
out_vals[i] += w_local[w] * T.cast(Init_states[seq_idx_cur, c_idx, init_col],
T.float32)
T.float32)
else:
for w in T.unroll(width):
out_vals[i] += T.if_then_else(seq_idx_local[i + w] == seq_idx_cur,
Expand Down
32 changes: 16 additions & 16 deletions lmdeploy/pytorch/kernels/cuda/fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,66 +17,66 @@ def get_cuda_autotune_config():
'BLOCK_SIZE_K': 64,
'GROUP_SIZE_M': 1,
},
num_stages=3,
num_warps=8),
num_stages=3,
num_warps=8),
triton.Config({
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 256,
'BLOCK_SIZE_K': 32,
'GROUP_SIZE_M': 1,
},
num_stages=4,
num_warps=4),
num_stages=4,
num_warps=4),
# SM8
triton.Config({
'BLOCK_SIZE_M': 128,
'BLOCK_SIZE_N': 128,
'BLOCK_SIZE_K': 32,
'GROUP_SIZE_M': 1,
},
num_stages=4,
num_warps=4),
num_stages=4,
num_warps=4),
triton.Config({
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 256,
'BLOCK_SIZE_K': 32,
'GROUP_SIZE_M': 1,
},
num_stages=4,
num_warps=4),
num_stages=4,
num_warps=4),
triton.Config({
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 128,
'BLOCK_SIZE_K': 64,
'GROUP_SIZE_M': 1,
},
num_stages=4,
num_warps=4),
num_stages=4,
num_warps=4),
# SM7-
triton.Config({
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 128,
'BLOCK_SIZE_K': 32,
'GROUP_SIZE_M': 1,
},
num_stages=4,
num_warps=4),
num_stages=4,
num_warps=4),
triton.Config({
'BLOCK_SIZE_M': 128,
'BLOCK_SIZE_N': 32,
'BLOCK_SIZE_K': 32,
'GROUP_SIZE_M': 1,
},
num_stages=4,
num_warps=4),
num_stages=4,
num_warps=4),
triton.Config({
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 32,
'BLOCK_SIZE_K': 32,
'GROUP_SIZE_M': 1,
},
num_stages=5,
num_warps=2),
num_stages=5,
num_warps=2),
]


Expand Down
4 changes: 2 additions & 2 deletions lmdeploy/pytorch/kernels/cuda/rms_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ def test_rms_norm(bsz, ctx_len, feat_len, dtype):

torch_cost = (t1 - t0) / N_REPEATS * 1000
triton_cost = (t2 - t1) / N_REPEATS * 1000
print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n' \
f' torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')
print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n'
f' torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')

test_rms_norm(1, 8128, 5120, torch.float16)
test_rms_norm(1, 8128, 5120, torch.float32)
Expand Down
Loading
Loading