InternLM · windreamer · Apr 14, 2026 · Apr 14, 2026
diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py
@@ -94,7 +94,7 @@ def evaluate(models: list[str],
     for idx, ori_model in enumerate(models):
         print()
         print(50 * '==')
-        print(f'Start evaluating {idx+1}/{num_model} {ori_model} ...')
+        print(f'Start evaluating {idx + 1}/{num_model} {ori_model} ...')
         model = ori_model.lower()
 
         lmdeploy_dir = os.path.abspath(os.environ['LMDEPLOY_DIR'])

diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py
@@ -133,7 +133,7 @@
     dict(role='HUMAN', begin='[INST] ', end=' [/INST]'),
     dict(role='BOT', begin='', end='', generate=True),
 ],
-                            eos_token_id=2)
+    eos_token_id=2)
 
 MAX_SESSION_LEN = 2048
 MAX_NEW_TOKENS = 1024

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,4 +1,8 @@
 repos:
+  - repo: https://github.com/hhatto/autopep8
+    rev: v2.3.2
+    hooks:
+      - id: autopep8
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.15.4
     hooks:

diff --git a/autotest/interface/restful/test_restful_chat_completions_v1.py b/autotest/interface/restful/test_restful_chat_completions_v1.py
@@ -227,14 +227,14 @@ def test_array_stopwords_streaming(self, backend, model_case):
     @pytest.mark.internlm2_5
     def test_special_words(self, backend, model_case):
         message = '<|im_start|>system\n当开启工具以及代码时，根据需求选择合适的工具进行调用\n' \
-                '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
-                '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
-                '发送含有 Python >代码的消息时，它将在该环境中执行。这个工具适用于多种场景，' \
-                '如数据分析或处理（包括数据操作、统计分析、图表绘制），复杂的计算问题（解决数学和物理' \
-                '难题），编程示例（理解编程概念或特性），文本处理和分析（比如文本解析和自然语言处理），' \
-                '机器学习和数据科学（用于展示模型训练和数据可视化），以及文件操作和数据导入（处理CSV、' \
-                'JSON等格式的文件）。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$，' \
-                '计算曲线积分：$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
+            '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
+            '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
+            '发送含有 Python >代码的消息时，它将在该环境中执行。这个工具适用于多种场景，' \
+            '如数据分析或处理（包括数据操作、统计分析、图表绘制），复杂的计算问题（解决数学和物理' \
+            '难题），编程示例（理解编程概念或特性），文本处理和分析（比如文本解析和自然语言处理），' \
+            '机器学习和数据科学（用于展示模型训练和数据可视化），以及文件操作和数据导入（处理CSV、' \
+            'JSON等格式的文件）。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$，' \
+            '计算曲线积分：$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
         api_client = APIClient(BASE_URL)
         model_name = api_client.available_models[0]
         for output in api_client.chat_completions_v1(model=model_name,

diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py
@@ -748,7 +748,7 @@ def single_request(idx):
 
         success_rate = success_count / 20
         assert success_rate == 1.0, \
-            f'Stress test failed: success rate {success_rate*100}% < 80%'
+            f'Stress test failed: success rate {success_rate * 100}% < 80%'
 
         if success_count > 0:
             avg_latency = total_latency / success_count

diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py
@@ -210,7 +210,7 @@ def load_video(video_path, bound=None, num_segments=32):
 
     question = ''
     for i in range(len(imgs)):
-        question = question + f'Frame{i+1}: {IMAGE_TOKEN}\n'
+        question = question + f'Frame{i + 1}: {IMAGE_TOKEN}\n'
 
     if lang == 'cn':
         question += '视频里有什么动物，它在做什么？'

diff --git a/lmdeploy/api.py b/lmdeploy/api.py
@@ -95,7 +95,7 @@ def serve(model_path: str,
         This function has been removed. Please use alternative methods.
 
     This will run the api_server in a subprocess.
-    """ # noqa E501
+    """  # noqa E501
     raise NotImplementedError("The 'serve' function is no longer available. "
                               'This function has been deprecated and removed.')
 

diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
@@ -61,7 +61,7 @@ def get_lora_adapters(adapters: list[str]):
     else:
         for pair in adapters:
             assert '=' in pair, f'Multiple lora paths must in format of ' \
-                                 f'xxx=yyy. But given: {pair}'
+                f'xxx=yyy. But given: {pair}'
             name, path = pair.strip().split('=', 1)
             assert name not in output, f'Multiple lora paths with repeated lora name: {name}'
             output[name] = path
@@ -420,8 +420,7 @@ def calib_batchsize(parser):
             '--batch-size',
             type=int,
             default=1,
-            help=\
-            'The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM'  # noqa
+            help='The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM'  # noqa
         )
 
     @staticmethod
@@ -432,8 +431,7 @@ def calib_search_scale(parser):
             '--search-scale',
             action='store_true',
             default=False,
-            help=\
-            'Whether search scale ratio. Default to be disabled, which means only smooth quant with 0.5 ratio will be applied'  # noqa
+            help='Whether search scale ratio. Default to be disabled, which means only smooth quant with 0.5 ratio will be applied'  # noqa
         )
 
     @staticmethod
@@ -454,8 +452,7 @@ def chat_template(parser):
             '--chat-template',
             type=str,
             default=None,
-            help=\
-            'A JSON file or string that specifies the chat template configuration. '  # noqa
+            help='A JSON file or string that specifies the chat template configuration. '  # noqa
             'Please refer to https://lmdeploy.readthedocs.io/en/latest/advance/chat_template.html for the specification'  # noqa
         )
 

diff --git a/lmdeploy/lite/utils/__init__.py b/lmdeploy/lite/utils/__init__.py
@@ -2,14 +2,14 @@
 
 from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs
 from .cal_qparams import (
-                          QParams,
-                          cal_qparams_per_channel_absmax,
-                          cal_qparams_per_channel_minmax,
-                          cal_qparams_per_group_absmax,
-                          cal_qparams_per_group_minmax,
-                          cal_qparams_per_tensor_absmax,
-                          cal_qparams_per_tensor_minmax,
-                          precise_round,
+    QParams,
+    cal_qparams_per_channel_absmax,
+    cal_qparams_per_channel_minmax,
+    cal_qparams_per_group_absmax,
+    cal_qparams_per_group_minmax,
+    cal_qparams_per_tensor_absmax,
+    cal_qparams_per_tensor_minmax,
+    precise_round,
 )
 from .calib_dataloader import get_calib_loaders
 from .collect import bimap_name_mod, collect_target_modules, collect_target_weights

diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
@@ -451,16 +451,16 @@ def __post_init__(self):
         assert self.quant_policy in (0, 4, 8), 'invalid quant_policy'
         assert self.device_type in ['cuda', 'ascend', 'maca', 'camb'], (f'invalid device_type: {self.device_type}')
         assert self.kernel_block_size >= 16 and \
-               (self.kernel_block_size & (self.kernel_block_size - 1)) == 0, \
-               f'kernel_block_size must be >= 16 and a power of 2, but got {self.kernel_block_size}'
+            (self.kernel_block_size & (self.kernel_block_size - 1)) == 0, \
+            f'kernel_block_size must be >= 16 and a power of 2, but got {self.kernel_block_size}'
         assert self.block_size >= self.kernel_block_size and \
-               self.block_size % self.kernel_block_size == 0, \
-               (f'block_size must be >= kernel_block_size and an integer multiple '
-                f'of kernel_block_size, but got block_size {self.block_size} '
-                f'and kernel_block_size {self.kernel_block_size}')
+            self.block_size % self.kernel_block_size == 0, \
+            (f'block_size must be >= kernel_block_size and an integer multiple '
+             f'of kernel_block_size, but got block_size {self.block_size} '
+             f'and kernel_block_size {self.kernel_block_size}')
         if self.quant_policy > 0 and self.device_type not in ['cuda', 'ascend']:
             assert False, \
-                   'kv cache quantization only works for CUDA and ASCEND.'
+                'kv cache quantization only works for CUDA and ASCEND.'
         if self.device_type == 'camb' and self.block_size != 16:
             self.block_size = 16
             logger.warning('Currently, camb device requires block size to be 16, \

diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py
@@ -116,10 +116,10 @@ def log(self):
                    f'{scheduler_stats.num_api_routed_reqs} / {scheduler_stats.num_api_waiting_reqs}, '
                    f'Engine (running/waiting): '
                    f'{scheduler_stats.num_running_reqs} / {scheduler_stats.num_waiting_reqs}, '
-                   f'KV cache: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, ')
+                   f'KV cache: {scheduler_stats.gpu_cache_usage * 100:.1f}%, ')
 
         if scheduler_stats.prefix_cache_hit_rate != 0:
-            log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%, '
+            log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100:.1f}%, '
 
         if spec_msg is not None:
             log_msg += spec_msg

diff --git a/lmdeploy/model.py b/lmdeploy/model.py
@@ -162,10 +162,10 @@ def get_prompt(self, prompt, sequence_start=True):
                     f'{self.assistant}'
             else:
                 return f'{self.user}{prompt}{self.eoh}' \
-                       f'{self.assistant}'
+                    f'{self.assistant}'
         else:
             return f'{self.separator}{self.user}{prompt}{self.eoh}' \
-                   f'{self.assistant}'
+                f'{self.assistant}'
 
     def messages2prompt(self, messages, sequence_start=True, **kwargs):
         """Return the prompt that is concatenated with other elements in the

diff --git a/lmdeploy/pipeline.py b/lmdeploy/pipeline.py
@@ -122,7 +122,8 @@ def infer(self,
                     res = res.extend(out) if res else out
                 outputs.append(res)
         finally:
-            if pbar: pbar.close()  # noqa
+            if pbar:
+                pbar.close()  # noqa
         if is_single:
             return outputs[0]
         return outputs

diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py
@@ -166,8 +166,8 @@ def save_csv(self, csv_file: str, hyperparams):
                 f'{self.rps:.3f}',
                 f'{(self.input_throughput):.3f}',
                 f'{self.output_throughput:.3f}',
-                f'{self.e2e_mean*1000:.3f}',
-                f'{self.ttft_mean*1000:.3f}' if self.stream_output else '-',
-                f'{self.tpot_mean*1000:.3f}',
-                f'{self.itls_mean*1000:.3f}' if self.stream_output else '-',
+                f'{self.e2e_mean * 1000:.3f}',
+                f'{self.ttft_mean * 1000:.3f}' if self.stream_output else '-',
+                f'{self.tpot_mean * 1000:.3f}',
+                f'{self.itls_mean * 1000:.3f}' if self.stream_output else '-',
             ])
diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
@@ -55,7 +55,7 @@ def get_total_slots():
         is_prefill_no_cache = False
         if not step_context.is_decoding:
             is_prefill_no_cache = \
-               all((step_context.q_seqlens ==
+                all((step_context.q_seqlens ==
                     step_context.kv_seqlens).tolist())
         q_start_loc = step_context.q_start_loc
         cu_seqlens = torch.cat((q_start_loc, step_context.q_seqlens.sum().unsqueeze(0))).int()

diff --git a/lmdeploy/pytorch/engine/model_agent/agent.py b/lmdeploy/pytorch/engine/model_agent/agent.py
@@ -1180,7 +1180,7 @@ def wakeup(self, tags: list[str] | None = None):
         if 'weights' in tags:
             device = next(self.patched_model.get_model().parameters()).device
             assert device.type in ['cpu', 'meta']
-            spec_model =  self.spec_agent.get_model()
+            spec_model = self.spec_agent.get_model()
 
             if device.type == 'cpu':
                 self.patched_model.get_model().to(torch.cuda.current_device())

diff --git a/lmdeploy/pytorch/kernels/__init__.py b/lmdeploy/pytorch/kernels/__init__.py
@@ -1,10 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 from .w8a8_triton_kernels import (
-                                  matmul_kernel_dynamic_quant,
-                                  per_channel_quant,
-                                  per_token_quant_int8,
-                                  rms_norm_dynamic_quant,
+    matmul_kernel_dynamic_quant,
+    per_channel_quant,
+    per_token_quant_int8,
+    rms_norm_dynamic_quant,
 )
 
 __all__ = [

diff --git a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
@@ -208,7 +208,7 @@ def _gemm_fp8_tma_pre_hook(nargs):
         'BLOCK_N': 64,
     }, num_stages=3, num_warps=4, pre_hook=_gemm_fp8_tma_pre_hook)
 ],
-                 key=['N', 'K'])
+    key=['N', 'K'])
 @triton.jit
 def _gemm_fp8_tma_kernel(
     desc_a,
@@ -296,7 +296,7 @@ def _gemm_fp8_tma_kernel(
         'BLOCK_N': 64,
     }, num_stages=3, num_warps=4)
 ],
-                 key=['N', 'K'])
+    key=['N', 'K'])
 @triton.jit
 def _gemm_fp8_kernel(
     A,

diff --git a/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py b/lmdeploy/pytorch/kernels/cuda/causal_conv1d.py
@@ -121,7 +121,7 @@ def causal_conv1d_fwd_main(
                             init_col = k_val + w
                             if init_col < width - 1:
                                 out_vals[i] += w_local[w] * T.cast(Init_states[seq_idx_cur, c_idx, init_col],
-                                                                    T.float32)
+                                                                   T.float32)
                 else:
                     for w in T.unroll(width):
                         out_vals[i] += T.if_then_else(seq_idx_local[i + w] == seq_idx_cur,

diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe.py b/lmdeploy/pytorch/kernels/cuda/fused_moe.py
@@ -17,66 +17,66 @@ def get_cuda_autotune_config():
             'BLOCK_SIZE_K': 64,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=3,
-                      num_warps=8),
+            num_stages=3,
+            num_warps=8),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 256,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         # SM8
         triton.Config({
             'BLOCK_SIZE_M': 128,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 256,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 64,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         # SM7-
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 128,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 128,
             'BLOCK_SIZE_N': 32,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=4,
-                      num_warps=4),
+            num_stages=4,
+            num_warps=4),
         triton.Config({
             'BLOCK_SIZE_M': 64,
             'BLOCK_SIZE_N': 32,
             'BLOCK_SIZE_K': 32,
             'GROUP_SIZE_M': 1,
         },
-                      num_stages=5,
-                      num_warps=2),
+            num_stages=5,
+            num_warps=2),
     ]
 
 

diff --git a/lmdeploy/pytorch/kernels/cuda/rms_norm.py b/lmdeploy/pytorch/kernels/cuda/rms_norm.py
@@ -191,8 +191,8 @@ def test_rms_norm(bsz, ctx_len, feat_len, dtype):
 
         torch_cost = (t1 - t0) / N_REPEATS * 1000
         triton_cost = (t2 - t1) / N_REPEATS * 1000
-        print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n' \
-                f'  torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')
+        print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n'
+              f'  torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')
 
     test_rms_norm(1, 8128, 5120, torch.float16)
     test_rms_norm(1, 8128, 5120, torch.float32)