From dab681875b0af6c5c08378cf451b56f8b162ed7e Mon Sep 17 00:00:00 2001
From: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Date: Thu, 5 Feb 2026 17:37:05 +0800
Subject: [PATCH] Revert "[Cherry-Pick]Support Norm before Rope(#6332) (#6333)"

This reverts commit a62d003a112ba6d73d02abe4efe9c7bce15aa0e6.
---
 .../layers/attention/append_attn_backend.py   |  12 +-
 .../layers/attention/attention.py             |   3 -
 .../layers/attention/flash_attn_backend.py    |  50 +--
 .../attention/flash_mask_attn_backend.py      |  12 +-
 .../model_executor/layers/normalization.py    |  89 +----
 .../model_executor/ops/triton_ops/__init__.py |   8 +-
 .../ops/triton_ops/qk_rmsnorm_fused_kernel.py | 130 -------
 .../ops/triton_ops/triton_utils.py            |  11 -
 tests/layers/test_normalization.py            | 326 ------------------
 tests/operators/test_qk_rmsnorm_fused.py      | 133 -------
 10 files changed, 16 insertions(+), 758 deletions(-)
 delete mode 100644 fastdeploy/model_executor/ops/triton_ops/qk_rmsnorm_fused_kernel.py
 delete mode 100644 tests/layers/test_normalization.py
 delete mode 100644 tests/operators/test_qk_rmsnorm_fused.py

diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
index eb6147561fb..4608bd81e92 100644
--- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -229,10 +229,6 @@ def forward_mixed(
 
         sliding_window = layer.sliding_window
 
-        norm_after_rope_in_kernel = not getattr(layer, "qk_norm_before_rope", False)
-        q_norm_weight = getattr(layer, "q_norm_weight", None) if norm_after_rope_in_kernel else None
-        k_norm_weight = getattr(layer, "k_norm_weight", None) if norm_after_rope_in_kernel else None
-
         if self.pd_disaggregation_mode == "per_query":
             metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise(
                 metadata.kv_signal_metadata,
@@ -344,8 +340,8 @@ def forward_mixed(
                 layer.linear_smooth,
                 forward_meta.attn_mask_offsets,
                 metadata.kv_signal_data_list[layer.layer_id],
-                q_norm_weight,
-                k_norm_weight,
+                getattr(layer, "q_norm_weight", None),
+                getattr(layer, "k_norm_weight", None),
                 getattr(layer, "sinks", None),
                 getattr(layer, "rms_norm_eps", 1e-6),
                 metadata._fuse_kernel_compute_dtype,
@@ -400,8 +396,8 @@ def forward_mixed(
                 layer.linear_smooth,
                 forward_meta.attn_mask_offsets,
                 metadata.kv_signal_data_list[layer.layer_id],
-                q_norm_weight,
-                k_norm_weight,
+                getattr(layer, "q_norm_weight", None),
+                getattr(layer, "k_norm_weight", None),
                 getattr(layer, "sinks", None),
                 getattr(layer, "rms_norm_eps", 1e-6),
                 metadata._fuse_kernel_compute_dtype,
diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py
index 7249092d590..a5ac1876e34 100644
--- a/fastdeploy/model_executor/layers/attention/attention.py
+++ b/fastdeploy/model_executor/layers/attention/attention.py
@@ -59,7 +59,6 @@ def __init__(
         linear_smooth: paddle.Tensor = None,
         use_neox_rotary_style: bool = False,
         use_qk_norm: bool = False,
-        qk_norm_before_rope: bool = False,
         rms_norm_eps: float = 1e-6,
         with_sinks: bool = False,
     ) -> None:
@@ -77,7 +76,6 @@ def __init__(
             linear_shift (Optional[paddle.Tensor], optional): The shift of linear. Defaults to None.
             linear_smooth (Optional[paddle.Tensor], optional): The smooth of linear. Defaults to None.
             use_qk_norm (bool, optional): Whether to apply rmsnorm on QA after rope. Defaults to False.
-            qk_norm_before_rope (bool, optional): Whether to apply rmsnorm before rope (e.g., Qwen style). Defaults to False. if True, use_qk_norm should also be True.
             rms_norm_eps (float, optional): The epsilon of RMSNorm. Defaults to 1e-6.
 
         Raises:
@@ -126,7 +124,6 @@ def __init__(
         else:
             logger.info(f"Attention is running in cache kv {self.quant_method.cache_quant_config.quant_type} mode")
         self.use_qk_norm = use_qk_norm
-        self.qk_norm_before_rope = qk_norm_before_rope
         self.rms_norm_eps = rms_norm_eps
         if self.use_qk_norm:
             self.q_norm_key = f"{self.prefix}.q_norm"
diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
index f247a5cc406..ef368a7234e 100644
--- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -249,48 +249,6 @@ def forward_mixed(
             layer.layer_id + self.start_layer_index,
         )
 
-        norm_after_rope_in_kernel = not getattr(layer, "qk_norm_before_rope", False)
-        q_norm_weight = getattr(layer, "q_norm_weight", None) if norm_after_rope_in_kernel else None
-        k_norm_weight = getattr(layer, "k_norm_weight", None) if norm_after_rope_in_kernel else None
-
-        if layer.layer_id == 0:
-            get_block_shape_and_split_kv_block(
-                forward_meta.seq_lens_encoder,
-                forward_meta.seq_lens_decoder,
-                forward_meta.seq_lens_this_time,
-                forward_meta.decoder_batch_ids,
-                forward_meta.decoder_tile_ids_per_batch,
-                forward_meta.decoder_num_blocks_cpu,
-                forward_meta.decoder_num_blocks_device,
-                forward_meta.decoder_chunk_size_device,
-                forward_meta.max_len_tensor_cpu,
-                forward_meta.encoder_batch_ids,
-                forward_meta.encoder_tile_ids_per_batch,
-                forward_meta.encoder_num_blocks_x_cpu,
-                forward_meta.kv_batch_ids,
-                forward_meta.kv_tile_ids_per_batch,
-                forward_meta.kv_num_blocks_x_cpu,
-                self.encoder_block_shape_q,
-                self.decoder_block_shape_q,
-                self.group_size,
-                self.block_size,
-            )
-
-        if forward_meta.max_len_tensor_cpu[1].item() > 0:
-            (
-                metadata.cu_seqlens_k,
-                metadata.pre_cache_batch_ids,
-                metadata.pre_cache_tile_ids_per_batch,
-                metadata.pre_cache_num_blocks_cpu,
-                metadata.kv_token_num_cpu,
-            ) = pre_cache_len_concat(
-                forward_meta.seq_lens_encoder,
-                forward_meta.seq_lens_decoder,
-                forward_meta.seq_lens_this_time,
-                forward_meta.max_len_tensor_cpu[2],
-                self.block_size,
-            )
-
         use_fa_do_prefill = forward_meta.max_len_tensor_cpu[1].item() > 0
 
         if use_fa_do_prefill:
@@ -312,8 +270,8 @@ def forward_mixed(
                 metadata.pre_cache_batch_ids,
                 metadata.pre_cache_tile_ids_per_batch,
                 metadata.pre_cache_num_blocks_cpu,
-                q_norm_weight,
-                k_norm_weight,
+                getattr(layer, "q_norm_weight", None),
+                getattr(layer, "k_norm_weight", None),
                 getattr(layer, "cache_k_scale", None),
                 getattr(layer, "cache_v_scale", None),
                 getattr(layer, "cache_k_out_scale", None),
@@ -375,8 +333,8 @@ def forward_mixed(
                 layer.linear_smooth,
                 forward_meta.attn_mask_offsets,
                 metadata.kv_signal_data_list[layer.layer_id],
-                q_norm_weight,
-                k_norm_weight,
+                getattr(layer, "q_norm_weight", None),
+                getattr(layer, "k_norm_weight", None),
                 getattr(layer, "sinks", None),
                 getattr(layer, "rms_norm_eps", 1e-6),
                 metadata._fuse_kernel_compute_dtype,
diff --git a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py
index bee89138f7a..ae21ca3ba9f 100644
--- a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py
@@ -181,10 +181,6 @@ def forward_mixed(
     ):
         metadata = forward_meta.attention_metadata
 
-        norm_after_rope_in_kernel = not getattr(layer, "qk_norm_before_rope", False)
-        q_norm_weight = getattr(layer, "q_norm_weight", None) if norm_after_rope_in_kernel else None
-        k_norm_weight = getattr(layer, "k_norm_weight", None) if norm_after_rope_in_kernel else None
-
         if self.pd_disaggregation_mode == "per_query":
             metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise(
                 metadata.kv_signal_metadata,
@@ -256,8 +252,8 @@ def forward_mixed(
                 forward_meta.pre_cache_batch_ids,
                 forward_meta.pre_cache_tile_ids_per_batch,
                 forward_meta.pre_cache_num_blocks_cpu,
-                q_norm_weight,
-                k_norm_weight,
+                getattr(layer, "q_norm_weight", None),
+                getattr(layer, "k_norm_weight", None),
                 getattr(layer, "cache_k_scale", None),
                 getattr(layer, "cache_v_scale", None),
                 getattr(layer, "cache_k_out_scale", None),
@@ -324,8 +320,8 @@ def forward_mixed(
                 layer.linear_smooth,
                 forward_meta.attn_mask_offsets,
                 metadata.kv_signal_data_list[layer.layer_id],
-                q_norm_weight,
-                k_norm_weight,
+                getattr(layer, "q_norm_weight", None),
+                getattr(layer, "k_norm_weight", None),
                 getattr(layer, "sinks", None),
                 getattr(layer, "rms_norm_eps", 1e-6),
                 metadata._fuse_kernel_compute_dtype,
diff --git a/fastdeploy/model_executor/layers/normalization.py b/fastdeploy/model_executor/layers/normalization.py
index abfea7c947c..a66172fc1b5 100644
--- a/fastdeploy/model_executor/layers/normalization.py
+++ b/fastdeploy/model_executor/layers/normalization.py
@@ -20,7 +20,6 @@
 import paddle
 from paddle import nn
 
-from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.platforms import current_platform
 
 if current_platform.is_gcu():
@@ -29,7 +28,7 @@
     from paddle.incubate.nn.functional import fused_layer_norm, fused_rms_norm
 
 from fastdeploy.config import FDConfig
-from fastdeploy.model_executor.ops.triton_ops import _TRITON_AVAILABLE, qk_rmsnorm_fused
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 from .utils import get_tensor
 
@@ -257,92 +256,6 @@ def forward(
         return out, residual_out
 
 
-class QKRMSNorm(nn.Layer):
-    """
-    QK Normalization layer.
-    """
-
-    def __init__(
-        self,
-        fd_config: FDConfig,
-        head_dim: int,
-        q_size: int,
-        kv_size: int,
-        eps: float = 1e-5,
-        prefix: str = "",
-        begin_norm_axis: int = 1,
-        dtype: str = None,
-    ) -> None:
-        super().__init__()
-        self.fd_config = fd_config
-        self.prefix: str = prefix
-        self.head_dim: int = head_dim
-        self.q_weight_key: Optional[str] = f"{prefix}.q_norm.weight"
-        self.k_weight_key: Optional[str] = f"{prefix}.k_norm.weight"
-        self.eps: float = eps
-        self._norm_weight_dtype = dtype
-        if self._norm_weight_dtype is None:
-            self._norm_weight_dtype = self._helper.get_default_dtype()
-        else:
-            assert dtype in [
-                "float32",
-                "bfloat16",
-                "float16",
-            ], f"Unsupported dtype: {dtype}. Must be one of: float32, bfloat16, float16"
-
-        self.q_size = q_size
-        self.kv_size = kv_size
-
-        self.q_norm = RMSNorm(
-            fd_config,
-            hidden_size=self.head_dim,
-            eps=fd_config.model_config.rms_norm_eps,
-            prefix=f"{prefix}.q_norm",
-            begin_norm_axis=begin_norm_axis,
-        )
-        self.k_norm = RMSNorm(
-            fd_config,
-            hidden_size=self.head_dim,
-            eps=fd_config.model_config.rms_norm_eps,
-            prefix=f"{prefix}.k_norm",
-            begin_norm_axis=begin_norm_axis,
-        )
-        self.qk_norm_fused = current_platform.is_cuda() and _TRITON_AVAILABLE
-
-    def load_state_dict(self, state_dict):
-        self.q_norm.load_state_dict(state_dict)
-        self.k_norm.load_state_dict(state_dict)
-
-    def forward(
-        self,
-        qkv_out,
-        forward_meta,
-    ) -> paddle.Tensor:
-        if self.qk_norm_fused and forward_meta.step_use_cudagraph:
-            qkv_out = qk_rmsnorm_fused(
-                qkv_out,
-                self.q_norm.weight,
-                self.k_norm.weight,
-                self.eps,
-                self.q_size,
-                self.kv_size,
-                self.head_dim,
-            )
-        else:
-            q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size], axis=-1)
-
-            q_by_head = q.reshape([*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim])
-            q_by_head = self.q_norm(q_by_head)[0]
-            q = q_by_head.reshape(q.shape)
-
-            k_by_head = k.reshape([*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim])
-            k_by_head = self.k_norm(k_by_head)[0]
-            k = k_by_head.reshape(k.shape)
-
-            qkv_out = paddle.concat([q, k, v], axis=-1)
-        return qkv_out
-
-
 class LayerNorm(nn.Layer):
     """
     Initializes the LayerNormalization layer
diff --git a/fastdeploy/model_executor/ops/triton_ops/__init__.py b/fastdeploy/model_executor/ops/triton_ops/__init__.py
index 47b069a8b2c..3481c30caa6 100644
--- a/fastdeploy/model_executor/ops/triton_ops/__init__.py
+++ b/fastdeploy/model_executor/ops/triton_ops/__init__.py
@@ -15,12 +15,10 @@
 """
 
 try:
-    from .qk_rmsnorm_fused_kernel import qk_rmsnorm_fused
     from .repetition_early_stop_kernel import repetition_early_stopper_kernel
+    from .wint2_fused_moe import fused_moe_wint2_triton
     from .wint2_fused_moe_kernel import moe_wint2_ffn_kernel
 
-    _TRITON_AVAILABLE = True
-
-    __all__ = ["moe_wint2_ffn_kernel", "repetition_early_stopper_kernel", "qk_rmsnorm_fused"]
+    __all__ = ["fused_moe_wint2_triton", "moe_wint2_ffn_kernel", "repetition_early_stopper_kernel"]
 except:
-    _TRITON_AVAILABLE = False
+    pass
diff --git a/fastdeploy/model_executor/ops/triton_ops/qk_rmsnorm_fused_kernel.py b/fastdeploy/model_executor/ops/triton_ops/qk_rmsnorm_fused_kernel.py
deleted file mode 100644
index 255e87b9c8f..00000000000
--- a/fastdeploy/model_executor/ops/triton_ops/qk_rmsnorm_fused_kernel.py
+++ /dev/null
@@ -1,130 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" - -import triton -import triton.language as tl - -from fastdeploy.model_executor.ops.triton_ops.triton_utils import ( - enable_compat_on_triton_kernel, -) -from fastdeploy.utils import ceil_div - - -@enable_compat_on_triton_kernel -@triton.jit -def qk_rmsnorm_fused_kernel( - x_ptr, - q_weight_ptr, - k_weight_ptr, - M, - q_size, - kv_size, - eps, - num_q_heads: tl.constexpr, - num_kv_heads: tl.constexpr, - head_dim: tl.constexpr, - BLOCK_HEADS: tl.constexpr, -): - pid = tl.program_id(0) - - heads_per_token = tl.cdiv(num_q_heads, BLOCK_HEADS) - token_id = pid // heads_per_token - head_block = pid % heads_per_token - - if token_id >= M: - return - - offs_h = tl.arange(0, BLOCK_HEADS) - offs_d = tl.arange(0, head_dim) - - head_ids = head_block * BLOCK_HEADS + offs_h - - q_mask = head_ids < num_q_heads - kv_mask = head_ids < num_kv_heads - - row_base = token_id * (q_size + 2 * kv_size) - - # ------------------- - # Q RMSNorm - # ------------------- - q_ptrs = x_ptr + row_base + head_ids[:, None] * head_dim + offs_d[None, :] - - q = tl.load(q_ptrs, mask=q_mask[:, None], other=0.0).to(tl.float32) - q_var = tl.sum(q * q, axis=1) / head_dim - q_hat = q * tl.rsqrt(q_var[:, None] + eps) - - q_w = tl.load(q_weight_ptr + offs_d).to(tl.float32) - q_out = q_hat * q_w[None, :] - - tl.store( - q_ptrs, - q_out, - mask=q_mask[:, None], - ) - - # ------------------- - # K RMSNorm - # ------------------- - k_ptrs = x_ptr + row_base + q_size + head_ids[:, None] * head_dim + offs_d[None, :] - - k = tl.load(k_ptrs, mask=kv_mask[:, None], other=0.0).to(tl.float32) - k_var = tl.sum(k * k, axis=1) / head_dim - k_hat = k * tl.rsqrt(k_var[:, None] + eps) - - k_w = tl.load(k_weight_ptr + offs_d).to(tl.float32) - k_out = k_hat * k_w[None, :] - - tl.store( - k_ptrs, - k_out, - mask=kv_mask[:, None], - ) - - -def qk_rmsnorm_fused( - qkv_out, - q_norm_weight, - k_norm_weight, - eps, - q_size, - kv_size, - head_dim, -): - assert qkv_out.ndim == 2 - M, _ = qkv_out.shape - - num_q_heads = q_size // head_dim - num_kv_heads = kv_size // head_dim - - BLOCK_HEADS = 4 if num_q_heads <= 32 else 8 - - grid = (M * ceil_div(num_q_heads, BLOCK_HEADS),) - - qk_rmsnorm_fused_kernel[grid]( - x_ptr=qkv_out, - q_weight_ptr=q_norm_weight, - k_weight_ptr=k_norm_weight, - M=M, - q_size=q_size, - kv_size=kv_size, - eps=eps, - num_q_heads=num_q_heads, - num_kv_heads=num_kv_heads, - head_dim=head_dim, - BLOCK_HEADS=BLOCK_HEADS, - num_warps=2, - ) - return qkv_out diff --git a/fastdeploy/model_executor/ops/triton_ops/triton_utils.py b/fastdeploy/model_executor/ops/triton_ops/triton_utils.py index f92e8d9c94f..a61268044bd 100644 --- a/fastdeploy/model_executor/ops/triton_ops/triton_utils.py +++ b/fastdeploy/model_executor/ops/triton_ops/triton_utils.py @@ -30,17 +30,6 @@ python_path = sys.executable -def enable_compat_on_triton_kernel(triton_kernel): - class WrappedTritonKernel: - def __init__(self, kernel): - self.kernel = kernel - - def __getitem__(self, index): - return paddle.use_compat_guard(enable=True, silent=True)(self.kernel[index]) - - return WrappedTritonKernel(triton_kernel) - - def SubstituteTemplate(template, values): """ Substitute all variables in the given template string using the provided values dictionary. diff --git a/tests/layers/test_normalization.py b/tests/layers/test_normalization.py deleted file mode 100644 index 9bd84c08b5e..00000000000 --- a/tests/layers/test_normalization.py +++ /dev/null @@ -1,326 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import patch
-
-import paddle
-
-from fastdeploy.model_executor.forward_meta import ForwardMeta
-from fastdeploy.model_executor.layers.normalization import QKRMSNorm
-
-
-class DummyQuantConfig:
-    quant_round_type = 1
-    quant_max_bound = 127
-    quant_min_bound = -128
-
-
-class DummyModelConfig:
-    rms_norm_eps = 1e-5
-
-
-class DummyParallelConfig:
-    expert_parallel_size = 1
-    tensor_parallel_size = 1
-    tensor_parallel_rank = 0
-    tp_group = None
-
-
-class DummyFDConfig:
-    def __init__(self):
-        self.quant_config = DummyQuantConfig()
-        self.model_config = DummyModelConfig()
-        self.parallel_config = DummyParallelConfig()
-
-
-class TestQKRMSNorm(unittest.TestCase):
-
-    def setUp(self):
-        """Set up test fixtures before each test method."""
-        self.fd_config = DummyFDConfig()
-        self.head_dim = 64
-        self.q_size = 512  # 8 heads * 64 head_dim
-        self.kv_size = 128  # 2 heads * 64 head_dim
-        self.eps = 1e-5
-
-    def create_qkrmsnorm_layer(self, dtype="float16"):
-        """Helper method to create QKRMSNorm layer with given dtype."""
-        return QKRMSNorm(
-            fd_config=self.fd_config,
-            head_dim=self.head_dim,
-            q_size=self.q_size,
-            kv_size=self.kv_size,
-            eps=self.eps,
-            prefix="test_qk_norm",
-            dtype=dtype,
-        )
-
-    def create_test_forward_meta(self, step_use_cudagraph=False):
-        """Helper method to create ForwardMeta with given cudagraph setting."""
-        forward_meta = ForwardMeta(
-            ids_remove_padding=paddle.to_tensor([1, 2, 3]), step_use_cudagraph=step_use_cudagraph
-        )
-        return forward_meta
-
-    def create_test_qkv_tensor(self, batch_size=2, seq_len=10, dtype="float16"):
-        """Helper method to create test qkv tensor."""
-        total_size = self.q_size + self.kv_size + self.kv_size
-        qkv_out = paddle.randn([batch_size, seq_len, total_size], dtype=dtype)
-        return qkv_out
-
-    def test_initialization(self):
-        """Test that QKRMSNorm initializes correctly with different parameters."""
-        # Test with float16 dtype
-        layer = self.create_qkrmsnorm_layer(dtype="float16")
-        self.assertEqual(layer.head_dim, self.head_dim)
-        self.assertEqual(layer.q_size, self.q_size)
-        self.assertEqual(layer.kv_size, self.kv_size)
-        self.assertEqual(layer.eps, self.eps)
-        self.assertIsNotNone(layer.q_norm)
-        self.assertIsNotNone(layer.k_norm)
-
-        # Test with float32 dtype
-        layer_fp32 = self.create_qkrmsnorm_layer(dtype="float32")
-        self.assertEqual(layer_fp32.head_dim, self.head_dim)
-
-        # Test with bfloat16 dtype
-        layer_bf16 = self.create_qkrmsnorm_layer(dtype="bfloat16")
-        self.assertEqual(layer_bf16.head_dim, self.head_dim)
-
-    def test_invalid_dtype_initialization(self):
-        """Test that QKRMSNorm raises error with invalid dtype."""
-        with self.assertRaises(AssertionError) as context:
-            QKRMSNorm(
-                fd_config=self.fd_config,
-                head_dim=self.head_dim,
-                q_size=self.q_size,
-                kv_size=self.kv_size,
-                eps=self.eps,
-                prefix="test",
-                dtype="int8",  # Invalid dtype
-            )
-        self.assertIn("Unsupported dtype: int8", str(context.exception))
-
-    def test_forward_non_fused_path(self):
-        """Test forward computation using non-fused path (split and reassemble)."""
-        layer = self.create_qkrmsnorm_layer()
-        qkv_out = self.create_test_qkv_tensor()
-        forward_meta = self.create_test_forward_meta(step_use_cudagraph=False)
-
-        # Mock the triton availability to ensure non-fused path
-        with patch.object(layer, "qk_norm_fused", False):
-            output = layer.forward(qkv_out, forward_meta)
-
-        # Verify output shape is same as input
-        self.assertEqual(output.shape, qkv_out.shape)
-        self.assertEqual(output.dtype, qkv_out.dtype)
-
-        # Verify output is different from input (normalization occurred)
-        self.assertFalse(paddle.allclose(output, qkv_out))
-
-    def test_forward_fused_path_cuda_cudagraph(self):
-        """Test forward computation using fused path when CUDA and cudagraph are available."""
-        layer = self.create_qkrmsnorm_layer()
-        qkv_out = self.create_test_qkv_tensor()
-        forward_meta = self.create_test_forward_meta(step_use_cudagraph=True)
-
-        # Mock to simulate CUDA environment with triton available
-        with patch.object(layer, "qk_norm_fused", True):
-            # Mock the qk_rmsnorm_fused function
-            with patch("fastdeploy.model_executor.layers.normalization.qk_rmsnorm_fused") as mock_fused:
-                mock_fused.return_value = qkv_out  # Return the same tensor for simplicity
-
-                output = layer.forward(qkv_out, forward_meta)
-
-                # Verify fused function was called with correct parameters
-                mock_fused.assert_called_once()
-                call_args = mock_fused.call_args[0]
-                self.assertEqual(call_args[0].shape, qkv_out.shape)  # qkv_out
-                self.assertEqual(call_args[3], layer.eps)  # eps
-                self.assertEqual(call_args[4], layer.q_size)  # q_size
-                self.assertEqual(call_args[5], layer.kv_size)  # kv_size
-                self.assertEqual(call_args[6], layer.head_dim)  # head_dim
-
-                self.assertEqual(output.shape, qkv_out.shape)
-
-    def test_forward_fused_path_cuda_no_cudagraph(self):
-        """Test that fused path is not used when cudagraph is disabled."""
-        layer = self.create_qkrmsnorm_layer()
-        qkv_out = self.create_test_qkv_tensor()
-        forward_meta = self.create_test_forward_meta(step_use_cudagraph=False)
-
-        # Even if triton is available, should use non-fused path when cudagraph is False
-        with patch.object(layer, "qk_norm_fused", True):
-            output = layer.forward(qkv_out, forward_meta)
-
-        # Should still work correctly using non-fused path
-        self.assertEqual(output.shape, qkv_out.shape)
-        self.assertEqual(output.dtype, qkv_out.dtype)
-
-    def test_forward_different_batch_sizes(self):
-        """Test forward computation with different batch sizes."""
-        layer = self.create_qkrmsnorm_layer()
-
-        # Test with batch_size = 1
-        qkv_out_1 = self.create_test_qkv_tensor(batch_size=1, seq_len=5)
-        forward_meta = self.create_test_forward_meta(step_use_cudagraph=False)
-        output_1 = layer.forward(qkv_out_1, forward_meta)
-        self.assertEqual(output_1.shape, qkv_out_1.shape)
-
-        # Test with batch_size = 8
-        qkv_out_8 = self.create_test_qkv_tensor(batch_size=8, seq_len=5)
-        output_8 = layer.forward(qkv_out_8, forward_meta)
-        self.assertEqual(output_8.shape, qkv_out_8.shape)
-
-        # Test with batch_size = 16, seq_len = 20
-        qkv_out_16 = self.create_test_qkv_tensor(batch_size=16, seq_len=20)
-        output_16 = layer.forward(qkv_out_16, forward_meta)
-        self.assertEqual(output_16.shape, qkv_out_16.shape)
-
-    def test_forward_different_sequence_lengths(self):
-        """Test forward computation with different sequence lengths."""
-        layer = self.create_qkrmsnorm_layer()
-        forward_meta = self.create_test_forward_meta(step_use_cudagraph=False)
-
-        # Test with short sequence
-        qkv_out_short = self.create_test_qkv_tensor(batch_size=2, seq_len=1)
-        output_short = layer.forward(qkv_out_short, forward_meta)
-        self.assertEqual(output_short.shape, qkv_out_short.shape)
-
-        # Test with long sequence
-        qkv_out_long = self.create_test_qkv_tensor(batch_size=2, seq_len=100)
-        output_long = layer.forward(qkv_out_long, forward_meta)
-        self.assertEqual(output_long.shape, qkv_out_long.shape)
-
-    def test_forward_different_dtypes(self):
-        """Test forward computation with different input dtypes."""
-        forward_meta = self.create_test_forward_meta(step_use_cudagraph=False)
-
-        # Test with float16
-        layer_fp16 = self.create_qkrmsnorm_layer(dtype="float16")
-        qkv_out_fp16 = self.create_test_qkv_tensor(dtype="float16")
-        output_fp16 = layer_fp16.forward(qkv_out_fp16, forward_meta)
-        self.assertEqual(output_fp16.dtype, paddle.float16)
-
-        # Test with float32
-        layer_fp32 = self.create_qkrmsnorm_layer(dtype="float32")
-        qkv_out_fp32 = self.create_test_qkv_tensor(dtype="float32")
-        output_fp32 = layer_fp32.forward(qkv_out_fp32, forward_meta)
-        self.assertEqual(output_fp32.dtype, paddle.float32)
-
-        # Test with bfloat16
-        layer_bf16 = self.create_qkrmsnorm_layer(dtype="bfloat16")
-        qkv_out_bf16 = self.create_test_qkv_tensor(dtype="bfloat16")
-        output_bf16 = layer_bf16.forward(qkv_out_bf16, forward_meta)
-        self.assertEqual(output_bf16.dtype, paddle.bfloat16)
-
-    def test_forward_edge_cases(self):
-        """Test forward computation with edge cases."""
-        layer = self.create_qkrmsnorm_layer()
-        forward_meta = self.create_test_forward_meta(step_use_cudagraph=False)
-
-        # Test with very small values
-        qkv_out_small = paddle.full([2, 5, self.q_size + 2 * self.kv_size], 1e-6, dtype="float16")
-        output_small = layer.forward(qkv_out_small, forward_meta)
-        self.assertEqual(output_small.shape, qkv_out_small.shape)
-
-        # Test with very large values
-        qkv_out_large = paddle.full([2, 5, self.q_size + 2 * self.kv_size], 1e6, dtype="float16")
-        output_large = layer.forward(qkv_out_large, forward_meta)
-        self.assertEqual(output_large.shape, qkv_out_large.shape)
-
-        # Test with mixed positive and negative values
-        qkv_out_mixed = paddle.randn([2, 5, self.q_size + 2 * self.kv_size], dtype="float16")
-        # Ensure some negative values
-        qkv_out_mixed = qkv_out_mixed - 0.5
-        output_mixed = layer.forward(qkv_out_mixed, forward_meta)
-        self.assertEqual(output_mixed.shape, qkv_out_mixed.shape)
-
-    def test_q_k_v_split_correctness(self):
-        """Test that Q, K, V splitting in non-fused path is correct."""
-        layer = self.create_qkrmsnorm_layer()
-        qkv_out = self.create_test_qkv_tensor()
-        forward_meta = self.create_test_forward_meta(step_use_cudagraph=False)
-
-        with patch.object(layer, "qk_norm_fused", False):
-            output = layer.forward(qkv_out, forward_meta)
-
-        # Manually split and verify the dimensions
-        q, k, v = qkv_out.split([layer.q_size, layer.kv_size, layer.kv_size], axis=-1)
-
-        self.assertEqual(q.shape[-1], layer.q_size)
-        self.assertEqual(k.shape[-1], layer.kv_size)
-        self.assertEqual(v.shape[-1], layer.kv_size)
-
-        # Verify that q and k have been normalized by checking they're different from original
-        q_original = qkv_out.split([layer.q_size, layer.kv_size, layer.kv_size], axis=-1)[0]
-        self.assertFalse(
-            paddle.allclose(q_original, output.split([layer.q_size, layer.kv_size, layer.kv_size], axis=-1)[0])
-        )
-
-    def test_load_state_dict(self):
-        """Test loading state dictionary."""
-        layer = self.create_qkrmsnorm_layer()
-
-        # Create a mock state dict
-        state_dict = {
"test_qk_norm.q_norm.weight": paddle.ones([layer.head_dim], dtype="float16"), - "test_qk_norm.k_norm.weight": paddle.ones([layer.head_dim], dtype="float16"), - } - - # This should not raise any errors - layer.load_state_dict(state_dict) - - def test_forward_with_none_forward_meta(self): - """Test forward computation when forward_meta is None.""" - layer = self.create_qkrmsnorm_layer() - qkv_out = self.create_test_qkv_tensor() - - # Should work without forward_meta (use non-fused path) - with patch.object(layer, "qk_norm_fused", True): # Even if triton available - output = layer.forward(qkv_out, None) - - self.assertEqual(output.shape, qkv_out.shape) - - def test_forward_consistency_between_paths(self): - """Test that both fused and non-fused paths produce consistent results (when applicable).""" - # Note: This test verifies that both paths work without crashing - # In practice, the results may differ due to different implementations - layer = self.create_qkrmsnorm_layer() - qkv_out = self.create_test_qkv_tensor() - - # Test non-fused path - forward_meta_no_cuda = self.create_test_forward_meta(step_use_cudagraph=False) - with patch.object(layer, "qk_norm_fused", False): - output_non_fused = layer.forward(qkv_out, forward_meta_no_cuda) - - # Test fused path (mocked) - forward_meta_cuda = self.create_test_forward_meta(step_use_cudagraph=True) - with patch.object(layer, "qk_norm_fused", True): - with patch("fastdeploy.model_executor.layers.normalization.qk_rmsnorm_fused") as mock_fused: - # Make the mock return a tensor with the same shape but different values - mock_output = qkv_out + 0.1 # Slightly different to simulate actual computation - mock_fused.return_value = mock_output - output_fused = layer.forward(qkv_out, forward_meta_cuda) - - # Both should produce valid outputs with correct shapes - self.assertEqual(output_non_fused.shape, qkv_out.shape) - self.assertEqual(output_fused.shape, qkv_out.shape) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/operators/test_qk_rmsnorm_fused.py b/tests/operators/test_qk_rmsnorm_fused.py deleted file mode 100644 index 6318778955e..00000000000 --- a/tests/operators/test_qk_rmsnorm_fused.py +++ /dev/null @@ -1,133 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" - -import unittest - -import numpy as np -import paddle - -from fastdeploy.model_executor.ops.triton_ops import qk_rmsnorm_fused -from tests.utils import OpPerformanceTester - -paddle.set_default_dtype("bfloat16") -paddle.seed(99) - - -class TestQKNorm(unittest.TestCase): - def setUp(self) -> None: - # Qwen3-30B-A3B TP1 - self.hidden_size = 2048 - self.num_attention_heads = 32 - self.num_key_value_heads = 4 - self.num_hidden_layers = 48 - self.head_dim = 128 - self.rms_norm_eps = 1e-6 - self.tp_size = 1 - - # # Qwen3-235B-A22B TP4 - # self.hidden_size = 4096 - # self.num_attention_heads = 64 - # self.num_key_value_heads = 4 - # self.num_hidden_layers = 94 - # self.head_dim = 128 - # self.rms_norm_eps = 1e-6 - # self.tp_size = 4 - - # # GLM_4.6 TP4 - # self.hidden_size = 5120 - # self.num_attention_heads = 96 - # self.num_key_value_heads = 8 - # self.num_hidden_layers = 92 - # self.head_dim = 128 - # self.rms_norm_eps = 1e-5 - # self.tp_size = 4 - - self.num_kv_heads_replicas = max(1, self.tp_size // self.num_key_value_heads) - self.q_size = self.num_attention_heads * self.head_dim // self.tp_size - self.kv_size = self.num_key_value_heads * self.head_dim * self.num_kv_heads_replicas // self.tp_size - self.q_norm_weight = paddle.randn([self.head_dim], paddle.bfloat16) - self.k_norm_weight = paddle.randn([self.head_dim], paddle.bfloat16) - - def qk_norm_paddle(self, qkv_out): - q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size], axis=-1) - q_by_head = q.reshape([*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim]) - q_by_head = paddle.incubate.nn.functional.fused_rms_norm( - q_by_head, self.q_norm_weight, None, self.rms_norm_eps, begin_norm_axis=2 - )[0] - q = q_by_head.reshape(q.shape) - - k_by_head = k.reshape([*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim]) - k_by_head = paddle.incubate.nn.functional.fused_rms_norm( - k_by_head, self.k_norm_weight, None, self.rms_norm_eps, begin_norm_axis=2 - )[0] - k = k_by_head.reshape(k.shape) - - qkv_out = paddle.concat([q, k, v], axis=-1) - return qkv_out - - def qk_norm_triton_fused(self, qkv_out): - qkv_out = qk_rmsnorm_fused( - qkv_out, - self.q_norm_weight, - self.k_norm_weight, - self.rms_norm_eps, - self.q_size, - self.kv_size, - self.head_dim, - ) - return qkv_out - - def test_qk_norm_paddle_performance(self): - tester_paddle = OpPerformanceTester( - op_name="qk_norm_paddle", - op_fn=self.qk_norm_paddle, - num_layers=self.num_hidden_layers, - ) - - tester_paddle.benchmark( - input_size=self.head_dim - * (self.num_attention_heads // self.tp_size + 2 * self.num_key_value_heads // self.tp_size), - batch_sizes=[1, 8, 64, 128, 1024, 2048, 4096, 8192], - ) - - def test_qk_norm_fused_performance(self): - tester = OpPerformanceTester( - op_name="qk_norm_triton_fused", - op_fn=self.qk_norm_triton_fused, - num_layers=self.num_hidden_layers, - ) - tester.benchmark( - input_size=self.head_dim - * (self.num_attention_heads // self.tp_size + 2 * self.num_key_value_heads // self.tp_size), - batch_sizes=[1, 8, 64, 128, 1024, 2048, 4096, 8192], - ) - - def test_qk_norm_result(self): - x = paddle.randn( - [ - 128, - self.head_dim - * (self.num_attention_heads // self.tp_size + 2 * self.num_key_value_heads // self.tp_size), - ], - paddle.bfloat16, - ) - out_paddle = self.qk_norm_paddle(x) - out_triton_fused = self.qk_norm_triton_fused(x) - np.testing.assert_allclose(out_triton_fused.numpy(), out_paddle.numpy(), rtol=1e-4, atol=1e-4) - - -if __name__ == "__main__": - unittest.main()