tests/cpp/operator/test_grouped_gemm.cu — 6 changes: 3 additions & 3 deletions

@@ -102,8 +102,8 @@ std::vector<std::tuple<size_t, size_t, size_t>> make_shapes(ShapeCase scase) {
 }
 
 void run_grouped_gemm_case(const TestParams& params) {
-#if CUBLAS_VERSION < 130100
-  GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.1+, but compile-time cuBLAS version is "
+#if CUBLAS_VERSION < 130200
+  GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.2+, but compile-time cuBLAS version is "
               << CUBLAS_VERSION << ".";
 #else
   if (getDeviceComputeCapability() < blackwellComputeCapability) {

@@ -267,7 +267,7 @@ void run_grouped_gemm_case(const TestParams& params) {
                 atol,
                 rtol);
   }
-#endif  // CUBLAS_VERSION >= 130100
+#endif  // CUBLAS_VERSION >= 130200
 }
 
 class GroupedGemmTest : public ::testing::TestWithParam<TestParams> {};
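The gate above pairs a compile-time check on CUBLAS_VERSION (from the cuBLAS headers the tests were built against) with the run-time checks on transformer_engine::cuda::cublas_version() made in the library changes below, so a mismatch at either stage skips or rejects the grouped GEMM path. As an aside, a minimal sketch of how such a packed version integer decodes, assuming the usual major*10000 + minor*100 + patch encoding that the thresholds in this PR (130200 = 13.2.0, 120205 = 12.2.5) follow:

```cpp
// Sketch only (not part of the PR): decode a packed cuBLAS version integer.
// Assumes the CUBLAS_VER_MAJOR*10000 + CUBLAS_VER_MINOR*100 + CUBLAS_VER_PATCH packing.
#include <cublas_v2.h>  // defines CUBLAS_VERSION at compile time
#include <cstdio>

int main() {
  const int v = CUBLAS_VERSION;  // e.g. 130200 for cuBLAS 13.2.0
  std::printf("compile-time cuBLAS %d.%d.%d\n", v / 10000, (v / 100) % 100, v % 100);
  return 0;
}
```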
transformer_engine/common/gemm/cublaslt_gemm.cu — 33 changes: 18 additions & 15 deletions

@@ -494,9 +494,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
 #endif  // CUBLAS_VERSION >= 120800
   } else if (mxfp8_gemm) {
 #if CUBLAS_VERSION >= 120800
-    NVTE_CHECK(cuda::cublas_version() >= 120800,
+    NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120800,
                "MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ",
-               cuda::cublas_version());
+               transformer_engine::cuda::cublas_version());
 
     // Check that scales are in expected format
     NVTE_CHECK(inputA->with_gemm_swizzled_scales,

@@ -518,7 +518,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,

     // Workaround for heuristic cache bug in cublasLt. This separates the MXFP8 cache key from non-block scaling.
     // CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is unused for block scaling so it's safe to set.
-    if (cuda::cublas_version() <= 120803) {
+    if (transformer_engine::cuda::cublas_version() <= 120803) {
       const int64_t dummy_a_vec_stride = 1;
       NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
           operationDesc, CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE, &dummy_a_vec_stride,

@@ -530,9 +530,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
 #endif  // CUBLAS_VERSION >= 120800
   } else if (use_fp4) {  // NVFP4 GEMM
 #if CUBLAS_VERSION >= 120800
-    NVTE_CHECK(cuda::cublas_version() >= 120800,
+    NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120800,
                "FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ",
-               cuda::cublas_version());
+               transformer_engine::cuda::cublas_version());
 
     // Check that scales are in expected format
     NVTE_CHECK(inputA->with_gemm_swizzled_scales,

@@ -567,9 +567,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
              (inputB->scaling_mode == NVTE_BLOCK_SCALING_1D ||
               inputB->scaling_mode == NVTE_BLOCK_SCALING_2D)) {
 #if CUBLAS_VERSION >= 120900
-    NVTE_CHECK(cuda::cublas_version() >= 120900,
+    NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120900,
                "FP8 block scaling requires cuBLAS 12.9+, but run-time cuBLAS version is ",
-               cuda::cublas_version());
+               transformer_engine::cuda::cublas_version());
 
     // Check that matrix formats are valid
     NVTE_CHECK((!(inputA->scaling_mode == NVTE_BLOCK_SCALING_2D &&

@@ -602,7 +602,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
   }
 
 #if CUBLAS_VERSION >= 120800
-  if (cuda::cublas_version() >= 120800) {
+  if (transformer_engine::cuda::cublas_version() >= 120800) {
     NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc,
                                                      CUBLASLT_MATMUL_DESC_A_SCALE_MODE,
                                                      &scaling_mode_a, sizeof(scaling_mode_a)));

@@ -619,7 +619,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
     NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
         operationDesc, CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, &D_amax, sizeof(D_amax)));
 #if CUBLAS_VERSION >= 120800
-    if (cuda::cublas_version() >= 120800) {
+    if (transformer_engine::cuda::cublas_version() >= 120800) {
       // NOTE: In all current cases where FP8 output is supported, the input is
       // scaled identically to the output.
       NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc,

@@ -703,12 +703,14 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
"Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but compile-time cuBLAS version is ",
CUBLAS_VERSION);
#else
NVTE_CHECK(cuda::cudart_version() >= 12020 && cuda::cudart_version() < 13000,
NVTE_CHECK(transformer_engine::cuda::cudart_version() >= 12020 &&
transformer_engine::cuda::cudart_version() < 13000,
"Atomic GEMM requires CUDA >=12.2.0 and <13.0.0, but run-time CUDA version is ",
cuda::cudart_version());
NVTE_CHECK(cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000,
transformer_engine::cuda::cudart_version());
NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120205 &&
transformer_engine::cuda::cublas_version() < 130000,
"Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but run-time cuBLAS version is ",
cuda::cublas_version());
transformer_engine::cuda::cublas_version());
if (m_split == 0) m_split = 1;
if (n_split == 0) n_split = 1;
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
Expand Down Expand Up @@ -934,9 +936,10 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor
"Atomic GEMM requires CUDA version >=12.2.0 and <13.0.0, but run-time CUDA version is ",
transformer_engine::cuda::cudart_version());
NVTE_CHECK(
cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000,
transformer_engine::cuda::cublas_version() >= 120205 &&
transformer_engine::cuda::cublas_version() < 130000,
"Atomic GEMM requires cuBLAS version >=12.2.5 and <13.0.0, but run-time cuBLAS version is ",
cuda::cublas_version());
transformer_engine::cuda::cublas_version());

const Tensor *inputA = convertNVTETensorCheck(A);
const Tensor *inputB = convertNVTETensorCheck(B);
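Most of the edits in this file replace unqualified cuda:: calls with the fully qualified transformer_engine::cuda::, presumably so name lookup cannot land on a different cuda namespace that happens to be visible at the call site. A minimal sketch of that kind of ambiguity, using hypothetical namespace names that are not TE's real layout:

```cpp
// Illustrative only: the nested namespaces and the stub function are hypothetical.
#include <cstdio>

namespace transformer_engine {
namespace cuda {
int cublas_version() { return 130200; }  // stands in for the real run-time query
}  // namespace cuda

namespace some_feature {
namespace cuda {}  // an unrelated nested "cuda" namespace

void check() {
  // Unqualified cuda::cublas_version() would resolve "cuda" to some_feature::cuda
  // here and fail to compile; the fully qualified name is unambiguous.
  std::printf("%d\n", transformer_engine::cuda::cublas_version());
}
}  // namespace some_feature
}  // namespace transformer_engine

int main() {
  transformer_engine::some_feature::check();
  return 0;
}
```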
transformer_engine/common/gemm/cublaslt_grouped_gemm.cu — 20 changes: 10 additions & 10 deletions

@@ -26,7 +26,7 @@ inline void CreateCublasHandle(cublasLtHandle_t *handle) {

 }  // namespace
 
-#if CUBLAS_VERSION >= 130100
+#if CUBLAS_VERSION >= 130200
 
 namespace {
 
@@ -543,13 +543,13 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT
   NVTE_API_CALL(nvte_grouped_gemm);
   using namespace transformer_engine;
 
-  // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.1+
-  const int current_device = cuda::current_device();
-  NVTE_CHECK(cuda::sm_arch(current_device) >= 100,
+  // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.2+
+  const int current_device = transformer_engine::cuda::current_device();
+  NVTE_CHECK(transformer_engine::cuda::sm_arch(current_device) >= 100,
              "nvte_grouped_gemm requires Blackwell (SM100) or newer architecture.");
-  NVTE_CHECK(cuda::cublas_version() >= 130100,
-             "nvte_grouped_gemm requires cuBLAS 13.1+, but run-time cuBLAS version is ",
-             cuda::cublas_version());
+  NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 130200,
+             "nvte_grouped_gemm requires cuBLAS 13.2+, but run-time cuBLAS version is ",
+             transformer_engine::cuda::cublas_version());
 
   // Convert to internal types
   const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A);

Review comment (Collaborator), on the added NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 130200, ...) line: shouldn't there be more than one place to add this transformer_engine::?

@@ -631,15 +631,15 @@
                                       kGroupedGemmCublasWorkspaceSize, stream));
 }
 
-#else  // CUBLAS_VERSION < 130100
+#else  // CUBLAS_VERSION < 130200
 
 void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedTensor B, int transb,
                        const NVTEGroupedTensor C, NVTEGroupedTensor D, const NVTETensor alpha,
                        const NVTETensor beta, NVTETensor workspace_setup,
                        NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config,
                        cudaStream_t stream) {
-  NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.1+, but compile-time cuBLAS version is ",
+  NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.2+, but compile-time cuBLAS version is ",
              CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer.");

Review comment (Contributor), on the line above: the error message says "upgrade to CUDA 13.1 or newer" but should say "CUDA 13.2 or newer" to match the cuBLAS 13.2+ requirement.

Suggested change:
-             CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer.");
+             CUBLAS_VERSION, ". Please upgrade to CUDA 13.2 or newer.");

 }
 
-#endif  // CUBLAS_VERSION >= 130100
+#endif  // CUBLAS_VERSION >= 130200
transformer_engine/common/include/transformer_engine/gemm.h — 4 changes: 2 additions & 2 deletions

@@ -299,7 +299,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
 /* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */
 /*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C
  *
- *  \note Requires cuBLAS 13.1+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture.
+ *  \note Requires cuBLAS 13.2+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture.
  *        Will error at runtime if compiled with an older cuBLAS version or run on
  *        a pre-Blackwell GPU.
  *

@@ -322,7 +322,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
  *  \param[in]  stream  CUDA stream for the operation.
  *
  *  Requirements:
- *   - cuBLAS 13.1+ (CUDA 13.1+)
+ *   - cuBLAS 13.2+ (CUDA 13.1+)
  *   - Blackwell (SM100) or newer GPU architecture
  *   - A, B, C (if provided), D must have the same num_tensors
  *   - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i]
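To make the documented contract concrete, here is a minimal call sketch against the nvte_grouped_gemm signature shown in the cublaslt_grouped_gemm.cu fallback above. It assumes the grouped tensors, alpha/beta tensors, workspaces, and config handle were already created through the TE C API; the wrapper itself is illustrative and not part of this PR:

```cpp
// Illustrative sketch only: callers are assumed to have built all handles already.
#include <cuda_runtime.h>
#include <transformer_engine/gemm.h>

// For each group i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i]
void run_grouped_matmul(NVTEGroupedTensor A, int transa, NVTEGroupedTensor B, int transb,
                        NVTEGroupedTensor C, NVTEGroupedTensor D,
                        NVTETensor alpha, NVTETensor beta,
                        NVTETensor workspace_setup, NVTETensor workspace_cublas,
                        NVTEGroupedMatmulConfig config, cudaStream_t stream) {
  // Requires cuBLAS 13.2+ and a Blackwell (SM100) or newer GPU; errors out otherwise.
  nvte_grouped_gemm(A, transa, B, transb, C, D, alpha, beta,
                    workspace_setup, workspace_cublas, config, stream);
}
```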