
Commit eb7c1e3: Add NVFP4 DS (#2356)

Signed-off-by: yiliu30 <[email protected]>
Parent: e6503b0

File tree: 7 files changed (+52, −12 lines)


examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md

Lines changed: 17 additions & 3 deletions

````diff
@@ -1,4 +1,4 @@
-This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8 and evaluate them using a custom vLLM fork.
+This example provides an end-to-end workflow to quantize DeepSeek models to MXFP4/MXFP8/NVFP4 and evaluate them using a custom vLLM fork.
 
 ## Requirement
 ```bash
@@ -29,13 +29,18 @@ bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels
 bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels
 ```
 
+- NVFP4
+```bash
+bash run_quant.sh --model $MODEL -t nvfp4 --output_dir ./qmodels
+```
+
 ## Evaluation
 
 ### Prompt Tests
 
 Usage:
 ```bash
-bash ./run_generate.sh -s [mxfp4|mxfp8] -tp [tensor_parallel_size] -m [model_path]
+bash ./run_generate.sh -s [mxfp4|mxfp8|nvfp4] -tp [tensor_parallel_size] -m [model_path]
 ```
 
 - MXFP8
@@ -46,12 +51,16 @@ bash ./run_generate.sh -s mxfp8 -tp 8 -m /path/to/ds_mxfp8
 ```bash
 bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
 ```
+- NVFP4
+```bash
+bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4
+```
 ### Evaluation
 
 
 Usage:
 ```bash
-bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
+bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
 ```
 ```bash
 bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
@@ -62,4 +71,9 @@ bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
 ```bash
 bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
 bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
+```
+- NVFP4
+```bash
+bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4
+bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
 ```
````
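Putting the new snippets together, a full NVFP4 pass would look like the sketch below. All commands are taken from the updated README; the `./qmodels/quantized_model_nvfp4` path follows quantize.py's `quantized_model_{t}` output naming, and `$MODEL` is a placeholder for the source checkpoint:

```bash
# 1. Quantize to NVFP4 (exports in llm_compressor format, see quantize.py below)
bash run_quant.sh --model $MODEL -t nvfp4 --output_dir ./qmodels

# 2. Smoke-test generation on 8 GPUs
bash ./run_generate.sh -s nvfp4 -tp 8 -m ./qmodels/quantized_model_nvfp4

# 3. Run the lm-eval tasks
bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m ./qmodels/quantized_model_nvfp4
bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m ./qmodels/quantized_model_nvfp4
```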

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py

Lines changed: 11 additions & 1 deletion

````diff
@@ -32,6 +32,15 @@
         "fp_layers": "lm_head,self_attn",
         "iters": 0,
     },
+    "nvfp4": {
+        "scheme": "NVFP4",
+        "fp_layers": "lm_head,self_attn",
+        "iters": 0,
+        "export_format": "llm_compressor",
+        "low_cpu_mem_usage": True,
+        "low_gpu_mem_usage": True,
+        "reloading": False,
+    },
 }
 
 
@@ -58,7 +67,7 @@ def quant_model(args):
     )
 
     config = topologies_config[args.t]
-    export_format = "auto_round" if args.use_autoround_format else "llm_compressor"
+    export_format = config.get("export_format", "auto_round")
     output_dir = f"{args.output_dir}/quantized_model_{args.t}"
     fp32_model, tokenizer = get_model_and_tokenizer(args.model)
     quant_config = AutoRoundConfig(
@@ -69,6 +78,7 @@ def quant_model(args):
         fp_layers=config["fp_layers"],
         export_format=export_format,
         output_dir=output_dir,
+        low_gpu_mem_usage=True,
         reloading=False,
     )
 
````
Lines changed: 2 additions & 1 deletion

````diff
@@ -1,2 +1,3 @@
 lm-eval==0.4.9.1
-loguru
\ No newline at end of file
+loguru
+compressed-tensors==0.12.2
````
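Equivalently, the pinned evaluation dependencies can be installed directly; the new `compressed-tensors` pin is presumably what deserializes the llm_compressor-format NVFP4 checkpoints:

```bash
pip install lm-eval==0.4.9.1 loguru compressed-tensors==0.12.2
```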

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh

Lines changed: 9 additions & 2 deletions

````diff
@@ -11,7 +11,7 @@ BATCH_SIZE=512
 
 # Function to display usage
 usage() {
-    echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
+    echo "Usage: $0 -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]"
     echo "  -m: Path to the quantized model (required)"
     echo "  -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)"
     echo "  -t: Task name(s) to evaluate (default: piqa,hellaswag,mmlu)"
@@ -80,6 +80,13 @@ if [[ "$SCHEME" == "mxfp4" ]]; then
     VLLM_ENABLE_STATIC_MOE=0
     VLLM_USE_DEEP_GEMM=0
     VLLM_ENABLE_AR_EXT=1
+elif [[ "$SCHEME" == "nvfp4" ]]; then
+    VLLM_AR_MXFP4_MODULAR_MOE=0
+    VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
+    VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
+    VLLM_ENABLE_STATIC_MOE=0
+    VLLM_USE_DEEP_GEMM=0
+    VLLM_ENABLE_AR_EXT=0
 elif [[ "$SCHEME" == "mxfp8" ]]; then
     VLLM_AR_MXFP4_MODULAR_MOE=0
     VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
@@ -88,7 +95,7 @@ elif [[ "$SCHEME" == "mxfp8" ]]; then
     VLLM_USE_DEEP_GEMM=0
     VLLM_ENABLE_AR_EXT=1
 else
-    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4' or 'mxfp8'."
+    echo "Error: Invalid quantization scheme (-s). Must be 'mxfp4', 'nvfp4' or 'mxfp8'."
     usage
     exit 1
 fi
````
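Note that the new nvfp4 branch leaves every MXFP4-specific toggle off, including the AutoRound extension (`VLLM_ENABLE_AR_EXT=0`), so NVFP4 checkpoints presumably go through vLLM's stock compressed-tensors path. Reproducing the same environment by hand would look like the sketch below (the script itself sets these for its evaluation command):

```bash
# Manual equivalent of `-s nvfp4` in run_evaluation.sh (values from the new branch)
export VLLM_AR_MXFP4_MODULAR_MOE=0
export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
export VLLM_ENABLE_STATIC_MOE=0
export VLLM_USE_DEEP_GEMM=0
export VLLM_ENABLE_AR_EXT=0   # unlike mxfp4/mxfp8, the AutoRound vLLM extension stays disabled
```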

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh

Lines changed: 11 additions & 5 deletions

````diff
@@ -52,8 +52,8 @@ done
 
 # Validate quantization type
 QUANT_TYPE_UPPER=$(echo "$QUANT_TYPE" | tr '[:lower:]' '[:upper:]')
-if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" ]]; then
-    echo "Error: Quantization type must be mxfp4 or mxfp8"
+if [[ "$QUANT_TYPE_UPPER" != "MXFP4" && "$QUANT_TYPE_UPPER" != "MXFP8" && "$QUANT_TYPE_UPPER" != "NVFP4" ]]; then
+    echo "Error: Quantization type must be mxfp4, mxfp8 or nvfp4"
     usage
     exit 1
 fi
@@ -83,17 +83,23 @@ echo ""
 
 # Set environment variables based on quantization type
 if [[ "$QUANT_TYPE_UPPER" == "MXFP4" ]]; then
+    export VLLM_ENABLE_AR_EXT=1
     export VLLM_AR_MXFP4_MODULAR_MOE=1
     export VLLM_MXFP4_PRE_UNPACK_TO_FP8=1
     echo "Using MXFP4 configuration"
+elif [[ "$QUANT_TYPE_UPPER" == "NVFP4" ]]; then
+    export VLLM_ENABLE_AR_EXT=0
+    export VLLM_AR_MXFP4_MODULAR_MOE=0
+    export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
+    echo "Using NVFP4 configuration"
 else
+    export VLLM_ENABLE_AR_EXT=1
     export VLLM_AR_MXFP4_MODULAR_MOE=0
     export VLLM_MXFP4_PRE_UNPACK_TO_FP8=0
     echo "Using MXFP8 configuration"
 fi
 
 # Common environment variables
-export VLLM_ENABLE_AR_EXT=1
 export VLLM_ENABLE_STATIC_MOE=0
 export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
 export VLLM_USE_DEEP_GEMM=0
@@ -113,6 +119,6 @@ python generate.py \
     --tensor_parallel_size $TP_SIZE \
     --max-tokens 16 \
     --max-num-seqs 4 \
+    --max-model-len 2048 \
     --gpu_memory_utilization 0.75 \
-    --no-enable-prefix-caching \
-    --enable_expert_parallel
+    --no-enable-prefix-caching
````
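Since `VLLM_ENABLE_AR_EXT` is no longer exported unconditionally, the schemes now differ in three variables. A condensed restatement of the branch logic, behaviorally equivalent to the script's if/elif/else (minus the echo lines):

```bash
case "$QUANT_TYPE_UPPER" in
    MXFP4) export VLLM_ENABLE_AR_EXT=1 VLLM_AR_MXFP4_MODULAR_MOE=1 VLLM_MXFP4_PRE_UNPACK_TO_FP8=1 ;;
    NVFP4) export VLLM_ENABLE_AR_EXT=0 VLLM_AR_MXFP4_MODULAR_MOE=0 VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 ;;
    *)     export VLLM_ENABLE_AR_EXT=1 VLLM_AR_MXFP4_MODULAR_MOE=0 VLLM_MXFP4_PRE_UNPACK_TO_FP8=0 ;;  # MXFP8
esac
```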

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh

Lines changed: 1 addition & 0 deletions

````diff
@@ -41,6 +41,7 @@ done
 [ -z "$TARGET" ] && echo "Error: -t is required" && usage
 [ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage
 
+AR_LOG_LEVEL=TRACE \
 python quantize.py \
     --model "$MODEL" \
     -t "$TARGET" \
````

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py

Lines changed: 1 addition & 0 deletions

````diff
@@ -40,6 +40,7 @@ def get_model_and_tokenizer(model_name):
         model_name,
         device_map="cpu",
         trust_remote_code=True,
+        dtype="auto",
     )
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
````
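`dtype="auto"` makes `from_pretrained` load the checkpoint in the dtype recorded in its config (typically bf16 for Qwen models) instead of transformers' float32 default, roughly halving host memory for the CPU-side load. The stored dtype can be checked without loading any weights; a sketch, with the model id as a placeholder:

```bash
# Fetches only config.json and prints the dtype that dtype="auto" will pick up
python -c 'from transformers import AutoConfig; print(AutoConfig.from_pretrained("Qwen/Qwen2.5-7B-Instruct").torch_dtype)'
```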
