A Coding Implementation to Compress and Benchmark Instruction-Tuned LLMs with FP8, GPTQ, and SmoothQuant Quantization using llmcompressor


By AI Maestro | May 17, 2026

Overview

In this tutorial, we explore how to apply post-training quantization to an instruction-tuned language model using llmcompressor. We start with an FP16 baseline and then compare multiple compression strategies including FP8 dynamic quantization, GPTQ W4A16, and SmoothQuant with GPTQ W8A8. Along the way, we benchmark each model variant for disk size, generation latency, throughput, perplexity, and output quality.

Key Takeaways

  • Applying several quantization methods to the same instruction-tuned model shows how each one trades output quality for efficiency and deployment readiness.
  • We prepare a reusable calibration dataset for the recipes that need one, ensuring they are grounded in realistic inputs.
  • Comparing variants at different precisions (FP16, FP8, 4-bit weights, 8-bit weights and activations) shows how each method affects model behavior during inference.


```python
import subprocess, sys
from pathlib import Path

def pip(*pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])

pip("llmcompressor", "compressed-tensors",
    "transformers>=4.45", "accelerate", "datasets")

import os, gc, time, json, math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
```

Installation and Environment Setup

We begin by installing all required libraries using pip, ensuring that they are available in the environment. We verify that a CUDA-enabled GPU is present and ready for use.


```python
print("CUDA enabled? ", torch.cuda.is_available(), "| Device:", torch.cuda.get_device_name(0), "| torch version:", torch.__version__)
```

Model and Helper Functions

We define the base Qwen2.5 instruction model, create a working directory for our experiments, and prepare helper functions to manage memory, calculate model sizes, time generation operations, and evaluate perplexity.


```python
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
workdir = Path("/content/quant_lab"); workdir.mkdir(exist_ok=True)
os.chdir(workdir)

def free_mem():
    """Release Python and CUDA memory between model loads."""
    gc.collect(); torch.cuda.empty_cache()

def dir_size_gb(path):
    """Total on-disk size of a saved checkpoint directory, in GB."""
    total = 0
    for root, _, files in os.walk(path):
        for f in files:
            total += os.path.getsize(os.path.join(root, f))
    return total / 1e9

def time_generation(model, tok, prompt, max_new_tokens=64):
    """Greedy decode; reports latency & tokens/sec after a brief warmup."""
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    _ = model.generate(**inputs, max_new_tokens=4, do_sample=False)  # warmup
    torch.cuda.synchronize()
    t0 = time.time()
    out = model.generate(**inputs, max_new_tokens=max_new_tokens,
                         do_sample=False, pad_token_id=tok.eos_token_id)
    torch.cuda.synchronize()
    dt = time.time() - t0
    new_ids = out[0][inputs["input_ids"].shape[1]:]
    return tok.decode(new_ids, skip_special_tokens=True), dt, len(new_ids) / dt

@torch.no_grad()
def wikitext_ppl(model, tok, seq_len=512, max_chunks=20, stride=512):
    """Light WikiText-2 perplexity probe (fast, indicative)."""
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(t for t in ds["text"][:400] if t.strip())
    enc = tok(text, return_tensors="pt").input_ids.to(model.device)
    nll_sum, tok_count = 0.0, 0
    for begin in range(0, enc.size(1) - seq_len, stride):
        chunk = enc[:, begin:begin+seq_len]
        out = model(chunk, labels=chunk)
        nll_sum += out.loss.float().item() * seq_len
        tok_count += seq_len
        if tok_count // seq_len >= max_chunks:
            break
    return math.exp(nll_sum / tok_count)

results = {}
PROMPT = ("<|im_start|>user\nIn two sentences, explain why post-training "
          "quantization works for large language models.<|im_end|>\n"
          "<|im_start|>assistant\n")
```

Benchmark Function with Calibration Dataset

We define a reusable benchmark function that loads any model variant, tests its generation speed, calculates perplexity, and stores the results for final comparison. We also create a calibration dataset using UltraChat samples to ensure that compression recipes are grounded in realistic inputs.


```python
def benchmark(label, model_path_or_id):
    """Load a model variant, measure speed and perplexity, and record the results."""
    free_mem()
    print(f"\n──── benchmarking: {label} ────")
    tok = AutoTokenizer.from_pretrained(model_path_or_id)
    m = AutoModelForCausalLM.from_pretrained(
        model_path_or_id, torch_dtype="auto", device_map="cuda").eval()
    sample, dt, tps = time_generation(m, tok, PROMPT)
    ppl = wikitext_ppl(m, tok)
    size = dir_size_gb(model_path_or_id) if os.path.isdir(str(model_path_or_id)) else None
    results[label] = {"size_gb": size, "ppl": round(ppl, 3),
                      "latency_s": round(dt, 3), "tok_per_s": round(tps, 1),
                      "sample": sample.strip().replace("\n", " ")[:180]}
    print(json.dumps(results[label], indent=2))
    del m; free_mem()

NUM_CALIB_SAMPLES = 256
MAX_SEQ_LEN = 1024

tok = AutoTokenizer.from_pretrained(model_id)
raw = load_dataset("HuggingFaceH4/ultrachat_200k",
                   split=f"train_sft[:{NUM_CALIB_SAMPLES}]")

def to_text(ex):
    return {"text": tok.apply_chat_template(ex["messages"], tokenize=False)}

def tokenize(ex):
    return tok(ex["text"], padding=False, truncation=True,
               max_length=MAX_SEQ_LEN, add_special_tokens=False)

calib_ds = (raw.shuffle(seed=42)
               .map(to_text)
               .map(tokenize, remove_columns=raw.column_names))
print("Calibration set:", len(calib_ds), "samples, max_seq_len =", MAX_SEQ_LEN)
```
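The overview states that we start from an FP16 baseline, and the numbered labels used later (01_fp8_dynamic, ...) suggest a dedicated entry for it. A minimal sketch of how that baseline run could look, assuming the label 00_fp16_baseline (our own naming) and passing the Hub model ID directly, so the size column stays empty for the uncompressed checkpoint:

```python
# Hypothetical baseline run (label is our choice): measure the uncompressed
# FP16 model so the quantized variants below have a reference point.
benchmark("00_fp16_baseline", model_id)
```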

FP8 Dynamic and GPTQ W4A16 Quantization with llmcompressor

We apply FP8 dynamic quantization using llmcompressor. This recipe compresses the linear layers while keeping the language modeling head in higher precision, and it requires no calibration data. We save the compressed model and benchmark it against the FP16 baseline. We then repeat the process with a GPTQ W4A16 recipe, which quantizes weights to 4 bits using the calibration dataset prepared above.


```python
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier

print("\n════════════ Recipe 1: FP8_DYNAMIC ════════════")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tok = AutoTokenizer.from_pretrained(model_id)

# Data-free FP8 recipe: quantize every Linear layer, keep lm_head in high precision.
recipe_fp8 = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["lm_head"],
)

oneshot(model=model, recipe=recipe_fp8)
FP8_DIR = "Qwen2.5-0.5B-FP8-Dynamic"
model.save_pretrained(FP8_DIR, save_compressed=True)
tok.save_pretrained(FP8_DIR)
del model; free_mem()
benchmark("01_fp8_dynamic", FP8_DIR)

print("\n════════════ Recipe 2: GPTQ W4A16 ════════════")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

# Calibrated 4-bit weight quantization; activations stay in 16-bit.
recipe_w4a16 = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
    dampening_frac=0.01,
)

oneshot(
    model=model,
    dataset=calib_ds,
    recipe=recipe_w4a16,
    max_seq_length=MAX_SEQ_LEN,
    num_calibration_samples=NUM_CALIB_SAMPLES,
)

# Save and benchmark the 4-bit model (mirrors the FP8 steps above; label is ours).
W4A16_DIR = "Qwen2.5-0.5B-W4A16"
model.save_pretrained(W4A16_DIR, save_compressed=True)
tok.save_pretrained(W4A16_DIR)
del model; free_mem()
benchmark("02_gptq_w4a16", W4A16_DIR)
```


Originally published at marktechpost.com. Curated by AI Maestro.
