Overview
In this tutorial, we explore how to apply post-training quantization to an instruction-tuned language model using llmcompressor. We start with an FP16 baseline and then compare multiple compression strategies including FP8 dynamic quantization, GPTQ W4A16, and SmoothQuant with GPTQ W8A8. Along the way, we benchmark each model variant for disk size, generation latency, throughput, perplexity, and output quality.
Key Takeaways
- By applying different quantization methods to an instruction-tuned language model, we gain insights into how they affect model efficiency, deployment readiness, and performance trade-offs.
- We prepare a reusable calibration dataset for each compression recipe, ensuring that the recipes are grounded in realistic inputs.
- The comparison of models with varying degrees of precision (FP16, FP8, 4-bit weights) provides valuable information on how these methods impact model behavior during inference.
```python
import subprocess, sys
from pathlib import Path


def pip(*pkgs):
    """Quietly install *pkgs* into the current interpreter's environment."""
    cmd = [sys.executable, "-m", "pip", "install", "-q", *pkgs]
    subprocess.check_call(cmd)


# Quantization stack: llmcompressor drives the recipes, compressed-tensors
# stores the packed weights, transformers/datasets supply model + calibration data.
pip("llmcompressor", "compressed-tensors",
    "transformers>=4.45", "accelerate", "datasets")
import os, gc, time, json, math
import torch  # was missing: free_mem()/time_generation() below call torch.cuda
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Base instruction-tuned checkpoint used for all quantization experiments.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
# parents=True: robust outside Colab, where /content may not pre-exist
# (the original mkdir(exist_ok=True) would raise FileNotFoundError there).
WORKDIR = Path("/content/quant_lab")
WORKDIR.mkdir(parents=True, exist_ok=True)
os.chdir(WORKDIR)
def free_mem():
    """Release unreferenced Python objects and unused cached GPU memory."""
    gc.collect()
    torch.cuda.empty_cache()
def dir_size_gb(path):
    """Return the total size of all files under *path*, in decimal gigabytes."""
    total_bytes = sum(
        os.path.getsize(os.path.join(root, name))
        for root, _, names in os.walk(path)
        for name in names
    )
    return total_bytes / 1e9
def time_generation(model, tok, prompt, max_new_tokens=64):
    """Greedy-decode `prompt` and measure wall-clock generation speed.

    A short 4-token warmup run first absorbs one-time lazy-init and kernel
    compilation costs so they are excluded from the timed pass.

    Returns:
        (text, seconds, tokens_per_second). Throughput is based on the
        number of tokens actually produced — greedy decoding can stop at
        EOS before reaching max_new_tokens, so dividing max_new_tokens by
        the elapsed time (as the original did) overstates tokens/sec.
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    _ = model.generate(**inputs, max_new_tokens=4, do_sample=False)  # warmup
    torch.cuda.synchronize()  # drain async CUDA work before starting the clock
    t0 = time.time()
    out = model.generate(**inputs, max_new_tokens=max_new_tokens,
                         do_sample=False, pad_token_id=tok.eos_token_id)
    torch.cuda.synchronize()  # ...and before stopping it
    dt = time.time() - t0
    new_ids = out[0][inputs["input_ids"].shape[1]:]
    n_new = new_ids.shape[0]  # actual generated length (may be < max_new_tokens)
    return tok.decode(new_ids, skip_special_tokens=True), dt, n_new / dt
@torch.no_grad()
def wikitext_ppl(model, tok, seq_len=512, max_chunks=20, stride=512):
    """Light WikiText-2 perplexity probe (fast, indicative).

    Concatenates the first 400 test lines, scores up to `max_chunks`
    windows of `seq_len` tokens, and returns exp(mean NLL per token).

    Raises:
        ValueError: if the encoded text is shorter than seq_len, in which
            case no chunk is scored (the original divided by zero here).
    """
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(t for t in ds["text"][:400] if t.strip())
    enc = tok(text, return_tensors="pt").input_ids.to(model.device)
    nll_sum, tok_count = 0.0, 0
    for begin in range(0, enc.size(1) - seq_len, stride):
        chunk = enc[:, begin:begin + seq_len]
        out = model(chunk, labels=chunk)
        # NOTE(review): HF averages loss over seq_len-1 shifted targets;
        # weighting by seq_len is a close, fast approximation.
        nll_sum += out.loss.float().item() * seq_len
        tok_count += seq_len
        if tok_count // seq_len >= max_chunks:
            break
    if tok_count == 0:
        raise ValueError("encoded text shorter than seq_len; no chunks scored")
    return math.exp(nll_sum / tok_count)
# Accumulates one row of metrics per model variant, keyed by label.
results = {}
# Qwen2 chat-template prompt used for every latency/throughput probe, so all
# variants are timed on identical input.
PROMPT = ("<|im_start|>user\nIn two sentences, explain why post-training "
          "quantization works for large language models.<|im_end|>\n"
          "<|im_start|>assistant\n")
def benchmark(label, model_path_or_id):
    """Load a model variant, measure it, and record the metrics in results[label].

    Captures on-disk size (local checkpoint directories only), WikiText-2
    perplexity, greedy-generation latency/throughput, and a truncated
    one-line sample answer, then frees the model before the next variant.
    """
    free_mem()
    print(f"\n──── benchmarking: {label} ────")
    tok = AutoTokenizer.from_pretrained(model_path_or_id)
    m = AutoModelForCausalLM.from_pretrained(
        model_path_or_id, torch_dtype="auto", device_map="cuda").eval()
    sample, dt, tps = time_generation(m, tok, PROMPT)
    ppl = wikitext_ppl(m, tok)
    # Hub model IDs are not directories, so size is None for the FP16 baseline.
    size = dir_size_gb(model_path_or_id) if os.path.isdir(str(model_path_or_id)) else None
    results[label] = {"size_gb": size, "ppl": round(ppl, 3),
                      "latency_s": round(dt, 3), "tok_per_s": round(tps, 1),
                      # single-line, 180-char sample for the summary table
                      "sample": sample.strip().replace("\n", " ")[:180]}
    print(json.dumps(results[label], indent=2))
    # Drop the model and flush the CUDA cache before the next variant loads.
    del m; free_mem()
```
Installation and Environment Setup
Having installed all required libraries with pip in the previous cell, we now verify that a CUDA-enabled GPU is present and ready for use, and print the detected device and PyTorch version.
```python
print("CUDA enabled? ", torch.cuda.is_available(), "| Device:", torch.cuda.get_device_name(0), "| torch version:", torch.__version__)
```
Model and Helper Functions
We define the base Qwen2.5 instruction model, create a working directory for our experiments, and prepare helper functions to manage memory, calculate model sizes, time generation operations, and evaluate perplexity.
```python
# Base checkpoint and working directory for all quantization experiments.
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
# parents=True: robust outside Colab, where /content may not pre-exist
# (the original mkdir(exist_ok=True) would raise FileNotFoundError there).
workdir = Path("/content/quant_lab")
workdir.mkdir(parents=True, exist_ok=True)
os.chdir(workdir)
def free_mem():
    """Reclaim host and GPU memory between model loads."""
    gc.collect()
    torch.cuda.empty_cache()
def dir_size_gb(path):
    """Walk *path* recursively and return the cumulative file size in GB."""
    n_bytes = 0
    for root, _, names in os.walk(path):
        n_bytes += sum(os.path.getsize(os.path.join(root, n)) for n in names)
    return n_bytes / 1e9
def time_generation(model, tok, prompt, max_new_tokens=64):
    """Greedy-decode `prompt` and measure wall-clock generation speed.

    A short 4-token warmup run first absorbs one-time lazy-init and kernel
    compilation costs so they are excluded from the timed pass.

    Returns:
        (text, seconds, tokens_per_second). Throughput is based on the
        number of tokens actually produced — greedy decoding can stop at
        EOS before reaching max_new_tokens, so dividing max_new_tokens by
        the elapsed time (as the original did) overstates tokens/sec.
    """
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    _ = model.generate(**inputs, max_new_tokens=4, do_sample=False)  # warmup
    torch.cuda.synchronize()  # drain async CUDA work before starting the clock
    t0 = time.time()
    out = model.generate(**inputs, max_new_tokens=max_new_tokens,
                         do_sample=False, pad_token_id=tok.eos_token_id)
    torch.cuda.synchronize()  # ...and before stopping it
    dt = time.time() - t0
    new_ids = out[0][inputs["input_ids"].shape[1]:]
    n_new = new_ids.shape[0]  # actual generated length (may be < max_new_tokens)
    return tok.decode(new_ids, skip_special_tokens=True), dt, n_new / dt
@torch.no_grad()
def wikitext_ppl(model, tok, seq_len=512, max_chunks=20, stride=512):
    """Light WikiText-2 perplexity probe (fast, indicative).

    Concatenates the first 400 test lines, scores up to `max_chunks`
    windows of `seq_len` tokens, and returns exp(mean NLL per token).

    Raises:
        ValueError: if the encoded text is shorter than seq_len, in which
            case no chunk is scored (the original divided by zero here).
    """
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    text = "\n\n".join(t for t in ds["text"][:400] if t.strip())
    enc = tok(text, return_tensors="pt").input_ids.to(model.device)
    nll_sum, tok_count = 0.0, 0
    for begin in range(0, enc.size(1) - seq_len, stride):
        chunk = enc[:, begin:begin + seq_len]
        out = model(chunk, labels=chunk)
        # NOTE(review): HF averages loss over seq_len-1 shifted targets;
        # weighting by seq_len is a close, fast approximation.
        nll_sum += out.loss.float().item() * seq_len
        tok_count += seq_len
        if tok_count // seq_len >= max_chunks:
            break
    if tok_count == 0:
        raise ValueError("encoded text shorter than seq_len; no chunks scored")
    return math.exp(nll_sum / tok_count)
# Accumulates one row of metrics per model variant, keyed by label.
results = {}
# Qwen2 chat-template prompt used for every latency/throughput probe, so all
# variants are timed on identical input.
PROMPT = ("<|im_start|>user\nIn two sentences, explain why post-training "
          "quantization works for large language models.<|im_end|>\n"
          "<|im_start|>assistant\n")
```
Benchmark Function with Calibration Dataset
We define a reusable benchmark function that loads any model variant, tests its generation speed, calculates perplexity, and stores the results for final comparison. We also create a calibration dataset using UltraChat samples to ensure that compression recipes are grounded in realistic inputs.
```python
# Number of chat samples used to calibrate quantization statistics, and the
# per-sample token budget.
NUM_CALIB_SAMPLES = 256
MAX_SEQ_LEN = 1024
tok = AutoTokenizer.from_pretrained(model_id)
# Only the first NUM_CALIB_SAMPLES rows of the SFT split are downloaded.
raw = load_dataset("HuggingFaceH4/ultrachat_200k",
                   split=f"train_sft[:{NUM_CALIB_SAMPLES}]")
def to_text(ex):
    """Render one multi-turn conversation into the model's chat format."""
    return {"text": tok.apply_chat_template(ex["messages"], tokenize=False)}
def tokenize(ex):
    """Tokenize without padding, truncating to MAX_SEQ_LEN tokens."""
    return tok(ex["text"], padding=False, truncation=True,
               max_length=MAX_SEQ_LEN, add_special_tokens=False)
# remove_columns drops only the ORIGINAL columns; the rendered "text" column
# added by to_text plus the token fields survive into the calibration set.
calib_ds = (raw.shuffle(seed=42)
            .map(to_text)
            .map(tokenize, remove_columns=raw.column_names))
print("Calibration set:", len(calib_ds), "samples, max_seq_len =", MAX_SEQ_LEN)
```
FP8 Dynamic Quantization with llmcompressor
We apply FP8 dynamic quantization using llmcompressor. This method compresses linear layers while keeping the language modeling head in higher precision. We save the compressed model and run a benchmark to compare its performance against the original FP16 model.
```python
from llmcompressor import oneshot
# GPTQModifier is required by the W4A16 recipe below; the original imported
# only QuantizationModifier, so that later cell raised NameError.
from llmcompressor.modifiers.quantization import QuantizationModifier, GPTQModifier

print("\n════════════ Recipe 1: FP8_DYNAMIC ════════════")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tok = AutoTokenizer.from_pretrained(model_id)

# FP8 dynamic: weights quantized to FP8, activation scales computed at
# runtime — no calibration dataset is needed for this recipe.
recipe_fp8 = QuantizationModifier(
    targets="Linear",       # quantize every nn.Linear layer...
    scheme="FP8_DYNAMIC",
    ignore=["lm_head"],     # ...except the LM head, kept in higher precision
)
oneshot(model=model, recipe=recipe_fp8)

FP8_DIR = "Qwen2.5-0.5B-FP8-Dynamic"
model.save_pretrained(FP8_DIR, save_compressed=True)
tok.save_pretrained(FP8_DIR)
del model; free_mem()
benchmark("01_fp8_dynamic", FP8_DIR)
print("\n════════════ Recipe 2: GPTQ W4A16 ════════════")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

# GPTQ: 4-bit weights, 16-bit activations; needs the calibration set so the
# Hessian-based weight updates are grounded in realistic inputs.
recipe_w4a16 = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
    dampening_frac=0.01,  # Hessian damping for numerical stability
)
oneshot(
    model=model,
    dataset=calib_ds,
    recipe=recipe_w4a16,
    max_seq_length=MAX_SEQ_LEN,
    num_calibration_samples=NUM_CALIB_SAMPLES,
)

# NOTE(review): the original source was truncated mid-string at the line
# below; the save + benchmark calls mirror the FP8 recipe above.
W4A16_DIR = "Qwen2.5-0.5B-W4A16"
model.save_pretrained(W4A16_DIR, save_compressed=True)
tok.save_pretrained(W4A16_DIR)
del model; free_mem()
benchmark("02_gptq_w4a16", W4A16_DIR)
```

Originally published at marktechpost.com. Curated by AI Maestro.
Stay ahead of AI. Get the most important stories delivered to your inbox — no spam, no noise.




