Learn how LLMs are quantized dynamically using the PyTorch framework. Check out the impact on:
Accuracy
Size of model
The intent is not to teach PyTorch but to demystify quantization and show its real benefit: model compression without a significant loss of accuracy.
Create a new notebook under template/Quantization/Pytorch-Dynamic-Quantization.
Note: To understand how this code works, please review the lessons in the section Hugging Face (advanced).
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load pre-trained model and tokenizer (e.g., GPT-2)
# model_name = "openai-community/gpt2"
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=False)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Test the model before quantization
text = "Once upon a time,"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
# Print model output
print("Original Model Output:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Apply dynamic quantization to the model
quantized_model = torch.quantization.quantize_dynamic(
model, # the model to quantize
{torch.nn.Linear}, # layers to quantize (focusing on Linear layers)
dtype=torch.qint8 # quantize the weights to 8-bit integers
)
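Dynamic quantization replaces the selected torch.nn.Linear layers with int8 equivalents: the weights are stored as 8-bit integers, and activations are quantized on the fly at inference time. As an optional sanity check, you can count how many layers were swapped. This is just a sketch; it assumes a recent PyTorch (2.x), where the dynamically quantized Linear class lives under torch.ao.nn.quantized.dynamic.
# Optional sanity check: count the Linear layers replaced by dynamic quantization
import torch.ao.nn.quantized.dynamic as nnqd
num_quantized = sum(1 for m in quantized_model.modules() if isinstance(m, nnqd.Linear))
print(f"Dynamically quantized Linear layers: {num_quantized}")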
# Test the quantized model's output
outputs_quantized = quantized_model.generate(**inputs, max_new_tokens=10)
# Print the output
print("\nQuantized Model Output:")
print(tokenizer.decode(outputs_quantized[0], skip_special_tokens=True))
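The generated text only gives a qualitative feel for accuracy. For a rough quantitative comparison, one option (a sketch, not part of the original lesson) is to compute perplexity on a short piece of text with both models; the sample sentence below is arbitrary.
# Rough perplexity comparison (lower is better); the sample text is arbitrary
def perplexity(m, sample_text):
    enc = tokenizer(sample_text, return_tensors="pt")
    with torch.no_grad():
        loss = m(**enc, labels=enc["input_ids"]).loss
    return torch.exp(loss).item()

sample = "Once upon a time, there was a king who ruled a small kingdom."
print(f"\nPerplexity (original):  {perplexity(model, sample):.2f}")
print(f"Perplexity (quantized): {perplexity(quantized_model, sample):.2f}")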
# Function to print and compare model sizes
# Code below serializes the model to the file system
# Note that this is just to get an idea of relative sizes
# and not the exact memory footprints.
def print_size_of_model(model, model_name=""):
torch.save(model.state_dict(), f"{model_name}.pt")
size_mb = os.path.getsize(f"{model_name}.pt") / 1e6
print(f"\nModel size of {model_name}: {size_mb:.2f} MB")
# Compare sizes of original and quantized models
print_size_of_model(model, "Original_Model")
print_size_of_model(quantized_model, "Quantized_Model")
# Clean up saved files after checking size
os.remove("Original_Model.pt")
os.remove("Quantized_Model.pt")
Larger models (over ~1B parameters) need a GPU for quantization. Use a Google Colab T4 runtime when working with them; a minimal sketch of checking the runtime is shown below.
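The snippet below is only an illustration (it reuses the imports from the start of the notebook, and the checkpoint name is just an example of a larger OPT variant, not part of the lesson above): it checks whether a CUDA device is available and loads the model onto it.
# Check whether a GPU (e.g., Colab's T4) is available before loading a larger model
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device}")

large_model_name = "facebook/opt-1.3b"  # example of a ~1.3B-parameter checkpoint
large_tokenizer = AutoTokenizer.from_pretrained(large_model_name)
large_model = AutoModelForCausalLM.from_pretrained(large_model_name).to(device)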