BitsFusion script: quantization without fine-tuning #5
Comments
Hi, did you implement this?
Here's the same code with the comments translated, attached below.
import torch
import copy
import numpy as np
from transformers import CLIPProcessor, CLIPModel
from diffusers import StableDiffusionPipeline
from scipy.cluster.vq import kmeans2
from safetensors.torch import save_file

# Configuration
model_path = "path/to/your/sd-v1-5-fp16.safetensors"
output_path = "path/to/your/quantized_model.safetensors"
bits = [1, 2, 3, 4]  # Bit-widths to be tested
sensitivity_threshold = 0.05
size_factor = 0.5
clip_thresholds = [0.9, 0.95, 0.98]
time_steps = 50
# Function to calculate Mean Squared Error (MSE)
def calculate_mse(images1, images2):
    """Calculate the Mean Squared Error (MSE) between two aligned batches of images."""
    a = np.stack([np.asarray(im, dtype=np.float32) for im in images1])
    b = np.stack([np.asarray(im, dtype=np.float32) for im in images2])
    return float(((a - b) ** 2).mean())
# Function to calculate CLIP score
def calculate_clip_score(images, texts, clip_processor, clip_model):
    """Calculate the CLIP score for generated images and their text prompts."""
    inputs = clip_processor(text=texts, images=images, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = clip_model(**inputs)
    logits_per_image = outputs.logits_per_image
    # The diagonal pairs each image with its own prompt (images and texts are aligned)
    return logits_per_image.diag().mean().item()
# Function to analyze layer sensitivity
def analyze_layer_sensitivity(model, prompts, bits, sample_size=100):
    """Analyze layer sensitivity to quantization."""
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    # Reference images and CLIP score from the unquantized model, computed once and reused
    reference_images = generate_images(model, prompts, sample_size)
    baseline_clip = calculate_clip_score(reference_images, prompts, clip_processor, clip_model)
    results = {}
    for name, param in model.named_parameters():
        if "weight" in name:
            # Full-precision baseline, keyed as 32 so it can be looked up later
            results[name] = {32: {"mse": 0.0, "clip_score": baseline_clip, "parameters": param.numel()}}
            for b in bits:
                quantized_model = copy.deepcopy(model)
                quantize_layer(quantized_model, name, b)
                # Generate images with the quantized model
                generated_images = generate_images(quantized_model, prompts, sample_size)
                # Calculate metrics against the full-precision reference
                results[name][b] = {
                    "mse": calculate_mse(generated_images, reference_images),
                    "clip_score": calculate_clip_score(generated_images, prompts, clip_processor, clip_model),
                    "parameters": param.numel()
                }
    return results
# Function to quantize a layer using Lloyd-Max (k-means) from SciPy
def quantize_layer(model, layer_name, bits):
    """Apply quantization to a specific layer using the Lloyd-Max algorithm from SciPy.

    Args:
        model: PyTorch model containing the layer to be quantized.
        layer_name: Parameter name of the layer to be quantized (string, e.g. "conv_in.weight").
        bits: Number of bits used for quantization (integer).
    """
    # named_parameters() yields names like "block.conv.weight"; strip the parameter
    # suffix to look up the owning module in named_modules()
    module_name = layer_name.rsplit(".", 1)[0] if layer_name.endswith(".weight") else layer_name
    layer = dict(model.named_modules())[module_name]

    # Ensure the layer has weights
    if not hasattr(layer, "weight") or layer.weight is None:
        print(f"Layer {module_name} does not have weights.")
        return

    # Get the weight tensor
    w = layer.weight.data

    # Skip quantization if weights are not a floating-point data type
    if w.dtype not in (torch.float16, torch.float32, torch.float64):
        print(f"Layer {module_name} does not have floating-point weights.")
        return

    # Move to CPU as float32 for Lloyd-Max quantization (kmeans2 expects float32/64)
    w_np = w.detach().cpu().float().numpy()

    # Perform Lloyd-Max quantization using kmeans2 from SciPy
    centroids, labels = kmeans2(w_np.reshape(-1, 1), 2 ** bits, minit="points")
    quantized_w = centroids[labels].reshape(w_np.shape)

    # Convert back to a PyTorch tensor and write it back to the layer
    layer.weight.data = torch.from_numpy(quantized_w).to(device=w.device, dtype=w.dtype)

    # Record an affine scale / zero point for reference (e.g. for later export).
    # Note: the stored weights already hold the dequantized centroid values,
    # so no extra dequantization step is needed in the forward pass.
    min_val = float(w_np.min())
    max_val = float(w_np.max())
    scale = (max_val - min_val) / (2 ** bits - 1)
    if scale == 0.0:  # constant weight tensor; avoid division by zero below
        scale = 1e-12
    layer.quantization_scale = torch.tensor(scale, device=w.device)
    layer.quantization_zero_point = torch.tensor(round(-min_val / scale), device=w.device)
# Function to generate images
def generate_images(model, prompts, sample_size):
    """Generate images with the Stable Diffusion pipeline using the given UNet."""
    # Minimal sketch: reuse the global `pipe`, temporarily swapping in the (possibly
    # quantized) UNet. One image per prompt, capped by sample_size, with a fixed seed
    # so runs on different model variants are comparable.
    original_unet, pipe.unet = pipe.unet, model
    generator = torch.Generator().manual_seed(0)
    images = [
        pipe(p, num_inference_steps=time_steps, generator=generator).images[0]
        for p in prompts[:sample_size]
    ]
    pipe.unet = original_unet
    return images
# Function to determine mixed-precision strategy
def determine_mixed_precision(results, sensitivity_threshold, size_factor, clip_thresholds):
    """Determine the optimal bit-width for each layer."""
    mixed_precision = {}
    for name, layer_results in results.items():
        # Candidate bit-widths (the entry keyed 32 is the full-precision baseline)
        candidate_bits = sorted(b for b in layer_results if b != 32)
        sensitivity_scores = {
            b: layer_results[b]["mse"] / (layer_results[b]["parameters"] ** size_factor)
            for b in candidate_bits
        }
        optimal_bits = min(candidate_bits, key=lambda b: sensitivity_scores[b])
        if sensitivity_scores[optimal_bits] > sensitivity_threshold:
            optimal_bits = max(candidate_bits)  # Use maximum bit-width if exceeding threshold
        # Drop in CLIP score of the highest-bit quantized variant versus full precision
        clip_score_drop = results[name][32]["clip_score"] - layer_results[max(candidate_bits)]["clip_score"]
        for i, threshold in enumerate(clip_thresholds):
            if clip_score_drop > np.quantile(
                [results[n][32]["clip_score"] for n in results], threshold
            ):
                optimal_bits += i + 1
                break
        mixed_precision[name] = optimal_bits
    return mixed_precision
# Function to quantize the model
def quantize_model(model, mixed_precision):
    """Apply quantization to the model based on mixed-precision strategy."""
    for name, param in model.named_parameters():
        if "weight" in name and name in mixed_precision:
            quantize_layer(model, name, mixed_precision[name])
    return model
# Function for pre-computation and caching of time embeddings
def precompute_time_embeddings(model, time_steps):
    """Calculate and cache the UNet time embeddings for each timestep."""
    time_embeddings = {}
    with torch.no_grad():
        for t in range(time_steps):
            # The diffusers UNet first projects the timestep (time_proj), then embeds it
            t_proj = model.time_proj(torch.tensor([t], device=model.device)).to(dtype=model.dtype)
            time_embeddings[t] = model.time_embedding(t_proj)
    return time_embeddings
# Load the Stable Diffusion model
# from_single_file loads a single .safetensors checkpoint (requires a recent diffusers
# version); use from_pretrained if you have a diffusers-format directory or Hub ID instead.
pipe = StableDiffusionPipeline.from_single_file(model_path, torch_dtype=torch.float16)
pipe = pipe.to("cuda")  # assumes a CUDA GPU is available for fp16 inference
model = pipe.unet

# Layer sensitivity analysis
prompts = ["A photo of a cat"]  # Replace with your desired prompts
results = analyze_layer_sensitivity(model, prompts, bits)

# Determine mixed-precision strategy
mixed_precision = determine_mixed_precision(
    results, sensitivity_threshold, size_factor, clip_thresholds
)

# Apply quantization to the model
quantized_model = quantize_model(model, mixed_precision)

# Pre-computation and caching of time embeddings
time_embeddings = precompute_time_embeddings(quantized_model, time_steps)

# Save the quantized UNet weights in safetensors format (matching output_path)
save_file({k: v.detach().cpu().contiguous() for k, v in quantized_model.state_dict().items()}, output_path)
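
For anyone who wants to reuse the result: here is a minimal sketch of loading the saved weights back into a pipeline. It assumes the same base checkpoint, a recent diffusers/safetensors install, and the variable names from the script above; `reloaded_pipe` is just an illustrative name.

from safetensors.torch import load_file

# Rebuild the pipeline from the original checkpoint, then overwrite the UNet
# weights with the quantized state dict saved above.
reloaded_pipe = StableDiffusionPipeline.from_single_file(model_path, torch_dtype=torch.float16)
quantized_state = load_file(output_path)
reloaded_pipe.unet.load_state_dict(quantized_state)
reloaded_pipe = reloaded_pipe.to("cuda")
image = reloaded_pipe("A photo of a cat", num_inference_steps=time_steps).images[0]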