import os

import gradio as gr
import torch
import deepspeed
from transformers import AutoTokenizer, AutoModelForCausalLM

# Retrieve your Hugging Face token from an environment variable.
hf_token = os.environ.get("hfaccesstoken")
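
# Fail fast if the secret is missing (an added guard, not in the original
# script): without a token, from_pretrained fails with a less obvious
# authentication error if the repository requires access.
if hf_token is None:
    raise RuntimeError("Set the 'hfaccesstoken' environment variable / Space secret.")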

# Set the model identifier.
model_name = "AI-Sweden-Models/gpt-sw3-6.7b"
# Load the tokenizer, authenticating with the token ("use_auth_token" is
# deprecated in recent transformers releases in favor of "token").
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# Load the model with device_map and offload_folder so weights that do not
# fit in GPU memory can be kept elsewhere.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    offload_folder="offload",
    token=hf_token,
)
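
# Note on offloading: with device_map="auto", accelerate fills GPU memory
# first, then CPU RAM, and spills any remaining weights to the "offload"
# directory on disk; offloaded layers are streamed back in during the forward
# pass, trading speed for the ability to load a model this size at all.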

# Wrap the model with DeepSpeed inference, then keep a handle to the wrapped
# module (the pattern used in DeepSpeed's inference tutorial); kernel
# injection modifies the module in place, so it retains the optimizations.
ds_engine = deepspeed.init_inference(
    model,
    mp_size=1,          # No model parallelism.
    dtype=torch.half,   # Use half precision to save memory.
    replace_method="auto",
)
model = ds_engine.module
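
# Optional startup check (an added line, safe to remove): log which model
# class is being served and where its first weights landed.
print(f"Serving {type(model).__name__} on {next(model.parameters()).device}")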

def generate_text(prompt):
    # Tokenize the input and move it to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_length=256,   # Total length in tokens, prompt included.
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)
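
# Quick smoke test (hypothetical example prompt, not part of the original):
# uncomment to verify generation end to end before launching the UI.
# print(generate_text("Berätta en kort saga om en katt."))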

# Create the Gradio interface.
demo = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
    outputs="text",
    title="GPT-SW3 6.7B with DeepSpeed Inference Offloading",
    description=(
        "This demo loads the GPT-SW3 6.7B model from AI-Sweden-Models using "
        "DeepSpeed inference offloading. Enter a prompt and see the model "
        "generate text."
    ),
)

demo.launch()
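
# A Space running this script would also need a requirements.txt listing the
# imported packages, roughly as below (unpinned here; pin the versions you
# have actually tested):
#   torch
#   transformers
#   accelerate   # required for device_map="auto" / offload_folder
#   deepspeed
#   gradio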