import os
import gradio as gr
import torch
import deepspeed
from transformers import AutoTokenizer, AutoModelForCausalLM
# Retrieve your Hugging Face token from an environment variable.
hf_token = os.environ.get("hfaccesstoken")
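# Optional sanity check (an added guard, not part of the original flow): os.environ.get
# returns None when the variable is unset, and from_pretrained would then fall back to
# anonymous access, which fails for gated or private models.
if hf_token is None:
    print("Warning: environment variable 'hfaccesstoken' is not set; "
          "downloading the model may fail if authentication is required.")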
# Set the correct model identifier.
model_name = "AI-Sweden-Models/gpt-sw3-6.7b"
# Load the tokenizer, passing the authentication token.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# Load the model in half precision, using device_map and offload_folder so that
# weights which do not fit in GPU memory can be offloaded to disk.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    offload_folder="offload",
    torch_dtype=torch.float16,
    token=hf_token,
)
# Wrap the model with DeepSpeed inference.
model = deepspeed.init_inference(
    model,
    mp_size=1,            # No model parallelism.
    dtype=torch.half,     # Use half precision to save memory.
    replace_method="auto",
)

def generate_text(prompt):
    # Tokenize the input and move it to the device of the underlying HF model.
    # After init_inference, `model` is a DeepSpeed engine; the original model is
    # available as `model.module`, which exposes the usual `.device` property.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.module.device)
    output = model.generate(
        **inputs,
        max_length=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)
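# Optional quick smoke test before starting the UI (uncomment to try a sample prompt):
# print(generate_text("Once upon a time"))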
# Create the Gradio interface.
demo = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
    outputs="text",
    title="GPT-SW3 6.7B with DeepSpeed Inference Offloading",
    description=(
        "This demo loads the GPT-SW3 6.7B model from AI-Sweden-Models using DeepSpeed inference offloading. "
        "Enter a prompt and see the model generate text."
    ),
)

demo.launch()