import os

import gradio as gr
import torch
import deepspeed
from transformers import AutoTokenizer, AutoModelForCausalLM

# Retrieve your Hugging Face token from an environment variable.
hf_token = os.environ.get("hfaccesstoken")

# Set the correct model identifier.
model_name = "AI-Sweden-Models/gpt-sw3-6.7b"

# Load the tokenizer using your authentication token.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)

# Load the model with device_map and offload_folder so Accelerate can
# place layers across GPU/CPU/disk and keep GPU memory in check.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    offload_folder="offload",
    use_auth_token=hf_token
)

# Wrap the model with DeepSpeed inference.
model = deepspeed.init_inference(
    model,
    mp_size=1,              # No model parallelism.
    dtype=torch.half,       # Use half precision to save memory.
    replace_method="auto"
)

def generate_text(prompt):
    # Tokenize the input and move it to the device of the underlying HF model
    # (the DeepSpeed engine keeps it under .module).
    inputs = tokenizer(prompt, return_tensors="pt").to(model.module.device)
    output = model.generate(
        **inputs,
        max_length=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.7
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Create the Gradio interface.
demo = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter your prompt here..."),
    outputs="text",
    title="GPT-SW3 6.7B with DeepSpeed Inference Offloading",
    description=(
        "This demo loads the GPT-SW3 6.7B model from AI-Sweden-Models using DeepSpeed inference offloading. "
        "Enter a prompt and see the model generate text."
    )
)

demo.launch()
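
# --- Optional smoke test (a sketch, not part of the app above) ---
# Run this in a separate Python session while the demo is up. It assumes
# demo.launch() used its default local address (http://127.0.0.1:7860),
# that the gradio_client package is installed (`pip install gradio_client`),
# and the prompt below is only an illustrative example.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860/")
# result = client.predict(
#     "Berätta kort om Stockholm.",   # example prompt (assumption)
#     api_name="/predict"             # default endpoint name for gr.Interface
# )
# print(result)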