Spaces: Build error
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer
)
from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2Sampler
)
# Configure model
model_dir = "TheBloke_Wizard-Vicuna-13B-GPTQ"  # Path to downloaded model
config = ExLlamaV2Config()
config.model_dir = model_dir
config.prepare()

# Load model; the cache must be created lazily for autosplit loading
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy=True)
model.load_autosplit(cache)

# Load tokenizer (ExLlamaV2's own tokenizer, built from the model config)
tokenizer = ExLlamaV2Tokenizer(config)
def generate_response(prompt, max_tokens=200, temperature=0.7):
    # Initialize generator
    generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
    generator.set_stop_conditions([tokenizer.eos_token_id])

    # Configure sampler
    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = temperature
    settings.top_k = 50
    settings.top_p = 0.8

    # generate_simple() takes the prompt string directly and returns
    # the decoded text (prompt followed by the completion), so no
    # manual encode/decode step is needed
    output = generator.generate_simple(
        prompt,
        settings,
        max_tokens,
        seed=42
    )
    return output
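
# Local smoke test (hypothetical prompt; exllamav2 needs a CUDA GPU
# to run GPTQ models, so this will not work on a CPU-only Space):
#   print(generate_response("Say hello in one sentence.", max_tokens=50))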
############################################## | |
from fastapi import FastAPI
import uvicorn

app = FastAPI()

@app.get("/")
def greet_json():
    return {"Hello": "World!"}

# Generation endpoint; "/message" is an assumed route name, and "input"
# arrives as a query parameter
@app.post("/message")
async def message(input: str):
    return generate_response(input)
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
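
For reference, a minimal client sketch against the endpoint above (it assumes the "/message" route and "input" query parameter used in the code, and a Space listening locally on port 7860):

import requests

# Send the prompt as a query parameter to the POST endpoint
resp = requests.post(
    "http://localhost:7860/message",
    params={"input": "Hello, who are you?"},
)
print(resp.json())

The Space also needs its dependencies declared for the build to succeed; a plausible requirements.txt sketch (package names only, versions left unpinned as an assumption):

fastapi
uvicorn
torch
exllamav2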