ether0 / app.py
import os
import gradio as gr
import spaces
import torch
import logging
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import logging as hf_logging

# Write application logs to a file so they persist on the Space.
logging.basicConfig(
    filename="/tmp/app.log",
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s: %(message)s",
)
logging.info("Starting app.py logging")

# Configure the transformers logger: INFO verbosity, explicit formatting,
# and a dedicated log file alongside the app log.
hf_logging.set_verbosity_info()
hf_logging.enable_default_handler()
hf_logging.enable_explicit_format()
hf_logging.add_handler(logging.FileHandler("/tmp/transformers.log"))

model_id = "futurehouse/ether0"

# Load ether0 once at startup; device_map="auto" places it on the Space's GPU
# and float16 halves the memory footprint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)

@spaces.GPU
def chat_fn(prompt, max_tokens=512):
    t0 = time.time()
    # Clamp the request so a single prompt cannot ask for unbounded generation.
    max_tokens = min(int(max_tokens), 32_000)
    try:
        # Wrap the raw prompt in the model's chat template.
        messages = [{"role": "user", "content": prompt}]
        chat_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(chat_prompt, return_tensors="pt").to(model.device)
        t1 = time.time()
        logging.info(f"🧠 Tokenization complete in {t1 - t0:.2f}s")

        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.1,
            pad_token_id=tokenizer.eos_token_id,
        )
        t2 = time.time()
        logging.info(f"⚡️ Generation complete in {t2 - t1:.2f}s (max_tokens={max_tokens})")

        # Decode only the newly generated tokens, skipping the prompt.
        generated_text = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
        )
        t3 = time.time()
        logging.info(f"🔓 Decoding complete in {t3 - t2:.2f}s (output length: {len(generated_text)})")
        return generated_text
    except Exception:
        logging.exception("❌ Exception during generation")
        return "⚠️ Generation failed"

# Minimal Gradio UI: a prompt box plus a max_tokens field, returning plain text.
gr.Interface(
    fn=chat_fn,
    inputs=[
        gr.Textbox(label="prompt"),
        gr.Number(label="max_tokens", value=512, precision=0),
    ],
    outputs="text",
    title="Ether0",
).launch(ssr_mode=False)
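
# A minimal client-side sketch for querying the deployed Space, assuming it is
# published under a placeholder id such as "jtordable/ether0" and exposes the
# default /predict endpoint of gr.Interface. Run it from a separate process,
# not inside app.py:
#
#   from gradio_client import Client
#
#   client = Client("jtordable/ether0")  # hypothetical Space id
#   reply = client.predict(
#       "Suggest a molecule with a molecular weight near 250 Da.",  # prompt
#       512,                                                        # max_tokens
#       api_name="/predict",
#   )
#   print(reply)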