import os
import logging

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

model_name = "google/gemma-2-2b-it"
tokenizer = None
model = None

try:
    logger.info(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.getenv("HF_TOKEN"))

    use_gpu = torch.cuda.is_available()
    logger.info(f"GPU available: {use_gpu}")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if use_gpu else torch.float32,  # half precision to reduce memory on GPU
        device_map="cuda" if use_gpu else "cpu",  # fall back to CPU when no GPU is available
        token=os.getenv("HF_TOKEN"),
        low_cpu_mem_usage=True,
    )
    logger.info("Model loaded successfully")
except Exception as e:
    logger.error(f"Model load error: {e}")
    raise


def generate_text(text, max_length=50):
    try:
        logger.info(f"Generating text for input: {text}")
        # Send inputs to whichever device the model was loaded on.
        inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_length),  # cap generated tokens, independent of prompt length
            pad_token_id=tokenizer.eos_token_id,
        )
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        logger.info(f"Generated text: {result}")
        return result
    except Exception as e:
        logger.error(f"Generation error: {e}")
        return f"Error: {str(e)}"


iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Slider(10, 100, value=50, step=1, label="Max Length"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="Gemma 2 API",
)

if __name__ == "__main__":
    try:
        logger.info("Launching Gradio interface")
        iface.launch(server_name="0.0.0.0", server_port=8080)
    except Exception as e:
        logger.error(f"Gradio launch error: {e}")
        raise
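
# Usage sketch: one way to call the running interface programmatically.
# Assumes the script above is serving on localhost:8080 and the separate
# `gradio_client` package is installed; "/predict" is the default endpoint
# name Gradio assigns to a single Interface. Positional arguments mirror
# the Interface inputs (text, max length).
#
#   from gradio_client import Client
#
#   client = Client("http://localhost:8080")
#   result = client.predict("Tell me a short story.", 50, api_name="/predict")
#   print(result)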