import os
import logging
from threading import Thread
from logging.handlers import RotatingFileHandler

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
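# Assumption: 4-bit loading and device_map="auto" below require the
# bitsandbytes and accelerate packages to be installed alongside transformers.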
# Logging setup
log_file = '/tmp/app_debug.log'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
logger.debug("Application started")
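# /tmp is typically writable on Hugging Face Spaces; the handler rotates the
# log at 10 MB and keeps up to 5 backup files.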
# Define model parameters
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
CONTEXT_LENGTH = 16000

# Configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
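# With load_in_4bit=True the weights are stored in 4-bit precision while the
# matrix multiplications are computed in bfloat16 (bnb_4bit_compute_dtype).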
# Load tokenizer and model
if torch.cuda.is_available():
    logger.debug("GPU is available. Proceeding with GPU setup.")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quantization_config,
        trust_remote_code=True,
    )
else:
    logger.warning("GPU is not available. Proceeding with CPU setup.")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
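# device_map="auto" lets accelerate place the model on the available devices.
# Note that the CPU-only branch loads the 7B model without quantization
# (float32 by default), which requires substantial RAM.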
# Create Hugging Face pipeline (do_sample=True so temperature/top_k/top_p take effect)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=CONTEXT_LENGTH,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
)
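# The values above are only defaults; predict() below overrides temperature,
# top_k, top_p, repetition_penalty and the generation length on every call.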
# Prediction function using the text-generation pipeline
def predict(
    message,
    temperature,
    max_new_tokens,
    top_k,
    repetition_penalty,
    top_p,
):
    try:
        result = pipe(
            message,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        return result[0]['generated_text']
    except Exception as e:
        logger.exception(f"Error during prediction: {e}")
        return "An error occurred."
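# Note: the text-generation pipeline returns the prompt plus the completion in
# 'generated_text'; pass return_full_text=False to get only the completion.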
# Gradio UI
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="User input"),
        gr.Slider(0, 1, 0.7, label="Temperature"),
        gr.Slider(128, 2048, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    outputs="text",
    live=True,
)
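# live=True re-runs predict on every input change, which can be expensive for
# a 7B model; set live=False to generate only when the user submits.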
logger.debug("Chat interface initialized; launching")
interface.launch()