QWEN-2.5-Coder-7B

Sleeping

App Files Files Community

Leri777 committed on Oct 9, 2024

Commit

cfdd958

verified ·

1 Parent(s): 6a6e013

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -74

app.py CHANGED Viewed

@@ -1,15 +1,10 @@
-# Optimized Python script for ZeroGPU Environment with Qwen-2.5-Coder-7B-Instruct
 import os
 import logging
 from threading import Thread
 from logging.handlers import RotatingFileHandler
 import torch
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from langchain_huggingface import HuggingFacePipeline
-from langchain.prompts import PromptTemplate
-from langchain.chains import LLMChain
-from transformers import pipeline
 # Logging setup
 log_file = '/tmp/app_debug.log'
@@ -30,31 +25,25 @@ quantization_config = BitsAndBytesConfig(
     load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
 )
-# Load tokenizer and model with GPU availability check
-def load_model():
-    if torch.cuda.is_available():
-        logger.debug("GPU is available. Proceeding with GPU setup.")
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            device_map="auto",
-            quantization_config=quantization_config,
-            trust_remote_code=True,
-        )
-        device = torch.device('cuda')
-    else:
-        logger.warning("GPU is not available. Proceeding with CPU setup.")
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            device_map="auto",
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-        )
-        device = torch.device('cpu')
-    return model, tokenizer, device
-model, tokenizer, device = load_model()
 # Create Hugging Face pipeline
 pipe = pipeline(
@@ -68,56 +57,18 @@ pipe = pipeline(
     repetition_penalty=1.2,
 )
-# Initialize HuggingFacePipeline model for LangChain
-chat_model = HuggingFacePipeline(pipeline=pipe)
-logger.debug("Model and tokenizer loaded successfully")
-# Define the conversation template for LangChain
-template = """<|im_start|>system
-{system_prompt}
-<|im_end|>
-{history}
-<|im_start|>user
-{human_input}
-<|im_end|>
-<|im_start|>assistant"""
-# Create LangChain prompt and chain
-prompt = PromptTemplate(
-    template=template, input_variables=["system_prompt", "history", "human_input"]
-)
-chain = LLMChain(llm=chat_model, prompt=prompt)
-# Format the conversation history
-def format_history(history):
-    formatted = ""
-    for human, ai in history:
-        formatted += f"<|im_start|>user\n{human}\n<|im_end|>\n<|im_start|>assistant\n{ai}\n<|im_end|>\n"
-    return formatted
-# Prediction function using LangChain and model
 def predict(
     message,
-    history,
-    system_prompt,
     temperature,
     max_new_tokens,
     top_k,
     repetition_penalty,
     top_p,
 ):
-    formatted_history = format_history(history)
     try:
-        result = chain.run(
-            {
-                "system_prompt": system_prompt,
-                "history": formatted_history,
-                "human_input": message,
-            }
-        )
-        return result
     except Exception as e:
         logger.exception(f"Error during prediction: {e}")
         return "An error occurred."
@@ -127,7 +78,6 @@ interface = gr.Interface(
     fn=predict,
     inputs=[
         gr.Textbox(label="User input"),
-        gr.Textbox("You are a helpful coding assistant", label="System prompt"),
         gr.Slider(0, 1, 0.7, label="Temperature"),
         gr.Slider(128, 2048, 1024, label="Max new tokens"),
         gr.Slider(1, 80, 40, label="Top K sampling"),
@@ -140,4 +90,4 @@ interface = gr.Interface(
 interface.launch()
-logger.debug("Chat interface initialized and launched")

 import os
 import logging
 from threading import Thread
 from logging.handlers import RotatingFileHandler
 import torch
 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
 # Logging setup
 log_file = '/tmp/app_debug.log'
     load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
 )
+# Load tokenizer and model.
+# Only the branch-specific keyword arguments differ between GPU and CPU;
+# the tokenizer and the shared model arguments are the same on both paths.
+if torch.cuda.is_available():
+    logger.debug("GPU is available. Proceeding with GPU setup.")
+    branch_kwargs = {"quantization_config": quantization_config}
+else:
+    logger.warning("GPU is not available. Proceeding with CPU setup.")
+    branch_kwargs = {"low_cpu_mem_usage": True}
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    trust_remote_code=True,
+    **branch_kwargs,
+)
 # Create Hugging Face pipeline
 pipe = pipeline(
     repetition_penalty=1.2,
 )
+# Prediction function using the model directly
 def predict(
     message,
     temperature,
     max_new_tokens,
     top_k,
     repetition_penalty,
     top_p,
 ):
+    """Generate text for ``message`` with the module-level ``pipe``.
+
+    The sampling arguments are forwarded to the pipeline call.
+
+    Returns the ``generated_text`` field of the first pipeline result, or
+    the string "An error occurred." if the pipeline call raises.
+    """
     try:
+        result = pipe(
+            message,
+            temperature=temperature,
+            # Limit only the newly generated tokens; ``max_length`` would
+            # also count the prompt tokens against the budget.
+            # int(): the UI sliders may deliver floats -- TODO confirm.
+            max_new_tokens=int(max_new_tokens),
+            top_k=int(top_k),
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+        )
+        return result[0]['generated_text']
     except Exception as e:
         logger.exception(f"Error during prediction: {e}")
         return "An error occurred."
     fn=predict,
     inputs=[
         gr.Textbox(label="User input"),
         gr.Slider(0, 1, 0.7, label="Temperature"),
         gr.Slider(128, 2048, 1024, label="Max new tokens"),
         gr.Slider(1, 80, 40, label="Top K sampling"),
 interface.launch()
+logger.debug("Chat interface initialized and launched")