QWEN-2.5-Coder-7B

Sleeping

App Files Community

Leri777 committed on Oct 9, 2024

Commit

16013c5

verified ·

1 Parent(s): 3c19b27

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -51

app.py CHANGED Viewed

@@ -1,14 +1,15 @@
 import os
 import logging
 from logging.handlers import RotatingFileHandler
 import gradio as gr
 from transformers import AutoTokenizer, BitsAndBytesConfig
 from langchain_huggingface import ChatHuggingFace
 from langchain.prompts import PromptTemplate
 from langchain.chains import LLMChain
-# Настройка логирования
 log_file = '/tmp/app_debug.log'
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -18,67 +19,75 @@ logger.addHandler(file_handler)
 logger.debug("Application started")
 MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
-MODEL_NAME = MODEL_ID.split("/")[-1]
-template = """<|im_start|>system\n{system_prompt}\n<|im_end|>\n{history}<|im_start|>user\n{human_input}\n<|im_end|>\n<|im_start|>assistant\n"""
-prompt = PromptTemplate(template=template, input_variables=["system_prompt", "history", "human_input"])
-def format_history(history):
-    return "".join([f"<|im_start|>user\n{h[0]}\n<|im_end|>\n<|im_start|>assistant\n{h[1]}\n<|im_end|>\n" for h in history])
-def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
-    logger.debug(f"Received prediction request: message='{message}', system_prompt='{system_prompt}'")
-    chat_model.temperature = temperature
-    chat_model.max_new_tokens = max_new_tokens
-    chat_model.top_k = top_k
-    chat_model.repetition_penalty = repetition_penalty
-    chat_model.top_p = top_p
-    chain = LLMChain(llm=chat_model, prompt=prompt)
-    try:
-        formatted_history = format_history(history)
-        for chunk in chain.stream({"system_prompt": system_prompt, "history": formatted_history, "human_input": message}):
-            yield chunk["text"]
-        logger.debug(f"Prediction completed successfully for message: '{message}'")
-    except Exception as e:
-        logger.exception(f"Error during prediction: {str(e)}")
-        yield "An error occurred during processing."
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 chat_model = ChatHuggingFace(
     model_name=MODEL_ID,
-    tokenizer=tokenizer,
     model_kwargs={
         "device_map": "auto",
-        "quantization_config": BitsAndBytesConfig(load_in_4bit=True),
-    }
 )
 logger.debug("Model and tokenizer loaded successfully")
-gr.ChatInterface(
-    predict,
-    title=f"🤖 {MODEL_NAME}",
-    description=f"This is the {MODEL_NAME} model designed for coding assistance and general AI tasks.",
-    examples=[
-        ["Can you solve the equation 2x + 3 = 11 for x in Python?"],
-        ["Write a Java program that checks if a number is even or odd."],
-        ["How can I reverse a string in JavaScript?"],
-        ["Create a C++ function to find the factorial of a number."],
-        ["Write a Python list comprehension to generate a list of squares of numbers from 1 to 10."],
-    ],
-    additional_inputs=[
-        gr.Textbox("You are a code assistant.", label="System prompt"),
-        gr.Slider(0, 1, 0.3, label="Temperature"),
-        gr.Slider(128, 4096, 1024, label="Max new tokens"),
         gr.Slider(1, 80, 40, label="Top K sampling"),
         gr.Slider(0, 2, 1.1, label="Repetition penalty"),
-        gr.Slider(0, 1, 0.95, label="Top P sampling"),
     ],
-    theme=gr.themes.Soft(primary_hue="blue"),
-).queue().launch()
-logger.debug("Chat interface initialized and launched")

 import os
 import logging
+from threading import Thread
 from logging.handlers import RotatingFileHandler
+import torch
 import gradio as gr
 from transformers import AutoTokenizer, BitsAndBytesConfig
 from langchain_huggingface import ChatHuggingFace
 from langchain.prompts import PromptTemplate
 from langchain.chains import LLMChain
+# Logging setup
 log_file = '/tmp/app_debug.log'
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 logger.debug("Application started")
+# Define model parameters
 MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"
+CONTEXT_LENGTH = 16000
+# Configuration for 4-bit quantization
+quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+# Initialize HuggingFace Chat model with LangChain
 chat_model = ChatHuggingFace(
     model_name=MODEL_ID,
     model_kwargs={
         "device_map": "auto",
+        "quantization_config": quantization_config,
+        "attn_implementation": "flash_attention_2",
+    },
+    tokenizer=tokenizer
 )
 logger.debug("Model and tokenizer loaded successfully")
+# Define the conversation template for LangChain
+template = """<|im_start|>system
+{system_prompt}
+<|im_end|>
+{history}
+<|im_start|>user
+{human_input}
+<|im_end|>
+<|im_start|>assistant"""
+# Create LangChain prompt and chain
+prompt = PromptTemplate(template=template, input_variables=["system_prompt", "history", "human_input"])
+chain = LLMChain(llm=chat_model, prompt=prompt)
+# Format the conversation history
+def format_history(history):
+    formatted = ""
+    for human, ai in history:
+        formatted += f"<|im_start|>user\n{human}\n<|im_end|>\n<|im_start|>assistant\n{ai}\n<|im_end|>\n"
+    return formatted
+# Prediction function using LangChain and model
+def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+    formatted_history = format_history(history)
+    try:
+        result = chain.run({"system_prompt": system_prompt, "history": formatted_history, "human_input": message})
+        return result
+    except Exception as e:
+        logger.exception(f"Error during prediction: {e}")
+        return "An error occurred."
+# Gradio UI
+gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Textbox(label="User input"),
+        gr.State(),
+        gr.Textbox("You are a helpful coding assistant", label="System prompt"),
+        gr.Slider(0, 1, 0.7, label="Temperature"),
+        gr.Slider(128, 2048, 1024, label="Max new tokens"),
         gr.Slider(1, 80, 40, label="Top K sampling"),
         gr.Slider(0, 2, 1.1, label="Repetition penalty"),
+        gr.Slider(0, 1, 0.95, label="Top P sampling")
     ],
+    outputs="text",
+    title="Qwen2.5-Coder-7B-Instruct with LangChain",
+    live=True,
+).launch()