Leri777 committed
Commit cfdd958 · verified · 1 Parent(s): 6a6e013

Update app.py

Files changed (1)
  1. app.py +24 -74
app.py CHANGED
@@ -1,15 +1,10 @@
-# Optimized Python script for ZeroGPU Environment with Qwen-2.5-Coder-7B-Instruct
 import os
 import logging
 from threading import Thread
 from logging.handlers import RotatingFileHandler
 import torch
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from langchain_huggingface import HuggingFacePipeline
-from langchain.prompts import PromptTemplate
-from langchain.chains import LLMChain
-from transformers import pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
 
 # Logging setup
 log_file = '/tmp/app_debug.log'
@@ -30,31 +25,25 @@ quantization_config = BitsAndBytesConfig(
     load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
 )
 
-# Load tokenizer and model with GPU availability check
-def load_model():
-    if torch.cuda.is_available():
-        logger.debug("GPU is available. Proceeding with GPU setup.")
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            device_map="auto",
-            quantization_config=quantization_config,
-            trust_remote_code=True,
-        )
-        device = torch.device('cuda')
-    else:
-        logger.warning("GPU is not available. Proceeding with CPU setup.")
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            device_map="auto",
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-        )
-        device = torch.device('cpu')
-    return model, tokenizer, device
-
-model, tokenizer, device = load_model()
+# Load tokenizer and model
+if torch.cuda.is_available():
+    logger.debug("GPU is available. Proceeding with GPU setup.")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        device_map="auto",
+        quantization_config=quantization_config,
+        trust_remote_code=True,
+    )
+else:
+    logger.warning("GPU is not available. Proceeding with CPU setup.")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        device_map="auto",
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+    )
 
 # Create Hugging Face pipeline
 pipe = pipeline(
@@ -68,56 +57,18 @@ pipe = pipeline(
     repetition_penalty=1.2,
 )
 
-# Initialize HuggingFacePipeline model for LangChain
-chat_model = HuggingFacePipeline(pipeline=pipe)
-
-logger.debug("Model and tokenizer loaded successfully")
-
-# Define the conversation template for LangChain
-template = """<|im_start|>system
-{system_prompt}
-<|im_end|>
-{history}
-<|im_start|>user
-{human_input}
-<|im_end|>
-<|im_start|>assistant"""
-
-# Create LangChain prompt and chain
-prompt = PromptTemplate(
-    template=template, input_variables=["system_prompt", "history", "human_input"]
-)
-chain = LLMChain(llm=chat_model, prompt=prompt)
-
-# Format the conversation history
-def format_history(history):
-    formatted = ""
-    for human, ai in history:
-        formatted += f"<|im_start|>user\n{human}\n<|im_end|>\n<|im_start|>assistant\n{ai}\n<|im_end|>\n"
-    return formatted
-
-# Prediction function using LangChain and model
+# Prediction function using the model directly
 def predict(
     message,
-    history,
-    system_prompt,
     temperature,
     max_new_tokens,
     top_k,
    repetition_penalty,
     top_p,
 ):
-    formatted_history = format_history(history)
-
     try:
-        result = chain.run(
-            {
-                "system_prompt": system_prompt,
-                "history": formatted_history,
-                "human_input": message,
-            }
-        )
-        return result
+        result = pipe(message, temperature=temperature, max_length=max_new_tokens, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty)
+        return result[0]['generated_text']
     except Exception as e:
         logger.exception(f"Error during prediction: {e}")
         return "An error occurred."
@@ -127,7 +78,6 @@ interface = gr.Interface(
     fn=predict,
     inputs=[
         gr.Textbox(label="User input"),
-        gr.Textbox("You are a helpful coding assistant", label="System prompt"),
         gr.Slider(0, 1, 0.7, label="Temperature"),
         gr.Slider(128, 2048, 1024, label="Max new tokens"),
         gr.Slider(1, 80, 40, label="Top K sampling"),
@@ -140,4 +90,4 @@ interface = gr.Interface(
 
 interface.launch()
 
-logger.debug("Chat interface initialized and launched")
+logger.debug("Chat interface initialized and launched")
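Note on the retained 4-bit setup: the diff keeps BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) and passes it only on the GPU path, since bitsandbytes quantization generally requires CUDA. For reference, a commonly used fuller variant, shown below as a sketch and not part of this commit, also enables NF4 weights and double quantization to reduce memory further; the parameter names are standard BitsAndBytesConfig options.

# Sketch only (not in this commit): a fuller 4-bit config than the one in app.py.
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,   # matmuls run in bfloat16
    bnb_4bit_quant_type="nf4",               # NormalFloat4 weight quantization
    bnb_4bit_use_double_quant=True,          # also quantize the quantization constants
)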
 
 
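The removed LangChain template and format_history wrapped the system prompt, history, and user turn in Qwen's ChatML markers (<|im_start|> ... <|im_end|>); the new predict sends the raw message straight to the pipeline, so the instruct model no longer sees that chat format, and the Gradio UI loses its history and system-prompt inputs. If the formatting is still wanted without LangChain, a minimal sketch (assuming the tokenizer and pipe objects from app.py; the default system text is only an illustrative placeholder) could use the tokenizer's built-in chat template:

# Sketch only (not in this commit): rebuild the ChatML prompt without LangChain.
def build_prompt(message, history=None, system_prompt="You are a helpful coding assistant"):
    messages = [{"role": "system", "content": system_prompt}]
    for human, ai in history or []:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": ai})
    messages.append({"role": "user", "content": message})
    # apply_chat_template emits the <|im_start|>/<|im_end|> markers the model was trained on
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

predict could then call pipe(build_prompt(message), ...) instead of pipe(message, ...).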
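Two details of the new pipe(...) call are easy to trip over: the Gradio slider is labeled "Max new tokens" but its value is passed as max_length, which in transformers caps the total sequence length (prompt plus completion), and result[0]['generated_text'] includes the echoed prompt by default. A hedged alternative for the body of predict, assuming the same pipe object and arguments:

    # Sketch only (not in this commit): cap generated tokens and return just the completion.
    try:
        result = pipe(
            message,
            temperature=temperature,
            max_new_tokens=max_new_tokens,   # counts only newly generated tokens
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            return_full_text=False,          # drop the echoed prompt from generated_text
        )
        return result[0]["generated_text"]
    except Exception as e:
        logger.exception(f"Error during prediction: {e}")
        return "An error occurred."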