Update app.py
Browse files
app.py
CHANGED
@@ -101,26 +101,53 @@ def generate_prompt(query: str, context_snippets: list) -> str:
|
|
101 |
)
|
102 |
return prompt
|
103 |
|
104 |
-
def get_llm_response(prompt: str, model_name: str = "meta-llama/Llama-2-7b-chat-hf", max_new_tokens: int = None) -> str:
|
105 |
-
|
106 |
-
|
107 |
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
if not os.path.exists("offload"):
|
111 |
-
os.makedirs("offload")
|
112 |
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
model = AutoModelForCausalLM.from_pretrained(
|
115 |
model_name,
|
116 |
device_map="auto",
|
117 |
-
offload_folder="offload", # Specify the folder where weights will be offloaded
|
118 |
use_safetensors=False,
|
119 |
-
|
120 |
-
torch_dtype=torch.float16,
|
121 |
-
token=HF_TOKEN
|
122 |
)
|
123 |
-
|
124 |
|
125 |
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
126 |
outputs = text_gen(prompt, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
|
@@ -134,6 +161,7 @@ def get_llm_response(prompt: str, model_name: str = "meta-llama/Llama-2-7b-chat-
|
|
134 |
|
135 |
return answer
|
136 |
|
|
|
137 |
############################################
|
138 |
# Gradio Interface Functions
|
139 |
############################################
|
|
|
101 |
)
|
102 |
return prompt
|
103 |
|
104 |
+
# def get_llm_response(prompt: str, model_name: str = "meta-llama/Llama-2-7b-chat-hf", max_new_tokens: int = None) -> str:
|
105 |
+
# if max_new_tokens is None:
|
106 |
+
# max_new_tokens = 1024 if is_detailed_query(prompt) else 256
|
107 |
|
108 |
+
# torch.cuda.empty_cache()
|
109 |
+
|
110 |
+
# if not os.path.exists("offload"):
|
111 |
+
# os.makedirs("offload")
|
112 |
+
|
113 |
+
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=HF_TOKEN)
|
114 |
+
# model = AutoModelForCausalLM.from_pretrained(
|
115 |
+
# model_name,
|
116 |
+
# device_map="auto",
|
117 |
+
# offload_folder="offload", # Specify the folder where weights will be offloaded
|
118 |
+
# use_safetensors=False,
|
119 |
+
# trust_remote_code=True,
|
120 |
+
# torch_dtype=torch.float16,
|
121 |
+
# token=HF_TOKEN
|
122 |
+
# )
|
123 |
|
|
|
|
|
124 |
|
125 |
+
# text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
126 |
+
# outputs = text_gen(prompt, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
|
127 |
+
# full_response = outputs[0]['generated_text']
|
128 |
+
|
129 |
+
# marker = "Answer:"
|
130 |
+
# if marker in full_response:
|
131 |
+
# answer = full_response.split(marker, 1)[1].strip()
|
132 |
+
# else:
|
133 |
+
# answer = full_response.strip()
|
134 |
+
|
135 |
+
# return answer
|
136 |
+
|
137 |
+
def get_llm_response(prompt: str, model_name: str = "EleutherAI/gpt-neo-125M", max_new_tokens: int = None) -> str:
|
138 |
+
if max_new_tokens is None:
|
139 |
+
max_new_tokens = 256 # Fallback generation budget used when the caller does not pass max_new_tokens.
|
140 |
+
|
141 |
+
torch.cuda.empty_cache()
|
142 |
+
|
143 |
+
# Load the tokenizer and model identified by model_name (GPT-Neo 125M by default).
|
144 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
145 |
model = AutoModelForCausalLM.from_pretrained(
|
146 |
model_name,
|
147 |
device_map="auto",
|
|
|
148 |
use_safetensors=False,
|
149 |
+
torch_dtype=torch.float32 # Full float32 precision: the default 125M-parameter model is small enough that float16 is not needed.
|
|
|
|
|
150 |
)
|
|
|
151 |
|
152 |
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
153 |
outputs = text_gen(prompt, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
|
|
|
161 |
|
162 |
return answer
|
163 |
|
164 |
+
|
165 |
############################################
|
166 |
# Gradio Interface Functions
|
167 |
############################################
|