justinj92 committed on
Commit 31350b4 · verified · 1 Parent(s): 0d3ab17

Update app.py

Files changed (1)
  1. app.py +206 -35
app.py CHANGED
@@ -9,6 +9,32 @@ import os
 from threading import Thread
 import spaces
 import time
+
+import langchain
+import os
+import glob
+import gc
+
+# loaders
+from langchain.document_loaders import PyPDFLoader, DirectoryLoader
+
+# splits
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# prompts
+from langchain import PromptTemplate
+
+# vector stores
+from langchain_community.vectorstores import FAISS
+
+# models
+from langchain.llms import HuggingFacePipeline
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+
+# retrievers
+from langchain.chains import RetrievalQA
+
+
 import subprocess
 
 subprocess.run(
@@ -17,8 +43,56 @@ subprocess.run(
     shell=True,
 )
 
-tok = AutoTokenizer.from_pretrained("justinj92/phi3-orpo")
-model = AutoModelForCausalLM.from_pretrained("justinj92/phi3-orpo", attn_implementation="flash_attention_2")
+
+class CFG:
+    DEBUG = False
+
+    ### LLM
+    model_name = 'justinj92/phi3-orpo'
+    temperature = 0.7
+    top_p = 0.90
+    repetition_penalty = 1.15
+    max_len = 8192
+    max_new_tokens = 512
+
+    ### splitting
+    split_chunk_size = 800
+    split_overlap = 400
+
+    ### embeddings
+    embeddings_model_repo = 'BAAI/bge-base-en-v1.5'
+
+    ### similar passages
+    k = 6
+
+    ### paths
+    PDFs_path = '/data'
+    Embeddings_path = '/embeddings/input'
+    Output_folder = '/ml-papers-vector'
+
+loader = DirectoryLoader(CFG.PDFs_path, glob="./*.pdf", loader_cls=PyPDFLoader,use_multithreading=True)
+
+documents = loader.load()
+
+text_splitter = RecursiveCharacterTextSplitter(chunk_size = CFG.split_chunk_size, chunk_overlap = CFG.split_overlap)
+
+if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
+    embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+    vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
+    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
+
+embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)
+
+
+def build_model(model_repo = CFG.model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_repo)
+    model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2")
+
+    return tokenizer, model
+
+
+tok, model = build_model(model_repo = CFG.model_name)
 
 terminators = [
     tok.eos_token_id,
@@ -28,6 +102,7 @@ terminators = [
     32000
 ]
 
+
 if torch.cuda.is_available():
     device = torch.device("cuda")
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
@@ -36,47 +111,143 @@ else:
     print("Using CPU")
 
 model = model.to(device)
-# Dispatch Errors
-
-
-@spaces.GPU(duration=60)
-def chat(message, history, temperature, do_sample, max_tokens):
-    chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
-    for item in history:
-        chat.append({"role": "user", "content": item[0]})
-        if item[1] is not None:
-            chat.append({"role": "assistant", "content": item[1]})
-    chat.append({"role": "user", "content": message})
-    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-    model_inputs = tok([messages], return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(
-        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
-    )
-    generate_kwargs = dict(
-        model_inputs,
-        streamer=streamer,
-        max_new_tokens=max_tokens,
-        do_sample=True,
-        temperature=temperature,
-        eos_token_id=terminators,
+
+pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
+
+llm = HuggingFacePipeline(pipeline = pipe)
+
+prompt_template = """
+<|system|>
+
+You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
+
+You are given some extracted parts from machine learning papers along with a question.
+
+If you don't know the answer, just say "I don't know." Don't try to make up an answer.
+
+It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
+
+Use only the following pieces of context to answer the question at the end.
+
+<|end|>
+
+<|user|>
+
+Context: {context}
+
+Question is below. Remember to answer in the same language:
+
+Question: {question}
+
+<|end|>
+
+<|assistant|>
+
+"""
+
+
+PROMPT = PromptTemplate(
+    template = prompt_template,
+    input_variables = ["context", "question"]
+)
+
+retriever = vectordb.as_retriever(
+    search_type = "similarity",
+    search_kwargs = {"k": CFG.k}
+)
+
+qa_chain = RetrievalQA.from_chain_type(
+    llm = llm,
+    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
+    retriever = retriever,
+    chain_type_kwargs = {"prompt": PROMPT},
+    return_source_documents = True,
+    verbose = False
+)
+
+@spaces.GPU(duration=120)
+def wrap_text_preserve_newlines(text, width=1500):
+    # Split the input text into lines based on newline characters
+    lines = text.split('\n')
+
+    # Wrap each line individually
+    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
+
+    # Join the wrapped lines back together using newline characters
+    wrapped_text = '\n'.join(wrapped_lines)
+
+    return wrapped_text
+
+@spaces.GPU(duration=120)
+def process_llm_response(llm_response):
+    ans = wrap_text_preserve_newlines(llm_response['result'])
+
+    sources_used = ' \n'.join(
+        [
+            source.metadata['source'].split('/')[-1][:-4]
+            + ' - page: '
+            + str(source.metadata['page'])
+            for source in llm_response['source_documents']
+        ]
     )
+
+    ans = ans + '\n\nSources: \n' + sources_used
+
+    ### return only the text after the pattern
+    pattern = "<|assistant|>"
+    index = ans.find(pattern)
+    if index != -1:
+        ans = ans[index + len(pattern):]
+
+    return ans.strip()
+
+@spaces.GPU(duration=120)
+def llm_ans(query):
+
+    llm_response = qa_chain.invoke(query)
+    ans = process_llm_response(llm_response)
+
+    return ans
+
+
+# @spaces.GPU(duration=60)
+# def chat(message, history, temperature, do_sample, max_tokens):
+#     chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
+#     for item in history:
+#         chat.append({"role": "user", "content": item[0]})
+#         if item[1] is not None:
#             chat.append({"role": "assistant", "content": item[1]})
+#     chat.append({"role": "user", "content": message})
+#     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+#     model_inputs = tok([messages], return_tensors="pt").to(device)
+#     streamer = TextIteratorStreamer(
+#         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+#     )
+#     generate_kwargs = dict(
+#         model_inputs,
+#         streamer=streamer,
+#         max_new_tokens=max_tokens,
+#         do_sample=True,
+#         temperature=temperature,
+#         eos_token_id=terminators,
+#     )
 
-    if temperature == 0:
-        generate_kwargs["do_sample"] = False
+#     if temperature == 0:
+#         generate_kwargs["do_sample"] = False
 
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
+#     t = Thread(target=model.generate, kwargs=generate_kwargs)
+#     t.start()
 
-    partial_text = ""
-    for new_text in streamer:
-        partial_text += new_text
-        yield partial_text
+#     partial_text = ""
+#     for new_text in streamer:
+#         partial_text += new_text
+#         yield partial_text
 
-    yield partial_text
+#     yield partial_text
 
 
 demo = gr.ChatInterface(
-    fn=chat,
+    fn=llm_ans,
     examples=[["Write me a poem about Machine Learning."]],
     # multimodal=False,
     additional_inputs_accordion=gr.Accordion(
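
Two loose ends in the added code are not resolved in the hunks shown above: the indexing branch calls FAISS.from_documents(documents=texts, ...) but `texts` is never assigned (only `documents` and `text_splitter` are created), and gr.ChatInterface calls its fn with the message plus the chat history (and any additional inputs), while llm_ans(query) accepts a single argument. The lines below are a minimal sketch of how those gaps could be closed; they reuse names from the diff (text_splitter, documents, llm_ans), and the adapter chat_fn is hypothetical, not part of this commit.

    # Sketch only: assumes the objects defined in the diff above are in scope.

    # 1) Chunk the loaded PDFs so that `texts` exists before FAISS.from_documents(...)
    texts = text_splitter.split_documents(documents)

    # 2) gr.ChatInterface invokes fn(message, history, ...); a thin, hypothetical
    #    wrapper would adapt that signature to llm_ans(query)
    def chat_fn(message, history):
        return llm_ans(message)

    # demo = gr.ChatInterface(fn=chat_fn, ...)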