justinj92 committed on
Commit 31350b4 · verified · 1 Parent(s): 0d3ab17

Update app.py

Files changed (1)
  1. app.py +206 -35
app.py CHANGED
@@ -9,6 +9,32 @@ import os
 from threading import Thread
 import spaces
 import time
+
+import langchain
+import os
+import glob
+import gc
+
+# loaders
+from langchain.document_loaders import PyPDFLoader, DirectoryLoader
+
+# splits
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# prompts
+from langchain import PromptTemplate
+
+# vector stores
+from langchain_community.vectorstores import FAISS
+
+# models
+from langchain.llms import HuggingFacePipeline
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+
+# retrievers
+from langchain.chains import RetrievalQA
+
+
 import subprocess
 
 subprocess.run(
@@ -17,8 +43,56 @@ subprocess.run(
     shell=True,
 )
 
-tok = AutoTokenizer.from_pretrained("justinj92/phi3-orpo")
-model = AutoModelForCausalLM.from_pretrained("justinj92/phi3-orpo", attn_implementation="flash_attention_2")
+
+class CFG:
+    DEBUG = False
+
+    ### LLM
+    model_name = 'justinj92/phi3-orpo'
+    temperature = 0.7
+    top_p = 0.90
+    repetition_penalty = 1.15
+    max_len = 8192
+    max_new_tokens = 512
+
+    ### splitting
+    split_chunk_size = 800
+    split_overlap = 400
+
+    ### embeddings
+    embeddings_model_repo = 'BAAI/bge-base-en-v1.5'
+
+    ### similar passages
+    k = 6
+
+    ### paths
+    PDFs_path = '/data'
+    Embeddings_path = '/embeddings/input'
+    Output_folder = '/ml-papers-vector'
+
+loader = DirectoryLoader(CFG.PDFs_path, glob="./*.pdf", loader_cls=PyPDFLoader,use_multithreading=True)
+
+documents = loader.load()
+
+text_splitter = RecursiveCharacterTextSplitter(chunk_size = CFG.split_chunk_size, chunk_overlap = CFG.split_overlap)
+
+if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
+    embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+    vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
+    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
+
+embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)
+
+
+def build_model(model_repo = CFG.model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_repo)
+    model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2")
+
+    return tokenizer, model
+
+
+tok, model = build_model(model_repo = CFG.model_name)
 
 terminators = [
     tok.eos_token_id,
@@ -28,6 +102,7 @@ terminators = [
     32000
 ]
 
+
 if torch.cuda.is_available():
     device = torch.device("cuda")
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
@@ -36,47 +111,143 @@ else:
     print("Using CPU")
 
 model = model.to(device)
-# Dispatch Errors
-
-
-@spaces.GPU(duration=60)
-def chat(message, history, temperature, do_sample, max_tokens):
-    chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
-    for item in history:
-        chat.append({"role": "user", "content": item[0]})
-        if item[1] is not None:
-            chat.append({"role": "assistant", "content": item[1]})
-    chat.append({"role": "user", "content": message})
-    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-    model_inputs = tok([messages], return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(
-        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
-    )
-    generate_kwargs = dict(
-        model_inputs,
-        streamer=streamer,
-        max_new_tokens=max_tokens,
-        do_sample=True,
-        temperature=temperature,
-        eos_token_id=terminators,
+
+pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
+
+llm = HuggingFacePipeline(pipeline = pipe)
+
+prompt_template = """
+<|system|>
+
+You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
+
+You are given some extracted parts from machine learning papers along with a question.
+
+If you don't know the answer, just say "I don't know." Don't try to make up an answer.
+
+It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
+
+Use only the following pieces of context to answer the question at the end.
+
+<|end|>
+
+<|user|>
+
+Context: {context}
+
+Question is below. Remember to answer in the same language:
+
+Question: {question}
+
+<|end|>
+
+<|assistant|>
+
+"""
+
+
+PROMPT = PromptTemplate(
+    template = prompt_template,
+    input_variables = ["context", "question"]
+)
+
+retriever = vectordb.as_retriever(
+    search_type = "similarity",
+    search_kwargs = {"k": CFG.k}
+)
+
+qa_chain = RetrievalQA.from_chain_type(
+    llm = llm,
+    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
+    retriever = retriever,
+    chain_type_kwargs = {"prompt": PROMPT},
+    return_source_documents = True,
+    verbose = False
+)
+
+@spaces.GPU(duration=120)
+def wrap_text_preserve_newlines(text, width=1500):
+    # Split the input text into lines based on newline characters
+    lines = text.split('\n')
+
+    # Wrap each line individually
+    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
+
+    # Join the wrapped lines back together using newline characters
+    wrapped_text = '\n'.join(wrapped_lines)
+
+    return wrapped_text
+
+@spaces.GPU(duration=120)
+def process_llm_response(llm_response):
+    ans = wrap_text_preserve_newlines(llm_response['result'])
+
+    sources_used = ' \n'.join(
+        [
+            source.metadata['source'].split('/')[-1][:-4]
+            + ' - page: '
+            + str(source.metadata['page'])
+            for source in llm_response['source_documents']
+        ]
     )
+
+    ans = ans + '\n\nSources: \n' + sources_used
+
+    ### return only the text after the pattern
+    pattern = "<|assistant|>"
+    index = ans.find(pattern)
+    if index != -1:
+        ans = ans[index + len(pattern):]
+
+    return ans.strip()
+
+@spaces.GPU(duration=120)
+def llm_ans(query):
+
+    llm_response = qa_chain.invoke(query)
+    ans = process_llm_response(llm_response)
+
+    return ans
+
+
+# @spaces.GPU(duration=60)
+# def chat(message, history, temperature, do_sample, max_tokens):
+#     chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
+#     for item in history:
+#         chat.append({"role": "user", "content": item[0]})
+#         if item[1] is not None:
#             chat.append({"role": "assistant", "content": item[1]})
+#     chat.append({"role": "user", "content": message})
+#     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+#     model_inputs = tok([messages], return_tensors="pt").to(device)
+#     streamer = TextIteratorStreamer(
+#         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+#     )
+#     generate_kwargs = dict(
+#         model_inputs,
+#         streamer=streamer,
+#         max_new_tokens=max_tokens,
+#         do_sample=True,
+#         temperature=temperature,
+#         eos_token_id=terminators,
+#     )
 
-    if temperature == 0:
-        generate_kwargs["do_sample"] = False
+#     if temperature == 0:
+#         generate_kwargs["do_sample"] = False
 
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
+#     t = Thread(target=model.generate, kwargs=generate_kwargs)
+#     t.start()
 
-    partial_text = ""
-    for new_text in streamer:
-        partial_text += new_text
-        yield partial_text
+#     partial_text = ""
+#     for new_text in streamer:
+#         partial_text += new_text
+#         yield partial_text
 
-    yield partial_text
+#     yield partial_text
 
 
 demo = gr.ChatInterface(
-    fn=chat,
+    fn=llm_ans,
     examples=[["Write me a poem about Machine Learning."]],
     # multimodal=False,
     additional_inputs_accordion=gr.Accordion(
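
Two loose ends in the added code are not resolved in the hunks shown above: the indexing branch calls FAISS.from_documents(documents=texts, ...) but `texts` is never assigned (only `documents` and `text_splitter` are created), and gr.ChatInterface calls its fn with the message plus the chat history (and any additional inputs), while llm_ans(query) accepts a single argument. The lines below are a minimal sketch of how those gaps could be closed; they reuse names from the diff (text_splitter, documents, llm_ans), and the adapter chat_fn is hypothetical, not part of this commit.

    # Sketch only: assumes the objects defined in the diff above are in scope.

    # 1) Chunk the loaded PDFs so that `texts` exists before FAISS.from_documents(...)
    texts = text_splitter.split_documents(documents)

    # 2) gr.ChatInterface invokes fn(message, history, ...); a thin, hypothetical
    #    wrapper would adapt that signature to llm_ans(query)
    def chat_fn(message, history):
        return llm_ans(message)

    # demo = gr.ChatInterface(fn=chat_fn, ...)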