BHO committed on
Commit e556db9
·
1 Parent(s): decbdf3

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +416 -0
app.py ADDED
@@ -0,0 +1,416 @@
+ import gradio as gr
+ from haystack.document_stores import FAISSDocumentStore
+ from haystack.nodes import EmbeddingRetriever
+ import openai
+ import pandas as pd
+ import os
+ from utils import (
+     make_pairs,
+     set_openai_api_key,
+     create_user_id,
+     to_completion,
+ )
+
+ from datetime import datetime
+
+ # from azure.storage.fileshare import ShareServiceClient
+
+ # Load variables from a local .env file when python-dotenv is available.
+ try:
+     from dotenv import load_dotenv
+
+     load_dotenv()
+ except ImportError:
+     pass
+
+ theme = gr.themes.Soft(
+     primary_hue="sky",
+     font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
+ )
+
+ init_prompt = "TKOQA, an AI Assistant for Tikehau. "
+ sources_prompt = (
+     "When relevant, use facts and numbers from the following documents in your answer. "
+ )
+
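+
+ # Few-shot prompt: rewrites the raw user message into a short standalone
+ # English question so the embedding retriever receives a clean search query.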
+ def get_reformulation_prompt(query: str) -> str:
+     return f"""Reformulate the following user message to be a short standalone question in English, in the context of the Universal Registration Document of Tikehau.
+ ---
+ query: what is the AUM of Tikehau in 2022?
+ standalone question: What is the AUM of Tikehau in 2022?
+ language: English
+ ---
+ query: what is T2?
+ standalone question: what is the transition energy fund at Tikehau?
+ language: English
+ ---
+ query: what is the business of Tikehau?
+ standalone question: What are the main business units of Tikehau?
+ language: English
+ ---
+ query: {query}
+ standalone question:"""
+
+
+ system_template = {
+     "role": "system",
+     "content": init_prompt,
+ }
+
+ # openai.api_type = "azure"
+ # The API key must come from the environment (a .env file locally, or a
+ # HuggingFace Space secret); it must never be hardcoded in the source.
+ openai.api_key = os.environ["OPENAI_API_KEY"]
+
+ # BHO
+ # openai.api_base = os.environ["ressource_endpoint"]
+ # openai.api_version = "2022-12-01"
+
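+ # Load the prebuilt FAISS index of Universal Registration Document passages;
+ # the .faiss file holds the vectors and the .json file the store configuration.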
+ ds = FAISSDocumentStore.load(index_path="./tko_urd.faiss", config_path="./tko_urd.json")
+
+ retriever = EmbeddingRetriever(
+     document_store=ds,
+     embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+     model_format="sentence_transformers",
+     progress_bar=False,
+ )
+
+ # retrieve_giec = EmbeddingRetriever(
+ #     document_store=FAISSDocumentStore.load(
+ #         index_path="./documents/climate_gpt_v2_only_giec.faiss",
+ #         config_path="./documents/climate_gpt_v2_only_giec.json",
+ #     ),
+ #     embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+ #     model_format="sentence_transformers",
+ # )
+
+ # BHO
+ # For Azure connection in secrets in HuggingFace
+ # credential = {
+ #     "account_key": os.environ["account_key"],
+ #     "account_name": os.environ["account_name"],
+ # }
+
+ # BHO
+ # account_url = os.environ["account_url"]
+ # file_share_name = "climategpt"
+ # service = ShareServiceClient(account_url=account_url, credential=credential)
+ # share_client = service.get_share_client(file_share_name)
+ user_id = create_user_id(10)
+
+
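+ # Filter retrieved passages by source and keep the top k; the summary vs.
+ # full-report split is currently disabled (kept below as commented-out lines).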
+ def filter_sources(df, k_summary=3, k_total=10, source="ipcc"):
+     assert source in ["ipcc", "ipbes", "all"]
+
+     # Filter by source
+     if source == "ipcc":
+         df = df.loc[df["source"] == "IPCC"]
+     elif source == "ipbes":
+         df = df.loc[df["source"] == "IPBES"]
+     else:
+         pass
+
+     # Prepare summaries
+     df_summaries = df
+     # Separate summaries and full reports
+     # df_summaries = df.loc[df["report_type"].isin(["SPM", "TS"])]
+     # df_full = df.loc[~df["report_type"].isin(["SPM", "TS"])]
+
+     # Find passages from summaries dataset
+     passages_summaries = df_summaries.head(k_summary)
+
+     # Find passages from full reports dataset
+     # passages_fullreports = df_full.head(k_total - len(passages_summaries))
+
+     # Concatenate passages
+     # passages = pd.concat([passages_summaries, passages_fullreports], axis=0, ignore_index=True)
+     passages = passages_summaries
+     return passages
+
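+
+ # Retrieve up to max_k candidate passages, drop those below the similarity
+ # threshold, then narrow the rest down with filter_sources().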
+ def retrieve_with_summaries(
+     query, retriever, k_summary=3, k_total=10, source="ipcc", max_k=100, threshold=0.555, as_dict=True
+ ):
+     assert max_k > k_total
+     docs = retriever.retrieve(query, top_k=max_k)
+     docs = [{**x.meta, "score": x.score, "content": x.content} for x in docs if x.score > threshold]
+     if len(docs) == 0:
+         return []
+     res = pd.DataFrame(docs)
+     passages_df = filter_sources(res, k_summary, k_total, source)
+     if as_dict:
+         contents = passages_df["content"].tolist()
+         meta = passages_df.drop(columns=["content"]).to_dict(orient="records")
+         passages = [{"content": c, "meta": m} for c, m in zip(contents, meta)]
+         return passages
+     else:
+         return passages_df
+
+
+ def make_html_source(source, i):
+     meta = source["meta"]
+     return f"""
+ <div class="card">
+     <div class="card-content">
+         <h2>Doc {i} - {meta['file_name']} - Page {meta['page_number']}</h2>
+         <p>{source['content']}</p>
+     </div>
+ </div>
+ """
+
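+
+ # Main chat handler: reformulate the question, retrieve supporting passages,
+ # then stream a completion that is asked to answer from those passages.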
+ def chat(
+     user_id: str,
+     query: str,
+     history: list = [system_template],
+     report_type: str = "All available",
+     threshold: float = 0.555,
+ ) -> tuple:
+     """Retrieve relevant documents from the document store, then query the completion model.
+
+     Args:
+         query (str): user message.
+         history (list, optional): history of the conversation. Defaults to [system_template].
+         report_type (str, optional): should be "All available" or "IPCC only". Defaults to "All available".
+         threshold (float, optional): similarity threshold; don't increase beyond 0.568. Defaults to 0.555.
+
+     Yields:
+         tuple: chat in gradio format, chat in openai format, sources used.
+     """
+
+     if report_type not in ["IPCC", "IPBES"]:
+         report_type = "all"
+     print(f"Searching in {report_type} reports")
+
+     reformulated_query = openai.Completion.create(
+         engine="text-davinci-003",
+         prompt=get_reformulation_prompt(query),
+         temperature=0,
+         max_tokens=128,
+         stop=["\n---\n", "<|im_end|>"],
+     )
+
+     # The completion is expected to contain the standalone question on the
+     # first line and "language: <name>" on the second (see the few-shot prompt).
+     reformulated_query = reformulated_query["choices"][0]["text"]
+     reformulated_query, language = reformulated_query.split("\n")
+     language = language.split(":")[1].strip()
+
+     sources = retrieve_with_summaries(
+         reformulated_query,
+         retriever,
+         k_total=10,
+         k_summary=3,
+         as_dict=True,
+         source=report_type.lower(),
+         threshold=threshold,
+     )
+     response_retriever = {
+         "language": language,
+         "reformulated_query": reformulated_query,
+         "query": query,
+         "sources": sources,
+     }
+
+     # docs = [d for d in retriever.retrieve(query=reformulated_query, top_k=10) if d.score > threshold]
+     messages = history + [{"role": "user", "content": query}]
+
+     if len(sources) > 0:
+         docs_string = []
+         docs_html = []
+         for i, d in enumerate(sources, 1):
+             # docs_string.append(f"📃 Doc {i}: {d['meta']['short_name']} page {d['meta']['page_number']}\n{d['content']}")
+             docs_string.append(f"📃 Doc {i}: {d['meta']['file_name']} page {d['meta']['page_number']}\n{d['content']}")
+             docs_html.append(make_html_source(d, i))
+         docs_string = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
+         docs_html = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_html)
+         messages.append({"role": "system", "content": f"{sources_prompt}\n\n{docs_string}\n\nAnswer in {language}:"})
+
+         response = openai.Completion.create(
+             # engine="climateGPT",
+             engine="text-davinci-003",
+             prompt=to_completion(messages),
+             temperature=0,  # deterministic
+             stream=True,
+             max_tokens=1024,
+         )
+
+         complete_response = ""
+         # Drop the temporary system message holding the sources, then append a
+         # placeholder assistant message that the stream below fills in.
+         messages.pop()
+
+         messages.append({"role": "assistant", "content": complete_response})
+         timestamp = str(datetime.now().timestamp())
+         file = user_id[0] + timestamp + ".json"
+         logs = {
+             "user_id": user_id[0],
+             "prompt": query,
+             "retrieved": sources,
+             "report_type": report_type,
+             "prompt_eng": messages[0],
+             "answer": messages[-1]["content"],
+             "time": timestamp,
+         }
+         # log_on_azure(file, logs, share_client)
+         print(logs)
+
+         for chunk in response:
+             if (chunk_message := chunk["choices"][0].get("text")) and chunk_message != "<|im_end|>":
+                 complete_response += chunk_message
+                 messages[-1]["content"] = complete_response
+                 gradio_format = make_pairs([a["content"] for a in messages[1:]])
+                 yield gradio_format, messages, docs_html
+
+     else:
+         docs_string = "⚠️ No relevant passages found in the URDs"
+         complete_response = "**⚠️ No relevant passages found in the URDs**"
+         messages.append({"role": "assistant", "content": complete_response})
+         gradio_format = make_pairs([a["content"] for a in messages[1:]])
+         yield gradio_format, messages, docs_string
+
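+
+ # Log free-text user feedback (the Azure upload is disabled; logs go to stdout).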
+ def save_feedback(feed: str, user_id):
+     if len(feed) > 1:
+         timestamp = str(datetime.now().timestamp())
+         file = user_id[0] + timestamp + ".json"
+         logs = {
+             "user_id": user_id[0],
+             "feedback": feed,
+             "time": timestamp,
+         }
+         # log_on_azure(file, logs, share_client)
+         print(logs)
+         return "Feedback submitted, thank you!"
+
+
+ def reset_textbox():
+     return gr.update(value="")
+
+
+ # def log_on_azure(file, logs, share_client):
+ #     file_client = share_client.get_file_client(file)
+ #     file_client.upload_file(str(logs))
+
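+
+ # Gradio UI: a chatbot column on the left, retrieved sources on the right,
+ # followed by usage notes and a carbon-footprint section.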
+ with gr.Blocks(title="TKO URD Q&A", css="style.css", theme=theme) as demo:
+     user_id_state = gr.State([user_id])
+
+     # Gradio
+     gr.Markdown("<h1><center>Tikehau Capital Q&A</center></h1>")
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot(elem_id="chatbot", label="Tikehau Capital Q&A chatbot", show_label=False)
+             state = gr.State([system_template])
+
+             with gr.Row():
+                 ask = gr.Textbox(
+                     show_label=True,
+                     placeholder="Ask your Tikehau-related question here and press Enter",
+                 ).style(container=False)
+                 # ask_examples_hidden = gr.Textbox(elem_id="hidden-message")
+
+             # examples_questions = gr.Examples(
+             #     [
+             #         "What is the AUM of Tikehau in 2022?",
+             #     ],
+             #     [ask_examples_hidden],
+             #     examples_per_page=15,
+             # )
+
+         with gr.Column(scale=1, variant="panel"):
+             gr.Markdown("### Sources")
+             sources_textbox = gr.Markdown(show_label=False)
+
+     # dropdown_sources = gr.inputs.Dropdown(
+     #     ["IPCC", "IPBES", "ALL"],
+     #     default="ALL",
+     #     label="Select reports",
+     # )
+     dropdown_sources = gr.State(["All"])
+
+     ask.submit(
+         fn=chat,
+         inputs=[
+             user_id_state,
+             ask,
+             state,
+             dropdown_sources,
+         ],
+         outputs=[chatbot, state, sources_textbox],
+     )
+     ask.submit(reset_textbox, [], [ask])
+
+     # ask_examples_hidden.change(
+     #     fn=chat,
+     #     inputs=[
+     #         user_id_state,
+     #         ask_examples_hidden,
+     #         state,
+     #         dropdown_sources
+     #     ],
+     #     outputs=[chatbot, state, sources_textbox],
+     # )
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown(
+                 """
+     <div class="warning-box">
+     Version 0.1-beta - This tool is under active development
+     </div>
+     """
+             )
+
+         with gr.Column(scale=1):
+             gr.Markdown("*Source: Tikehau Universal Registration Documents*")
+
+     gr.Markdown("## How to use TKO URD Q&A")
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown(
+                 """
+     ### 💪 Getting started
+     - In the chatbot section, simply type your Tikehau-related question; answers are provided with references to the relevant URDs.
+     """
+             )
+         with gr.Column(scale=1):
+             gr.Markdown(
+                 """
+     ### ⚠️ Limitations
+     <div class="warning-box">
+     <ul>
+     <li>Please note that, like any AI, the model may occasionally generate an inaccurate or imprecise answer.</li>
+     </ul>
+     </div>
+     """
+             )
+
+     gr.Markdown("## 🙏 Feedback and feature requests")
+     gr.Markdown(
+         """
+     ### Beta test
+     - Feedback welcome.
+     """
+     )
+
+     gr.Markdown(
+         """
+     ## 🛢️ Carbon Footprint
+
+     Carbon emissions were measured during the development and inference process using CodeCarbon: [https://github.com/mlco2/codecarbon](https://github.com/mlco2/codecarbon)
+
+     | Phase | Description | Emissions | Source |
+     | --- | --- | --- | --- |
+     | Inference | API call to turbo-GPT | ~0.38 gCO2e / call | https://medium.com/@chrispointon/the-carbon-footprint-of-chatgpt-e1bc14e4cc2a |
+
+     Carbon emissions are **relatively low but not negligible** compared to other usages: one question asked to ClimateQ&A is around 0.482 gCO2e, equivalent to 2.2 m by car (https://datagir.ademe.fr/apps/impact-co2/), or around 2 to 4 times more than a typical Google search.
+     """
+     )
+
+ # Queue requests so multiple generations can stream concurrently.
+ demo.queue(concurrency_count=16)
+
+ demo.launch()