Spaces:

BHO
/

TKOGPTv1.0

Sleeping

File size: 13,132 Bytes

import gradio as gr
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
import openai
import pandas as pd
import os
from utils import (
    make_pairs,
    set_openai_api_key,
    create_user_id,
    to_completion,
)

from datetime import datetime

# from azure.storage.fileshare import ShareServiceClient

try:
    from dotenv import load_dotenv

    load_dotenv()
except:
    pass

theme = gr.themes.Soft(
    primary_hue="sky",
    font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
)

init_prompt = (
    "TKOQA, an AI Assistant for Tikehau. "

)
sources_prompt = (
    "When relevant, use facts and numbers from the following documents in your answer. "
)


def get_reformulation_prompt(query: str) -> str:
    return f"""Reformulate the following user message to be a short standalone question in English, in the context of the Universal Registration Document of Tikehau .
---
query: what is the AUM of Tikehau in 2022?
standalone question: What is the AUM of TIkehau in 2022?
language: English
---
query: what is T2?
standalone question: what is the transition energy fund at Tikehau?
language: English
---
query: what is the business of Tikehau?
standalone question: What are the main business units of Tikehau?
language: English
---
query: {query}
standalone question:"""



system_template = {
    "role": "system",
    "content": init_prompt,
}

openai.api_key = os.environ["OPENAI_API_KEY"]

# BHO
# openai.api_base = os.environ["ressource_endpoint"]
# openai.api_version = "2022-12-01"

document_store = FAISSDocumentStore()

ds = FAISSDocumentStore.load(index_path="./tko_urd.faiss", config_path="./tko_urd.json",)

retriever = EmbeddingRetriever(
    document_store=ds,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
    progress_bar=False,
)

# retrieve_giec = EmbeddingRetriever(
#     document_store=FAISSDocumentStore.load(
#         index_path="./documents/climate_gpt_v2_only_giec.faiss",
#         config_path="./documents/climate_gpt_v2_only_giec.json",
#     ),
#     embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
#     model_format="sentence_transformers",
# )

# BHO
# For Azure connection in secrets in HuggingFace
# credential = {
#     "account_key": os.environ["account_key"],
#     "account_name": os.environ["account_name"],
# }

# BHO
# account_url = os.environ["account_url"]
# file_share_name = "climategpt"
# service = ShareServiceClient(account_url=account_url, credential=credential)
# share_client = service.get_share_client(file_share_name)
user_id = create_user_id(10)


def filter_sources(df, k_summary=3, k_total=10, source="ipcc"):
    assert source in ["ipcc", "ipbes", "all"]

    # Filter by source
    if source == "ipcc":
        df = df.loc[df["source"] == "IPCC"]
    elif source == "ipbes":
        df = df.loc[df["source"] == "IPBES"]
    else:
        pass

    # Prepare summaries
    df_summaries = df #.loc[df.loc.obj.values]
    # Separate summaries and full reports
    #df_summaries = df.loc[df["report_type"].isin(["SPM", "TS"])]
    #df_full = df.loc[~df["report_type"].isin(["SPM", "TS"])]

    # Find passages from summaries dataset
    passages_summaries = df_summaries.head(k_summary)

    # Find passages from full reports dataset
    # passages_fullreports = df_full.head(k_total - len(passages_summaries))

    # Concatenate passages
    #passages = pd.concat([passages_summaries, passages_fullreports], axis=0, ignore_index=True)
    passages = passages_summaries
    return passages


def retrieve_with_summaries(query, retriever, k_summary=3, k_total=10, source="ipcc", max_k=100, threshold=0.555,
                            as_dict=True):
    assert max_k > k_total
    docs = retriever.retrieve(query, top_k=max_k)
    docs = [{**x.meta, "score": x.score, "content": x.content} for x in docs if x.score > threshold]
    if len(docs) == 0:
        return []
    res = pd.DataFrame(docs)
    passages_df = filter_sources(res, k_summary, k_total, source)
    if as_dict:
        contents = passages_df["content"].tolist()
        meta = passages_df.drop(columns=["content"]).to_dict(orient="records")
        passages = []
        for i in range(len(contents)):
            passages.append({"content": contents[i], "meta": meta[i]})
        return passages
    else:
        return passages_df


def make_html_source(source, i):
    meta = source['meta']
    return f"""
<div class="card">
    <div class="card-content">
        <h2>Doc {i} - {meta['file_name']} - Page {meta['page_number']}</h2>
        <p>{source['content']}</p>
    </div>
    
</div>
"""


def chat(
        user_id: str,
        query: str,
        history: list = [system_template],
        report_type: str = "All available",
        threshold: float = 0.555,
) -> tuple:
    """retrieve relevant documents in the document store then query gpt-turbo

    Args:
        query (str): user message.
        history (list, optional): history of the conversation. Defaults to [system_template].
        report_type (str, optional): should be "All available" or "IPCC only". Defaults to "All available".
        threshold (float, optional): similarity threshold, don't increase more than 0.568. Defaults to 0.56.

    Yields:
        tuple: chat gradio format, chat openai format, sources used.
    """

    if report_type not in ["IPCC", "IPBES"]: report_type = "all"
    print("Searching in ", report_type, " reports")

    reformulated_query = openai.Completion.create(
        engine="text-davinci-003",
        prompt=get_reformulation_prompt(query),
        temperature=0,
        max_tokens=128,
        stop=["\n---\n", "<|im_end|>"],
    )

    reformulated_query = reformulated_query["choices"][0]["text"]
    reformulated_query, language = reformulated_query.split("\n")
    language = language.split(":")[1].strip()

    sources = retrieve_with_summaries(reformulated_query, retriever, k_total=10, k_summary=3, as_dict=True,
                                      source=report_type.lower(), threshold=threshold)
    response_retriever = {
        "language": language,
        "reformulated_query": reformulated_query,
        "query": query,
        "sources": sources,
    }

    # docs = [d for d in retriever.retrieve(query=reformulated_query, top_k=10) if d.score > threshold]
    messages = history + [{"role": "user", "content": query}]

    if len(sources) > 0:
        docs_string = []
        docs_html = []
        for i, d in enumerate(sources, 1):
            #docs_string.append(f"📃 Doc {i}: {d['meta']['short_name']} page {d['meta']['page_number']}\n{d['content']}")
            docs_string.append(f"📃 Doc {i}: {d['meta']['file_name']} page {d['meta']['page_number']}\n{d['content']}")
            docs_html.append(make_html_source(d, i))
        docs_string = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
        docs_html = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_html)
        messages.append({"role": "system", "content": f"{sources_prompt}\n\n{docs_string}\n\nAnswer in {language}:"})

        response = openai.Completion.create(
            # engine="climateGPT",
            engine="text-davinci-003",
            prompt=to_completion(messages),
            temperature=0,  # deterministic
            stream=True,
            max_tokens=1024,
        )

        complete_response = ""
        messages.pop()

        messages.append({"role": "assistant", "content": complete_response})
        timestamp = str(datetime.now().timestamp())
        file = user_id[0] + timestamp + ".json"
        logs = {
            "user_id": user_id[0],
            "prompt": query,
            "retrived": sources,
            "report_type": report_type,
            "prompt_eng": messages[0],
            "answer": messages[-1]["content"],
            "time": timestamp,
        }
        # log_on_azure(file, logs, share_client)
        print(logs)

        for chunk in response:
            if (chunk_message := chunk["choices"][0].get("text")) and chunk_message != "<|im_end|>":
                complete_response += chunk_message
                messages[-1]["content"] = complete_response
                gradio_format = make_pairs([a["content"] for a in messages[1:]])
                yield gradio_format, messages, docs_html

    else:
        docs_string = "⚠️ No relevant passages found in the URDs"
        complete_response = "**⚠️ No relevant passages found in the URDs **"
        messages.append({"role": "assistant", "content": complete_response})
        gradio_format = make_pairs([a["content"] for a in messages[1:]])
        yield gradio_format, messages, docs_string


def save_feedback(feed: str, user_id):
    if len(feed) > 1:
        timestamp = str(datetime.now().timestamp())
        file = user_id[0] + timestamp + ".json"
        logs = {
            "user_id": user_id[0],
            "feedback": feed,
            "time": timestamp,
        }
        # log_on_azure(file, logs, share_client)
        print(logs)
        return "Feedback submitted, thank you!"


def reset_textbox():
    return gr.update(value="")


# def log_on_azure(file, logs, share_client):
#     file_client = share_client.get_file_client(file)
#     file_client.upload_file(str(logs))


with gr.Blocks(title="TKO URD Q&A", css="style.css", theme=theme) as demo:
    user_id_state = gr.State([user_id])

    # Gradio
    gr.Markdown("<h1><center>Tikehau Capital Q&A </center></h1>")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(elem_id="chatbot", label=" Tikehau Capital Q&A chatbot", show_label=False)
            state = gr.State([system_template])

            with gr.Row():
                ask = gr.Textbox(
                    show_label=True,
                    placeholder="Ask here your Tikehau-related question and press enter",
                ).style(container=False)
                #ask_examples_hidden = gr.Textbox(elem_id="hidden-message")

            # examples_questions = gr.Examples(
            #     [
            #         "What is the AUM of Tikehau in 2022?",
            #     ],
            #     [ask_examples_hidden],
            #     examples_per_page=15,
            #)

        with gr.Column(scale=1, variant="panel"):
            gr.Markdown("### Sources")
            sources_textbox = gr.Markdown(show_label=False)

    # dropdown_sources = gr.inputs.Dropdown(
    #     ["IPCC", "IPBES", "ALL"],
    #     default="ALL",
    #     label="Select reports",
    # )
    dropdown_sources = gr.State(["All"])

    ask.submit(
        fn=chat,
        inputs=[
            user_id_state,
            ask,
            state,
            dropdown_sources

        ],
        outputs=[chatbot, state, sources_textbox],
    )
    ask.submit(reset_textbox, [], [ask])

    # ask_examples_hidden.change(
    #     fn=chat,
    #     inputs=[
    #         user_id_state,
    #         ask_examples_hidden,
    #         state,
    #         dropdown_sources
    #     ],
    #     outputs=[chatbot, state, sources_textbox],
    # )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
<div class="warning-box">
Version 0.1-beta - This tool is under active development
</div>

"""
            )

        with gr.Column(scale=1):
            gr.Markdown("*Source : Tikehau Universal Registration Documents *")

    gr.Markdown("## How to use TKO URD Q&A")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
    ### 💪 Getting started
    - In the chatbot section, simply type your Tikehau-related question, answers will be provided  with references to relevant URDs.
    """
            )
        with gr.Column(scale=1):
            gr.Markdown(
                """
    ### ⚠️ Limitations
    <div class="warning-box">
    <ul>
        <li>Please note that, like any AI, the model may occasionally generate an inaccurate or imprecise answer.</li>
    </div>
    """
            )

    gr.Markdown("## 🙏 Feedback and feature requests")
    gr.Markdown(
        """
    ### Beta test
    - Feedback welcome. Inspired from the Climate tool by Ekimetrics.
    """
    )

    gr.Markdown(
        """


## 🛢️ Carbon Footprint

Carbon emissions were measured during the development and inference process using CodeCarbon [https://github.com/mlco2/codecarbon](https://github.com/mlco2/codecarbon)

| Phase | Description | Emissions | Source |
| --- | --- | --- | --- |
| Inference | API call to turbo-GPT | ~0.38gCO2e / call | https://medium.com/@chrispointon/the-carbon-footprint-of-chatgpt-e1bc14e4cc2a |

Carbon Emissions are **relatively low but not negligible** compared to other usages: one question asked to TKO Q&A is around 0.482gCO2e - equivalent to 2.2m by car (https://datagir.ademe.fr/apps/impact-co2/)  
Or around 2 to 4 times more than a typical Google search. 

</b>.
 
"""
    )

    demo.queue(concurrency_count=16)

demo.launch()