Spaces:

Pixeltable
/

Multi-LLM-RAG-with-Groundtruth-Comparison

Running

App Files Files Community

PierreBrunelle committed on Oct 5, 2024

Commit

cb808f6

verified ·

1 Parent(s): 61df238

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -27

app.py CHANGED Viewed

@@ -1,15 +1,3 @@
-# -*- coding: utf-8 -*-
-"""LLM Comparison
-Automatically generated by Colab.
-Original file is located at
-    https://colab.research.google.com/drive/156SKaX3DY6jwOhcpwZVM5AiLscOAbNNJ
-"""
-# Commented out IPython magic to ensure Python compatibility.
-# %pip install -qU pixeltable gradio sentence-transformers tiktoken openai openpyxl
 import gradio as gr
 import pandas as pd
 import pixeltable as pxt
@@ -50,22 +38,29 @@ def create_prompt(top_k_list: list[dict], question: str) -> str:
     {question}'''
 def process_files(ground_truth_file, pdf_files):
-    # Process ground truth file
     if ground_truth_file.name.endswith('.csv'):
         queries_t = pxt.io.import_csv('rag_demo.queries', ground_truth_file.name)
     else:
         queries_t = pxt.io.import_excel('rag_demo.queries', ground_truth_file.name)
-    # Process PDF files
     documents_t = pxt.create_table(
         'rag_demo.documents',
         {'document': pxt.DocumentType()}
     )
     documents_t.insert({'document': file.name} for file in pdf_files if file.name.endswith('.pdf'))
-     # Create chunks view
     chunks_t = pxt.create_view(
         'rag_demo.chunks',
         documents_t,
@@ -76,10 +71,10 @@ def process_files(ground_truth_file, pdf_files):
         )
     )
-    # Add embedding index
     chunks_t.add_embedding_index('text', string_embed=e5_embed)
-    # Create top_k query
     @chunks_t.query
     def top_k(query_text: str):
       sim = chunks_t.text.similarity(query_text)
@@ -89,13 +84,13 @@ def process_files(ground_truth_file, pdf_files):
               .limit(5)
       )
-    # Add computed columns to queries_t
     queries_t['question_context'] = chunks_t.top_k(queries_t.Question)
     queries_t['prompt'] = create_prompt(
         queries_t.question_context, queries_t.Question
     )
-    # Prepare messages for OpenAI
     messages = [
         {
             'role': 'system',
@@ -109,17 +104,18 @@ def process_files(ground_truth_file, pdf_files):
      # Add OpenAI response column
     queries_t['response'] = openai.chat_completions(
-        model='gpt-4o-mini-2024-07-18', messages=messages
     )
-    queries_t['answer'] = queries_t.response.choices[0].message.content
     df_output = queries_t.select(queries_t.Question, queries_t.correct_answer, queries_t.answer).collect().to_pandas()
     try:
-    #Display content
-      return df_output
     except Exception as e:
         return f"An error occurred: {str(e)}", None
@@ -127,18 +123,21 @@ def process_files(ground_truth_file, pdf_files):
 with gr.Blocks() as demo:
     gr.Markdown("# RAG Demo App")
     with gr.Row():
         ground_truth_file = gr.File(label="Upload Ground Truth (CSV or XLSX)", file_count="single")
         pdf_files = gr.File(label="Upload PDF Documents", file_count="multiple")
-    process_button = gr.Button("Process Files")
     df_output = gr.DataFrame(label="Pixeltable Table")
     #question_input = gr.Textbox(label="Enter your question")
     #query_button = gr.Button("Query LLM")
-    process_button.click(process_files, inputs=[ground_truth_file, pdf_files], outputs=df_output)
     #query_button.click(query_llm, inputs=question_input, outputs=output_dataframe)
 if __name__ == "__main__":

 import gradio as gr
 import pandas as pd
 import pixeltable as pxt
     {question}'''
+# Gradio Application
 def process_files(ground_truth_file, pdf_files):
+    # Ensure a clean slate for the demo by removing and recreating the 'rag_demo' directory
+    pxt.drop_dir('rag_demo', force=True)
+    pxt.create_dir('rag_demo')
+    # Process the ground truth file, which contains questions and correct answers
+    # Import as CSV or Excel depending on the file extension
     if ground_truth_file.name.endswith('.csv'):
         queries_t = pxt.io.import_csv('rag_demo.queries', ground_truth_file.name)
     else:
         queries_t = pxt.io.import_excel('rag_demo.queries', ground_truth_file.name)
+    # Create a table to store the uploaded PDF documents
     documents_t = pxt.create_table(
         'rag_demo.documents',
         {'document': pxt.DocumentType()}
     )
+    # Insert the PDF files into the documents table
     documents_t.insert({'document': file.name} for file in pdf_files if file.name.endswith('.pdf'))
+    # Create a view that splits the documents into smaller chunks
     chunks_t = pxt.create_view(
         'rag_demo.chunks',
         documents_t,
         )
     )
+    # Add an embedding index to the chunks for similarity search
     chunks_t.add_embedding_index('text', string_embed=e5_embed)
+    # Define a query function to retrieve the top-k most similar chunks for a given question
     @chunks_t.query
     def top_k(query_text: str):
       sim = chunks_t.text.similarity(query_text)
               .limit(5)
       )
+    # Add computed columns to the queries table for context retrieval and prompt creation
     queries_t['question_context'] = chunks_t.top_k(queries_t.Question)
     queries_t['prompt'] = create_prompt(
         queries_t.question_context, queries_t.Question
     )
+    # Prepare messages for the OpenAI API, including system instructions and user prompt
     messages = [
         {
             'role': 'system',
      # Add OpenAI response column
     queries_t['response'] = openai.chat_completions(
+        model='gpt-4o-mini-2024-07-18', messages=messages
     )
+    # Extract the answer text from the API response
+    queries_t['answer'] = queries_t.response.choices[0].message.content.astype(pxt.StringType())
+    # Prepare the output dataframe with questions, correct answers, and model-generated answers
     df_output = queries_t.select(queries_t.Question, queries_t.correct_answer, queries_t.answer).collect().to_pandas()
     try:
+        # Return the output dataframe for display
+        return df_output
     except Exception as e:
         return f"An error occurred: {str(e)}", None
 with gr.Blocks() as demo:
     gr.Markdown("# RAG Demo App")
+    # File upload components for ground truth and PDF documents
     with gr.Row():
         ground_truth_file = gr.File(label="Upload Ground Truth (CSV or XLSX)", file_count="single")
         pdf_files = gr.File(label="Upload PDF Documents", file_count="multiple")
+    # Button to trigger file processing
+    process_button = gr.Button("Process Files and Generate Outputs")
+    # Output component to display the results
     df_output = gr.DataFrame(label="Pixeltable Table")
+    process_button.click(process_files, inputs=[ground_truth_file, pdf_files], outputs=df_output)
     #question_input = gr.Textbox(label="Enter your question")
     #query_button = gr.Button("Query LLM")
     #query_button.click(query_llm, inputs=question_input, outputs=output_dataframe)
 if __name__ == "__main__":