Spaces:

nolanzandi
/

virtual-data-analyst

Running

App Files Files Community

nolanzandi committed on Feb 26

Commit

ccbdd61

verified ·

1 Parent(s): 19b2962

Refactor functions and improve llm accuracy (#14)

Browse files

- refactor functions and improve llm accuracy (3d660e2779859c6a266e023b0f93f48de373bb3a)

Files changed (9) hide show

app.py +99 -2
functions/__init__.py +2 -2
functions/chart_functions.py +77 -26
functions/chat_functions.py +3 -111
pipelines/__init__.py +0 -3
pipelines/pipelines.py +0 -91
requirements.txt +1 -1
tools.py +12 -7
utils.py +3 -1

app.py CHANGED Viewed

@@ -1,5 +1,9 @@
-from functions import demo
 import os
 from getpass import getpass
 from dotenv import load_dotenv
@@ -9,5 +13,98 @@ load_dotenv()
 if "OPENAI_API_KEY" not in os.environ:
     os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
 ## Uncomment the line below to launch the chat app with UI
-demo.launch(debug=True, allowed_paths=["temp/"])

+from data_sources import process_data_upload
+from functions import example_question_generator, chatbot_with_fc
+from utils import TEMP_DIR, message_dict
+import gradio as gr
+import ast
 import os
 from getpass import getpass
 from dotenv import load_dotenv
 if "OPENAI_API_KEY" not in os.environ:
     os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
def delete_db(req: gr.Request):
    """Remove a session's temporary directory and reset its chat history.

    Registered as the ``demo.unload`` callback.

    Args:
        req: The Gradio request; only ``req.session_hash`` is read. It names
            the per-session directory under ``TEMP_DIR`` and the session's
            entry in ``message_dict``.
    """
    import shutil

    dir_path = TEMP_DIR / str(req.session_hash)
    try:
        # Attempt the removal directly instead of exists()-then-rmtree():
        # the directory can disappear between the check and the delete.
        shutil.rmtree(dir_path)
    except FileNotFoundError:
        # Nothing was written for this session (or it is already gone).
        pass
    # Reset the history whether or not a directory existed, so a stale
    # conversation never outlives the session.
    message_dict[req.session_hash] = None
def run_example(input):
    """Return ``input`` unchanged.

    Wired to the "Try Me" buttons with a hidden sample ``gr.File`` as input
    and the visible file component as output, so a click copies the sample
    file's value into the upload slot.
    """
    return input
def example_display(input):
    """Show the two sample-file buttons only while no file is selected.

    Args:
        input: Current value of the file component; ``None`` means no file.

    Returns:
        A two-element list of ``gr.update`` objects, one per sample button,
        both carrying the same ``visible`` flag.
    """
    # Identity check: ``None`` is a singleton, and ``==`` would defer to any
    # custom ``__eq__`` on the value.
    show_buttons = input is None
    return [gr.update(visible=show_buttons), gr.update(visible=show_buttons)]
css = ".file_marker .large{min-height:50px !important;} .example_btn{max-width:300px;}"

# Build the Gradio UI: header text, two "Try Me" sample buttons, a file
# upload slot, and a chat interface that is (re)rendered whenever the
# selected file changes.
with gr.Blocks(css=css, delete_cache=(3600,3600)) as demo:
    title = gr.HTML("<h1 style='text-align:center;'>Virtual Data Analyst</h1>")
    description = gr.HTML("""<p style='text-align:center;'>Upload a data file and chat with our virtual data analyst
                          to get insights on your data set. Currently accepts CSV, TSV, TXT, XLS, XLSX, XML, and JSON files.
                          Can now generate charts and graphs!
                          Try a sample file to get started!</p>
                          <p style='text-align:center;'>This tool is under active development. If you experience bugs with use,
                          open a discussion in the community tab and I will respond.</p>""")

    # Hidden file components hold the bundled samples; the buttons copy
    # their value into the visible upload slot via run_example.
    example_file_1 = gr.File(visible=False, value="samples/bank_marketing_campaign.csv")
    example_file_2 = gr.File(visible=False, value="samples/online_retail_data.csv")

    with gr.Row():
        example_btn_1 = gr.Button(value="Try Me: bank_marketing_campaign.csv", elem_classes="example_btn", size="md", variant="primary")
        example_btn_2 = gr.Button(value="Try Me: online_retail_data.csv", elem_classes="example_btn", size="md", variant="primary")

    file_output = gr.File(label="Data File (CSV, TSV, TXT, XLS, XLSX, XML, JSON)", show_label=True, elem_classes="file_marker", file_types=['.csv','.xlsx','.txt','.json','.ndjson','.xml','.xls','.tsv'])

    example_btn_1.click(fn=run_example, inputs=example_file_1, outputs=file_output)
    example_btn_2.click(fn=run_example, inputs=example_file_2, outputs=file_output)
    # Hide the sample buttons once a file is selected, show them again when cleared.
    file_output.change(fn=example_display, inputs=file_output, outputs=[example_btn_1, example_btn_2])

    def process_upload(upload_value, session_hash):
        """Forward a non-empty upload to ``process_data_upload`` for this session.

        Returns a pair of empty lists regardless of whether anything was
        processed.
        """
        if upload_value:
            process_data_upload(upload_value, session_hash)
        return [], []

    @gr.render(inputs=file_output)
    def data_options(filename, request: gr.Request):
        """Render the chat interface for the currently selected file.

        Re-runs whenever ``file_output`` changes. Clears the session's stored
        conversation, ingests the file, picks example questions (canned for
        the two bundled samples, generated otherwise) and builds the chat UI.
        Nothing is rendered when no file is selected.
        """
        print(filename)
        # Every (re)render starts the session with no stored conversation.
        message_dict[request.session_hash] = None
        if filename:
            process_upload(filename, request.session_hash)
            if "bank_marketing_campaign" in filename:
                example_questions = [
                    ["Describe the dataset"],
                    ["What levels of education have the highest and lowest average balance?"],
                    ["What job is most and least common for a yes response from the individuals, not counting 'unknown'?"],
                    ["Can you generate a bar chart of education vs. average balance?"],
                    ["Can you generate a table of levels of education versus average balance, percent married, percent with a loan, and percent in default?"],
                ]
            elif "online_retail_data" in filename:
                example_questions = [
                    ["Describe the dataset"],
                    ["What month had the highest revenue?"],
                    ["Is revenue higher in the morning or afternoon?"],
                    ["Can you generate a line graph of revenue per month?"],
                    ["Can you generate a table of revenue per month?"],
                ]
            else:
                try:
                    # The generator's reply is parsed as a Python literal;
                    # literal_eval cannot execute code.
                    generated_examples = ast.literal_eval(example_question_generator(request.session_hash))
                    if not isinstance(generated_examples, (list, tuple)):
                        # A bare string would otherwise be iterated character
                        # by character into one-letter "questions".
                        raise ValueError("expected a list of example questions")
                    example_questions = [["Describe the dataset"]]
                    example_questions.extend([example] for example in generated_examples)
                except Exception as e:
                    # Best effort: any failure falls back to generic questions.
                    # ``Exception`` (not a bare ``except``) lets
                    # KeyboardInterrupt/SystemExit propagate.
                    print(f"Example question generation failed: {e}")
                    example_questions = [
                        ["Describe the dataset"],
                        ["List the columns in the dataset"],
                        ["What could this data be used for?"],
                    ]
            # Hidden textbox passes the session hash to chatbot_with_fc as an
            # additional input.
            parameters = gr.Textbox(visible=False, value=request.session_hash)
            bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
            chat = gr.ChatInterface(
                fn=chatbot_with_fc,
                type='messages',
                chatbot=bot,
                title="Chat with your data file",
                concurrency_limit=None,
                examples=example_questions,
                additional_inputs=parameters,
            )

    # Clean up the session's temp files and history when the page unloads.
    demo.unload(delete_db)

## Launch the chat app with UI
demo.launch(debug=True, allowed_paths=["temp/"])

functions/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from .sqlite_functions import SQLiteQuery, sqlite_query_func
 from .chart_functions import chart_generation_func, table_generation_func
-from .chat_functions import demo
-__all__ = ["SQLiteQuery","sqlite_query_func","chart_generation_func","table_generation_func","demo"]

 from .sqlite_functions import SQLiteQuery, sqlite_query_func
 from .chart_functions import chart_generation_func, table_generation_func
+from .chat_functions import example_question_generator, chatbot_with_fc
+__all__ = ["SQLiteQuery","sqlite_query_func","chart_generation_func","table_generation_func","example_question_generator","chatbot_with_fc"]

functions/chart_functions.py CHANGED Viewed

@@ -1,45 +1,96 @@
 from typing import List
-from quickchart import QuickChart
 import pandas as pd
 from utils import TEMP_DIR
 import os
 from dotenv import load_dotenv
 load_dotenv()
 root_url = os.getenv("ROOT_URL")
-def chart_generation_func(queries: List[str], session_hash):
     print("CHART GENERATION")
-    query_dict = queries[0]
-    print(query_dict)
-    qc = QuickChart()
-    qc.width = 1000
-    qc.height = 500
-    # Config can be set as a string or as a nested dict
-    qc.config = query_dict
-    url_id = qc.get_short_url().rsplit('/', 1)[-1]
-    url_base = qc.get_url_base()
-    # You can get the chart URL...
-    interactive_url = url_base + '/chart-maker/view/' + url_id
-    edit_url = url_base + '/chart-maker/edit/' + url_id
-    iframe = '<div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + interactive_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n <p>Edit, share, and download this graph <a target="_blank" href="' + edit_url + '">here</a></p></div>'
-    return {"reply": iframe}
-def table_generation_func(data: List[str], session_hash):
-    dir_path = TEMP_DIR / str(session_hash)
     print("TABLE GENERATION")
     print(data)
-    df = pd.DataFrame(data)
-    csv_path = f'{dir_path}/data.csv'
-    df.to_csv(csv_path)
-    download_path = f'{root_url}/gradio_api/file/temp/{session_hash}/data.csv'
-    html_table = df.to_html() + f'<p>Download as a <a target="_blank" href="{download_path}">CSV file</a></p>'
-    print(html_table)
-    return {"reply": html_table}

 from typing import List
+from typing import Dict
+import plotly.io as pio
 import pandas as pd
 from utils import TEMP_DIR
 import os
+import ast
 from dotenv import load_dotenv
 load_dotenv()
 root_url = os.getenv("ROOT_URL")
def _normalize_traces(data):
    """Return ``data`` as a new list of Plotly trace objects.

    Accepts a single trace or a list of traces, where any trace may be a
    Python-literal string. The caller's list is never modified.
    """
    items = data if isinstance(data, list) else [data]
    traces = []
    for item in items:
        # literal_eval only accepts Python literals, so a string payload
        # cannot execute code.
        parsed = ast.literal_eval(item) if isinstance(item, str) else item
        if isinstance(parsed, list):
            # A string such as "[{...}, {...}]" holds several traces.
            traces.extend(parsed)
        else:
            traces.append(parsed)
    if not traces:
        raise ValueError("No chart data was provided")
    return traces


def _normalize_layout(layout):
    """Return ``layout`` as a Plotly layout object (``{}`` when absent)."""
    if isinstance(layout, list):
        # Only the first list element is used as the layout.
        layout = layout[0] if layout else None
    if layout is None:
        return {}
    if isinstance(layout, str):
        return ast.literal_eval(layout) if layout.strip() else {}
    return layout


def chart_generation_func(data: List[dict], session_hash: str, layout: Dict[str,str]=None):
    """Write a Plotly chart to the session directory and return an iframe for it.

    The arguments come from an LLM tool call, so their shape varies; both are
    normalised before the figure is built.

    Args:
        data: The ``data`` part of a Plotly figure: a trace or list of traces,
            each a dict or a Python-literal string of one.
        session_hash: Session identifier; the chart is written to
            ``TEMP_DIR/<session_hash>/chart.html``.
        layout: The ``layout`` part of the figure as a dict, a literal string,
            or a list whose first element is one of those. ``None`` (the
            default) means an empty layout.

    Returns:
        ``{"reply": html}`` where ``html`` is an iframe pointing at the
        written chart, or, on any failure, a message describing the error so
        the model can try again.
    """
    print("CHART GENERATION")
    print(data)
    print(layout)
    try:
        fig = {"data": _normalize_traces(data),
               "layout": _normalize_layout(layout)}

        dir_path = TEMP_DIR / str(session_hash)
        chart_path = f'{dir_path}/chart.html'
        pio.write_html(fig, chart_path, full_html=False)

        chart_url = f'{root_url}/gradio_api/file/temp/{session_hash}/chart.html'
        iframe = '<div style=overflow:auto;><iframe\n    scrolling="yes"\n    width="1000px"\n    height="500px"\n    src="' + chart_url + '"\n    frameborder="0"\n    allowfullscreen\n></iframe>\n</div>'
        return {"reply": iframe}
    except Exception as e:
        # Deliberately broad: the error text is returned as the tool reply
        # rather than raised.
        print("CHART ERROR")
        reply = f"""There was an error generating the Plotly Chart from {data} and {layout}
              The error is {e},
              You should probably try again.
              """
        return {"reply": reply}
def _table_frame_from_llm(data):
    """Build a DataFrame from the loosely structured ``data`` tool argument.

    Handles a column-oriented dict (optionally wrapped in a list and/or given
    as a Python-literal string) and a row-oriented list of flat record dicts.
    """
    def parse(item):
        # literal_eval only accepts Python literals, so a string payload
        # cannot execute code.
        return ast.literal_eval(item) if isinstance(item, str) else item

    rows = data if isinstance(data, list) else [data]
    if not rows:
        raise ValueError("No table data was provided")

    first = parse(rows[0])
    is_flat_record = (
        isinstance(first, dict)
        and bool(first)
        and not any(isinstance(v, (list, tuple, dict)) for v in first.values())
    )
    if is_flat_record:
        # A dict of scalars is one table row; ``from_dict`` would reject it
        # ("all scalar values"), so treat every list element as a record.
        return pd.DataFrame([first] + [parse(row) for row in rows[1:]])
    # Column-oriented payload: only the first element is used.
    return pd.DataFrame.from_dict(first)


def table_generation_func(data: List[dict], session_hash):
    """Render ``data`` as an HTML table and save it as a downloadable CSV.

    Args:
        data: Table content from an LLM tool call: a column-oriented dict, a
            list containing one, a Python-literal string of either, or a list
            of flat row dicts.
        session_hash: Session identifier; the CSV is written to
            ``TEMP_DIR/<session_hash>/data.csv``.

    Returns:
        ``{"reply": html}`` where ``html`` is the table followed by a CSV
        download link, or, on any failure, a message describing the error so
        the model can try again.
    """
    print("TABLE GENERATION")
    print(data)
    try:
        df = _table_frame_from_llm(data)
        print(df)

        dir_path = TEMP_DIR / str(session_hash)
        csv_path = f'{dir_path}/data.csv'
        df.to_csv(csv_path)

        download_path = f'{root_url}/gradio_api/file/temp/{session_hash}/data.csv'
        html_table = df.to_html() + f'<p>Download as a <a href="{download_path}">CSV file</a></p>'
        print(html_table)
        return {"reply": html_table}
    except Exception as e:
        # Deliberately broad: the error text is returned as the tool reply
        # rather than raised.
        print("TABLE ERROR")
        reply = f"""There was an error generating the Pandas DataFrame table from {data}
              The error is {e},
              You should probably try again.
              """
        return {"reply": reply}

functions/chat_functions.py CHANGED Viewed

@@ -1,24 +1,10 @@
-from data_sources import process_data_upload
-from utils import TEMP_DIR
-import gradio as gr
 from haystack.dataclasses import ChatMessage
 from haystack.components.generators.chat import OpenAIChatGenerator
-import os
-import ast
-from getpass import getpass
-from dotenv import load_dotenv
-load_dotenv()
-if "OPENAI_API_KEY" not in os.environ:
-    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
 chat_generator = OpenAIChatGenerator(model="gpt-4o")
 response = None
-message_dict = {}
 def example_question_generator(session_hash):
     import sqlite3
@@ -51,10 +37,9 @@ def example_question_generator(session_hash):
 def chatbot_with_fc(message, history, session_hash):
     from functions import sqlite_query_func, chart_generation_func, table_generation_func
-    from pipelines import rag_pipeline_func
     import tools
-    available_functions = {"sql_query_func": sqlite_query_func, "rag_pipeline_func": rag_pipeline_func, "chart_generation_func": chart_generation_func, "table_generation_func":table_generation_func }
     if message_dict[session_hash] != None:
         message_dict[session_hash].append(ChatMessage.from_user(message))
@@ -62,7 +47,7 @@ def chatbot_with_fc(message, history, session_hash):
         messages = [
             ChatMessage.from_system(
                 """You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source'.
-                You also have access to a chart API that uses chart.js dictionaries formatted as a string to generate charts and graphs.
                 You also have access to a function, called table_generation_func, that builds table formatted html and generates a link to download as CSV."""
             )
         ]
@@ -95,97 +80,4 @@ def chatbot_with_fc(message, history, session_hash):
             break
     return response["replies"][0].text
-def delete_db(req: gr.Request):
-    import shutil
-    dir_path = TEMP_DIR / str(req.session_hash)
-    if os.path.exists(dir_path):
-        shutil.rmtree(dir_path)
-        message_dict[req.session_hash] = None
-def run_example(input):
-    return input
-def example_display(input):
-    if input == None:
-        display = True
-    else:
-        display = False
-    return [gr.update(visible=display),gr.update(visible=display)]
-css= ".file_marker .large{min-height:50px !important;} .example_btn{max-width:300px;}"
-with gr.Blocks(css=css, delete_cache=(3600,3600)) as demo:
-    title = gr.HTML("<h1 style='text-align:center;'>Virtual Data Analyst</h1>")
-    description = gr.HTML("""<p style='text-align:center;'>Upload a data file and chat with our virtual data analyst
-                          to get insights on your data set. Currently accepts CSV, TSV, TXT, XLS, XLSX, XML, and JSON files.
-                          Can now generate charts and graphs!
-                          Try a sample file to get started!</p>
-                          <p style='text-align:center;'>This tool is under active development. If you experience bugs with use,
-                          open a discussion in the community tab and I will respond.</p>""")
-    example_file_1 = gr.File(visible=False, value="samples/bank_marketing_campaign.csv")
-    example_file_2 = gr.File(visible=False, value="samples/online_retail_data.csv")
-    with gr.Row():
-        example_btn_1 = gr.Button(value="Try Me: bank_marketing_campaign.csv", elem_classes="example_btn", size="md", variant="primary")
-        example_btn_2 = gr.Button(value="Try Me: online_retail_data.csv", elem_classes="example_btn", size="md", variant="primary")
-    file_output = gr.File(label="Data File (CSV, TSV, TXT, XLS, XLSX, XML, JSON)", show_label=True, elem_classes="file_marker", file_types=['.csv','.xlsx','.txt','.json','.ndjson','.xml','.xls','.tsv'])
-    example_btn_1.click(fn=run_example, inputs=example_file_1, outputs=file_output)
-    example_btn_2.click(fn=run_example, inputs=example_file_2, outputs=file_output)
-    file_output.change(fn=example_display, inputs=file_output, outputs=[example_btn_1, example_btn_2])
-    @gr.render(inputs=file_output)
-    def data_options(filename, request: gr.Request):
-        print(filename)
-        message_dict[request.session_hash] = None
-        if filename:
-            process_upload(filename, request.session_hash)
-            if "bank_marketing_campaign" in filename:
-                example_questions = [
-                                        ["Describe the dataset"],
-                                        ["What levels of education have the highest and lowest average balance?"],
-                                        ["What job is most and least common for a yes response from the individuals, not counting 'unknown'?"],
-                                        ["Can you generate a bar chart of education vs. average balance?"],
-                                        ["Can you generate a table of levels of education versus average balance, percent married, percent with a loan, and percent in default?"]
-                                    ]
-            elif "online_retail_data" in filename:
-                example_questions = [
-                                        ["Describe the dataset"],
-                                        ["What month had the highest revenue?"],
-                                        ["Is revenue higher in the morning or afternoon?"],
-                                        ["Can you generate a line graph of revenue per month?"],
-                                        ["Can you generate a table of revenue per month?"]
-                                    ]
-            else:
-                try:
-                    generated_examples = ast.literal_eval(example_question_generator(request.session_hash))
-                    example_questions = [
-                                            ["Describe the dataset"]
-                                        ]
-                    for example in generated_examples:
-                        example_questions.append([example])
-                except:
-                    example_questions = [
-                                        ["Describe the dataset"],
-                                        ["List the columns in the dataset"],
-                                        ["What could this data be used for?"],
-                                    ]
-            parameters = gr.Textbox(visible=False, value=request.session_hash)
-            bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
-            chat = gr.ChatInterface(
-                                fn=chatbot_with_fc,
-                                type='messages',
-                                chatbot=bot,
-                                title="Chat with your data file",
-                                concurrency_limit=None,
-                                examples=example_questions,
-                                additional_inputs=parameters
-                                )
-    def process_upload(upload_value, session_hash):
-        if upload_value:
-            process_data_upload(upload_value, session_hash)
-        return [], []
-    demo.unload(delete_db)

+from utils import TEMP_DIR, message_dict
 from haystack.dataclasses import ChatMessage
 from haystack.components.generators.chat import OpenAIChatGenerator
 chat_generator = OpenAIChatGenerator(model="gpt-4o")
 response = None
 def example_question_generator(session_hash):
     import sqlite3
 def chatbot_with_fc(message, history, session_hash):
     from functions import sqlite_query_func, chart_generation_func, table_generation_func
     import tools
+    available_functions = {"sql_query_func": sqlite_query_func, "chart_generation_func": chart_generation_func, "table_generation_func":table_generation_func }
     if message_dict[session_hash] != None:
         message_dict[session_hash].append(ChatMessage.from_user(message))
         messages = [
             ChatMessage.from_system(
                 """You are a helpful and knowledgeable agent who has access to an SQLite database which has a table called 'data_source'.
+                You also have access to a chart function that uses plotly dictionaries to generate charts and graphs.
                 You also have access to a function, called table_generation_func, that builds table formatted html and generates a link to download as CSV."""
             )
         ]
             break
     return response["replies"][0].text

pipelines/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from .pipelines import rag_pipeline_func
-__all__ = ["rag_pipeline_func"]

pipelines/pipelines.py DELETED Viewed

@@ -1,91 +0,0 @@
-from haystack import Pipeline
-from haystack.components.builders import PromptBuilder
-from haystack.components.generators.openai import OpenAIGenerator
-from haystack.components.routers import ConditionalRouter
-from functions import SQLiteQuery
-from typing import List
-import sqlite3
-import os
-from getpass import getpass
-from dotenv import load_dotenv
-load_dotenv()
-if "OPENAI_API_KEY" not in os.environ:
-    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
-from haystack.components.builders import PromptBuilder
-from haystack.components.generators import OpenAIGenerator
-llm = OpenAIGenerator(model="gpt-4o")
-def rag_pipeline_func(queries: str, session_hash):
-   sql_query = SQLiteQuery(f'data_source_{session_hash}.db')
-   connection = sqlite3.connect(f'data_source_{session_hash}.db')
-   cur=connection.execute('select * from data_source')
-   columns = [i[0] for i in cur.description]
-   cur.close()
-   #Rag Pipeline
-   prompt = PromptBuilder(template="""Please generate an SQL query. The query should answer the following Question: {{question}};
-               If the question cannot be answered given the provided table and columns, return 'no_answer'
-               The query is to be answered for the table is called 'data_source' with the following
-               Columns: {{columns}};
-               Answer:""")
-   routes = [
-      {
-         "condition": "{{'no_answer' not in replies[0]}}",
-         "output": "{{replies}}",
-         "output_name": "sql",
-         "output_type": List[str],
-      },
-      {
-         "condition": "{{'no_answer' in replies[0]}}",
-         "output": "{{question}}",
-         "output_name": "go_to_fallback",
-         "output_type": str,
-      },
-   ]
-   router = ConditionalRouter(routes)
-   fallback_prompt = PromptBuilder(template="""User entered a query that cannot be answered with the given table.
-                                             The query was: {{question}} and the table had columns: {{columns}}.
-                                             Let the user know why the question cannot be answered""")
-   fallback_llm = OpenAIGenerator(model="gpt-4")
-   conditional_sql_pipeline = Pipeline()
-   conditional_sql_pipeline.add_component("prompt", prompt)
-   conditional_sql_pipeline.add_component("llm", llm)
-   conditional_sql_pipeline.add_component("router", router)
-   conditional_sql_pipeline.add_component("fallback_prompt", fallback_prompt)
-   conditional_sql_pipeline.add_component("fallback_llm", fallback_llm)
-   conditional_sql_pipeline.add_component("sql_querier", sql_query)
-   conditional_sql_pipeline.connect("prompt", "llm")
-   conditional_sql_pipeline.connect("llm.replies", "router.replies")
-   conditional_sql_pipeline.connect("router.sql", "sql_querier.queries")
-   conditional_sql_pipeline.connect("router.go_to_fallback", "fallback_prompt.question")
-   conditional_sql_pipeline.connect("fallback_prompt", "fallback_llm")
-   print("RAG PIPELINE FUNCTION")
-   result = conditional_sql_pipeline.run({"prompt": {"question": queries,
-                                                  "columns": columns},
-                                       "router": {"question": queries},
-                                       "fallback_prompt": {"columns": columns}})
-   if 'sql_querier' in result:
-      reply = result['sql_querier']['results'][0]
-   elif 'fallback_llm' in result:
-      reply = result['fallback_llm']['replies'][0]
-   else:
-      reply = result["llm"]["replies"][0]
-   print("reply content")
-   print(reply.content)
-   return {"reply": reply.content}

requirements.txt CHANGED Viewed

@@ -2,5 +2,5 @@ haystack-ai
 python-dotenv
 gradio
 pandas
-quickchart.io
 openpyxl

 python-dotenv
 gradio
 pandas
+plotly
 openpyxl

tools.py CHANGED Viewed

@@ -29,7 +29,7 @@ def tools_call(session_hash):
                             }
                         }
                     },
-                    "required": ["question"],
                 },
             },
         },
@@ -44,17 +44,22 @@ def tools_call(session_hash):
                 "parameters": {
                     "type": "object",
                     "properties": {
-                        "queries": {
                             "type": "array",
-                            "description": """The data points to use in the chart generation. Infer this from the user's message.
-                            Send a chart.js dictionary with options that correspond to the users request. But also format this dictionary as a string as this will allow javascript to be interpreted by the API we are using.
-                            Send nothing else.""",
                             "items": {
                                 "type": "string",
                             }
                         }
                     },
-                    "required": ["question"],
                 },
             },
         },
@@ -82,7 +87,7 @@ def tools_call(session_hash):
                             }
                         }
                     },
-                    "required": ["question"],
                 },
             },
         }

                             }
                         }
                     },
+                    "required": ["queries"],
                 },
             },
         },
                 "parameters": {
                     "type": "object",
                     "properties": {
+                        "data": {
                             "type": "array",
+                            "description": """The list containing a dictionary that contains the 'data' portion of the plotly chart generation. Infer this from the user's message.""",
+                            "items": {
+                                "type": "string",
+                            }
+                        },
+                        "layout": {
+                            "type": "array",
+                            "description": """The dictionary that contains the 'layout' portion of the plotly chart generation""",
                             "items": {
                                 "type": "string",
                             }
                         }
                     },
+                    "required": ["data"],
                 },
             },
         },
                             }
                         }
                     },
+                    "required": ["data"],
                 },
             },
         }

utils.py CHANGED Viewed

@@ -2,4 +2,6 @@ from pathlib import Path
 current_dir = Path(__file__).parent
-TEMP_DIR = current_dir / 'temp'

 current_dir = Path(__file__).parent
+TEMP_DIR = current_dir / 'temp'
+message_dict = {}