import os import streamlit as st import pandas as pd import openai import torch import matplotlib.pyplot as plt from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline from dotenv import load_dotenv import anthropic import ast import re # Load environment variables load_dotenv() os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") os.environ["ANTHROPIC_API_KEY"] = os.getenv("ANTHROPIC_API_KEY") # UI Styling st.markdown( """ """, unsafe_allow_html=True ) st.title("Excel Q&A Chatbot 📊") # Model Selection model_choice = st.selectbox("Select LLM Model", ["OpenAI GPT-3.5", "Claude 3 Haiku", "Mistral-7B"]) # Load appropriate model based on selection if model_choice == "Mistral-7B": model_name = "mistralai/Mistral-7B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) def ask_mistral(query): inputs = tokenizer(query, return_tensors="pt").to("cuda") output = model.generate(**inputs) return tokenizer.decode(output[0]) elif model_choice == "Claude 3 Haiku": client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"]) def ask_claude(query): response = client.messages.create( model="claude-3-haiku", max_tokens=512, messages=[{"role": "user", "content": query}] ) return response.content[0]["text"] else: client = openai.OpenAI() def ask_gpt(query): response = client.chat.completions.create( model="gpt-3.5-turbo", messages=[{"role": "user", "content": query}] ) return response.choices[0].message.content # File Upload with validation uploaded_file = st.file_uploader("Upload a file", type=["csv", "xlsx", "xls", "json", "tsv"]) if uploaded_file is not None: file_extension = uploaded_file.name.split(".")[-1].lower() try: if file_extension == "csv": df = pd.read_csv(uploaded_file) elif file_extension in ["xlsx", "xls"]: df = pd.read_excel(uploaded_file, engine="openpyxl") elif file_extension == "json": df = pd.read_json(uploaded_file) elif file_extension == "tsv": df = pd.read_csv(uploaded_file, sep="\t") else: st.error("Unsupported file format. Please upload a CSV, Excel, JSON, or TSV file.") st.stop() st.write("### Preview of Data:") st.write(df.head()) # Extract metadata column_names = df.columns.tolist() data_types = df.dtypes.apply(lambda x: x.name).to_dict() missing_values = df.isnull().sum().to_dict() # Display metadata st.write("### Column Details:") st.write(pd.DataFrame({"Column": column_names, "Type": data_types.values(), "Missing Values": missing_values.values()})) except Exception as e: st.error(f"Error loading file: {str(e)}") st.stop() # User Query query = st.text_input("Ask a question about this data:") if st.button("Submit Query"): if query: # Interpret the query using selected LLM if model_choice == "Mistral-7B": parsed_query = ask_mistral(f"Convert this question into a structured data operation: {query}") elif model_choice == "Claude 3 Haiku": parsed_query = ask_claude(f"Convert this question into a structured data operation: {query}") else: parsed_query = ask_gpt(f"Convert this question into a structured data operation: {query}") # Validate and clean query parsed_query = re.sub(r"[^a-zA-Z0-9_()\[\]"'., ]", "", parsed_query.strip()) st.write(f"Parsed Query: `{parsed_query}`") # Predefined Safe Execution Methods SAFE_OPERATIONS = { "sum": lambda col: df[col].sum(), "mean": lambda col: df[col].mean(), "max": lambda col: df[col].max(), "groupby_sum": lambda col, group_by: df.groupby(group_by)[col].sum() } # Safe Execution try: exec_result = eval(parsed_query, {"df": df, "pd": pd, "SAFE_OPERATIONS": SAFE_OPERATIONS}) st.write("### Result:") st.write(exec_result if isinstance(exec_result, pd.DataFrame) else str(exec_result)) # If numerical data, show a visualization dynamically if isinstance(exec_result, pd.Series): fig, ax = plt.subplots() if exec_result.dtype in ["int64", "float64"]: exec_result.plot(kind="bar", ax=ax) elif exec_result.dtype == "object": exec_result.value_counts().plot(kind="bar", ax=ax) st.pyplot(fig) except SyntaxError as e: st.error(f"Syntax Error in parsed query: {str(e)}") except Exception as e: st.error(f"Error executing query: {str(e)}") # Memory for context retention if "query_history" not in st.session_state: st.session_state.query_history = [] st.session_state.query_history.append(query)