amiguel committed on
Commit
02319e6
·
verified ·
1 Parent(s): 99cc3fd

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -134
app.py DELETED
@@ -1,134 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import tempfile
4
- import os
5
- import json
6
- from pathlib import Path
7
-
8
- from langchain.schema import Document
9
- #from langchain.document_loaders import Document
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- from langchain.embeddings import HuggingFaceEmbeddings
12
- from langchain.vectorstores import FAISS
13
- from langchain.chains import RetrievalQAWithSourcesChain
14
- from langchain import HuggingFacePipeline
15
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
16
-
17
# Avatar image URLs passed to st.chat_message for user and assistant messages.
- USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
18
- BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
19
# JSON file (relative path) that the chat history is loaded from and saved to.
- CHAT_HISTORY_FILE = Path("chat_memory.json")
20
-
21
def load_chat_history():
    """Load the persisted chat history from ``CHAT_HISTORY_FILE``.

    Returns:
        The list of message dicts stored in the JSON file, or an empty list
        when the file is missing, unreadable as JSON, or does not contain a
        list.
    """
    try:
        with open(CHAT_HISTORY_FILE, "r") as f:
            history = json.load(f)
    except FileNotFoundError:
        # No history has been saved yet.
        return []
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors:
        # a truncated or corrupt history file must not crash app start-up.
        return []

    # The rest of the app appends to and iterates over the history, so any
    # other JSON payload (dict, string, null, ...) is treated as "no history".
    return history if isinstance(history, list) else []
26
-
27
def save_chat_history(history):
    """Persist the chat history to ``CHAT_HISTORY_FILE`` as JSON.

    Args:
        history: JSON-serializable chat history (a list of message dicts).

    Raises:
        TypeError: If ``history`` contains values that are not
            JSON-serializable. The existing file is left untouched in that
            case.
    """
    # Serialize before opening: mode "w" truncates the file immediately, so a
    # serialization error raised after opening would wipe the saved history.
    payload = json.dumps(history)
    with open(CHAT_HISTORY_FILE, "w") as f:
        f.write(payload)
30
-
31
def preprocess_excel(
    file_path: str,
    sheet_name: str = "Data Base",
    header_row: int = 4,
) -> pd.DataFrame:
    """Read one worksheet and return its data area as a clean DataFrame.

    The sheet is read without a header; the row at position ``header_row``
    supplies the column names, rows above it are discarded, and rows below it
    are the data. Fully empty rows and fully empty columns are dropped, the
    index is renumbered from 0, and column names are converted to ``str``.

    Args:
        file_path: Path of the Excel workbook to read.
        sheet_name: Worksheet to load.
        header_row: Zero-based position of the row holding the column names.

    Returns:
        The cleaned data as a new DataFrame.

    Raises:
        ValueError: If the sheet has no row at ``header_row``.
    """
    df_raw = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    if len(df_raw) <= header_row:
        raise ValueError(
            f"Sheet {sheet_name!r} has only {len(df_raw)} row(s); "
            f"expected the header at row {header_row + 1}."
        )

    df = df_raw.iloc[header_row + 1:].copy()
    df.columns = df_raw.iloc[header_row]

    # Chain non-inplace operations so nothing is ever modified through a
    # slice of another frame (which can trigger SettingWithCopy warnings).
    df = (
        df.dropna(how="all")
        .dropna(axis=1, how="all")
        .reset_index(drop=True)
    )
    df.columns = df.columns.astype(str)
    return df
41
-
42
def build_vectorstore_from_structured_records(df: pd.DataFrame):
    """Turn each data row into a sentence and index the sentences in FAISS.

    For every row, the "Item Class", "Job Done", "Backlog?" and
    "Days in Backlog" values are formatted into one sentence; rows where all
    four are blank are skipped. Each sentence becomes a ``Document`` whose
    ``source`` metadata is ``"Row <index + 1>"``.

    Args:
        df: Cleaned sheet data. It is not modified.

    Returns:
        A FAISS vector store built from the row sentences.

    Raises:
        ValueError: If no row yields a sentence, i.e. there is nothing to
            index.
    """
    # fillna returns a new frame; the original in-place call silently
    # modified the caller's DataFrame.
    filled = df.fillna("")

    records = []
    for i, row in filled.iterrows():
        item_class = str(row.get("Item Class", "")).strip()
        job_done = str(row.get("Job Done", "")).strip()
        backlog = str(row.get("Backlog?", "")).strip()
        days = str(row.get("Days in Backlog", "")).strip()
        if not any([item_class, job_done, backlog, days]):
            continue
        sentence = f"Item Class {item_class} has status {job_done}, is in {backlog} backlog, and has {days} days."
        records.append(Document(page_content=sentence, metadata={"source": f"Row {i+1}"}))

    if not records:
        # Fail here with an actionable message instead of deep inside the
        # vector store when it is handed an empty document list.
        raise ValueError(
            "No usable rows found: expected data in at least one of the columns "
            "'Item Class', 'Job Done', 'Backlog?' or 'Days in Backlog'."
        )

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    split_docs = splitter.split_documents(records)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )
    return FAISS.from_documents(split_docs, embeddings)
65
-
66
def create_qa_pipeline(vectorstore):
    """Build a retrieval QA-with-sources chain on top of ``vectorstore``.

    Loads the ``google/flan-t5-base`` tokenizer and seq2seq model, wraps them
    in a ``text2text-generation`` pipeline (``max_length=512``), and connects
    that LLM to the vector store's default retriever.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``.

    Returns:
        The ``RetrievalQAWithSourcesChain`` created from the LLM and retriever.
    """
    checkpoint = "google/flan-t5-base"

    # Tokenizer is listed first so it is loaded before the model weights.
    generator = pipeline(
        "text2text-generation",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
        max_length=512,
    )

    return RetrievalQAWithSourcesChain.from_llm(
        llm=HuggingFacePipeline(pipeline=generator),
        retriever=vectorstore.as_retriever(),
    )
75
-
76
# --- Streamlit page -------------------------------------------------------
# Streamlit re-executes this script from the top on every interaction, so
# anything expensive must be cached and every name used later must be bound
# on every path.
st.set_page_config(page_title="Excel-Aware RAG Chatbot", layout="wide")
st.title("📊 Excel-Aware RAG Chatbot (Structured QA)")


@st.cache_resource(show_spinner=False, max_entries=1)
def _build_qa_chain(file_bytes: bytes):
    """Index an uploaded workbook and return a ready-to-use QA chain.

    Cached on the raw file bytes: without this the language model, the
    embeddings and the FAISS index would be rebuilt on every rerun, i.e. on
    every chat message. ``max_entries=1`` keeps only the latest workbook's
    chain in memory.
    """
    # preprocess_excel takes a path, so spill the upload to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsm") as tmp_file:
        tmp_file.write(file_bytes)
        tmp_path = tmp_file.name
    try:
        df = preprocess_excel(tmp_path)
        vectorstore = build_vectorstore_from_structured_records(df)
        return create_qa_pipeline(vectorstore)
    finally:
        os.remove(tmp_path)


with st.sidebar:
    uploaded_file = st.file_uploader(
        "Upload your Excel file (.xlsx or .xlsm with 'Data Base' sheet)",
        type=["xlsx", "xlsm"],
    )
    if st.button("🗑️ Clear Chat History"):
        st.session_state.chat_history = []
        CHAT_HISTORY_FILE.unlink(missing_ok=True)
        st.rerun()

if "chat_history" not in st.session_state:
    st.session_state.chat_history = load_chat_history()

if uploaded_file is None:
    st.info("Upload a file on the left to get started.")
else:
    # Bind qa on every path so a failed upload cannot surface later as a
    # NameError when the user sends a prompt.
    qa = None
    with st.spinner("Processing and indexing your Excel sheet..."):
        try:
            # getvalue() returns the whole upload regardless of the current
            # stream position, unlike read().
            qa = _build_qa_chain(uploaded_file.getvalue())
        except Exception as e:
            st.error(f"❌ Error processing file: {e}")
        else:
            st.success("✅ File processed and chatbot ready! Ask your questions below.")

    for message in st.session_state.chat_history:
        avatar = USER_AVATAR if message["role"] == "user" else BOT_AVATAR
        st.chat_message(message["role"], avatar=avatar).markdown(message["content"])

    # No chain means nothing can answer, so keep the input disabled.
    user_prompt = st.chat_input(
        "Ask about item classes, backlog, or status...",
        disabled=qa is None,
    )

    if user_prompt:
        st.session_state.chat_history.append({"role": "user", "content": user_prompt})
        st.chat_message("user", avatar=USER_AVATAR).markdown(user_prompt)

        with st.chat_message("assistant", avatar=BOT_AVATAR):
            with st.spinner("Thinking..."):
                try:
                    response = qa.invoke({"question": user_prompt})
                    final_response = response["answer"]
                    sources = response.get("sources", "")

                    # Reveal the answer word by word with a cursor glyph.
                    placeholder = st.empty()
                    streamed = ""
                    for word in final_response.split():
                        streamed += word + " "
                        placeholder.markdown(streamed + "▌")
                    placeholder.markdown(f"**{final_response.strip()}**")

                    if sources:
                        # Sources are model output; render them as a plain
                        # caption rather than injecting them as raw HTML.
                        st.caption(f"📎 {sources}")

                    st.session_state.chat_history.append({"role": "assistant", "content": final_response})
                    save_chat_history(st.session_state.chat_history)
                except Exception as e:
                    st.error(f"❌ Error: {e}")