"""Streamlit app that answers questions about a quarterly financial PDF.

Questions mentioning one of ``STRUCTURED_KEYWORDS`` are answered from the
SQLite ``metric_table``; every other question is answered with the single
most similar text chunk from a FAISS index built over the PDF text.
"""
import os
import sqlite3
from contextlib import closing

import fitz
import pdfplumber  # not used by the code in this file; kept for other users
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

PDF_PATH = "Q1FY24.pdf"
DB_PATH = "metric_table.db"
INDEX_DIR = "faiss_index"
EMBEDDING_MODEL = "text-embedding-ada-002"
# A query containing any of these words is routed to the SQLite metric table.
STRUCTURED_KEYWORDS = ("trend", "margin", "revenue")


@st.cache_resource
def init_system():
    """Build the data stores once and return them for reuse.

    Returns:
        A ``(vector_store, conn)`` tuple: the FAISS store built from
        ``PDF_PATH`` and an open SQLite connection to ``DB_PATH``.
    """
    # Use the store process_pdf just built instead of re-loading the copy it
    # saved to disk: same data, no pickle round-trip.
    # NOTE(review): recent langchain_community releases are understood to
    # require an explicit allow_dangerous_deserialization flag on
    # FAISS.load_local -- confirm against the pinned version before
    # re-introducing a load-from-disk path.
    vector_store = process_pdf(PDF_PATH)

    # st.cache_resource hands this one connection to every session and rerun,
    # while sqlite3 by default rejects use from any thread other than the one
    # that created it. The app only reads through this connection.
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    return vector_store, conn


def process_pdf(pdf_path):
    """Populate the metric table and build the FAISS index for *pdf_path*.

    Args:
        pdf_path: Path of the PDF whose text is chunked and embedded.

    Returns:
        The FAISS vector store built from the PDF text. The same index is
        also written to ``INDEX_DIR``.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    # Structured data extraction.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS metric_table "
            "(metric TEXT, quarter TEXT, value REAL)"
        )
        # Example metric insertion (add full extraction logic). Guarded with
        # NOT EXISTS so restarting the app does not pile up duplicate rows.
        conn.execute(
            "INSERT INTO metric_table (metric, quarter, value) "
            "SELECT ?, ?, ? WHERE NOT EXISTS ("
            "SELECT 1 FROM metric_table WHERE metric = ? AND quarter = ?)",
            ("Revenue", "Q1 FY24", 19.8, "Revenue", "Q1 FY24"),
        )
        conn.commit()

    # Unstructured data processing.
    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text() for page in doc)

    # Text chunking and embedding.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
    chunks = splitter.split_text(full_text)
    if not chunks:
        # Fail with a clear message rather than an obscure error from inside
        # the index builder when there is nothing to embed.
        raise ValueError(f"No extractable text found in {pdf_path!r}")

    embeddings = OpenAIEmbeddings(
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model=EMBEDDING_MODEL,
    )
    vector_store = FAISS.from_texts(chunks, embeddings)
    vector_store.save_local(INDEX_DIR)
    return vector_store


def _find_metrics(conn, query):
    """Return ``metric_table`` rows whose metric name matches *query*.

    A row matches when its metric name contains the query text or when the
    query text contains the metric name, so a full-sentence question such as
    "what was revenue last quarter" still finds the ``Revenue`` row.
    """
    # Bound parameters keep user input out of the SQL text.
    sql = (
        "SELECT * FROM metric_table "
        "WHERE metric LIKE '%' || ? || '%' "
        "OR ? LIKE '%' || metric || '%'"
    )
    with closing(conn.cursor()) as cursor:
        cursor.execute(sql, (query, query))
        return cursor.fetchall()


def main():
    """Render the Streamlit UI and answer the question the user typed."""
    st.title("Fundrev Financial Analyzer")

    # Initialize system (cached across reruns by st.cache_resource).
    vector_store, conn = init_system()

    query = st.text_input("Ask financial question:").strip()
    if not query:
        return

    lowered = query.lower()
    if any(keyword in lowered for keyword in STRUCTURED_KEYWORDS):
        # Structured data queries.
        results = _find_metrics(conn, query)
        if results:
            st.table(results)
        else:
            # st.table expects tabular data, so the message is written as text.
            st.write("No matching metrics found")
    else:
        # Unstructured data queries.
        docs = vector_store.similarity_search(query, k=1)
        st.write(docs[0].page_content if docs else "No relevant information found")


if __name__ == "__main__":
    main()