Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

import faiss
import fitz  # PyMuPDF for PDF text extraction
import numpy as np
import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
|
8 |
+
|
9 |
+
# Load your data from PDF
# NOTE: st.cache_data (not st.cache_resource) is the correct decorator here:
# the function returns serializable data (a DataFrame), while cache_resource
# is meant for global resources such as models or DB connections.
@st.cache_data
def load_data(file):
    """Extract all text from an uploaded PDF and wrap it in a DataFrame.

    Args:
        file: File-like object containing PDF bytes (e.g. a Streamlit
            UploadedFile).

    Returns:
        pd.DataFrame with a single row and a 'combined_text' column holding
        the concatenated text of every page.
    """
    with fitz.open(stream=file.read(), filetype="pdf") as pdf_file:
        # Join page texts in one pass instead of quadratic `+=` concatenation.
        text = "".join(page.get_text() for page in pdf_file)
    return pd.DataFrame({'combined_text': [text]})
|
17 |
+
|
18 |
+
# Initialize the embedding model and FAISS index
@st.cache_resource
def initialize_embeddings(data):
    """Build a SentenceTransformer encoder and a FAISS L2 index over the corpus.

    Args:
        data: DataFrame with a 'combined_text' column of document strings.

    Returns:
        Tuple of (embedder, index): the loaded model and a populated
        IndexFlatL2 containing one vector per row of `data`.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    corpus = data['combined_text'].tolist()
    vectors = np.array(model.encode(corpus, convert_to_tensor=False))
    # Index dimensionality is taken from the first embedding vector.
    faiss_index = faiss.IndexFlatL2(vectors[0].shape[0])
    faiss_index.add(vectors)
    return model, faiss_index
|
27 |
+
|
28 |
+
# OpenAI API setup
# SECURITY: never hard-code API keys in source — the previous revision leaked a
# live key (it must be revoked). Read the key from the environment instead;
# set OPENAI_API_KEY before launching the app.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
|
30 |
+
|
31 |
+
# Function to retrieve top-k similar documents from FAISS index
def retrieve(query, embedder, index, data, top_k=5):
    """Return the rows of `data` most similar to `query`.

    Args:
        query: Natural-language query string.
        embedder: SentenceTransformer used to embed the query.
        index: FAISS index built over the same corpus as `data`.
        data: DataFrame whose row order matches the index insertion order.
        top_k: Maximum number of rows to return.

    Returns:
        DataFrame slice with up to `top_k` matching rows.
    """
    query_embedding = embedder.encode([query], convert_to_tensor=False)
    distances, indices = index.search(np.array(query_embedding), top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors (here the corpus is a single document). iloc[-1] would wrap to
    # the last row and yield duplicate/garbage results — filter invalid ids.
    valid = [i for i in indices[0] if i >= 0]
    return data.iloc[valid]
|
36 |
+
|
37 |
+
# Function for RAG using OpenAI API
def rag_query(query, embedder, index, data, top_k=5):
    """Answer `query` with retrieval-augmented generation via the OpenAI API.

    Retrieves the top-k most similar documents, stuffs them into a prompt,
    and asks the chat-completions endpoint for an answer.

    Args:
        query: The user's question.
        embedder: SentenceTransformer used for retrieval.
        index: FAISS index over the corpus.
        data: DataFrame with a 'combined_text' column.
        top_k: Number of documents to retrieve as context.

    Returns:
        The model's answer string, or an "Error: ..." message on failure.
    """
    retrieved_docs = retrieve(query, embedder, index, data, top_k)
    context = "\n".join(retrieved_docs['combined_text'].tolist())
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"

    # Call the OpenAI API
    headers = {
        "Authorization": f"Bearer {openai_api_key}",
        "Content-Type": "application/json"
    }

    # Named `payload` so it no longer shadows the `data` DataFrame parameter.
    payload = {
        "model": "gpt-3.5-turbo",  # Change to your preferred model
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 100,
        "temperature": 0.7
    }

    try:
        # The original call had no timeout and could hang the app forever.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60,
        )
    except requests.RequestException as exc:
        return f"Error: {exc}"

    if response.status_code == 200:
        return (
            response.json()
            .get("choices", [{}])[0]
            .get("message", {})
            .get("content", "No answer found.")
        )

    # Error bodies are not always JSON (e.g. gateway errors) — guard the parse.
    try:
        detail = response.json().get('error', 'Unknown error')
    except ValueError:
        detail = response.text
    return f"Error: {detail}"
|
64 |
+
|
65 |
+
# Streamlit UI
st.title("RAG Application with OpenAI API")
st.write("Ask a question, and I'll find the answer for you!")

# File uploader for PDF data
uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])

if uploaded_file is None:
    st.write("Please upload a PDF file to start.")
else:
    # Build the corpus and retrieval structures from the uploaded PDF.
    data = load_data(uploaded_file)
    embedder, index = initialize_embeddings(data)

    # User input for query
    query = st.text_input("Your question:")

    if st.button("Get Answer"):
        if not query:
            st.write("Please enter a question.")
        else:
            answer = rag_query(query, embedder, index, data)
            st.write("Answer:", answer)
|