import os

import faiss
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
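# Minimal Streamlit RAG demo: upload a PDF, embed its pages with
# sentence-transformers, index them in FAISS, and answer questions via the
# OpenAI Chat Completions API.
# Assumed dependencies: streamlit, pandas, numpy, requests, faiss-cpu,
# sentence-transformers, PyMuPDF. Assuming this file is saved as app.py,
# run it with: streamlit run app.py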
|
|
|
|
|
@st.cache_data
def load_data(file):
    # Extract text page by page so each page becomes its own retrievable
    # document (a single text blob would make FAISS retrieval pointless).
    with fitz.open(stream=file.read(), filetype="pdf") as pdf_file:
        pages = [page.get_text() for page in pdf_file]
    return pd.DataFrame({'combined_text': pages})
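# Page-level chunks are an assumption that keeps this sketch simple; for very
# long pages you could split further (e.g., by paragraph) before embedding.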
|
|
|
|
|
@st.cache_resource
def initialize_embeddings(data):
    # Embed every page and build a brute-force L2 index over the vectors.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = embedder.encode(data['combined_text'].tolist(), convert_to_tensor=False)
    embeddings = np.asarray(embeddings, dtype="float32")  # FAISS expects float32
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return embedder, index
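# IndexFlatL2 performs exact nearest-neighbour search, which is plenty for the
# handful of pages in one PDF. For cosine similarity you could instead
# normalize the embeddings and use faiss.IndexFlatIP.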
|
|
|
|
|
# Never hard-code the API key; read it from the environment (or st.secrets).
openai_api_key = os.getenv("OPENAI_API_KEY", "")
|
|
|
|
|
def retrieve(query, embedder, index, data, top_k=5):
    # Embed the query and return the most similar pages from the DataFrame.
    query_embedding = embedder.encode([query], convert_to_tensor=False)
    distances, indices = index.search(np.asarray(query_embedding, dtype="float32"), top_k)
    # FAISS pads results with -1 when fewer than top_k vectors are indexed.
    valid = [i for i in indices[0] if i != -1]
    return data.iloc[valid]
|
|
|
|
|
def rag_query(query, embedder, index, data, top_k=5):
    # Retrieve the most relevant pages and pack them into the prompt as context.
    retrieved_docs = retrieve(query, embedder, index, data, top_k)
    context = "\n".join(retrieved_docs['combined_text'].tolist())
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"

    headers = {
        "Authorization": f"Bearer {openai_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 100,
        "temperature": 0.7
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,
    )

    if response.status_code == 200:
        answer = response.json().get("choices", [{}])[0].get("message", {}).get("content", "No answer found.")
    else:
        error = response.json().get("error", {})
        answer = f"Error: {error.get('message', 'Unknown error')}"

    return answer
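# Note: max_tokens caps the length of the generated answer and temperature
# controls sampling randomness; the model name can be swapped for any chat
# model the API key has access to.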
|
|
|
|
|
st.title("RAG Application with OpenAI API") |
|
st.write("Ask a question, and I'll find the answer for you!") |
|
|
|
|
|
uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"]) |
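# Streamlit reruns this script on every interaction; the cache decorators on
# load_data and initialize_embeddings keep the PDF parsing and embedding work
# from being repeated for the same file.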
|
|
|
if uploaded_file is not None:
    data = load_data(uploaded_file)
    embedder, index = initialize_embeddings(data)

    query = st.text_input("Your question:")
    if st.button("Get Answer"):
        if query:
            answer = rag_query(query, embedder, index, data)
            st.write("Answer:", answer)
        else:
            st.write("Please enter a question.")
else:
    st.write("Please upload a PDF file to start.")
|
|