"""Streamlit app that answers a fixed list of questions about an uploaded
climate policy PDF using a retrieval-augmented chain, then offers the results
as Markdown or PDF downloads. Run with `streamlit run`."""

import os
import textwrap
from io import BytesIO
from tempfile import NamedTemporaryFile

import streamlit as st
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas


def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
    """Build a RAG chain over the uploaded PDF and answer each question in questions_path."""
    os.environ["OPENAI_API_KEY"] = api_key

    # Persist the uploaded file to disk so PyPDFLoader can read it.
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(uploaded_file.read())
        temp_pdf_path = temp_pdf.name

    loader = PyPDFLoader(temp_pdf_path)
    docs = loader.load()

    # Split the document into overlapping chunks and index them for retrieval.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(docs)

    vectorstore = FAISS.from_documents(
        documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

    # Load the system prompt used for every question.
    if os.path.exists(prompt_path):
        with open(prompt_path, "r") as file:
            system_prompt = file.read()
    else:
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    llm = ChatOpenAI(model="gpt-4o")
    question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)

    # Load the questions, one per non-empty line.
    if os.path.exists(questions_path):
        with open(questions_path, "r") as file:
            questions = [line.strip() for line in file if line.strip()]
    else:
        raise FileNotFoundError(f"The specified file was not found: {questions_path}")

    # Answer each question and refresh the placeholder with the accumulated results.
    qa_results = []
    for question in questions:
        result = rag_chain.invoke({"input": question})
        answer = result["answer"]
        qa_text = f"### Question: {question}\n**Answer:** {answer}\n"
        qa_results.append(qa_text)

        display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)

    os.remove(temp_pdf_path)

    return qa_results


def create_pdf(content):
    """Render the results text into a simple PDF and return it as an in-memory buffer."""
    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=letter)
    pdf.setFont("Helvetica", 10)

    text = pdf.beginText(40, 750)
    for line in content.split("\n"):
        # Wrap long lines (roughly 100 characters fit across a letter page at Helvetica 10)
        # and start a new page once the cursor reaches the bottom margin.
        for wrapped_line in textwrap.wrap(line, width=100) or [""]:
            if text.getY() < 40:
                pdf.drawText(text)
                pdf.showPage()
                pdf.setFont("Helvetica", 10)
                text = pdf.beginText(40, 750)
            text.textLine(wrapped_line)
    pdf.drawText(text)

    pdf.showPage()
    pdf.save()

    buffer.seek(0)
    return buffer


st.title("Climate Policy Summary Tool") |
|
|
|
|
|
api_key = st.text_input("Enter your OpenAI API key:", type="password") |
|
|
|
|
|
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf") |
|
|
|
|
|
prompt_file_path = "summary_tool_system_prompt.md" |
|
questions_file_path = "summary_tool_questions.md" |
|
|
|
|
|
if st.button("Generate") and api_key and uploaded_file: |
|
|
|
display_placeholder = st.empty() |
|
|
|
with st.spinner("Processing..."): |
|
try: |
|
results = process_pdf(api_key, uploaded_file, questions_file_path, prompt_file_path, display_placeholder) |
|
|
|
|
|
markdown_output = "\n".join(results) |
|
st.download_button("Download as Markdown", markdown_output, file_name="results.md") |
|
|
|
|
|
pdf_output = create_pdf(markdown_output) |
|
st.download_button("Download as PDF", data=pdf_output, file_name="results.pdf") |
|
|
|
except Exception as e: |
|
st.error(f"An error occurred: {e}") |
|
|