File size: 4,908 Bytes
934cece 0ad40ce 15eadf4 0ad40ce d094df2 0ad40ce 15eadf4 0ad40ce 15eadf4 0ad40ce d094df2 0ad40ce 934cece d094df2 934cece 0ad40ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import os
import streamlit as st
from io import BytesIO
from tempfile import NamedTemporaryFile
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# Function to process PDF, run Q&A, and return results
def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
    """Run retrieval-augmented Q&A over an uploaded PDF.

    Args:
        api_key: OpenAI API key; exported to the environment for the
            LangChain OpenAI clients.
        uploaded_file: File-like object (e.g. Streamlit UploadedFile)
            containing the PDF bytes.
        questions_path: Path to a Markdown file with one question per line.
        prompt_path: Path to a Markdown file holding the system prompt;
            it must contain a ``{context}`` placeholder for the retrieved
            document chunks.
        display_placeholder: Streamlit placeholder updated incrementally
            with each new Q&A pair.

    Returns:
        list[str]: Markdown-formatted question/answer sections.

    Raises:
        FileNotFoundError: If the prompt or questions file is missing.
        ValueError: If the system prompt lacks the ``{context}`` placeholder.
    """
    # Set up OpenAI API key for the ChatOpenAI / OpenAIEmbeddings clients.
    os.environ["OPENAI_API_KEY"] = api_key

    # Fail fast: validate both input files BEFORE doing any expensive
    # embedding work, so a missing file doesn't waste an API round trip.
    if not os.path.exists(prompt_path):
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")
    if not os.path.exists(questions_path):
        raise FileNotFoundError(f"The specified file was not found: {questions_path}")

    with open(prompt_path, "r") as file:
        system_prompt = file.read()
    # Enforce the documented contract: the stuff-documents chain injects
    # retrieved chunks via "{context}", so its absence is a config error.
    if "{context}" not in system_prompt:
        raise ValueError("System prompt must contain a {context} placeholder for document input.")

    with open(questions_path, "r") as file:
        questions = [line.strip() for line in file if line.strip()]

    # Temporarily save the uploaded file to disk: PyPDFLoader needs a path,
    # not a file-like object.
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(uploaded_file.read())
        temp_pdf_path = temp_pdf.name

    try:
        # Load and chunk the PDF for embedding.
        loader = PyPDFLoader(temp_pdf_path)
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
        splits = text_splitter.split_documents(docs)

        # Build the vector store and a top-10 retriever over the chunks.
        vectorstore = FAISS.from_documents(
            documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
        )
        retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "{input}"),
            ]
        )

        # Create the retrieval and question-answering chains.
        llm = ChatOpenAI(model="gpt-4o")
        question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        # Answer each question, streaming results to the UI as they arrive.
        qa_results = []
        for question in questions:
            result = rag_chain.invoke({"input": question})
            answer = result["answer"]
            qa_text = f"### Question: {question}\n**Answer:** {answer}\n"
            qa_results.append(qa_text)
            display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)
    finally:
        # Always remove the temp file, even when the chain raises —
        # the original code leaked it on any exception.
        os.remove(temp_pdf_path)

    return qa_results
# Function to create a PDF using reportlab
def create_pdf(content):
    """Render plain text into a simple single-column PDF.

    Args:
        content: Text to render; split on newlines, one text line per row.

    Returns:
        io.BytesIO: Buffer positioned at 0, containing the finished PDF.
    """
    # Page geometry: left margin 40pt, text starts at y=750, and we break
    # to a new page once the cursor drops below y=40. The original code
    # had no page break, so long content ran off the bottom and was lost.
    left_margin, top_y, bottom_y = 40, 750, 40

    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=letter)
    pdf.setFont("Helvetica", 10)
    text = pdf.beginText(left_margin, top_y)

    for line in content.split("\n"):
        if text.getY() <= bottom_y:
            # Flush the current page and start a fresh text object;
            # the font must be reset after showPage().
            pdf.drawText(text)
            pdf.showPage()
            pdf.setFont("Helvetica", 10)
            text = pdf.beginText(left_margin, top_y)
        text.textLine(line)

    pdf.drawText(text)
    pdf.showPage()
    pdf.save()
    buffer.seek(0)
    return buffer
# Streamlit app layout
st.title("Climate Policy Summary Tool")

# Input OpenAI API key (masked) and the PDF to analyze.
api_key = st.text_input("Enter your OpenAI API key:", type="password")
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

# Static paths for the system prompt and the question list.
prompt_file_path = "summary_tool_system_prompt.md"
questions_file_path = "summary_tool_questions.md"

if st.button("Generate"):
    # Tell the user what is missing instead of silently doing nothing,
    # which is what the bare `and` short-circuit used to do.
    if not api_key or not uploaded_file:
        st.warning("Please enter an OpenAI API key and upload a PDF document first.")
    else:
        # Placeholder that process_pdf updates with each new Q&A pair.
        display_placeholder = st.empty()
        with st.spinner("Processing..."):
            try:
                results = process_pdf(
                    api_key, uploaded_file, questions_file_path, prompt_file_path, display_placeholder
                )
                # Offer the results as Markdown and as a rendered PDF.
                markdown_output = "\n".join(results)
                st.download_button("Download as Markdown", markdown_output, file_name="results.md")
                pdf_output = create_pdf(markdown_output)
                st.download_button("Download as PDF", data=pdf_output, file_name="results.pdf")
            except Exception as e:
                # Top-level UI boundary: surface any failure to the user.
                st.error(f"An error occurred: {e}")