File size: 4,908 Bytes
934cece
0ad40ce
 
 
 
 
 
 
 
 
15eadf4
0ad40ce
 
d094df2
 
0ad40ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15eadf4
0ad40ce
 
 
15eadf4
 
0ad40ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d094df2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ad40ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
934cece
 
 
 
d094df2
 
934cece
0ad40ce
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

import os
import streamlit as st
from io import BytesIO
from tempfile import NamedTemporaryFile
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Read a UTF-8 text file, raising a descriptive FileNotFoundError if it is missing
def _read_text_file(path):
    try:
        with open(path, "r", encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"The specified file was not found: {path}") from err


# Function to process PDF, run Q&A, and return results
def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
    """Answer a fixed list of questions about an uploaded PDF using a RAG chain.

    The PDF is written to a temporary file, loaded, split into chunks, embedded
    into a FAISS index, and each question is answered by a retrieval chain.

    Args:
        api_key: OpenAI API key; exported as the OPENAI_API_KEY environment variable.
        uploaded_file: File-like object whose ``read()`` returns the PDF bytes.
        questions_path: Path to a text file with one question per non-blank line.
        prompt_path: Path to the system prompt file. It must contain a
            ``{context}`` placeholder, which is the variable the retrieved
            documents are bound to.
        display_placeholder: Object exposing ``markdown(text, unsafe_allow_html=...)``;
            it is updated with all Q&A pairs collected so far after each answer.

    Returns:
        A list of Markdown strings, one per question, in question order.

    Raises:
        FileNotFoundError: If the prompt file or the questions file does not exist.
    """
    # Set up OpenAI API key
    os.environ["OPENAI_API_KEY"] = api_key

    # Read the local inputs first so a missing file fails fast, before any
    # temporary file is created or any paid embedding/LLM call is made.
    system_prompt = _read_text_file(prompt_path)
    questions = [
        line.strip()
        for line in _read_text_file(questions_path).split("\n")
        if line.strip()
    ]

    # PyPDFLoader needs a path on disk, so spill the upload to a temp file.
    temp_pdf = NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with temp_pdf:
            temp_pdf.write(uploaded_file.read())

        # Load the PDF document using PyPDFLoader
        docs = PyPDFLoader(temp_pdf.name).load()

        # Split the document into smaller chunks for embedding
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
        splits = text_splitter.split_documents(docs)

        # Create vector store and retriever
        vectorstore = FAISS.from_documents(
            documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
        )
        retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

        # The system prompt is expected to reference {context}; the retrieved
        # documents are bound to that variable below.
        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "{input}"),
            ]
        )

        # Create the retrieval and question-answering chains
        llm = ChatOpenAI(model="gpt-4o")
        question_answer_chain = create_stuff_documents_chain(
            llm, prompt, document_variable_name="context"
        )
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        # Generate question and answer pairs incrementally
        qa_results = []
        for question in questions:
            result = rag_chain.invoke({"input": question})
            answer = result["answer"]
            qa_results.append(f"### Question: {question}\n**Answer:** {answer}\n")
            # Update the placeholder with each new Q&A pair.
            # NOTE(review): unsafe_allow_html=True renders model output as raw
            # HTML — confirm this is intended.
            display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)
    finally:
        # Always clean up the temporary file, including when a step above fails.
        os.remove(temp_pdf.name)

    return qa_results

# Function to create a PDF using reportlab
def create_pdf(content):
    """Render plain text into a letter-size PDF and return it as an in-memory buffer.

    Lines wider than the printable area are word-wrapped, and the output
    continues onto additional pages when a page is full, so no text is drawn
    off the page.

    Args:
        content: Text to render; lines are separated by "\\n".

    Returns:
        A BytesIO positioned at offset 0 containing the PDF bytes.
    """
    # Local import: only this function needs the font-metric-aware splitter.
    from reportlab.lib.utils import simpleSplit

    font_name = "Helvetica"
    font_size = 10
    leading = font_size * 1.2  # distance between consecutive baselines
    left_margin = 40
    top_y = 750  # baseline of the first line on each page
    bottom_y = 40  # lowest baseline allowed on a page
    page_width = letter[0]
    max_width = page_width - 2 * left_margin
    lines_per_page = int((top_y - bottom_y) // leading) + 1

    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=letter)

    # Wrap only the lines that do not fit, so short lines (including blank
    # lines and leading whitespace) are drawn exactly as given.
    wrapped_lines = []
    for line in content.split("\n"):
        if pdf.stringWidth(line, font_name, font_size) <= max_width:
            wrapped_lines.append(line)
        else:
            wrapped_lines.extend(
                simpleSplit(line, font_name, font_size, max_width) or [line]
            )

    for start in range(0, len(wrapped_lines), lines_per_page):
        # showPage() resets the canvas state, so the font is set on every page.
        pdf.setFont(font_name, font_size, leading)
        text = pdf.beginText(left_margin, top_y)
        for line in wrapped_lines[start:start + lines_per_page]:
            text.textLine(line)
        pdf.drawText(text)
        pdf.showPage()

    pdf.save()

    buffer.seek(0)
    return buffer

# Streamlit app layout
st.title("Climate Policy Summary Tool")

# Input OpenAI API key
api_key = st.text_input("Enter your OpenAI API key:", type="password")

# File upload section for PDF
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

# Define static paths for prompt and questions
prompt_file_path = "summary_tool_system_prompt.md"
questions_file_path = "summary_tool_questions.md"

# Session-state key holding the last generated Markdown. Keeping the result in
# session state lets it survive the script rerun that any widget interaction
# (such as clicking a download button) triggers.
RESULTS_KEY = "qa_markdown"

# When user clicks "Generate"
if st.button("Generate"):
    if not api_key:
        st.warning("Please enter your OpenAI API key.")
    elif uploaded_file is None:
        st.warning("Please upload a PDF document.")
    else:
        # Drop any previous result so a failed run does not show stale output.
        if RESULTS_KEY in st.session_state:
            del st.session_state[RESULTS_KEY]

        # Create a placeholder to update with each Q&A
        display_placeholder = st.empty()

        with st.spinner("Processing..."):
            try:
                results = process_pdf(
                    api_key,
                    uploaded_file,
                    questions_file_path,
                    prompt_file_path,
                    display_placeholder,
                )
            except Exception as e:
                # Top-level UI boundary: surface the failure instead of crashing.
                st.error(f"An error occurred: {e}")
            else:
                st.session_state[RESULTS_KEY] = "\n".join(results)
                # The stored result is rendered below; clear the live
                # placeholder so the answers are not shown twice.
                display_placeholder.empty()

# Show the most recent result and its download options on every run
if RESULTS_KEY in st.session_state:
    markdown_output = st.session_state[RESULTS_KEY]
    st.markdown(markdown_output, unsafe_allow_html=True)

    # Allow the user to download the results as a Markdown file
    st.download_button(
        "Download as Markdown",
        markdown_output,
        file_name="results.md",
        mime="text/markdown",
    )

    # Create a PDF file for the user to download
    try:
        pdf_output = create_pdf(markdown_output)
    except Exception as e:
        st.error(f"An error occurred: {e}")
    else:
        st.download_button(
            "Download as PDF",
            data=pdf_output,
            file_name="results.pdf",
            mime="application/pdf",
        )