File size: 6,249 Bytes
0ad40ce
318c146
0ad40ce
 
 
 
 
 
 
 
318c146
0ad40ce
 
318c146
 
0ad40ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15eadf4
0ad40ce
 
 
15eadf4
 
0ad40ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318c146
 
 
 
 
0ad40ce
 
 
 
 
 
 
 
 
318c146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d094df2
318c146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d094df2
318c146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d094df2
0ad40ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318c146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
934cece
0ad40ce
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os
import re
import streamlit as st
from io import BytesIO
from tempfile import NamedTemporaryFile
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from xhtml2pdf import pisa
from markdown import markdown

# Function to process PDF, run Q&A, and return results
def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
    # Set up OpenAI API key
    os.environ["OPENAI_API_KEY"] = api_key

    # Temporarily save the uploaded file to disk
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(uploaded_file.read())  # Write the uploaded file to the temp file
        temp_pdf_path = temp_pdf.name

    # Load the PDF document using PyPDFLoader
    loader = PyPDFLoader(temp_pdf_path)
    docs = loader.load()

    # Split the document into smaller chunks for embedding
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(docs)

    # Create vector store and retriever
    vectorstore = FAISS.from_documents(
        documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

    # Read the system prompt from a Markdown (.md) file
    if os.path.exists(prompt_path):
        with open(prompt_path, "r") as file:
            system_prompt = file.read()
    else:
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

    # Ensure the system prompt includes {context} for document input
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    # Create the retrieval and question-answering chains
    llm = ChatOpenAI(model="gpt-4o")
    question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)

    # Load questions from a Markdown file
    if os.path.exists(questions_path):
        with open(questions_path, "r") as file:
            questions = [line.strip() for line in file.readlines() if line.strip()]
    else:
        raise FileNotFoundError(f"The specified file was not found: {questions_path}")

    # Generate question and answer pairs incrementally
    qa_results = []
    for question in questions:
        result = rag_chain.invoke({"input": question})
        answer = result["answer"]

        # Clean up the answer
        answer = clean_answer(answer)

        qa_text = f"### Question: {question}\n**Answer:**\n\n{answer}\n"
        qa_results.append(qa_text)
        # Update the placeholder with each new Q&A pair
        display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)

    # Clean up the temporary file
    os.remove(temp_pdf_path)

    return qa_results

# Function to clean up the AI's answer
def clean_answer(answer):
    """Normalize an LLM answer for display.

    Trims surrounding whitespace and removes a stray leading 'markdown'
    token (case-insensitive) that models sometimes prepend when emitting
    fenced markdown output.
    """
    stripped = answer.strip()
    prefix = 'markdown'
    if stripped.lower().startswith(prefix):
        stripped = stripped[len(prefix):].strip()

    # Further normalization (e.g. markdown table syntax) could go here.
    return stripped

# Function to convert markdown text to PDF with table support
def md_to_pdf(md_text):
    """Render markdown text into PDF bytes via xhtml2pdf.

    Converts the markdown to HTML (with table support), wraps it in a
    styled HTML document, and renders it with pisa.

    Returns:
        bytes: The generated PDF, or None if rendering failed.
    """
    # Markdown -> HTML body, enabling table syntax
    body_html = markdown(md_text, output_format='html5', extensions=['tables'])

    # CSS so tables render with visible borders and readable spacing
    styles = '''
    <style>
    body { 
        font-family: Arial, sans-serif; 
        font-size: 12pt;
    }
    table { 
        border-collapse: collapse; 
        width: 100%; 
    }
    th, td { 
        border: 1px solid black; 
        padding: 8px; 
        text-align: left; 
    }
    th { 
        background-color: #f2f2f2; 
    }
    </style>
    '''

    # Assemble the complete HTML document
    document = f'''
    <html>
    <head>
    {styles}
    </head>
    <body>
    {body_html}
    </body>
    </html>
    '''

    # Render to an in-memory PDF; signal failure with None
    buffer = BytesIO()
    status = pisa.CreatePDF(document, dest=buffer)
    return None if status.err else buffer.getvalue()

# Streamlit app layout — runs top-to-bottom on every Streamlit rerun.
st.title("Climate Policy Summary Tool")

# Input OpenAI API key (masked; passed to process_pdf, never stored)
api_key = st.text_input("Enter your OpenAI API key:", type="password")

# File upload section for PDF
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

# Define static paths for prompt and questions
# NOTE(review): resolved relative to the working directory — assumes the app
# is launched from the directory containing these files; confirm deployment.
prompt_file_path = "summary_tool_system_prompt.md"
questions_file_path = "summary_tool_questions.md"

# When user clicks "Generate" — only proceeds once both the key and a
# file have been provided (button returns False otherwise).
if st.button("Generate") and api_key and uploaded_file:
    # Create a placeholder to update with each Q&A
    display_placeholder = st.empty()

    with st.spinner("Processing..."):
        try:
            results = process_pdf(api_key, uploaded_file, questions_file_path, prompt_file_path, display_placeholder)
            
            # Allow the user to download the results as a Markdown file
            markdown_text = "\n".join(results)
            st.download_button(
                label="Download Results as Markdown",
                data=markdown_text,
                file_name="qa_results.md",
                mime="text/markdown"
            )

            # Convert markdown to PDF; md_to_pdf returns None on failure
            pdf_bytes = md_to_pdf(markdown_text)
            if pdf_bytes:
                st.download_button(
                    label="Download Results as PDF",
                    data=pdf_bytes,
                    file_name="qa_results.pdf",
                    mime="application/pdf"
                )
            else:
                st.error("Error generating PDF")

        # Broad catch is deliberate at this UI boundary: surface any
        # processing failure (missing files, API errors) to the user.
        except Exception as e:
            st.error(f"An error occurred: {e}")