import os
import re
import streamlit as st
from io import BytesIO
from tempfile import NamedTemporaryFile
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from xhtml2pdf import pisa
from markdown import markdown


# Process the uploaded PDF, run the Q&A chain over it, and return the results
def process_pdf(api_key, uploaded_file, questions_path, prompt_path, display_placeholder):
    # Set up the OpenAI API key
    os.environ["OPENAI_API_KEY"] = api_key

    # Temporarily save the uploaded file to disk
    with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(uploaded_file.read())  # Write the uploaded bytes to the temp file
        temp_pdf_path = temp_pdf.name

    # Load the PDF document using PyPDFLoader
    loader = PyPDFLoader(temp_pdf_path)
    docs = loader.load()

    # Split the document into smaller, overlapping chunks for embedding
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(docs)

    # Embed the chunks into a FAISS vector store and build a retriever on top
    vectorstore = FAISS.from_documents(
        documents=splits, embedding=OpenAIEmbeddings(model="text-embedding-3-large")
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
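    # (k=10 means each question is answered from the ten most similar chunks;
    # raise or lower it to trade context size against token cost.)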

    # Read the system prompt from a Markdown (.md) file
    if os.path.exists(prompt_path):
        with open(prompt_path, "r") as file:
            system_prompt = file.read()
    else:
        raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

    # Ensure the system prompt includes {context} for document input
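    # A small guard (a sketch): create_stuff_documents_chain injects the retrieved
    # documents through the {context} placeholder, so failing fast here gives a
    # clearer error than one raised from inside the chain.
    if "{context}" not in system_prompt:
        raise ValueError(f"The system prompt in {prompt_path} must contain a {{context}} placeholder.")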
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    # Create the retrieval and question-answering chains
    llm = ChatOpenAI(model="gpt-4o")
    question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context")
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)
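    # rag_chain.invoke({"input": ...}) returns a dict holding the original
    # "input", the retrieved "context" documents, and the generated "answer".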

    # Load questions from a Markdown file, one per non-empty line
    if os.path.exists(questions_path):
        with open(questions_path, "r") as file:
            questions = [line.strip() for line in file if line.strip()]
    else:
        raise FileNotFoundError(f"The specified file was not found: {questions_path}")

    # Generate question and answer pairs incrementally
    qa_results = []
    for question in questions:
        result = rag_chain.invoke({"input": question})
        answer = clean_answer(result["answer"])
        qa_text = f"### Question: {question}\n**Answer:**\n\n{answer}\n"
        qa_results.append(qa_text)

        # Update the placeholder with each new Q&A pair
        display_placeholder.markdown("\n".join(qa_results), unsafe_allow_html=True)

    # Clean up the temporary file
    os.remove(temp_pdf_path)
    return qa_results


# Clean up the model's answer before rendering
def clean_answer(answer):
    # Remove an unwanted leading 'markdown' token
    answer = answer.strip()
    if answer.lower().startswith('markdown'):
        answer = answer[len('markdown'):].strip()

    # Additional cleaning can go here, e.g. ensuring markdown table syntax is correct.
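    # One such cleanup (a sketch): models sometimes wrap the whole reply in a
    # fenced code block (```markdown ... ```); strip any leading/trailing fences
    # so the content renders as markdown rather than as a literal code block.
    answer = re.sub(r"^```(?:markdown)?\s*", "", answer)
    answer = re.sub(r"\s*```$", "", answer).strip()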
    return answer


# Convert markdown text to PDF, with table support
def md_to_pdf(md_text):
    # Convert markdown to HTML, enabling the tables extension
    html_content = markdown(md_text, output_format='html5', extensions=['tables'])

    # CSS styles for more readable table rendering
    css_styles = '''
    <style>
        body {
            font-family: Arial, sans-serif;
            font-size: 12pt;
        }
        table {
            border-collapse: collapse;
            width: 100%;
        }
        th, td {
            border: 1px solid black;
            padding: 8px;
            text-align: left;
        }
        th {
            background-color: #f2f2f2;
        }
    </style>
    '''

    # Construct the full HTML document from the CSS and converted content
    html = f'''
    <html>
    <head>
        {css_styles}
    </head>
    <body>
        {html_content}
    </body>
    </html>
    '''
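
    # Note: xhtml2pdf implements only a subset of HTML/CSS, so very complex
    # markdown (e.g. nested tables) may not render faithfully.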
    # Render the HTML to PDF in memory
    pdf = BytesIO()
    pisa_status = pisa.CreatePDF(html, dest=pdf)
    if pisa_status.err:
        return None
    return pdf.getvalue()


# Streamlit app layout
st.title("Climate Policy Summary Tool")

# Input for the OpenAI API key
api_key = st.text_input("Enter your OpenAI API key:", type="password")

# File upload section for the PDF
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

# Static paths for the system prompt and question list
prompt_file_path = "summary_tool_system_prompt.md"
questions_file_path = "summary_tool_questions.md"

# Run the pipeline when the user clicks "Generate"
if st.button("Generate") and api_key and uploaded_file:
    # Placeholder that is updated as each Q&A pair is generated
    display_placeholder = st.empty()
    with st.spinner("Processing..."):
        try:
            results = process_pdf(api_key, uploaded_file, questions_file_path, prompt_file_path, display_placeholder)

            # Offer the results as a Markdown download
            markdown_text = "\n".join(results)
            st.download_button(
                label="Download Results as Markdown",
                data=markdown_text,
                file_name="qa_results.md",
                mime="text/markdown"
            )

            # Convert the markdown to PDF and offer it as a second download
            pdf_bytes = md_to_pdf(markdown_text)
            if pdf_bytes:
                st.download_button(
                    label="Download Results as PDF",
                    data=pdf_bytes,
                    file_name="qa_results.pdf",
                    mime="application/pdf"
                )
            else:
                st.error("Error generating PDF")
        except Exception as e:
            st.error(f"An error occurred: {e}")