Spaces:
Running
Running
import streamlit as st | |
import requests | |
# ----------------- Streamlit Setup ----------------- # | |
st.set_page_config(page_title="Lease Agreement Extractor (OCR + AI)", layout="centered") | |
st.title("π Lease Agreement Extractor (OCR + AI)") | |
# ----------------- Assistant & API Config ----------------- # | |
OPENAI_ASSISTANT_ID = "asst_xBnNfiyWmVa4iF3CgXwJnmBt" # Replace with your assistant ID | |
OPENAI_API_KEY = "YOUR_OPENAI_API_KEY" # Replace with your OpenAI key | |
HEADERS = { | |
"Authorization": f"Bearer {OPENAI_API_KEY}", | |
"Content-Type": "application/json" | |
} | |
# ----------------- Upload Interface ----------------- # | |
uploaded_file = st.file_uploader("Upload a lease agreement (PDF, PNG, JPG)", type=["pdf", "png", "jpg", "jpeg"]) | |
def upload_file_to_openai(file_bytes, filename): | |
files = { | |
"file": (filename, file_bytes, "application/octet-stream") | |
} | |
response = requests.post("https://api.openai.com/v1/files", headers={"Authorization": f"Bearer {OPENAI_API_KEY}"}, files=files) | |
return response.json().get("id") | |
def create_thread_and_run(file_id): | |
# Create thread | |
thread_res = requests.post("https://api.openai.com/v1/threads", headers=HEADERS, json={}) | |
thread_data = thread_res.json() | |
thread_id = thread_data.get("id") | |
# Run assistant with structured lease extraction prompt | |
run_payload = { | |
"assistant_id": OPENAI_ASSISTANT_ID, | |
"instructions": """ | |
You are an AI assistant designed to extract structured lease data from any uploaded agreement, even if it is scanned. Your responsibilities are: | |
- Accept PDF or image-based lease documents. | |
- Use OCR (Optical Character Recognition) if the document is non-selectable or scanned. | |
- Parse and extract key structured data fields from lease agreements. | |
- Output results in JSON format following a fixed schema. | |
OCR & Extraction Rules: | |
1. If text cannot be extracted normally, perform OCR using image-to-text on each page. | |
2. Preserve paragraph structure, detect headers, signature blocks, tables, and dates. | |
Expected JSON Output: | |
{ | |
"document_title": "Lease Agreement Title or File Name", | |
"parties": { | |
"lessor": "Name of Lessor", | |
"lessee": "Name of Lessee" | |
}, | |
"property_description": "Detailed description of leased property", | |
"term": { | |
"start_date": "YYYY-MM-DD", | |
"end_date": "YYYY-MM-DD", | |
"renewal_options": "Yes/No or Clause Text" | |
}, | |
"financials": { | |
"monthly_rent": "Amount", | |
"deposit": "Amount", | |
"escalation_clause": "Text if present" | |
}, | |
"obligations": { | |
"maintenance": "Lessor/Lessee", | |
"insurance": "Lessor/Lessee", | |
"subletting": "Allowed/Not allowed" | |
}, | |
"signatures": [ | |
{ | |
"party": "Party Name", | |
"signed_by": "Full Name", | |
"date": "YYYY-MM-DD", | |
"position": "Title if listed" | |
} | |
], | |
"raw_text_per_page": { | |
"page_1": "Full OCR text of page 1", | |
"page_2": "Full OCR text of page 2" | |
} | |
} | |
π Behaviors: | |
- Automatically detect if OCR is needed. | |
- Normalize dates to YYYY-MM-DD. | |
- Use "Not Found" or null where info is missing. | |
- Return JSON only. | |
- If OCR text is low-confidence, include: "low_confidence": true | |
""", | |
"file_ids": [file_id] | |
} | |
run_res = requests.post(f"https://api.openai.com/v1/threads/{thread_id}/runs", headers=HEADERS, json=run_payload) | |
run_data = run_res.json() | |
run_id = run_data.get("id") | |
return thread_id, run_id | |
# ----------------- Main UI Flow ----------------- # | |
if uploaded_file is not None: | |
st.success("π File uploaded successfully.") | |
if st.button("π Run Lease Extraction"): | |
with st.spinner("Uploading and invoking assistant..."): | |
file_id = upload_file_to_openai(uploaded_file.getvalue(), uploaded_file.name) | |
if file_id: | |
thread_id, run_id = create_thread_and_run(file_id) | |
st.success("β Assistant run started!") | |
st.code(f"Thread ID: {thread_id}", language="text") | |
st.code(f"Run ID: {run_id}", language="text") | |
st.markdown("π You can retrieve results via OpenAI API using these IDs.") | |
else: | |
st.error("β File upload to OpenAI failed.") | |
else: | |
st.info("Please upload a lease agreement file to begin.") | |