import base64
from io import BytesIO
import json
import os
from meta_prompt import get_prompt
from openai import OpenAI
from utils import render_pdf_to_base64png, image_to_pdf, get_anchor_text
import gradio as gr
from PIL import Image
# Shared client for the OpenAI-compatible endpoint. The endpoint URL and the
# API key are read from the environment once, at import time; os.getenv
# yields None for a variable that is not set.
openai = OpenAI(
    api_key=os.getenv("TYPHOON_API_KEY"),
    base_url=os.getenv("TYPHOON_BASE_URL"),
)
# Gradio theme: the built-in Soft theme with stock "rose" / "stone" hues and a
# custom primary palette running from the lightest shade (c50) to the darkest
# (c950). NOTE(review): c900 and c950 share the same hex value -- confirm this
# is intentional.
theme = gr.themes.Soft(
    neutral_hue="stone",
    secondary_hue="rose",
    primary_hue=gr.themes.Color(
        c50="#f7f7fd",
        c100="#dfdef8",
        c200="#c4c1f2",
        c300="#a29eea",
        c400="#8f8ae6",
        c500="#756fe0",
        c600="#635cc1",
        c700="#4f4a9b",
        c800="#433f83",
        c900="#302d5e",
        c950="#302d5e",
    ),
)
def process_pdf(pdf_or_image_file, task_type):
if pdf_or_image_file is None:
return None, "No file uploaded"
orig_filename = pdf_or_image_file.name
ext = os.path.splitext(orig_filename)[1].lower()
filename = orig_filename # default to original file if PDF
# If the file is not a PDF, assume it's an image and convert it to PDF.
if ext not in [".pdf"]:
filename = image_to_pdf(orig_filename)
if filename is None:
return None, "Error converting image to PDF"
# Render the first page to base64 PNG and then load it into a PIL image.
image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1800)
image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
# Extract anchor text from the PDF (first page)
anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=8000)
# Retrieve and fill in the prompt template with the anchor_text
prompt_template_fn = get_prompt(task_type)
PROMPT = prompt_template_fn(anchor_text)
# Create a messages structure including text and image URL
messages = [{
"role": "user",
"content": [
{"type": "text", "text": PROMPT},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
],
}]
# send messages to openai compatible api
response = openai.chat.completions.create(
model=os.environ.get("TYPHOON_OCR_MODEL"),
messages=messages,
max_tokens=16384,
extra_body={
"repetition_penalty": 1.2,
"temperature": 0.1,
"top_p": 0.6,
},
)
text_output = response.choices[0].message.content
# Try to parse the output as a JSON object containing a 'natural_text' key
try:
json_data = json.loads(text_output)
markdown_out = json_data.get('natural_text', "").replace("