Spaces:

opentyphoon
/

typhoon-ocr

Running

File size: 5,964 Bytes

import base64
from io import BytesIO
import json
import os
from meta_prompt import get_prompt
from openai import OpenAI
from utils import render_pdf_to_base64png, image_to_pdf, get_anchor_text
import gradio as gr
from PIL import Image


# OpenAI-compatible client; the endpoint and API key are read from the environment.
openai = OpenAI(
    api_key=os.getenv("TYPHOON_API_KEY"),
    base_url=os.getenv("TYPHOON_BASE_URL"),
)

# Custom primary palette, keyed by the shade names passed to gr.themes.Color.
_PRIMARY_SHADES = {
    "c50": "#f7f7fd",
    "c100": "#dfdef8",
    "c200": "#c4c1f2",
    "c300": "#a29eea",
    "c400": "#8f8ae6",
    "c500": "#756fe0",
    "c600": "#635cc1",
    "c700": "#4f4a9b",
    "c800": "#433f83",
    "c900": "#302d5e",
    "c950": "#302d5e",
}

# Soft theme with the custom primary hue and built-in secondary/neutral hues.
theme = gr.themes.Soft(
    neutral_hue="stone",
    secondary_hue="rose",
    primary_hue=gr.themes.Color(**_PRIMARY_SHADES),
)

def _extract_markdown(text_output):
    """Convert the raw model response into Markdown for display.

    The model is expected to answer with a JSON object that carries the page
    text under the ``natural_text`` key.

    Args:
        text_output: Raw response content; may be ``None`` or malformed.

    Returns:
        The ``natural_text`` value with ``<figure>``/``</figure>`` tags
        removed, an empty string when the key is missing or ``null``, or a
        warning message when the response cannot be interpreted.
    """
    try:
        json_data = json.loads(text_output)
    except (TypeError, ValueError) as e:
        # TypeError: content was None / not a string.
        # ValueError: covers json.JSONDecodeError for malformed JSON.
        return f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"

    if not isinstance(json_data, dict):
        # Valid JSON, but not an object, so there is no key to look up.
        return (
            "⚠️ Could not extract `natural_text` from output.\n"
            f"Error: expected a JSON object, got {type(json_data).__name__}"
        )

    natural_text = json_data.get("natural_text")
    if natural_text is None:
        # Missing key or explicit null: nothing to show, but not an error.
        return ""
    if not isinstance(natural_text, str):
        return (
            "⚠️ Could not extract `natural_text` from output.\n"
            f"Error: `natural_text` is {type(natural_text).__name__}, expected a string"
        )
    return natural_text.replace("<figure>", "").replace("</figure>", "")


def process_pdf(pdf_or_image_file, task_type):
    """Run OCR on the first page of an uploaded PDF or image.

    Args:
        pdf_or_image_file: Uploaded file object exposing its path as
            ``.name``, or ``None`` when nothing was uploaded.
        task_type: Name of the prompt variant, passed to ``get_prompt``.

    Returns:
        A ``(preview_image, markdown)`` tuple. ``preview_image`` is a PIL
        image of page 1, or ``None`` when there is no upload or the image
        could not be converted to PDF; ``markdown`` is the extracted text or
        a human-readable status/warning message.
    """
    if pdf_or_image_file is None:
        return None, "No file uploaded"

    orig_filename = pdf_or_image_file.name
    ext = os.path.splitext(orig_filename)[1].lower()
    filename = orig_filename  # default to original file if PDF

    # If the file is not a PDF, assume it's an image and convert it to PDF.
    if ext != ".pdf":
        filename = image_to_pdf(orig_filename)
        if filename is None:
            return None, "Error converting image to PDF"

    # Render the first page to base64 PNG and then load it into a PIL image.
    image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1800)
    image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))

    # Extract anchor text from the PDF (first page).
    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=8000)

    # Retrieve the prompt template for this task and fill in the anchor text.
    prompt_template_fn = get_prompt(task_type)
    prompt = prompt_template_fn(anchor_text)

    # One user message carrying both the prompt text and the page image
    # (inlined as a base64 data URL).
    messages = [{
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
        ],
    }]

    # Send the request to the OpenAI-compatible API.
    response = openai.chat.completions.create(
        model=os.environ.get("TYPHOON_OCR_MODEL"),
        messages=messages,
        max_tokens=16384,
        extra_body={
            "repetition_penalty": 1.2,
            "temperature": 0.1,
            "top_p": 0.6,
        },
    )
    text_output = response.choices[0].message.content

    # The response should be a JSON object containing 'natural_text'.
    return image_pil, _extract_markdown(text_output)


# Build the Gradio UI: a static header, then a two-column row with the
# inputs/preview on the left and the OCR result on the right.
with gr.Blocks(theme=theme) as demo:
    # Static header: model link plus collapsible disclaimer, terms of use
    # and license sections.
    title = gr.HTML("""
    <h1>Typhoon OCR</h1>
    <ul>
        <li>🤗 <b>Model weights</b>: <a href="https://huggingface.co/scb10x/typhoon-ocr-7b" target="_blank">https://huggingface.co/scb10x/typhoon-ocr-7b</a></li>
    </ul>
    <br />
    <details>
        <summary><strong>Disclaimer</strong></summary>
        The responses generated by this Artificial Intelligence (AI) system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. These AI-generated responses do not represent those of the organizations. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI. The information produced by this AI is not intended to malign any religion, ethnic group, club, organization, company, individual, anyone, or anything. It is not the intent of the organizations to malign any group or individual. The AI operates based on its programming and training data and its responses should not be interpreted as the explicit intent or opinion of the organizations.
    </details>
    <br />
    <details>
        <summary><strong>Terms of use</strong></summary>
        By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. Vision language models are prone to hallucinations to a greater extent compared to text-only LLMs.
    </details>
    <br />
    <details>
        <summary><strong>License</strong></summary>
        This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache license 2.0.
    </details>
""")
    with gr.Row():
        # Left column (scale=1): upload, task selection, run button and
        # page-1 preview.
        with gr.Column(scale=1):
            # Accept PDFs as well as common image formats.
            pdf_input = gr.File(label="📄 Upload Image file or PDF file (only the first page will be processed)", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
            task_dropdown = gr.Dropdown(["default", "structure"], label="🎯 Select Task", value="default")
            run_button = gr.Button("🚀 Run")
            image_output = gr.Image(label="📸 Preview Image (Page 1)", type="pil")
        # Right column (scale=2): the Markdown result.
        with gr.Column(scale=2):
            markdown_output = gr.Markdown(label='Markdown Result', show_label=True)

    
    # Clicking Run calls process_pdf with the uploaded file and the selected
    # task; its (image, markdown) return value fills the preview and result.
    run_button.click(
        fn=process_pdf,
        inputs=[pdf_input, task_dropdown],
        outputs=[image_output, markdown_output]
    )

# Launch the Gradio demo. share=False, so no temporary public share link is
# requested.
demo.launch(share=False)