Spaces:
Running
Running
File size: 7,769 Bytes
b499397 12eacd9 b499397 fe3edea 12eacd9 fe3edea 9fab3a2 12eacd9 fe3edea 9fab3a2 b499397 9fab3a2 b499397 9fab3a2 b499397 e1f64ae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
"""
Entry point for DescribePDF Hugging Face Space.
"""
import gradio as gr
import os
import sys
# Make sure this script's own directory is on sys.path so sibling packages can be imported
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# Import only the parts that are needed
from describepdf import config, core
# Gradio theme for the app: the "Soft" base theme with a red/rose palette
# and large spacing.
theme = gr.themes.Soft(primary_hue="red", secondary_hue="rose", spacing_size="lg")
# Module-level defaults offered as dropdown suggestions in the settings tab.
# The first entry of each model list is used as the initial dropdown value.

# Vision-language models suggested for describing pages.
SUGGESTED_VLMS = [
    "qwen/qwen2.5-vl-72b-instruct",
    "google/gemini-2.5-pro-preview-03-25",
    "openai/chatgpt-4o-latest",
]

# Language models suggested for generating the document summary.
SUGGESTED_LLMS = [
    "google/gemini-2.5-flash-preview",
    "openai/chatgpt-4o-latest",
    "anthropic/claude-3.5-sonnet",
]

# Output languages suggested for the generated descriptions.
SUGGESTED_LANGUAGES = [
    "English",
    "Spanish",
    "French",
    "German",
    "Chinese",
    "Japanese",
    "Italian",
    "Portuguese",
    "Russian",
    "Korean",
]
def _resolve_pdf_path(pdf_file_obj):
    """Return the uploaded PDF's filesystem path as a plain ``str``.

    Accepts either a path (``str`` / ``os.PathLike``) or a file-like object
    that exposes its path through a ``name`` attribute.
    """
    if isinstance(pdf_file_obj, (str, os.PathLike)):
        return str(os.fspath(pdf_file_obj))
    return str(pdf_file_obj.name)


def _write_download_file(pdf_path, markdown_text):
    """Write *markdown_text* to a new temporary file.

    The file is named ``<pdf base name>_description.md`` and placed in its own
    freshly created temporary directory, so concurrent runs cannot collide
    and the file on disk carries the same name that is shown to the user.

    Returns:
        A ``(download_filepath, download_filename)`` tuple.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    import tempfile

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    download_filename = f"{base_name}_description.md"
    download_dir = tempfile.mkdtemp(prefix="describepdf_")
    download_filepath = os.path.join(download_dir, download_filename)
    with open(download_filepath, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_text)
    return download_filepath, download_filename


def generate(
    pdf_file_obj,
    ui_api_key,
    ui_vlm_model,
    ui_lang,
    ui_use_md,
    ui_use_sum,
    ui_sum_model,
    progress=gr.Progress()
):
    """Run the core PDF-to-Markdown conversion with the settings from the UI.

    Args:
        pdf_file_obj: The uploaded PDF, either as a path or as a file-like
            object exposing its path via ``.name``. ``None``/empty means no
            file was uploaded.
        ui_api_key: OpenRouter API key typed in the UI. When blank (or
            ``None``), the key from the environment configuration is used.
        ui_vlm_model: Name of the VLM model to use.
        ui_lang: Desired output language.
        ui_use_md: Whether to use Markitdown for extra text context.
        ui_use_sum: Whether to use a PDF summary for augmented context.
        ui_sum_model: LLM model for the summary; falls back to the
            ``or_summary_model`` entry of the environment configuration.
        progress: Gradio progress tracker used to report conversion status.

    Returns:
        A ``(status_message, download_update, result_markdown)`` tuple
        matching the three output components wired to this function.
    """
    if not pdf_file_obj:
        return "Please upload a PDF file.", gr.update(value=None, visible=False), None

    pdf_path = _resolve_pdf_path(pdf_file_obj)

    # Load environment config
    env_config = config.get_config()

    # A key entered in the UI wins; a blank or missing one falls back to the
    # environment configuration. `or ""` guards against a None textbox value.
    api_key = (ui_api_key or "").strip() or env_config.get("openrouter_api_key")

    # Validate API key before doing any work
    if not api_key:
        error_msg = "Error: OpenRouter API Key is missing. Provide it in the UI."
        return error_msg, gr.update(value=None, visible=False), None

    # Configuration for this run
    current_run_config = {
        "provider": "openrouter",
        "openrouter_api_key": api_key,
        "vlm_model": ui_vlm_model,
        "output_language": ui_lang,
        "use_markitdown": ui_use_md,
        "use_summary": ui_use_sum,
        "summary_llm_model": ui_sum_model or env_config.get("or_summary_model"),
    }

    def progress_callback(progress_value, status):
        """Forward core progress to Gradio, clamped to the [0.0, 1.0] range."""
        clamped_progress = max(0.0, min(1.0, progress_value))
        progress(clamped_progress, desc=status)

    # Run the conversion
    status_message, result_markdown = core.convert_pdf_to_markdown(
        pdf_path,
        current_run_config,
        progress_callback
    )

    # Without a result there is nothing to offer for download.
    if not result_markdown:
        return status_message, gr.update(value=None, visible=False), result_markdown

    try:
        download_filepath, download_filename = _write_download_file(pdf_path, result_markdown)
        download_button_update = gr.update(
            value=download_filepath,
            visible=True,
            label=f"Download '{download_filename}'"
        )
    except Exception as e:
        # Best effort: a failed download file must not discard the generated
        # Markdown, so report the problem in the status line instead.
        status_message += f" (Error creating download file: {str(e)})"
        download_button_update = gr.update(value=None, visible=False)

    return status_message, download_button_update, result_markdown
# Build the user interface by hand instead of using the create_ui() function.
with gr.Blocks(title="DescribePDF", theme=theme) as app:
    # Page header: poster image, project links and a short description.
    gr.Markdown(
        "<center><img src='https://davidlms.github.io/DescribePDF/assets/poster.png' alt='Describe PDF Logo' width='600px'/></center>"
    )
    gr.Markdown(
        '<div style="display: flex;align-items: center;justify-content: center">\n'
        '[<a href="https://davidlms.github.io/describepdf/">Project Page</a>] | '
        '[<a href="https://github.com/DavidLMS/describepdf">Github</a>]</div>\n'
    )
    gr.Markdown(
        "DescribePDF is an open-source tool designed to convert PDF files into detailed page-by-page descriptions in Markdown format using Vision-Language Models (VLMs). Unlike traditional PDF extraction tools that focus on replicating the text layout, DescribePDF generates rich, contextual descriptions of each page's content, making it perfect for visually complex documents like catalogs, scanned documents, and presentations."
    )

    with gr.Tabs():
        # "Generate" tab: upload and controls on the left, result on the right.
        with gr.TabItem("Generate"):
            with gr.Row():
                with gr.Column(scale=1):
                    pdf_input = gr.File(
                        label="Upload PDF", file_types=[".pdf"], type="filepath"
                    )
                    convert_button = gr.Button("Describe", variant="primary")
                    progress_output = gr.Textbox(
                        label="Progress", interactive=False, lines=2
                    )
                    # Hidden until a run produces a file to download.
                    download_button = gr.File(
                        label="Download Markdown", visible=False, interactive=False
                    )
                with gr.Column(scale=2):
                    markdown_output = gr.Markdown(label="Result (Markdown)")

        # "Settings" tab: options applied to the next run.
        with gr.TabItem("Settings"):
            gr.Markdown("Adjust settings for the *next* generation.")
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                type="password",
                placeholder="Enter your OpenRouter API key",
                value="",
            )
            vlm_model_input = gr.Dropdown(
                label="VLM Model",
                choices=SUGGESTED_VLMS,
                value=SUGGESTED_VLMS[0],
                allow_custom_value=True,
                info="Select or type the OpenRouter VLM model name",
            )
            output_language_input = gr.Dropdown(
                label="Output Language",
                choices=SUGGESTED_LANGUAGES,
                value="English",
                allow_custom_value=True,
                info="Select or type the desired output language",
            )
            with gr.Row():
                use_markitdown_checkbox = gr.Checkbox(
                    label="Use Markitdown for extra text context", value=False
                )
                use_summary_checkbox = gr.Checkbox(
                    label="Use PDF summary for augmented context", value=False
                )
            summary_llm_model_input = gr.Dropdown(
                label="LLM Model for Summary",
                choices=SUGGESTED_LLMS,
                value=SUGGESTED_LLMS[0],
                allow_custom_value=True,
                info="Select or type the OpenRouter LLM model name for summaries",
            )

    # Wire the button to generate(): the input order must match generate()'s
    # positional parameters, and the outputs match its three return values.
    conversion_inputs = [
        pdf_input,
        api_key_input,
        vlm_model_input,
        output_language_input,
        use_markitdown_checkbox,
        use_summary_checkbox,
        summary_llm_model_input,
    ]
    conversion_outputs = [progress_output, download_button, markdown_output]
    convert_button.click(
        fn=generate, inputs=conversion_inputs, outputs=conversion_outputs
    )
# For Hugging Face Spaces: start the app when this file is run as a script.
if __name__ == "__main__":
    app.launch()