File size: 7,769 Bytes
b499397
12eacd9
b499397
 
fe3edea
 
12eacd9
fe3edea
9fab3a2
12eacd9
fe3edea
9fab3a2
 
b499397
9fab3a2
 
 
 
 
 
b499397
9fab3a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b499397
e1f64ae
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
Entry point for DescribePDF Hugging Face Space.
"""

import gradio as gr
import os
import sys

# Make sure the directory containing this file is on the import path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Import only the parts that are needed
from describepdf import config, core

# UI theme: Gradio "Soft" theme with a red/rose palette and large spacing.
theme = gr.themes.Soft(primary_hue="red", secondary_hue="rose", spacing_size="lg")

# Module-level defaults offered as suggestions in the Settings dropdowns.

# Vision-language models suggested for describing pages.
SUGGESTED_VLMS = [
    "qwen/qwen2.5-vl-72b-instruct",
    "google/gemini-2.5-pro-preview-03-25",
    "openai/chatgpt-4o-latest",
]

# Text models suggested for the optional summary step.
SUGGESTED_LLMS = [
    "google/gemini-2.5-flash-preview",
    "openai/chatgpt-4o-latest",
    "anthropic/claude-3.5-sonnet",
]

# Output languages suggested in the UI.
SUGGESTED_LANGUAGES = [
    "English",
    "Spanish",
    "French",
    "German",
    "Chinese",
    "Japanese",
    "Italian",
    "Portuguese",
    "Russian",
    "Korean",
]

def generate(
    pdf_file_obj,
    ui_api_key,
    ui_vlm_model,
    ui_lang,
    ui_use_md,
    ui_use_sum,
    ui_sum_model,
    progress=gr.Progress()
):
    """Run the PDF-to-Markdown conversion for the Gradio UI.

    Builds a per-run configuration from the UI values (falling back to the
    environment configuration for the API key and the summary model), calls
    ``core.convert_pdf_to_markdown`` and, when Markdown was produced, writes
    it to a temporary ``.md`` file that is offered for download.

    Args:
        pdf_file_obj: The uploaded PDF. Either a path string or an object
            exposing the path through a ``name`` attribute; ``None`` when
            nothing was uploaded.
        ui_api_key: OpenRouter API key typed in the UI. When empty or
            ``None``, ``"openrouter_api_key"`` from the environment
            configuration is used instead.
        ui_vlm_model: Model name stored as ``"vlm_model"`` in the run config.
        ui_lang: Value stored as ``"output_language"`` in the run config.
        ui_use_md: Value stored as ``"use_markitdown"`` in the run config.
        ui_use_sum: Value stored as ``"use_summary"`` in the run config.
        ui_sum_model: Model name for the summary step. When falsy,
            ``"or_summary_model"`` from the environment configuration is used.
        progress: Gradio progress tracker used to report conversion progress.

    Returns:
        A 3-tuple ``(status_message, download_update, markdown)`` where
        ``download_update`` is a ``gr.update(...)`` for the download
        component (hidden unless a download file was written) and
        ``markdown`` is the conversion result (``None`` on early exit).
    """
    if pdf_file_obj is None:
        return "Please upload a PDF file.", gr.update(value=None, visible=False), None

    # Accept both a plain path string and an object carrying the path in
    # ``.name`` so the handler does not depend on how the file is delivered.
    pdf_path = getattr(pdf_file_obj, "name", pdf_file_obj)

    # Load environment config
    env_config = config.get_config()

    # A key typed in the UI wins; a missing (None) or blank key falls back to
    # the environment configuration instead of raising on ``None.strip()``.
    typed_key = (ui_api_key or "").strip()
    api_key = typed_key or env_config.get("openrouter_api_key")

    # Validate API key before doing any work
    if not api_key:
        error_msg = "Error: OpenRouter API Key is missing. Provide it in the UI."
        return error_msg, gr.update(value=None, visible=False), None

    # Prepare configuration for this run
    current_run_config = {
        "provider": "openrouter",
        "openrouter_api_key": api_key,
        "vlm_model": ui_vlm_model,
        "output_language": ui_lang,
        "use_markitdown": ui_use_md,
        "use_summary": ui_use_sum,
        "summary_llm_model": ui_sum_model if ui_sum_model else env_config.get("or_summary_model")
    }

    # Progress callback handed to the core: clamps the value to [0.0, 1.0]
    # before forwarding it to the Gradio progress tracker.
    def progress_callback(progress_value, status):
        clamped_progress = max(0.0, min(1.0, progress_value))
        progress(clamped_progress, desc=status)

    # Run the conversion
    status_message, result_markdown = core.convert_pdf_to_markdown(
        pdf_path,
        current_run_config,
        progress_callback
    )

    # Nothing to download when the conversion produced no Markdown
    if not result_markdown:
        return status_message, gr.update(value=None, visible=False), result_markdown

    # Handle the download file
    try:
        import secrets
        import tempfile

        # Derive the file names from the uploaded PDF's base name
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        download_filename = f"{base_name}_description.md"

        # A random suffix keeps files from different runs from colliding in
        # the shared temporary directory.
        random_suffix = secrets.token_hex(4)
        download_filepath = os.path.join(
            tempfile.gettempdir(), f"{base_name}_{random_suffix}.md"
        )

        # Write markdown result to the temporary file
        with open(download_filepath, "w", encoding="utf-8") as md_file:
            md_file.write(result_markdown)

        download_button_update = gr.update(
            value=download_filepath,
            visible=True,
            label=f"Download '{download_filename}'"
        )
    except Exception as e:
        # Best effort: the Markdown is still returned for display even when
        # the download file could not be created.
        status_message += f" (Error creating download file: {e})"
        download_button_update = gr.update(value=None, visible=False)

    return status_message, download_button_update, result_markdown

# Build the user interface by hand instead of using the create_ui() function
with gr.Blocks(title="DescribePDF", theme=theme) as app:
    # Header: poster image, project links and a short description
    gr.Markdown("<center><img src='https://davidlms.github.io/DescribePDF/assets/poster.png' alt='Describe PDF Logo' width='600px'/></center>")
    gr.Markdown(
        """<div style="display: flex;align-items: center;justify-content: center">
        [<a href="https://davidlms.github.io/describepdf/">Project Page</a>] | [<a href="https://github.com/DavidLMS/describepdf">Github</a>]</div>
        """
    )
    gr.Markdown(
        "DescribePDF is an open-source tool designed to convert PDF files into detailed page-by-page descriptions in Markdown format using Vision-Language Models (VLMs). Unlike traditional PDF extraction tools that focus on replicating the text layout, DescribePDF generates rich, contextual descriptions of each page's content, making it perfect for visually complex documents like catalogs, scanned documents, and presentations."
    )

    with gr.Tabs():
        # "Generate" tab: upload and controls on the left, result on the right
        with gr.TabItem("Generate"), gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'], type="filepath")
                convert_button = gr.Button("Describe", variant="primary")
                progress_output = gr.Textbox(label="Progress", interactive=False, lines=2)
                # Hidden until a run produces a file to download
                download_button = gr.File(label="Download Markdown", visible=False, interactive=False)

            with gr.Column(scale=2):
                markdown_output = gr.Markdown(label="Result (Markdown)")

        # "Settings" tab: options read when the next run starts
        with gr.TabItem("Settings"):
            gr.Markdown("Adjust settings for the *next* generation.")
            api_key_input = gr.Textbox(
                label="OpenRouter API Key",
                type="password",
                placeholder="Enter your OpenRouter API key",
                value="",
            )
            vlm_model_input = gr.Dropdown(
                label="VLM Model",
                choices=SUGGESTED_VLMS,
                value=SUGGESTED_VLMS[0],
                allow_custom_value=True,
                info="Select or type the OpenRouter VLM model name",
            )
            output_language_input = gr.Dropdown(
                label="Output Language",
                choices=SUGGESTED_LANGUAGES,
                value="English",
                allow_custom_value=True,
                info="Select or type the desired output language",
            )
            with gr.Row():
                use_markitdown_checkbox = gr.Checkbox(label="Use Markitdown for extra text context", value=False)
                use_summary_checkbox = gr.Checkbox(label="Use PDF summary for augmented context", value=False)
            summary_llm_model_input = gr.Dropdown(
                label="LLM Model for Summary",
                choices=SUGGESTED_LLMS,
                value=SUGGESTED_LLMS[0],
                allow_custom_value=True,
                info="Select or type the OpenRouter LLM model name for summaries",
            )

    # Wire the button to generate(); the input order matches its parameters
    conversion_inputs = [
        pdf_input,
        api_key_input,
        vlm_model_input,
        output_language_input,
        use_markitdown_checkbox,
        use_summary_checkbox,
        summary_llm_model_input,
    ]
    # The output order matches the 3-tuple returned by generate()
    conversion_outputs = [progress_output, download_button, markdown_output]
    convert_button.click(fn=generate, inputs=conversion_inputs, outputs=conversion_outputs)

# Start the Gradio app when this file is run as a script
# (e.g. as the Hugging Face Space entry point).
if __name__ == "__main__":
    app.launch()