File size: 8,443 Bytes
4220939
 
 
 
 
cf5a0c5
4220939
 
 
 
 
 
 
 
 
cf5a0c5
 
 
4220939
 
cf5a0c5
 
 
4220939
 
 
 
 
 
 
cf5a0c5
4220939
 
cf5a0c5
4220939
cf5a0c5
4220939
cf5a0c5
4220939
 
 
 
 
 
 
 
 
 
cf5a0c5
4220939
 
 
cf5a0c5
 
 
 
 
 
 
4220939
 
 
 
cf5a0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4220939
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <title>PDF to Markdown Converter (Streaming)</title>
    <style>
        /* ---- Page chrome ---- */
        body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; margin: 0; background-color: #f0f2f5; color: #1c1e21; line-height: 1.5; }
        .navbar { background-color: #1877f2; padding: 10px 20px; color: white; text-align: center; }
        .navbar h1 { margin: 0; font-size: 1.8em; }
        .container { max-width: 800px; margin: 20px auto; background-color: #fff; padding: 25px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1), 0 8px 16px rgba(0,0,0,0.1); }
        p { margin-bottom: 1em; }

        /* ---- Form controls ---- */
        label { display: block; margin-top: 15px; margin-bottom: 5px; font-weight: 600; color: #4b4f56; }
        /* 22px = 2 x 10px horizontal padding + 2 x 1px border, so the content-box input fills its row exactly. */
        input[type="file"], input[type="text"] { width: calc(100% - 22px); padding: 10px; margin-top: 5px; border: 1px solid #dddfe2; border-radius: 6px; font-size: 1em; }
        input[type="file"] { padding: 7px; }
        #submitBtn { background-color: #1877f2; color: white; padding: 10px 20px; border: none; border-radius: 6px; cursor: pointer; margin-top: 25px; font-size: 1.1em; font-weight: bold; }
        #submitBtn:hover { background-color: #166fe5; }
        #submitBtn:disabled { background-color: #a0a0a0; cursor: not-allowed; }

        /* ---- Feedback areas ---- */
        .message { margin-top: 20px; padding: 12px; border-radius: 6px; font-size: 0.95em; }
        .error { background-color: #f8d7da; border: 1px solid #f5c2c7; color: #842029; }
        /* The status log is capped and scrollable: the page script scrolls it to the newest
           entry, which only has an effect when the element can actually overflow. */
        #statusArea { background-color: #e7f3ff; border: 1px solid #cfe2ff; color: #055160; margin-top: 20px; padding: 10px; min-height: 50px; max-height: 300px; overflow-y: auto; border-radius: 6px; }
        #statusArea p { margin: 5px 0; }
        /* pre-wrap keeps the Markdown's own line breaks; overflow-wrap (the standard name of the
           legacy word-wrap alias) breaks long unspaced tokens such as URLs instead of overflowing. */
        #markdownOutput { background-color: #f5f6f7; padding: 15px; border: 1px solid #e0e0e0; border-radius: 6px; white-space: pre-wrap; overflow-wrap: break-word; margin-top: 20px; font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace; font-size: 0.9em; line-height: 1.6; min-height: 100px; }

        /* ---- Layout helpers ---- */
        .or-separator { text-align: center; margin: 20px 0; font-weight: bold; color: #606770; }
        .form-actions { text-align: center; }
        .footer { text-align: center; margin-top: 30px; font-size: 0.85em; color: #606770; }
    </style>
</head>
<body>
    <!-- Page banner. -->
    <header class="navbar">
        <h1>PDF to Markdown Converter (Streaming)</h1>
    </header>

    <!-- Primary content: input form, streamed status log, and Markdown result. -->
    <main class="container">
        <p>Upload a PDF file or provide a URL to convert it to Markdown. Progress will be streamed.</p>

        <!-- Error banner. The page script fills its text and toggles the inline display value,
             so the inline style is kept; role="alert" makes the message announced when shown. -->
        <div id="globalError" class="message error" role="alert" style="display:none;"></div>

        <!-- The form is never submitted natively: the page script reads its fields via FormData
             and posts them itself. Field names pdf_file / pdf_url are what gets sent. -->
        <form id="pdfForm">
            <div>
                <label for="pdf_file">Upload PDF File:</label>
                <input type="file" name="pdf_file" id="pdf_file" accept=".pdf,application/pdf">
            </div>
            <div class="or-separator">OR</div>
            <div>
                <label for="pdf_url">Enter PDF URL:</label>
                <input type="text" name="pdf_url" id="pdf_url" inputmode="url" placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf">
            </div>
            <div class="form-actions">
                <button type="button" id="submitBtn">Convert to Markdown</button>
            </div>
        </form>

        <h2 id="statusHeading">Processing Status:</h2>
        <!-- role="log": new entries are announced politely, one at a time, as the script appends them. -->
        <div id="statusArea" role="log" aria-labelledby="statusHeading">
            <p>Waiting for input...</p>
        </div>

        <h2>Markdown Output:</h2>
        <pre id="markdownOutput">Output will appear here...</pre>
    </main>

    <footer class="footer">
        <p>Powered by Flask, Poppler, Tesseract, and Hugging Face.</p>
    </footer>

    <script>
        // Cached references to the page elements this script reads and updates.
        const form = document.getElementById('pdfForm');
        const submitBtn = document.getElementById('submitBtn');
        const statusArea = document.getElementById('statusArea');
        const markdownOutput = document.getElementById('markdownOutput');
        const globalError = document.getElementById('globalError');

        /**
         * Append one paragraph to the status log and scroll the log to it.
         *
         * The text is always assigned through textContent, never parsed as HTML,
         * because messages come from the server and may echo document content.
         *
         * @param {string} message - Text to show.
         * @param {{strong?: boolean, color?: string}} [options] - Wrap the text in
         *     <strong> and/or set an inline text color.
         */
        function appendStatus(message, options = {}) {
            const paragraph = document.createElement('p');
            const textHost = options.strong
                ? paragraph.appendChild(document.createElement('strong'))
                : paragraph;
            textHost.textContent = message;
            if (options.color) {
                paragraph.style.color = options.color;
            }
            statusArea.appendChild(paragraph);
            statusArea.scrollTop = statusArea.scrollHeight; // Auto-scroll
        }

        /**
         * Handle one complete line of the response stream.
         *
         * Each non-blank line is expected to be a JSON object with a "type" field:
         * status / error / final_status carry "message"; markdown_chunk, image_md
         * and markdown_replace carry "content". Lines that are not valid JSON
         * objects are logged and skipped; unknown types are ignored.
         *
         * @param {string} line - One newline-delimited record from the stream.
         */
        function handleStreamLine(line) {
            if (line.trim() === '') return;

            let data;
            try {
                data = JSON.parse(line);
            } catch (e) {
                console.warn('Failed to parse JSON chunk:', line, e);
                return;
            }
            // Valid JSON that is not an object (null, a number, ...) carries no event.
            if (data === null || typeof data !== 'object') {
                console.warn('Failed to parse JSON chunk:', line);
                return;
            }

            switch (data.type) {
                case 'status':
                    appendStatus(data.message);
                    break;
                case 'markdown_chunk':
                case 'image_md':
                    markdownOutput.textContent += data.content;
                    break;
                case 'markdown_replace':
                    markdownOutput.textContent = data.content; // For initial title or full rewrite
                    break;
                case 'error':
                    appendStatus('ERROR: ' + data.message, { color: 'red' });
                    globalError.textContent = 'An error occurred: ' + data.message;
                    globalError.style.display = 'block';
                    break;
                case 'final_status':
                    appendStatus(data.message, { strong: true });
                    break;
                default:
                    break;
            }
        }

        /**
         * Post the form to the streaming endpoint and render events as they arrive.
         *
         * Bound to both the button click and the form's submit event, so pressing
         * Enter in the URL field starts a conversion instead of natively submitting
         * the form (which would reload the page).
         *
         * @param {Event} event - The triggering click or submit event.
         */
        async function convertPdf(event) {
            event.preventDefault();
            // A disabled button means a conversion is already in flight (reachable via Enter).
            if (submitBtn.disabled) return;

            submitBtn.disabled = true;
            statusArea.innerHTML = '<p>Starting processing...</p>';
            markdownOutput.textContent = 'Processing...';
            globalError.style.display = 'none';

            const formData = new FormData(form);

            try {
                const response = await fetch("{{ url_for('process_pdf_stream') }}", {
                    method: 'POST',
                    body: formData,
                });

                if (!response.ok) {
                    // Handle initial HTTP errors before streaming starts (e.g., 400, 500 from Flask before yield)
                    const errorText = await response.text();
                    throw new Error(`Server error: ${response.status} ${response.statusText}. ${errorText}`);
                }

                // Process the streamed response
                const reader = response.body.getReader();
                const decoder = new TextDecoder();
                markdownOutput.textContent = ''; // Clear previous output

                // Network chunks do not align with line boundaries, so the trailing
                // partial line is carried over until the rest of it arrives.
                let pending = '';
                while (true) {
                    const { value, done } = await reader.read();
                    // On the final read, decode() with no arguments flushes buffered bytes.
                    pending += done ? decoder.decode() : decoder.decode(value, { stream: true });

                    const lines = pending.split('\n');
                    // Mid-stream the last piece may be incomplete; at end of stream it is final.
                    pending = done ? '' : lines.pop();
                    lines.forEach(handleStreamLine);

                    if (done) break;
                }
                appendStatus('Processing complete.', { strong: true });

            } catch (error) {
                console.error('Fetch error:', error);

                // Built with DOM nodes: error.message can contain the server's response body.
                const failure = document.createElement('p');
                failure.style.color = 'red';
                const heading = document.createElement('strong');
                heading.textContent = 'Processing failed:';
                failure.appendChild(heading);
                failure.appendChild(document.createTextNode(' ' + error.message));
                statusArea.textContent = '';
                statusArea.appendChild(failure);

                markdownOutput.textContent = 'Error occurred.';
                globalError.textContent = `An error occurred during the request: ${error.message}`;
                globalError.style.display = 'block';
            } finally {
                submitBtn.disabled = false;
            }
        }

        submitBtn.addEventListener('click', convertPdf);
        form.addEventListener('submit', convertPdf);
    </script>
</body>
</html>