Spaces:
Sleeping
Sleeping
File size: 8,443 Bytes
4220939 cf5a0c5 4220939 cf5a0c5 4220939 cf5a0c5 4220939 cf5a0c5 4220939 cf5a0c5 4220939 cf5a0c5 4220939 cf5a0c5 4220939 cf5a0c5 4220939 cf5a0c5 4220939 cf5a0c5 4220939 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<title>PDF to Markdown Converter (Streaming)</title>
<style>
/* ---------- Page chrome ---------- */
body {
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
  margin: 0;
  background-color: #f0f2f5;
  color: #1c1e21;
  line-height: 1.5;
}

.navbar {
  background-color: #1877f2;
  padding: 10px 20px;
  color: white;
  text-align: center;
}

.navbar h1 {
  margin: 0;
  font-size: 1.8em;
}

/* Centered white card that holds the form and the results. */
.container {
  max-width: 800px;
  margin: 20px auto;
  background-color: #fff;
  padding: 25px;
  border-radius: 8px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1), 0 8px 16px rgba(0,0,0,0.1);
}

p {
  margin-bottom: 1em;
}

/* ---------- Form controls ---------- */
label {
  display: block;
  margin-top: 15px;
  margin-bottom: 5px;
  font-weight: 600;
  color: #4b4f56;
}

/* The 22px subtracted from the width equals the horizontal padding (2 x 10px) plus the border (2 x 1px). */
input[type="file"],
input[type="text"] {
  width: calc(100% - 22px);
  padding: 10px;
  margin-top: 5px;
  border: 1px solid #dddfe2;
  border-radius: 6px;
  font-size: 1em;
}

/* Must stay after the shared rule above so this padding overrides it. */
input[type="file"] {
  padding: 7px;
}

#submitBtn {
  background-color: #1877f2;
  color: white;
  padding: 10px 20px;
  border: none;
  border-radius: 6px;
  cursor: pointer;
  margin-top: 25px;
  font-size: 1.1em;
  font-weight: bold;
}

#submitBtn:hover {
  background-color: #166fe5;
}

#submitBtn:disabled {
  background-color: #a0a0a0;
  cursor: not-allowed;
}

/* ---------- Feedback areas ---------- */
.message {
  margin-top: 20px;
  padding: 12px;
  border-radius: 6px;
  font-size: 0.95em;
}

.error {
  background-color: #f8d7da;
  border: 1px solid #f5c2c7;
  color: #842029;
}

#statusArea {
  background-color: #e7f3ff;
  border: 1px solid #cfe2ff;
  color: #055160;
  margin-top: 20px;
  padding: 10px;
  min-height: 50px;
  border-radius: 6px;
}

#statusArea p {
  margin: 5px 0;
}

/* pre-wrap keeps the Markdown's own line breaks while still wrapping long lines. */
#markdownOutput {
  background-color: #f5f6f7;
  padding: 15px;
  border: 1px solid #e0e0e0;
  border-radius: 6px;
  white-space: pre-wrap;
  word-wrap: break-word;
  margin-top: 20px;
  font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace;
  font-size: 0.9em;
  line-height: 1.6;
  min-height: 100px;
}

/* ---------- Layout helpers ---------- */
.or-separator {
  text-align: center;
  margin: 20px 0;
  font-weight: bold;
  color: #606770;
}

.form-actions {
  text-align: center;
}

.footer {
  text-align: center;
  margin-top: 30px;
  font-size: 0.85em;
  color: #606770;
}
</style>
</head>
<body>
<!-- Page header bar. -->
<div class="navbar">
<h1>PDF to Markdown Converter (Streaming)</h1>
</div>
<!-- Main card: input form, streamed status log, and Markdown result. -->
<div class="container">
<p>Upload a PDF file or provide a URL to convert it to Markdown. Progress will be streamed.</p>
<!-- Hidden by default; the inline script fills this in and shows it when a request or stream error is reported. -->
<div id="globalError" class="message error" style="display:none;"></div>
<!-- The inline script builds a FormData from this form, so the field names pdf_file and pdf_url are what the server receives. -->
<form id="pdfForm">
<div>
<label for="pdf_file">Upload PDF File:</label>
<input type="file" name="pdf_file" id="pdf_file" accept=".pdf">
</div>
<div class="or-separator">OR</div>
<div>
<label for="pdf_url">Enter PDF URL:</label>
<input type="text" name="pdf_url" id="pdf_url" placeholder="e.g., https://arxiv.org/pdf/1706.03762.pdf">
</div>
<div class="form-actions">
<!-- type="button": clicking it does not submit the form natively; the inline script's click handler starts the conversion. -->
<button type="button" id="submitBtn">Convert to Markdown</button>
</div>
</form>
<h2>Processing Status:</h2>
<!-- The inline script appends one <p> here per status message received from the server stream. -->
<div id="statusArea">
<p>Waiting for input...</p>
</div>
<h2>Markdown Output:</h2>
<!-- The inline script writes the streamed Markdown here as plain text (textContent), so it is shown as source, not rendered. -->
<pre id="markdownOutput">Output will appear here...</pre>
</div>
<div class="footer">
<p>Powered by Flask, Poppler, Tesseract, and Hugging Face.</p>
</div>
<script>
// Client-side controller for the PDF-to-Markdown form.
//
// It POSTs the form to the streaming endpoint and renders the response as it
// arrives. The response body is newline-delimited JSON; each line is one event
// object whose `type` is one of:
//   status           -> { message }  appended to the status log
//   markdown_chunk   -> { content }  appended to the Markdown output
//   image_md         -> { content }  appended to the Markdown output
//   markdown_replace -> { content }  replaces the whole Markdown output
//   error            -> { message }  shown in the status log and the error banner
//   final_status     -> { message }  appended to the status log in bold
const form = document.getElementById('pdfForm');
const submitBtn = document.getElementById('submitBtn');
const statusArea = document.getElementById('statusArea');
const markdownOutput = document.getElementById('markdownOutput');
const globalError = document.getElementById('globalError');

/**
 * Append one paragraph to the status log and keep the log scrolled to the end.
 * The text is assigned through textContent, so server-supplied messages are
 * never interpreted as HTML.
 * @param {string} message - Text to display.
 * @param {{bold?: boolean, color?: string}} [options] - Optional emphasis and text color.
 */
function appendStatus(message, { bold = false, color = '' } = {}) {
  const paragraph = document.createElement('p');
  if (color) {
    paragraph.style.color = color;
  }
  if (bold) {
    const strong = document.createElement('strong');
    strong.textContent = message;
    paragraph.appendChild(strong);
  } else {
    paragraph.textContent = message;
  }
  statusArea.appendChild(paragraph);
  statusArea.scrollTop = statusArea.scrollHeight;
}

/**
 * Show the error banner above the form with the given text.
 * @param {string} text - Message to display.
 */
function showGlobalError(text) {
  globalError.textContent = text;
  globalError.style.display = 'block';
}

/**
 * Apply one decoded stream event to the page. Unknown event types are ignored.
 * @param {{type: string, message?: string, content?: string}} data - Parsed event.
 */
function handleStreamEvent(data) {
  switch (data.type) {
    case 'status':
      appendStatus(data.message);
      break;
    case 'markdown_chunk':
    case 'image_md':
      markdownOutput.textContent += data.content;
      break;
    case 'markdown_replace':
      markdownOutput.textContent = data.content;
      break;
    case 'error':
      appendStatus(`ERROR: ${data.message}`, { color: 'red' });
      showGlobalError(`An error occurred: ${data.message}`);
      break;
    case 'final_status':
      appendStatus(data.message, { bold: true });
      break;
    default:
      break;
  }
}

/**
 * Parse one complete line of the stream and dispatch it.
 * Blank lines are skipped. A malformed line is logged and skipped so that one
 * bad event does not abort the rest of the stream.
 * @param {string} line - One newline-delimited record, without the newline.
 */
function processLine(line) {
  if (line.trim() === '') {
    return;
  }
  try {
    handleStreamEvent(JSON.parse(line));
  } catch (err) {
    console.warn('Failed to parse JSON chunk:', line, err);
  }
}

/**
 * Submit the form to the streaming endpoint and render events as they arrive.
 * The submit button is disabled for the duration of the request and doubles as
 * the "already running" flag. All failures are reported in the page; the
 * returned promise does not reject.
 * @returns {Promise<void>}
 */
async function runConversion() {
  if (submitBtn.disabled) {
    return; // A conversion is already in progress.
  }
  submitBtn.disabled = true;
  statusArea.innerHTML = '<p>Starting processing...</p>';
  markdownOutput.textContent = 'Processing...';
  globalError.style.display = 'none';

  try {
    const formData = new FormData(form);
    const response = await fetch("{{ url_for('process_pdf_stream') }}", {
      method: 'POST',
      body: formData,
    });
    if (!response.ok) {
      // HTTP-level failure before any streaming started (e.g. 400 or 500).
      const errorText = await response.text();
      throw new Error(`Server error: ${response.status} ${response.statusText}. ${errorText}`);
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    markdownOutput.textContent = ''; // Clear the "Processing..." placeholder.

    // Network chunk boundaries are unrelated to line boundaries, so a JSON
    // record can be split across two reads. Keep the unterminated tail in
    // `pending` and only parse lines that have been fully received.
    let pending = '';
    while (true) {
      const { value, done } = await reader.read();
      if (done) {
        break;
      }
      pending += decoder.decode(value, { stream: true });
      const lines = pending.split('\n');
      pending = lines.pop();
      for (const line of lines) {
        processLine(line);
      }
    }
    // Flush any bytes the decoder is still holding, then handle a final
    // record that was not terminated by a newline.
    pending += decoder.decode();
    processLine(pending);
    appendStatus('Processing complete.', { bold: true });
  } catch (error) {
    console.error('Fetch error:', error);
    // error.message may contain the raw server response body, so build the
    // nodes explicitly instead of interpolating it into innerHTML.
    const failure = document.createElement('p');
    failure.style.color = 'red';
    const label = document.createElement('strong');
    label.textContent = 'Processing failed:';
    failure.appendChild(label);
    failure.appendChild(document.createTextNode(` ${error.message}`));
    statusArea.textContent = '';
    statusArea.appendChild(failure);
    markdownOutput.textContent = 'Error occurred.';
    showGlobalError(`An error occurred during the request: ${error.message}`);
  } finally {
    submitBtn.disabled = false;
  }
}

submitBtn.addEventListener('click', (event) => {
  event.preventDefault();
  // runConversion handles its own errors, so the promise is intentionally not awaited.
  void runConversion();
});

// Pressing Enter in the URL field triggers native form submission, which would
// reload the page. Intercept it and run the same conversion instead.
form.addEventListener('submit', (event) => {
  event.preventDefault();
  void runConversion();
});
</script>
</body>
</html> |