Update app.py via AI Editor
Browse files
app.py
CHANGED
|
@@ -7,16 +7,16 @@ import os
|
|
| 7 |
import tempfile
|
| 8 |
import shutil
|
| 9 |
import logging
|
| 10 |
-
from flask import send_file
|
| 11 |
import threading
|
| 12 |
-
import pickle
|
| 13 |
from PyPDF2 import PdfReader, PdfWriter
|
| 14 |
import re
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# Configure logging
|
| 17 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
|
| 18 |
|
| 19 |
-
# Session storage and lock management
|
| 20 |
SESSION_DATA = {}
|
| 21 |
SESSION_LOCKS = {}
|
| 22 |
|
|
@@ -54,7 +54,6 @@ def extract_text_headers(reader, page_num):
|
|
| 54 |
try:
|
| 55 |
page = reader.pages[page_num]
|
| 56 |
text = page.extract_text() or ""
|
| 57 |
-
# Extract the first non-blank line as a potential header
|
| 58 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 59 |
header = lines[0] if lines else ""
|
| 60 |
return header
|
|
@@ -104,10 +103,8 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
|
|
| 104 |
chapter = is_chapter_header(header)
|
| 105 |
split_here = False
|
| 106 |
|
| 107 |
-
# Force split if over max size
|
| 108 |
if size >= max_mb:
|
| 109 |
split_here = True
|
| 110 |
-
# Prefer to split between min_split_mb and max_mb at logical points
|
| 111 |
elif size >= min_split_mb:
|
| 112 |
if blank or chapter or (header and header != last_header):
|
| 113 |
split_here = True
|
|
@@ -118,11 +115,9 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
|
|
| 118 |
current_writer = PdfWriter()
|
| 119 |
last_header = header
|
| 120 |
|
| 121 |
-
# Add final split if not already
|
| 122 |
if last_split_at < n_pages:
|
| 123 |
splits.append((last_split_at, n_pages))
|
| 124 |
|
| 125 |
-
# Write split files
|
| 126 |
split_files = []
|
| 127 |
for idx, (start, end) in enumerate(splits):
|
| 128 |
writer = PdfWriter()
|
|
@@ -135,11 +130,16 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
|
|
| 135 |
split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
|
| 136 |
return split_files
|
| 137 |
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
external_stylesheets = [dbc.themes.BOOTSTRAP]
|
| 140 |
app = dash.Dash(__name__, external_stylesheets=external_stylesheets, suppress_callback_exceptions=True)
|
| 141 |
server = app.server
|
| 142 |
-
|
| 143 |
app.title = "Intelligent PDF Splitter"
|
| 144 |
|
| 145 |
app.layout = dbc.Container(
|
|
@@ -169,8 +169,8 @@ app.layout = dbc.Container(
|
|
| 169 |
multiple=False,
|
| 170 |
accept='.pdf'
|
| 171 |
),
|
| 172 |
-
html.Div(id='file-info'),
|
| 173 |
-
dbc.Button("Clear Session", id='clear-session', color='secondary', className='mt-2'),
|
| 174 |
dcc.Loading(
|
| 175 |
id="loading", type="default",
|
| 176 |
children=[html.Div(id='split-results')]
|
|
@@ -196,84 +196,170 @@ app.layout = dbc.Container(
|
|
| 196 |
Input('upload-pdf', 'contents'),
|
| 197 |
State('upload-pdf', 'filename'),
|
| 198 |
Input('clear-session', 'n_clicks'),
|
|
|
|
|
|
|
| 199 |
State('session-store', 'data'),
|
| 200 |
prevent_initial_call='initial_duplicate'
|
| 201 |
)
|
| 202 |
-
def handle_upload(contents, filename, clear_n, session_data):
|
| 203 |
trigger = ctx.triggered_id
|
| 204 |
session_id = get_session_id()
|
| 205 |
flask.g.session_id = session_id
|
| 206 |
session_dir = get_session_dir(session_id)
|
| 207 |
lock = get_session_lock(session_id)
|
| 208 |
|
|
|
|
|
|
|
|
|
|
| 209 |
if trigger == 'clear-session':
|
| 210 |
clean_session(session_id)
|
| 211 |
resp_data = {}
|
| 212 |
return "", "", resp_data
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
])
|
| 226 |
-
|
| 227 |
-
|
|
|
|
|
|
|
| 228 |
|
| 229 |
-
if not
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
-
if
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
results = [
|
| 249 |
html.H5("Split Files:"),
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
f"{fi['filename']} ({fi['size']:.2f} MB) ",
|
| 253 |
-
dbc.Button("Download", id={'type': 'download-btn', 'index': idx}, href=f"/download/{session_id}/{fi['filename']}", color='primary', size='sm')
|
| 254 |
-
]) for idx, fi in enumerate(split_files)
|
| 255 |
-
])
|
| 256 |
]
|
| 257 |
-
file_info = html.Div(f"Uploaded: {filename} ({len(pdf_bytes)/1024/1024:.2f} MB)")
|
| 258 |
-
session_data = {
|
| 259 |
-
'orig_filename': filename,
|
| 260 |
-
'split_files': split_files,
|
| 261 |
-
}
|
| 262 |
-
logging.info(f"PDF split into {len(split_files)} chunks for session {session_id}")
|
| 263 |
return file_info, results, session_data
|
| 264 |
-
except Exception as e:
|
| 265 |
-
logging.error(f"Error processing PDF: {e}")
|
| 266 |
-
return html.Div(f"Error: {e}", style={'color': 'red'}), "", {}
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
|
|
|
|
|
|
| 270 |
session_dir = get_session_dir(session_id)
|
| 271 |
file_path = os.path.join(session_dir, filename)
|
| 272 |
if os.path.exists(file_path):
|
| 273 |
-
logging.info(f"Serving file {file_path} for session {session_id}")
|
| 274 |
-
return send_file(file_path, mimetype='application/
|
| 275 |
else:
|
| 276 |
-
logging.error(f"
|
| 277 |
return "File not found", 404
|
| 278 |
|
| 279 |
@app.callback(
|
|
|
|
| 7 |
import tempfile
|
| 8 |
import shutil
|
| 9 |
import logging
|
| 10 |
+
from flask import send_file
|
| 11 |
import threading
|
|
|
|
| 12 |
from PyPDF2 import PdfReader, PdfWriter
|
| 13 |
import re
|
| 14 |
+
import zipfile
|
| 15 |
+
import base64
|
| 16 |
|
| 17 |
# Configure logging
|
| 18 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
|
| 19 |
|
|
|
|
| 20 |
SESSION_DATA = {}
|
| 21 |
SESSION_LOCKS = {}
|
| 22 |
|
|
|
|
| 54 |
try:
|
| 55 |
page = reader.pages[page_num]
|
| 56 |
text = page.extract_text() or ""
|
|
|
|
| 57 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 58 |
header = lines[0] if lines else ""
|
| 59 |
return header
|
|
|
|
| 103 |
chapter = is_chapter_header(header)
|
| 104 |
split_here = False
|
| 105 |
|
|
|
|
| 106 |
if size >= max_mb:
|
| 107 |
split_here = True
|
|
|
|
| 108 |
elif size >= min_split_mb:
|
| 109 |
if blank or chapter or (header and header != last_header):
|
| 110 |
split_here = True
|
|
|
|
| 115 |
current_writer = PdfWriter()
|
| 116 |
last_header = header
|
| 117 |
|
|
|
|
| 118 |
if last_split_at < n_pages:
|
| 119 |
splits.append((last_split_at, n_pages))
|
| 120 |
|
|
|
|
| 121 |
split_files = []
|
| 122 |
for idx, (start, end) in enumerate(splits):
|
| 123 |
writer = PdfWriter()
|
|
|
|
| 130 |
split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
|
| 131 |
return split_files
|
| 132 |
|
| 133 |
+
def make_zip_of_splits(split_files, session_dir):
    """Bundle the split PDFs into one ZIP archive inside the session directory.

    Args:
        split_files: Iterable of dicts, each providing ``'path'`` (the file on
            disk) and ``'filename'`` (the name to store inside the archive).
            ``None`` or an empty iterable produces an empty archive.
        session_dir: Directory in which ``split_files.zip`` is written. An
            existing archive of that name is overwritten.

    Returns:
        The path of the archive that was written.
    """
    zip_path = os.path.join(session_dir, "split_files.zip")
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for entry in split_files or ():
            # arcname keeps the archive flat: no server-side directories leak
            # into the names the user sees after extracting.
            zipf.write(entry['path'], arcname=entry['filename'])
    return zip_path
|
| 139 |
+
|
| 140 |
external_stylesheets = [dbc.themes.BOOTSTRAP]
|
| 141 |
app = dash.Dash(__name__, external_stylesheets=external_stylesheets, suppress_callback_exceptions=True)
|
| 142 |
server = app.server
|
|
|
|
| 143 |
app.title = "Intelligent PDF Splitter"
|
| 144 |
|
| 145 |
app.layout = dbc.Container(
|
|
|
|
| 169 |
multiple=False,
|
| 170 |
accept='.pdf'
|
| 171 |
),
|
| 172 |
+
html.Div(id='file-info', className='mb-4'),
|
| 173 |
+
dbc.Button("Clear Session", id='clear-session', color='secondary', className='mt-2 mb-2'),
|
| 174 |
dcc.Loading(
|
| 175 |
id="loading", type="default",
|
| 176 |
children=[html.Div(id='split-results')]
|
|
|
|
| 196 |
Input('upload-pdf', 'contents'),
|
| 197 |
State('upload-pdf', 'filename'),
|
| 198 |
Input('clear-session', 'n_clicks'),
|
| 199 |
+
Input({'type': 'delete-upload-btn', 'index': 0}, 'n_clicks'),
|
| 200 |
+
Input('split-btn', 'n_clicks'),
|
| 201 |
State('session-store', 'data'),
|
| 202 |
prevent_initial_call='initial_duplicate'
|
| 203 |
)
|
| 204 |
+
def handle_upload(contents, filename, clear_n, delete_upload_n, split_n, session_data):
    """Drive the upload -> split -> download workflow for the current session.

    The branch taken depends on which input triggered the callback
    (``ctx.triggered_id``); when no branch matches, the view is rebuilt from
    ``session_data`` so a page refresh shows the previous state again.

    Args:
        contents: Upload payload of the form ``"<header>,<base64 data>"``.
        filename: Client-supplied name of the uploaded file.
        clear_n: Click count of the "Clear Session" button (unused; only the
            trigger id is inspected).
        delete_upload_n: Click count of the per-upload "Delete" button.
        split_n: Click count of the "Split PDF" button.
        session_data: Stored session dict (``orig_filename``, ``split_files``,
            ``zip_ready``) or ``None`` when nothing has been stored yet.

    Returns:
        A 3-tuple ``(file_info, split_results, session_data)``.
    """
    trigger = ctx.triggered_id
    session_id = get_session_id()
    flask.g.session_id = session_id
    session_dir = get_session_dir(session_id)
    lock = get_session_lock(session_id)

    if session_data is None:
        session_data = {}

    if trigger == 'clear-session':
        clean_session(session_id)
        return "", "", {}

    # The Delete/Split buttons are created by this callback with n_clicks=0.
    # Requiring a non-zero click count keeps the destructive branches from
    # running when the callback fires merely because the buttons were just
    # inserted into the layout.
    if trigger == {'type': 'delete-upload-btn', 'index': 0} and delete_upload_n:
        # Wipe every regular file of the session (upload, chunks, zip).
        # Directories are skipped: os.remove() raises on them.
        if os.path.isdir(session_dir):
            for entry in os.listdir(session_dir):
                entry_path = os.path.join(session_dir, entry)
                if os.path.isfile(entry_path):
                    os.remove(entry_path)
        return "", "", {}

    if trigger == 'upload-pdf':
        if not contents:
            return "", "", {}

        if not allowed_file(filename):
            return html.Div("Only .pdf files are allowed.", style={'color': 'red'}), "", {}

        try:
            # The name comes from the browser: keep only its last component so
            # it cannot point outside the session directory.
            safe_name = os.path.basename(filename.replace('\\', '/'))
            _, b64data = contents.split(',', 1)
            pdf_bytes = base64.b64decode(b64data)
            pdf_path = os.path.join(session_dir, safe_name)
            with open(pdf_path, 'wb') as f:
                f.write(pdf_bytes)
            logging.info(f"PDF uploaded and saved to {pdf_path} for session {session_id}")

            session_data = {
                'orig_filename': safe_name,
                'split_files': None,
                'zip_ready': False,
            }
            label = f"Uploaded: {safe_name} ({len(pdf_bytes)/1024/1024:.2f} MB)"
            return _upload_info_row(label), _split_button_row(), session_data
        except Exception as e:
            logging.error(f"Error processing PDF: {e}")
            return html.Div(f"Error: {e}", style={'color': 'red'}), "", {}

    # Must be handled BEFORE the restore logic below: right after an upload
    # the session has a file but no split results, which is exactly the state
    # the restore branch matches, so it would swallow the Split click.
    if trigger == 'split-btn' and split_n:
        orig_filename = session_data.get('orig_filename')
        if not orig_filename:
            return html.Div("No file to split.", style={'color': 'red'}), "", session_data
        pdf_path = os.path.join(session_dir, orig_filename)
        if not os.path.exists(pdf_path):
            return html.Div("Uploaded file not found. Please upload again.", style={'color': 'red'}), "", {}
        try:
            with lock:
                split_files = intelligent_pdf_split(pdf_path, session_dir)
                make_zip_of_splits(split_files, session_dir)
            session_data['split_files'] = split_files
            session_data['zip_ready'] = True
            logging.info(f"PDF split into {len(split_files)} chunks for session {session_id}, zip ready.")
            return (
                _upload_info_row(f"Uploaded: {orig_filename}"),
                _split_results_view(split_files, session_id),
                session_data,
            )
        except Exception as e:
            logging.error(f"Error splitting PDF: {e}")
            return html.Div(f"Error: {e}", style={'color': 'red'}), "", session_data

    # No actionable trigger (e.g. page refresh): rebuild the view from the
    # stored session state.
    split_files = session_data.get('split_files')
    if split_files:
        orig_filename = session_data.get('orig_filename', '')
        return (
            _upload_info_row(f"Uploaded: {orig_filename}"),
            _split_results_view(split_files, session_id),
            session_data,
        )

    if session_data.get('orig_filename'):
        # Uploaded but not split yet: show the file info and the Split button.
        return (
            _upload_info_row(f"Uploaded: {session_data['orig_filename']}"),
            _split_button_row(),
            session_data,
        )

    return "", "", session_data


def _upload_info_row(label):
    """Build the row showing the upload label next to its Delete button."""
    return dbc.Row([
        dbc.Col(html.Div(label), width=9, style={'display': 'flex', 'alignItems': 'center'}),
        dbc.Col(
            dbc.Button("Delete", id={'type': 'delete-upload-btn', 'index': 0}, color='danger', n_clicks=0, className='ms-5'),
            width=3, style={'display': 'flex', 'justifyContent': 'end'}
        )
    ], className='mb-3', align='center', style={'marginTop': "15px", 'marginBottom': '25px'})


def _split_button_row():
    """Build the centered row holding the "Split PDF" button."""
    return dbc.Row([
        dbc.Col(
            dbc.Button("Split PDF", id='split-btn', color='primary', className='mb-3 mt-2', n_clicks=0, style={'width': '180px', 'fontWeight': 'bold'}),
            width=12, style={'display': 'flex', 'justifyContent': 'center'}
        )
    ])


def _split_results_view(split_files, session_id):
    """Build the list of split chunks followed by the "Download All (ZIP)" button."""
    split_files_list = html.Ul([
        html.Li([
            f"{fi['filename']} ({fi['size']:.2f} MB)"
        ]) for fi in split_files
    ])
    download_zip_btn = dbc.Button(
        "Download All (ZIP)", color="primary", size="lg", className='mb-3 mt-4',
        href=f"/download_zip/{session_id}/split_files.zip"
    )
    return [
        html.H5("Split Files:"),
        split_files_list,
        html.Div(download_zip_btn, style={'marginTop': '30px'})
    ]
|
| 353 |
+
|
| 354 |
+
@app.server.route('/download_zip/<session_id>/<filename>')
def download_zip_file(session_id, filename):
    """Serve a file from a session directory as a ZIP attachment.

    Args:
        session_id: Session identifier taken from the URL.
        filename: Name of the file inside the session directory, taken from
            the URL.

    Returns:
        The file as an ``application/zip`` attachment, or
        ``("File not found", 404)`` when the name does not resolve to a
        regular file inside the session directory.
    """
    # NOTE(review): session_id is also client-controlled and is passed to
    # get_session_dir() unchecked here - confirm that helper validates it.
    session_dir = get_session_dir(session_id)
    file_path = os.path.join(session_dir, filename)

    # Both URL parts are untrusted: resolve the path and require a regular
    # file located inside the session directory. os.path.exists() alone also
    # accepts ".." and directories, which send_file() cannot serve.
    session_root = os.path.realpath(session_dir)
    resolved_path = os.path.realpath(file_path)
    inside_session = resolved_path.startswith(session_root + os.sep)

    if inside_session and os.path.isfile(resolved_path):
        logging.info(f"Serving zip file {file_path} for session {session_id}")
        return send_file(resolved_path, mimetype='application/zip', as_attachment=True, download_name=filename)

    logging.error(f"ZIP file not found for download: {file_path}")
    return "File not found", 404
|
| 364 |
|
| 365 |
@app.callback(
|