bluenevus committed on
Commit
a2ffbec
·
1 Parent(s): d03cd92

Update app.py via AI Editor

Browse files
Files changed (1) hide show
  1. app.py +149 -63
app.py CHANGED
@@ -7,16 +7,16 @@ import os
7
  import tempfile
8
  import shutil
9
  import logging
10
- from flask import send_file, make_response
11
  import threading
12
- import pickle
13
  from PyPDF2 import PdfReader, PdfWriter
14
  import re
 
 
15
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
18
 
19
- # Session storage and lock management
20
  SESSION_DATA = {}
21
  SESSION_LOCKS = {}
22
 
@@ -54,7 +54,6 @@ def extract_text_headers(reader, page_num):
54
  try:
55
  page = reader.pages[page_num]
56
  text = page.extract_text() or ""
57
- # Extract the first non-blank line as a potential header
58
  lines = [line.strip() for line in text.split('\n') if line.strip()]
59
  header = lines[0] if lines else ""
60
  return header
@@ -104,10 +103,8 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
104
  chapter = is_chapter_header(header)
105
  split_here = False
106
 
107
- # Force split if over max size
108
  if size >= max_mb:
109
  split_here = True
110
- # Prefer to split between min_split_mb and max_mb at logical points
111
  elif size >= min_split_mb:
112
  if blank or chapter or (header and header != last_header):
113
  split_here = True
@@ -118,11 +115,9 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
118
  current_writer = PdfWriter()
119
  last_header = header
120
 
121
- # Add final split if not already
122
  if last_split_at < n_pages:
123
  splits.append((last_split_at, n_pages))
124
 
125
- # Write split files
126
  split_files = []
127
  for idx, (start, end) in enumerate(splits):
128
  writer = PdfWriter()
@@ -135,11 +130,16 @@ def intelligent_pdf_split(input_path, session_dir, max_mb=5, min_split_mb=4):
135
  split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
136
  return split_files
137
 
138
- # Dash app setup
 
 
 
 
 
 
139
  external_stylesheets = [dbc.themes.BOOTSTRAP]
140
  app = dash.Dash(__name__, external_stylesheets=external_stylesheets, suppress_callback_exceptions=True)
141
  server = app.server
142
-
143
  app.title = "Intelligent PDF Splitter"
144
 
145
  app.layout = dbc.Container(
@@ -169,8 +169,8 @@ app.layout = dbc.Container(
169
  multiple=False,
170
  accept='.pdf'
171
  ),
172
- html.Div(id='file-info'),
173
- dbc.Button("Clear Session", id='clear-session', color='secondary', className='mt-2'),
174
  dcc.Loading(
175
  id="loading", type="default",
176
  children=[html.Div(id='split-results')]
@@ -196,84 +196,170 @@ app.layout = dbc.Container(
196
  Input('upload-pdf', 'contents'),
197
  State('upload-pdf', 'filename'),
198
  Input('clear-session', 'n_clicks'),
 
 
199
  State('session-store', 'data'),
200
  prevent_initial_call='initial_duplicate'
201
  )
202
- def handle_upload(contents, filename, clear_n, session_data):
203
  trigger = ctx.triggered_id
204
  session_id = get_session_id()
205
  flask.g.session_id = session_id
206
  session_dir = get_session_dir(session_id)
207
  lock = get_session_lock(session_id)
208
 
 
 
 
209
  if trigger == 'clear-session':
210
  clean_session(session_id)
211
  resp_data = {}
212
  return "", "", resp_data
213
 
214
- # If user returns, restore state
215
- if not contents and session_data and 'split_files' in session_data:
216
- split_files = session_data.get('split_files', [])
217
- file_info = html.Div(f"Previous upload: {session_data.get('orig_filename', '')}")
218
- results = [
219
- html.H5("Split Files:"),
220
- html.Ul([
221
- html.Li([
222
- f"{fi['filename']} ({fi['size']:.2f} MB) ",
223
- dbc.Button("Download", id={'type': 'download-btn', 'index': idx}, href=f"/download/{session_id}/{fi['filename']}", color='primary', size='sm')
224
- ]) for idx, fi in enumerate(split_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  ])
226
- ]
227
- return file_info, results, session_data
 
 
228
 
229
- if not contents:
230
- return "", "", {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
- if not allowed_file(filename):
233
- return html.Div("Only .pdf files are allowed.", style={'color': 'red'}), "", {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
- try:
236
- # Save file
237
- header, b64data = contents.split(',', 1)
238
- import base64
239
- pdf_bytes = base64.b64decode(b64data)
240
- pdf_path = os.path.join(session_dir, filename)
241
- with open(pdf_path, 'wb') as f:
242
- f.write(pdf_bytes)
243
- logging.info(f"PDF uploaded and saved to {pdf_path} for session {session_id}")
244
-
245
- # Split PDF with lock
246
- with lock:
247
- split_files = intelligent_pdf_split(pdf_path, session_dir)
 
 
 
 
 
 
 
248
  results = [
249
  html.H5("Split Files:"),
250
- html.Ul([
251
- html.Li([
252
- f"{fi['filename']} ({fi['size']:.2f} MB) ",
253
- dbc.Button("Download", id={'type': 'download-btn', 'index': idx}, href=f"/download/{session_id}/{fi['filename']}", color='primary', size='sm')
254
- ]) for idx, fi in enumerate(split_files)
255
- ])
256
  ]
257
- file_info = html.Div(f"Uploaded: {filename} ({len(pdf_bytes)/1024/1024:.2f} MB)")
258
- session_data = {
259
- 'orig_filename': filename,
260
- 'split_files': split_files,
261
- }
262
- logging.info(f"PDF split into {len(split_files)} chunks for session {session_id}")
263
  return file_info, results, session_data
264
- except Exception as e:
265
- logging.error(f"Error processing PDF: {e}")
266
- return html.Div(f"Error: {e}", style={'color': 'red'}), "", {}
267
 
268
- @app.server.route('/download/<session_id>/<filename>')
269
- def download_split_file(session_id, filename):
 
 
270
  session_dir = get_session_dir(session_id)
271
  file_path = os.path.join(session_dir, filename)
272
  if os.path.exists(file_path):
273
- logging.info(f"Serving file {file_path} for session {session_id}")
274
- return send_file(file_path, mimetype='application/pdf', as_attachment=True, download_name=filename)
275
  else:
276
- logging.error(f"File not found for download: {file_path}")
277
  return "File not found", 404
278
 
279
  @app.callback(
 
7
  import tempfile
8
  import shutil
9
  import logging
10
+ from flask import send_file
11
  import threading
 
12
  from PyPDF2 import PdfReader, PdfWriter
13
  import re
14
+ import zipfile
15
+ import base64
16
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
19
 
 
20
  SESSION_DATA = {}
21
  SESSION_LOCKS = {}
22
 
 
54
  try:
55
  page = reader.pages[page_num]
56
  text = page.extract_text() or ""
 
57
  lines = [line.strip() for line in text.split('\n') if line.strip()]
58
  header = lines[0] if lines else ""
59
  return header
 
103
  chapter = is_chapter_header(header)
104
  split_here = False
105
 
 
106
  if size >= max_mb:
107
  split_here = True
 
108
  elif size >= min_split_mb:
109
  if blank or chapter or (header and header != last_header):
110
  split_here = True
 
115
  current_writer = PdfWriter()
116
  last_header = header
117
 
 
118
  if last_split_at < n_pages:
119
  splits.append((last_split_at, n_pages))
120
 
 
121
  split_files = []
122
  for idx, (start, end) in enumerate(splits):
123
  writer = PdfWriter()
 
130
  split_files.append({'filename': os.path.basename(out_path), 'size': size, 'path': out_path})
131
  return split_files
132
 
133
def make_zip_of_splits(split_files, session_dir):
    """Bundle every split PDF into ``split_files.zip`` inside *session_dir*.

    Args:
        split_files: Iterable of dicts with a ``'path'`` key (file on disk)
            and a ``'filename'`` key (name to use inside the archive).
        session_dir: Directory in which the archive is created.

    Returns:
        The path of the finished archive.

    Raises:
        OSError: If a listed file cannot be read or the archive cannot be
            written. Any previously built archive is left untouched.
    """
    zip_path = os.path.join(session_dir, "split_files.zip")
    # Build under a temporary name and swap it in atomically, so a download
    # running at the same time never sees a half-written archive and a failed
    # build does not destroy the previous good one.
    tmp_path = zip_path + ".tmp"
    try:
        with zipfile.ZipFile(tmp_path, 'w') as zipf:
            for entry in split_files:
                zipf.write(entry['path'], arcname=entry['filename'])
        os.replace(tmp_path, zip_path)
    finally:
        # Only still present when the build failed before the swap.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    return zip_path
139
+
140
  external_stylesheets = [dbc.themes.BOOTSTRAP]
141
  app = dash.Dash(__name__, external_stylesheets=external_stylesheets, suppress_callback_exceptions=True)
142
  server = app.server
 
143
  app.title = "Intelligent PDF Splitter"
144
 
145
  app.layout = dbc.Container(
 
169
  multiple=False,
170
  accept='.pdf'
171
  ),
172
+ html.Div(id='file-info', className='mb-4'),
173
+ dbc.Button("Clear Session", id='clear-session', color='secondary', className='mt-2 mb-2'),
174
  dcc.Loading(
175
  id="loading", type="default",
176
  children=[html.Div(id='split-results')]
 
196
  Input('upload-pdf', 'contents'),
197
  State('upload-pdf', 'filename'),
198
  Input('clear-session', 'n_clicks'),
199
+ Input({'type': 'delete-upload-btn', 'index': 0}, 'n_clicks'),
200
+ Input('split-btn', 'n_clicks'),
201
  State('session-store', 'data'),
202
  prevent_initial_call='initial_duplicate'
203
  )
204
def handle_upload(contents, filename, clear_n, delete_upload_n, split_n, session_data):
    """Drive the upload -> split -> download-zip workflow for one session.

    The triggering component decides which step runs:

    * ``clear-session``: wipe the session and reset the UI.
    * delete button: remove the files stored in the session directory.
    * ``upload-pdf``: save the uploaded PDF and show the "Split PDF" button.
    * ``split-btn``: split the stored PDF, build the zip, list the results.
    * anything else (e.g. a page refresh): rebuild the UI from *session_data*.

    Args:
        contents: Upload payload in ``"<header>,<base64>"`` form, or ``None``.
        filename: Name of the uploaded file as reported by the browser.
        clear_n: Click count of the "Clear Session" button (unused).
        delete_upload_n: Click count of the "Delete" button.
        split_n: Click count of the "Split PDF" button.
        session_data: Dict persisted in the session store, or ``None``.

    Returns:
        A ``(file_info, split_results, session_data)`` tuple for the
        callback's three outputs.
    """
    trigger = ctx.triggered_id
    session_id = get_session_id()
    flask.g.session_id = session_id
    session_dir = get_session_dir(session_id)
    lock = get_session_lock(session_id)

    if session_data is None:
        session_data = {}

    if trigger == 'clear-session':
        clean_session(session_id)
        return "", "", {}

    # A falsy click count means the button was merely (re)rendered, not
    # pressed, so it must not be treated as a delete request.
    if trigger == {'type': 'delete-upload-btn', 'index': 0} and delete_upload_n:
        # Remove regular files only: calling os.remove() on the session
        # directory itself (empty 'orig_filename') or on a sub-directory
        # raises and would abort the callback.
        if os.path.isdir(session_dir):
            for entry in os.listdir(session_dir):
                entry_path = os.path.join(session_dir, entry)
                if os.path.isfile(entry_path):
                    os.remove(entry_path)
        return "", "", {}

    if trigger == 'upload-pdf':
        if not contents:
            return "", "", {}

        if not allowed_file(filename):
            return html.Div("Only .pdf files are allowed.", style={'color': 'red'}), "", {}

        # The name comes from the browser: keep only its final component so
        # it cannot point outside the session directory.
        safe_name = os.path.basename(filename.replace('\\', '/'))
        if not safe_name:
            return html.Div("Only .pdf files are allowed.", style={'color': 'red'}), "", {}

        try:
            _, b64data = contents.split(',', 1)
            pdf_bytes = base64.b64decode(b64data)
            pdf_path = os.path.join(session_dir, safe_name)
            with open(pdf_path, 'wb') as f:
                f.write(pdf_bytes)
            logging.info(f"PDF uploaded and saved to {pdf_path} for session {session_id}")

            session_data = {
                'orig_filename': safe_name,
                'split_files': None,
                'zip_ready': False,
            }
            file_info = _upload_info_row(
                f"Uploaded: {safe_name} ({len(pdf_bytes)/1024/1024:.2f} MB)"
            )
            return file_info, _split_button_row(), session_data
        except Exception as e:
            logging.exception(f"Error processing PDF: {e}")
            return html.Div(f"Error: {e}", style={'color': 'red'}), "", {}

    # The split request has to be handled BEFORE the "uploaded but not yet
    # split" restore branch below: right after an upload the session data
    # matches that branch, so checking it first would swallow every click on
    # "Split PDF" and the split would never run.
    if trigger == 'split-btn' and split_n:
        orig_filename = session_data.get('orig_filename')
        if not orig_filename:
            return html.Div("No file to split.", style={'color': 'red'}), "", session_data
        pdf_path = os.path.join(session_dir, orig_filename)
        if not os.path.exists(pdf_path):
            return html.Div("Uploaded file not found. Please upload again.", style={'color': 'red'}), "", {}
        try:
            with lock:
                split_files = intelligent_pdf_split(pdf_path, session_dir)
            make_zip_of_splits(split_files, session_dir)
            session_data['split_files'] = split_files
            session_data['zip_ready'] = True
            file_info = _upload_info_row(f"Uploaded: {orig_filename}")
            results = _split_results_view(split_files, session_id)
            logging.info(f"PDF split into {len(split_files)} chunks for session {session_id}, zip ready.")
            return file_info, results, session_data
        except Exception as e:
            logging.exception(f"Error splitting PDF: {e}")
            return html.Div(f"Error: {e}", style={'color': 'red'}), "", session_data

    # Refresh after upload but before split: restore file info and button.
    if session_data.get('orig_filename') and not session_data.get('split_files'):
        file_info = _upload_info_row(f"Uploaded: {session_data['orig_filename']}")
        return file_info, _split_button_row(), session_data

    # Refresh after a completed split: restore the result list.
    if session_data.get('split_files'):
        orig_filename = session_data.get('orig_filename', '')
        file_info = _upload_info_row(f"Uploaded: {orig_filename}")
        results = _split_results_view(session_data['split_files'], session_id)
        return file_info, results, session_data

    return "", "", session_data


def _upload_info_row(label):
    """Return the row showing *label* next to the "Delete" upload button."""
    return dbc.Row([
        dbc.Col(html.Div(label), width=9, style={'display': 'flex', 'alignItems': 'center'}),
        dbc.Col(
            dbc.Button("Delete", id={'type': 'delete-upload-btn', 'index': 0}, color='danger', n_clicks=0, className='ms-5'),
            width=3, style={'display': 'flex', 'justifyContent': 'end'}
        )
    ], className='mb-3', align='center', style={'marginTop': "15px", 'marginBottom': '25px'})


def _split_button_row():
    """Return the centred row holding the "Split PDF" button."""
    return dbc.Row([
        dbc.Col(
            dbc.Button("Split PDF", id='split-btn', color='primary', className='mb-3 mt-2', n_clicks=0, style={'width': '180px', 'fontWeight': 'bold'}),
            width=12, style={'display': 'flex', 'justifyContent': 'center'}
        )
    ])


def _split_results_view(split_files, session_id):
    """Return the heading, file list and zip download button for a split."""
    split_files_list = html.Ul([
        html.Li([
            f"{fi['filename']} ({fi['size']:.2f} MB)"
        ]) for fi in split_files
    ])
    download_zip_btn = dbc.Button(
        "Download All (ZIP)", color="primary", size="lg", className='mb-3 mt-4',
        href=f"/download_zip/{session_id}/split_files.zip"
    )
    return [
        html.H5("Split Files:"),
        split_files_list,
        html.Div(download_zip_btn, style={'marginTop': '30px'})
    ]
353
+
354
@app.server.route('/download_zip/<session_id>/<filename>')
def download_zip_file(session_id, filename):
    """Serve a file from a session directory as a zip attachment.

    Args:
        session_id: Session identifier taken from the URL.
        filename: Name of the file inside the session directory.

    Returns:
        The file as an ``application/zip`` attachment, or a
        ``("File not found", 404)`` response when the request is invalid or
        the file does not exist.
    """
    # Both URL segments are untrusted. Reject anything that is not a plain
    # single path component so the request cannot escape the session area.
    for part in (session_id, filename):
        if part in ('', '.', '..') or part != os.path.basename(part.replace('\\', '/')):
            logging.error(f"Rejected download request with invalid path component: {part!r}")
            return "File not found", 404

    session_dir = get_session_dir(session_id)
    file_path = os.path.join(session_dir, filename)
    # isfile (not exists) so that a directory is never handed to send_file.
    if os.path.isfile(file_path):
        logging.info(f"Serving zip file {file_path} for session {session_id}")
        return send_file(file_path, mimetype='application/zip', as_attachment=True, download_name=filename)
    logging.error(f"ZIP file not found for download: {file_path}")
    return "File not found", 404
364
 
365
  @app.callback(