pdf-to-word / app.py
bluenevus's picture
Update app.py
eb83bba verified
raw
history blame
5.42 kB
import base64
import io
import logging
import os
import tempfile
import threading
import time
import zipfile

import dash_bootstrap_components as dbc
from dash import Dash, dcc, html, Input, Output, State, ctx
from pdf2docx import Converter
# The Dash application instance, styled with the Bootstrap theme.
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# ---------------------------------------------------------------------------
# Module-level state shared by the callbacks and the conversion worker thread
# ---------------------------------------------------------------------------

# Uploaded PDFs: original filename -> io.BytesIO holding the decoded bytes.
uploaded_files = {}

# Conversion results: "<stem>.docx" filename -> io.BytesIO with the DOCX bytes.
converted_files = {}

# Progress percentages keyed by filename, plus the special key 'overall'.
conversion_progress = {}

# Set to True by the worker once it has gone through the whole batch.
conversion_complete = False
def convert_pdf_to_docx(pdf_path, docx_path):
    """Convert the PDF at ``pdf_path`` into a DOCX file at ``docx_path``.

    Args:
        pdf_path: Filesystem path of the source PDF.
        docx_path: Filesystem path the DOCX output is written to.

    Any exception raised while converting propagates to the caller; the
    converter is closed first so it is not leaked on failure.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path)
    finally:
        # Always release the converter, even when the conversion fails.
        cv.close()
def process_contents(contents, filename):
    """Decode one upload payload into an in-memory byte buffer.

    Args:
        contents: String of the form ``"<header>,<base64 payload>"`` as
            delivered by the upload component.
        filename: Name of the uploaded file. Not used for decoding; kept so
            existing callers keep working.

    Returns:
        An ``io.BytesIO`` positioned at the start of the decoded bytes.

    Raises:
        ValueError: If ``contents`` contains no comma separator, or the
            payload is not decodable base64.
    """
    # The base64 alphabet has no comma, so the payload is whatever follows
    # the LAST comma. Splitting from the right keeps this working when the
    # header itself contains a comma (e.g. a quoted parameter).
    _header, content_string = contents.rsplit(',', 1)
    decoded = base64.b64decode(content_string)
    return io.BytesIO(decoded)
def _convert_single_file(filename):
    """Convert one uploaded PDF and store the DOCX bytes in ``converted_files``.

    The uploaded bytes are written to a temporary ``.pdf`` file, converted via
    ``convert_pdf_to_docx`` into a temporary ``.docx`` file, and the result is
    read back into memory under ``"<stem>.docx"``. Both temporary files are
    removed afterwards, including when the conversion raises.

    Raises:
        KeyError: If ``filename`` is no longer present in ``uploaded_files``.
        Exception: Whatever the conversion or file I/O raises.
    """
    pdf_file = uploaded_files[filename]
    docx_filename = os.path.splitext(filename)[0] + '.docx'
    temp_paths = []
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            # Record the path before writing so it is cleaned up even if
            # the write itself fails.
            temp_paths.append(temp_pdf.name)
            temp_pdf.write(pdf_file.getvalue())
        # Only the path of the output file is needed; the handle is closed
        # immediately and the converter writes to that path.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_docx:
            temp_paths.append(temp_docx.name)
        temp_pdf_path, temp_docx_path = temp_paths
        convert_pdf_to_docx(temp_pdf_path, temp_docx_path)
        with open(temp_docx_path, 'rb') as docx_file:
            converted_files[docx_filename] = io.BytesIO(docx_file.read())
    finally:
        for path in temp_paths:
            try:
                os.unlink(path)
            except OSError:
                # Best-effort cleanup: a temp file that cannot be removed
                # must not mask the real outcome of the conversion.
                pass


def convert_files(filenames):
    """Convert every named upload to DOCX; intended to run in a worker thread.

    Args:
        filenames: Keys of ``uploaded_files`` to convert, in order.

    Side effects:
        * ``converted_files`` gains one entry per successfully converted file.
        * ``conversion_progress[filename]`` is set, once the file has been
          processed, to the percentage of the batch handled so far.
        * ``conversion_progress['overall']`` is set to 100 and
          ``conversion_complete`` to True when the batch ends.

    A file that fails to convert is logged and skipped so the remaining files
    are still processed. The completion flag is set in a ``finally`` clause so
    that pollers of ``conversion_complete`` are never left waiting forever.
    """
    global conversion_complete
    total_files = len(filenames)
    try:
        for i, filename in enumerate(filenames):
            try:
                _convert_single_file(filename)
            except Exception:
                # Boundary of the worker thread: record the failure with its
                # traceback and carry on with the rest of the batch.
                logging.getLogger(__name__).exception(
                    "Failed to convert %s", filename
                )
            conversion_progress[filename] = (i + 1) / total_files * 100
    finally:
        conversion_progress['overall'] = 100
        conversion_complete = True
# Inline CSS for the dashed drag-and-drop upload area.
_upload_box_style = {
    'width': '100%',
    'height': '60px',
    'lineHeight': '60px',
    'borderWidth': '1px',
    'borderStyle': 'dashed',
    'borderRadius': '5px',
    'textAlign': 'center',
    'margin': '10px'
}

# Page layout: a single card holding the upload area, the upload summary,
# the convert button, the progress area, the download target and the polling
# timer that drives the progress/download callbacks.
app.layout = dbc.Container(
    [
        dbc.Card(
            dbc.CardBody(
                [
                    html.H1("PDF to DOCX Converter", className="text-center mb-4"),
                    # Multi-file upload area.
                    dcc.Upload(
                        id='upload-data',
                        children=html.Div(
                            [
                                'Drag and Drop or ',
                                html.A('Select PDF Files')
                            ]
                        ),
                        style=_upload_box_style,
                        multiple=True
                    ),
                    # Lists which uploads were accepted or skipped.
                    html.Div(id='upload-output'),
                    # Starts disabled; the upload callback controls it.
                    dbc.Button(
                        "Convert and Download",
                        id="convert-button",
                        color="primary",
                        className="mt-3 mb-3",
                        disabled=True
                    ),
                    # Receives the per-file progress bars.
                    html.Div(id='conversion-output'),
                    dcc.Download(id="download-zip"),
                    # 500 ms timer; starts disabled.
                    dcc.Interval(
                        id='interval-component',
                        interval=500,
                        n_intervals=0,
                        disabled=True
                    )
                ]
            ),
            className="mt-3"
        )
    ],
    fluid=True
)
@app.callback(
    Output('upload-output', 'children'),
    Output('convert-button', 'disabled'),
    Input('upload-data', 'contents'),
    State('upload-data', 'filename'),
    prevent_initial_call=True
)
def update_output(list_of_contents, list_of_names):
    """Store uploaded PDFs and report which files were accepted.

    Args:
        list_of_contents: Upload payloads, one per file, or ``None``.
        list_of_names: Filenames matching ``list_of_contents`` by position.

    Returns:
        A tuple ``(children, disabled)``: one status line per uploaded file,
        and the ``disabled`` flag for the convert button. The button is only
        enabled when at least one PDF was actually stored, so a batch made
        up entirely of rejected files cannot start an empty conversion.
    """
    if list_of_contents is None:
        return [], True

    # A new upload replaces the previous selection (mutated in place, so no
    # ``global`` declaration is needed).
    uploaded_files.clear()
    children = []
    for content, name in zip(list_of_contents, list_of_names):
        if name.lower().endswith('.pdf'):
            uploaded_files[name] = process_contents(content, name)
            children.append(html.Div(f"Uploaded: {name}"))
        else:
            children.append(html.Div(f"Skipped: {name} (Not a PDF file)", style={'color': 'red'}))
    return children, not uploaded_files
@app.callback(
    Output('interval-component', 'disabled'),
    Input('convert-button', 'n_clicks'),
    prevent_initial_call=True
)
def start_conversion(n_clicks):
    """Kick off the background conversion and enable the polling timer.

    Args:
        n_clicks: Click count of the convert button, or ``None``.

    Returns:
        The ``disabled`` flag for the interval component: ``False`` once a
        worker thread has been started, ``True`` when there is nothing to do
        (no click yet, or no uploaded PDFs).
    """
    global conversion_complete
    if n_clicks is None:
        return True

    # Snapshot the names so the worker gets a stable list.
    filenames = list(uploaded_files)
    if not filenames:
        # Nothing to convert: do not spawn a worker that would finish
        # immediately and trigger the download of an empty archive.
        return True

    # Reset the shared state left over from any previous run.
    conversion_progress.clear()
    converted_files.clear()
    conversion_progress['overall'] = 0
    conversion_complete = False
    threading.Thread(target=convert_files, args=(filenames,)).start()
    return False
@app.callback(
    Output('conversion-output', 'children'),
    Input('interval-component', 'n_intervals'),
    prevent_initial_call=True
)
def update_progress(n):
    """Render one progress bar per uploaded file on every timer tick.

    Args:
        n: Tick counter of the interval component (not used).

    Returns:
        A list of ``dbc.Progress`` components; files without a recorded
        progress value are shown at 0.
    """
    bars = []
    for name in uploaded_files.keys():
        percent = conversion_progress.get(name, 0)
        bar = dbc.Progress(
            value=percent,
            label=f"{name}: {percent:.0f}%",
            className="mb-3",
        )
        bars.append(bar)
    return bars
@app.callback(
    Output('download-zip', 'data'),
    Output('interval-component', 'disabled', allow_duplicate=True),
    Input('interval-component', 'n_intervals'),
    prevent_initial_call=True
)
def check_conversion_complete(n):
    """Send the ZIP of converted files once the worker reports completion.

    Args:
        n: Tick counter of the interval component (not used).

    Returns:
        A tuple ``(download_data, interval_disabled)``. While the conversion
        is still running this is ``(None, False)`` so polling continues;
        once it is complete, the ZIP payload plus ``True`` to stop the timer.
    """
    if not conversion_complete:
        return None, False

    with io.BytesIO() as archive_buffer:
        # The archive must be closed (inner ``with``) before its bytes are
        # read, otherwise the ZIP would be incomplete.
        with zipfile.ZipFile(archive_buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
            for docx_name, docx_buffer in converted_files.items():
                archive.writestr(docx_name, docx_buffer.getvalue())
        archive_bytes = archive_buffer.getvalue()
        download = dcc.send_bytes(archive_bytes, "converted_files.zip")
        return download, True
if __name__ == '__main__':
    # The server listens on every interface (0.0.0.0), so debug mode (the
    # interactive debugger and dev tools) must not be on by default. Enable
    # it explicitly for local development by setting DASH_DEBUG=1.
    debug_enabled = os.environ.get('DASH_DEBUG', '').strip().lower() in (
        '1', 'true', 'yes', 'on'
    )
    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")