"""Auto-Wiki: a small Dash app that converts uploaded .docx/.pdf files.

Uploaded documents are turned into Markdown text, rendered with the
``markdown`` package, and offered to the user as a single zip download.
"""

import base64
import io
import os
import re
import threading  # noqa: F401  (kept: other parts of the project may rely on it)
import time  # noqa: F401  (kept: other parts of the project may rely on it)
import zipfile

import dash_bootstrap_components as dbc
import markdown
import PyPDF2
from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
from docx import Document
from docx.enum.style import WD_STYLE_TYPE  # noqa: F401  (kept from original)

app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

app.layout = dbc.Container([
    html.H1("Auto-Wiki", className="my-4"),
    dcc.Upload(
        id='upload-data',
        children=html.Div([
            'Drag and Drop or ',
            html.A('Select Files')
        ]),
        style={
            'width': '100%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        multiple=True,
        accept='.docx,.pdf'
    ),
    html.Div(id='upload-output'),
    html.Div(id="upload-status", style={"display": "none"}),
    html.Div(id="conversion-status", style={"display": "none"}),
    dbc.Button("Convert and Download", id="convert-button", color="primary",
               className="mt-3", disabled=True),
    dcc.Download(id="download-zip")
])

# "Heading 1", "Heading 2", ... -> captures the numeric level.
_HEADING_STYLE_RE = re.compile(r'^Heading\s*(\d+)\s*$')
# Markdown only defines six heading levels.
_MAX_HEADING_LEVEL = 6


def _decode_upload(contents):
    """Return the raw bytes of a ``dcc.Upload`` payload.

    ``contents`` is expected to look like ``"<content-type>,<base64 data>"``.

    Raises:
        ValueError: if ``contents`` has no comma separating header and data.
    """
    _header, separator, payload = contents.partition(',')
    if not separator:
        raise ValueError("Upload contents are not in '<type>,<base64>' form")
    return base64.b64decode(payload)


def _wrap_emphasis(text, marker):
    """Wrap the non-blank core of ``text`` in ``marker``.

    Leading/trailing whitespace stays outside the markers, because
    ``** bold **`` is not recognised as emphasis by Markdown renderers.
    """
    core = text.strip()
    if not core or not marker:
        return text
    start = text.index(core)
    return f"{text[:start]}{marker}{core}{marker}{text[start + len(core):]}"


def _paragraph_to_markdown(para):
    """Render one docx paragraph's runs as Markdown-emphasised text."""
    runs = para.runs
    # NOTE(review): in some python-docx versions para.text may include text
    # that para.runs does not expose (e.g. hyperlinks). When the runs do not
    # account for the whole paragraph, keep the full plain text rather than
    # silently dropping content.
    if ''.join(run.text for run in runs) != para.text:
        return para.text

    # Merge neighbouring runs that share formatting so "**a****b**" is
    # emitted as "**ab**".
    segments = []  # list of [marker, text]
    for run in runs:
        if not run.text:
            # Empty runs carry no text; wrapping them would only add noise.
            continue
        marker = ('**' if run.bold else '') + ('*' if run.italic else '')
        if segments and segments[-1][0] == marker:
            segments[-1][1] += run.text
        else:
            segments.append([marker, run.text])
    return ''.join(_wrap_emphasis(text, marker) for marker, text in segments)


def process_docx(contents, filename):
    """Convert an uploaded .docx file into Markdown text.

    Args:
        contents: ``dcc.Upload`` payload (``"<type>,<base64>"``).
        filename: original file name (unused; kept for a uniform signature).

    Returns:
        Markdown text with one block per paragraph, separated by blank lines.
    """
    doc = Document(io.BytesIO(_decode_upload(contents)))
    blocks = []
    for para in doc.paragraphs:
        style_name = para.style.name or ''
        heading = _HEADING_STYLE_RE.match(style_name)
        if heading:
            # Clamp to the range Markdown supports; "Heading 0"/"Heading 9"
            # would otherwise produce invalid heading markers.
            level = min(max(int(heading.group(1)), 1), _MAX_HEADING_LEVEL)
            blocks.append(f"{'#' * level} {para.text}")
            continue

        text = _paragraph_to_markdown(para)
        if style_name == 'List Bullet':
            blocks.append(f"- {text}")
        elif style_name == 'List Number':
            blocks.append(f"1. {text}")
        else:
            blocks.append(text)
    return '\n\n'.join(blocks)


def process_pdf(contents, filename):
    """Convert an uploaded .pdf file into Markdown text (heuristic).

    Formatting detection is deliberately simple and will not catch
    everything: underscores become italics, list markers are normalised and
    short capitalised lines are treated as headings.

    Args:
        contents: ``dcc.Upload`` payload (``"<type>,<base64>"``).
        filename: original file name (unused; kept for a uniform signature).

    Returns:
        Markdown text, one chunk per page separated by blank lines.
    """
    reader = PyPDF2.PdfReader(io.BytesIO(_decode_upload(contents)))
    pages = []
    for page in reader.pages:
        # Pages without extractable text (e.g. scanned images) may yield
        # nothing; treat that as an empty page instead of crashing in re.sub.
        text = page.extract_text() or ''

        # "**bold**" already is Markdown, so it needs no rewriting.
        text = re.sub(r'_(.*?)_', r'*\1*', text)  # Italic
        text = re.sub(r'^(\d+\.)\s', r'\1 ', text, flags=re.MULTILINE)  # Numbered lists
        text = re.sub(r'^[•●○]\s', '- ', text, flags=re.MULTILINE)  # Bullet points

        # Detect potential headers (simplified approach): the first line of a
        # page, or any short line that starts with an uppercase character.
        lines = text.split('\n')
        for i, line in enumerate(lines):
            stripped = line.strip()
            if not stripped:
                # Never emit an empty "## " heading.
                continue
            if i == 0 or (len(line) < 50 and stripped[0].isupper()):
                lines[i] = f"## {line}"
        pages.append('\n'.join(lines))
    return '\n\n'.join(pages)


def _unique_name(name, taken):
    """Return ``name``, suffixed with " (n)" if needed to be unique in ``taken``.

    ``taken`` is a set of lower-cased names already used; it is updated.
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 1
    while candidate.lower() in taken:
        candidate = f"{stem} ({counter}){ext}"
        counter += 1
    taken.add(candidate.lower())
    return candidate


def process_files(contents, filenames):
    """Convert every supported upload and bundle the results into a zip.

    Args:
        contents: list of ``dcc.Upload`` payloads.
        filenames: list of the matching original file names.

    Returns:
        The bytes of a zip archive holding one ``.md`` entry per converted
        file. Unsupported file types are skipped.
    """
    converters = {'.docx': process_docx, '.pdf': process_pdf}
    taken_names = set()
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for payload, original_name in zip(contents or [], filenames or []):
            # The name comes from the client: keep only the final path
            # component so archive entries cannot contain directories.
            base_name = os.path.basename(original_name.replace('\\', '/'))
            stem, extension = os.path.splitext(base_name)
            converter = converters.get(extension.lower())
            if converter is None:
                continue  # Skip unsupported file types

            text = converter(payload, original_name)
            # NOTE(review): markdown.markdown() renders the Markdown to HTML,
            # so the ".md" entries contain HTML. Behaviour kept as-is;
            # confirm whether raw Markdown was intended.
            rendered = markdown.markdown(text)
            # "report.docx" and "report.pdf" would otherwise both become
            # "report.md" and produce duplicate archive entries.
            zip_file.writestr(_unique_name(f"{stem}.md", taken_names), rendered)
    return zip_buffer.getvalue()


@app.callback(
    [Output('upload-output', 'children'),
     Output('convert-button', 'disabled'),
     Output('upload-status', 'children'),
     Output('upload-status', 'style'),
     Output('conversion-status', 'children'),
     Output('conversion-status', 'style'),
     Output('download-zip', 'data')],
    [Input('upload-data', 'contents'),
     Input('upload-data', 'filename'),
     Input('convert-button', 'n_clicks')],
    [State('upload-data', 'contents'),
     State('upload-data', 'filename')]
)
def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames):
    """Handle both upload events and clicks on the convert button.

    Returns a 7-tuple matching the declared outputs, or ``no_update`` when
    there is nothing to change.
    """
    ctx = callback_context
    if not ctx.triggered:
        return no_update

    # An upload changes several properties at once; look at every triggered
    # id instead of relying on which one happens to be listed first.
    triggered_ids = {item['prop_id'] for item in ctx.triggered}

    if any(prop_id.startswith('upload-data.') for prop_id in triggered_ids):
        if not list_of_contents:
            return no_update
        children = [
            html.Div([
                html.H5(f"File uploaded: {name}"),
                html.Hr()
            ]) for name in (list_of_names or [])
        ]
        return (children, False, "Files uploaded successfully",
                {"display": "block"}, "", {"display": "none"}, None)

    if 'convert-button.n_clicks' in triggered_ids:
        if n_clicks is None or not contents:
            return no_update
        zip_data = process_files(contents, filenames)
        # This response is only delivered once the conversion has finished,
        # so report completion (not "converting...") and re-enable the button
        # so the archive can be generated again.
        return (
            no_update,
            False,
            "",
            {"display": "none"},
            "Conversion complete. Your download should start shortly.",
            {"display": "block"},
            dcc.send_bytes(zip_data, "converted_files.zip")
        )

    return no_update


if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default; opt in with DASH_DEBUG=1 for local development.
    debug_enabled = os.environ.get('DASH_DEBUG', '').strip().lower() in {'1', 'true', 'yes'}
    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")