# Spaces: MicroHealth / auto-wiki (Paused)
# File size: 5,926 Bytes

import base64
import io
import os
import zipfile
from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
import dash_bootstrap_components as dbc
from docx import Document
from docx.enum.style import WD_STYLE_TYPE
import markdown
import threading
import time
import PyPDF2
import re

# Dash application instance, styled with the Bootstrap theme from
# dash-bootstrap-components.
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Page layout: title, upload drop zone, uploaded-file listing, two status
# areas, the convert button and the download target. The component ids below
# are the ones referenced by the `update_output` callback.
app.layout = dbc.Container([
    html.H1("Auto-Wiki", className="my-4"),
    # Drop zone / file picker; accepts several .docx and .pdf files at once.
    dcc.Upload(
        id='upload-data',
        children=html.Div([
            'Drag and Drop or ',
            html.A('Select Files')
        ]),
        style={
            'width': '100%',
            'height': '60px',
            'lineHeight': '60px',
            'borderWidth': '1px',
            'borderStyle': 'dashed',
            'borderRadius': '5px',
            'textAlign': 'center',
            'margin': '10px'
        },
        multiple=True,
        accept='.docx,.pdf'
    ),
    # Filled by the callback with one entry per uploaded file.
    html.Div(id='upload-output'),
    # Status areas start hidden (display: none); the callback switches their
    # style to display: block when it has a message to show.
    html.Div(id="upload-status", style={"display": "none"}),
    html.Div(id="conversion-status", style={"display": "none"}),
    # Starts disabled; the callback enables it once files have been uploaded.
    dbc.Button("Convert and Download", id="convert-button", color="primary", className="mt-3", disabled=True),
    # Invisible component that receives the generated zip for download.
    dcc.Download(id="download-zip")
])

def _heading_level(style_name):
    """Return the Markdown heading depth (1-6) for a Word heading style name.

    The trailing number of the style name is used ("Heading 2" -> 2). A name
    without a trailing number falls back to level 1, and the result is clamped
    to the 1-6 range that Markdown headings support.
    """
    match = re.search(r'(\d+)\s*$', style_name)
    level = int(match.group(1)) if match else 1
    return min(max(level, 1), 6)


def _emphasize(segment, bold, italic):
    """Wrap *segment* in Markdown bold/italic markers.

    Whitespace-only segments are returned unchanged, and surrounding
    whitespace is kept outside the markers, because ``** text **`` is not
    rendered as emphasis by Markdown.
    """
    core = segment.strip()
    if not core or not (bold or italic):
        return segment
    marker = ('**' if bold else '') + ('*' if italic else '')
    leading = segment[:len(segment) - len(segment.lstrip())]
    trailing = segment[len(segment.rstrip()):]
    return f"{leading}{marker}{core}{marker}{trailing}"


def _runs_to_markdown(text, runs):
    """Apply each run's bold/italic formatting to its own span of *text*.

    Runs are located in *text* left to right from a moving cursor, so a run's
    formatting never leaks onto identical text elsewhere in the paragraph.
    Text not covered by any run is passed through unformatted, and adjacent
    runs with the same formatting are merged so they share a single pair of
    markers.
    """
    segments = []  # [text, bold, italic] in document order
    cursor = 0
    for run in runs:
        run_text = run.text
        if not run_text:
            # Empty runs carry no visible text; replacing "" would corrupt
            # the whole paragraph.
            continue
        start = text.find(run_text, cursor)
        if start == -1:
            continue  # run text not present verbatim; leave the text as is
        if start > cursor:
            segments.append([text[cursor:start], False, False])
        bold, italic = bool(run.bold), bool(run.italic)
        if segments and (segments[-1][1], segments[-1][2]) == (bold, italic):
            segments[-1][0] += run_text
        else:
            segments.append([run_text, bold, italic])
        cursor = start + len(run_text)
    if cursor < len(text):
        segments.append([text[cursor:], False, False])
    return ''.join(_emphasize(seg, bold, italic) for seg, bold, italic in segments)


def process_docx(contents, filename):
    """Convert an uploaded .docx file to Markdown text.

    Args:
        contents: Upload payload of the form ``"<prefix>,<base64 data>"``;
            everything after the first comma is base64-decoded and parsed
            as a Word document.
        filename: Name of the uploaded file. Not used by the conversion.

    Returns:
        The document's paragraphs as Markdown, separated by blank lines.
        Paragraphs whose style name starts with "Heading" become ``#``
        headings, "List Bullet" / "List Number" paragraphs become list items,
        and bold/italic runs are wrapped in ``**`` / ``*``.

    Raises:
        ValueError: If ``contents`` contains no comma.
    """
    # Split on the first comma only; the base64 payload is everything after it.
    _, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)
    doc = Document(io.BytesIO(decoded))
    full_text = []

    for para in doc.paragraphs:
        style = para.style
        style_name = (style.name if style is not None else '') or ''

        if style_name.startswith('Heading'):
            full_text.append(f"{'#' * _heading_level(style_name)} {para.text}")
            continue

        text = _runs_to_markdown(para.text, para.runs)

        if style_name == 'List Bullet':
            full_text.append(f"- {text}")
        elif style_name == 'List Number':
            full_text.append(f"1. {text}")
        else:
            full_text.append(text)

    return '\n\n'.join(full_text)

def process_pdf(contents, filename):
    """Convert an uploaded PDF file to Markdown-flavoured text.

    Text is extracted page by page and passed through a few simple,
    best-effort heuristics (emphasis, list markers, headings). PDF text
    extraction carries no real formatting information, so the result is only
    an approximation of the document's structure.

    Args:
        contents: Upload payload of the form ``"<prefix>,<base64 data>"``;
            everything after the first comma is base64-decoded and parsed
            as a PDF.
        filename: Name of the uploaded file. Not used by the conversion.

    Returns:
        The text of all pages, separated by blank lines.

    Raises:
        ValueError: If ``contents`` contains no comma.
    """
    # Split on the first comma only; the base64 payload is everything after it.
    _, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(decoded))
    full_text = []

    for page in pdf_reader.pages:
        # extract_text() may yield None for pages without a text layer.
        text = page.extract_text() or ''

        # Basic formatting detection (simplified; will not catch everything).
        # Text already wrapped in ** is valid Markdown bold and needs no
        # rewriting, so only the remaining markers are normalised here.
        # Italic: _text_ -> *text*. The markers must hug non-space text and
        # must not sit inside a word, so snake_case identifiers and "____"
        # fill-in blanks are left untouched.
        text = re.sub(r'(?<!\w)_(?=[^\s_])(.+?)(?<=[^\s_])_(?!\w)', r'*\1*', text)
        text = re.sub(r'^(\d+\.)\s', r'\1 ', text, flags=re.MULTILINE)  # Numbered lists
        text = re.sub(r'^[•●○]\s', '- ', text, flags=re.MULTILINE)  # Bullet points

        # Detect potential headers (simplified approach): the first line of a
        # page, and any short line starting with an uppercase letter.
        lines = text.split('\n')
        for i, line in enumerate(lines):
            stripped = line.strip()
            if not stripped:
                continue  # never emit an empty "## " heading
            if i == 0 or (len(line) < 50 and stripped[0].isupper()):
                lines[i] = f"## {line}"

        full_text.append('\n'.join(lines))

    return '\n\n'.join(full_text)

def _unique_markdown_name(stem, taken):
    """Return ``<stem>.md``, adding ``_2``, ``_3``, ... if the name is taken.

    The chosen name is recorded in *taken* so later calls avoid it.
    """
    candidate = f"{stem}.md"
    counter = 1
    while candidate in taken:
        counter += 1
        candidate = f"{stem}_{counter}.md"
    taken.add(candidate)
    return candidate


def process_files(contents, filenames):
    """Convert uploaded .docx/.pdf files and bundle the results in a zip.

    Args:
        contents: Upload payloads, one per file, in the format expected by
            ``process_docx`` / ``process_pdf``.
        filenames: Original file names, parallel to ``contents``. The
            extension (matched case-insensitively) selects the converter;
            files with any other extension are skipped.

    Returns:
        The bytes of a deflate-compressed zip archive with one ``.md`` entry
        per converted file. The archive is empty if nothing was convertible.
    """
    taken_names = set()
    processed_files = []
    for payload, name in zip(contents, filenames):
        lowered = name.lower()
        if lowered.endswith('.docx'):
            text = process_docx(payload, name)
            stem = name[:-len('.docx')]
        elif lowered.endswith('.pdf'):
            text = process_pdf(payload, name)
            stem = name[:-len('.pdf')]
        else:
            continue  # Skip unsupported file types

        # NOTE(review): markdown.markdown() renders Markdown to HTML, so the
        # ".md" entries contain HTML. Kept as-is to preserve the current
        # output; confirm whether the raw Markdown text was intended instead.
        rendered = markdown.markdown(text)

        # Replace only the trailing extension (whatever its letter case) and
        # keep entry names unique, e.g. when "a.docx" and "a.pdf" are both
        # uploaded.
        processed_files.append((_unique_markdown_name(stem, taken_names), rendered))

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for entry_name, entry_content in processed_files:
            zip_file.writestr(entry_name, entry_content)

    return zip_buffer.getvalue()

@app.callback(
    [Output('upload-output', 'children'),
     Output('convert-button', 'disabled'),
     Output('upload-status', 'children'),
     Output('upload-status', 'style'),
     Output('conversion-status', 'children'),
     Output('conversion-status', 'style'),
     Output('download-zip', 'data')],
    [Input('upload-data', 'contents'),
     Input('upload-data', 'filename'),
     Input('convert-button', 'n_clicks')],
    [State('upload-data', 'contents'),
     State('upload-data', 'filename')]
)
def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames):
    """Handle file uploads and the "Convert and Download" button.

    Args:
        list_of_contents: Upload payloads (Input), or None if nothing is
            uploaded.
        list_of_names: Uploaded file names (Input), parallel to the payloads.
        n_clicks: Click count of the convert button, or None before the
            first click.
        contents: Same upload payloads, read as State.
        filenames: Same file names, read as State.

    Returns:
        ``no_update`` when there is nothing to do; otherwise a 7-tuple in the
        order of the declared outputs: uploaded-file listing, convert-button
        disabled flag, upload status text and style, conversion status text
        and style, and the download payload.
    """
    ctx = callback_context
    if not ctx.triggered:
        return no_update

    # One upload changes both `contents` and `filename`, so look at every
    # triggering property rather than only the first entry.
    triggered_props = {event['prop_id'] for event in ctx.triggered}

    if triggered_props & {'upload-data.contents', 'upload-data.filename'}:
        if list_of_contents is None:
            return no_update
        children = [
            html.Div([
                html.H5(f"File uploaded: {name}"),
                html.Hr()
            ]) for name in (list_of_names or [])
        ]
        return children, False, "Files uploaded successfully", {"display": "block"}, "", {"display": "none"}, None

    if 'convert-button.n_clicks' in triggered_props:
        if n_clicks is None or not contents:
            return no_update

        try:
            zip_data = process_files(contents, filenames or [])
        except Exception:
            # Callback boundary: log the traceback and tell the user instead
            # of failing without any feedback in the page.
            import logging
            logging.getLogger(__name__).exception("File conversion failed")
            return (
                no_update,
                True,
                "",
                {"display": "none"},
                "Conversion failed. Please check that the uploaded files are valid .docx or .pdf documents.",
                {"display": "block"},
                no_update
            )

        # The conversion runs synchronously, so by the time this status is
        # shown the work is already done and the download is being delivered.
        return (
            no_update,
            True,
            "",
            {"display": "none"},
            "Conversion complete. Your download should start automatically.",
            {"display": "block"},
            dcc.send_bytes(zip_data, "converted_files.zip")
        )

    return no_update

if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger. Combined with
    # host='0.0.0.0' it would be reachable from any network interface, so it
    # is off by default and opt-in via the DASH_DEBUG environment variable.
    debug_enabled = os.environ.get("DASH_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")