Spaces:
Paused
Paused
File size: 5,926 Bytes
a736130 a166383 a736130 ee2db4c a736130 171b356 ee2db4c a736130 171b356 a736130 eb7c4fb a736130 e777af8 a736130 ee2db4c a736130 ee2db4c a736130 171b356 ee2db4c 171b356 ee2db4c 171b356 49e035d e777af8 49e035d a736130 eb7c4fb a736130 e777af8 a736130 e777af8 1879f8f a736130 a166383 a736130 171b356 a736130 eb7c4fb a166383 a736130 e777af8 a166383 a736130 e777af8 a736130 eb7c4fb a736130 a166383 a736130 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import base64
import io
import os
import zipfile
from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
import dash_bootstrap_components as dbc
from docx import Document
from docx.enum.style import WD_STYLE_TYPE
import markdown
import threading
import time
import PyPDF2
import re
# Single Dash application instance, themed with the Bootstrap stylesheet.
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Inline CSS for the dashed drag-and-drop box.
_upload_box_style = dict(
    width='100%',
    height='60px',
    lineHeight='60px',
    borderWidth='1px',
    borderStyle='dashed',
    borderRadius='5px',
    textAlign='center',
    margin='10px',
)

# Upload widget: several files may be dropped at once, limited to .docx/.pdf.
_upload_area = dcc.Upload(
    id='upload-data',
    children=html.Div(['Drag and Drop or ', html.A('Select Files')]),
    style=_upload_box_style,
    multiple=True,
    accept='.docx,.pdf',
)

# Button starts disabled; the upload callback enables it.
_convert_button = dbc.Button(
    "Convert and Download",
    id="convert-button",
    color="primary",
    className="mt-3",
    disabled=True,
)

# Page layout. The two status containers start hidden and are filled in and
# shown by the callback; the dcc.Download component delivers the zip archive.
app.layout = dbc.Container([
    html.H1("Auto-Wiki", className="my-4"),
    _upload_area,
    html.Div(id='upload-output'),
    html.Div(id="upload-status", style={"display": "none"}),
    html.Div(id="conversion-status", style={"display": "none"}),
    _convert_button,
    dcc.Download(id="download-zip"),
])
def _run_to_markdown(run):
    """Return the text of a python-docx run wrapped in Markdown emphasis."""
    text = run.text
    # Wrapping empty or whitespace-only text would emit stray '****' markers.
    if not text.strip():
        return text
    if run.bold:
        text = f"**{text}**"
    if run.italic:
        text = f"*{text}*"
    return text


def process_docx(contents, filename):
    """Convert an uploaded .docx file to Markdown text.

    Args:
        contents: Upload payload as a data URL ("<header>,<base64 data>").
        filename: Original file name (unused; kept for a uniform signature
            with ``process_pdf``).

    Returns:
        The document's paragraphs as Markdown, separated by blank lines.
        Heading styles become ``#`` headings, 'List Bullet' / 'List Number'
        paragraphs become list items, and bold/italic runs are wrapped in
        ``**`` / ``*``.
    """
    # Split only once: everything after the first comma is the payload.
    _, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)
    doc = Document(io.BytesIO(decoded))

    full_text = []
    for para in doc.paragraphs:
        style_name = para.style.name or ''

        if style_name.startswith('Heading'):
            # Read the whole trailing number instead of only the last
            # character; a heading style without a number defaults to level 1.
            digits = re.search(r'(\d+)$', style_name)
            level = int(digits.group(1)) if digits else 1
            # Markdown only defines heading levels 1 through 6.
            level = max(1, min(level, 6))
            full_text.append(f"{'#' * level} {para.text}")
            continue

        runs = para.runs
        if ''.join(run.text for run in runs) == para.text:
            # Format run by run. The previous str.replace() approach wrapped
            # every occurrence of a run's text in the paragraph and, for an
            # empty run, inserted markers between all characters.
            text = ''.join(_run_to_markdown(run) for run in runs)
        else:
            # The runs do not account for the whole paragraph text
            # (presumably content such as hyperlinks -- TODO confirm), so
            # keep the plain text rather than lose content.
            text = para.text

        if style_name == 'List Bullet':
            full_text.append(f"- {text}")
        elif style_name == 'List Number':
            full_text.append(f"1. {text}")
        else:
            full_text.append(text)

    return '\n\n'.join(full_text)
# Text between underscores is treated as italic.
_PDF_ITALIC_RE = re.compile(r'_(.*?)_')
# "1." style list markers at the start of a line, followed by whitespace.
_PDF_NUMBERED_RE = re.compile(r'^(\d+\.)\s', re.MULTILINE)
# Common bullet glyphs at the start of a line, written as escapes so the
# pattern cannot be damaged by a file-encoding mix-up:
# U+2022 bullet, U+25CF black circle, U+25CB white circle,
# U+25AA small black square, U+25E6 white bullet, U+2023 triangular bullet,
# U+2043 hyphen bullet.
_PDF_BULLET_RE = re.compile(
    r'^[\u2022\u25cf\u25cb\u25aa\u25e6\u2023\u2043]\s', re.MULTILINE
)


def process_pdf(contents, filename):
    """Convert an uploaded PDF file to (approximate) Markdown text.

    Args:
        contents: Upload payload as a data URL ("<header>,<base64 data>").
        filename: Original file name (unused; kept for a uniform signature
            with ``process_docx``).

    Returns:
        The extracted text of all pages, pages separated by blank lines.
        Formatting is detected heuristically and will not catch everything:
        underscores become italics, bullet glyphs become ``- `` items, and
        the first line of a page plus every short line starting with an
        uppercase letter is marked as a ``##`` heading.
    """
    # Split only once: everything after the first comma is the payload.
    _, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(decoded))

    full_text = []
    for page in pdf_reader.pages:
        # Guard against a falsy result for pages without extractable text.
        text = page.extract_text() or ''

        # Text already marked with ** needs no rewriting (the former
        # substitution replaced each match with itself).
        text = _PDF_ITALIC_RE.sub(r'*\1*', text)
        text = _PDF_NUMBERED_RE.sub(r'\1 ', text)
        text = _PDF_BULLET_RE.sub('- ', text)

        # Detect potential headers (simplified approach).
        lines = text.split('\n')
        for i, line in enumerate(lines):
            stripped = line.strip()
            if not stripped:
                # Never turn a blank line (or an empty page) into an empty
                # "## " heading.
                continue
            if i == 0 or (len(line) < 50 and stripped[0].isupper()):
                lines[i] = f"## {line}"
        full_text.append('\n'.join(lines))

    return '\n\n'.join(full_text)
def process_files(contents, filenames):
    """Convert the uploaded files and bundle the results into a zip archive.

    Args:
        contents: Iterable of upload payloads (data URLs), or None.
        filenames: Iterable of the matching original file names, or None.

    Returns:
        The bytes of a deflate-compressed zip archive holding one ``.md``
        entry per supported (.docx / .pdf) input. Unsupported files are
        skipped; with no supported input the archive is empty.
    """
    processed_files = []
    used_names = set()

    # Treat missing uploads as "nothing to convert" instead of crashing.
    for c, n in zip(contents or [], filenames or []):
        lowered = n.lower()
        if lowered.endswith('.docx'):
            stem = n[:-len('.docx')]
            text = process_docx(c, n)
        elif lowered.endswith('.pdf'):
            stem = n[:-len('.pdf')]
            text = process_pdf(c, n)
        else:
            continue  # Skip unsupported file types

        # NOTE(review): markdown.markdown() renders Markdown to HTML, so the
        # ".md" entries contain HTML. Kept as-is to preserve the current
        # output; confirm whether the raw Markdown text was intended.
        md = markdown.markdown(text)

        # Swap only the real trailing extension, matching it the same
        # case-insensitive way the type check above does, and keep entry
        # names unique ("a.docx" and "a.pdf" would otherwise both map to
        # "a.md" and collide in the archive).
        name = f"{stem}.md"
        suffix = 1
        while name in used_names:
            name = f"{stem}_{suffix}.md"
            suffix += 1
        used_names.add(name)
        processed_files.append((name, md))

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for name, content in processed_files:
            zip_file.writestr(name, content)
    return zip_buffer.getvalue()
@app.callback(
    [Output('upload-output', 'children'),
     Output('convert-button', 'disabled'),
     Output('upload-status', 'children'),
     Output('upload-status', 'style'),
     Output('conversion-status', 'children'),
     Output('conversion-status', 'style'),
     Output('download-zip', 'data')],
    [Input('upload-data', 'contents'),
     Input('upload-data', 'filename'),
     Input('convert-button', 'n_clicks')],
    [State('upload-data', 'contents'),
     State('upload-data', 'filename')]
)
def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames):
    """Handle both file uploads and clicks on the convert button.

    Args:
        list_of_contents: Uploaded payloads (Input), or None.
        list_of_names: Uploaded file names (Input), or None.
        n_clicks: Click count of the convert button, or None.
        contents: Uploaded payloads again (State).
        filenames: Uploaded file names again (State).

    Returns:
        ``no_update`` when there is nothing to do, otherwise a 7-tuple in
        the order of the declared outputs: upload list, button-disabled
        flag, upload status text and style, conversion status text and
        style, and the download payload.
    """
    ctx = callback_context
    if not ctx.triggered:
        return no_update

    # Both upload props are Inputs and can change in the same event, so look
    # at every triggering prop rather than only the first entry.
    triggered_props = {entry['prop_id'] for entry in ctx.triggered}

    if triggered_props & {'upload-data.contents', 'upload-data.filename'}:
        if list_of_contents is None:
            return no_update
        children = [
            html.Div([
                html.H5(f"File uploaded: {name}"),
                html.Hr()
            ]) for name in (list_of_names or [])
        ]
        return (
            children,
            False,
            "Files uploaded successfully",
            {"display": "block"},
            "",
            {"display": "none"},
            None,
        )

    if 'convert-button.n_clicks' in triggered_props:
        if n_clicks is None or not contents:
            return no_update
        zip_data = process_files(contents, filenames)
        # All outputs reach the browser together with the download, i.e.
        # after the conversion has finished. Report completion rather than
        # "Converting files...", and re-enable the button so the same upload
        # can be converted again.
        return (
            no_update,
            False,
            "",
            {"display": "none"},
            "Conversion complete. Your download should start automatically.",
            {"display": "block"},
            dcc.send_bytes(zip_data, "converted_files.zip"),
        )

    return no_update
if __name__ == '__main__':
    # The server listens on all interfaces, and the interactive debugger
    # allows code execution from the browser, so debug mode is opt-in via
    # the DASH_DEBUG environment variable instead of always on.
    debug_enabled = os.environ.get('DASH_DEBUG', '').strip().lower() in {
        '1', 'true', 'yes', 'on'
    }
    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")