Spaces:

MicroHealth
/

auto-wiki

Paused

App Files Files Community

auto-wiki / app.py

bluenevus

Update app.py

ee2db4c verified 5 months ago

raw

history blame

5.93 kB

	import base64
	import io
	import os
	import zipfile
	from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
	import dash_bootstrap_components as dbc
	from docx import Document
	from docx.enum.style import WD_STYLE_TYPE
	import markdown
	import threading
	import time
	import PyPDF2
	import re

	# Dash application instance, styled with the Bootstrap theme from
	# dash-bootstrap-components.
	app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

	# Page layout. The component ids declared here are the ones wired into the
	# `update_output` callback:
	#   upload-data        drag-and-drop upload area (multiple .docx / .pdf files)
	#   upload-output      list of uploaded file names
	#   upload-status      upload status message, hidden until a file is uploaded
	#   conversion-status  conversion status message, hidden until a conversion runs
	#   convert-button     starts the conversion; disabled until files are uploaded
	#   download-zip       delivers the generated archive to the browser
	app.layout = dbc.Container(children=[
	    html.H1(children="Auto-Wiki", className="my-4"),
	    dcc.Upload(
	        id='upload-data',
	        accept='.docx,.pdf',
	        multiple=True,
	        children=html.Div(children=[
	            'Drag and Drop or ',
	            html.A(children='Select Files'),
	        ]),
	        style=dict(
	            width='100%',
	            height='60px',
	            lineHeight='60px',
	            borderWidth='1px',
	            borderStyle='dashed',
	            borderRadius='5px',
	            textAlign='center',
	            margin='10px',
	        ),
	    ),
	    html.Div(id='upload-output'),
	    html.Div(id="upload-status", style=dict(display="none")),
	    html.Div(id="conversion-status", style=dict(display="none")),
	    dbc.Button(
	        children="Convert and Download",
	        id="convert-button",
	        color="primary",
	        className="mt-3",
	        disabled=True,
	    ),
	    dcc.Download(id="download-zip"),
	])

	# Trailing number of a heading style name such as "Heading 2".
	_HEADING_LEVEL_RE = re.compile(r'(\d+)\s*$')


	def _heading_level(style_name):
	    """Return the Markdown heading depth (1-6) for a "Heading ..." style name."""
	    match = _HEADING_LEVEL_RE.search(style_name)
	    # A heading style without a trailing number is treated as a top-level
	    # heading instead of raising; Markdown only defines levels 1 through 6.
	    level = int(match.group(1)) if match else 1
	    return max(1, min(level, 6))


	def _wrap_emphasis(marker, text):
	    """Wrap *text* in *marker*, keeping leading/trailing whitespace outside it."""
	    core = text.strip()
	    if not marker or not core:
	        # Nothing to emphasise; "** **" would not be valid emphasis anyway.
	        return text
	    lead = text[:len(text) - len(text.lstrip())]
	    trail = text[len(text.rstrip()):]
	    return f"{lead}{marker}{core}{marker}{trail}"


	def _paragraph_to_markdown(para):
	    """Return the paragraph text with bold/italic runs marked up as Markdown."""
	    runs = para.runs
	    if ''.join(run.text for run in runs) != para.text:
	        # The runs do not account for the whole paragraph text (presumably
	        # content such as hyperlinks that is not exposed through `runs` --
	        # verify against the python-docx version in use). Keep the plain text
	        # rather than lose part of it.
	        return para.text

	    # Merge neighbouring runs that share the same emphasis so that the output
	    # is "**foo bar**" rather than "**foo ****bar**".
	    segments = []
	    for run in runs:
	        marker = ('**' if run.bold else '') + ('*' if run.italic else '')
	        if segments and segments[-1][0] == marker:
	            segments[-1][1] += run.text
	        else:
	            segments.append([marker, run.text])
	    return ''.join(_wrap_emphasis(marker, text) for marker, text in segments)


	def process_docx(contents, filename):
	    """Convert an uploaded .docx document to Markdown text.

	    Args:
	        contents: Upload payload of the form "<header>,<base64 data>".
	        filename: Original file name. Unused; kept so the signature matches
	            `process_pdf`.

	    Returns:
	        The document as Markdown, one block per paragraph, separated by blank
	        lines. "Heading N" paragraphs become `#` headings, "List Bullet" and
	        "List Number" paragraphs become list items, and explicitly bold or
	        italic runs are wrapped in `**` / `*`.

	    Raises:
	        ValueError: If `contents` contains no comma separating header and data.
	    """
	    # Split on the first comma only: everything after it is the payload.
	    _header, content_string = contents.split(',', 1)
	    doc = Document(io.BytesIO(base64.b64decode(content_string)))

	    blocks = []
	    for para in doc.paragraphs:
	        style_name = para.style.name or ''
	        if style_name.startswith('Heading'):
	            blocks.append(f"{'#' * _heading_level(style_name)} {para.text}")
	            continue

	        text = _paragraph_to_markdown(para)
	        if style_name == 'List Bullet':
	            blocks.append(f"- {text}")
	        elif style_name == 'List Number':
	            # Markdown renumbers ordered lists, so "1." works for every item.
	            blocks.append(f"1. {text}")
	        else:
	            blocks.append(text)

	    return '\n\n'.join(blocks)

	# `_text_` delimited by non-word characters. The boundary checks keep
	# identifiers such as snake_case or __init__ untouched.
	_PDF_ITALIC_RE = re.compile(r'(?<!\w)_(?![\s_])(.+?)(?<![\s_])_(?!\w)')
	# "1." / "12." at the start of a line followed by any single whitespace.
	_PDF_NUMBERED_RE = re.compile(r'^(\d+\.)\s', flags=re.MULTILINE)
	# Common bullet glyphs at the start of a line.
	_PDF_BULLET_RE = re.compile(r'^[•●○]\s', flags=re.MULTILINE)


	def _pdf_page_to_markdown(text):
	    """Apply the heuristic Markdown clean-up to the text of one PDF page."""
	    # Basic formatting detection (a simplified approach that will not catch
	    # all formatting). Text already wrapped in "**" is valid Markdown bold and
	    # needs no rewriting; only underscore emphasis is normalised to "*...*".
	    text = _PDF_ITALIC_RE.sub(r'*\1*', text)
	    text = _PDF_NUMBERED_RE.sub(r'\1 ', text)   # Numbered lists
	    text = _PDF_BULLET_RE.sub('- ', text)       # Bullet points

	    # Detect potential headers (simplified approach): the first line of the
	    # page and every short line starting with an upper-case character.
	    lines = text.split('\n')
	    for index, line in enumerate(lines):
	        stripped = line.strip()
	        if not stripped:
	            # Never turn a blank line into an empty "## " heading.
	            continue
	        if index == 0 or (len(line) < 50 and stripped[0].isupper()):
	            lines[index] = f"## {line}"
	    return '\n'.join(lines)


	def process_pdf(contents, filename):
	    """Convert an uploaded PDF to Markdown text using simple heuristics.

	    Args:
	        contents: Upload payload of the form "<header>,<base64 data>".
	        filename: Original file name. Unused; kept so the signature matches
	            `process_docx`.

	    Returns:
	        The extracted text of all pages, each page cleaned up by
	        `_pdf_page_to_markdown`, with pages separated by a blank line.

	    Raises:
	        ValueError: If `contents` contains no comma separating header and data.
	    """
	    # Split on the first comma only: everything after it is the payload.
	    _header, content_string = contents.split(',', 1)
	    pdf_reader = PyPDF2.PdfReader(io.BytesIO(base64.b64decode(content_string)))

	    # `or ''` keeps a page without extractable text from breaking the regexes.
	    pages = [
	        _pdf_page_to_markdown(page.extract_text() or '')
	        for page in pdf_reader.pages
	    ]
	    return '\n\n'.join(pages)

	def process_files(contents, filenames):
	    """Convert the uploaded files and bundle the results into a zip archive.

	    Args:
	        contents: Iterable of upload payloads ("<header>,<base64 data>").
	        filenames: Iterable of the matching original file names.

	    Returns:
	        The bytes of a deflate-compressed zip archive holding one ".md" member
	        per supported input (.docx or .pdf, matched case-insensitively).
	        Files of any other type are skipped.
	    """
	    processed_files = []
	    used_names = set()

	    for content, filename in zip(contents, filenames):
	        lowered = filename.lower()
	        if lowered.endswith('.docx'):
	            suffix = '.docx'
	            text = process_docx(content, filename)
	        elif lowered.endswith('.pdf'):
	            suffix = '.pdf'
	            text = process_pdf(content, filename)
	        else:
	            continue  # Skip unsupported file types

	        # NOTE(review): markdown.markdown() renders Markdown to HTML, so the
	        # ".md" members contain HTML. Kept as-is; confirm this is intended.
	        rendered = markdown.markdown(text)

	        # Swap only the trailing extension, whatever its case, so that
	        # "REPORT.PDF" becomes "REPORT.md" and dots elsewhere are untouched.
	        stem = filename[:len(filename) - len(suffix)]
	        member_name = f"{stem}.md"
	        # "a.docx" and "a.pdf" would otherwise produce two members called
	        # "a.md"; give later ones a numeric suffix instead.
	        counter = 1
	        while member_name in used_names:
	            member_name = f"{stem}_{counter}.md"
	            counter += 1
	        used_names.add(member_name)

	        processed_files.append((member_name, rendered))

	    zip_buffer = io.BytesIO()
	    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
	        for member_name, rendered in processed_files:
	            zip_file.writestr(member_name, rendered)

	    return zip_buffer.getvalue()

	@app.callback(
	    [
	        Output('upload-output', 'children'),
	        Output('convert-button', 'disabled'),
	        Output('upload-status', 'children'),
	        Output('upload-status', 'style'),
	        Output('conversion-status', 'children'),
	        Output('conversion-status', 'style'),
	        Output('download-zip', 'data'),
	    ],
	    [
	        Input('upload-data', 'contents'),
	        Input('upload-data', 'filename'),
	        Input('convert-button', 'n_clicks'),
	    ],
	    [
	        State('upload-data', 'contents'),
	        State('upload-data', 'filename'),
	    ],
	)
	def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames):
	    """React to file uploads and to clicks on the "Convert and Download" button.

	    Args:
	        list_of_contents: Current `contents` of the upload component (Input).
	        list_of_names: Current `filename` of the upload component (Input).
	        n_clicks: Click count of the convert button.
	        contents: Same upload `contents`, read as State.
	        filenames: Same upload `filename`, read as State.

	    Returns:
	        `no_update` when nothing has to change, otherwise a 7-tuple in the
	        order of the declared outputs: uploaded-file list, convert-button
	        `disabled` flag, upload status text and style, conversion status text
	        and style, and the download payload.
	    """
	    triggered = callback_context.triggered
	    if not triggered:
	        return no_update

	    # Look at every triggering property rather than only the first entry, so
	    # the branch taken does not depend on the order in which the upload
	    # component's `contents` and `filename` are reported.
	    triggered_ids = {entry['prop_id'].rsplit('.', 1)[0] for entry in triggered}

	    if 'upload-data' in triggered_ids:
	        if list_of_contents is None:
	            return no_update
	        uploaded = [
	            html.Div([
	                html.H5(f"File uploaded: {name}"),
	                html.Hr(),
	            ])
	            for name in (list_of_names or [])
	        ]
	        # Enable the convert button, show the upload status, hide the
	        # conversion status and clear any previous download.
	        return (
	            uploaded,
	            False,
	            "Files uploaded successfully",
	            {"display": "block"},
	            "",
	            {"display": "none"},
	            None,
	        )

	    if 'convert-button' in triggered_ids:
	        if n_clicks is None or not contents:
	            return no_update

	        # The conversion runs synchronously inside this callback, so all
	        # outputs below reach the browser only after it has finished. The
	        # status therefore reports completion rather than "Converting...".
	        zip_data = process_files(contents, filenames)
	        return (
	            no_update,
	            True,
	            "",
	            {"display": "none"},
	            "Conversion complete. Your download should start shortly.",
	            {"display": "block"},
	            dcc.send_bytes(zip_data, "converted_files.zip"),
	        )

	    return no_update

	if __name__ == '__main__':
	    # The server listens on every interface (0.0.0.0), so Dash's debug mode
	    # must not be on unconditionally. It is now opt-in: set DASH_DEBUG to
	    # 1/true/yes/on for local development.
	    debug_enabled = os.environ.get("DASH_DEBUG", "").strip().lower() in {
	        "1", "true", "yes", "on",
	    }
	    print("Starting the Dash application...")
	    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
	    print("Dash application has finished running.")