Spaces:

MicroHealth
/

auto-wiki

Paused

App Files Files Community

bluenevus committed on Apr 26

Commit

171b356

verified ·

1 Parent(s): a166383

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -4

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from docx import Document
 import markdown
 import threading
 import time
 app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
@@ -29,7 +30,8 @@ app.layout = dbc.Container([
             'textAlign': 'center',
             'margin': '10px'
         },
-        multiple=True
     ),
     html.Div(id='upload-output'),
     dbc.Progress(id="upload-progress", label="Upload Progress", style={"visibility": "hidden"}),
@@ -47,6 +49,16 @@ def process_docx(contents, filename):
         full_text.append(para.text)
     return '\n\n'.join(full_text)
 @app.callback(
     [Output('upload-output', 'children'),
      Output('convert-button', 'disabled'),
@@ -70,7 +82,7 @@ def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames
         if list_of_contents is not None:
             children = [
                 html.Div([
-                    html.H5(f"File: {name}"),
                     html.Hr()
                 ]) for name in list_of_names
             ]
@@ -87,9 +99,14 @@ def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames
         def process_files():
             processed_files = []
             for i, (c, n) in enumerate(zip(contents, filenames)):
-                text = process_docx(c, n)
                 md = markdown.markdown(text)
-                processed_files.append((n.replace('.docx', '.md'), md))
                 time.sleep(0.1)  # Simulate processing time
                 app.callback_context.response.set_data(f'{{"progress": {(i+1)/len(contents)*100}}}')

 import markdown
 import threading
 import time
+import PyPDF2
 app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
             'textAlign': 'center',
             'margin': '10px'
         },
+        multiple=True,
+        accept='.docx,.pdf'
     ),
     html.Div(id='upload-output'),
     dbc.Progress(id="upload-progress", label="Upload Progress", style={"visibility": "hidden"}),
         full_text.append(para.text)
     return '\n\n'.join(full_text)
+def process_pdf(contents, filename):
+    content_type, content_string = contents.split(',')
+    decoded = base64.b64decode(content_string)
+    pdf_file = io.BytesIO(decoded)
+    pdf_reader = PyPDF2.PdfReader(pdf_file)
+    full_text = []
+    for page in pdf_reader.pages:
+        full_text.append(page.extract_text())
+    return '\n\n'.join(full_text)
 @app.callback(
     [Output('upload-output', 'children'),
      Output('convert-button', 'disabled'),
         if list_of_contents is not None:
             children = [
                 html.Div([
+                    html.H5(f"File uploaded: {name}"),
                     html.Hr()
                 ]) for name in list_of_names
             ]
         def process_files():
             processed_files = []
             for i, (c, n) in enumerate(zip(contents, filenames)):
+                if n.lower().endswith('.docx'):
+                    text = process_docx(c, n)
+                elif n.lower().endswith('.pdf'):
+                    text = process_pdf(c, n)
+                else:
+                    continue  # Skip unsupported file types
                 md = markdown.markdown(text)
+                processed_files.append((n.replace('.docx', '.md').replace('.pdf', '.md'), md))
                 time.sleep(0.1)  # Simulate processing time
                 app.callback_context.response.set_data(f'{{"progress": {(i+1)/len(contents)*100}}}')