bluenevus committed on
Commit
171b356
·
verified ·
1 Parent(s): a166383

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -4
app.py CHANGED
@@ -8,6 +8,7 @@ from docx import Document
8
  import markdown
9
  import threading
10
  import time
 
11
 
12
  app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
13
 
@@ -29,7 +30,8 @@ app.layout = dbc.Container([
29
  'textAlign': 'center',
30
  'margin': '10px'
31
  },
32
- multiple=True
 
33
  ),
34
  html.Div(id='upload-output'),
35
  dbc.Progress(id="upload-progress", label="Upload Progress", style={"visibility": "hidden"}),
@@ -47,6 +49,16 @@ def process_docx(contents, filename):
47
  full_text.append(para.text)
48
  return '\n\n'.join(full_text)
49
 
 
 
 
 
 
 
 
 
 
 
50
  @app.callback(
51
  [Output('upload-output', 'children'),
52
  Output('convert-button', 'disabled'),
@@ -70,7 +82,7 @@ def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames
70
  if list_of_contents is not None:
71
  children = [
72
  html.Div([
73
- html.H5(f"File: {name}"),
74
  html.Hr()
75
  ]) for name in list_of_names
76
  ]
@@ -87,9 +99,14 @@ def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames
87
  def process_files():
88
  processed_files = []
89
  for i, (c, n) in enumerate(zip(contents, filenames)):
90
- text = process_docx(c, n)
 
 
 
 
 
91
  md = markdown.markdown(text)
92
- processed_files.append((n.replace('.docx', '.md'), md))
93
  time.sleep(0.1) # Simulate processing time
94
  app.callback_context.response.set_data(f'{{"progress": {(i+1)/len(contents)*100}}}')
95
 
 
8
  import markdown
9
  import threading
10
  import time
11
+ import PyPDF2
12
 
13
  app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
14
 
 
30
  'textAlign': 'center',
31
  'margin': '10px'
32
  },
33
+ multiple=True,
34
+ accept='.docx,.pdf'
35
  ),
36
  html.Div(id='upload-output'),
37
  dbc.Progress(id="upload-progress", label="Upload Progress", style={"visibility": "hidden"}),
 
49
  full_text.append(para.text)
50
  return '\n\n'.join(full_text)
51
 
52
def process_pdf(contents, filename):
    """Return the text of an uploaded PDF, with pages separated by blank lines.

    Args:
        contents: String of the form ``<header>,<base64 payload>``; it is
            split on ',' and the second part is base64-decoded.
            (Presumably the data-URL string delivered by the upload
            component -- confirm against the caller.)
        filename: Name of the uploaded file. Accepted for signature parity
            with ``process_docx`` but not used here.

    Returns:
        The result of ``extract_text()`` for every page, joined with
        ``'\n\n'``.

    Raises:
        ValueError: If ``contents`` does not split into exactly two parts
            on ','.
    """
    # Only the payload after the comma is needed; the header is discarded.
    _header, payload = contents.split(',')
    reader = PyPDF2.PdfReader(io.BytesIO(base64.b64decode(payload)))
    return '\n\n'.join([page.extract_text() for page in reader.pages])
61
+
62
  @app.callback(
63
  [Output('upload-output', 'children'),
64
  Output('convert-button', 'disabled'),
 
82
  if list_of_contents is not None:
83
  children = [
84
  html.Div([
85
+ html.H5(f"File uploaded: {name}"),
86
  html.Hr()
87
  ]) for name in list_of_names
88
  ]
 
99
  def process_files():
100
  processed_files = []
101
  for i, (c, n) in enumerate(zip(contents, filenames)):
102
+ if n.lower().endswith('.docx'):
103
+ text = process_docx(c, n)
104
+ elif n.lower().endswith('.pdf'):
105
+ text = process_pdf(c, n)
106
+ else:
107
+ continue # Skip unsupported file types
108
  md = markdown.markdown(text)
109
+ processed_files.append((n.replace('.docx', '.md').replace('.pdf', '.md'), md))
110
  time.sleep(0.1) # Simulate processing time
111
  app.callback_context.response.set_data(f'{{"progress": {(i+1)/len(contents)*100}}}')
112