Artemis-IA committed on
Commit
1c4b493
·
verified ·
1 Parent(s): 69b16ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -119
app.py CHANGED
@@ -1,15 +1,9 @@
1
- import aiofiles
2
- import uvicorn
3
  import zipfile
4
- from fastapi import FastAPI, File, Query, UploadFile, HTTPException
5
- from fastapi.responses import FileResponse, JSONResponse
6
- from fastapi.middleware.cors import CORSMiddleware
7
- from enum import Enum
8
  from pathlib import Path
9
  from typing import List
10
  from PyPDF2 import PdfReader
11
- from easyocr import Reader
12
-
13
  from docling.document_converter import DocumentConverter, PdfFormatOption
14
  from docling.datamodel.base_models import InputFormat
15
  from docling.datamodel.document import ConversionStatus
@@ -26,17 +20,6 @@ from docling.datamodel.pipeline_options import (
26
  OcrMacOptions,
27
  )
28
 
29
- # Définition de l'application FastAPI
30
- app = FastAPI()
31
-
32
- app.add_middleware(
33
- CORSMiddleware,
34
- allow_origins=["*"],
35
- allow_credentials=True,
36
- allow_methods=["*"],
37
- allow_headers=["*"],
38
- )
39
-
40
  # Répertoires de sortie
41
  OUTPUT_DIR = Path("output")
42
  OUTPUT_DIR.mkdir(exist_ok=True)
@@ -47,26 +30,13 @@ FIGURES_DIR.mkdir(exist_ok=True)
47
  TABLES_DIR = OUTPUT_DIR / "tables"
48
  TABLES_DIR.mkdir(exist_ok=True)
49
 
50
- class ExportFormat(str, Enum):
51
- json = "json"
52
- yaml = "yaml"
53
- md = "md"
54
- multimodal = "multimodal"
55
-
56
- class OcrEngine(str, Enum):
57
- easyocr = "easyocr"
58
- tesseract_cli = "tesseract_cli"
59
- tesserocr = "tesserocr"
60
- rapidocr = "rapidocr"
61
- ocrmac = "ocrmac"
62
-
63
  # Vérification de validité des fichiers PDF
64
  def is_valid_pdf(file_path):
65
  try:
66
  PdfReader(file_path)
67
  return True
68
  except Exception as e:
69
- print(f"Fichier non valide ou corrompu : {file_path} - Erreur : {e}")
70
  return False
71
 
72
  # Fonction pour configurer le convertisseur de documents
@@ -75,9 +45,9 @@ def create_document_converter(
75
  export_figures: bool,
76
  export_tables: bool,
77
  accelerator: str,
78
- ocr_engine: OcrEngine,
79
  table_mode: str,
80
- ocr_languages: List[str]
81
  ) -> DocumentConverter:
82
  accelerator_options = AcceleratorOptions(
83
  num_threads=8,
@@ -89,16 +59,16 @@ def create_document_converter(
89
  do_cell_matching=True,
90
  )
91
 
92
- # OCR avec EasyOCR ou autres moteurs
93
- if ocr_engine == OcrEngine.easyocr:
94
  ocr_options = EasyOcrOptions(lang=ocr_languages)
95
- elif ocr_engine == OcrEngine.tesseract_cli:
96
  ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
97
- elif ocr_engine == OcrEngine.tesserocr:
98
  ocr_options = TesseractOcrOptions(lang=ocr_languages)
99
- elif ocr_engine == OcrEngine.rapidocr:
100
  ocr_options = RapidOcrOptions(lang=ocr_languages)
101
- elif ocr_engine == OcrEngine.ocrmac:
102
  ocr_options = OcrMacOptions(lang=ocr_languages)
103
  else:
104
  raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
@@ -112,6 +82,7 @@ def create_document_converter(
112
  table_structure_options=table_structure_options,
113
  ocr_options=ocr_options,
114
  )
 
115
  return DocumentConverter(
116
  allowed_formats=[
117
  InputFormat.PDF,
@@ -120,84 +91,67 @@ def create_document_converter(
120
  InputFormat.HTML,
121
  InputFormat.IMAGE,
122
  ],
123
- format_options={
124
- InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
125
- },
126
  )
127
 
128
- # Endpoint pour traiter les fichiers
129
- @app.post("/process_files/")
130
- async def process_files(
131
- files: List[UploadFile] = File(...),
132
- use_ocr: bool = Query(False),
133
- export_figures: bool = Query(True),
134
- export_tables: bool = Query(True),
135
- export_formats: List[ExportFormat] = Query(default=[ExportFormat.md]),
136
- accelerator: str = Query("cpu"),
137
- ocr_engine: OcrEngine = Query(OcrEngine.easyocr),
138
- table_mode: str = Query(TableFormerMode.ACCURATE),
139
- ocr_languages: List[str] = Query(default=["eng"]),
140
- ):
141
- input_paths = []
142
- for f in files:
143
- file_path = OUTPUT_DIR / f.filename
144
- async with aiofiles.open(file_path, 'wb') as out_file:
145
- content = await f.read()
146
- await out_file.write(content)
147
- print(f"Fichier reçu : {file_path} (taille : {file_path.stat().st_size} octets)")
148
- if not is_valid_pdf(file_path):
149
- return JSONResponse(
150
- content={"error": f"Le fichier {file_path.name} n'est pas un PDF valide."}, status_code=400
151
- )
152
- input_paths.append(file_path)
153
-
154
- converter = create_document_converter(
155
- use_ocr,
156
- export_figures,
157
- export_tables,
158
- accelerator,
159
- ocr_engine,
160
- table_mode,
161
- ocr_languages,
162
- )
163
- conv_results = list(converter.convert_all(input_paths, raises_on_error=False))
164
-
165
- success_count, failure_count = 0, 0
166
- generated_files = []
167
-
168
- for conv_res in conv_results:
169
- if conv_res.status == ConversionStatus.SUCCESS:
170
- print(f"Conversion réussie pour : {conv_res.input.file}")
171
- success_count += 1
172
- for export_format in export_formats:
173
- if export_format == ExportFormat.md:
174
- output_file = OUTPUT_DIR / f"{conv_res.input.file.stem}.md"
175
- if conv_res.document.pages:
176
- with open(output_file, "w") as f:
177
- f.write("## Exemple de contenu Markdown généré\n")
178
- generated_files.append(output_file)
179
- else:
180
- print(f"Aucune donnée trouvée dans le document converti : {conv_res.input.file}")
181
- else:
182
- print(f"Échec de la conversion pour : {conv_res.input.file} - Statut : {conv_res.status}")
183
- failure_count += 1
184
-
185
- # Création du fichier ZIP
186
- zip_filename = OUTPUT_DIR / "exported_files.zip"
187
- with zipfile.ZipFile(zip_filename, "w") as zipf:
188
- for file in generated_files:
189
- zipf.write(file, file.name)
190
-
191
- return {"success_count": success_count, "failure_count": failure_count, "zip_path": str(zip_filename)}
192
-
193
- @app.get("/download/{filename}")
194
- def download_file(filename: str):
195
- file_path = OUTPUT_DIR / filename
196
- if file_path.exists():
197
- return FileResponse(path=file_path, filename=filename)
198
  else:
199
- raise HTTPException(status_code=404, detail="Fichier non trouvé.")
200
-
201
- # Démarrer le serveur
202
- if __name__ == "__main__":
203
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
+ import os
 
2
  import zipfile
 
 
 
 
3
  from pathlib import Path
4
  from typing import List
5
  from PyPDF2 import PdfReader
6
+ import streamlit as st
 
7
  from docling.document_converter import DocumentConverter, PdfFormatOption
8
  from docling.datamodel.base_models import InputFormat
9
  from docling.datamodel.document import ConversionStatus
 
20
  OcrMacOptions,
21
  )
22
 
 
 
 
 
 
 
 
 
 
 
 
23
  # Répertoires de sortie
24
  OUTPUT_DIR = Path("output")
25
  OUTPUT_DIR.mkdir(exist_ok=True)
 
30
  TABLES_DIR = OUTPUT_DIR / "tables"
31
  TABLES_DIR.mkdir(exist_ok=True)
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# PDF validity check
def is_valid_pdf(file_path):
    """Report whether ``file_path`` can be opened by ``PdfReader``.

    Returns True when ``PdfReader(file_path)`` constructs without raising.
    If any exception is raised, the error is displayed with ``st.error`` and
    False is returned.
    """
    try:
        PdfReader(file_path)
    except Exception as err:
        st.error(f"Fichier non valide ou corrompu : {file_path} - Erreur : {err}")
        readable = False
    else:
        readable = True
    return readable
41
 
42
  # Fonction pour configurer le convertisseur de documents
 
45
  export_figures: bool,
46
  export_tables: bool,
47
  accelerator: str,
48
+ ocr_engine: str,
49
  table_mode: str,
50
+ ocr_languages: List[str],
51
  ) -> DocumentConverter:
52
  accelerator_options = AcceleratorOptions(
53
  num_threads=8,
 
59
  do_cell_matching=True,
60
  )
61
 
62
+ # OCR avec EasyOCR
63
+ if ocr_engine == "easyocr":
64
  ocr_options = EasyOcrOptions(lang=ocr_languages)
65
+ elif ocr_engine == "tesseract_cli":
66
  ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
67
+ elif ocr_engine == "tesserocr":
68
  ocr_options = TesseractOcrOptions(lang=ocr_languages)
69
+ elif ocr_engine == "rapidocr":
70
  ocr_options = RapidOcrOptions(lang=ocr_languages)
71
+ elif ocr_engine == "ocrmac":
72
  ocr_options = OcrMacOptions(lang=ocr_languages)
73
  else:
74
  raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
 
82
  table_structure_options=table_structure_options,
83
  ocr_options=ocr_options,
84
  )
85
+
86
  return DocumentConverter(
87
  allowed_formats=[
88
  InputFormat.PDF,
 
91
  InputFormat.HTML,
92
  InputFormat.IMAGE,
93
  ],
94
+ format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
 
 
95
  )
96
 
# Streamlit interface
def _save_valid_uploads(uploads) -> List[Path]:
    """Write each upload into OUTPUT_DIR and return the paths accepted by is_valid_pdf."""
    accepted = []
    for upload in uploads:
        # Keep only the final component of the client-supplied name so the
        # file is always written directly inside OUTPUT_DIR.
        target = OUTPUT_DIR / Path(upload.name).name
        # getvalue() returns the whole buffer regardless of the current read
        # position, unlike read().
        target.write_bytes(upload.getvalue())
        st.write(f"Fichier reçu : {target} (taille : {os.path.getsize(target)} octets)")

        if not is_valid_pdf(target):
            st.error(f"Le fichier {target.name} n'est pas un PDF valide.")
            continue
        accepted.append(target)
    return accepted


def _export_markdown(converter, input_paths: List[Path]) -> List[Path]:
    """Convert ``input_paths`` and return the Markdown files written for the successes."""
    written = []
    for conv_res in converter.convert_all(input_paths, raises_on_error=False):
        if conv_res.status != ConversionStatus.SUCCESS:
            st.error(f"Échec de la conversion pour : {conv_res.input.file} - Statut : {conv_res.status}")
            continue

        st.success(f"Conversion réussie pour : {conv_res.input.file}")
        output_file = OUTPUT_DIR / f"{conv_res.input.file.stem}.md"
        # NOTE(review): this writes a fixed placeholder heading, not the
        # converted document's content - confirm whether the document's
        # Markdown export should be written here instead.
        # The text is non-ASCII, so the encoding is pinned rather than left to
        # the platform default.
        with open(output_file, "w", encoding="utf-8") as md_file:
            md_file.write("## Exemple de contenu Markdown généré\n")
        written.append(output_file)
    return written


def _zip_files(files: List[Path]) -> Path:
    """Bundle ``files`` (stored under their base names) into OUTPUT_DIR/exported_files.zip."""
    zip_filename = OUTPUT_DIR / "exported_files.zip"
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for file in files:
            zipf.write(file, file.name)
    return zip_filename


st.title("Conversion de documents PDF avec OCR")
st.subheader("Téléchargez un PDF pour commencer le traitement")

uploaded_files = st.file_uploader("Sélectionnez vos fichiers PDF", accept_multiple_files=True, type=["pdf"])
use_ocr = st.checkbox("Activer l'OCR ?", value=True)
export_figures = st.checkbox("Exporter les figures ?", value=True)
export_tables = st.checkbox("Exporter les tableaux ?", value=True)
ocr_engine = st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
# The prompt's own example ("eng, fra") has a space after the comma, so each
# code is stripped and empty entries are dropped; a blank field falls back to
# the widget's default value.
ocr_languages = [
    code.strip()
    for code in st.text_input("Langues OCR (ex : eng, fra)", "eng").split(",")
    if code.strip()
] or ["eng"]
table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])

if st.button("Convertir"):
    if not uploaded_files:
        st.error("Veuillez télécharger au moins un fichier PDF.")
    else:
        input_paths = _save_valid_uploads(uploaded_files)
        generated_files = []

        # Build the converter only when there is something to convert.
        if input_paths:
            converter = create_document_converter(
                use_ocr,
                export_figures,
                export_tables,
                accelerator="cpu",
                ocr_engine=ocr_engine,
                table_mode=table_mode,
                ocr_languages=ocr_languages,
            )
            generated_files = _export_markdown(converter, input_paths)

        if generated_files:
            zip_filename = _zip_files(generated_files)
            st.success("Conversion terminée !")
            # read_bytes() opens, reads and closes the archive in one call.
            st.download_button(
                "Télécharger le ZIP",
                data=zip_filename.read_bytes(),
                file_name="exported_files.zip",
            )
        else:
            # Avoid offering an empty archive when nothing was produced.
            st.error("Aucun fichier n'a pu être converti.")