Artemis-IA committed on
Commit
c69f6ae
·
verified ·
1 Parent(s): 409f0a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -136
app.py CHANGED
@@ -1,15 +1,16 @@
1
- import os
2
- import time
 
3
  import zipfile
 
 
 
 
4
  from pathlib import Path
5
  from typing import List
6
- import shutil
7
- import json
8
- import yaml
9
- import datetime
10
- import easyocr
11
- import pandas as pd
12
- import streamlit as st
13
  from docling.document_converter import DocumentConverter, PdfFormatOption
14
  from docling.datamodel.base_models import InputFormat
15
  from docling.datamodel.document import ConversionStatus
@@ -25,70 +26,63 @@ from docling.datamodel.pipeline_options import (
25
  RapidOcrOptions,
26
  OcrMacOptions,
27
  )
28
- from easyocr.utils import get_language_list
29
-
30
- # Répertoires de sortie
31
- OUTPUT_DIR = Path("output")
32
- FIGURES_DIR = OUTPUT_DIR / "figures"
33
- TABLES_DIR = OUTPUT_DIR / "tables"
34
-
35
- for directory in [OUTPUT_DIR, FIGURES_DIR, TABLES_DIR]:
36
- directory.mkdir(exist_ok=True)
37
-
38
- # Récupération des langues supportées par EasyOCR
39
- supported_languages = get_language_list()
40
 
41
- # Configuration Streamlit
42
- st.set_page_config(page_title="Docling API UI", layout="wide")
43
 
44
- st.title("Docling Document Conversion API")
45
- st.sidebar.header("Configuration")
46
-
47
- # Formulaire de configuration
48
- use_ocr = st.sidebar.checkbox("Activer l'OCR", value=True)
49
- export_figures = st.sidebar.checkbox("Exporter les figures", value=True)
50
- export_tables = st.sidebar.checkbox("Exporter les tableaux", value=True)
51
-
52
- accelerator = st.sidebar.selectbox(
53
- "Accélérateur",
54
- ["auto", "cpu", "cuda", "mps"],
55
- index=0,
56
- format_func=lambda x: x.upper(),
57
- )
58
-
59
- ocr_engine = st.sidebar.selectbox(
60
- "Moteur OCR",
61
- ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"],
62
- index=0,
63
- )
64
-
65
- ocr_languages = st.sidebar.multiselect(
66
- "Langues OCR",
67
- options=supported_languages,
68
- default=["en"],
69
- )
70
 
71
- table_mode = st.sidebar.selectbox(
72
- "Mode Table",
73
- ["accurate", "fast"],
74
- index=0,
75
- format_func=lambda x: x.capitalize(),
 
76
  )
77
 
78
- export_formats = st.sidebar.multiselect(
79
- "Formats d'export",
80
- ["json", "yaml", "md", "multimodal"],
81
- default=["md"],
82
- )
83
 
84
- uploaded_files = st.file_uploader(
85
- "Uploader vos fichiers (PDF, DOCX, PPTX, HTML, IMAGES)",
86
- type=["pdf", "docx", "pptx", "html", "png", "jpg", "jpeg"],
87
- accept_multiple_files=True,
88
- )
89
 
90
- # Fonction pour créer le convertisseur
91
- def create_document_converter():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  accelerator_options = AcceleratorOptions(
93
  num_threads=8,
94
  device=AcceleratorDevice[accelerator.upper()],
@@ -99,20 +93,16 @@ def create_document_converter():
99
  do_cell_matching=True,
100
  )
101
 
102
- # Validation des langues
103
- if not all(lang in supported_languages for lang in ocr_languages):
104
- st.error(f"Certaines langues sélectionnées ne sont pas prises en charge : {ocr_languages}")
105
- st.stop()
106
-
107
- if ocr_engine == "easyocr":
108
  ocr_options = EasyOcrOptions(lang=ocr_languages)
109
- elif ocr_engine == "tesseract_cli":
110
  ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
111
- elif ocr_engine == "tesserocr":
112
  ocr_options = TesseractOcrOptions(lang=ocr_languages)
113
- elif ocr_engine == "rapidocr":
114
  ocr_options = RapidOcrOptions(lang=ocr_languages)
115
- elif ocr_engine == "ocrmac":
116
  ocr_options = OcrMacOptions(lang=ocr_languages)
117
  else:
118
  raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
@@ -126,7 +116,6 @@ def create_document_converter():
126
  table_structure_options=table_structure_options,
127
  ocr_options=ocr_options,
128
  )
129
-
130
  return DocumentConverter(
131
  allowed_formats=[
132
  InputFormat.PDF,
@@ -140,76 +129,79 @@ def create_document_converter():
140
  },
141
  )
142
 
143
- # Traitement des fichiers
144
- if st.button("Lancer la conversion") and uploaded_files:
145
- st.info("Conversion en cours, veuillez patienter...")
146
-
147
- converter = create_document_converter()
148
-
149
- # Préparer les fichiers pour le traitement
 
 
 
 
 
 
150
  input_paths = []
151
- for uploaded_file in uploaded_files:
152
- file_path = OUTPUT_DIR / uploaded_file.name
153
- with open(file_path, "wb") as f:
154
- f.write(uploaded_file.read())
 
 
 
 
 
 
155
  input_paths.append(file_path)
156
 
157
- # Conversion des fichiers
 
 
 
 
 
 
 
 
158
  conv_results = list(converter.convert_all(input_paths, raises_on_error=False))
 
159
  success_count, failure_count = 0, 0
160
- exported_files = {"figures": [], "tables": [], "exports": []}
161
 
162
  for conv_res in conv_results:
163
  if conv_res.status == ConversionStatus.SUCCESS:
 
164
  success_count += 1
165
- doc_filename = conv_res.input.file.stem
166
-
167
- # Export des tableaux
168
- for table_ix, table in enumerate(conv_res.document.tables):
169
- csv_file = TABLES_DIR / f"{doc_filename}-table-{table_ix+1}.csv"
170
- table.export_to_dataframe().to_csv(csv_file, index=False)
171
- exported_files["tables"].append(str(csv_file))
172
-
173
- # Export des formats demandés
174
- for fmt in export_formats:
175
- output_file = OUTPUT_DIR / f"{doc_filename}.{fmt}"
176
- if fmt == "json":
177
- with open(output_file, "w", encoding="utf-8") as jf:
178
- json.dump(conv_res.document.export_to_dict(), jf, ensure_ascii=False, indent=2)
179
- elif fmt == "yaml":
180
- with open(output_file, "w", encoding="utf-8") as yf:
181
- yaml.dump(conv_res.document.export_to_dict(), yf, allow_unicode=True)
182
- elif fmt == "md":
183
- with open(output_file, "w", encoding="utf-8") as mf:
184
- mf.write(conv_res.document.export_to_markdown())
185
- exported_files["exports"].append(str(output_file))
186
-
187
  else:
 
188
  failure_count += 1
189
 
190
  # Création du fichier ZIP
191
- zip_path = OUTPUT_DIR / "exports.zip"
192
- with zipfile.ZipFile(zip_path, "w") as zipf:
193
- for category, files in exported_files.items():
194
- for file in files:
195
- zipf.write(file, arcname=Path(file).name)
196
-
197
- # Affichage des résultats
198
- st.success(f"Conversion terminée : {success_count} fichiers convertis avec succès.")
199
- if failure_count > 0:
200
- st.warning(f"{failure_count} fichiers n'ont pas pu être convertis.")
201
-
202
- st.download_button(
203
- label="Télécharger tous les résultats (ZIP)",
204
- data=zip_path.read_bytes(),
205
- file_name="exports.zip",
206
- mime="application/zip",
207
- )
208
 
209
- # Affichage des métriques
210
- st.subheader("Métriques de conversion")
211
- st.write(f"Fichiers traités : {len(uploaded_files)}")
212
- st.write(f"Conversions réussies : {success_count}")
213
- st.write(f"Échecs : {failure_count}")
214
- else:
215
- st.info("Ajoutez des fichiers pour commencer la conversion.")
 
1
+ import nest_asyncio
2
+ import aiofiles
3
+ import uvicorn
4
  import zipfile
5
+ from fastapi import FastAPI, File, Query, UploadFile, HTTPException
6
+ from fastapi.responses import FileResponse, JSONResponse
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from enum import Enum
9
  from pathlib import Path
10
  from typing import List
11
+ from PyPDF2 import PdfReader
12
+ from easyocr import Reader
13
+
 
 
 
 
14
  from docling.document_converter import DocumentConverter, PdfFormatOption
15
  from docling.datamodel.base_models import InputFormat
16
  from docling.datamodel.document import ConversionStatus
 
26
  RapidOcrOptions,
27
  OcrMacOptions,
28
  )
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
# Enable nest_asyncio to avoid event-loop errors when running inside Colab.
nest_asyncio.apply()

# FastAPI application definition.
app = FastAPI()

# CORS: every origin, method and header is accepted. Credentials are NOT
# allowed: a wildcard origin combined with allow_credentials=True is an
# invalid/unsafe CORS configuration, and this API uses no cookies or auth.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Output directories.
OUTPUT_DIR = Path("output")
FIGURES_DIR = OUTPUT_DIR / "figures"
TABLES_DIR = OUTPUT_DIR / "tables"

# Create the directory tree up front; parents=True keeps this independent
# of the creation order.
for _directory in (OUTPUT_DIR, FIGURES_DIR, TABLES_DIR):
    _directory.mkdir(parents=True, exist_ok=True)
53
+
54
class ExportFormat(str, Enum):
    """Export formats a client may request.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    json = "json"
    yaml = "yaml"
    md = "md"
    multimodal = "multimodal"
59
+
60
class OcrEngine(str, Enum):
    """OCR back-ends a client may select.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    easyocr = "easyocr"
    tesseract_cli = "tesseract_cli"
    tesserocr = "tesserocr"
    rapidocr = "rapidocr"
    ocrmac = "ocrmac"
66
+
67
+ # Vérification de validité des fichiers PDF
68
def is_valid_pdf(file_path):
    """Return True when ``PdfReader`` can open ``file_path``, else False.

    Any exception raised while constructing the reader is reported on
    stdout and treated as "not a valid PDF".
    """
    try:
        PdfReader(file_path)
    except Exception as exc:
        print(f"Fichier non valide ou corrompu : {file_path} - Erreur : {exc}")
        return False
    return True
75
+
76
+ # Fonction pour configurer le convertisseur de documents
77
+ def create_document_converter(
78
+ use_ocr: bool,
79
+ export_figures: bool,
80
+ export_tables: bool,
81
+ accelerator: str,
82
+ ocr_engine: OcrEngine,
83
+ table_mode: str,
84
+ ocr_languages: List[str]
85
+ ) -> DocumentConverter:
86
  accelerator_options = AcceleratorOptions(
87
  num_threads=8,
88
  device=AcceleratorDevice[accelerator.upper()],
 
93
  do_cell_matching=True,
94
  )
95
 
96
+ # OCR avec EasyOCR ou autres moteurs
97
+ if ocr_engine == OcrEngine.easyocr:
 
 
 
 
98
  ocr_options = EasyOcrOptions(lang=ocr_languages)
99
+ elif ocr_engine == OcrEngine.tesseract_cli:
100
  ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
101
+ elif ocr_engine == OcrEngine.tesserocr:
102
  ocr_options = TesseractOcrOptions(lang=ocr_languages)
103
+ elif ocr_engine == OcrEngine.rapidocr:
104
  ocr_options = RapidOcrOptions(lang=ocr_languages)
105
+ elif ocr_engine == OcrEngine.ocrmac:
106
  ocr_options = OcrMacOptions(lang=ocr_languages)
107
  else:
108
  raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
 
116
  table_structure_options=table_structure_options,
117
  ocr_options=ocr_options,
118
  )
 
119
  return DocumentConverter(
120
  allowed_formats=[
121
  InputFormat.PDF,
 
129
  },
130
  )
131
 
132
+ # Endpoint pour traiter les fichiers
133
@app.post("/process_files/")
async def process_files(
    files: List[UploadFile] = File(...),
    use_ocr: bool = Query(False),
    export_figures: bool = Query(True),
    export_tables: bool = Query(True),
    export_formats: List[ExportFormat] = Query(default=[ExportFormat.md]),
    accelerator: str = Query("cpu"),
    ocr_engine: OcrEngine = Query(OcrEngine.easyocr),
    table_mode: str = Query(TableFormerMode.ACCURATE),
    ocr_languages: List[str] = Query(default=["eng"]),
):
    """Store the uploaded files, convert them and zip the generated exports.

    Args:
        files: Uploaded documents; each one is written into OUTPUT_DIR and
            must pass ``is_valid_pdf``.
        use_ocr, export_figures, export_tables, accelerator, ocr_engine,
        table_mode, ocr_languages: Forwarded unchanged to
            ``create_document_converter``.
        export_formats: Requested export formats. Only ``ExportFormat.md``
            is written by this endpoint; other values are ignored.

    Returns:
        A dict with ``success_count``, ``failure_count`` and ``zip_path``,
        or a 400 ``JSONResponse`` when an upload has an unusable file name
        or is not a valid PDF.
    """
    input_paths = []
    for upload in files:
        # The file name comes from the client: keep only its last path
        # component so the upload cannot be written outside OUTPUT_DIR.
        safe_name = Path(upload.filename or "").name
        if safe_name in ("", ".", ".."):
            return JSONResponse(
                content={"error": "Nom de fichier invalide."}, status_code=400
            )
        file_path = OUTPUT_DIR / safe_name

        async with aiofiles.open(file_path, 'wb') as out_file:
            content = await upload.read()
            await out_file.write(content)
        # Report the size only once the file is closed, so it reflects
        # what was actually flushed to disk.
        print(f"Fichier reçu : {file_path} (taille : {file_path.stat().st_size} octets)")

        if not is_valid_pdf(file_path):
            return JSONResponse(
                content={"error": f"Le fichier {file_path.name} n'est pas un PDF valide."}, status_code=400
            )
        input_paths.append(file_path)

    converter = create_document_converter(
        use_ocr,
        export_figures,
        export_tables,
        accelerator,
        ocr_engine,
        table_mode,
        ocr_languages,
    )
    conv_results = list(converter.convert_all(input_paths, raises_on_error=False))

    success_count, failure_count = 0, 0
    generated_files = []

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            print(f"Échec de la conversion pour : {conv_res.input.file} - Statut : {conv_res.status}")
            failure_count += 1
            continue

        print(f"Conversion réussie pour : {conv_res.input.file}")
        success_count += 1

        # Only Markdown export is implemented; other requested formats
        # produce no file.
        if ExportFormat.md not in export_formats:
            continue
        if not conv_res.document.pages:
            print(f"Aucune donnée trouvée dans le document converti : {conv_res.input.file}")
            continue

        output_file = OUTPUT_DIR / f"{conv_res.input.file.stem}.md"
        # Write the converted document itself (not a placeholder), with an
        # explicit encoding so the result does not depend on the locale.
        with open(output_file, "w", encoding="utf-8") as md_file:
            md_file.write(conv_res.document.export_to_markdown())
        # Guard against duplicates if "md" was requested more than once.
        if output_file not in generated_files:
            generated_files.append(output_file)

    # Build the ZIP archive.
    zip_filename = OUTPUT_DIR / "exported_files.zip"
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for generated in generated_files:
            zipf.write(generated, generated.name)

    return {"success_count": success_count, "failure_count": failure_count, "zip_path": str(zip_filename)}
196
+
197
@app.get("/download/{filename}")
def download_file(filename: str):
    """Serve a file stored directly inside OUTPUT_DIR.

    Args:
        filename: Name of the file to download.

    Returns:
        A ``FileResponse`` for the requested file.

    Raises:
        HTTPException: 404 when the name does not designate a regular file
            in OUTPUT_DIR.
    """
    # Keep only the last path component so the lookup stays in OUTPUT_DIR.
    safe_name = Path(filename).name
    file_path = OUTPUT_DIR / safe_name
    # is_file() rather than exists(): a directory (e.g. "..") must not be
    # handed to FileResponse, which can only send regular files.
    if safe_name in ("", ".", "..") or not file_path.is_file():
        raise HTTPException(status_code=404, detail="Fichier non trouvé.")
    return FileResponse(path=file_path, filename=safe_name)
 
 
 
204
 
205
# Start the server when this module is executed as a script.
if __name__ == "__main__":
    uvicorn.run(app, port=8000, host="0.0.0.0")