Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,15 +1,16 @@
|
|
1 |
-
import
|
2 |
-
import
|
|
|
3 |
import zipfile
|
|
|
|
|
|
|
|
|
4 |
from pathlib import Path
|
5 |
from typing import List
|
6 |
-
import
|
7 |
-
import
|
8 |
-
|
9 |
-
import datetime
|
10 |
-
import easyocr
|
11 |
-
import pandas as pd
|
12 |
-
import streamlit as st
|
13 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
14 |
from docling.datamodel.base_models import InputFormat
|
15 |
from docling.datamodel.document import ConversionStatus
|
@@ -25,70 +26,63 @@ from docling.datamodel.pipeline_options import (
|
|
25 |
RapidOcrOptions,
|
26 |
OcrMacOptions,
|
27 |
)
|
28 |
-
from easyocr.utils import get_language_list
|
29 |
-
|
30 |
-
# Répertoires de sortie
|
31 |
-
OUTPUT_DIR = Path("output")
|
32 |
-
FIGURES_DIR = OUTPUT_DIR / "figures"
|
33 |
-
TABLES_DIR = OUTPUT_DIR / "tables"
|
34 |
-
|
35 |
-
for directory in [OUTPUT_DIR, FIGURES_DIR, TABLES_DIR]:
|
36 |
-
directory.mkdir(exist_ok=True)
|
37 |
-
|
38 |
-
# Récupération des langues supportées par EasyOCR
|
39 |
-
supported_languages = get_language_list()
|
40 |
|
41 |
-
#
|
42 |
-
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
# Formulaire de configuration
|
48 |
-
use_ocr = st.sidebar.checkbox("Activer l'OCR", value=True)
|
49 |
-
export_figures = st.sidebar.checkbox("Exporter les figures", value=True)
|
50 |
-
export_tables = st.sidebar.checkbox("Exporter les tableaux", value=True)
|
51 |
-
|
52 |
-
accelerator = st.sidebar.selectbox(
|
53 |
-
"Accélérateur",
|
54 |
-
["auto", "cpu", "cuda", "mps"],
|
55 |
-
index=0,
|
56 |
-
format_func=lambda x: x.upper(),
|
57 |
-
)
|
58 |
-
|
59 |
-
ocr_engine = st.sidebar.selectbox(
|
60 |
-
"Moteur OCR",
|
61 |
-
["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"],
|
62 |
-
index=0,
|
63 |
-
)
|
64 |
-
|
65 |
-
ocr_languages = st.sidebar.multiselect(
|
66 |
-
"Langues OCR",
|
67 |
-
options=supported_languages,
|
68 |
-
default=["en"],
|
69 |
-
)
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
["
|
74 |
-
|
75 |
-
|
|
|
76 |
)
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
default=["md"],
|
82 |
-
)
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
type=["pdf", "docx", "pptx", "html", "png", "jpg", "jpeg"],
|
87 |
-
accept_multiple_files=True,
|
88 |
-
)
|
89 |
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
accelerator_options = AcceleratorOptions(
|
93 |
num_threads=8,
|
94 |
device=AcceleratorDevice[accelerator.upper()],
|
@@ -99,20 +93,16 @@ def create_document_converter():
|
|
99 |
do_cell_matching=True,
|
100 |
)
|
101 |
|
102 |
-
#
|
103 |
-
if
|
104 |
-
st.error(f"Certaines langues sélectionnées ne sont pas prises en charge : {ocr_languages}")
|
105 |
-
st.stop()
|
106 |
-
|
107 |
-
if ocr_engine == "easyocr":
|
108 |
ocr_options = EasyOcrOptions(lang=ocr_languages)
|
109 |
-
elif ocr_engine ==
|
110 |
ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
|
111 |
-
elif ocr_engine ==
|
112 |
ocr_options = TesseractOcrOptions(lang=ocr_languages)
|
113 |
-
elif ocr_engine ==
|
114 |
ocr_options = RapidOcrOptions(lang=ocr_languages)
|
115 |
-
elif ocr_engine ==
|
116 |
ocr_options = OcrMacOptions(lang=ocr_languages)
|
117 |
else:
|
118 |
raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
|
@@ -126,7 +116,6 @@ def create_document_converter():
|
|
126 |
table_structure_options=table_structure_options,
|
127 |
ocr_options=ocr_options,
|
128 |
)
|
129 |
-
|
130 |
return DocumentConverter(
|
131 |
allowed_formats=[
|
132 |
InputFormat.PDF,
|
@@ -140,76 +129,79 @@ def create_document_converter():
|
|
140 |
},
|
141 |
)
|
142 |
|
143 |
-
#
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
input_paths = []
|
151 |
-
for
|
152 |
-
file_path = OUTPUT_DIR /
|
153 |
-
with open(file_path,
|
154 |
-
f.
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
input_paths.append(file_path)
|
156 |
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
conv_results = list(converter.convert_all(input_paths, raises_on_error=False))
|
|
|
159 |
success_count, failure_count = 0, 0
|
160 |
-
|
161 |
|
162 |
for conv_res in conv_results:
|
163 |
if conv_res.status == ConversionStatus.SUCCESS:
|
|
|
164 |
success_count += 1
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
for fmt in export_formats:
|
175 |
-
output_file = OUTPUT_DIR / f"{doc_filename}.{fmt}"
|
176 |
-
if fmt == "json":
|
177 |
-
with open(output_file, "w", encoding="utf-8") as jf:
|
178 |
-
json.dump(conv_res.document.export_to_dict(), jf, ensure_ascii=False, indent=2)
|
179 |
-
elif fmt == "yaml":
|
180 |
-
with open(output_file, "w", encoding="utf-8") as yf:
|
181 |
-
yaml.dump(conv_res.document.export_to_dict(), yf, allow_unicode=True)
|
182 |
-
elif fmt == "md":
|
183 |
-
with open(output_file, "w", encoding="utf-8") as mf:
|
184 |
-
mf.write(conv_res.document.export_to_markdown())
|
185 |
-
exported_files["exports"].append(str(output_file))
|
186 |
-
|
187 |
else:
|
|
|
188 |
failure_count += 1
|
189 |
|
190 |
# Création du fichier ZIP
|
191 |
-
|
192 |
-
with zipfile.ZipFile(
|
193 |
-
for
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
file_name="exports.zip",
|
206 |
-
mime="application/zip",
|
207 |
-
)
|
208 |
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
st.write(f"Conversions réussies : {success_count}")
|
213 |
-
st.write(f"Échecs : {failure_count}")
|
214 |
-
else:
|
215 |
-
st.info("Ajoutez des fichiers pour commencer la conversion.")
|
|
|
1 |
+
import nest_asyncio
|
2 |
+
import aiofiles
|
3 |
+
import uvicorn
|
4 |
import zipfile
|
5 |
+
from fastapi import FastAPI, File, Query, UploadFile, HTTPException
|
6 |
+
from fastapi.responses import FileResponse, JSONResponse
|
7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
8 |
+
from enum import Enum
|
9 |
from pathlib import Path
|
10 |
from typing import List
|
11 |
+
from PyPDF2 import PdfReader
|
12 |
+
from easyocr import Reader
|
13 |
+
|
|
|
|
|
|
|
|
|
14 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
15 |
from docling.datamodel.base_models import InputFormat
|
16 |
from docling.datamodel.document import ConversionStatus
|
|
|
26 |
RapidOcrOptions,
|
27 |
OcrMacOptions,
|
28 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
+
# Apply nest_asyncio so that starting the server from an environment that
# already runs an event loop (the original comment mentions Colab) does not fail.
nest_asyncio.apply()

# FastAPI application definition
app = FastAPI()

# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended before exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Output directories (created at import time if they do not exist yet)
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

FIGURES_DIR = OUTPUT_DIR / "figures"
FIGURES_DIR.mkdir(exist_ok=True)

TABLES_DIR = OUTPUT_DIR / "tables"
TABLES_DIR.mkdir(exist_ok=True)
|
53 |
+
|
54 |
+
class ExportFormat(str, Enum):
    """Export formats a client may request; each member's value is its own name."""

    json = "json"
    yaml = "yaml"
    md = "md"
    multimodal = "multimodal"
|
59 |
+
|
60 |
+
class OcrEngine(str, Enum):
    """OCR engine identifiers accepted as a request parameter."""

    easyocr = "easyocr"
    tesseract_cli = "tesseract_cli"
    tesserocr = "tesserocr"
    rapidocr = "rapidocr"
    ocrmac = "ocrmac"
|
66 |
+
|
67 |
+
# PDF validity check
def is_valid_pdf(file_path) -> bool:
    """Return True when PdfReader can open *file_path*, False otherwise.

    Any exception raised while opening the file is reported on stdout and
    treated as "not a valid PDF"; nothing is propagated to the caller.
    """
    try:
        PdfReader(file_path)
    except Exception as e:  # deliberate catch-all: every failure means "invalid"
        print(f"Fichier non valide ou corrompu : {file_path} - Erreur : {e}")
        return False
    return True
|
75 |
+
|
76 |
+
# Fonction pour configurer le convertisseur de documents
|
77 |
+
def create_document_converter(
|
78 |
+
use_ocr: bool,
|
79 |
+
export_figures: bool,
|
80 |
+
export_tables: bool,
|
81 |
+
accelerator: str,
|
82 |
+
ocr_engine: OcrEngine,
|
83 |
+
table_mode: str,
|
84 |
+
ocr_languages: List[str]
|
85 |
+
) -> DocumentConverter:
|
86 |
accelerator_options = AcceleratorOptions(
|
87 |
num_threads=8,
|
88 |
device=AcceleratorDevice[accelerator.upper()],
|
|
|
93 |
do_cell_matching=True,
|
94 |
)
|
95 |
|
96 |
+
# OCR avec EasyOCR ou autres moteurs
|
97 |
+
if ocr_engine == OcrEngine.easyocr:
|
|
|
|
|
|
|
|
|
98 |
ocr_options = EasyOcrOptions(lang=ocr_languages)
|
99 |
+
elif ocr_engine == OcrEngine.tesseract_cli:
|
100 |
ocr_options = TesseractCliOcrOptions(lang=ocr_languages)
|
101 |
+
elif ocr_engine == OcrEngine.tesserocr:
|
102 |
ocr_options = TesseractOcrOptions(lang=ocr_languages)
|
103 |
+
elif ocr_engine == OcrEngine.rapidocr:
|
104 |
ocr_options = RapidOcrOptions(lang=ocr_languages)
|
105 |
+
elif ocr_engine == OcrEngine.ocrmac:
|
106 |
ocr_options = OcrMacOptions(lang=ocr_languages)
|
107 |
else:
|
108 |
raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
|
|
|
116 |
table_structure_options=table_structure_options,
|
117 |
ocr_options=ocr_options,
|
118 |
)
|
|
|
119 |
return DocumentConverter(
|
120 |
allowed_formats=[
|
121 |
InputFormat.PDF,
|
|
|
129 |
},
|
130 |
)
|
131 |
|
132 |
+
# Endpoint that stores the uploads, converts them and zips the exports
def _sanitize_upload_name(raw_name) -> str:
    """Return the bare file-name component of *raw_name*, or "" if unusable.

    The name is supplied by the client, so directory components (either slash
    style) are dropped to keep the saved file inside OUTPUT_DIR.
    """
    candidate = Path(str(raw_name or "").replace("\\", "/")).name
    return "" if candidate in ("", ".", "..") else candidate


def _write_exports(conv_res, export_formats) -> List[Path]:
    """Write the requested exports of one converted document into OUTPUT_DIR.

    Returns the paths of the files that were actually written.
    """
    import json  # local import: the module header does not import json

    source = conv_res.input.file
    if not conv_res.document.pages:
        print(f"Aucune donnée trouvée dans le document converti : {source}")
        return []

    written = []
    # dict.fromkeys de-duplicates repeated formats while keeping request order,
    # so the same file is never added to the archive twice.
    for export_format in dict.fromkeys(export_formats):
        output_file = OUTPUT_DIR / f"{source.stem}.{export_format.value}"
        if export_format == ExportFormat.md:
            text = conv_res.document.export_to_markdown()
        elif export_format == ExportFormat.json:
            text = json.dumps(
                conv_res.document.export_to_dict(), ensure_ascii=False, indent=2
            )
        else:
            # yaml / multimodal are declared in ExportFormat but have no
            # writer in this file.
            print(f"Format d'export non pris en charge : {export_format.value}")
            continue
        output_file.write_text(text, encoding="utf-8")
        written.append(output_file)
    return written


@app.post("/process_files/")
async def process_files(
    files: List[UploadFile] = File(...),
    use_ocr: bool = Query(False),
    export_figures: bool = Query(True),
    export_tables: bool = Query(True),
    export_formats: List[ExportFormat] = Query(default=[ExportFormat.md]),
    accelerator: str = Query("cpu"),
    ocr_engine: OcrEngine = Query(OcrEngine.easyocr),
    table_mode: str = Query(TableFormerMode.ACCURATE),
    ocr_languages: List[str] = Query(default=["eng"]),
):
    """Save the uploaded files, convert them and bundle the exports in a ZIP.

    Args:
        files: Uploaded documents; each one must pass ``is_valid_pdf``.
        use_ocr, export_figures, export_tables, accelerator, ocr_engine,
        table_mode, ocr_languages: Forwarded unchanged to
            ``create_document_converter``.
        export_formats: Formats to write for every successfully converted
            document.

    Returns:
        A dict with ``success_count``, ``failure_count`` and ``zip_path``, or
        a 400 ``JSONResponse`` as soon as one upload has an unusable name or
        is not a valid PDF.
    """
    input_paths = []
    for upload in files:
        safe_name = _sanitize_upload_name(upload.filename)
        if not safe_name:
            return JSONResponse(
                content={"error": "Nom de fichier invalide."}, status_code=400
            )
        file_path = OUTPUT_DIR / safe_name
        async with aiofiles.open(file_path, "wb") as out_file:
            await out_file.write(await upload.read())
        print(f"Fichier reçu : {file_path} (taille : {file_path.stat().st_size} octets)")
        if not is_valid_pdf(file_path):
            return JSONResponse(
                content={"error": f"Le fichier {file_path.name} n'est pas un PDF valide."},
                status_code=400,
            )
        input_paths.append(file_path)

    converter = create_document_converter(
        use_ocr,
        export_figures,
        export_tables,
        accelerator,
        ocr_engine,
        table_mode,
        ocr_languages,
    )
    conv_results = list(converter.convert_all(input_paths, raises_on_error=False))

    success_count, failure_count = 0, 0
    generated_files = []

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            print(f"Conversion réussie pour : {conv_res.input.file}")
            success_count += 1
            generated_files.extend(_write_exports(conv_res, export_formats))
        else:
            print(f"Échec de la conversion pour : {conv_res.input.file} - Statut : {conv_res.status}")
            failure_count += 1

    # Build the ZIP archive holding every generated export
    zip_filename = OUTPUT_DIR / "exported_files.zip"
    with zipfile.ZipFile(zip_filename, "w") as zipf:
        for exported in generated_files:
            zipf.write(exported, exported.name)

    return {"success_count": success_count, "failure_count": failure_count, "zip_path": str(zip_filename)}
|
196 |
+
|
197 |
+
@app.get("/download/{filename}")
def download_file(filename: str):
    """Serve a file stored directly inside OUTPUT_DIR.

    Args:
        filename: Bare name of the file to download.

    Raises:
        HTTPException: 404 when *filename* is not a plain file name or does
            not designate a regular file in OUTPUT_DIR.
    """
    # Only a bare file name is accepted: anything carrying a directory
    # component (or "." / "..") could point outside OUTPUT_DIR.
    is_bare_name = (
        filename not in ("", ".", "..")
        and "\\" not in filename
        and Path(filename).name == filename
    )
    file_path = OUTPUT_DIR / filename
    # is_file() rather than exists(): a directory cannot be sent as a file.
    if not is_bare_name or not file_path.is_file():
        raise HTTPException(status_code=404, detail="Fichier non trouvé.")
    return FileResponse(path=file_path, filename=filename)
|
|
|
|
|
|
|
204 |
|
205 |
+
# Start the server when the module is executed as a script
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
|
|
|
|