Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,13 @@
|
|
1 |
import os
|
2 |
-
import json
|
3 |
-
import zipfile
|
4 |
import time
|
|
|
5 |
from pathlib import Path
|
6 |
from typing import List
|
|
|
|
|
|
|
|
|
|
|
7 |
import pandas as pd
|
8 |
import streamlit as st
|
9 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
@@ -14,6 +18,7 @@ from docling.datamodel.pipeline_options import (
|
|
14 |
AcceleratorOptions,
|
15 |
AcceleratorDevice,
|
16 |
TableStructureOptions,
|
|
|
17 |
EasyOcrOptions,
|
18 |
TesseractCliOcrOptions,
|
19 |
TesseractOcrOptions,
|
@@ -23,24 +28,66 @@ from docling.datamodel.pipeline_options import (
|
|
23 |
|
24 |
# Répertoires de sortie
|
25 |
OUTPUT_DIR = Path("output")
|
26 |
-
OUTPUT_DIR.mkdir(exist_ok=True)
|
27 |
-
|
28 |
FIGURES_DIR = OUTPUT_DIR / "figures"
|
29 |
-
FIGURES_DIR.mkdir(exist_ok=True)
|
30 |
-
|
31 |
TABLES_DIR = OUTPUT_DIR / "tables"
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
accelerator_options = AcceleratorOptions(
|
45 |
num_threads=8,
|
46 |
device=AcceleratorDevice[accelerator.upper()],
|
@@ -51,6 +98,11 @@ def create_document_converter(
|
|
51 |
do_cell_matching=True,
|
52 |
)
|
53 |
|
|
|
|
|
|
|
|
|
|
|
54 |
if ocr_engine == "easyocr":
|
55 |
ocr_options = EasyOcrOptions(lang=ocr_languages)
|
56 |
elif ocr_engine == "tesseract_cli":
|
@@ -87,46 +139,13 @@ def create_document_converter(
|
|
87 |
},
|
88 |
)
|
89 |
|
90 |
-
#
|
91 |
-
|
92 |
-
|
93 |
-
with zipfile.ZipFile(zip_path, "w") as zipf:
|
94 |
-
for file_path in output_dir.rglob("*"):
|
95 |
-
if file_path.is_file():
|
96 |
-
zipf.write(file_path, arcname=file_path.relative_to(output_dir))
|
97 |
-
return str(zip_path)
|
98 |
-
|
99 |
-
# Fonction pour calculer les métriques (exemple : temps d'inférence, nombre de tables, figures, etc.)
|
100 |
-
def compute_metrics(conversion_results, start_time):
|
101 |
-
metrics = {
|
102 |
-
"total_documents": len(conversion_results),
|
103 |
-
"successful_conversions": sum(1 for res in conversion_results if res.status == ConversionStatus.SUCCESS),
|
104 |
-
"failed_conversions": sum(1 for res in conversion_results if res.status != ConversionStatus.SUCCESS),
|
105 |
-
"total_time": time.time() - start_time,
|
106 |
-
"tables_extracted": sum(len(res.document.tables) for res in conversion_results if res.status == ConversionStatus.SUCCESS),
|
107 |
-
"figures_extracted": sum(len(res.document.pictures) for res in conversion_results if res.status == ConversionStatus.SUCCESS),
|
108 |
-
}
|
109 |
-
return metrics
|
110 |
-
|
111 |
-
# Interface Streamlit
|
112 |
-
st.set_page_config(page_title="Docling Dynamic Processor", layout="wide")
|
113 |
-
st.title("Docling Dynamic Processor - Analyse et Extraction de Documents")
|
114 |
-
|
115 |
-
# Formulaire de configuration
|
116 |
-
st.sidebar.header("Options de configuration")
|
117 |
-
use_ocr = st.sidebar.checkbox("Activer l'OCR", value=True)
|
118 |
-
export_figures = st.sidebar.checkbox("Exporter les figures", value=True)
|
119 |
-
export_tables = st.sidebar.checkbox("Exporter les tableaux", value=True)
|
120 |
-
accelerator = st.sidebar.selectbox("Accélérateur", ["cpu", "cuda", "mps"], index=0)
|
121 |
-
ocr_engine = st.sidebar.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
|
122 |
-
table_mode = st.sidebar.selectbox("Mode Table", ["accurate", "fast"], index=0)
|
123 |
-
ocr_languages = st.sidebar.text_input("Langues OCR (ex: eng, fra)", value="eng").split(",")
|
124 |
|
125 |
-
|
126 |
-
uploaded_files = st.file_uploader("Téléchargez vos fichiers (PDF, DOCX, etc.)", type=["pdf", "docx", "pptx"], accept_multiple_files=True)
|
127 |
|
128 |
-
|
129 |
-
# Sauvegarder les fichiers téléchargés
|
130 |
input_paths = []
|
131 |
for uploaded_file in uploaded_files:
|
132 |
file_path = OUTPUT_DIR / uploaded_file.name
|
@@ -134,54 +153,62 @@ if st.button("Lancer le traitement") and uploaded_files:
|
|
134 |
f.write(uploaded_file.read())
|
135 |
input_paths.append(file_path)
|
136 |
|
137 |
-
#
|
138 |
-
start_time = time.time()
|
139 |
-
converter = create_document_converter(
|
140 |
-
use_ocr,
|
141 |
-
export_figures,
|
142 |
-
export_tables,
|
143 |
-
accelerator,
|
144 |
-
ocr_engine,
|
145 |
-
table_mode,
|
146 |
-
ocr_languages,
|
147 |
-
)
|
148 |
conv_results = list(converter.convert_all(input_paths, raises_on_error=False))
|
149 |
-
|
150 |
-
# Traiter les fichiers et collecter les résultats
|
151 |
exported_files = {"figures": [], "tables": [], "exports": []}
|
|
|
152 |
for conv_res in conv_results:
|
153 |
if conv_res.status == ConversionStatus.SUCCESS:
|
|
|
154 |
doc_filename = conv_res.input.file.stem
|
155 |
|
156 |
# Export des tableaux
|
157 |
for table_ix, table in enumerate(conv_res.document.tables):
|
158 |
-
csv_file =
|
159 |
table.export_to_dataframe().to_csv(csv_file, index=False)
|
160 |
exported_files["tables"].append(str(csv_file))
|
161 |
|
162 |
-
# Export des
|
163 |
-
for
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
st.download_button(
|
183 |
label="Télécharger tous les résultats (ZIP)",
|
184 |
-
data=
|
185 |
-
file_name="
|
186 |
mime="application/zip",
|
187 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
|
|
|
|
2 |
import time
|
3 |
+
import zipfile
|
4 |
from pathlib import Path
|
5 |
from typing import List
|
6 |
+
import shutil
|
7 |
+
import json
|
8 |
+
import yaml
|
9 |
+
import datetime
|
10 |
+
import easyocr
|
11 |
import pandas as pd
|
12 |
import streamlit as st
|
13 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
|
18 |
AcceleratorOptions,
|
19 |
AcceleratorDevice,
|
20 |
TableStructureOptions,
|
21 |
+
TableFormerMode,
|
22 |
EasyOcrOptions,
|
23 |
TesseractCliOcrOptions,
|
24 |
TesseractOcrOptions,
|
|
|
28 |
|
29 |
# Répertoires de sortie
|
30 |
OUTPUT_DIR = Path("output")
|
|
|
|
|
31 |
FIGURES_DIR = OUTPUT_DIR / "figures"
|
|
|
|
|
32 |
TABLES_DIR = OUTPUT_DIR / "tables"
|
33 |
+
|
34 |
+
for directory in [OUTPUT_DIR, FIGURES_DIR, TABLES_DIR]:
|
35 |
+
directory.mkdir(exist_ok=True)
|
36 |
+
|
37 |
+
# Récupération des langues supportées par EasyOCR
|
38 |
+
# Language codes offered in the OCR language picker, taken from EasyOCR's
# static language table. Building a Reader for this is not viable: Reader needs
# a real language list (lang_list=None fails) and its .lang_list attribute only
# echoes the languages that were requested, not everything that is supported.
supported_languages = sorted(set(easyocr.config.all_lang_list))
|
39 |
+
|
40 |
+
# Configuration Streamlit
|
41 |
+
st.set_page_config(page_title="Docling API UI", layout="wide")
|
42 |
+
|
43 |
+
st.title("Docling Document Conversion API")
|
44 |
+
st.sidebar.header("Configuration")
|
45 |
+
|
46 |
+
# Formulaire de configuration
|
47 |
+
use_ocr = st.sidebar.checkbox("Activer l'OCR", value=True)
|
48 |
+
export_figures = st.sidebar.checkbox("Exporter les figures", value=True)
|
49 |
+
export_tables = st.sidebar.checkbox("Exporter les tableaux", value=True)
|
50 |
+
|
51 |
+
accelerator = st.sidebar.selectbox(
|
52 |
+
"Accélérateur",
|
53 |
+
["auto", "cpu", "cuda", "mps"],
|
54 |
+
index=0,
|
55 |
+
format_func=lambda x: x.upper(),
|
56 |
+
)
|
57 |
+
|
58 |
+
ocr_engine = st.sidebar.selectbox(
|
59 |
+
"Moteur OCR",
|
60 |
+
["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"],
|
61 |
+
index=0,
|
62 |
+
)
|
63 |
+
|
64 |
+
ocr_languages = st.sidebar.multiselect(
|
65 |
+
"Langues OCR",
|
66 |
+
options=supported_languages,
|
67 |
+
default=["en"],
|
68 |
+
)
|
69 |
+
|
70 |
+
table_mode = st.sidebar.selectbox(
|
71 |
+
"Mode Table",
|
72 |
+
["accurate", "fast"],
|
73 |
+
index=0,
|
74 |
+
format_func=lambda x: x.capitalize(),
|
75 |
+
)
|
76 |
+
|
77 |
+
export_formats = st.sidebar.multiselect(
|
78 |
+
"Formats d'export",
|
79 |
+
["json", "yaml", "md", "multimodal"],
|
80 |
+
default=["md"],
|
81 |
+
)
|
82 |
+
|
83 |
+
uploaded_files = st.file_uploader(
|
84 |
+
"Uploader vos fichiers (PDF, DOCX, PPTX, HTML, IMAGES)",
|
85 |
+
type=["pdf", "docx", "pptx", "html", "png", "jpg", "jpeg"],
|
86 |
+
accept_multiple_files=True,
|
87 |
+
)
|
88 |
+
|
89 |
+
# Fonction pour créer le convertisseur
|
90 |
+
def create_document_converter():
|
91 |
accelerator_options = AcceleratorOptions(
|
92 |
num_threads=8,
|
93 |
device=AcceleratorDevice[accelerator.upper()],
|
|
|
98 |
do_cell_matching=True,
|
99 |
)
|
100 |
|
101 |
+
# Validation des langues
|
102 |
+
if not all(lang in supported_languages for lang in ocr_languages):
|
103 |
+
st.error(f"Certaines langues sélectionnées ne sont pas prises en charge : {ocr_languages}")
|
104 |
+
st.stop()
|
105 |
+
|
106 |
if ocr_engine == "easyocr":
|
107 |
ocr_options = EasyOcrOptions(lang=ocr_languages)
|
108 |
elif ocr_engine == "tesseract_cli":
|
|
|
139 |
},
|
140 |
)
|
141 |
|
142 |
+
# Traitement des fichiers
|
143 |
+
if st.button("Lancer la conversion") and uploaded_files:
|
144 |
+
st.info("Conversion en cours, veuillez patienter...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
+
converter = create_document_converter()
|
|
|
147 |
|
148 |
+
# Préparer les fichiers pour le traitement
|
|
|
149 |
input_paths = []
|
150 |
for uploaded_file in uploaded_files:
|
151 |
file_path = OUTPUT_DIR / uploaded_file.name
|
|
|
153 |
f.write(uploaded_file.read())
|
154 |
input_paths.append(file_path)
|
155 |
|
156 |
+
# Conversion des fichiers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
conv_results = list(converter.convert_all(input_paths, raises_on_error=False))
|
158 |
+
success_count, failure_count = 0, 0
|
|
|
159 |
exported_files = {"figures": [], "tables": [], "exports": []}
|
160 |
+
|
161 |
for conv_res in conv_results:
|
162 |
if conv_res.status == ConversionStatus.SUCCESS:
|
163 |
+
success_count += 1
|
164 |
doc_filename = conv_res.input.file.stem
|
165 |
|
166 |
# Export des tableaux
|
167 |
for table_ix, table in enumerate(conv_res.document.tables):
|
168 |
+
csv_file = TABLES_DIR / f"{doc_filename}-table-{table_ix+1}.csv"
|
169 |
table.export_to_dataframe().to_csv(csv_file, index=False)
|
170 |
exported_files["tables"].append(str(csv_file))
|
171 |
|
172 |
+
# Export des formats demandés
|
173 |
+
for fmt in export_formats:
|
174 |
+
output_file = OUTPUT_DIR / f"{doc_filename}.{fmt}"
|
175 |
+
if fmt == "json":
|
176 |
+
with open(output_file, "w", encoding="utf-8") as jf:
|
177 |
+
json.dump(conv_res.document.export_to_dict(), jf, ensure_ascii=False, indent=2)
|
178 |
+
elif fmt == "yaml":
|
179 |
+
with open(output_file, "w", encoding="utf-8") as yf:
|
180 |
+
yaml.dump(conv_res.document.export_to_dict(), yf, allow_unicode=True)
|
181 |
+
elif fmt == "md":
|
182 |
+
with open(output_file, "w", encoding="utf-8") as mf:
|
183 |
+
mf.write(conv_res.document.export_to_markdown())
|
184 |
+
exported_files["exports"].append(str(output_file))
|
185 |
+
|
186 |
+
else:
|
187 |
+
failure_count += 1
|
188 |
+
|
189 |
+
# Création du fichier ZIP
|
190 |
+
zip_path = OUTPUT_DIR / "exports.zip"
|
191 |
+
with zipfile.ZipFile(zip_path, "w") as zipf:
|
192 |
+
for category, files in exported_files.items():
|
193 |
+
for file in files:
|
194 |
+
zipf.write(file, arcname=Path(file).name)
|
195 |
+
|
196 |
+
# Affichage des résultats
|
197 |
+
st.success(f"Conversion terminée : {success_count} fichiers convertis avec succès.")
|
198 |
+
if failure_count > 0:
|
199 |
+
st.warning(f"{failure_count} fichiers n'ont pas pu être convertis.")
|
200 |
+
|
201 |
st.download_button(
|
202 |
label="Télécharger tous les résultats (ZIP)",
|
203 |
+
data=zip_path.read_bytes(),
|
204 |
+
file_name="exports.zip",
|
205 |
mime="application/zip",
|
206 |
+
)
|
207 |
+
|
208 |
+
# Affichage des métriques
|
209 |
+
st.subheader("Métriques de conversion")
|
210 |
+
st.write(f"Fichiers traités : {len(uploaded_files)}")
|
211 |
+
st.write(f"Conversions réussies : {success_count}")
|
212 |
+
st.write(f"Échecs : {failure_count}")
|
213 |
+
else:
|
214 |
+
st.info("Ajoutez des fichiers pour commencer la conversion.")
|