Lucas ARRIESSE
committed on
Commit
·
46800f4
1
Parent(s):
5f1cdfa
wip
Browse files
- api/docs.py +29 -44
- api/solutions.py +1 -2
api/docs.py
CHANGED
@@ -6,7 +6,6 @@ from fastapi.routing import APIRouter
|
|
6 |
import logging
|
7 |
import io
|
8 |
import zipfile
|
9 |
-
import json
|
10 |
import os
|
11 |
from httpx import AsyncClient
|
12 |
from pydantic import BaseModel
|
@@ -17,8 +16,8 @@ import re
|
|
17 |
import tempfile
|
18 |
from lxml import etree
|
19 |
from bs4 import BeautifulSoup
|
20 |
-
from fastapi import Depends,
|
21 |
-
from dependencies import
|
22 |
from fastapi.responses import StreamingResponse
|
23 |
from litellm.router import Router
|
24 |
|
@@ -99,15 +98,20 @@ def get_docx_archive(url: str) -> zipfile.ZipFile:
|
|
99 |
raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
|
100 |
|
101 |
|
102 |
-
def
|
103 |
-
"""
|
104 |
-
|
105 |
-
|
106 |
-
return etree.fromstring(xml_bytes, parser=parser)
|
107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
-
def clean_document_xml(root: etree._Element) -> None:
|
110 |
-
"""Nettoie le XML en modifiant l'arbre directement"""
|
111 |
# Suppression des balises <w:del> et leur contenu
|
112 |
for del_elem in root.xpath('//w:del', namespaces=NSMAP):
|
113 |
parent = del_elem.getparent()
|
@@ -117,11 +121,12 @@ def clean_document_xml(root: etree._Element) -> None:
|
|
117 |
# Désencapsulation des balises <w:ins>
|
118 |
for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
|
119 |
parent = ins_elem.getparent()
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
125 |
|
126 |
# Nettoyage des commentaires
|
127 |
for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
|
@@ -130,20 +135,18 @@ def clean_document_xml(root: etree._Element) -> None:
|
|
130 |
if parent is not None:
|
131 |
parent.remove(elem)
|
132 |
|
133 |
-
|
134 |
-
def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> io.BytesIO:
|
135 |
-
"""Crée un nouveau docx avec le XML modifié"""
|
136 |
output = io.BytesIO()
|
137 |
|
138 |
with zipfile.ZipFile(output, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
|
139 |
# Copier tous les fichiers non modifiés
|
140 |
-
for
|
141 |
-
if
|
142 |
-
new_zip.writestr(
|
143 |
|
144 |
# Ajouter le document.xml modifié
|
145 |
xml_str = etree.tostring(
|
146 |
-
|
147 |
xml_declaration=True,
|
148 |
encoding='UTF-8',
|
149 |
pretty_print=True
|
@@ -156,10 +159,7 @@ def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._El
|
|
156 |
|
157 |
def docx_to_txt(doc_id: str, url: str) -> str:
|
158 |
docx_zip = get_docx_archive(url)
|
159 |
-
|
160 |
-
clean_document_xml(root)
|
161 |
-
|
162 |
-
modified_bytes = create_modified_docx(docx_zip, root)
|
163 |
|
164 |
final_bytes = convert_file(
|
165 |
modified_bytes, f"{doc_id}", "docx", "txt")
|
@@ -278,32 +278,17 @@ def download_tdocs(req: DocDownloadRequest):
|
|
278 |
try:
|
279 |
text_lines = docx_to_txt(doc_id, doc_url)
|
280 |
content_bytes = "\n".join(text_lines).encode("utf-8")
|
281 |
-
return
|
282 |
except Exception as e:
|
283 |
logging.warning(
|
284 |
f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
|
285 |
error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
|
286 |
"utf-8")
|
287 |
-
return
|
288 |
|
289 |
for doc in req.documents:
|
290 |
-
|
291 |
documents_content[doc.document] = content
|
292 |
-
if not success:
|
293 |
-
failed_documents.append(doc.doc_id)
|
294 |
-
|
295 |
-
# sanity check to ensure all requested documents are accounted for, adding error messages for any missing ones
|
296 |
-
for requested_doc_id in document_ids:
|
297 |
-
if requested_doc_id not in documents_content:
|
298 |
-
error_msg = (
|
299 |
-
f"Failed to retrieve or process document '{requested_doc_id}'. "
|
300 |
-
).encode("utf-8")
|
301 |
-
|
302 |
-
documents_content[requested_doc_id] = error_msg
|
303 |
-
logging.warning(
|
304 |
-
f"Document '{requested_doc_id}' was requested but not found or processed.")
|
305 |
-
if requested_doc_id not in failed_documents:
|
306 |
-
failed_documents.append(requested_doc_id)
|
307 |
|
308 |
zip_buffer = io.BytesIO()
|
309 |
with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
|
|
|
6 |
import logging
|
7 |
import io
|
8 |
import zipfile
|
|
|
9 |
import os
|
10 |
from httpx import AsyncClient
|
11 |
from pydantic import BaseModel
|
|
|
16 |
import tempfile
|
17 |
from lxml import etree
|
18 |
from bs4 import BeautifulSoup
|
19 |
+
from fastapi import Depends, HTTPException
|
20 |
+
from dependencies import get_http_client, get_llm_router
|
21 |
from fastapi.responses import StreamingResponse
|
22 |
from litellm.router import Router
|
23 |
|
|
|
98 |
raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
|
99 |
|
100 |
|
101 |
+
def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
|
102 |
+
"""
|
103 |
+
Applique les révisions des .docx avant de retourner le contenu
|
104 |
+
"""
|
|
|
105 |
|
106 |
+
try:
|
107 |
+
xml_bytes = docx_zip.read('word/document.xml')
|
108 |
+
except KeyError:
|
109 |
+
raise FileNotFoundError(
|
110 |
+
"word/document.xml not found in the DOCX archive.")
|
111 |
+
|
112 |
+
parser = etree.XMLParser(remove_blank_text=True)
|
113 |
+
root = etree.fromstring(xml_bytes, parser=parser)
|
114 |
|
|
|
|
|
115 |
# Suppression des balises <w:del> et leur contenu
|
116 |
for del_elem in root.xpath('//w:del', namespaces=NSMAP):
|
117 |
parent = del_elem.getparent()
|
|
|
121 |
# Désencapsulation des balises <w:ins>
|
122 |
for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
|
123 |
parent = ins_elem.getparent()
|
124 |
+
if parent is not None:
|
125 |
+
index = parent.index(ins_elem)
|
126 |
+
for child in ins_elem.iterchildren():
|
127 |
+
parent.insert(index, child)
|
128 |
+
index += 1
|
129 |
+
parent.remove(ins_elem)
|
130 |
|
131 |
# Nettoyage des commentaires
|
132 |
for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
|
|
|
135 |
if parent is not None:
|
136 |
parent.remove(elem)
|
137 |
|
138 |
+
# 3. Create a new docx with the modified XML
|
|
|
|
|
139 |
output = io.BytesIO()
|
140 |
|
141 |
with zipfile.ZipFile(output, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
|
142 |
# Copier tous les fichiers non modifiés
|
143 |
+
for file_info in docx_zip.infolist():
|
144 |
+
if file_info.filename != 'word/document.xml':
|
145 |
+
new_zip.writestr(file_info, docx_zip.read(file_info.filename))
|
146 |
|
147 |
# Ajouter le document.xml modifié
|
148 |
xml_str = etree.tostring(
|
149 |
+
root,
|
150 |
xml_declaration=True,
|
151 |
encoding='UTF-8',
|
152 |
pretty_print=True
|
|
|
159 |
|
160 |
def docx_to_txt(doc_id: str, url: str) -> str:
|
161 |
docx_zip = get_docx_archive(url)
|
162 |
+
modified_bytes = apply_docx_revisions(docx_zip)
|
|
|
|
|
|
|
163 |
|
164 |
final_bytes = convert_file(
|
165 |
modified_bytes, f"{doc_id}", "docx", "txt")
|
|
|
278 |
try:
|
279 |
text_lines = docx_to_txt(doc_id, doc_url)
|
280 |
content_bytes = "\n".join(text_lines).encode("utf-8")
|
281 |
+
return content_bytes
|
282 |
except Exception as e:
|
283 |
logging.warning(
|
284 |
f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
|
285 |
error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
|
286 |
"utf-8")
|
287 |
+
return error_message
|
288 |
|
289 |
for doc in req.documents:
|
290 |
+
content = _process_single_document(doc.document, doc.url)
|
291 |
documents_content[doc.document] = content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
zip_buffer = io.BytesIO()
|
294 |
with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
|
api/solutions.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
-
import
|
4 |
-
from fastapi import APIRouter, Depends, HTTPException, Response
|
5 |
from httpx import AsyncClient
|
6 |
from jinja2 import Environment, TemplateNotFound
|
7 |
from litellm.router import Router
|
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
+
from fastapi import APIRouter, Depends
|
|
|
4 |
from httpx import AsyncClient
|
5 |
from jinja2 import Environment, TemplateNotFound
|
6 |
from litellm.router import Router
|