Lucas ARRIESSE committed on
Commit
46800f4
·
1 Parent(s): 5f1cdfa
Files changed (2) hide show
  1. api/docs.py +29 -44
  2. api/solutions.py +1 -2
api/docs.py CHANGED
@@ -6,7 +6,6 @@ from fastapi.routing import APIRouter
6
  import logging
7
  import io
8
  import zipfile
9
- import json
10
  import os
11
  from httpx import AsyncClient
12
  from pydantic import BaseModel
@@ -17,8 +16,8 @@ import re
17
  import tempfile
18
  from lxml import etree
19
  from bs4 import BeautifulSoup
20
- from fastapi import Depends, BackgroundTasks, HTTPException, Request
21
- from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
22
  from fastapi.responses import StreamingResponse
23
  from litellm.router import Router
24
 
@@ -99,15 +98,20 @@ def get_docx_archive(url: str) -> zipfile.ZipFile:
99
  raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
100
 
101
 
102
- def parse_document_xml(docx_zip: zipfile.ZipFile) -> etree._ElementTree:
103
- """Parse le document.xml principal"""
104
- xml_bytes = docx_zip.read('word/document.xml')
105
- parser = etree.XMLParser(remove_blank_text=True)
106
- return etree.fromstring(xml_bytes, parser=parser)
107
 
 
 
 
 
 
 
 
 
108
 
109
- def clean_document_xml(root: etree._Element) -> None:
110
- """Nettoie le XML en modifiant l'arbre directement"""
111
  # Suppression des balises <w:del> et leur contenu
112
  for del_elem in root.xpath('//w:del', namespaces=NSMAP):
113
  parent = del_elem.getparent()
@@ -117,11 +121,12 @@ def clean_document_xml(root: etree._Element) -> None:
117
  # Désencapsulation des balises <w:ins>
118
  for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
119
  parent = ins_elem.getparent()
120
- index = parent.index(ins_elem)
121
- for child in ins_elem.iterchildren():
122
- parent.insert(index, child)
123
- index += 1
124
- parent.remove(ins_elem)
 
125
 
126
  # Nettoyage des commentaires
127
  for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
@@ -130,20 +135,18 @@ def clean_document_xml(root: etree._Element) -> None:
130
  if parent is not None:
131
  parent.remove(elem)
132
 
133
-
134
- def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> io.BytesIO:
135
- """Crée un nouveau docx avec le XML modifié"""
136
  output = io.BytesIO()
137
 
138
  with zipfile.ZipFile(output, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
139
  # Copier tous les fichiers non modifiés
140
- for file in original_zip.infolist():
141
- if file.filename != 'word/document.xml':
142
- new_zip.writestr(file, original_zip.read(file.filename))
143
 
144
  # Ajouter le document.xml modifié
145
  xml_str = etree.tostring(
146
- modified_root,
147
  xml_declaration=True,
148
  encoding='UTF-8',
149
  pretty_print=True
@@ -156,10 +159,7 @@ def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._El
156
 
157
  def docx_to_txt(doc_id: str, url: str) -> str:
158
  docx_zip = get_docx_archive(url)
159
- root = parse_document_xml(docx_zip)
160
- clean_document_xml(root)
161
-
162
- modified_bytes = create_modified_docx(docx_zip, root)
163
 
164
  final_bytes = convert_file(
165
  modified_bytes, f"{doc_id}", "docx", "txt")
@@ -278,32 +278,17 @@ def download_tdocs(req: DocDownloadRequest):
278
  try:
279
  text_lines = docx_to_txt(doc_id, doc_url)
280
  content_bytes = "\n".join(text_lines).encode("utf-8")
281
- return True, content_bytes
282
  except Exception as e:
283
  logging.warning(
284
  f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
285
  error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
286
  "utf-8")
287
- return False, error_message
288
 
289
  for doc in req.documents:
290
- success, content = _process_single_document(doc.document, doc.url)
291
  documents_content[doc.document] = content
292
- if not success:
293
- failed_documents.append(doc.doc_id)
294
-
295
- # sanity check to ensure all requested documents are accounted for, adding error messages for any missing ones
296
- for requested_doc_id in document_ids:
297
- if requested_doc_id not in documents_content:
298
- error_msg = (
299
- f"Failed to retrieve or process document '{requested_doc_id}'. "
300
- ).encode("utf-8")
301
-
302
- documents_content[requested_doc_id] = error_msg
303
- logging.warning(
304
- f"Document '{requested_doc_id}' was requested but not found or processed.")
305
- if requested_doc_id not in failed_documents:
306
- failed_documents.append(requested_doc_id)
307
 
308
  zip_buffer = io.BytesIO()
309
  with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
 
6
  import logging
7
  import io
8
  import zipfile
 
9
  import os
10
  from httpx import AsyncClient
11
  from pydantic import BaseModel
 
16
  import tempfile
17
  from lxml import etree
18
  from bs4 import BeautifulSoup
19
+ from fastapi import Depends, HTTPException
20
+ from dependencies import get_http_client, get_llm_router
21
  from fastapi.responses import StreamingResponse
22
  from litellm.router import Router
23
 
 
98
  raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
99
 
100
 
101
+ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
102
+ """
103
+ Applique les révisions des .docx avant de retourner le contenu
104
+ """
 
105
 
106
+ try:
107
+ xml_bytes = docx_zip.read('word/document.xml')
108
+ except KeyError:
109
+ raise FileNotFoundError(
110
+ "word/document.xml not found in the DOCX archive.")
111
+
112
+ parser = etree.XMLParser(remove_blank_text=True)
113
+ root = etree.fromstring(xml_bytes, parser=parser)
114
 
 
 
115
  # Suppression des balises <w:del> et leur contenu
116
  for del_elem in root.xpath('//w:del', namespaces=NSMAP):
117
  parent = del_elem.getparent()
 
121
  # Désencapsulation des balises <w:ins>
122
  for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
123
  parent = ins_elem.getparent()
124
+ if parent is not None:
125
+ index = parent.index(ins_elem)
126
+ for child in ins_elem.iterchildren():
127
+ parent.insert(index, child)
128
+ index += 1
129
+ parent.remove(ins_elem)
130
 
131
  # Nettoyage des commentaires
132
  for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
 
135
  if parent is not None:
136
  parent.remove(elem)
137
 
138
+ # 3. Create a new docx with the modified XML
 
 
139
  output = io.BytesIO()
140
 
141
  with zipfile.ZipFile(output, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
142
  # Copier tous les fichiers non modifiés
143
+ for file_info in docx_zip.infolist():
144
+ if file_info.filename != 'word/document.xml':
145
+ new_zip.writestr(file_info, docx_zip.read(file_info.filename))
146
 
147
  # Ajouter le document.xml modifié
148
  xml_str = etree.tostring(
149
+ root,
150
  xml_declaration=True,
151
  encoding='UTF-8',
152
  pretty_print=True
 
159
 
160
  def docx_to_txt(doc_id: str, url: str) -> str:
161
  docx_zip = get_docx_archive(url)
162
+ modified_bytes = apply_docx_revisions(docx_zip)
 
 
 
163
 
164
  final_bytes = convert_file(
165
  modified_bytes, f"{doc_id}", "docx", "txt")
 
278
  try:
279
  text_lines = docx_to_txt(doc_id, doc_url)
280
  content_bytes = "\n".join(text_lines).encode("utf-8")
281
+ return content_bytes
282
  except Exception as e:
283
  logging.warning(
284
  f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
285
  error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
286
  "utf-8")
287
+ return error_message
288
 
289
  for doc in req.documents:
290
+ content = _process_single_document(doc.document, doc.url)
291
  documents_content[doc.document] = content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  zip_buffer = io.BytesIO()
294
  with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
api/solutions.py CHANGED
@@ -1,7 +1,6 @@
1
  import asyncio
2
  import json
3
- import logging
4
- from fastapi import APIRouter, Depends, HTTPException, Response
5
  from httpx import AsyncClient
6
  from jinja2 import Environment, TemplateNotFound
7
  from litellm.router import Router
 
1
  import asyncio
2
  import json
3
+ from fastapi import APIRouter, Depends
 
4
  from httpx import AsyncClient
5
  from jinja2 import Environment, TemplateNotFound
6
  from litellm.router import Router