Lucas ARRIESSE committed
Commit 5f1cdfa · 1 Parent(s): d2dc29e

Use single method for retrieving TDocs + prepare code to refine

Files changed (1)
  1. api/docs.py +62 -69
api/docs.py CHANGED
@@ -1,4 +1,6 @@
 import asyncio
+from pathlib import Path
+import traceback
 from typing import Dict, List, Literal, Tuple
 from fastapi.routing import APIRouter
 import logging
@@ -12,6 +14,7 @@ import requests
 import subprocess
 import pandas as pd
 import re
+import tempfile
 from lxml import etree
 from bs4 import BeautifulSoup
 from fastapi import Depends, BackgroundTasks, HTTPException, Request
@@ -33,14 +36,54 @@ NSMAP = {
 # ================================== Converting of files to .txt ====================================
 
 
+def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
+    """
+    Converts the given file bytes using LibreOffice headless to the specified file type.
+
+    Args:
+        contents: File contents
+        filename: File base name WITHOUT THE EXTENSION
+        input_ext: Input extension (WITHOUT THE DOT)
+        output_ext: Output extension (WITHOUT THE DOT)
+        filter: The conversion filter to use.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        dir_path = Path(tmpdir)
+        input_file_path = dir_path / f"{filename}.{input_ext}"
+        output_file_path = dir_path / f"{filename}.{output_ext}"
+
+        # write the memory contents to the input file
+        with open(input_file_path, "wb") as in_file:
+            in_file.write(contents.read())
+
+        out_bytes = io.BytesIO()
+
+        # convert using libreoffice
+        subprocess.run([
+            "libreoffice",
+            "--headless",
+            "--convert-to", f"{output_ext}:{filter}" if filter else output_ext,
+            "--outdir", tmpdir,
+            input_file_path
+        ], check=True)
+
+        with open(output_file_path, mode="rb") as out:
+            out_bytes.write(out.read())
+
+        out_bytes.seek(0)
+        return out_bytes
+
+
 def get_docx_archive(url: str) -> zipfile.ZipFile:
     """Fetches the docx from the URL and returns it as a ZipFile object"""
     if not url.endswith("zip"):
         raise ValueError("URL must point to a ZIP file")
+
     doc_id = os.path.splitext(os.path.basename(url))[0]
     resp = requests.get(url, verify=False, headers={
         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     })
+
     resp.raise_for_status()
 
     with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
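
Note: a minimal usage sketch of the new convert_file helper (not part of the commit; file names are placeholders), assuming the libreoffice binary is available on PATH:

    import io

    # read a local .doc into memory, then convert it to .docx via LibreOffice
    with open("sample.doc", "rb") as f:
        doc_stream = io.BytesIO(f.read())

    docx_stream = convert_file(doc_stream, "sample", "doc", "docx")
    with open("sample.docx", "wb") as out:
        out.write(docx_stream.read())
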
@@ -49,28 +92,9 @@ def get_docx_archive(url: str) -> zipfile.ZipFile:
             docx_bytes = zf.read(file_name)
             return zipfile.ZipFile(io.BytesIO(docx_bytes))
         elif file_name.endswith(".doc"):
-            input_path = f"/tmp/{doc_id}.doc"
-            output_path = f"/tmp/{doc_id}.docx"
-            docx_bytes = zf.read(file_name)
-
-            with open(input_path, "wb") as f:
-                f.write(docx_bytes)
-
-            subprocess.run([
-                "libreoffice",
-                "--headless",
-                "--convert-to", "docx",
-                "--outdir", "/tmp",
-                input_path
-            ], check=True)
-
-            with open(output_path, "rb") as f:
-                docx_bytes = f.read()
-
-            os.remove(input_path)
-            os.remove(output_path)
-
-            return zipfile.ZipFile(io.BytesIO(docx_bytes))
+            in_bytes = io.BytesIO(zf.read(file_name))
+            docx_bytes = convert_file(in_bytes, doc_id, "doc", "docx")
+            return zipfile.ZipFile(docx_bytes)
 
     raise ValueError("No docx/doc file found in the archive")
 
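
Note: the .doc branch now delegates to convert_file instead of juggling hard-coded /tmp paths. A hedged sketch of the resulting call path (the URL is a placeholder):

    # fetch a TDoc archive and get the contained document as a docx ZipFile;
    # a .doc member is transparently converted to .docx first
    archive = get_docx_archive("https://example.invalid/S1-123456.zip")
    print(archive.namelist())  # e.g. ['word/document.xml', ...]
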
@@ -107,7 +131,7 @@ def clean_document_xml(root: etree._Element) -> None:
             parent.remove(elem)
 
 
-def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> bytes:
+def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> io.BytesIO:
     """Creates a new docx with the modified XML"""
     output = io.BytesIO()
 
@@ -127,33 +151,24 @@ def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._El
     new_zip.writestr('word/document.xml', xml_str)
 
     output.seek(0)
-    return output.getvalue()
+    return output
 
 
-def docx_to_txt(doc_id: str, url: str):
+def docx_to_txt(doc_id: str, url: str) -> List[str]:
     docx_zip = get_docx_archive(url)
     root = parse_document_xml(docx_zip)
     clean_document_xml(root)
+
     modified_bytes = create_modified_docx(docx_zip, root)
 
-    input_path = f"/tmp/{doc_id}_cleaned.docx"
-    output_path = f"/tmp/{doc_id}_cleaned.txt"
-    with open(input_path, "wb") as f:
-        f.write(modified_bytes)
+    final_bytes = convert_file(
+        modified_bytes, f"{doc_id}", "docx", "txt")
 
-    subprocess.run([
-        "libreoffice",
-        "--headless",
-        "--convert-to", "txt",
-        "--outdir", "/tmp",
-        input_path
-    ], check=True)
+    final_bytes_text = str(final_bytes.read(), encoding="utf-8")
 
-    with open(output_path, "r", encoding="utf-8") as f:
-        txt_data = [line.strip() for line in f if line.strip()]
+    txt_data = [line.strip()
+                for line in final_bytes_text.splitlines() if line.strip()]
 
-    os.remove(input_path)
-    os.remove(output_path)
     return txt_data
 
 
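
Note: docx_to_txt now runs the whole pipeline in memory (download, XML cleanup, txt conversion) and returns the stripped non-empty lines. A hypothetical caller, mirroring how gen_reqs uses it below (ID and URL are placeholders):

    lines = docx_to_txt("S1-123456", "https://example.invalid/S1-123456.zip")
    full_text = "\n".join(lines)
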
@@ -255,27 +270,6 @@ def download_tdocs(req: DocDownloadRequest):
 
     logging.info(f"Downloading TDocs: {document_ids}")
 
-    # Retrieve all doc URLs to download
-    doc_urls_req = requests.post(DOC_FINDER_BASE_URL + "find/batch",
-                                 headers={
-                                     "Content-Type": "application/json"
-                                 },
-                                 data=json.dumps({
-                                     "doc_ids": document_ids
-                                 }),
-                                 verify=False)
-
-    doc_urls_req.raise_for_status()
-    doc_urls = doc_urls_req.json()
-
-    # early check to bail out if no doc is available.
-    if len(doc_urls["results"]) == 0:
-        logging.warning(
-            f"Got no URL results for docs {document_ids}. 3GPP index may not be up to date")
-
-        raise HTTPException(
-            status_code=501, detail="Got no URL results for docs {documents}. 3GPP index may not be up to date")
-
     documents_content: Dict[str, bytes] = {}
     failed_documents: List[str] = []
 
@@ -292,18 +286,17 @@ def download_tdocs(req: DocDownloadRequest):
                 "utf-8")
             return False, error_message
 
-    for doc_id, doc_url in doc_urls["results"].items():
-        success, content = _process_single_document(doc_id, doc_url)
-        documents_content[doc_id] = content
+    for doc in req.documents:
+        success, content = _process_single_document(doc.document, doc.url)
+        documents_content[doc.document] = content
         if not success:
-            failed_documents.append(doc_id)
+            failed_documents.append(doc.document)
 
     # sanity check to ensure all requested documents are accounted for, adding error messages for any missing ones
     for requested_doc_id in document_ids:
         if requested_doc_id not in documents_content:
             error_msg = (
                 f"Failed to retrieve or process document '{requested_doc_id}'. "
-                "The 3GPP index may not be up to date, or the document might be unavailable."
             ).encode("utf-8")
 
             documents_content[requested_doc_id] = error_msg
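
Note: the doc-finder batch lookup is gone; each requested document now carries its own URL in the request body. A hypothetical reconstruction of the request models, with field names inferred from the loop above (the real definitions live elsewhere in the codebase):

    from typing import List
    from pydantic import BaseModel

    class DocumentRef(BaseModel):  # hypothetical name
        document: str              # TDoc identifier, e.g. "S1-123456"
        url: str                   # direct download URL for the document archive

    class DocDownloadRequest(BaseModel):
        documents: List[DocumentRef]
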
@@ -361,9 +354,9 @@ async def gen_reqs(req: ExtractRequirementsRequest, llm_router: Router = Depends
     try:
         full = "\n".join(docx_to_txt(doc_id, url))
     except Exception as e:
-        logging.error(
-            f"Failed to process document {doc_id}", e, stack_info=True)
-        return [DocRequirements(document=doc_id, context="Error LLM", requirements=[])]
+        fmt = "".join(traceback.format_exception(e))
+        logging.error(f"Failed to process doc {doc_id}: {fmt}")
+        return [DocRequirements(document=doc_id, context="Failed to process document", requirements=[])]
 
     try:
         await concurrency_sema.acquire()
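
Note: traceback.format_exception(e) with a single exception argument requires Python >= 3.10 and returns the formatted traceback as a list of strings. A minimal standalone sketch of the new logging pattern:

    import logging
    import traceback

    try:
        1 / 0
    except Exception as e:
        # join the traceback lines into one string for the log record
        fmt = "".join(traceback.format_exception(e))
        logging.error(f"Failed to process doc X: {fmt}")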
 