Lucas ARRIESSE commited on
Commit
2c86f91
·
1 Parent(s): 2ba1434

Use kreuzberg for document extraction + rate limit downloads

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -1
  2. api/docs.py +50 -37
  3. app.py +1 -1
  4. requirements.txt +2 -1
Dockerfile CHANGED
@@ -1,7 +1,7 @@
1
  FROM python:3.11.3
2
 
3
  RUN apt-get update && \
4
- apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress && \
5
  apt-get clean && rm -rf /var/lib/apt/lists/*
6
 
7
  RUN useradd -m -u 1000 user
 
1
  FROM python:3.11.3
2
 
3
  RUN apt-get update && \
4
+ apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress pandoc && \
5
  apt-get clean && rm -rf /var/lib/apt/lists/*
6
 
7
  RUN useradd -m -u 1000 user
api/docs.py CHANGED
@@ -1,4 +1,5 @@
1
  import asyncio
 
2
  from pathlib import Path
3
  import traceback
4
  from typing import Literal, Tuple
@@ -19,6 +20,7 @@ from fastapi import Depends, HTTPException
19
  from dependencies import get_http_client, get_llm_router
20
  from fastapi.responses import StreamingResponse
21
  from litellm.router import Router
 
22
 
23
  from schemas import DataRequest, DataResponse, DocRequirements, DocDownloadRequest, MeetingsRequest, MeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
24
 
@@ -33,8 +35,11 @@ NSMAP = {
33
 
34
  # ================================== Converting of files to .txt ====================================
35
 
 
 
 
36
  # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
37
- CONVERSION_MUTEX = asyncio.Semaphore(1)
38
 
39
 
40
  async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
@@ -50,7 +55,7 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
50
  filter: The conversion filter to use.
51
  """
52
 
53
- await CONVERSION_MUTEX.acquire()
54
 
55
  with tempfile.TemporaryDirectory() as tmpdir:
56
  dir_path = Path(tmpdir)
@@ -91,7 +96,7 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
91
  stderr=stderr
92
  )
93
 
94
- CONVERSION_MUTEX.release()
95
 
96
  with open(output_file_path, mode="rb") as out:
97
  out_bytes.write(out.read())
@@ -100,31 +105,35 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
100
  return out_bytes
101
 
102
 
 
 
 
103
  async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
104
  """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
105
 
106
- if not url.endswith("zip"):
107
- raise ValueError("URL doit pointer vers un fichier ZIP")
 
108
 
109
- doc_id = os.path.splitext(os.path.basename(url))[0]
110
- resp = await client.get(url, headers={
111
- "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
112
- })
113
 
114
- resp.raise_for_status()
115
 
116
- with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
117
- # there should be a single file per file
118
- for entry in zf.infolist():
119
- if entry.is_dir():
120
- continue
121
 
122
- file_name = entry.filename
123
- root, ext = os.path.splitext(file_name)
124
- doc_bytes = zf.read(file_name)
125
- return (root, ext.lower(), io.BytesIO(doc_bytes))
126
 
127
- raise ValueError("Aucun fichier trouvé dans l'archive")
128
 
129
 
130
  def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
@@ -189,6 +198,13 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
189
  return output
190
 
191
 
 
 
 
 
 
 
 
192
  async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
193
  """
194
  Télécharge le TDoc spécifié et le convertit en texte.
@@ -197,31 +213,28 @@ async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
197
  # Grab the document archive
198
  filename, ext, bytes = await get_doc_archive(url, client)
199
 
200
- final_bytes: io.BytesIO = None
201
  if ext == ".doc":
202
  logging.debug(f"Converting {filename} .doc --> .docx")
203
  docx_bytes = await convert_file(bytes, doc_id, "doc", "docx")
204
- logging.debug(f"Converting {filename} .docx --> .txt")
205
- final_bytes = await convert_file(docx_bytes, f"{doc_id}", "docx", "txt")
206
  elif ext == ".docx":
207
- logging.debug(f"Updating .docx revisions")
 
208
  applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
209
- logging.debug(f"Converting {filename} .docx --> .txt")
210
- final_bytes = await convert_file(applied_revision, f"{doc_id}", "docx", "txt")
211
- elif ext == ".pdf":
212
- logging.debug(f"Converting {filename} .pdf --> .txt")
213
- final_bytes = await convert_file(bytes, doc_id, "pdf", "txt")
214
- elif ext == ".pptx":
215
- logging.debug(f"Converting {filename} .pptx --> .pdf")
216
- pdf_bytes = await convert_file(bytes, doc_id, "pptx", "pdf")
217
- logging.debug(f"Converting {filename} .pdf --> .txt")
218
- final_bytes = await convert_file(pdf_bytes, doc_id, "pdf", "txt")
219
  else:
220
- raise Exception(f"Unsupported file type: {ext}, filename: {filename}")
 
 
 
 
 
221
 
222
- text_from_bytes = str(final_bytes.read(), encoding="utf-8")
223
  txt_data = [line.strip()
224
- for line in text_from_bytes.splitlines() if line.strip()]
225
 
226
  return txt_data
227
 
 
1
  import asyncio
2
+ from aiolimiter import AsyncLimiter
3
  from pathlib import Path
4
  import traceback
5
  from typing import Literal, Tuple
 
20
  from dependencies import get_http_client, get_llm_router
21
  from fastapi.responses import StreamingResponse
22
  from litellm.router import Router
23
+ from kreuzberg import ExtractionConfig, extract_bytes
24
 
25
  from schemas import DataRequest, DataResponse, DocRequirements, DocDownloadRequest, MeetingsRequest, MeetingsResponse, ExtractRequirementsRequest, ExtractRequirementsResponse
26
 
 
35
 
36
  # ================================== Converting of files to .txt ====================================
37
 
38
+ KREUZBERG_CONFIG: ExtractionConfig = ExtractionConfig(
39
+ force_ocr=False, ocr_backend=None)
40
+
41
  # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
42
+ LO_CONVERSION_MUTEX = asyncio.Semaphore(1)
43
 
44
 
45
  async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
 
55
  filter: The conversion filter to use.
56
  """
57
 
58
+ await LO_CONVERSION_MUTEX.acquire()
59
 
60
  with tempfile.TemporaryDirectory() as tmpdir:
61
  dir_path = Path(tmpdir)
 
96
  stderr=stderr
97
  )
98
 
99
+ LO_CONVERSION_MUTEX.release()
100
 
101
  with open(output_file_path, mode="rb") as out:
102
  out_bytes.write(out.read())
 
105
  return out_bytes
106
 
107
 
108
+ # Rate limit of FTP downloads per minute
109
+ FTP_DOWNLOAD_RATE_LIMITER = AsyncLimiter(max_rate=120, time_period=60)
110
+
111
  async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
112
  """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
113
 
114
+ async with FTP_DOWNLOAD_RATE_LIMITER:
115
+ if not url.endswith("zip"):
116
+ raise ValueError("URL doit pointer vers un fichier ZIP")
117
 
118
+ doc_id = os.path.splitext(os.path.basename(url))[0]
119
+ resp = await client.get(url, headers={
120
+ "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
121
+ })
122
 
123
+ resp.raise_for_status()
124
 
125
+ with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
126
+ # there should be a single file per file
127
+ for entry in zf.infolist():
128
+ if entry.is_dir():
129
+ continue
130
 
131
+ file_name = entry.filename
132
+ root, ext = os.path.splitext(file_name)
133
+ doc_bytes = zf.read(file_name)
134
+ return (root, ext.lower(), io.BytesIO(doc_bytes))
135
 
136
+ raise ValueError("Aucun fichier trouvé dans l'archive")
137
 
138
 
139
  def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
 
198
  return output
199
 
200
 
201
+ FORMAT_MIME_TYPES = {
202
+ ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
203
+ ".pdf": "application/pdf",
204
+ ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
205
+ }
206
+
207
+
208
  async def doc_to_txt(doc_id: str, url: str, client: AsyncClient) -> str:
209
  """
210
  Télécharge le TDoc spécifié et le convertit en texte.
 
213
  # Grab the document archive
214
  filename, ext, bytes = await get_doc_archive(url, client)
215
 
216
+ final_text: str = None
217
  if ext == ".doc":
218
  logging.debug(f"Converting {filename} .doc --> .docx")
219
  docx_bytes = await convert_file(bytes, doc_id, "doc", "docx")
220
+ extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
221
+ final_text = extracted_data.content
222
  elif ext == ".docx":
223
+ # Applying doc revisions to docx files (especially for pCR / draftCR files)
224
+ logging.debug(f"Updating .docx revisions for {doc_id}.")
225
  applied_revision = apply_docx_revisions(zipfile.ZipFile(bytes))
226
+ extracted_data = await extract_bytes(applied_revision.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
227
+ final_text = extracted_data.content
 
 
 
 
 
 
 
 
228
  else:
229
+ if ext in FORMAT_MIME_TYPES: # file extension is supported
230
+ extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
231
+ final_text = extracted_data.content
232
+ else:
233
+ raise Exception(
234
+ f"Unsupported file type: {ext}, filename: {filename}")
235
 
 
236
  txt_data = [line.strip()
237
+ for line in final_text.splitlines() if line.strip()]
238
 
239
  return txt_data
240
 
app.py CHANGED
@@ -21,7 +21,7 @@ logging.basicConfig(
21
  datefmt='%Y-%m-%d %H:%M:%S'
22
  )
23
 
24
- logging.info("DEBUG logging is disabled. Set `DEBUG_LOG` env var to 1 to enable debug logging.")
25
 
26
  # Initialize global dependencies
27
  init_dependencies()
 
21
  datefmt='%Y-%m-%d %H:%M:%S'
22
  )
23
 
24
+ logging.info(f"Set `DEBUG_LOG` env var to 1 to enable debug logging.")
25
 
26
  # Initialize global dependencies
27
  init_dependencies()
requirements.txt CHANGED
@@ -10,4 +10,5 @@ openpyxl
10
  beautifulsoup4
11
  aiolimiter
12
  httpx
13
- Jinja2
 
 
10
  beautifulsoup4
11
  aiolimiter
12
  httpx
13
+ Jinja2
14
+ kreuzberg