Lucas ARRIESSE
committed on
Commit
·
a83bff5
1
Parent(s):
f6da275
Fix TDoc download
Browse files- api/docs.py +67 -38
- dependencies.py +1 -0
api/docs.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import asyncio
|
2 |
-
from typing import Literal
|
3 |
from fastapi.routing import APIRouter
|
4 |
import logging
|
5 |
import string
|
@@ -19,7 +19,7 @@ from bs4 import BeautifulSoup
|
|
19 |
from nltk.corpus import stopwords
|
20 |
from nltk.stem import WordNetLemmatizer
|
21 |
from fastapi import Depends, BackgroundTasks, HTTPException, Request
|
22 |
-
from dependencies import get_llm_router
|
23 |
from fastapi.responses import StreamingResponse
|
24 |
from litellm.router import Router
|
25 |
|
@@ -253,52 +253,81 @@ def get_change_request_dataframe(req: DataRequest):
|
|
253 |
@router.post("/download_tdocs")
|
254 |
def download_tdocs(req: DownloadRequest):
|
255 |
"""Download the specified TDocs and zips them in a single archive"""
|
256 |
-
documents = req.documents
|
257 |
-
|
258 |
-
logging.info(f"Downloading TDocs: {documents}")
|
259 |
-
|
260 |
-
def process_document(doc: str):
|
261 |
-
doc_id = doc
|
262 |
-
url = requests.post(
|
263 |
-
'https://organizedprogrammers-3gppdocfinder.hf.space/find',
|
264 |
-
headers={"Content-Type": "application/json"},
|
265 |
-
data=json.dumps({"doc_id": doc_id}),
|
266 |
-
verify=False
|
267 |
-
)
|
268 |
-
logging.info(
|
269 |
-
f"Retrieving URL for doc {doc_id} returned http status {url.status_code}")
|
270 |
-
url = url.json()['url']
|
271 |
-
logging.debug(f"Doc URL for {doc_id} is {url}")
|
272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
try:
|
274 |
-
|
|
|
|
|
275 |
except Exception as e:
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
zip_buffer = io.BytesIO()
|
294 |
with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
|
295 |
-
for doc_id,
|
296 |
-
|
|
|
297 |
|
298 |
zip_buffer.seek(0)
|
|
|
299 |
return StreamingResponse(
|
300 |
zip_buffer,
|
301 |
-
media_type="application/zip"
|
|
|
302 |
)
|
303 |
|
304 |
|
|
|
1 |
import asyncio
|
2 |
+
from typing import Dict, List, Literal, Tuple
|
3 |
from fastapi.routing import APIRouter
|
4 |
import logging
|
5 |
import string
|
|
|
19 |
from nltk.corpus import stopwords
|
20 |
from nltk.stem import WordNetLemmatizer
|
21 |
from fastapi import Depends, BackgroundTasks, HTTPException, Request
|
22 |
+
from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
|
23 |
from fastapi.responses import StreamingResponse
|
24 |
from litellm.router import Router
|
25 |
|
|
|
253 |
@router.post("/download_tdocs")
def download_tdocs(req: DownloadRequest):
    """Download the requested TDocs as text and return them in one zip archive.

    The document IDs in ``req.documents`` are resolved to URLs with a single
    batch request to the doc finder service. Each resolved document is
    converted to text with ``docx_to_txt`` and stored in the archive as
    ``<doc_id>.txt``. A document that cannot be resolved or converted still
    gets an entry in the archive; that entry contains an error message
    instead of the document text, so the caller can see which ones failed.

    Args:
        req: Download request whose ``documents`` attribute holds the TDoc
            IDs to fetch.

    Returns:
        A ``StreamingResponse`` carrying the zip archive, sent as the
        attachment ``tdocs.zip``.

    Raises:
        HTTPException: With status 501 when the doc finder returns no URL
            for any of the requested documents.
        requests.HTTPError: When the doc finder answers with an HTTP error
            status (raised by ``raise_for_status``).
    """
    # Document IDs to download
    document_ids = req.documents

    logging.info(f"Downloading TDocs: {document_ids}")

    # Retrieve all doc URLs to download in a single batch request.
    # NOTE(review): verify=False disables TLS certificate verification, and
    # no timeout is set so a stalled finder blocks this worker indefinitely.
    # Both are kept as-is here; confirm whether they are intentional.
    doc_urls_req = requests.post(
        DOC_FINDER_BASE_URL + "find/batch",
        headers={"Content-Type": "application/json"},
        data=json.dumps({"doc_ids": document_ids}),
        verify=False,
    )

    doc_urls_req.raise_for_status()
    doc_urls = doc_urls_req.json()

    # A missing or null "results" entry is treated like an empty result set
    # so the caller gets the explicit error below instead of a KeyError.
    resolved_urls = doc_urls.get("results") or {}

    # Early check to bail out if no doc is available.
    if not resolved_urls:
        no_result_msg = (
            f"Got no URL results for docs {document_ids}. "
            "3GPP index may not be up to date"
        )
        logging.warning(no_result_msg)
        # Status code 501 is kept for backward compatibility with clients.
        raise HTTPException(status_code=501, detail=no_result_msg)

    documents_content: Dict[str, bytes] = {}
    failed_documents: List[str] = []

    def _process_single_document(doc_id: str, doc_url: str) -> Tuple[bool, bytes]:
        """Convert one document to text; return (success, content or error bytes)."""
        try:
            text_lines = docx_to_txt(doc_id, doc_url)
            return True, "\n".join(text_lines).encode("utf-8")
        except Exception as e:
            # Deliberately broad: one broken document must not abort the
            # whole archive, so the failure is logged and reported in-band.
            logging.warning(
                f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
            error_message = f"Document '{doc_id}' text extraction failed: {e}"
            return False, error_message.encode("utf-8")

    for doc_id, doc_url in resolved_urls.items():
        success, content = _process_single_document(doc_id, doc_url)
        documents_content[doc_id] = content
        if not success:
            failed_documents.append(doc_id)

    # Sanity check to ensure all requested documents are accounted for,
    # adding error messages for any the finder did not return.
    for requested_doc_id in document_ids:
        if requested_doc_id in documents_content:
            continue

        documents_content[requested_doc_id] = (
            f"Failed to retrieve or process document '{requested_doc_id}'. "
            "The 3GPP index may not be up to date, or the document might be unavailable."
        ).encode("utf-8")
        logging.warning(
            f"Document '{requested_doc_id}' was requested but not found or processed.")
        if requested_doc_id not in failed_documents:
            failed_documents.append(requested_doc_id)

    if failed_documents:
        logging.warning(
            f"TDocs that could not be retrieved or converted: {failed_documents}")

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
        for doc_id, content_bytes in documents_content.items():
            # Doc IDs come from the request / finder response; replace any
            # character that could form a path inside the archive so an
            # entry can never escape the extraction directory.
            safe_filename = "".join(
                ch if ch.isalnum() or ch in "._-" else "_" for ch in str(doc_id)
            ) + ".txt"
            zip_file.writestr(safe_filename, content_bytes)

    # Rewind so the response streams the archive from its first byte.
    zip_buffer.seek(0)

    return StreamingResponse(
        zip_buffer,
        media_type="application/zip",
        headers={"Content-Disposition": "attachment; filename=tdocs.zip"}
    )
|
332 |
|
333 |
|
dependencies.py
CHANGED
@@ -9,6 +9,7 @@ from jinja2 import Environment, StrictUndefined, FileSystemLoader
|
|
9 |
|
10 |
|
11 |
INSIGHT_FINDER_BASE_URL = "https://organizedprogrammers-insight-finder.hf.space/"
|
|
|
12 |
|
13 |
def init_dependencies():
|
14 |
"""Initialize the application global dependencies"""
|
|
|
9 |
|
10 |
|
11 |
INSIGHT_FINDER_BASE_URL = "https://organizedprogrammers-insight-finder.hf.space/"
|
12 |
+
DOC_FINDER_BASE_URL = "https://organizedprogrammers-docfinder.hf.space/"
|
13 |
|
14 |
def init_dependencies():
|
15 |
"""Initialize the application global dependencies"""
|