om4r932 committed on
Commit
03e159d
·
1 Parent(s): 497a265

Add download TDocs

Browse files
Files changed (3) hide show
  1. app.py +49 -2
  2. index.html +51 -14
  3. schemas.py +10 -4
app.py CHANGED
@@ -1,8 +1,10 @@
 
 
1
  import traceback
2
- from fastapi import FastAPI, BackgroundTasks
3
  from schemas import *
4
  from fastapi.middleware.cors import CORSMiddleware
5
- from fastapi.responses import FileResponse
6
  from litellm.router import Router
7
  from aiolimiter import AsyncLimiter
8
  import pandas as pd
@@ -238,6 +240,51 @@ def get_change_request_dataframe(req: DataRequest):
238
  df = filtered_df.fillna("")
239
  return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  @app.post("/generate_requirements", response_model=RequirementsResponse)
242
  async def gen_reqs(req: RequirementsRequest, background_tasks: BackgroundTasks):
243
  documents = req.documents
 
1
+ from concurrent.futures import ThreadPoolExecutor, as_completed
2
+ import json
3
  import traceback
4
+ from fastapi import FastAPI, BackgroundTasks, HTTPException
5
  from schemas import *
6
  from fastapi.middleware.cors import CORSMiddleware
7
+ from fastapi.responses import FileResponse, StreamingResponse
8
  from litellm.router import Router
9
  from aiolimiter import AsyncLimiter
10
  import pandas as pd
 
240
  df = filtered_df.fillna("")
241
  return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
242
 
243
@app.post("/download_tdocs")
def download_tdocs(req: DownloadRequest):
    """Bundle the extracted text of the requested TDocs into a ZIP archive.

    For every identifier in ``req.documents`` the document-finder service is
    asked for the document URL, the text is extracted with ``docx_to_txt`` and
    stored in the archive as ``<doc_id>.txt``.

    Failures are reported inside the archive rather than aborting the request:
    a failed text extraction yields a file containing the error message, and
    any other per-document failure (finder unreachable, HTTP error, missing
    ``url`` key, ...) yields a file containing ``Erreur``.

    Args:
        req: Request body; only ``req.documents`` (list of TDoc ids) is read.

    Returns:
        A ``StreamingResponse`` with media type ``application/zip``.
    """
    finder_url = 'https://organizedprogrammers-3gppdocfinder.hf.space/find'
    # requests waits forever when no timeout is given; bound the lookup so a
    # stalled finder service cannot block this worker indefinitely.
    finder_timeout = 60

    def archive_name(doc_id: str) -> str:
        # Ids come straight from the request body: neutralise path separators
        # so an entry can never point outside the extraction directory.
        safe_id = doc_id.replace("/", "_").replace("\\", "_")
        return f'{safe_id}.txt'

    def process_document(doc_id: str):
        """Resolve one TDoc id to its URL and return ``(doc_id, utf-8 text)``."""
        response = requests.post(
            finder_url,
            headers={"Content-Type": "application/json"},
            data=json.dumps({"doc_id": doc_id}),
            # NOTE(review): TLS verification is disabled on an HTTPS call;
            # kept as in the original, confirm whether it is still required.
            verify=False,
            timeout=finder_timeout,
        )
        print(response.status_code)
        # Surface HTTP errors explicitly instead of a KeyError on ['url'].
        response.raise_for_status()
        doc_url = response.json()['url']
        print(doc_url)
        try:
            txt = "\n".join(docx_to_txt(doc_id, doc_url))
        except Exception as e:
            # Best effort: keep the archive entry and explain the failure.
            txt = f"Document {doc_id} text extraction failed: {e}"
        return doc_id, txt.encode("utf-8")

    def process_batch(batch):
        """Process every id in *batch*; map each id to its file content."""
        results = {}
        for doc in batch:
            try:
                doc_id, file_bytes = process_document(doc)
                results[doc_id] = file_bytes
            except Exception as e:
                # One broken document must not fail the whole download.
                traceback.print_exception(e)
                results[doc] = b"Erreur"
        return results

    documents_bytes = process_batch(req.documents)

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
        for doc_id, txt_data in documents_bytes.items():
            zip_file.writestr(archive_name(doc_id), txt_data)

    # Rewind so the response streams the archive from its first byte.
    zip_buffer.seek(0)
    return StreamingResponse(
        zip_buffer,
        media_type="application/zip",
        # Lets direct API consumers save the payload as a file; the web UI
        # sets its own download name on the client side.
        headers={"Content-Disposition": 'attachment; filename="tdocs.zip"'},
    )
287
+
288
  @app.post("/generate_requirements", response_model=RequirementsResponse)
289
  async def gen_reqs(req: RequirementsRequest, background_tasks: BackgroundTasks):
290
  documents = req.documents
index.html CHANGED
@@ -52,7 +52,7 @@
52
  </div>
53
  </div>
54
 
55
- <div class="flex justify-center mt-12 min-h-screen hidden" id="queryReqForm">
56
  <div class="w-full max-w-md">
57
  <div class="grid grid-cols-1 gap-4">
58
  <textarea placeholder="Enter your problem description here ..." class="w-full mx-auto px-4 py-2 border rounded" id="problemDescription"></textarea>
@@ -69,7 +69,7 @@
69
  </center>
70
 
71
  <!-- Tableau des données -->
72
- <div class="max-h-[65vh] overflow-y-auto" id="dataFrameDiv">
73
  <table class="table table-zebra w-full" id="dataFrame">
74
  <thead class="sticky top-0 bg-base-200 z-10">
75
  <tr class="bg-base-200">
@@ -90,7 +90,8 @@
90
  <div id="buttons">
91
  <p id="reqStatus" class="mt-6 hidden">Requirements extracted</p>
92
  <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
93
- <button class="btn mt-6" id="getReqs">Get Requirements</button>
 
94
  <button class="btn mt-6 hidden" id="searchReq">Query requirements</button>
95
  <button class="btn mt-6 hidden" id="categorizeReq">Categorize requirements</button>
96
  </div>
@@ -101,6 +102,34 @@
101
  <script>
102
  let requirements;
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  function getDataFrame(){
105
  document.getElementById("loadingBar").classList.remove("hidden");
106
  const wg = document.getElementById('workingGroupSelect').value;
@@ -126,8 +155,10 @@
126
  fetch("/get_dataframe", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"working_group": wg, "meeting": meeting})})
127
  .then(resp => resp.json())
128
  .then(data => {
129
- document.getElementById("filters").classList.remove("hidden")
130
  document.getElementById("loadingBar").classList.add("hidden");
 
 
131
  const dataframeBody = dataFrame.querySelector("tbody");
132
  dataframeBody.innerHTML = "";
133
  const setType = new Set();
@@ -216,7 +247,7 @@
216
  }
217
 
218
  function generateRequirements(){
219
- const bodyreq = tableToGenBody();
220
  document.getElementById("progressText").classList.remove('hidden');
221
  document.getElementById("progressText").innerHTML = "Generating requirements, please wait, it may take a while ...";
222
  document.getElementById("loadingBar").classList.remove("hidden");
@@ -224,19 +255,18 @@
224
  fetch("/generate_requirements", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"documents": bodyreq})})
225
  .then(resp => resp.json())
226
  .then(data => {
227
- requirements = [];
228
- data.requirements.forEach(obj => {
229
- obj.requirements.forEach(req => {
230
- requirements.push({"document": obj.document, "context": obj.context, "requirement": req})
231
- })
232
- })
233
-
234
  document.getElementById("loadingBar").classList.add("hidden");
235
  document.getElementById("progressText").classList.add("hidden");
236
  document.getElementById("reqStatus").classList.remove("hidden");
237
  document.getElementById("getReqs").classList.add("hidden");
238
  document.getElementById("searchReq").classList.remove("hidden");
239
  document.getElementById("categorizeReq").classList.remove("hidden");
 
 
 
 
 
 
240
  })
241
  }
242
 
@@ -247,6 +277,12 @@
247
  const dataFrame = document.getElementById("dataFrameDiv");
248
  const dataFrameHead = dataFrame.querySelector("thead");
249
  const dataFrameBody = dataFrame.querySelector("tbody");
 
 
 
 
 
 
250
 
251
  dataFrame.classList.remove("hidden");
252
 
@@ -270,9 +306,9 @@
270
  })
271
  }
272
 
273
- function tableToGenBody() {
274
  // columnsMap : { "NomHeaderDansTable": "nom_voulu", ... }
275
- let columnsMap = {"TDoc": "document", "URL": "url"};
276
  const headers = Array.from(dataFrame.querySelectorAll('thead th')).map(th => th.innerText.trim());
277
 
278
  // Indices des colonnes à extraire
@@ -302,6 +338,7 @@
302
  document.getElementById("workingGroupSelect").addEventListener('change', getMeetings)
303
  document.getElementById('getTDocs').addEventListener('click', getDataFrame)
304
  document.getElementById("getReqs").addEventListener("click", generateRequirements);
 
305
  document.getElementById("queryReq").addEventListener("click", queryRequirements)
306
  document.getElementById('searchReq').addEventListener('click', ()=>{
307
  document.getElementById('dataFrameForm').classList.add('hidden');
 
52
  </div>
53
  </div>
54
 
55
+ <div class="flex justify-center mt-12 min-h-[10vh] hidden" id="queryReqForm">
56
  <div class="w-full max-w-md">
57
  <div class="grid grid-cols-1 gap-4">
58
  <textarea placeholder="Enter your problem description here ..." class="w-full mx-auto px-4 py-2 border rounded" id="problemDescription"></textarea>
 
69
  </center>
70
 
71
  <!-- Tableau des données -->
72
+ <div class="max-h-[65vh] overflow-y-auto mt-12" id="dataFrameDiv">
73
  <table class="table table-zebra w-full" id="dataFrame">
74
  <thead class="sticky top-0 bg-base-200 z-10">
75
  <tr class="bg-base-200">
 
90
  <div id="buttons">
91
  <p id="reqStatus" class="mt-6 hidden">Requirements extracted</p>
92
  <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
93
+ <button class="btn mt-6 hidden" id="getReqs">Extract Requirements</button>
94
+ <button class="btn mt-6 hidden" id="downloadZip">Download TDocs</button>
95
  <button class="btn mt-6 hidden" id="searchReq">Query requirements</button>
96
  <button class="btn mt-6 hidden" id="categorizeReq">Categorize requirements</button>
97
  </div>
 
102
  <script>
103
  let requirements;
104
 
105
/**
 * Download the TDocs currently listed in the table as a ZIP archive.
 *
 * Collects the unique values of the "TDoc" column, posts them to
 * /download_tdocs and saves the returned blob under a name built from the
 * selected meeting and the active filters.
 */
function downloadTDocs(){
    // Value of a filter <select>, or null when no real filter is selected
    // ("" or the catch-all "Tous"). The previous test used `a != "" | a != "Tous"`,
    // a bitwise OR of two conditions that can never both be false, so it was
    // always true and inactive filters leaked into the request and file name.
    const activeFilter = (id) => {
        const value = document.getElementById(id).value;
        return (value !== "" && value !== "Tous") ? value : null;
    };

    const rows = tableToGenBody({"TDoc": "doc"});
    const docIds = [...new Set(rows.map(item => item.doc))];
    console.log(docIds);

    const meeting = document.getElementById('meetingSelect').value;
    const agendaItem = activeFilter('agendaItem');
    const docStatus = activeFilter('docStatus');
    const docType = activeFilter('docType');

    const body = {"documents": docIds, "meeting": meeting};
    if (agendaItem !== null){
        body['agenda_item'] = agendaItem;
    }

    fetch('/download_tdocs', {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify(body)})
    .then(resp => {
        // Do not save an error payload as if it were a ZIP archive.
        if (!resp.ok){
            throw new Error(`Download failed with HTTP status ${resp.status}`);
        }
        return resp.blob();
    })
    .then(blob => {
        const url = window.URL.createObjectURL(blob);
        const a = document.createElement("a");
        a.href = url;

        let dl_name = `${meeting}`;
        if (agendaItem !== null){dl_name = dl_name + `_${agendaItem}`}
        if (docStatus !== null){dl_name = dl_name + `_${docStatus}`}
        if (docType !== null){dl_name = `${docType}_${dl_name}`}
        // NOTE(review): the "requirements_" prefix is applied while the query
        // form is hidden, which looks inverted - confirm the intended condition.
        // The last path segment of the blob URL is appended to the name.
        if (document.getElementById('queryReqForm').classList.contains('hidden')){dl_name = `requirements_${dl_name}_${url.split('/').pop()}`}
        dl_name = dl_name + ".zip";

        a.download = dl_name;
        document.body.appendChild(a);
        a.click();
        a.remove();
        window.URL.revokeObjectURL(url); // release the blob memory
    })
    .catch(err => {
        console.error(err);
    });
}
132
+
133
  function getDataFrame(){
134
  document.getElementById("loadingBar").classList.remove("hidden");
135
  const wg = document.getElementById('workingGroupSelect').value;
 
155
  fetch("/get_dataframe", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"working_group": wg, "meeting": meeting})})
156
  .then(resp => resp.json())
157
  .then(data => {
158
+ document.getElementById("filters").classList.remove("hidden");
159
  document.getElementById("loadingBar").classList.add("hidden");
160
+ document.getElementById("downloadZip").classList.remove("hidden");
161
+ document.getElementById("getReqs").classList.remove("hidden");
162
  const dataframeBody = dataFrame.querySelector("tbody");
163
  dataframeBody.innerHTML = "";
164
  const setType = new Set();
 
247
  }
248
 
249
  function generateRequirements(){
250
+ const bodyreq = tableToGenBody({"TDoc": "document", "URL": "url"});
251
  document.getElementById("progressText").classList.remove('hidden');
252
  document.getElementById("progressText").innerHTML = "Generating requirements, please wait, it may take a while ...";
253
  document.getElementById("loadingBar").classList.remove("hidden");
 
255
  fetch("/generate_requirements", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"documents": bodyreq})})
256
  .then(resp => resp.json())
257
  .then(data => {
 
 
 
 
 
 
 
258
  document.getElementById("loadingBar").classList.add("hidden");
259
  document.getElementById("progressText").classList.add("hidden");
260
  document.getElementById("reqStatus").classList.remove("hidden");
261
  document.getElementById("getReqs").classList.add("hidden");
262
  document.getElementById("searchReq").classList.remove("hidden");
263
  document.getElementById("categorizeReq").classList.remove("hidden");
264
+ requirements = [];
265
+ data.requirements.forEach(obj => {
266
+ obj.requirements.forEach(req => {
267
+ requirements.push({"document": obj.document, "context": obj.context, "requirement": req})
268
+ })
269
+ })
270
  })
271
  }
272
 
 
277
  const dataFrame = document.getElementById("dataFrameDiv");
278
  const dataFrameHead = dataFrame.querySelector("thead");
279
  const dataFrameBody = dataFrame.querySelector("tbody");
280
+ document.getElementById("buttons").classList.remove("hidden");
281
+ document.getElementById("searchReq").classList.add("hidden");
282
+ document.getElementById("categorizeReq").classList.add("hidden");
283
+ document.getElementById("getReqs").classList.add("hidden");
284
+ document.getElementById("reqStatus").classList.add("hidden");
285
+ document.getElementById("downloadZip").classList.remove("hidden");
286
 
287
  dataFrame.classList.remove("hidden");
288
 
 
306
  })
307
  }
308
 
309
+ function tableToGenBody(columnsMap) {
310
  // columnsMap : { "NomHeaderDansTable": "nom_voulu", ... }
311
+ const dataFrame = document.getElementById("dataFrame");
312
  const headers = Array.from(dataFrame.querySelectorAll('thead th')).map(th => th.innerText.trim());
313
 
314
  // Indices des colonnes à extraire
 
338
  document.getElementById("workingGroupSelect").addEventListener('change', getMeetings)
339
  document.getElementById('getTDocs').addEventListener('click', getDataFrame)
340
  document.getElementById("getReqs").addEventListener("click", generateRequirements);
341
+ document.getElementById("downloadZip").addEventListener('click', downloadTDocs)
342
  document.getElementById("queryReq").addEventListener("click", queryRequirements)
343
  document.getElementById('searchReq').addEventListener('click', ()=>{
344
  document.getElementById('dataFrameForm').classList.add('hidden');
schemas.py CHANGED
@@ -16,10 +16,11 @@ class DataResponse(BaseModel):
16
 
17
  # --------------------------------------
18
 
 
 
 
 
19
  class RequirementsRequest(BaseModel):
20
- class DocInfo(BaseModel):
21
- document: str
22
- url: str
23
  documents: List[DocInfo]
24
 
25
  class DocRequirements(BaseModel):
@@ -41,4 +42,9 @@ class ReqSearchRequest(BaseModel):
41
  requirements: List[SingleRequirement]
42
 
43
  class ReqSearchResponse(BaseModel):
44
- requirements: List[SingleRequirement]
 
 
 
 
 
 
16
 
17
  # --------------------------------------
18
 
19
# One entry of RequirementsRequest.documents (see `documents: List[DocInfo]`).
# The web page builds these entries from the table with the column mapping
# {"TDoc": "document", "URL": "url"}.
+ class DocInfo(BaseModel):
20
# Value taken from the table's "TDoc" column.
+ document: str
21
# Value taken from the table's "URL" column.
+ url: str
22
+
23
  class RequirementsRequest(BaseModel):
 
 
 
24
  documents: List[DocInfo]
25
 
26
  class DocRequirements(BaseModel):
 
42
  requirements: List[SingleRequirement]
43
 
44
  class ReqSearchResponse(BaseModel):
45
+ requirements: List[SingleRequirement]
46
+
47
+ # --------------------------------------
48
+
49
# Request body of the POST /download_tdocs endpoint.
+ class DownloadRequest(BaseModel):
50
# TDoc identifiers; the endpoint sends each one as "doc_id" to the
# document-finder service and names the archive entry "<id>.txt".
+ documents: List[str]