om4r932 committed on
Commit
1392287
·
1 Parent(s): a39fe1d

First version

Browse files
Files changed (4) hide show
  1. Dockerfile +17 -0
  2. app.py +234 -0
  3. index.html +223 -0
  4. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11.3

# LibreOffice supplies the headless docx -> txt conversion used by app.py.
RUN apt-get update && \
    apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user with uid 1000 (Hugging Face Spaces convention).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies first so the layer is cached across code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Port 7860 is the default port exposed by Hugging Face Spaces.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.responses import FileResponse
4
+ import litellm
5
+ import pandas as pd
6
+ from pydantic import BaseModel, Field
7
+ from typing import Any, List, Dict, Optional
8
+ import re
9
+ import subprocess
10
+ import requests
11
+ import os
12
+ from lxml import etree
13
+ import zipfile
14
+ import io
15
+ import warnings
16
+ warnings.filterwarnings("ignore")
17
+ from bs4 import BeautifulSoup
18
+
19
app = FastAPI(title="Requirements Extractor")
# Fully open CORS policy: any origin/method/header may call the API.
app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_methods=["*"], allow_origins=["*"])
21
+
22
class MeetingsRequest(BaseModel):
    """Request body for /get_meetings."""
    working_group: str  # e.g. "SA1" — TSG letters followed by the WG number

class MeetingsResponse(BaseModel):
    """Maps a display label (e.g. "SA1#105 ...") to its FTP folder name."""
    meetings: Dict[str, str]

class DataRequest(BaseModel):
    """Request body for /get_dataframe."""
    working_group: str
    meeting: str  # FTP folder name, as returned by /get_meetings

class DataResponse(BaseModel):
    """Rows of the filtered TDoc spreadsheet, one dict per document."""
    data: List[Dict[Any, Any]]

class DocRequirements(BaseModel):
    """Requirements extracted from one document by the LLM (also used as the LLM response schema)."""
    doc_id: str
    context: str
    requirements: List[str]

class DocInfo(BaseModel):
    """A document identifier plus the URL of its zip archive."""
    document: str
    url: str

class RequirementsRequest(BaseModel):
    """Request body for /generate_requirements."""
    documents: List[DocInfo]

class RequirementsResponse(BaseModel):
    """Aggregated extraction results for all requested documents."""
    requirements: List[DocRequirements]
49
+
50
# XML namespace prefixes used in XPath queries against word/document.xml:
#   w — WordprocessingML main namespace (paragraphs, runs, revision marks)
#   v — legacy VML namespace (embedded drawings)
NSMAP = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'v': 'urn:schemas-microsoft-com:vml'
}
54
+
55
def get_docx_archive(url: str) -> zipfile.ZipFile:
    """Download the zip archive at *url* and return the first .docx/.doc inside it as a ZipFile.

    Raises:
        ValueError: if the URL does not end in "zip" or the archive contains no Word file.
        requests.HTTPError: if the download fails.
    """
    if not url.endswith("zip"):
        raise ValueError("URL doit pointer vers un fichier ZIP")

    # NOTE(review): verify=False disables TLS certificate checks — confirm this
    # is intentional for the 3GPP FTP mirror.
    browser_headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, verify=False, headers=browser_headers)
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as outer:
        for member in outer.namelist():
            if member.endswith((".docx", ".doc")):
                # Re-wrap the inner Word file as its own ZipFile (a .docx is itself a zip).
                return zipfile.ZipFile(io.BytesIO(outer.read(member)))

    raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
72
+
73
def parse_document_xml(docx_zip: zipfile.ZipFile) -> etree._Element:
    """Parse word/document.xml from the docx archive and return its root element.

    Note: etree.fromstring returns the root _Element, not an _ElementTree.
    """
    xml_bytes = docx_zip.read('word/document.xml')
    # remove_blank_text lets pretty_print emit clean output on re-serialisation.
    parser = etree.XMLParser(remove_blank_text=True)
    return etree.fromstring(xml_bytes, parser=parser)
78
+
79
def clean_document_xml(root: etree._Element) -> None:
    """Strip tracked-changes and comment markup from a document.xml tree, in place.

    - <w:del> elements (deleted text) are removed together with their content.
    - <w:ins> elements (inserted text) are unwrapped: their children are promoted
      to the parent at the wrapper's position, so the inserted text is kept.
    - Comment range/reference anchors are dropped.
    """
    # Remove <w:del> elements and their content.
    for del_elem in root.xpath('//w:del', namespaces=NSMAP):
        parent = del_elem.getparent()
        if parent is not None:
            parent.remove(del_elem)

    # Unwrap <w:ins>: promote children, then drop the wrapper.
    for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
        parent = ins_elem.getparent()
        if parent is None:
            continue
        index = parent.index(ins_elem)
        # BUGFIX: snapshot the children first. lxml's parent.insert() MOVES a
        # node to its new parent, so iterating ins_elem live while inserting
        # mutates the sibling chain being traversed and skips children.
        for child in list(ins_elem):
            parent.insert(index, child)
            index += 1
        parent.remove(ins_elem)

    # Drop comment anchors (the comment text itself lives in a separate part).
    for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
        for elem in root.xpath(f'//{tag}', namespaces=NSMAP):
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)
102
+
103
def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> bytes:
    """Rebuild the docx with the modified word/document.xml and return the new archive's bytes."""
    buffer = io.BytesIO()

    with zipfile.ZipFile(buffer, 'w', compression=zipfile.ZIP_DEFLATED) as rebuilt:
        # Copy every member except the document body verbatim.
        for info in original_zip.infolist():
            if info.filename == 'word/document.xml':
                continue
            rebuilt.writestr(info, original_zip.read(info.filename))

        # Serialise the modified body and store it under its canonical name.
        serialised = etree.tostring(
            modified_root,
            xml_declaration=True,
            encoding='UTF-8',
            pretty_print=True,
        )
        rebuilt.writestr('word/document.xml', serialised)

    buffer.seek(0)
    return buffer.getvalue()
124
+
125
def docx_to_txt(doc_id: str, url: str) -> List[str]:
    """Download, clean, and convert a Word document to plain-text lines.

    Fetches the zip at *url*, strips tracked changes and comments from the
    contained Word file, converts it with headless LibreOffice, and returns
    the non-empty, whitespace-stripped lines of the resulting .txt.

    NOTE(review): doc_id is interpolated into /tmp paths — assumes it contains
    no path separators; confirm upstream sanitisation.
    """
    docx_zip = get_docx_archive(url)
    root = parse_document_xml(docx_zip)
    clean_document_xml(root)
    modified_bytes = create_modified_docx(docx_zip, root)

    input_path = f"/tmp/{doc_id}_cleaned.docx"
    # LibreOffice names the output after the input file, in --outdir.
    output_path = f"/tmp/{doc_id}_cleaned.txt"
    with open(input_path, "wb") as f:
        f.write(modified_bytes)

    # check=True surfaces conversion failures instead of reading a stale/missing txt.
    subprocess.run([
        "libreoffice",
        "--headless",
        "--convert-to", "txt",
        "--outdir", "/tmp",
        input_path
    ], check=True)

    with open(output_path, "r", encoding="utf-8") as f:
        txt_data = [line.strip() for line in f if line.strip()]

    # Clean up both temp files before returning.
    os.remove(input_path)
    os.remove(output_path)
    return txt_data
150
+
151
+ @app.get("/")
152
+ def render_page():
153
+ return FileResponse("index.html")
154
+
155
+ @app.post("/get_meetings", response_model=MeetingsResponse)
156
+ def get_meetings(req: MeetingsRequest):
157
+ working_group = req.working_group
158
+ tsg = re.sub(r"\d+", "", working_group)
159
+ wg_number = re.search(r"\d", working_group).group(0)
160
+ url = "https://www.3gpp.org/ftp/tsg_" + tsg
161
+ resp = requests.get(url, verify=False)
162
+ soup = BeautifulSoup(resp.text, "html.parser")
163
+ meeting_folders = []
164
+ all_meetings = []
165
+ wg_folders = [item.get_text() for item in soup.select("tr td a")]
166
+ selected_folder = None
167
+ for folder in wg_folders:
168
+ if str(wg_number) in folder:
169
+ selected_folder = folder
170
+ break
171
+
172
+ url += "/" + selected_folder
173
+
174
+ if selected_folder:
175
+ resp = requests.get(url, verify=False)
176
+ soup = BeautifulSoup(resp.text, "html.parser")
177
+ meeting_folders = [item.get_text() for item in soup.select("tr td a") if item.get_text().startswith("TSG")]
178
+ all_meetings = [working_group + "#" + meeting.split("_", 1)[1].replace("_", " ").replace("-", " ") for meeting in meeting_folders]
179
+
180
+ return MeetingsResponse(meetings=dict(zip(all_meetings, meeting_folders)))
181
+
182
+ @app.post("/get_dataframe", response_model=DataResponse)
183
+ def get_change_request_dataframe(req: DataRequest):
184
+ working_group = req.working_group
185
+ tsg = re.sub(r"\d+", "", working_group)
186
+ wg_number = re.search(r"\d", working_group).group(0)
187
+ url = "https://www.3gpp.org/ftp/tsg_" + tsg
188
+ resp = requests.get(url, verify=False)
189
+ soup = BeautifulSoup(resp.text, "html.parser")
190
+ wg_folders = [item.get_text() for item in soup.select("tr td a")]
191
+ selected_folder = None
192
+ for folder in wg_folders:
193
+ if str(wg_number) in folder:
194
+ selected_folder = folder
195
+ break
196
+
197
+ url += "/" + selected_folder + "/" + req.meeting + "/docs"
198
+ resp = requests.get(url, verify=False)
199
+ soup = BeautifulSoup(resp.text, "html.parser")
200
+ files = [item.get_text() for item in soup.select("tr td a") if item.get_text().endswith(".xlsx")]
201
+
202
+ def gen_url(tdoc: str):
203
+ return f"{url}/{tdoc}.zip"
204
+
205
+ df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
206
+ filtered_df = df[(((df["Type"] == "CR") & ((df["CR category"] == "B") | (df["CR category"] == "C"))) | (df["Type"] == "pCR")) & ~(df["Uploaded"].isna())][["TDoc", "Title", "CR category", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
207
+ filtered_df["URL"] = filtered_df["TDoc"].apply(gen_url)
208
+
209
+ df = filtered_df.fillna("")
210
+ return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
211
+
212
+ @app.post("/generate_requirements", response_model=RequirementsResponse)
213
+ def gen_reqs(req: RequirementsRequest):
214
+ documents = req.documents
215
+ output = []
216
+ for doc in documents:
217
+ doc_id = doc.document
218
+ url = doc.url
219
+
220
+ full = "\n".join(docx_to_txt(doc_id, url))
221
+
222
+ resp_ai = litellm.completion(
223
+ model="gemini/gemini-2.0-flash",
224
+ api_key="SECRET API HERE",
225
+ messages=[{"role":"user","content": f"Here's the document whose ID is {doc_id} with requirements : {full}\n\nI want you to extract all the requirements and give me a context (not giving the section or whatever, a sentence is needed) where that calls for those requirements. If multiples covered contexts is present, make as many requirements list by context as you want."}],
226
+ response_format=DocRequirements
227
+ )
228
+
229
+ reqs = DocRequirements.model_validate_json(resp_ai.choices[0].message.content)
230
+ output.append(reqs)
231
+
232
+ return RequirementsResponse(requirements=output)
233
+
234
+
index.html ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="fr" data-theme="light">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Requirements Extractor</title>
7
+ <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/full.css" rel="stylesheet">
8
+ <script src="https://cdn.tailwindcss.com"></script>
9
+ </head>
10
+ <body class="p-8 bg-base-100">
11
+ <div class="container mx-auto">
12
+ <h1 class="text-4xl font-bold text-center mb-8">Requirements Extractor</h1>
13
+ <div>
14
+ <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
15
+ <select class="select select-bordered" id="workingGroupSelect">
16
+ <option disabled selected value="">Working Group</option>
17
+ <option>SA1</option>
18
+ <option>SA2</option>
19
+ <option>SA3</option>
20
+ <option>SA4</option>
21
+ <option>SA5</option>
22
+ <option>SA6</option>
23
+ <option>CT1</option>
24
+ <option>CT2</option>
25
+ <option>CT3</option>
26
+ <option>CT4</option>
27
+ <option>CT5</option>
28
+ <option>CT6</option>
29
+ </select>
30
+ <select class="select select-bordered" id="meetingSelect" disabled>
31
+ <option disabled selected value="">Select a working group</option>
32
+ </select>
33
+ <button class="btn" id="getTDocs">Get TDocs</button>
34
+ </div>
35
+ </div>
36
+ <div class="hidden" id="filters">
37
+ <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
38
+ <select class="select select-bordered" id="docType">
39
+ <option disabled selected value="">Type</option>
40
+ <option>Tous</option>
41
+ </select>
42
+
43
+ <select class="select select-bordered" id="docStatus">
44
+ <option disabled selected value="">Status</option>
45
+ <option>Tous</option>
46
+ </select>
47
+
48
+ <select class="select select-bordered" id="agendaItem">
49
+ <option disabled selected value = "">Agenda</option>
50
+ <option>Tous</option>
51
+ </select>
52
+ </div>
53
+ </div>
54
+
55
+
56
+ <!-- Tableau des données -->
57
+ <div class="max-h-[65vh] overflow-y-auto">
58
+ <table class="table table-zebra w-full" id="dataFrame">
59
+ <thead class="sticky top-0 bg-base-200 z-10">
60
+ <tr class="bg-base-200">
61
+ <th>TDoc</th>
62
+ <th>Title</th>
63
+ <th>Type</th>
64
+ <th>Status</th>
65
+ <th>Agenda Item N°</th>
66
+ <th>URL</th>
67
+ </tr>
68
+ </thead>
69
+ <tbody>
70
+ </tbody>
71
+ </table>
72
+ </div>
73
+
74
+ <center><button class="btn mt-6 gap-4" id="getReqs">Get Requirements</button></center>
75
+ </div>
76
+
77
+ <script>
78
// Fetch the TDoc table for the selected WG + meeting and (re)build the filter options.
function getDataFrame(){
    const wg = document.getElementById('workingGroupSelect').value;
    const meeting = document.getElementById('meetingSelect').value;
    // Reset each filter to its own placeholder.
    // BUGFIX: docStatus and agendaItem placeholders previously both said "Type".
    document.getElementById('docType').innerHTML = `
        <option disabled selected value="">Type</option>
        <option>Tous</option>
    `
    document.getElementById('docStatus').innerHTML = `
        <option disabled selected value="">Status</option>
        <option>Tous</option>
    `
    document.getElementById('agendaItem').innerHTML = `
        <option disabled selected value="">Agenda</option>
        <option>Tous</option>
    `
    const dataFrame = document.getElementById("dataFrame");
    document.getElementById("getTDocs").setAttribute('disabled', 'true')
    document.getElementById("getTDocs").innerHTML = "Loading ...";
    fetch("/get_dataframe", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"working_group": wg, "meeting": meeting})})
    .then(resp => resp.json())
    .then(data => {
        document.getElementById("filters").classList.remove("hidden")
        const dataframeBody = dataFrame.querySelector("tbody");
        dataframeBody.innerHTML = "";
        // Collect the distinct values of each filterable column while rendering rows.
        const setType = new Set();
        const setAgenda = new Set();
        const setStatus = new Set();
        data.data.forEach(row => {
            const tr = document.createElement("tr");
            // data-* attributes drive filterTable().
            tr.setAttribute("data-type", row['Type']);
            tr.setAttribute("data-status", row["TDoc Status"]);
            tr.setAttribute("data-agenda", row["Agenda item description"]);
            tr.innerHTML = `
                <td>${row["TDoc"]}</td>
                <td>${row["Title"]}</td>
                <td>${row["Type"]}</td>
                <td>${row["TDoc Status"]}</td>
                <td>${row["Agenda item description"]}</td>
                <td>
                    <a href="${row["URL"]}" class="link">${row["URL"]}</a>
                </td>
            `;
            dataframeBody.appendChild(tr);
            setType.add(row["Type"]);
            setAgenda.add(row["Agenda item description"]);
            setStatus.add(row["TDoc Status"]);
        })

        setType.forEach(tdoctype => {
            const option = document.createElement("option");
            option.textContent = tdoctype;
            option.value = tdoctype;
            document.getElementById('docType').appendChild(option);
        })

        setAgenda.forEach(agenda => {
            const option = document.createElement("option");
            option.textContent = agenda;
            option.value = agenda;
            document.getElementById('agendaItem').appendChild(option);
        })

        setStatus.forEach(status => {
            const option = document.createElement("option");
            option.textContent = status;
            option.value = status;
            document.getElementById('docStatus').appendChild(option);
        })
    })
    // BUGFIX: the button was re-enabled synchronously, while the fetch was
    // still in flight. Re-enable only once the request settles (success or error).
    .finally(() => {
        document.getElementById("getTDocs").removeAttribute("disabled")
        document.getElementById("getTDocs").innerHTML = "Get TDocs";
    })
}
153
+
154
// Hide table rows that fail any of the three active filters.
// An empty value (placeholder) or "Tous" means "no constraint" for that filter.
function filterTable() {
    const selectedType = document.getElementById('docType').value
    const selectedStatus = document.getElementById('docStatus').value
    const selectedAgenda = document.getElementById('agendaItem').value

    const matches = (choice, actual) => choice === 'Tous' || choice === '' || actual === choice

    document.querySelectorAll('#dataFrame tbody tr').forEach(row => {
        const visible = matches(selectedType, row.dataset.type)
            && matches(selectedStatus, row.dataset.status)
            && matches(selectedAgenda, row.dataset.agenda)

        row.style.display = visible ? '' : 'none'
    })
}
168
+
169
// Populate the meeting dropdown for the chosen working group.
function getMeetings(){
    const meetingSelect = document.getElementById("meetingSelect");
    const tdocsButton = document.getElementById("getTDocs");
    const workingGroup = document.getElementById("workingGroupSelect").value;

    // Lock the dependent controls while the list loads.
    meetingSelect.setAttribute('disabled', 'true')
    meetingSelect.innerHTML = "<option>Loading...</option>"
    tdocsButton.setAttribute('disabled', 'true')

    fetch("/get_meetings", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"working_group": workingGroup})})
    .then(resp => resp.json())
    .then(data => {
        meetingSelect.innerHTML = "";
        meetingSelect.removeAttribute("disabled");
        tdocsButton.removeAttribute("disabled")
        // Keys are display labels; values are the raw FTP folder names.
        for(const [label, folder] of Object.entries(data.meetings)){
            const option = document.createElement("option");
            option.textContent = label;
            option.value = folder;
            meetingSelect.appendChild(option);
        }
    })
}
188
+
189
// Convert the VISIBLE rows of a table into request objects for /generate_requirements,
// keeping only the columns named in columnsMap and renaming them to API field names.
function tableToGenBody(tableSelector) {
    // columnsMap: { "header text in the table": "api_field_name", ... }
    const columnsMap = {"TDoc": "doc_id", "URL": "url"};
    const table = document.querySelector(tableSelector);
    const headers = Array.from(table.querySelectorAll('thead th')).map(th => th.innerText.trim());

    // Pair each wanted header with its column index.
    const wanted = [];
    headers.forEach((header, idx) => {
        if (columnsMap[header]) wanted.push([idx, columnsMap[header]]);
    });

    // Rows hidden by filterTable() (display: none) are excluded.
    const visibleRows = Array.from(table.querySelectorAll('tbody tr'))
        .filter(row => getComputedStyle(row).display !== 'none');

    return visibleRows.map(row => {
        const cells = Array.from(row.querySelectorAll('td'));
        const record = {};
        wanted.forEach(([idx, key]) => {
            record[key] = cells[idx].innerText.trim();
        });
        return record;
    });
}
213
+
214
+ // Écouteurs d'événements pour les filtres
215
+
216
+ document.getElementById('docType').addEventListener('change', filterTable)
217
+ document.getElementById('docStatus').addEventListener('change', filterTable)
218
+ document.getElementById('agendaItem').addEventListener('change', filterTable)
219
+ document.getElementById("workingGroupSelect").addEventListener('change', getMeetings)
220
+ document.getElementById('getTDocs').addEventListener('click', getDataFrame)
221
+ </script>
222
+ </body>
223
+ </html>
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ litellm
2
+ fastapi
3
+ uvicorn[standard]
4
+ pandas
5
+ numpy
6
+ pydantic
7
+ requests
8
+ lxml
9
+ openpyxl
10
+ beautifulsoup4