om4r932 committed on
Commit
1392287
·
1 Parent(s): a39fe1d

First version

Browse files
Files changed (4) hide show
  1. Dockerfile +17 -0
  2. app.py +234 -0
  3. index.html +223 -0
  4. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11.3

# LibreOffice supplies the headless docx -> txt conversion used by app.py.
RUN apt-get update && \
    apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user with uid 1000 (Hugging Face Spaces convention).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies first so the layer is cached across code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Port 7860 is the default port exposed by Hugging Face Spaces.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.responses import FileResponse
4
+ import litellm
5
+ import pandas as pd
6
+ from pydantic import BaseModel, Field
7
+ from typing import Any, List, Dict, Optional
8
+ import re
9
+ import subprocess
10
+ import requests
11
+ import os
12
+ from lxml import etree
13
+ import zipfile
14
+ import io
15
+ import warnings
16
+ warnings.filterwarnings("ignore")
17
+ from bs4 import BeautifulSoup
18
+
19
app = FastAPI(title="Requirements Extractor")
# Fully open CORS policy: any origin/method/header may call the API.
app.add_middleware(CORSMiddleware, allow_credentials=True, allow_headers=["*"], allow_methods=["*"], allow_origins=["*"])
21
+
22
class MeetingsRequest(BaseModel):
    """Request body for /get_meetings."""
    working_group: str  # e.g. "SA1" — TSG letters followed by the WG number

class MeetingsResponse(BaseModel):
    """Maps a display label (e.g. "SA1#105 ...") to its FTP folder name."""
    meetings: Dict[str, str]

class DataRequest(BaseModel):
    """Request body for /get_dataframe."""
    working_group: str
    meeting: str  # FTP folder name, as returned by /get_meetings

class DataResponse(BaseModel):
    """Rows of the filtered TDoc spreadsheet, one dict per document."""
    data: List[Dict[Any, Any]]

class DocRequirements(BaseModel):
    """Requirements extracted from one document by the LLM (also used as the LLM response schema)."""
    doc_id: str
    context: str
    requirements: List[str]

class DocInfo(BaseModel):
    """A document identifier plus the URL of its zip archive."""
    document: str
    url: str

class RequirementsRequest(BaseModel):
    """Request body for /generate_requirements."""
    documents: List[DocInfo]

class RequirementsResponse(BaseModel):
    """Aggregated extraction results for all requested documents."""
    requirements: List[DocRequirements]
49
+
50
# XML namespace prefixes used in XPath queries against word/document.xml:
#   w — WordprocessingML main namespace (paragraphs, runs, revision marks)
#   v — legacy VML namespace (embedded drawings)
NSMAP = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'v': 'urn:schemas-microsoft-com:vml'
}
54
+
55
def get_docx_archive(url: str) -> zipfile.ZipFile:
    """Download the zip archive at *url* and return the first .docx/.doc inside it as a ZipFile.

    Raises:
        ValueError: if the URL does not end in "zip" or the archive contains no Word file.
        requests.HTTPError: if the download fails.
    """
    if not url.endswith("zip"):
        raise ValueError("URL doit pointer vers un fichier ZIP")

    # NOTE(review): verify=False disables TLS certificate checks — confirm this
    # is intentional for the 3GPP FTP mirror.
    browser_headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, verify=False, headers=browser_headers)
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as outer:
        for member in outer.namelist():
            if member.endswith((".docx", ".doc")):
                # Re-wrap the inner Word file as its own ZipFile (a .docx is itself a zip).
                return zipfile.ZipFile(io.BytesIO(outer.read(member)))

    raise ValueError("Aucun fichier docx/doc trouvé dans l'archive")
72
+
73
def parse_document_xml(docx_zip: zipfile.ZipFile) -> etree._Element:
    """Parse word/document.xml from the docx archive and return its root element.

    Note: etree.fromstring returns the root _Element, not an _ElementTree.
    """
    xml_bytes = docx_zip.read('word/document.xml')
    # remove_blank_text lets pretty_print emit clean output on re-serialisation.
    parser = etree.XMLParser(remove_blank_text=True)
    return etree.fromstring(xml_bytes, parser=parser)
78
+
79
def clean_document_xml(root: etree._Element) -> None:
    """Strip tracked-changes and comment markup from a document.xml tree, in place.

    - <w:del> elements (deleted text) are removed together with their content.
    - <w:ins> elements (inserted text) are unwrapped: their children are promoted
      to the parent at the wrapper's position, so the inserted text is kept.
    - Comment range/reference anchors are dropped.
    """
    # Remove <w:del> elements and their content.
    for del_elem in root.xpath('//w:del', namespaces=NSMAP):
        parent = del_elem.getparent()
        if parent is not None:
            parent.remove(del_elem)

    # Unwrap <w:ins>: promote children, then drop the wrapper.
    for ins_elem in root.xpath('//w:ins', namespaces=NSMAP):
        parent = ins_elem.getparent()
        if parent is None:
            continue
        index = parent.index(ins_elem)
        # BUGFIX: snapshot the children first. lxml's parent.insert() MOVES a
        # node to its new parent, so iterating ins_elem live while inserting
        # mutates the sibling chain being traversed and skips children.
        for child in list(ins_elem):
            parent.insert(index, child)
            index += 1
        parent.remove(ins_elem)

    # Drop comment anchors (the comment text itself lives in a separate part).
    for tag in ['w:commentRangeStart', 'w:commentRangeEnd', 'w:commentReference']:
        for elem in root.xpath(f'//{tag}', namespaces=NSMAP):
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)
102
+
103
def create_modified_docx(original_zip: zipfile.ZipFile, modified_root: etree._Element) -> bytes:
    """Rebuild the docx with the modified word/document.xml and return the new archive's bytes."""
    buffer = io.BytesIO()

    with zipfile.ZipFile(buffer, 'w', compression=zipfile.ZIP_DEFLATED) as rebuilt:
        # Copy every member except the document body verbatim.
        for info in original_zip.infolist():
            if info.filename == 'word/document.xml':
                continue
            rebuilt.writestr(info, original_zip.read(info.filename))

        # Serialise the modified body and store it under its canonical name.
        serialised = etree.tostring(
            modified_root,
            xml_declaration=True,
            encoding='UTF-8',
            pretty_print=True,
        )
        rebuilt.writestr('word/document.xml', serialised)

    buffer.seek(0)
    return buffer.getvalue()
124
+
125
def docx_to_txt(doc_id: str, url: str) -> List[str]:
    """Download, clean, and convert a Word document to plain-text lines.

    Fetches the zip at *url*, strips tracked changes and comments from the
    contained Word file, converts it with headless LibreOffice, and returns
    the non-empty, whitespace-stripped lines of the resulting .txt.

    NOTE(review): doc_id is interpolated into /tmp paths — assumes it contains
    no path separators; confirm upstream sanitisation.
    """
    docx_zip = get_docx_archive(url)
    root = parse_document_xml(docx_zip)
    clean_document_xml(root)
    modified_bytes = create_modified_docx(docx_zip, root)

    input_path = f"/tmp/{doc_id}_cleaned.docx"
    # LibreOffice names the output after the input file, in --outdir.
    output_path = f"/tmp/{doc_id}_cleaned.txt"
    with open(input_path, "wb") as f:
        f.write(modified_bytes)

    # check=True surfaces conversion failures instead of reading a stale/missing txt.
    subprocess.run([
        "libreoffice",
        "--headless",
        "--convert-to", "txt",
        "--outdir", "/tmp",
        input_path
    ], check=True)

    with open(output_path, "r", encoding="utf-8") as f:
        txt_data = [line.strip() for line in f if line.strip()]

    # Clean up both temp files before returning.
    os.remove(input_path)
    os.remove(output_path)
    return txt_data
150
+
151
+ @app.get("/")
152
+ def render_page():
153
+ return FileResponse("index.html")
154
+
155
+ @app.post("/get_meetings", response_model=MeetingsResponse)
156
+ def get_meetings(req: MeetingsRequest):
157
+ working_group = req.working_group
158
+ tsg = re.sub(r"\d+", "", working_group)
159
+ wg_number = re.search(r"\d", working_group).group(0)
160
+ url = "https://www.3gpp.org/ftp/tsg_" + tsg
161
+ resp = requests.get(url, verify=False)
162
+ soup = BeautifulSoup(resp.text, "html.parser")
163
+ meeting_folders = []
164
+ all_meetings = []
165
+ wg_folders = [item.get_text() for item in soup.select("tr td a")]
166
+ selected_folder = None
167
+ for folder in wg_folders:
168
+ if str(wg_number) in folder:
169
+ selected_folder = folder
170
+ break
171
+
172
+ url += "/" + selected_folder
173
+
174
+ if selected_folder:
175
+ resp = requests.get(url, verify=False)
176
+ soup = BeautifulSoup(resp.text, "html.parser")
177
+ meeting_folders = [item.get_text() for item in soup.select("tr td a") if item.get_text().startswith("TSG")]
178
+ all_meetings = [working_group + "#" + meeting.split("_", 1)[1].replace("_", " ").replace("-", " ") for meeting in meeting_folders]
179
+
180
+ return MeetingsResponse(meetings=dict(zip(all_meetings, meeting_folders)))
181
+
182
+ @app.post("/get_dataframe", response_model=DataResponse)
183
+ def get_change_request_dataframe(req: DataRequest):
184
+ working_group = req.working_group
185
+ tsg = re.sub(r"\d+", "", working_group)
186
+ wg_number = re.search(r"\d", working_group).group(0)
187
+ url = "https://www.3gpp.org/ftp/tsg_" + tsg
188
+ resp = requests.get(url, verify=False)
189
+ soup = BeautifulSoup(resp.text, "html.parser")
190
+ wg_folders = [item.get_text() for item in soup.select("tr td a")]
191
+ selected_folder = None
192
+ for folder in wg_folders:
193
+ if str(wg_number) in folder:
194
+ selected_folder = folder
195
+ break
196
+
197
+ url += "/" + selected_folder + "/" + req.meeting + "/docs"
198
+ resp = requests.get(url, verify=False)
199
+ soup = BeautifulSoup(resp.text, "html.parser")
200
+ files = [item.get_text() for item in soup.select("tr td a") if item.get_text().endswith(".xlsx")]
201
+
202
+ def gen_url(tdoc: str):
203
+ return f"{url}/{tdoc}.zip"
204
+
205
+ df = pd.read_excel(str(url + "/" + files[0]).replace("#", "%23"))
206
+ filtered_df = df[(((df["Type"] == "CR") & ((df["CR category"] == "B") | (df["CR category"] == "C"))) | (df["Type"] == "pCR")) & ~(df["Uploaded"].isna())][["TDoc", "Title", "CR category", "Source", "Type", "Agenda item", "Agenda item description", "TDoc Status"]]
207
+ filtered_df["URL"] = filtered_df["TDoc"].apply(gen_url)
208
+
209
+ df = filtered_df.fillna("")
210
+ return DataResponse(data=df[["TDoc", "Title", "Type", "TDoc Status", "Agenda item description", "URL"]].to_dict(orient="records"))
211
+
212
+ @app.post("/generate_requirements", response_model=RequirementsResponse)
213
+ def gen_reqs(req: RequirementsRequest):
214
+ documents = req.documents
215
+ output = []
216
+ for doc in documents:
217
+ doc_id = doc.document
218
+ url = doc.url
219
+
220
+ full = "\n".join(docx_to_txt(doc_id, url))
221
+
222
+ resp_ai = litellm.completion(
223
+ model="gemini/gemini-2.0-flash",
224
+ api_key="SECRET API HERE",
225
+ messages=[{"role":"user","content": f"Here's the document whose ID is {doc_id} with requirements : {full}\n\nI want you to extract all the requirements and give me a context (not giving the section or whatever, a sentence is needed) where that calls for those requirements. If multiples covered contexts is present, make as many requirements list by context as you want."}],
226
+ response_format=DocRequirements
227
+ )
228
+
229
+ reqs = DocRequirements.model_validate_json(resp_ai.choices[0].message.content)
230
+ output.append(reqs)
231
+
232
+ return RequirementsResponse(requirements=output)
233
+
234
+
index.html ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="fr" data-theme="light">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Requirements Extractor</title>
7
+ <link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/full.css" rel="stylesheet">
8
+ <script src="https://cdn.tailwindcss.com"></script>
9
+ </head>
10
+ <body class="p-8 bg-base-100">
11
+ <div class="container mx-auto">
12
+ <h1 class="text-4xl font-bold text-center mb-8">Requirements Extractor</h1>
13
+ <div>
14
+ <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
15
+ <select class="select select-bordered" id="workingGroupSelect">
16
+ <option disabled selected value="">Working Group</option>
17
+ <option>SA1</option>
18
+ <option>SA2</option>
19
+ <option>SA3</option>
20
+ <option>SA4</option>
21
+ <option>SA5</option>
22
+ <option>SA6</option>
23
+ <option>CT1</option>
24
+ <option>CT2</option>
25
+ <option>CT3</option>
26
+ <option>CT4</option>
27
+ <option>CT5</option>
28
+ <option>CT6</option>
29
+ </select>
30
+ <select class="select select-bordered" id="meetingSelect" disabled>
31
+ <option disabled selected value="">Select a working group</option>
32
+ </select>
33
+ <button class="btn" id="getTDocs">Get TDocs</button>
34
+ </div>
35
+ </div>
36
+ <div class="hidden" id="filters">
37
+ <div class="grid grid-cols-1 md:grid-cols-3 gap-4 mb-6">
38
+ <select class="select select-bordered" id="docType">
39
+ <option disabled selected value="">Type</option>
40
+ <option>Tous</option>
41
+ </select>
42
+
43
+ <select class="select select-bordered" id="docStatus">
44
+ <option disabled selected value="">Status</option>
45
+ <option>Tous</option>
46
+ </select>
47
+
48
+ <select class="select select-bordered" id="agendaItem">
49
+ <option disabled selected value = "">Agenda</option>
50
+ <option>Tous</option>
51
+ </select>
52
+ </div>
53
+ </div>
54
+
55
+
56
+ <!-- Tableau des données -->
57
+ <div class="max-h-[65vh] overflow-y-auto">
58
+ <table class="table table-zebra w-full" id="dataFrame">
59
+ <thead class="sticky top-0 bg-base-200 z-10">
60
+ <tr class="bg-base-200">
61
+ <th>TDoc</th>
62
+ <th>Title</th>
63
+ <th>Type</th>
64
+ <th>Status</th>
65
+ <th>Agenda Item N°</th>
66
+ <th>URL</th>
67
+ </tr>
68
+ </thead>
69
+ <tbody>
70
+ </tbody>
71
+ </table>
72
+ </div>
73
+
74
+ <center><button class="btn mt-6 gap-4" id="getReqs">Get Requirements</button></center>
75
+ </div>
76
+
77
+ <script>
78
// Fetch the TDoc table for the selected WG + meeting and (re)build the filter options.
function getDataFrame(){
    const wg = document.getElementById('workingGroupSelect').value;
    const meeting = document.getElementById('meetingSelect').value;
    // Reset each filter to its own placeholder.
    // BUGFIX: docStatus and agendaItem placeholders previously both said "Type".
    document.getElementById('docType').innerHTML = `
        <option disabled selected value="">Type</option>
        <option>Tous</option>
    `
    document.getElementById('docStatus').innerHTML = `
        <option disabled selected value="">Status</option>
        <option>Tous</option>
    `
    document.getElementById('agendaItem').innerHTML = `
        <option disabled selected value="">Agenda</option>
        <option>Tous</option>
    `
    const dataFrame = document.getElementById("dataFrame");
    document.getElementById("getTDocs").setAttribute('disabled', 'true')
    document.getElementById("getTDocs").innerHTML = "Loading ...";
    fetch("/get_dataframe", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"working_group": wg, "meeting": meeting})})
    .then(resp => resp.json())
    .then(data => {
        document.getElementById("filters").classList.remove("hidden")
        const dataframeBody = dataFrame.querySelector("tbody");
        dataframeBody.innerHTML = "";
        // Collect the distinct values of each filterable column while rendering rows.
        const setType = new Set();
        const setAgenda = new Set();
        const setStatus = new Set();
        data.data.forEach(row => {
            const tr = document.createElement("tr");
            // data-* attributes drive filterTable().
            tr.setAttribute("data-type", row['Type']);
            tr.setAttribute("data-status", row["TDoc Status"]);
            tr.setAttribute("data-agenda", row["Agenda item description"]);
            tr.innerHTML = `
                <td>${row["TDoc"]}</td>
                <td>${row["Title"]}</td>
                <td>${row["Type"]}</td>
                <td>${row["TDoc Status"]}</td>
                <td>${row["Agenda item description"]}</td>
                <td>
                    <a href="${row["URL"]}" class="link">${row["URL"]}</a>
                </td>
            `;
            dataframeBody.appendChild(tr);
            setType.add(row["Type"]);
            setAgenda.add(row["Agenda item description"]);
            setStatus.add(row["TDoc Status"]);
        })

        setType.forEach(tdoctype => {
            const option = document.createElement("option");
            option.textContent = tdoctype;
            option.value = tdoctype;
            document.getElementById('docType').appendChild(option);
        })

        setAgenda.forEach(agenda => {
            const option = document.createElement("option");
            option.textContent = agenda;
            option.value = agenda;
            document.getElementById('agendaItem').appendChild(option);
        })

        setStatus.forEach(status => {
            const option = document.createElement("option");
            option.textContent = status;
            option.value = status;
            document.getElementById('docStatus').appendChild(option);
        })
    })
    // BUGFIX: the button was re-enabled synchronously, while the fetch was
    // still in flight. Re-enable only once the request settles (success or error).
    .finally(() => {
        document.getElementById("getTDocs").removeAttribute("disabled")
        document.getElementById("getTDocs").innerHTML = "Get TDocs";
    })
}
153
+
154
// Hide table rows that fail any of the three active filters.
// An empty value (placeholder) or "Tous" means "no constraint" for that filter.
function filterTable() {
    const selectedType = document.getElementById('docType').value
    const selectedStatus = document.getElementById('docStatus').value
    const selectedAgenda = document.getElementById('agendaItem').value

    const matches = (choice, actual) => choice === 'Tous' || choice === '' || actual === choice

    document.querySelectorAll('#dataFrame tbody tr').forEach(row => {
        const visible = matches(selectedType, row.dataset.type)
            && matches(selectedStatus, row.dataset.status)
            && matches(selectedAgenda, row.dataset.agenda)

        row.style.display = visible ? '' : 'none'
    })
}
168
+
169
// Populate the meeting dropdown for the chosen working group.
function getMeetings(){
    const meetingSelect = document.getElementById("meetingSelect");
    const tdocsButton = document.getElementById("getTDocs");
    const workingGroup = document.getElementById("workingGroupSelect").value;

    // Lock the dependent controls while the list loads.
    meetingSelect.setAttribute('disabled', 'true')
    meetingSelect.innerHTML = "<option>Loading...</option>"
    tdocsButton.setAttribute('disabled', 'true')

    fetch("/get_meetings", {method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({"working_group": workingGroup})})
    .then(resp => resp.json())
    .then(data => {
        meetingSelect.innerHTML = "";
        meetingSelect.removeAttribute("disabled");
        tdocsButton.removeAttribute("disabled")
        // Keys are display labels; values are the raw FTP folder names.
        for(const [label, folder] of Object.entries(data.meetings)){
            const option = document.createElement("option");
            option.textContent = label;
            option.value = folder;
            meetingSelect.appendChild(option);
        }
    })
}
188
+
189
// Convert the VISIBLE rows of a table into request objects for /generate_requirements,
// keeping only the columns named in columnsMap and renaming them to API field names.
function tableToGenBody(tableSelector) {
    // columnsMap: { "header text in the table": "api_field_name", ... }
    const columnsMap = {"TDoc": "doc_id", "URL": "url"};
    const table = document.querySelector(tableSelector);
    const headers = Array.from(table.querySelectorAll('thead th')).map(th => th.innerText.trim());

    // Pair each wanted header with its column index.
    const wanted = [];
    headers.forEach((header, idx) => {
        if (columnsMap[header]) wanted.push([idx, columnsMap[header]]);
    });

    // Rows hidden by filterTable() (display: none) are excluded.
    const visibleRows = Array.from(table.querySelectorAll('tbody tr'))
        .filter(row => getComputedStyle(row).display !== 'none');

    return visibleRows.map(row => {
        const cells = Array.from(row.querySelectorAll('td'));
        const record = {};
        wanted.forEach(([idx, key]) => {
            record[key] = cells[idx].innerText.trim();
        });
        return record;
    });
}
213
+
214
+ // Écouteurs d'événements pour les filtres
215
+
216
+ document.getElementById('docType').addEventListener('change', filterTable)
217
+ document.getElementById('docStatus').addEventListener('change', filterTable)
218
+ document.getElementById('agendaItem').addEventListener('change', filterTable)
219
+ document.getElementById("workingGroupSelect").addEventListener('change', getMeetings)
220
+ document.getElementById('getTDocs').addEventListener('click', getDataFrame)
221
+ </script>
222
+ </body>
223
+ </html>
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ litellm
2
+ fastapi
3
+ uvicorn[standard]
4
+ pandas
5
+ numpy
6
+ pydantic
7
+ requests
8
+ lxml
9
+ openpyxl
10
+ beautifulsoup4