Spaces:
Sleeping
Sleeping
First version
Browse files- Dockerfile +17 -0
- app.py +214 -0
- requirements.txt +9 -0
Dockerfile
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.11.3

# LibreOffice is required by app.py to convert 3GPP .doc/.docx files to text
# (headless `libreoffice --convert-to txt` subprocess calls).
RUN apt-get update && \
    apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user (uid 1000).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies before copying the source so the pip layer is cached
# across code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# Serve the FastAPI app defined in app.py on port 7860.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests, os, zipfile, subprocess, re, warnings
|
2 |
+
warnings.filterwarnings("ignore")
|
3 |
+
os.environ["CURL_CA_BUNDLE"] = ""
|
4 |
+
from io import BytesIO
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
load_dotenv()
|
7 |
+
from datasets import load_dataset
|
8 |
+
import fitz
|
9 |
+
from fastapi import FastAPI, HTTPException
|
10 |
+
from fastapi.middleware.cors import CORSMiddleware
|
11 |
+
from pydantic import BaseModel
|
12 |
+
|
13 |
+
# FastAPI application; the interactive docs are served at the root URL ("/").
app = FastAPI(title="Specification Retriever/Splitter API",
              description="API that enable to extract text or split text by their chapters & sub-chapters of 3GPP & ETSI specifications",
              docs_url="/")

# CORS: allow requests from any origin.
origins = [
    "*",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Pre-indexed specification contents, loaded once at import time from the
# Hugging Face Hub and held in memory as lists of row dicts; each row carries
# "doc_id", "section" and "content" keys (see get_doc / get_structured_doc).
spec_contents_3gpp = load_dataset("OrganizedProgrammers/3GPPSpecContent")
spec_contents_3gpp = spec_contents_3gpp["train"].to_list()

spec_contents_etsi = load_dataset("OrganizedProgrammers/ETSISpecContent")
spec_contents_etsi = spec_contents_etsi["train"].to_list()
34 |
+
|
# 3GPP spec IDs look like "23.501" or "23.501-17" (series.number[-part]).
spec_3gpp_format = re.compile(r'^\d{2}\.\d{3}(?:-\d+)?')
# ETSI spec IDs look like "103 666" or "103 666-2".
# FIX: the original used \d{,3} (i.e. {0,3}), which also matched strings with
# zero digits in a group (e.g. a leading space); require at least one digit.
spec_etsi_format = re.compile(r'^\d{1,3} \d{1,3}(?:-\d+)?')
37 |
+
|
38 |
+
class SpecRequest(BaseModel):
    """Request body for all endpoints: the specification to retrieve."""

    # Specification identifier, e.g. "23.501" (3GPP) or "103 666" (ETSI).
    spec_id: str
|
40 |
+
|
41 |
+
def is_doc_indexed(spec_id: str) -> bool:
    """Return True if *spec_id* is present in either pre-indexed dataset.

    Checks the "doc_id" field of the in-memory 3GPP and ETSI content rows.
    """
    # Generator expressions short-circuit on the first hit instead of
    # materializing full lists of booleans as the original did.
    return (any(spec_id == s["doc_id"] for s in spec_contents_3gpp)
            or any(spec_id == s["doc_id"] for s in spec_contents_etsi))
|
43 |
+
|
44 |
+
def get_doc(spec_id: str):
    """Return the full indexed text of *spec_id* as one string.

    Every matching row contributes a "section\\ncontent" chunk; chunks are
    joined with blank lines, in dataset order (3GPP rows first, then ETSI).
    """
    chunks = [
        f"{row['section']}\n{row['content']}"
        for row in spec_contents_3gpp + spec_contents_etsi
        if row["doc_id"] == spec_id
    ]
    return "\n\n".join(chunks)
|
50 |
+
|
51 |
+
def get_structured_doc(spec_id: str):
    """Return {section title: content} for every indexed row of *spec_id*.

    If the same section appears more than once, the later row wins (same
    overwrite semantics as assigning keys in a loop).
    """
    return {
        row["section"]: row["content"]
        for row in spec_contents_3gpp + spec_contents_etsi
        if row["doc_id"] == spec_id
    }
|
57 |
+
|
58 |
+
def get_pdf_data(request: SpecRequest):
    """Resolve and download the PDF of a specification.

    Returns:
        The indexed plain text (str) when the document is already indexed;
        otherwise a ``(fitz.Document, table_of_contents)`` tuple.
        NOTE(review): the two return shapes differ — current callers only
        reach the tuple path (they check is_doc_indexed first), but this
        asymmetry is a trap for new callers.

    Raises:
        HTTPException(404): the doc-finder service cannot resolve the ID,
            or the PDF itself cannot be downloaded.
    """
    specification = request.spec_id
    if is_doc_indexed(specification):
        return get_doc(specification)

    # Ask the doc-finder service for the document's download URL.
    # SECURITY NOTE(review): verify=False disables TLS certificate checks;
    # kept for parity with the deployment environment, but worth revisiting.
    finder_response = requests.post(
        "https://organizedprogrammers-docfinder.hf.space/find/single",
        verify=False,
        headers={"Content-Type": "application/json"},
        json={"doc_id": specification},
        timeout=30,  # requests has no default timeout; avoid hanging forever
    )

    if finder_response.status_code != 200:
        raise HTTPException(404, detail="Not found")

    url = finder_response.json()['url']
    response = requests.get(
        url,
        verify=False,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"},
        timeout=60,
    )
    # Fail loudly on a bad download instead of letting fitz.open choke on
    # an HTML error page.
    if response.status_code != 200:
        raise HTTPException(404, detail="Not found")

    pdf = fitz.open(stream=response.content, filetype="pdf")
    return pdf, pdf.get_toc()
|
82 |
+
|
83 |
+
@app.post("/extract_text/full")
def extract_full_spec(request: SpecRequest):
    """Return the full text of a 3GPP or ETSI specification.

    Indexed documents are served straight from the in-memory datasets.
    Otherwise the document is fetched via the doc-finder service and
    converted: 3GPP (zipped .doc/.docx) through headless LibreOffice,
    ETSI (PDF) through PyMuPDF.

    Returns:
        str for indexed docs and ETSI PDFs; list[str] of non-empty lines for
        freshly converted 3GPP docs; {} when an ETSI PDF has no text or TOC.
        NOTE(review): the return shape varies by path — confirm callers cope.

    Raises:
        HTTPException(404): document not found or yields no text.
        HTTPException(400): ID matches neither the 3GPP nor the ETSI format.
    """
    specification = request.spec_id
    if is_doc_indexed(specification):
        return get_doc(specification)
    print(f"[WARNING] Document no. {specification} not indexed or is a TDoc, if it's a specification, try to reindex")
    total_file = []
    if spec_3gpp_format.match(specification):
        # Resolve the spec's archive URL via the doc-finder service.
        # NOTE(review): verify=False disables TLS certificate verification.
        url = requests.post(
            "https://organizedprogrammers-docfinder.hf.space/find/single",
            verify=False,
            headers={"Content-Type": "application/json"},
            json={"doc_id": specification}
        )

        if url.status_code != 200:
            raise HTTPException(404, detail="Not found")

        url = url.json()['url']
        response = requests.get(
            url,
            verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}

        )

        # Some archives wrap the real content in a single nested zip;
        # unwrap one level if that is the only member.
        zip_bytes = BytesIO(response.content)
        current_zip_file = zipfile.ZipFile(zip_bytes)
        for file_info in current_zip_file.infolist():
            if file_info.filename.endswith(".zip") and len(current_zip_file.namelist()) == 1:
                nested_zip_bytes = BytesIO(current_zip_file.read(file_info.filename))
                current_zip_file = zipfile.ZipFile(nested_zip_bytes)
                break

        # Convert each Word document (skipping cover pages and annexes)
        # to plain text with a headless LibreOffice run, then collect the
        # stripped non-empty lines.
        for file_info in current_zip_file.infolist():
            filename = file_info.filename
            if (filename.endswith('.doc') or filename.endswith('.docx')) and ("cover" not in filename.lower() and "annex" not in filename.lower()):
                doc_bytes = current_zip_file.read(filename)
                ext = filename.split(".")[-1]
                input_path = f"/tmp/{specification}.{ext}"
                output_path = f"/tmp/{specification}.txt"
                with open(input_path, "wb") as f:
                    f.write(doc_bytes)

                subprocess.run([
                    "libreoffice",
                    "--headless",
                    "--convert-to", "txt",
                    "--outdir", "/tmp",
                    input_path
                ], check=True)

                with open(output_path, "r") as f:
                    txt_data = [line.strip() for line in f if line.strip()]

                os.remove(input_path)
                os.remove(output_path)
                total_file.extend(txt_data)
        if total_file == []:
            raise HTTPException(status_code=404, detail="Not found !")
        else:
            return total_file
    elif spec_etsi_format.match(specification):
        print("\n[INFO] Tentative de récupération du texte", flush=True)
        # Not indexed at this point, so get_pdf_data returns the (pdf, toc)
        # tuple, never the indexed-text string.
        pdf, doc_toc = get_pdf_data(request)
        text = []
        first = 0
        # Skip front matter: start at the first TOC entry whose title begins
        # with a digit (the first numbered chapter).
        for level, title, page in doc_toc:
            if title[0].isnumeric():
                first = page - 1
                break
        for page in pdf[first:]:
            text.append("\n".join([line.strip() for line in page.get_text().splitlines()]))
        text = "\n".join(text)

        if not text or not doc_toc:
            print("\n[ERREUR] Pas de texte/table of contents trouvé !")
            return {}
        print(f"\n[INFO] Texte {request.spec_id} récupéré", flush=True)
        return text
    else:
        raise HTTPException(status_code=400, detail="Document ID format invalid !")
|
165 |
+
|
166 |
+
@app.post("/extract_text/structured")
def extract_full_spec_by_chapters(request: SpecRequest):
    """Return a specification's text split by chapters and sub-chapters.

    Returns:
        dict mapping "section-number section-title" to that section's text,
        either straight from the indexed datasets or by segmenting the text
        produced by extract_full_spec; {} when an ETSI PDF has no text/TOC.

    Raises:
        HTTPException(400): ID matches neither the 3GPP nor the ETSI format.
        (404 may propagate from extract_full_spec / get_pdf_data.)
    """
    specification = request.spec_id
    if is_doc_indexed(request.spec_id):
        return get_structured_doc(request.spec_id)
    print(f"[WARNING] Document no. {specification} not indexed or is a TDoc, if it's a specification, try to reindex")
    # NOTE(review): total_file is never used in this function.
    total_file = []
    # For 3GPP this is a list[str] of lines; for ETSI a single str.
    text = extract_full_spec(request)
    if spec_3gpp_format.match(specification):
        chapters = []
        # Heading lines look like "6.1.2<TAB>Title": number, tab, title
        # starting with an uppercase letter or digit.
        chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")

        for i, line in enumerate(text):
            if chapter_regex.fullmatch(line):
                chapters.append((i, line))

        # Each chapter's content runs from the line after its heading up to
        # the next heading (or the end of the document).
        document = {}
        for i in range(len(chapters)):
            start_index, chapter_title = chapters[i]
            end_index = chapters[i+1][0] if i+1 < len(chapters) else len(text)
            content_lines = text[start_index + 1 : end_index]
            document[chapter_title.replace('\t', " ")] = "\n".join(content_lines)
        return document
    elif spec_etsi_format.match(specification):
        def extract_sections(text, titles):
            # Split *text* into {normalized title: content} in document order.
            sections = {}
            # Sort the titles by their position in the text.
            sorted_titles = sorted(titles, key=lambda t: text.find(t))
            for i, title in enumerate(sorted_titles):
                start = text.find(title)
                if i + 1 < len(sorted_titles):
                    end = text.find(sorted_titles[i + 1])
                    sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip().rstrip())
                else:
                    sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:].replace(title, "").strip().rstrip())
            return sections
        pdf, toc = get_pdf_data(request)
        if not text or not toc:
            print("\n[ERREUR] Pas de texte/table of contents trouvé !")
            return {}
        print(f"\n[INFO] Texte {request.spec_id} récupéré", flush=True)
        titles = []
        # A TOC entry "6.1 Overview" appears in the extracted text as
        # "6.1\nOverview"; keep only numbered titles actually present there.
        for level, title, page in toc:
            if title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
                titles.append('\n'.join(title.strip().split(" ", 1)))

        return extract_sections(text, titles)
    else:
        raise HTTPException(status_code=400, detail="Document ID format invalid !")
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn[standard]
|
3 |
+
requests
|
4 |
+
pydantic
|
5 |
+
lxml
|
6 |
+
huggingface_hub
|
7 |
+
datasets
|
8 |
+
python-dotenv
|
9 |
+
PyMuPDF
|