om4r932 committed
Commit d00574b
1 Parent(s): f092a99

First version

Files changed (3)
  1. Dockerfile +17 -0
  2. app.py +214 -0
  3. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.11.3
+
+ RUN apt-get update && \
+     apt-get install -y libreoffice libreoffice-writer libreoffice-calc libreoffice-impress && \
+     apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,214 @@
+ import requests, os, zipfile, subprocess, re, warnings
+ warnings.filterwarnings("ignore")
+ os.environ["CURL_CA_BUNDLE"] = ""
+ from io import BytesIO
+ from dotenv import load_dotenv
+ load_dotenv()
+ from datasets import load_dataset
+ import fitz
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+
+ app = FastAPI(title="Specification Retriever/Splitter API",
+               description="API that extracts the text of 3GPP & ETSI specifications, or splits it into chapters & sub-chapters",
+               docs_url="/")
+
+ origins = [
+     "*",
+ ]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ spec_contents_3gpp = load_dataset("OrganizedProgrammers/3GPPSpecContent")
+ spec_contents_3gpp = spec_contents_3gpp["train"].to_list()
+
+ spec_contents_etsi = load_dataset("OrganizedProgrammers/ETSISpecContent")
+ spec_contents_etsi = spec_contents_etsi["train"].to_list()
+
+ spec_3gpp_format = re.compile(r'^\d{2}\.\d{3}(?:-\d+)?')
+ spec_etsi_format = re.compile(r'^\d{1,3} \d{1,3}(?:-\d+)?')
+
+ class SpecRequest(BaseModel):
+     spec_id: str
+
+ def is_doc_indexed(spec_id: str):
+     return any(spec_id == s["doc_id"] for s in spec_contents_3gpp) or any(spec_id == s["doc_id"] for s in spec_contents_etsi)
+
+ def get_doc(spec_id: str):
+     doc = []
+     for spec in spec_contents_3gpp + spec_contents_etsi:
+         if spec["doc_id"] == spec_id:
+             doc.append(f"{spec['section']}\n{spec['content']}")
+     return "\n\n".join(doc)
+
+ def get_structured_doc(spec_id: str):
+     doc = {}
+     for spec in spec_contents_3gpp + spec_contents_etsi:
+         if spec["doc_id"] == spec_id:
+             doc[spec["section"]] = spec["content"]
+     return doc
+
+ def get_pdf_data(request: SpecRequest):
+     specification = request.spec_id
+     if is_doc_indexed(specification):
+         # Already-indexed documents return plain text, not a (pdf, toc) tuple;
+         # both endpoints check is_doc_indexed before calling this helper.
+         return get_doc(specification)
+     url = requests.post(
+         "https://organizedprogrammers-docfinder.hf.space/find/single",
+         verify=False,
+         headers={"Content-Type": "application/json"},
+         json={"doc_id": specification}
+     )
+
+     if url.status_code != 200:
+         raise HTTPException(404, detail="Not found")
+
+     url = url.json()['url']
+     response = requests.get(
+         url,
+         verify=False,
+         headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}
+     )
+
+     pdf = fitz.open(stream=response.content, filetype="pdf")
+     return pdf, pdf.get_toc()
+
+ @app.post("/extract_text/full")
+ def extract_full_spec(request: SpecRequest):
+     specification = request.spec_id
+     if is_doc_indexed(specification):
+         return get_doc(specification)
+     print(f"[WARNING] Document no. {specification} is not indexed or is a TDoc; if it is a specification, try reindexing")
+     total_file = []
+     if spec_3gpp_format.match(specification):
+         url = requests.post(
+             "https://organizedprogrammers-docfinder.hf.space/find/single",
+             verify=False,
+             headers={"Content-Type": "application/json"},
+             json={"doc_id": specification}
+         )
+
+         if url.status_code != 200:
+             raise HTTPException(404, detail="Not found")
+
+         url = url.json()['url']
+         response = requests.get(
+             url,
+             verify=False,
+             headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}
+         )
+
+         zip_bytes = BytesIO(response.content)
+         current_zip_file = zipfile.ZipFile(zip_bytes)
+         # 3GPP archives sometimes wrap the document in a single nested zip
+         for file_info in current_zip_file.infolist():
+             if file_info.filename.endswith(".zip") and len(current_zip_file.namelist()) == 1:
+                 nested_zip_bytes = BytesIO(current_zip_file.read(file_info.filename))
+                 current_zip_file = zipfile.ZipFile(nested_zip_bytes)
+                 break
+
+         for file_info in current_zip_file.infolist():
+             filename = file_info.filename
+             if (filename.endswith('.doc') or filename.endswith('.docx')) and ("cover" not in filename.lower() and "annex" not in filename.lower()):
+                 doc_bytes = current_zip_file.read(filename)
+                 ext = filename.split(".")[-1]
+                 input_path = f"/tmp/{specification}.{ext}"
+                 output_path = f"/tmp/{specification}.txt"
+                 with open(input_path, "wb") as f:
+                     f.write(doc_bytes)
+
+                 # Convert the Word document to plain text with headless LibreOffice
+                 subprocess.run([
+                     "libreoffice",
+                     "--headless",
+                     "--convert-to", "txt",
+                     "--outdir", "/tmp",
+                     input_path
+                 ], check=True)
+
+                 with open(output_path, "r") as f:
+                     txt_data = [line.strip() for line in f if line.strip()]
+
+                 os.remove(input_path)
+                 os.remove(output_path)
+                 total_file.extend(txt_data)
+         if not total_file:
+             raise HTTPException(status_code=404, detail="Not found!")
+         else:
+             return total_file
+     elif spec_etsi_format.match(specification):
+         print("\n[INFO] Attempting to retrieve the text", flush=True)
+         pdf, doc_toc = get_pdf_data(request)
+         text = []
+         first = 0
+         # Skip front matter: start at the first numbered TOC entry
+         for level, title, page in doc_toc:
+             if title[0].isnumeric():
+                 first = page - 1
+                 break
+         for page in pdf[first:]:
+             text.append("\n".join([line.strip() for line in page.get_text().splitlines()]))
+         text = "\n".join(text)
+
+         if not text or not doc_toc:
+             print("\n[ERROR] No text/table of contents found!")
+             return {}
+         print(f"\n[INFO] Text of {request.spec_id} retrieved", flush=True)
+         return text
+     else:
+         raise HTTPException(status_code=400, detail="Invalid document ID format!")
+
+ @app.post("/extract_text/structured")
+ def extract_full_spec_by_chapters(request: SpecRequest):
+     specification = request.spec_id
+     if is_doc_indexed(specification):
+         return get_structured_doc(specification)
+     print(f"[WARNING] Document no. {specification} is not indexed or is a TDoc; if it is a specification, try reindexing")
+     text = extract_full_spec(request)
+     if spec_3gpp_format.match(specification):
+         chapters = []
+         chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")
+
+         for i, line in enumerate(text):
+             if chapter_regex.fullmatch(line):
+                 chapters.append((i, line))
+
+         document = {}
+         for i in range(len(chapters)):
+             start_index, chapter_title = chapters[i]
+             end_index = chapters[i+1][0] if i+1 < len(chapters) else len(text)
+             content_lines = text[start_index + 1 : end_index]
+             document[chapter_title.replace('\t', " ")] = "\n".join(content_lines)
+         return document
+     elif spec_etsi_format.match(specification):
+         def extract_sections(text, titles):
+             sections = {}
+             # Sort the titles by their position in the text
+             sorted_titles = sorted(titles, key=lambda t: text.find(t))
+             for i, title in enumerate(sorted_titles):
+                 start = text.find(title)
+                 end = text.find(sorted_titles[i + 1]) if i + 1 < len(sorted_titles) else len(text)
+                 sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip())
+             return sections
+         pdf, toc = get_pdf_data(request)
+         if not text or not toc:
+             print("\n[ERROR] No text/table of contents found!")
+             return {}
+         print(f"\n[INFO] Text of {request.spec_id} retrieved", flush=True)
+         titles = []
+         for level, title, page in toc:
+             if title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
+                 titles.append('\n'.join(title.strip().split(" ", 1)))
+
+         return extract_sections(text, titles)
+     else:
+         raise HTTPException(status_code=400, detail="Invalid document ID format!")
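
For reference, a sketch of how a client could call the two endpoints above; the host assumes a local run of the container, and the spec ID "23.501" is only an illustrative identifier matching the 3GPP format regex:

import requests

BASE_URL = "http://localhost:7860"  # assumed local deployment

# /extract_text/full returns a list of lines (3GPP path) or one string (ETSI path)
full = requests.post(BASE_URL + "/extract_text/full", json={"spec_id": "23.501"})
print(full.status_code)

# /extract_text/structured returns a {"section title": "content", ...} mapping
structured = requests.post(BASE_URL + "/extract_text/structured", json={"spec_id": "23.501"})
if structured.ok:
    print(list(structured.json())[:5])  # first few section titles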
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ fastapi
+ uvicorn[standard]
+ requests
+ pydantic
+ lxml
+ huggingface_hub
+ datasets
+ python-dotenv
+ PyMuPDF