om4r932 committed on
Commit
405abe1
·
1 Parent(s): a6af380

Add documentation + fix bugs

Browse files
Files changed (4) hide show
  1. app.py +92 -10
  2. classes.py +9 -0
  3. documentation.md +48 -0
  4. schemas.py +152 -25
app.py CHANGED
@@ -58,7 +58,7 @@ def get_tdoc_url(doc_id):
58
  for tdoc in tdoc_locations:
59
  if tdoc["doc_id"] == doc_id:
60
  return tdoc["url"]
61
- return "Document not indexed (Re-index TDocs)"
62
 
63
  def get_spec_url(document):
64
  series = document.split(".")[0].zfill(2)
@@ -74,7 +74,33 @@ def get_document(spec_id: str, spec_title: str, source: str):
74
  text.extend([section['section'], section['content']])
75
  return text
76
 
77
- app = FastAPI(title="Document Finder Back-End", docs_url="/", description="Backend for DocFinder - Searching technical documents & specifications from 3GPP & ETSI")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  app.add_middleware(
79
  CORSMiddleware,
80
  allow_origins=["*"],
@@ -92,13 +118,35 @@ valid_3gpp_spec_format = re.compile(r'^\d{2}\.\d{3}(?:-\d+)?')
92
  valid_etsi_doc_format = re.compile(r'^(?:SET|SCP|SETTEC|SETREQ|SCPTEC|SCPREQ)\(\d+\)\d+(?:r\d+)?', flags=re.IGNORECASE)
93
  valid_etsi_spec_format = re.compile(r'^\d{3} \d{3}(?:-\d+)?')
94
 
95
- @app.post("/find", response_model=DocResponse)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def find_document(request: DocRequest):
97
  start_time = time.time()
98
  document = request.doc_id
99
- source = request.source
100
- spec_metadatas = spec_metadatas_3gpp if source == "3GPP" else spec_metadatas_etsi if source == "ETSI" else spec_metadatas_3gpp + spec_metadatas_etsi
101
- is_3gpp = valid_3gpp_doc_format.match(document) or valid_3gpp_spec_format.match(document)
102
 
103
  url = get_tdoc_url(document) if valid_3gpp_doc_format.match(document) else \
104
  get_spec_url(document) if valid_3gpp_spec_format.match(document) else \
@@ -108,9 +156,10 @@ def find_document(request: DocRequest):
108
  raise HTTPException(status_code=404, detail=url)
109
 
110
  version = None
111
- if is_3gpp:
112
  version = url.split("/")[-1].replace(".zip", "").split("-")[-1]
113
  scope = None
 
114
  for spec in spec_metadatas:
115
  if spec['id'] == document:
116
  scope = spec['scope']
@@ -124,7 +173,23 @@ def find_document(request: DocRequest):
124
  scope=scope
125
  )
126
 
127
- @app.post("/batch", response_model=BatchDocResponse)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  def find_document_batch(request: BatchDocRequest):
129
  start_time = time.time()
130
  documents = request.doc_ids
@@ -148,7 +213,17 @@ def find_document_batch(request: BatchDocRequest):
148
  search_time=time.time()-start_time
149
  )
150
 
151
- @app.post('/search-spec', response_model=KeywordResponse)
 
 
 
 
 
 
 
 
 
 
152
  def search_specifications(request: KeywordRequest):
153
  start_time = time.time()
154
  boolSensitiveCase = request.case_sensitive
@@ -215,7 +290,14 @@ def search_specifications(request: KeywordRequest):
215
  else:
216
  raise HTTPException(status_code=404, detail="Specifications not found")
217
 
218
- @app.post("/search-spec/experimental", response_model=KeywordResponse)
 
 
 
 
 
 
 
219
  def bm25_search_specification(request: BM25KeywordRequest):
220
  start_time = time.time()
221
  source = request.source
 
58
  for tdoc in tdoc_locations:
59
  if tdoc["doc_id"] == doc_id:
60
  return tdoc["url"]
61
+ return "Document not indexed (re-indexing documents ?)"
62
 
63
  def get_spec_url(document):
64
  series = document.split(".")[0].zfill(2)
 
74
  text.extend([section['section'], section['content']])
75
  return text
76
 
77
+ tags_metadata = [
78
+ {
79
+ "name": "Document Retrieval",
80
+ "description": """
81
+ Direct document lookup operations for retrieving specific documents by their unique identifiers.
82
+
83
+ These endpoints provide fast access to document URLs, versions, and metadata without requiring keyword searches.
84
+ Perfect for when you know the exact document ID you're looking for.
85
+ """,
86
+ },
87
+ {
88
+ "name": "Content Search",
89
+ "description": """
90
+ Advanced search operations for finding documents based on keywords and content matching.
91
+
92
+ Includes both quick metadata-based searches and deep content analysis with flexible filtering options.
93
+ Supports different search modes and logical operators for precise results.
94
+ """,
95
+ },
96
+ ]
97
+
98
# Load the long-form API description shown on the generated docs page.
# The file is opened in a context manager so the handle is closed
# deterministically, and decoded explicitly as UTF-8 because the Markdown
# contains emoji and other non-ASCII characters that a locale-default
# codec (e.g. cp1252) may fail to decode.
with open('documentation.md', encoding='utf-8') as _documentation_file:
    _api_description = _documentation_file.read()

# FastAPI application instance; the endpoint decorators in this module
# register their routes on it, and `tags_metadata` supplies the tag
# descriptions passed through `openapi_tags`.
app = FastAPI(
    title="3GPP & ETSI Document Finder API",
    description=_api_description,
    openapi_tags=tags_metadata
)
103
+
104
  app.add_middleware(
105
  CORSMiddleware,
106
  allow_origins=["*"],
 
118
  valid_etsi_doc_format = re.compile(r'^(?:SET|SCP|SETTEC|SETREQ|SCPTEC|SCPREQ)\(\d+\)\d+(?:r\d+)?', flags=re.IGNORECASE)
119
  valid_etsi_spec_format = re.compile(r'^\d{3} \d{3}(?:-\d+)?')
120
 
121
+ @app.post("/find/single", response_model=DocResponse, tags=["Document Retrieval"], summary="Retrieve a single document by ID", responses={
122
+ 200: {
123
+ "description": "Document found successfully",
124
+ "content": {
125
+ "application/json": {
126
+ "example": {
127
+ "doc_id": "23.401",
128
+ "url": "https://www.3gpp.org/ftp/Specs/archive/23_series/23.401/23401-h20.zip",
129
+ "version": "h20",
130
+ "scope": "General Packet Radio Service (GPRS) enhancements for Evolved Universal Terrestrial Radio Access Network (E-UTRAN) access",
131
+ "search_time": 0.0234
132
+ }
133
+ }
134
+ }
135
+ },
136
+ 404: {
137
+ "description": "Document not found or not indexed",
138
+ "content": {
139
+ "application/json": {
140
+ "example": {
141
+ "detail": "Specification 99.999 not found"
142
+ }
143
+ }
144
+ }
145
+ }
146
+ })
147
  def find_document(request: DocRequest):
148
  start_time = time.time()
149
  document = request.doc_id
 
 
 
150
 
151
  url = get_tdoc_url(document) if valid_3gpp_doc_format.match(document) else \
152
  get_spec_url(document) if valid_3gpp_spec_format.match(document) else \
 
156
  raise HTTPException(status_code=404, detail=url)
157
 
158
  version = None
159
+ if valid_3gpp_spec_format.match(document):
160
  version = url.split("/")[-1].replace(".zip", "").split("-")[-1]
161
  scope = None
162
+ spec_metadatas = spec_metadatas_3gpp if valid_3gpp_spec_format.match(document) else spec_metadatas_etsi
163
  for spec in spec_metadatas:
164
  if spec['id'] == document:
165
  scope = spec['scope']
 
173
  scope=scope
174
  )
175
 
176
+ @app.post("/find/batch", response_model=BatchDocResponse, summary="Retrieve multiple documents by IDs", tags=["Document Retrieval"], responses={
177
+ 200: {
178
+ "description": "Batch processing completed",
179
+ "content": {
180
+ "application/json": {
181
+ "example": {
182
+ "results": {
183
+ "23.401": "https://www.3gpp.org/ftp/Specs/archive/23_series/23.401/23401-h20.zip",
184
+ "S1-123456": "https://www.3gpp.org/ftp/tsg_sa/WG1_Serv/TSGSI_123/Docs/S1-123456.zip"
185
+ },
186
+ "missing": ["99.999", "INVALID-DOC"],
187
+ "search_time": 0.156
188
+ }
189
+ }
190
+ }
191
+ }
192
+ })
193
  def find_document_batch(request: BatchDocRequest):
194
  start_time = time.time()
195
  documents = request.doc_ids
 
213
  search_time=time.time()-start_time
214
  )
215
 
216
+ @app.post('/search', response_model=KeywordResponse, tags=["Content Search"], summary="Search specifications by keywords", responses={
217
+ 200: {
218
+ "description": "Search completed successfully"
219
+ },
220
+ 400: {
221
+ "description": "You must enter keywords in deep search mode"
222
+ },
223
+ 404: {
224
+ "description": "No specifications found matching the criteria"
225
+ }
226
+ })
227
  def search_specifications(request: KeywordRequest):
228
  start_time = time.time()
229
  boolSensitiveCase = request.case_sensitive
 
290
  else:
291
  raise HTTPException(status_code=404, detail="Specifications not found")
292
 
293
+ @app.post("/search/bm25", response_model=KeywordResponse, tags=["Content Search"], summary="Advanced BM25 search with relevance scoring", responses={
294
+ 200: {
295
+ "description": "BM25 search completed successfully"
296
+ },
297
+ 404: {
298
+ "description": "No specifications found above the relevance threshold"
299
+ }
300
+ })
301
  def bm25_search_specification(request: BM25KeywordRequest):
302
  start_time = time.time()
303
  source = request.source
classes.py CHANGED
@@ -59,6 +59,7 @@ class ETSIDocFinder:
59
  class ETSISpecFinder:
60
  def __init__(self):
61
  self.main_url = "https://www.etsi.org/deliver/etsi_ts"
 
62
  self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}
63
 
64
  def get_spec_path(self, doc_id: str):
@@ -89,12 +90,20 @@ class ETSISpecFinder:
89
  original = doc_id
90
 
91
  url = f"{self.main_url}/{self.get_spec_path(original)}/"
 
92
  print(url)
 
93
 
94
  releases = self.get_docs_from_url(url)
95
  files = self.get_docs_from_url(url + releases[-1])
96
  for f in files:
97
  if f.endswith(".pdf"):
98
  return url + releases[-1] + "/" + f
 
 
 
 
 
 
99
 
100
  return f"Specification {doc_id} not found"
 
59
  class ETSISpecFinder:
60
  def __init__(self):
61
  self.main_url = "https://www.etsi.org/deliver/etsi_ts"
62
+ self.second_url = "https://www.etsi.org/deliver/etsi_tr"
63
  self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}
64
 
65
  def get_spec_path(self, doc_id: str):
 
90
  original = doc_id
91
 
92
  url = f"{self.main_url}/{self.get_spec_path(original)}/"
93
+ url2 = f"{self.second_url}/{self.get_spec_path(original)}/"
94
  print(url)
95
+ print(url2)
96
 
97
  releases = self.get_docs_from_url(url)
98
  files = self.get_docs_from_url(url + releases[-1])
99
  for f in files:
100
  if f.endswith(".pdf"):
101
  return url + releases[-1] + "/" + f
102
+
103
+ releases = self.get_docs_from_url(url2)
104
+ files = self.get_docs_from_url(url + releases[-1])
105
+ for f in files:
106
+ if f.endswith('.pdf'):
107
+ return url + releases[-1] + "/" + f
108
 
109
  return f"Specification {doc_id} not found"
documentation.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 📋 Document Finder Backend API
2
+
3
+ A comprehensive REST API for searching and retrieving technical documents and specifications from the **3GPP** and **ETSI** organizations.
4
+
5
+ ### 🚀 Key Features
6
+
7
+ * **Document Retrieval**: Get direct download URLs and metadata for specific documents
8
+ * **Batch Processing**: Handle multiple document requests simultaneously
9
+ * **Advanced Search**: Multiple search modes with keyword matching
10
+ * **BM25 Scoring**: State-of-the-art relevance ranking using BM25 algorithm
11
+ * **Cross-Organization**: Search across both 3GPP and ETSI document repositories
12
+
13
+ ### 📚 Supported Document Types
14
+
15
+ #### 3GPP Documents
16
+ * **TDocs (Technical Documents)**:
17
+ - Format: `S1-123456`, `C4-234567`, `R2-345678`
18
+ - Working group documents from SA, CT, RAN groups
19
+ * **Technical Specifications**:
20
+ - Format: `23.401`, `38.331-16`
21
+ - Official published specifications
22
+
23
+ #### ETSI Documents
24
+ * **TDocs (Technical Documents)**:
25
+ - Format: `SET(25)000001`, `SCPTEC(19)000011`
26
+ - Committee working documents
27
+ * **Technical Specifications**:
28
+ - Format: `131 102`, `188 008-2`
29
+ - Published ETSI standards
30
+
31
+ ### 🔍 Search Capabilities
32
+
33
+ * **Quick Search**: Lightning-fast metadata-only search
34
+ * **Deep Search**: Comprehensive content-based search within document sections
35
+ * **BM25 Search**: Advanced relevance scoring with normalization
36
+ * **Flexible Filtering**: By source organization, document type, and specification category
37
+
38
+ ### 🛡️ Data Sources
39
+
40
+ This API indexes and searches through:
41
+ - 3GPP specification metadata and content
42
+ - ETSI specification metadata and content
43
+ - 3GPP TDoc location mappings
44
+ - Pre-built BM25 search indices
45
+
46
+ ### 🔧 Technical Stack
47
+
48
+ Built with FastAPI, featuring automatic OpenAPI documentation, request validation, and comprehensive error handling.
schemas.py CHANGED
@@ -1,38 +1,165 @@
1
- from pydantic import BaseModel
2
  from typing import *
3
 
4
  class DocRequest(BaseModel):
5
- doc_id: str
 
 
 
 
 
 
 
 
 
6
 
7
  class DocResponse(BaseModel):
8
- doc_id: str
9
- url: str
10
- version: Optional[str] = None
11
- scope: Optional[str] = None
12
- search_time: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  class BatchDocRequest(BaseModel):
15
- doc_ids: List[str]
 
 
 
 
 
 
 
 
 
16
 
17
  class BatchDocResponse(BaseModel):
18
- results: Dict[str, str]
19
- missing: List[str]
20
- search_time: float
21
-
22
- class BM25KeywordRequest(BaseModel):
23
- keywords: Optional[str] = ""
24
- source: Optional[Literal["3GPP", "ETSI", "all"]] = "all"
25
- threshold: Optional[int] = 60
26
- spec_type: Optional[Literal["TS", "TR"]] = None
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  class KeywordRequest(BaseModel):
29
- keywords: Optional[str] = ""
30
- search_mode: Literal["quick", "deep"]
31
- case_sensitive: Optional[bool] = False
32
- source: Optional[Literal["3GPP", "ETSI", "all"]] = "all"
33
- spec_type: Optional[Literal["TS", "TR"]] = None
34
- mode: Optional[Literal["and", "or"]] = "and"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  class KeywordResponse(BaseModel):
37
- results: List[Dict[str, Any]]
38
- search_time: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
  from typing import *
3
 
4
class DocRequest(BaseModel):
    """
    Request model for single document retrieval.

    Used to specify which document or specification to retrieve by its unique identifier.
    """
    # Required field: `default=...` (Ellipsis) declares that no default exists.
    doc_id: str = Field(
        default=...,
        title="Document Identifier",
        description="Unique identifier for the document or specification.",
    )
15
 
16
class DocResponse(BaseModel):
    """
    Response model for single document retrieval.

    Contains all available metadata and access information for the requested document.
    """
    # --- Required fields (Ellipsis default means "must be supplied") ---
    doc_id: str = Field(
        default=...,
        title="Document Identifier",
        description="Echoed document identifier from the request",
    )
    url: str = Field(
        default=...,
        title="Document URL",
        description="Direct download URL",
    )

    # --- Optional metadata; both default to None when not available ---
    version: Optional[str] = Field(
        default=None,
        title="Document Version",
        description="Extracted version information (e.g., 'h20', 'v17.9.0') when available",
    )
    scope: Optional[str] = Field(
        default=None,
        title="Document Scope",
        description="Brief description of the document's scope and purpose from metadata",
    )

    # --- Timing information (required) ---
    search_time: float = Field(
        default=...,
        title="Search Duration",
        description="Time spent processing the request in seconds",
    )
47
 
48
class BatchDocRequest(BaseModel):
    """
    Request model for batch document retrieval.

    Allows retrieval of multiple documents in a single API call for efficiency.
    """
    # Required field: the caller must always send the list of identifiers.
    doc_ids: List[str] = Field(
        default=...,
        title="Document Identifier List",
        description="List of document identifiers to retrieve.",
    )
59
 
60
class BatchDocResponse(BaseModel):
    """
    Response model for batch document retrieval.

    Provides organized results separating found documents from missing ones.
    """
    # All three fields are required (Ellipsis default).
    results: Dict[str, str] = Field(
        default=...,
        title="Found Documents",
        description="Dictionary mapping document IDs to their corresponding URLs",
    )
    missing: List[str] = Field(
        default=...,
        title="Missing Documents",
        description="List of document IDs that could not be found or are not indexed",
    )
    search_time: float = Field(
        default=...,
        title="Total Search Duration",
        description="Total time spent processing the batch request in seconds",
    )
81
 
82
class KeywordRequest(BaseModel):
    """
    Request model for keyword-based specification search.

    Provides flexible search options with multiple modes and filtering capabilities.
    """
    # Query text; defaults to the empty string when omitted.
    keywords: Optional[str] = Field(
        default="",
        title="Search Keywords",
        description="Comma-separated keywords for searching specifications.",
        examples=["5G NR,authentication", "handover,mobility", "security,encryption"],
    )

    # The only required field of this model (Ellipsis default).
    search_mode: Literal["quick", "deep"] = Field(
        default=...,
        title="Search Mode",
        description="Search mode: 'quick' searches metadata only, 'deep' searches metadata and document content",
    )

    # Matching options and filters, each with an explicit default.
    case_sensitive: Optional[bool] = Field(
        default=False,
        title="Case Sensitive Search",
        description="Enable case-sensitive keyword matching",
    )
    source: Optional[Literal["3GPP", "ETSI", "all"]] = Field(
        default="all",
        title="Document Source",
        description="Limit search to specific organization or search all repositories",
    )
    spec_type: Optional[Literal["TS", "TR"]] = Field(
        default=None,
        title="Specification Type",
        description="Filter by specification type: 'TS' (Technical Specification) or 'TR' (Technical Report)",
    )
    mode: Optional[Literal["and", "or"]] = Field(
        default="and",
        title="Search Logic",
        description="Logical operator: 'and' requires all keywords to match, 'or' matches any keyword",
    )
119
+
120
class BM25KeywordRequest(BaseModel):
    """
    Request model for BM25 advanced search.

    Provides parameters for relevance-based search using BM25 scoring algorithm.
    """
    # Free-text query; defaults to the empty string when omitted.
    keywords: Optional[str] = Field(
        default="",
        title="Search Query",
        description="Natural language search query for BM25 processing",
        examples=["5G authentication procedures", "handover mobility management", "security key derivation"],
    )
    source: Optional[Literal["3GPP", "ETSI", "all"]] = Field(
        default="all",
        title="Document Source",
        description="Limit search to specific organization repositories",
    )
    # Validation bounds (ge/le) restrict the threshold to the 0-100 range.
    threshold: Optional[int] = Field(
        default=60,
        title="Relevance Threshold",
        description="Minimum normalized BM25 relevance score (0-100) for results inclusion",
        ge=0,
        le=100,
    )
    spec_type: Optional[Literal["TS", "TR"]] = Field(
        default=None,
        title="Specification Type",
        description="Filter results by specification type",
    )
149
 
150
class KeywordResponse(BaseModel):
    """
    Response model for keyword and BM25 search results.

    Contains ranked search results with metadata and timing information.
    """
    # Both fields are required (Ellipsis default).
    results: List[Dict[str, Any]] = Field(
        default=...,
        title="Search Results",
        description="List of matching specifications with complete metadata. In deep search mode, includes 'contains' field with matching content sections.",
    )
    search_time: float = Field(
        default=...,
        title="Search Duration",
        description="Time spent processing the search request in seconds",
    )