# DocFinder / classes.py
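"""Helper classes for locating ETSI documents.

ETSIDocFinder searches SET/SCP contributions on docbox.etsi.org (after
logging in to the ETSI EOL portal); ETSISpecFinder resolves TS/TR spec
numbers to PDF download URLs under www.etsi.org/deliver.
"""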

import json
import os
import re

import requests
import urllib3
from bs4 import BeautifulSoup

# Requests are made with verify=False against the ETSI hosts, so silence
# the InsecureRequestWarning they would otherwise emit on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class ETSIDocFinder:
    """Finds SET/SCP contribution documents on the ETSI docbox FTP mirror."""

    def __init__(self):
        self.main_ftp_url = "https://docbox.etsi.org/SET"
        req_data = self.connect()
        print(req_data["message"])
        self.session = req_data["session"]

    def connect(self):
        """Log in to the ETSI EOL portal and return the authenticated session."""
        session = requests.Session()
        req = session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"},
            data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}),
        )
        if req.text == "Failed":
            return {"error": True, "session": session, "message": "Login failed! Check your credentials"}
        return {"error": False, "session": session, "message": "Login successful"}

    def get_workgroup(self, doc: str):
        """Map a document ID prefix to its TSG and working-group folder key."""
        if any(doc.startswith(kw) for kw in ["SETREQ", "SCPREQ"]):
            main_tsg = "SET-WG-R"
        elif any(doc.startswith(kw) for kw in ["SETTEC", "SCPTEC"]):
            main_tsg = "SET-WG-T"
        elif any(doc.startswith(kw) for kw in ["SET", "SCP"]):
            main_tsg = "SET"
        else:
            return None, None, None
        # Folder key: "20" prepended to the number in parentheses, e.g. "(24)" -> "2024".
        regex = re.search(r'\(([^)]+)\)', doc)
        if regex is None:
            # Malformed ID without a parenthesised number: nothing to look up.
            return None, None, None
        workgroup = "20" + regex.group(1)
        return main_tsg, workgroup, doc

    def find_workgroup_url(self, main_tsg, workgroup):
        """Locate the contributions folder whose name contains the workgroup key."""
        response = self.session.get(f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS", verify=False)
        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.find_all("tr"):
            link = item.find("a")
            if link and workgroup in link.get_text():
                return f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS/{link.get_text()}"
        # Fall back to the bare workgroup key if no matching folder is listed.
        return f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS/{workgroup}"

    def get_docs_from_url(self, url):
        """Return the link texts (file names) from a docbox directory listing."""
        try:
            response = self.session.get(url, verify=False, timeout=15)
            soup = BeautifulSoup(response.text, "html.parser")
            return [item.get_text() for item in soup.select("tr td a")]
        except Exception as e:
            print(f"Error accessing {url}: {e}")
            return []

    def search_document(self, doc_id: str):
        original = doc_id
        main_tsg, workgroup, doc = self.get_workgroup(doc_id)
        urls = []
        if main_tsg:
            wg_url = self.find_workgroup_url(main_tsg, workgroup)
            print(wg_url)
            if wg_url:
                files = self.get_docs_from_url(wg_url)
                print(files)
                for f in files:
                    # Case-insensitive match on the ID, or an exact match on the original.
                    if doc.lower() in f.lower() or original in f:
                        print(f)
                        urls.append(f"{wg_url}/{f}")
        # One hit: return it. Several hits: return the second-to-last match.
        return urls[0] if len(urls) == 1 else urls[-2] if len(urls) > 1 else f"Document {doc_id} not found"


class ETSISpecFinder:
    """Resolves ETSI TS/TR spec numbers to PDF URLs on www.etsi.org/deliver."""

    def __init__(self):
        self.main_url = "https://www.etsi.org/deliver/etsi_ts"
        self.second_url = "https://www.etsi.org/deliver/etsi_tr"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}

    def get_spec_path(self, doc_id: str):
        """Build the deliver-server path, e.g. "103 666-2" -> "103600_103699/10366602"."""
        if "-" in doc_id:
            position, part = doc_id.split("-", 1)
        else:
            position, part = doc_id, None
        position = position.replace(" ", "")
        # Zero-pad single-digit part numbers: "-2" becomes "02".
        spec_folder = (position + part.zfill(2)) if part else position
        # Specs are grouped into folders of 100, e.g. 103666 lives under 103600_103699.
        base = int(position) - (int(position) % 100)
        return f"{base}_{base + 99}/{spec_folder}"

    def get_docs_from_url(self, url):
        """Return the link texts from a deliver-server directory listing."""
        try:
            response = requests.get(url, headers=self.headers, verify=False, timeout=15)
            soup = BeautifulSoup(response.text, "html.parser")
            # Drop the first link (the parent-directory entry).
            return [item.get_text() for item in soup.find_all("a")][1:]
        except Exception as e:
            print(f"Error accessing {url}: {e}")
            return []

    def search_document(self, doc_id: str):
        # Example: "103 666" or "103 666-2" (the part suffix is optional).
        original = doc_id
        url = f"{self.main_url}/{self.get_spec_path(original)}/"
        url2 = f"{self.second_url}/{self.get_spec_path(original)}/"
        print(url)
        print(url2)
        # Try the TS tree first, then the TR tree; in each, take the latest
        # release folder and return its first PDF.
        for base_url in (url, url2):
            releases = self.get_docs_from_url(base_url)
            if not releases:
                continue
            files = self.get_docs_from_url(base_url + releases[-1])
            for f in files:
                if f.endswith(".pdf"):
                    return base_url + releases[-1] + "/" + f
        return f"Specification {doc_id} not found"