# Source: db_query / apps / fnb_parser.py
# Author: DavMelchi — commit f991b3c ("adding fn4b parser"), 12.1 kB
# (provenance lines recovered from the hosting page's raw/history/blame header)
"""
Streamlit application for extracting site and sector information from .docx design files.
The logic is adapted from `Sector Stacked.py` but provides an interactive UI where users can
upload one or many Word documents and instantly visualise / download the results.
"""
import io
import os
import re
from typing import List
import pandas as pd
import plotly.express as px
import streamlit as st
from docx import Document
###############################################################################
# --------------------------- Core extract logic -------------------------- #
###############################################################################
def extract_info_from_docx_separated_sectors(
    docx_bytes: bytes, filename: str
) -> List[dict]:
    """Extract the site-level and sector-level information from a Word design file.

    The document is scanned table-by-table; site-wide fields (code, name,
    address, coordinates, UTM zone) are captured once, while per-sector rows
    (azimuth, antenna height, mechanical/electrical tilt) are collected for up
    to three sectors. Matching is heuristic and keyed on French labels
    ("Localité", "Adresse", "Azimut", "Hauteur des aériens", ...).

    Parameters
    ----------
    docx_bytes : bytes
        Raw bytes of the `.docx` file – read directly from the Streamlit uploader.
    filename : str
        Original filename. Used only for reference in the output.

    Returns
    -------
    list[dict]
        A list containing up to three dictionaries – one for each sector.
        Sectors with no detected azimuth are omitted.
    """
    # python-docx can open a file-like object, so we wrap the bytes in BytesIO
    doc = Document(io.BytesIO(docx_bytes))
    # Shared site information (first match wins; fields stay None if absent)
    site_shared = {
        "File": filename,
        "Code": None,
        "Site Name": None,
        "Localité": None,
        "Adresse": None,
        "X": None,
        "Y": None,
        "Z": None,
        "UTM_Zone": None,
    }
    # Per-sector placeholders (we assume max 3 sectors)
    sector_data = {
        "Azimuth": [None] * 3,
        "Height": [None] * 3,
        "MechTilt": [None] * 3,
        "ElecTilt": [None] * 3,
    }
    # Iterate tables / rows / cells once, filling the data structures
    for table in doc.tables:
        for row in table.rows:
            # Drop empty cells and overspaces
            cells = [cell.text.strip() for cell in row.cells if cell.text.strip()]
            if not cells:
                continue
            row_text_lower = " | ".join(cells).lower()
            # Code (assumes pattern "T00" / "N01" typical of site codes)
            # NOTE(review): the "t00"/"n01" substrings look like an operator
            # naming convention — confirm against real documents.
            if site_shared["Code"] is None and any("code" in c.lower() for c in cells):
                for val in cells:
                    if ("t00" in val.lower()) or ("n01" in val.lower()):
                        # Codes are stored without internal spaces
                        site_shared["Code"] = val.replace(" ", "").strip()
                        break
            # Site Name – same heuristic as original script ("nom" label row,
            # value cell recognised by the same T00/N01 code substring)
            if site_shared["Site Name"] is None and any(
                "nom" in c.lower() for c in cells
            ):
                for val in cells:
                    if ("t00" in val.lower()) or ("n01" in val.lower()):
                        site_shared["Site Name"] = val.strip()
                        break
            # UTM Zone – first "utm <digits>" occurrence anywhere in the row
            if site_shared["UTM_Zone"] is None:
                utm_match = re.search(r"utm\s*(\d+)", row_text_lower)
                if utm_match:
                    site_shared["UTM_Zone"] = f"UTM{utm_match.group(1)}"
            # Localité and Adresse – take the first cell that is not the
            # label itself (label assumed to be exactly "localité:"/"adresse:")
            if site_shared["Localité"] is None and any(
                "localité" in c.lower() for c in cells
            ):
                for val in cells:
                    if val.lower() != "localité:":
                        site_shared["Localité"] = val.strip()
                        break
            if site_shared["Adresse"] is None and any(
                "adresse" in c.lower() for c in cells
            ):
                for val in cells:
                    if val.lower() != "adresse:":
                        site_shared["Adresse"] = val.strip()
                        break
            # Coordinates (X, Y, Z) – only rows containing a lone "X"/"Y"/"Z"
            # label cell are inspected; values may sit in the next cell or be
            # fused with the letter in the same cell (e.g. "Y 12° 3' W")
            if {"X", "Y", "Z"}.intersection(cells):
                for i, cell_text in enumerate(cells):
                    text = cell_text.strip()
                    # X coordinate
                    if text == "X" and i + 1 < len(cells):
                        site_shared["X"] = cells[i + 1].strip()
                    # Y coordinate – could be in same cell e.g. "Y 123" or split
                    elif re.search(r"Y\s*[0-9]", text):
                        match = re.search(r"Y\s*([0-9°'\.\sWE]+)", text)
                        if match:
                            site_shared["Y"] = match.group(1).strip()
                    elif text == "Y" and i + 1 < len(cells):
                        site_shared["Y"] = cells[i + 1].strip()
                    # Z / Elevation – keep digits only
                    elif re.search(r"Z\s*[0-9]", text):
                        match = re.search(r"Z\s*([0-9]+)", text)
                        if match:
                            site_shared["Z"] = match.group(1).strip()
                    elif text == "Z" and i + 1 < len(cells):
                        z_val = re.search(r"([0-9]+)", cells[i + 1])
                        if z_val:
                            site_shared["Z"] = z_val.group(1).strip()
            # Sector-specific lines: the first cell is the French label and the
            # following (up to 3) cells map positionally to sectors 1..3
            first_cell = cells[0].lower()
            if first_cell == "azimut":
                for i in range(min(3, len(cells) - 1)):
                    sector_data["Azimuth"][i] = cells[i + 1]
            elif "hauteur des aériens" in first_cell:  # antenna height
                for i in range(min(3, len(cells) - 1)):
                    sector_data["Height"][i] = cells[i + 1]
            elif "tilt mécanique" in first_cell:  # mechanical tilt
                for i in range(min(3, len(cells) - 1)):
                    sector_data["MechTilt"][i] = cells[i + 1]
            elif "tilt électrique" in first_cell:  # electrical tilt
                for i in range(min(3, len(cells) - 1)):
                    sector_data["ElecTilt"][i] = cells[i + 1]
    # Convert to per-sector rows; a sector exists only if it has an azimuth
    rows: List[dict] = []
    for sector_id in range(3):
        if sector_data["Azimuth"][sector_id]:
            rows.append(
                {
                    **site_shared,
                    "Sector ID": sector_id + 1,
                    "Azimuth": sector_data["Azimuth"][sector_id],
                    "Height": sector_data["Height"][sector_id],
                    "MechTilt": sector_data["MechTilt"][sector_id],
                    "ElecTilt": sector_data["ElecTilt"][sector_id],
                }
            )
    return rows
def convert_coord_to_decimal(coord: str, default_direction: str | None = None):
"""Convert coordinate strings containing degrees/minutes/seconds to decimal degrees.
Handles various formats, e.g. "3° 33' 12.4\" W", "3 33 12.4 O", "-3.5534", "3.5534E".
West (W/O) or South (S) are returned as negative values.
Returns None if conversion fails.
"""
if coord is None or (isinstance(coord, float) and pd.isna(coord)):
return None
# Normalise the string – unify decimal separator and strip spaces
text = str(coord).replace(",", ".").strip()
if not text:
return None
# Detect hemisphere / direction letters
direction = None
match_dir = re.search(r"([NSEWnsewOo])", text)
if match_dir:
direction = match_dir.group(1).upper()
text = text.replace(match_dir.group(1), "") # remove letter for numeric parsing
else:
# No explicit letter – use supplied default if provided
if default_direction is not None:
direction = default_direction.upper()
# Grab all numeric components
nums = re.findall(r"[-+]?(?:\d+\.?\d*)", text)
if not nums:
return None
# Convert strings to float
nums_f = [float(n) for n in nums]
# Determine decimal value depending on how many components we have
if len(nums_f) >= 3:
deg, minute, sec = nums_f[0], nums_f[1], nums_f[2]
dec = deg + minute / 60 + sec / 3600
elif len(nums_f) == 2:
deg, minute = nums_f[0], nums_f[1]
dec = deg + minute / 60
else: # Already decimal degrees
dec = nums_f[0]
# Apply sign for West/Ouest/South
if direction in {"W", "O", "S"}: # West/Ouest or South => negative
dec = -abs(dec)
return dec
def process_files_to_dataframe(uploaded_files) -> pd.DataFrame:
    """Run extraction on every uploaded file and return one combined dataframe.

    Each file contributes up to three sector rows. When both "X" and "Y"
    columns exist, decimal-degree variants are appended as "X_decimal" and
    "Y_decimal".
    """
    extracted = [
        row
        for uploaded in uploaded_files
        for row in extract_info_from_docx_separated_sectors(
            uploaded.read(), uploaded.name
        )
    ]
    frame = pd.DataFrame(extracted)
    # X defaults to North and Y to West when the raw text has no hemisphere letter
    if not frame.empty and {"X", "Y"}.issubset(frame.columns):
        frame["X_decimal"] = frame["X"].apply(
            lambda raw: convert_coord_to_decimal(raw, default_direction="N")
        )
        frame["Y_decimal"] = frame["Y"].apply(
            lambda raw: convert_coord_to_decimal(raw, default_direction="W")
        )
    return frame
###############################################################################
# ----------------------------- Streamlit UI ------------------------------ #
###############################################################################
def main() -> None:
    """Streamlit entry point: upload F4NB `.docx` files, extract site/sector
    data, show the table and a map, and offer an Excel download."""
    st.set_page_config(
        page_title="F4NB Extractor to Excel", page_icon="📄", layout="wide"
    )
    st.title("📄 F4NB Extractor to Excel")
    st.markdown(
        "Convert F4NB Word documents into a tidy Excel / DataFrame containing site & sector information.\n"
        "Upload one or many F4NB `.docx` files and hit **Process**."
    )
    st.subheader("Upload Files")
    uploaded_files = st.file_uploader(
        "Select one or more F4NB `.docx` files",
        type=["docx"],
        accept_multiple_files=True,
    )
    process_btn = st.button("Process", type="primary", disabled=not uploaded_files)
    if process_btn and uploaded_files:
        with st.spinner("Extracting information…"):
            df = process_files_to_dataframe(uploaded_files)
        if df.empty:
            st.warning(
                "No data extracted. Check that the files conform to the expected format."
            )
            return
        st.success(
            f"Processed {len(uploaded_files)} file(s) – extracted {len(df)} sector rows."
        )
        st.dataframe(df, use_container_width=True)
        # Interactive map of extracted coordinates using Plotly
        if {"Y_decimal", "X_decimal"}.issubset(df.columns):
            # BUG FIX: "Height" was not part of the column selection, so the
            # Height-based marker sizing below could never trigger. Keep it
            # when present, and only require the essential columns to be
            # non-null so a missing height does not drop the whole site.
            base_cols = ["Y_decimal", "X_decimal", "Site Name", "Code"]
            cols = base_cols + (["Height"] if "Height" in df.columns else [])
            geo_df = (
                df[cols]
                .dropna(subset=base_cols)
                .rename(columns={"Y_decimal": "Longitude", "X_decimal": "Latitude"})
            )
            # Marker size: numeric antenna height where parseable, else 10
            if "Height" in geo_df.columns:
                geo_df["Size"] = pd.to_numeric(
                    geo_df["Height"], errors="coerce"
                ).fillna(10)
            else:
                geo_df["Size"] = 10
            if not geo_df.empty:
                st.subheader("🗺️ Site Locations")
                fig = px.scatter_map(
                    geo_df,
                    lat="Latitude",
                    lon="Longitude",
                    hover_name="Site Name",
                    hover_data={"Code": True},
                    size="Size",
                    size_max=10,
                    zoom=6,
                    height=500,
                )
                # BUG FIX: px.scatter_map renders with MapLibre (layout.map),
                # so the previous `mapbox_style` key was silently ignored –
                # `map_style` is the attribute that actually applies here.
                fig.update_layout(
                    map_style="open-street-map",
                    margin={"r": 0, "t": 0, "l": 0, "b": 0},
                )
                st.plotly_chart(fig, use_container_width=True)
        # Offer download as Excel
        buffer = io.BytesIO()
        with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
            df.to_excel(writer, index=False, sheet_name="Extract")
        st.download_button(
            label="💾 Download Excel",
            data=buffer.getvalue(),
            file_name="extracted_fnb.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
if __name__ == "__main__":  # pragma: no cover
    # Entry point when executed directly (e.g. `streamlit run fnb_parser.py`)
    main()