""" Streamlit application for extracting site and sector information from .docx design files. The logic is adapted from `Sector Stacked.py` but provides an interactive UI where users can upload one or many Word documents and instantly visualise / download the results. """ import io import os import re from typing import List import pandas as pd import plotly.express as px import streamlit as st from docx import Document ############################################################################### # --------------------------- Core extract logic -------------------------- # ############################################################################### def extract_info_from_docx_separated_sectors( docx_bytes: bytes, filename: str ) -> List[dict]: """Extract the site-level and sector-level information from a Word design file. Parameters ---------- docx_bytes : bytes Raw bytes of the `.docx` file – read directly from the Streamlit uploader. filename : str Original filename. Used only for reference in the output. Returns ------- list[dict] A list containing up to three dictionaries – one for each sector. """ # python-docx can open a file-like object, so we wrap the bytes in BytesIO doc = Document(io.BytesIO(docx_bytes)) # Shared site information site_shared = { "File": filename, "Code": None, "Site Name": None, "Localité": None, "Adresse": None, "X": None, "Y": None, "Z": None, "UTM_Zone": None, } # Per-sector placeholders (we assume max 3 sectors) sector_data = { "Azimuth": [None] * 3, "Height": [None] * 3, "MechTilt": [None] * 3, "ElecTilt": [None] * 3, } # Iterate tables / rows / cells once, filling the data structures for table in doc.tables: for row in table.rows: # Drop empty cells and overspaces cells = [cell.text.strip() for cell in row.cells if cell.text.strip()] if not cells: continue row_text_lower = " | ".join(cells).lower() # Code (assumes pattern "T00" / "N01" typical of site codes) if site_shared["Code"] is None and any("code" in c.lower() for c in cells): for val in cells: if ("t00" in val.lower()) or ("n01" in val.lower()): site_shared["Code"] = val.replace(" ", "").strip() break # Site Name – same heuristic as original script if site_shared["Site Name"] is None and any( "nom" in c.lower() for c in cells ): for val in cells: if ("t00" in val.lower()) or ("n01" in val.lower()): site_shared["Site Name"] = val.strip() break # UTM Zone if site_shared["UTM_Zone"] is None: utm_match = re.search(r"utm\s*(\d+)", row_text_lower) if utm_match: site_shared["UTM_Zone"] = f"UTM{utm_match.group(1)}" # Localité and Adresse if site_shared["Localité"] is None and any( "localité" in c.lower() for c in cells ): for val in cells: if val.lower() != "localité:": site_shared["Localité"] = val.strip() break if site_shared["Adresse"] is None and any( "adresse" in c.lower() for c in cells ): for val in cells: if val.lower() != "adresse:": site_shared["Adresse"] = val.strip() break # Coordinates (X, Y, Z) if {"X", "Y", "Z"}.intersection(cells): for i, cell_text in enumerate(cells): text = cell_text.strip() # X coordinate if text == "X" and i + 1 < len(cells): site_shared["X"] = cells[i + 1].strip() # Y coordinate – could be in same cell e.g. "Y 123" or split elif re.search(r"Y\s*[0-9]", text): match = re.search(r"Y\s*([0-9°'\.\sWE]+)", text) if match: site_shared["Y"] = match.group(1).strip() elif text == "Y" and i + 1 < len(cells): site_shared["Y"] = cells[i + 1].strip() # Z / Elevation elif re.search(r"Z\s*[0-9]", text): match = re.search(r"Z\s*([0-9]+)", text) if match: site_shared["Z"] = match.group(1).strip() elif text == "Z" and i + 1 < len(cells): z_val = re.search(r"([0-9]+)", cells[i + 1]) if z_val: site_shared["Z"] = z_val.group(1).strip() # Sector-specific lines first_cell = cells[0].lower() if first_cell == "azimut": for i in range(min(3, len(cells) - 1)): sector_data["Azimuth"][i] = cells[i + 1] elif "hauteur des aériens" in first_cell: for i in range(min(3, len(cells) - 1)): sector_data["Height"][i] = cells[i + 1] elif "tilt mécanique" in first_cell: for i in range(min(3, len(cells) - 1)): sector_data["MechTilt"][i] = cells[i + 1] elif "tilt électrique" in first_cell: for i in range(min(3, len(cells) - 1)): sector_data["ElecTilt"][i] = cells[i + 1] # Convert to per-sector rows rows: List[dict] = [] for sector_id in range(3): if sector_data["Azimuth"][sector_id]: rows.append( { **site_shared, "Sector ID": sector_id + 1, "Azimuth": sector_data["Azimuth"][sector_id], "Height": sector_data["Height"][sector_id], "MechTilt": sector_data["MechTilt"][sector_id], "ElecTilt": sector_data["ElecTilt"][sector_id], } ) return rows def convert_coord_to_decimal(coord: str, default_direction: str | None = None): """Convert coordinate strings containing degrees/minutes/seconds to decimal degrees. Handles various formats, e.g. "3° 33' 12.4\" W", "3 33 12.4 O", "-3.5534", "3.5534E". West (W/O) or South (S) are returned as negative values. Returns None if conversion fails. """ if coord is None or (isinstance(coord, float) and pd.isna(coord)): return None # Normalise the string – unify decimal separator and strip spaces text = str(coord).replace(",", ".").strip() if not text: return None # Detect hemisphere / direction letters direction = None match_dir = re.search(r"([NSEWnsewOo])", text) if match_dir: direction = match_dir.group(1).upper() text = text.replace(match_dir.group(1), "") # remove letter for numeric parsing else: # No explicit letter – use supplied default if provided if default_direction is not None: direction = default_direction.upper() # Grab all numeric components nums = re.findall(r"[-+]?(?:\d+\.?\d*)", text) if not nums: return None # Convert strings to float nums_f = [float(n) for n in nums] # Determine decimal value depending on how many components we have if len(nums_f) >= 3: deg, minute, sec = nums_f[0], nums_f[1], nums_f[2] dec = deg + minute / 60 + sec / 3600 elif len(nums_f) == 2: deg, minute = nums_f[0], nums_f[1] dec = deg + minute / 60 else: # Already decimal degrees dec = nums_f[0] # Apply sign for West/Ouest/South if direction in {"W", "O", "S"}: # West/Ouest or South => negative dec = -abs(dec) return dec def process_files_to_dataframe(uploaded_files) -> pd.DataFrame: """Run extraction on the uploaded files and return a concatenated dataframe.""" all_rows: List[dict] = [] for uploaded in uploaded_files: rows = extract_info_from_docx_separated_sectors(uploaded.read(), uploaded.name) all_rows.extend(rows) df = pd.DataFrame(all_rows) # Add decimal conversion for X and Y if not df.empty and {"X", "Y"}.issubset(df.columns): df["X_decimal"] = df["X"].apply( lambda c: convert_coord_to_decimal(c, default_direction="N") ) df["Y_decimal"] = df["Y"].apply( lambda c: convert_coord_to_decimal(c, default_direction="W") ) return df ############################################################################### # ----------------------------- Streamlit UI ------------------------------ # ############################################################################### def main() -> None: st.set_page_config( page_title="F4NB Extractor to Excel", page_icon="📄", layout="wide" ) st.title("📄 F4NB Extractor to Excel") st.markdown( "Convert F4NB Word documents into a tidy Excel / DataFrame containing site & sector information.\n" "Upload one or many F4NB `.docx` files and hit **Process**." ) st.subheader("Upload Files") uploaded_files = st.file_uploader( "Select one or more F4NB `.docx` files", type=["docx"], accept_multiple_files=True, ) process_btn = st.button("Process", type="primary", disabled=not uploaded_files) if process_btn and uploaded_files: with st.spinner("Extracting information…"): df = process_files_to_dataframe(uploaded_files) if df.empty: st.warning( "No data extracted. Check that the files conform to the expected format." ) return st.success( f"Processed {len(uploaded_files)} file(s) – extracted {len(df)} sector rows." ) st.dataframe(df, use_container_width=True) # Interactive map of extracted coordinates using Plotly if {"Y_decimal", "X_decimal"}.issubset(df.columns): geo_df = ( df[["Y_decimal", "X_decimal", "Site Name", "Code"]] .dropna() .rename(columns={"Y_decimal": "Longitude", "X_decimal": "Latitude"}) .assign( Size=lambda d: ( pd.to_numeric(d["Height"], errors="coerce").fillna(10) if "Height" in d.columns else 10 ) ) ) if not geo_df.empty: st.subheader("🗺️ Site Locations") fig = px.scatter_map( geo_df, lat="Latitude", lon="Longitude", hover_name="Site Name", hover_data={"Code": True}, size="Size", size_max=10, zoom=6, height=500, ) fig.update_layout( mapbox_style="open-street-map", margin={"r": 0, "t": 0, "l": 0, "b": 0}, ) st.plotly_chart(fig, use_container_width=True) # Offer download as Excel buffer = io.BytesIO() with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer: df.to_excel(writer, index=False, sheet_name="Extract") st.download_button( label="💾 Download Excel", data=buffer.getvalue(), file_name="extracted_fnb.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ) if __name__ == "__main__": # pragma: no cover main()