DavMelchi committed on
Commit
c83b398
·
1 Parent(s): 7c8002a

New code for dump compare

Browse files
Files changed (1) hide show
  1. apps/dump_compare.py +93 -177
apps/dump_compare.py CHANGED
@@ -1,13 +1,11 @@
1
  import os
2
- import shutil
3
- import tempfile
4
 
5
  import pandas as pd
6
  import streamlit as st
7
 
8
- # import xlwings as xw
9
-
10
- # === Core Logic ===
11
 
12
 
13
  def find_header_row(df, keyword="Dist_Name"):
@@ -18,37 +16,13 @@ def find_header_row(df, keyword="Dist_Name"):
18
  raise ValueError(f"No row with '{keyword}' found.")
19
 
20
 
21
- # import xlwings as xw
22
- # def read_sheet_fallback(file, sheet):
23
- # try:
24
- # app = xw.App(visible=False)
25
- # book = app.books.open(file)
26
- # sht = book.sheets[sheet]
27
- # df = sht.used_range.options(pd.DataFrame, header=False, index=False).value
28
- # book.close()
29
- # app.quit()
30
- # return df
31
- # except Exception as e2:
32
- # raise RuntimeError(f"xlwings failed: {e2}")
33
-
34
-
35
- def read_sheet_fallback(file, sheet):
36
- try:
37
- # pandas can directly read Excel files
38
- # sheet_name can be the sheet name (string) or sheet number (0-indexed)
39
- df = pd.read_excel(file, sheet_name=sheet, header=None, engine="calamine")
40
- return df
41
- except FileNotFoundError:
42
- raise FileNotFoundError(f"The file '{file}' was not found.")
43
- except ValueError as e:
44
- # This could happen if the sheet doesn't exist, or other pandas-related errors
45
- raise ValueError(f"Error reading sheet '{sheet}' from '{file}': {e}")
46
- except Exception as e:
47
- raise RuntimeError(f"An unexpected error occurred: {e}")
48
-
49
-
50
- def load_clean_df(file, sheet):
51
- df_raw = read_sheet_fallback(file, sheet)
52
  header_row = find_header_row(df_raw)
53
  df_raw.columns = df_raw.iloc[header_row]
54
  df = df_raw.drop(index=list(range(header_row + 1)))
@@ -64,152 +38,94 @@ def detect_dist_col(columns):
64
  raise ValueError("Dist_Name column not found.")
65
 
66
 
67
- def compare_dumps(
68
- old_file,
69
- new_file,
70
- mo_list,
71
- output_dir,
72
- # progress_callback=None
73
- ):
74
- os.makedirs(output_dir, exist_ok=True)
75
-
76
- # Friendly column labels based on file names
77
- old_label = os.path.basename(old_file)
78
- new_label = os.path.basename(new_file)
79
-
80
- total_changes = 0
81
- logs = []
82
-
83
- for i, sheet_name in enumerate(mo_list):
84
- try:
85
- df_old = load_clean_df(old_file, sheet_name)
86
- df_new = load_clean_df(new_file, sheet_name)
87
-
88
- dist_col_old = detect_dist_col(df_old.columns)
89
- dist_col_new = detect_dist_col(df_new.columns)
90
-
91
- df_old = df_old[df_old[dist_col_old].notna()].set_index(dist_col_old)
92
- df_new = df_new[df_new[dist_col_new].notna()].set_index(dist_col_new)
93
-
94
- common = df_old.index.intersection(df_new.index)
95
- df_old_common = df_old.loc[common]
96
- df_new_common = df_new.loc[common]
97
-
98
- mask = (df_old_common != df_new_common) & ~(
99
- df_old_common.isna() & df_new_common.isna()
100
- )
101
 
102
- changes = []
103
- for dist in mask.index:
104
- for param in mask.columns[mask.loc[dist]]:
105
- if param.strip().lower() == "file_name":
106
- continue
107
-
108
- changes.append(
109
- {
110
- "Dist_Name": dist,
111
- "Parameter": param,
112
- old_label: df_old_common.loc[dist, param],
113
- new_label: df_new_common.loc[dist, param],
114
- }
115
- )
116
-
117
- df_changes = pd.DataFrame(changes)
118
- if not df_changes.empty:
119
- output_path = os.path.join(output_dir, f"{sheet_name}_differences.xlsx")
120
- df_changes.to_excel(output_path, index=False)
121
- logs.append(f"{len(df_changes)} changes in {sheet_name}")
122
- total_changes += len(df_changes)
123
- else:
124
- logs.append(f"No changes in {sheet_name}")
125
-
126
- except Exception as e:
127
- logs.append(f"Error in {sheet_name}: {e}")
128
-
129
- # if progress_callback:
130
- # progress_callback((i + 1) / len(mo_list))
131
-
132
- return total_changes, logs
133
-
134
-
135
- # === Streamlit UI ===
136
-
137
- st.title("📊 Dump Compare Tool")
138
 
139
  old_file = st.file_uploader("Upload Old Dump (.xlsb)", type=["xlsb"], key="old")
140
  new_file = st.file_uploader("Upload New Dump (.xlsb)", type=["xlsb"], key="new")
141
 
142
- # Determine common sheet names available in BOTH uploaded dumps and let the user pick
143
- common_sheets: list[str] = []
144
- selected_sheets: list[str] = []
145
-
146
- if old_file and new_file:
147
- import tempfile as _tmp
148
-
149
- from pyxlsb import open_workbook as _open_wb
150
-
151
- def _get_sheet_names(uploaded_file) -> list[str]:
152
- """Return sheet names from an `st.uploaded_file` object."""
153
- with _tmp.NamedTemporaryFile(delete=False, suffix=".xlsb") as tmp:
154
- tmp.write(uploaded_file.getvalue())
155
- tmp_path = tmp.name
156
- try:
157
- with _open_wb(tmp_path) as wb:
158
- # `wb.sheets` in pyxlsb already returns a list of sheet names (str)
159
- return list(wb.sheets)
160
- finally:
161
- os.remove(tmp_path)
162
-
163
- common_sheets = sorted(
164
- set(_get_sheet_names(old_file)).intersection(_get_sheet_names(new_file))
165
- )
166
-
167
- if common_sheets:
168
- selected_sheets = st.multiselect(
169
- "MO Sheet Names (choose one or more)",
170
- common_sheets,
171
- default=common_sheets[:1], # select only the first sheet by default
172
- )
173
- else:
174
- st.warning("No common sheet names found between the two files.")
175
- output_dir = "comparison_output" # fixed output folder name
176
 
177
  if st.button("Run Comparison", type="primary", use_container_width=True):
178
- if not all([old_file, new_file]) or not selected_sheets:
179
- st.warning("Please upload both files and select at least one common sheet.")
180
  else:
181
- mo_list = selected_sheets
182
- # Reset file pointers because they may have been consumed while reading sheet names
183
- old_file.seek(0)
184
- new_file.seek(0)
185
- with st.spinner("Comparing dumps..."):
186
- with tempfile.TemporaryDirectory() as tmpdir:
187
- output_path = os.path.join(tmpdir, output_dir)
188
- old_path = os.path.join(tmpdir, "old.xlsb")
189
- new_path = os.path.join(tmpdir, "new.xlsb")
190
-
191
- with open(old_path, "wb") as f:
192
- f.write(old_file.read())
193
- with open(new_path, "wb") as f:
194
- f.write(new_file.read())
195
-
196
- # progress_bar = st.progress(0.0)
197
-
198
- # def update_progress(pct):
199
- # progress_bar.progress(pct)
200
-
201
- total, logs = compare_dumps(old_path, new_path, mo_list, output_path)
202
-
203
- st.success(f"✅ Comparison completed. Total changes: {total}")
204
-
205
- # Zip and offer download
206
- shutil.make_archive(output_path, "zip", output_path)
207
- with open(f"{output_path}.zip", "rb") as f:
208
- st.download_button(
209
- "Download Results (.zip)",
210
- f,
211
- file_name="differences.zip",
212
- mime="application/zip",
213
- type="primary",
214
- on_click="ignore",
215
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import zipfile
3
+ from io import BytesIO
4
 
5
  import pandas as pd
6
  import streamlit as st
7
 
8
+ # === Fonctions ===
 
 
9
 
10
 
11
  def find_header_row(df, keyword="Dist_Name"):
 
16
  raise ValueError(f"No row with '{keyword}' found.")
17
 
18
 
19
+ def read_sheet_fallback(file_bytes, sheet):
20
+ file_bytes.seek(0)
21
+ return pd.read_excel(file_bytes, sheet_name=sheet, header=None, engine="calamine")
22
+
23
+
24
+ def load_clean_df(file_bytes, sheet):
25
+ df_raw = read_sheet_fallback(file_bytes, sheet)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  header_row = find_header_row(df_raw)
27
  df_raw.columns = df_raw.iloc[header_row]
28
  df = df_raw.drop(index=list(range(header_row + 1)))
 
38
  raise ValueError("Dist_Name column not found.")
39
 
40
 
41
+ # === Interface Streamlit ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ st.title("📊 Dump Compare Tool (In-Memory with Calamine)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  old_file = st.file_uploader("Upload Old Dump (.xlsb)", type=["xlsb"], key="old")
46
  new_file = st.file_uploader("Upload New Dump (.xlsb)", type=["xlsb"], key="new")
47
 
48
+ sheet_list_input = st.text_input(
49
+ "Enter sheet names (comma-separated)", placeholder="e.g. BCF, BTS, CELL"
50
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  if st.button("Run Comparison", type="primary", use_container_width=True):
53
+ if not all([old_file, new_file, sheet_list_input.strip()]):
54
+ st.warning("Please upload both files and provide at least one sheet name.")
55
  else:
56
+ sheet_names = [s.strip() for s in sheet_list_input.split(",") if s.strip()]
57
+ old_bytes = BytesIO(old_file.read())
58
+ new_bytes = BytesIO(new_file.read())
59
+
60
+ logs = []
61
+ total = 0
62
+ all_results = {}
63
+
64
+ for sheet in sheet_names:
65
+ try:
66
+ df_old = load_clean_df(old_bytes, sheet)
67
+ old_bytes.seek(0)
68
+ df_new = load_clean_df(new_bytes, sheet)
69
+ new_bytes.seek(0)
70
+
71
+ dist_col_old = detect_dist_col(df_old.columns)
72
+ dist_col_new = detect_dist_col(df_new.columns)
73
+
74
+ df_old = df_old[df_old[dist_col_old].notna()].set_index(dist_col_old)
75
+ df_new = df_new[df_new[dist_col_new].notna()].set_index(dist_col_new)
76
+
77
+ common = df_old.index.intersection(df_new.index)
78
+ df_old_common = df_old.loc[common]
79
+ df_new_common = df_new.loc[common]
80
+
81
+ mask = (df_old_common != df_new_common) & ~(
82
+ df_old_common.isna() & df_new_common.isna()
83
+ )
84
+
85
+ changes = []
86
+ for dist in mask.index:
87
+ for param in mask.columns[mask.loc[dist]]:
88
+ if param.strip().lower() == "file_name":
89
+ continue
90
+ changes.append(
91
+ {
92
+ "Dist_Name": dist,
93
+ "Parameter": param,
94
+ os.path.basename(old_file.name): df_old_common.loc[
95
+ dist, param
96
+ ],
97
+ os.path.basename(new_file.name): df_new_common.loc[
98
+ dist, param
99
+ ],
100
+ }
101
+ )
102
+
103
+ df_changes = pd.DataFrame(changes)
104
+ if not df_changes.empty:
105
+ all_results[sheet] = df_changes
106
+ logs.append(f"{len(df_changes)} changes in '{sheet}'")
107
+ total += len(df_changes)
108
+ else:
109
+ logs.append(f"No changes in '{sheet}'")
110
+
111
+ except Exception as e:
112
+ logs.append(f"❌ Error in '{sheet}': {e}")
113
+
114
+ st.success(f"✅ Comparison completed. Total changes: {total}")
115
+ for log in logs:
116
+ st.write(log)
117
+
118
+ if all_results:
119
+ output_buffer = BytesIO()
120
+ with zipfile.ZipFile(output_buffer, mode="w") as zf:
121
+ for sheet, df in all_results.items():
122
+ file_buffer = BytesIO()
123
+ df.to_excel(file_buffer, index=False)
124
+ zf.writestr(f"{sheet}_differences.xlsx", file_buffer.getvalue())
125
+
126
+ st.download_button(
127
+ "Download Results (.zip)",
128
+ data=output_buffer.getvalue(),
129
+ file_name="differences.zip",
130
+ mime="application/zip",
131
+ )