Testys committed on
Commit 748bb2f · 1 Parent(s): dd6b309

Create diagnostics.py

Files changed (1)
  1. diagnostics.py +83 -0
diagnostics.py ADDED
@@ -0,0 +1,83 @@
# diagnostics.py
import logging
import time
import os
from pathlib import Path

import pandas as pd
import pyarrow.parquet as pq
import streamlit as st

# Configure logging to write to both a file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("metadata_manager.log"),
        logging.StreamHandler()
    ]
)


def diagnose_parquet_files(directory_path):
    """Diagnostic tool to verify parquet files are readable and valid."""
    logger = logging.getLogger("ParquetDiagnostic")
    logger.info(f"Starting parquet file diagnostics in {directory_path}")

    dir_path = Path(directory_path)
    if not dir_path.exists():
        logger.error(f"Directory does not exist: {dir_path}")
        return False

    all_files = list(dir_path.glob("*.parquet"))
    logger.info(f"Found {len(all_files)} parquet files")

    if not all_files:
        logger.warning("No parquet files found")
        return False

    success_count = 0
    issue_count = 0
    total_rows = 0

    for file_path in all_files:
        logger.info(f"Diagnosing file: {file_path}")
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        logger.info(f"File size: {file_size_mb:.2f} MB")

        try:
            # Read the file metadata without loading the data
            parquet_file = pq.ParquetFile(file_path)

            # Log schema information
            schema = parquet_file.schema.to_arrow_schema()
            logger.info(f"Schema: {schema}")

            # Log file metadata
            metadata = parquet_file.metadata
            num_rows = metadata.num_rows
            num_columns = len(schema.names)
            logger.info(f"Rows: {num_rows}, Columns: {num_columns}")

            # Read the file back in full to verify the data can actually be loaded
            df = pd.read_parquet(file_path, engine='pyarrow')
            actual_rows = len(df)

            logger.info(f"Successfully read {actual_rows} rows")
            total_rows += actual_rows
            success_count += 1

        except Exception as e:
            logger.error(f"Failed to read file {file_path}: {str(e)}", exc_info=True)
            issue_count += 1

            # Try an alternate engine if the primary fails (informational
            # only: the file is still counted as having issues)
            try:
                logger.info("Attempting to read with fastparquet engine")
                df = pd.read_parquet(file_path, engine='fastparquet')
                logger.info(f"fastparquet succeeded, read {len(df)} rows")
            except Exception as e2:
                logger.error(f"fastparquet also failed: {str(e2)}")

    logger.info(f"Diagnostics complete: {success_count} files OK, {issue_count} files with issues")
    logger.info(f"Total rows across all files: {total_rows}")

    return success_count > 0
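
The commit defines the function but no entry point. A minimal sketch of how it might be invoked from the command line follows; the __main__ guard, the sys.argv handling, and the "." default directory are assumptions for illustration, not part of the commit:

# Hypothetical usage, not part of the original commit: run the
# diagnostic over a directory given on the command line and exit
# non-zero if no file could be read.
if __name__ == "__main__":
    import sys
    target_dir = sys.argv[1] if len(sys.argv) > 1 else "."
    all_ok = diagnose_parquet_files(target_dir)
    sys.exit(0 if all_ok else 1)

The boolean return value maps naturally onto a process exit code, so the script could also be dropped into a CI or cron check as-is.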
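
Note that the read-back step above loads each file in full via pd.read_parquet, which can be expensive for large files. A lighter variant, sketched below under the assumption that reading only one row group is an acceptable spot check (this is a substitute technique, not what the commit does), uses pyarrow's row-group API:

# Hypothetical lighter check, not part of the original commit:
# read only the first row group instead of the whole file.
import pyarrow.parquet as pq

def read_first_row_group(file_path):
    pf = pq.ParquetFile(file_path)
    if pf.metadata.num_row_groups == 0:
        return None  # empty file: nothing to sample
    # read_row_group returns a pyarrow.Table holding just that group
    return pf.read_row_group(0).to_pandas()

The trade-off is that a spot check of one row group verifies readability but not the total row count, so the full read remains the stricter diagnostic.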