"""Plot the distribution of one property column per gzip-compressed CSV file.

For every name in ``PROPERTIES`` the script looks for ``<name>.csv.gz`` in the
current directory and, when present, writes ``dist/<name>_distribution.png``.
"""

import gzip
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Properties drawn as a bar chart of value counts.
BINARY_PROPERTIES = ('BBBP', 'HIV', 'BACE')
# Properties whose histogram is drawn on a logarithmic x-axis.
LOG_SCALE_PROPERTIES = ('CO2', 'N2', 'O2')
# Every dataset the script tries to process, in processing order.
PROPERTIES = (*BINARY_PROPERTIES, *LOG_SCALE_PROPERTIES, 'FFV', 'TC')


def load_gzip_csv(file_path):
    """Read a gzip-compressed CSV file into a ``pandas.DataFrame``.

    Args:
        file_path: Path of the ``.csv.gz`` file to read.

    Returns:
        The parsed DataFrame.
    """
    with gzip.open(file_path, 'rt') as f:
        return pd.read_csv(f)


def _plot_binary(values, property_name):
    """Draw a bar chart of value counts on the current figure."""
    value_counts = values.value_counts().sort_index()
    sns.barplot(x=value_counts.index, y=value_counts.values)
    plt.title(f'Distribution of {property_name}')
    plt.xlabel(property_name)
    plt.ylabel('Count')


def _plot_continuous(values, property_name):
    """Draw a histogram with KDE and min/max markers on the current figure."""
    use_log = property_name in LOG_SCALE_PROPERTIES
    min_val = values.min()
    max_val = values.max()

    hist_kwargs = {'kde': True}
    # Binning in log space is only possible for strictly positive data;
    # otherwise fall back to switching the axis scale after plotting.
    bin_in_log_space = bool(use_log and min_val > 0)
    if bin_in_log_space:
        # Pass a real ``True`` so seaborn picks its default base-10 log axis.
        hist_kwargs['log_scale'] = True

    sns.histplot(values, **hist_kwargs)
    plt.title(f'Distribution of {property_name}')
    plt.xlabel(property_name)
    # histplot's default statistic is the per-bin count, not a density.
    plt.ylabel('Count')

    if use_log:
        if not bin_in_log_space:
            plt.xscale('log')
        plt.xlabel(f'{property_name} (log scale)')

    # NOTE(review): the min/max annotations are applied to every continuous
    # property here; confirm this matches the intended scope.
    # Labels sit just below the top of the axes; adjust the factors if they
    # collide with the bars.
    y_top = plt.ylim()[1]
    plt.text(min_val, y_top * 0.9, f'Min: {min_val:.2e}', ha='left', va='top')
    plt.text(max_val, y_top * 0.95, f'Max: {max_val:.2e}', ha='right', va='top')

    # Vertical guide lines at the extremes.
    plt.axvline(min_val, color='r', linestyle='--', alpha=0.5)
    plt.axvline(max_val, color='r', linestyle='--', alpha=0.5)


def plot_distribution(data, property_name, output_folder):
    """Plot the distribution of one column and save it as a PNG.

    Properties in ``BINARY_PROPERTIES`` get a bar chart of value counts; all
    others get a histogram with a KDE curve, and those in
    ``LOG_SCALE_PROPERTIES`` additionally use a logarithmic x-axis.

    Args:
        data: DataFrame containing a column named ``property_name``.
        property_name: Column to plot; also used in the title and file name.
        output_folder: Directory that receives
            ``<property_name>_distribution.png``.

    Raises:
        KeyError: If ``property_name`` is not a column of ``data``.
    """
    # Look the column up before creating the figure so a missing column does
    # not leave an orphaned figure behind.
    values = data[property_name]

    fig = plt.figure(figsize=(12, 7))
    try:
        if property_name in BINARY_PROPERTIES:
            _plot_binary(values, property_name)
        else:
            _plot_continuous(values, property_name)

        plt.tight_layout()
        plt.savefig(
            Path(output_folder) / f'{property_name}_distribution.png',
            dpi=300,
        )
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)


def main():
    """Create a distribution plot for every available property dataset."""
    # Input folder (current directory)
    input_folder = Path('.')

    # Output folder
    output_folder = Path('dist')
    output_folder.mkdir(parents=True, exist_ok=True)

    for prop in PROPERTIES:
        file_path = input_folder / f'{prop}.csv.gz'
        if not file_path.exists():
            print(f"File not found: {file_path}")
            continue

        print(f"Processing {prop}...")
        data = load_gzip_csv(file_path)
        if prop not in data.columns:
            # Skip rather than abort the remaining datasets.
            print(f"Column '{prop}' not found in {file_path}")
            continue

        plot_distribution(data, prop, output_folder)
        print(f"Saved distribution plot for {prop}")


if __name__ == "__main__":
    main()