File size: 2,600 Bytes

94f90b6

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import gzip

def load_gzip_csv(file_path):
    """Read a gzip-compressed CSV file into a pandas DataFrame.

    Args:
        file_path: Path (``str`` or path-like) of the ``.csv.gz`` file.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        UnicodeDecodeError: If the decompressed bytes are not valid UTF-8.
    """
    # Decode explicitly as UTF-8 (the pandas default). Without an explicit
    # encoding, text-mode gzip.open falls back to the locale's preferred
    # encoding, so the same file could parse differently between machines.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        return pd.read_csv(f)

# Properties whose column holds class labels and is drawn as a bar plot.
_BINARY_PROPERTIES = frozenset({'BBBP', 'HIV', 'BACE'})
# Properties drawn on a logarithmic x axis.
_LOG_SCALE_PROPERTIES = frozenset({'CO2', 'N2', 'O2'})


def _plot_binary(values, property_name):
    """Draw a bar plot of the per-class counts of *values* on the current figure."""
    value_counts = values.value_counts().sort_index()
    if not value_counts.empty:
        sns.barplot(x=value_counts.index, y=value_counts.values)
    plt.title(f'Distribution of {property_name}')
    plt.xlabel(property_name)
    plt.ylabel('Count')


def _plot_continuous(values, property_name):
    """Draw a histogram with KDE of *values*, annotated with its min and max."""
    use_log = property_name in _LOG_SCALE_PROPERTIES
    if use_log:
        # A logarithmic axis cannot represent zero or negative values.
        values = values[values > 0]

    # Passing log_scale to seaborn makes it compute the bins and the KDE in
    # log space. Switching the axis scale after plotting would instead
    # stretch bins that were computed on a linear scale.
    sns.histplot(values, kde=True, log_scale=True if use_log else None)
    plt.title(f'Distribution of {property_name}')
    plt.xlabel(f'{property_name} (log scale)' if use_log else property_name)
    # histplot's default statistic is the per-bin count, not a density.
    plt.ylabel('Count')

    if values.empty:
        # Nothing to annotate; min/max would be NaN.
        return

    min_val = values.min()
    max_val = values.max()

    # Place the labels just below the top of the axes, slightly staggered.
    y_top = plt.ylim()[1]
    plt.text(min_val, y_top * 0.9, f'Min: {min_val:.2e}', ha='left', va='top')
    plt.text(max_val, y_top * 0.95, f'Max: {max_val:.2e}', ha='right', va='top')

    # Mark the extremes with dashed vertical lines.
    plt.axvline(min_val, color='r', linestyle='--', alpha=0.5)
    plt.axvline(max_val, color='r', linestyle='--', alpha=0.5)


def plot_distribution(data, property_name, output_folder):
    """Plot the distribution of one property column and save it as a PNG.

    Binary properties (BBBP, HIV, BACE) are drawn as a bar plot of class
    counts. Every other property is drawn as a histogram with a KDE overlay
    and dashed lines/labels marking the minimum and maximum; the gas
    properties (CO2, N2, O2) use a logarithmic x axis, for which
    non-positive values are left out.

    Args:
        data: DataFrame containing a column named ``property_name``.
        property_name: Column to plot; also used in the title, the axis
            label and the output file name.
        output_folder: ``pathlib.Path`` of the directory that receives
            ``<property_name>_distribution.png``.

    Raises:
        KeyError: If ``data`` has no column named ``property_name``.
    """
    # Look the column up before creating a figure so a missing column does
    # not leave an empty figure behind. Missing values are ignored by the
    # count/min/max computations anyway, so drop them once here.
    values = data[property_name].dropna()

    plt.figure(figsize=(12, 7))
    try:
        if property_name in _BINARY_PROPERTIES:
            _plot_binary(values, property_name)
        else:
            _plot_continuous(values, property_name)

        plt.tight_layout()
        plt.savefig(output_folder / f'{property_name}_distribution.png', dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close()

def main():
    """Generate a distribution plot for every known property file.

    Looks in the current directory for ``<property>.csv.gz`` files, creates
    the ``dist`` output directory if needed, and writes one plot per file
    that exists. Missing files are reported and skipped.
    """
    # Inputs are read from the current directory; plots go to ./dist.
    source_dir = Path('.')
    output_folder = Path('dist')
    output_folder.mkdir(exist_ok=True)

    for prop in ('BBBP', 'HIV', 'BACE', 'CO2', 'N2', 'O2', 'FFV', 'TC'):
        file_path = source_dir / f'{prop}.csv.gz'

        if not file_path.exists():
            print(f"File not found: {file_path}")
            continue

        print(f"Processing {prop}...")
        plot_distribution(load_gzip_csv(file_path), prop, output_folder)
        print(f"Saved distribution plot for {prop}")

# Run the plotting pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()