# Listing metadata: file size 2,600 bytes, revision 94f90b6.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import gzip
def load_gzip_csv(file_path):
    """Read a gzip-compressed CSV file into a pandas DataFrame.

    Args:
        file_path: Path of the gzip-compressed CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` from the decompressed text.
    """
    # Decode explicitly as UTF-8: without ``encoding`` the text layer falls
    # back to the platform's locale encoding, so the same file could parse
    # differently (or fail to decode) on another machine.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        return pd.read_csv(f)
def plot_distribution(data, property_name, output_folder):
    """Plot the distribution of one property column and save it as a PNG.

    'BBBP', 'HIV' and 'BACE' are drawn as a bar plot of value counts. Every
    other property is drawn as a histogram with a KDE overlay, annotated with
    its minimum and maximum; 'CO2', 'N2' and 'O2' use a logarithmic x-axis.

    Args:
        data: Table indexable by ``property_name`` (e.g. a pandas DataFrame).
        property_name: Column to plot; also used in the title, the x-axis
            label and the output file name.
        output_folder: Directory that receives
            ``<property_name>_distribution.png`` (``str`` or ``Path``).
            matplotlib does not create it, so it must already exist.

    Raises:
        KeyError: If ``data`` has no column named ``property_name``.
    """
    binary_properties = ('BBBP', 'HIV', 'BACE')
    log_scale_properties = ('CO2', 'N2', 'O2')
    values = data[property_name]

    fig = plt.figure(figsize=(12, 7))
    try:
        if property_name in binary_properties:
            # For binary properties, create a bar plot of the class counts.
            value_counts = values.value_counts().sort_index()
            sns.barplot(x=value_counts.index, y=value_counts.values)
            x_label = property_name
        else:
            wants_log = property_name in log_scale_properties

            # Bin (and fit the KDE) in log space when a log axis is wanted.
            # Rescaling the axis after linear binning leaves a handful of
            # enormously wide bars. Log binning needs strictly positive
            # data, so otherwise keep linear bins and only rescale the axis.
            log_bins = False
            if wants_log:
                observed = values.dropna()
                log_bins = (not observed.empty) and bool((observed > 0).all())

            sns.histplot(values, kde=True,
                         log_scale=True if log_bins else None)
            if wants_log and not log_bins:
                plt.xscale('log')

            if wants_log:
                x_label = f'{property_name} (log scale)'
            else:
                x_label = property_name

            # NOTE(review): the indentation of the original listing was lost;
            # the min/max annotation is assumed to apply to continuous
            # properties only -- confirm against the upstream file.
            min_val = values.min()
            max_val = values.max()

            # Place the labels just below the top of the y-axis, staggered
            # so they do not sit on the same baseline.
            y_top = plt.ylim()[1]
            plt.text(min_val, y_top * 0.9, f'Min: {min_val:.2e}',
                     ha='left', va='top')
            plt.text(max_val, y_top * 0.95, f'Max: {max_val:.2e}',
                     ha='right', va='top')

            # Add vertical lines for min and max
            plt.axvline(min_val, color='r', linestyle='--', alpha=0.5)
            plt.axvline(max_val, color='r', linestyle='--', alpha=0.5)

        # Set labels after the seaborn call so seaborn's automatic axis
        # labels do not overwrite them. Both plot kinds show counts on the
        # y-axis (histplot's default statistic is a count, not a density).
        plt.title(f'Distribution of {property_name}')
        plt.xlabel(x_label)
        plt.ylabel('Count')

        plt.tight_layout()
        plt.savefig(Path(output_folder) / f'{property_name}_distribution.png',
                    dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
def main():
    """Create a distribution plot for each known property archive.

    Looks in the current directory for ``<property>.csv.gz`` files, writes
    one PNG per file found into ``dist/`` (created if absent), and prints a
    progress line per property. Missing archives are reported and skipped.
    """
    source_dir = Path('.')

    plots_dir = Path('dist')
    plots_dir.mkdir(exist_ok=True)

    property_names = ['BBBP', 'HIV', 'BACE', 'CO2', 'N2', 'O2', 'FFV', 'TC']

    for prop in property_names:
        file_path = source_dir / f'{prop}.csv.gz'

        # Nothing to plot for this property; report it and move on.
        if not file_path.exists():
            print(f"File not found: {file_path}")
            continue

        print(f"Processing {prop}...")
        frame = load_gzip_csv(file_path)
        plot_distribution(frame, prop, plots_dir)
        print(f"Saved distribution plot for {prop}")
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()