|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import numpy as np |
|
from pathlib import Path |
|
import gzip |
|
|
|
def load_gzip_csv(file_path):
    """Read a gzip-compressed CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the ``.csv.gz`` file (``str`` or path-like).

    Returns:
        pandas.DataFrame: The parsed table, read with pandas' default CSV
        options.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data
            (``FileNotFoundError`` and ``gzip.BadGzipFile`` are subclasses).
    """
    # Decode explicitly as UTF-8. Without an encoding, text-mode gzip.open
    # falls back to the platform's locale encoding, so the same archive could
    # parse differently (or fail) from one machine to the next.
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        return pd.read_csv(f)
|
|
|
def plot_distribution(data, property_name, output_folder):
    """Plot the distribution of one property column and save it as a PNG.

    Classification properties (``BBBP``, ``HIV``, ``BACE``) are drawn as a bar
    chart of per-class counts. Every other property is drawn as a density
    histogram with a KDE overlay; ``CO2``, ``N2`` and ``O2`` additionally use
    a logarithmic x-axis. The smallest and largest plotted values are marked
    with dashed red vertical lines and text labels.

    Args:
        data: pandas DataFrame containing a column named ``property_name``.
        property_name: Column to plot; also used for the title, the x-axis
            label and the output file name.
        output_folder: Existing directory (``pathlib.Path`` or ``str``). The
            figure is written to
            ``<output_folder>/<property_name>_distribution.png``.

    Raises:
        KeyError: If ``data`` has no ``property_name`` column.
    """
    is_categorical = property_name in ('BBBP', 'HIV', 'BACE')
    use_log_axis = property_name in ('CO2', 'N2', 'O2')

    # value_counts, histplot and min/max all ignore missing values, so drop
    # them once up front.
    values = data[property_name].dropna()
    if use_log_axis:
        # Non-positive values have no position on a logarithmic axis; keeping
        # them would make the log-space binning below fail.
        values = values[values > 0]

    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        if is_categorical:
            value_counts = values.value_counts().sort_index()
            sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
            ax.set_ylabel('Count')
            # seaborn places bars at categorical positions 0..k-1 (in the
            # sorted order given here), so the extreme classes are the first
            # and last positions rather than the raw label values.
            marker_lo, marker_hi = 0, len(value_counts) - 1
        else:
            # log_scale makes seaborn compute bins and the KDE in log space;
            # switching the axis scale after plotting would keep linear bins
            # and distort the picture. stat='density' makes the bars match
            # the 'Density' axis label (histplot draws counts by default).
            sns.histplot(
                values,
                kde=True,
                stat='density',
                log_scale=True if use_log_axis else None,
                ax=ax,
            )
            ax.set_ylabel('Density')
            marker_lo, marker_hi = values.min(), values.max()

        ax.set_title(f'Distribution of {property_name}')
        if use_log_axis:
            ax.set_xlabel(f'{property_name} (log scale)')
        else:
            ax.set_xlabel(property_name)

        # With no data there is no min/max to annotate (and NaN coordinates
        # would upset the layout pass).
        if not values.empty:
            min_val = values.min()
            max_val = values.max()

            # Place the labels just below the top of the axes; the two
            # heights differ so the labels cannot sit on the same line.
            y_top = ax.get_ylim()[1]
            ax.text(marker_lo, y_top * 0.9, f'Min: {min_val:.2e}',
                    ha='left', va='top')
            ax.text(marker_hi, y_top * 0.95, f'Max: {max_val:.2e}',
                    ha='right', va='top')

            ax.axvline(marker_lo, color='r', linestyle='--', alpha=0.5)
            ax.axvline(marker_hi, color='r', linestyle='--', alpha=0.5)

        fig.tight_layout()
        fig.savefig(
            Path(output_folder) / f'{property_name}_distribution.png',
            dpi=300,
        )
    finally:
        # Always release the figure, even if plotting or saving failed, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)
|
|
|
def main():
    """Create a distribution plot for each known property dataset.

    For every property name in the fixed list, looks in the current working
    directory for ``<property>.csv.gz``. Files that exist are loaded and
    plotted into the ``dist`` directory (created if it is absent); files that
    do not exist are reported and skipped. Progress is printed to stdout.
    """
    source_dir = Path('.')

    target_dir = Path('dist')
    target_dir.mkdir(exist_ok=True)

    for name in ('BBBP', 'HIV', 'BACE', 'CO2', 'N2', 'O2', 'FFV', 'TC'):
        archive = source_dir / f'{name}.csv.gz'

        # Guard clause: report the missing dataset and move on to the next.
        if not archive.exists():
            print(f"File not found: {archive}")
            continue

        print(f"Processing {name}...")
        frame = load_gzip_csv(archive)
        plot_distribution(frame, name, target_dir)
        print(f"Saved distribution plot for {name}")
|
|
|
# Run the plotting pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()