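# NOTE(review): the next three lines appear to be web-page residue from the
# hosting site's file viewer; kept as comments so the module parses as Python.
# liuganghuggingface's picture
# Upload evaluatorQA/2_visualize_dist.py with huggingface_hub
# 94f90b6 verified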
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import gzip
def load_gzip_csv(file_path):
    """Read a gzip-compressed CSV file into a DataFrame.

    Decompression is delegated to pandas so the bytes are decoded with
    ``read_csv``'s default UTF-8 encoding instead of the platform locale
    encoding that a bare ``gzip.open(path, 'rt')`` would pick up.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) of the ``.csv.gz`` file.

    Returns:
        pandas.DataFrame with the parsed contents.
    """
    # compression is stated explicitly so the file is treated as gzip even
    # when its name does not end in ".gz".
    return pd.read_csv(file_path, compression='gzip')
def plot_distribution(data, property_name, output_folder, *,
                      binary_properties=('BBBP', 'HIV', 'BACE'),
                      log_scale_properties=('CO2', 'N2', 'O2')):
    """Plot the distribution of one property column and save it as a PNG.

    Binary properties are drawn as a bar plot of value counts. Every other
    property is drawn as a histogram with a KDE overlay, annotated with
    dashed vertical lines and text labels at the minimum and maximum values.
    The figure is written to
    ``<output_folder>/<property_name>_distribution.png`` at 300 dpi.

    Args:
        data: DataFrame containing a column named ``property_name``.
        property_name: Name of the column to plot; also used in the title,
            axis label and output file name.
        output_folder: Directory (``str`` or ``Path``) that receives the PNG.
            It must already exist.
        binary_properties: Property names plotted as count bars.
        log_scale_properties: Property names plotted on a logarithmic x axis.

    Raises:
        KeyError: If ``property_name`` is not a column of ``data``.
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    try:
        if property_name in binary_properties:
            # For binary properties, create a bar plot of the class counts.
            value_counts = data[property_name].value_counts().sort_index()
            sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
            ax.set_xlabel(property_name)
            ax.set_ylabel('Count')
        else:
            use_log = property_name in log_scale_properties
            values = data[property_name].dropna()
            if use_log:
                # Zero and negative values cannot be placed on a log axis.
                values = values[values > 0]

            if values.empty:
                # Nothing to draw; still emit a correctly scaled empty plot.
                if use_log:
                    ax.set_xscale('log')
            else:
                # log_scale=True makes seaborn compute the bins and the KDE
                # in log space. Switching the axis to log after plotting
                # would keep linear-width bins and distort the histogram.
                hist_kwargs = {'log_scale': True} if use_log else {}
                sns.histplot(x=values, kde=True, ax=ax, **hist_kwargs)

                # Mark the extremes of the plotted data.
                min_val = values.min()
                max_val = values.max()
                # Place the labels just below the top of the y axis.
                y_top = ax.get_ylim()[1]
                ax.text(min_val, y_top * 0.9, f'Min: {min_val:.2e}',
                        ha='left', va='top')
                ax.text(max_val, y_top * 0.95, f'Max: {max_val:.2e}',
                        ha='right', va='top')
                ax.axvline(min_val, color='r', linestyle='--', alpha=0.5)
                ax.axvline(max_val, color='r', linestyle='--', alpha=0.5)

            ax.set_xlabel(f'{property_name} (log scale)' if use_log
                          else property_name)
            # histplot's default statistic is a per-bin count, not a density.
            ax.set_ylabel('Count')

        ax.set_title(f'Distribution of {property_name}')
        fig.tight_layout()
        fig.savefig(Path(output_folder) / f'{property_name}_distribution.png',
                    dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
def main():
    """Render a distribution plot for every known property dataset.

    Looks for ``<property>.csv.gz`` in the current directory, plots each file
    that exists into the ``dist`` folder (created if missing), and reports
    any file that is absent.
    """
    source_dir = Path('.')   # datasets are read from the current directory
    plots_dir = Path('dist')  # plots are written here
    plots_dir.mkdir(exist_ok=True)

    property_names = ('BBBP', 'HIV', 'BACE', 'CO2', 'N2', 'O2', 'FFV', 'TC')
    for name in property_names:
        archive = source_dir / f'{name}.csv.gz'
        if not archive.exists():
            print(f"File not found: {archive}")
            continue

        print(f"Processing {name}...")
        frame = load_gzip_csv(archive)
        plot_distribution(frame, name, plots_dir)
        print(f"Saved distribution plot for {name}")
# Run the plotting pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()