import os
import requests
from PIL import Image
from io import BytesIO
import time
from config import IMAGE_DIR, DATASET_SOURCES


class DataCollector:
    """Download a fixed set of sample images into the configured image directory.

    ``image_dir`` and ``sources`` are taken verbatim from the ``IMAGE_DIR`` and
    ``DATASET_SOURCES`` values imported from ``config``.
    """

    # Image modes that are written to JPEG as-is; any other mode (for example
    # RGBA or palette images) is converted to RGB before saving.
    _JPEG_SAFE_MODES = ("RGB", "L", "CMYK")

    def __init__(self):
        self.image_dir = IMAGE_DIR
        self.sources = DATASET_SOURCES

    def collect_sample_images(self):
        """Collect sample images from public sources.

        Each URL is fetched with a 10 second timeout, decoded with Pillow and
        written to ``self.image_dir`` as ``sample_core_<n>.jpg``. A failure on
        one URL is reported on stdout and does not stop the remaining
        downloads. Finally the number of entries in ``self.image_dir`` is
        printed.
        """
        # These are example URLs - in practice you'd scrape or use APIs
        sample_urls = [
            "https://c7.alamy.com/comp/3AJ86J0/gold-on-quartz-bradshaw-mountains-arizona-gold-on-quartz-from-the-bradshaw-mountains-arizona-is-a-classic-and-highly-sought-after-mineral-associa-3AJ86J0.jpg",
            "https://www.nuggetsbygrant.com/cdn/shop/products/243A0948.jpg?v=1670014792&width=1080",
            "https://news.rice.edu/sites/g/files/bxs2656/files/inline-images/BIF5-0524_540_1.jpeg",
            "https://c7.alamy.com/comp/2FNKTF3/copper-bearing-rock-against-a-gravel-ground-surface-2FNKTF3.jpg",
            "https://www.shutterstock.com/shutterstock/photos/2618131965/display_1500/stock-photo-close-up-of-a-rough-weathered-copper-ore-stone-with-natural-crystal-formations-2618131965.jpg",
            "https://geologyistheway.com/wp-content/uploads/2021/06/118-milky-quartz.jpg",
            "https://geologyistheway.com/wp-content/uploads/2021/06/201210-4-1024x726.jpg",
        ]

        # Without the target directory every save fails and the final
        # os.listdir() call raises, so create it up front.
        os.makedirs(self.image_dir, exist_ok=True)

        print("Collecting sample drill core images...")
        for i, url in enumerate(sample_urls):
            filename = f"sample_core_{i+1}.jpg"
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()

                img = Image.open(BytesIO(response.content))
                # The file is always saved under a .jpg name, and JPEG cannot
                # store alpha or palette modes, so normalise those to RGB.
                if img.mode not in self._JPEG_SAFE_MODES:
                    img = img.convert("RGB")

                img_path = os.path.join(self.image_dir, filename)
                img.save(img_path)
                print(f"Downloaded: {filename}")
            except Exception as e:
                # Best-effort collection: report the failure and carry on with
                # the next URL instead of aborting the whole run.
                print(f"Failed to download {url}: {e}")
            finally:
                # Pause after every request, successful or not.
                time.sleep(0.5)  # Be respectful to servers

        print(f"Collected {len(os.listdir(self.image_dir))} images")

    def get_dataset_info(self):
        """Return information about available datasets."""
        return self.sources


if __name__ == "__main__":
    collector = DataCollector()
    collector.collect_sample_images()

    print("\nAvailable geological datasets:")
    # Each entry is indexed by the 'name', 'description' and 'url' keys.
    for source in collector.get_dataset_info():
        print(f"- {source['name']}: {source['description']}")
        print(f" URL: {source['url']}\n")