Spaces:
Sleeping
Sleeping
import os | |
import requests | |
from PIL import Image | |
from io import BytesIO | |
import time | |
from config import IMAGE_DIR, DATASET_SOURCES | |
class DataCollector:
    """Download sample images into ``IMAGE_DIR`` and expose ``DATASET_SOURCES``.

    Attributes are plain copies of the two values imported from ``config``:
    ``image_dir`` is the directory images are written to and ``sources`` is
    whatever ``DATASET_SOURCES`` holds.
    """

    # These are example URLs - in practice you'd scrape or use APIs.
    SAMPLE_URLS = (
        "https://c7.alamy.com/comp/3AJ86J0/gold-on-quartz-bradshaw-mountains-arizona-gold-on-quartz-from-the-bradshaw-mountains-arizona-is-a-classic-and-highly-sought-after-mineral-associa-3AJ86J0.jpg",
        "https://www.nuggetsbygrant.com/cdn/shop/products/243A0948.jpg?v=1670014792&width=1080",
        "https://news.rice.edu/sites/g/files/bxs2656/files/inline-images/BIF5-0524_540_1.jpeg",
        "https://c7.alamy.com/comp/2FNKTF3/copper-bearing-rock-against-a-gravel-ground-surface-2FNKTF3.jpg",
        "https://www.shutterstock.com/shutterstock/photos/2618131965/display_1500/stock-photo-close-up-of-a-rough-weathered-copper-ore-stone-with-natural-crystal-formations-2618131965.jpg",
        "https://geologyistheway.com/wp-content/uploads/2021/06/118-milky-quartz.jpg",
        "https://geologyistheway.com/wp-content/uploads/2021/06/201210-4-1024x726.jpg",
    )

    # Modes saved to JPEG as-is; any other mode (e.g. RGBA, P) is converted
    # to RGB first because the JPEG writer rejects it.
    _JPEG_MODES = ("L", "RGB", "CMYK")

    def __init__(self):
        self.image_dir = IMAGE_DIR
        self.sources = DATASET_SOURCES

    def collect_sample_images(self, urls=None):
        """Collect sample images from public sources.

        Each URL is fetched with a 10 second timeout and saved into
        ``self.image_dir`` as ``sample_core_<n>.jpg``, where ``n`` is the
        1-based position of the URL. A failure on one URL is printed and
        the loop moves on to the next.

        Args:
            urls: Optional iterable of image URLs. Defaults to
                ``SAMPLE_URLS`` when omitted.
        """
        if urls is None:
            urls = self.SAMPLE_URLS

        # Without this, every save fails and the final listdir raises
        # when the target directory has not been created yet.
        os.makedirs(self.image_dir, exist_ok=True)

        print("Collecting sample drill core images...")
        for index, url in enumerate(urls, start=1):
            filename = f"sample_core_{index}.jpg"
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                with Image.open(BytesIO(response.content)) as img:
                    if img.mode in self._JPEG_MODES:
                        writable = img
                    else:
                        writable = img.convert("RGB")
                    writable.save(os.path.join(self.image_dir, filename))
                print(f"Downloaded: {filename}")
                time.sleep(0.5)  # Be respectful to servers
            except Exception as e:
                # Deliberate best-effort boundary: report and keep going.
                print(f"Failed to download {url}: {e}")

        # Counts every entry in the directory, not only this run's downloads.
        print(f"Collected {len(os.listdir(self.image_dir))} images")

    def get_dataset_info(self):
        """Return information about available datasets."""
        return self.sources
if __name__ == "__main__":
    # Script entry point: fetch the sample images, then list the datasets
    # that the collector knows about.
    data_collector = DataCollector()
    data_collector.collect_sample_images()

    print("\nAvailable geological datasets:")
    for entry in data_collector.get_dataset_info():
        heading = f"- {entry['name']}: {entry['description']}"
        print(heading)
        location = f" URL: {entry['url']}\n"
        print(location)