# Geologist_AI / data_collector.py
# solfedge's picture
# Upload 9 files
# 71c32d5 verified
import os
import requests
from PIL import Image
from io import BytesIO
import time
from config import IMAGE_DIR, DATASET_SOURCES
class DataCollector:
    """Download sample geological images and expose dataset metadata.

    The target directory and the dataset descriptions are taken from the
    ``IMAGE_DIR`` and ``DATASET_SOURCES`` values imported from ``config``.
    """

    # These are example URLs - in practice you'd scrape or use APIs.
    SAMPLE_URLS = (
        "https://c7.alamy.com/comp/3AJ86J0/gold-on-quartz-bradshaw-mountains-arizona-gold-on-quartz-from-the-bradshaw-mountains-arizona-is-a-classic-and-highly-sought-after-mineral-associa-3AJ86J0.jpg",
        "https://www.nuggetsbygrant.com/cdn/shop/products/243A0948.jpg?v=1670014792&width=1080",
        "https://news.rice.edu/sites/g/files/bxs2656/files/inline-images/BIF5-0524_540_1.jpeg",
        "https://c7.alamy.com/comp/2FNKTF3/copper-bearing-rock-against-a-gravel-ground-surface-2FNKTF3.jpg",
        "https://www.shutterstock.com/shutterstock/photos/2618131965/display_1500/stock-photo-close-up-of-a-rough-weathered-copper-ore-stone-with-natural-crystal-formations-2618131965.jpg",
        "https://geologyistheway.com/wp-content/uploads/2021/06/118-milky-quartz.jpg",
        "https://geologyistheway.com/wp-content/uploads/2021/06/201210-4-1024x726.jpg",
    )

    # Image modes that are written to JPEG as-is; anything else (RGBA, P,
    # LA, ...) is converted to RGB first because the files are always
    # saved with a .jpg extension.
    _JPEG_SAFE_MODES = ("L", "RGB", "CMYK")

    def __init__(self):
        self.image_dir = IMAGE_DIR
        self.sources = DATASET_SOURCES

    def collect_sample_images(self, urls=None):
        """Collect sample images from public sources.

        Each URL is downloaded, decoded with Pillow and written to
        ``self.image_dir`` as ``sample_core_<n>.jpg`` (``n`` starts at 1 and
        follows the position of the URL). A failure for one URL is printed
        and does not stop the remaining downloads.

        Args:
            urls: Optional iterable of image URLs. When ``None`` (the
                default), ``SAMPLE_URLS`` is used.
        """
        if urls is None:
            urls = self.SAMPLE_URLS

        # Saving and the final listing both need the directory to exist.
        os.makedirs(self.image_dir, exist_ok=True)

        print("Collecting sample drill core images...")
        for index, url in enumerate(urls, start=1):
            file_name = f"sample_core_{index}.jpg"
            img_path = os.path.join(self.image_dir, file_name)
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                with Image.open(BytesIO(response.content)) as img:
                    if img.mode in self._JPEG_SAFE_MODES:
                        img.save(img_path)
                    else:
                        # JPEG has no alpha/palette support; normalise first.
                        img.convert("RGB").save(img_path)
                print(f"Downloaded: {file_name}")
                time.sleep(0.5)  # Be respectful to servers
            except Exception as e:
                # Deliberate best-effort boundary: report the failure for
                # this URL and carry on with the next one.
                print(f"Failed to download {url}: {e}")

        print(f"Collected {len(os.listdir(self.image_dir))} images")

    def get_dataset_info(self):
        """Return information about available datasets."""
        return self.sources
if __name__ == "__main__":
    # Script entry point: fetch the sample images, then list every
    # configured dataset source with its description and URL.
    data_collector = DataCollector()
    data_collector.collect_sample_images()

    print("\nAvailable geological datasets:")
    for entry in data_collector.get_dataset_info():
        headline = f"- {entry['name']}: {entry['description']}"
        print(headline)
        location = f" URL: {entry['url']}\n"
        print(location)