Spaces:

dropbop
/

TerraNomaly

Sleeping

App Files Files Community

TerraNomaly / earthview.py

dropbop

Create earthview.py

d82e8e5 verified 11 months ago

raw

history blame

5.68 kB

	from datasets import load_dataset as _load_dataset
	from os import environ
	from PIL import Image
	import numpy as np
	import json

	from pyarrow.parquet import ParquetFile
	from pyarrow import Table as pa_Table
	from datasets import Dataset

	# Hub repository id of the EarthView dataset.
	DATASET = "satellogic/EarthView"

	# Registry of the known subsets, keyed by subset name.
	#   "shards": value returned by get_nshards().
	#   "config": optional override returned by get_config() (defaults to the subset name).
	#   "path":   optional override returned by get_path() (defaults to the subset name).
	sets = dict(
	    satellogic=dict(shards=7863),
	    sentinel_1=dict(shards=1763),
	    neon=dict(config="default", shards=607, path="data"),
	    sentinel_2=dict(shards=19997),
	)

	def get_subsets():
	    """Return the names of the registered subsets (a view over the keys of ``sets``)."""
	    subset_names = sets.keys()
	    return subset_names

	def get_nshards(subset):
	    """Return the ``"shards"`` value registered for *subset* in ``sets``.

	    Raises ``KeyError`` when *subset* is not a key of ``sets``.
	    """
	    entry = sets[subset]
	    return entry["shards"]

	def get_path(subset):
	    """Return the ``"path"`` registered for *subset*, or the subset name if none is set.

	    Raises ``KeyError`` when *subset* is not a key of ``sets``.
	    """
	    entry = sets[subset]
	    if "path" in entry:
	        return entry["path"]
	    return subset

	def get_config(subset):
	    """Return the ``"config"`` registered for *subset*, or the subset name if none is set.

	    Raises ``KeyError`` when *subset* is not a key of ``sets``.
	    """
	    entry = sets[subset]
	    if "config" in entry:
	        return entry["config"]
	    return subset

	def load_dataset(subset, dataset="satellogic/EarthView", split="train", shards = None, streaming=True, **kwargs):
	    """Load one EarthView subset through ``datasets.load_dataset``.

	    subset: key of ``sets`` selecting the subset; its config name, shard
	        count and data path are looked up with the module helpers.
	    dataset: repository id forwarded as ``path``.
	    split: split name; also used as the prefix of the shard file names.
	    shards: iterable of integer shard indices to restrict loading to, or
	        ``None`` to pass no explicit ``data_files``.
	    streaming: forwarded unchanged to the loader.
	    kwargs: any extra keyword arguments are forwarded to the loader.
	    returns whatever ``datasets.load_dataset`` returns.

	    The access token is read from the ``HF_TOKEN`` environment variable
	    (``None`` when it is unset).
	    """
	    config = get_config(subset)
	    nshards = get_nshards(subset)
	    path = get_path(subset)

	    data_files = None
	    if shards is not None:
	        if subset == "sentinel_2":
	            # sentinel_2 files live in per-group folders holding 10 shards each.
	            shard_files = [
	                f"{path}/sentinel_2-{shard//10}/{split}-{shard % 10:05d}-of-00010.parquet"
	                for shard in shards
	            ]
	        else:
	            shard_files = [
	                f"{path}/{split}-{shard:05d}-of-{nshards:05d}.parquet"
	                for shard in shards
	            ]
	        data_files = {split: shard_files}

	    return _load_dataset(
	        path=dataset,
	        name=config,
	        save_infos=True,
	        split=split,
	        data_files=data_files,
	        streaming=streaming,
	        token=environ.get("HF_TOKEN", None),
	        **kwargs,
	    )

	def load_parquet(subset_or_filename, batch_size=100):
	    """Read a parquet file into a ``datasets.Dataset``.

	    subset_or_filename: either a registered subset name, in which case
	        ``dataset/<subset>/sample.parquet`` is read, or the path of the
	        parquet file to read.
	    batch_size: number of rows per record batch while reading.
	    returns a ``Dataset`` built from all record batches of the file.
	    """
	    is_subset = subset_or_filename in get_subsets()
	    filename = (
	        f"dataset/{subset_or_filename}/sample.parquet"
	        if is_subset
	        else subset_or_filename
	    )

	    pqfile = ParquetFile(filename)
	    record_batches = pqfile.iter_batches(batch_size=batch_size)
	    table = pa_Table.from_batches(record_batches)
	    return Dataset(table)

	def item_to_images(subset, item):
	    """
	    Convert the image arrays of a dataset item into ``PIL.Image`` objects.

	    subset: name of the subset the item came from, one of "satellogic",
	        "sentinel_1", "sentinel_2" or "neon".
	    item: the item as retrieved from the subset. Its "metadata" entry may
	        be a JSON string or an already decoded mapping; every other entry
	        is converted to a uint8 array before being turned into images.
	    returns a new dict in which the image arrays are replaced by lists of
	        PIL.Image, "metadata" holds the decoded metadata, and
	        metadata["count"] holds the number of images per field.
	    raises ValueError if subset is not one of the supported names.
	    """
	    metadata = item["metadata"]
	    if isinstance(metadata, str):
	        metadata = json.loads(metadata)
	    else:
	        # Shallow copy: the key assignments below must not leak into the
	        # caller's metadata (a second call would e.g. swap bounds back).
	        metadata = dict(metadata)

	    item = {
	        k: np.asarray(v).astype("uint8")
	        for k, v in item.items()
	        if k != "metadata"
	    }
	    item["metadata"] = metadata

	    if subset == "satellogic":
	        # Move the first axis last so PIL receives (rows, cols, channels).
	        item["rgb"] = [
	            Image.fromarray(rgb.transpose(1, 2, 0))
	            for rgb in item["rgb"]
	        ]
	        # Only the first plane of each "1m" array is rendered.
	        item["1m"] = [
	            Image.fromarray(image[0, :, :])
	            for image in item["1m"]
	        ]
	        count = len(item["1m"])
	    elif subset == "sentinel_1":
	        # Mapping of V and H to RGB. May not be correct
	        # https://gis.stackexchange.com/questions/400726/creating-composite-rgb-images-from-sentinel-1-channels
	        i10m = item["10m"]
	        # Third channel is the ratio of the first two; the 0.01 offset
	        # avoids division by zero.
	        # NOTE(review): ratio * 256 can exceed 255 before the uint8 cast —
	        # confirm whether clipping is wanted.
	        ratio = np.expand_dims(
	            i10m[:, 0, :, :] / (i10m[:, 1, :, :] + 0.01) * 256,
	            1
	        ).astype("uint8")
	        i10m = np.concatenate((i10m, ratio), 1)
	        item["10m"] = [
	            Image.fromarray(image.transpose(1, 2, 0))
	            for image in i10m
	        ]
	        count = len(item["10m"])
	    elif subset == "sentinel_2":
	        for channel in ['10m', '20m', 'rgb', 'scl']:  # , '40m']:
	            # Already a uint8 array (see conversion above); move the
	            # channel axis last.
	            data = item[channel].transpose(0, 2, 3, 1)
	            count = len(data)
	            if channel == "20m":
	                # Keep three of the planes so the result fits an RGB image.
	                data = data[:, :, :, [0, 2, 4]]
	            mode = "L" if channel in ["10m", "scl"] else "RGB"
	            item[channel] = [
	                Image.fromarray(frame.squeeze(), mode=mode)
	                for frame in data
	            ]
	        # These metadata fields hold one JSON document per image.
	        for field in ["solarAngles", "tileGeometry", "viewIncidenceAngles"]:
	            metadata[field] = [json.loads(s) for s in metadata[field]]
	    elif subset == "neon":
	        item["rgb"] = [
	            Image.fromarray(image.transpose(1, 2, 0))
	            for image in item["rgb"]
	        ]
	        item["chm"] = [
	            Image.fromarray(image[0])
	            for image in item["chm"]
	        ]

	        # The next is a very arbitrary conversion from the 369 hyperspectral data to RGB
	        # It just averages each 1/3 of the bands and assigns it to a channel
	        item["1m"] = [
	            Image.fromarray(
	                np.stack(
	                    (
	                        np.average(image[:124], 0),
	                        np.average(image[124:247], 0),
	                        np.average(image[247:], 0),
	                    ),
	                    axis=2,
	                ).astype("uint8")
	            )
	            for image in item["1m"]
	        ]
	        count = len(item["rgb"])

	        # NOTE(review): the bounds swap and CRS override are applied to
	        # neon items only — confirm against the upstream file.
	        bounds = metadata["bounds"]

	        # swap pairs: (a, b, c, d, ...) -> (b, a, d, c, ...)
	        metadata["bounds"] = [
	            bounds[start + 1 - offset]
	            for start in range(0, len(bounds), 2)
	            for offset in range(2)
	        ]

	        # fix CRS
	        metadata["epsg"] = "EPSG:4326"
	    else:
	        raise ValueError(
	            f"Unknown subset {subset!r}; expected one of "
	            "'satellogic', 'sentinel_1', 'sentinel_2', 'neon'"
	        )

	    metadata["count"] = count
	    return item