Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MMEarth dataset #2202

Merged
merged 33 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
5f3e257
more tests
nilsleh Jul 31, 2024
b1d472b
more comments
nilsleh Jul 31, 2024
c9f1408
files for dm
nilsleh Jul 31, 2024
b7ce2fb
lazy import
nilsleh Jul 31, 2024
64cac20
metadata in raw format
nilsleh Aug 1, 2024
d5eb7fd
mypy
nilsleh Aug 2, 2024
376d4ab
mypy another one
nilsleh Aug 2, 2024
d570081
Merge branch 'main' into mmearth
nilsleh Aug 2, 2024
62fe6e1
Merge branch 'main' into mmearth
nilsleh Aug 20, 2024
b452a4d
ruff
nilsleh Aug 20, 2024
eecc5c7
merge main
nilsleh Aug 20, 2024
50229f3
class var
nilsleh Aug 20, 2024
df1251b
merge main
nilsleh Oct 2, 2024
10d2a99
requested changes
nilsleh Oct 2, 2024
d32f624
ds_version -> subset
nilsleh Oct 2, 2024
c95c0e8
separate Sentinel 1 ascending and descending
nilsleh Oct 2, 2024
f84af05
remove mmearth from datamodule docs
nilsleh Oct 2, 2024
f5b3fa2
separate reading item for subclasses
nilsleh Oct 4, 2024
f360b9c
sentinel1 only return available data
nilsleh Oct 4, 2024
0f47444
remove split from dataset
nilsleh Oct 4, 2024
e37027f
fix tests
nilsleh Oct 4, 2024
46d6753
requests
nilsleh Oct 8, 2024
c810be5
requests
nilsleh Oct 8, 2024
9ea0960
resolution
nilsleh Oct 8, 2024
be756ad
more band logic
nilsleh Oct 10, 2024
420f143
review
nilsleh Oct 10, 2024
fe01e88
typo
nilsleh Oct 10, 2024
adf3413
Update torchgeo/datasets/mmearth.py
nilsleh Oct 10, 2024
55e5521
Merge branch 'main' into mmearth
nilsleh Oct 15, 2024
e3c0937
attribute documentation
nilsleh Oct 15, 2024
ad21bb3
merge main
nilsleh Oct 15, 2024
d52e110
another try
nilsleh Oct 15, 2024
a8c9077
remove reference
nilsleh Oct 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,11 @@ Million-AID

.. autoclass:: MillionAID

MMEarth
^^^^^^^

.. autoclass:: MMEarth

NASA Marine Debris
^^^^^^^^^^^^^^^^^^

Expand Down
1 change: 1 addition & 0 deletions docs/api/datasets/non_geo_datasets.csv
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Dataset,Task,Source,License,# Samples,# Classes,Size (px),Resolution (m),Bands
`LoveDA`_,S,Google Earth,"CC-BY-NC-SA-4.0","5,987",7,"1,024x1,024",0.3,RGB
`MapInWild`_,S,"Sentinel-1/2, ESA WorldCover, NOAA VIIRS DNB","CC-BY-4.0",1018,1,1920x1920,10--463.83,"SAR, MSI, 2020_Map, avg_rad"
`Million-AID`_,C,Google Earth,-,1M,51--73,,0.5--153,RGB
`MMEarth`_,"C, S","Aster, Sentinel, ERA5","CC-BY-4.0","100K--1M",,"128x128 or 64x64",10,MSI
`NASA Marine Debris`_,OD,PlanetScope,"Apache-2.0",707,1,256x256,3,RGB
`OSCD`_,CD,Sentinel-2,"CC-BY-4.0",24,2,"40--1,180",60,MSI
`PASTIS`_,I,Sentinel-1/2,"CC-BY-4.0","2,433",19,128x128xT,10,MSI
Expand Down
221 changes: 221 additions & 0 deletions tests/data/mmearth/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import json
import os
import shutil

import h5py
import numpy as np

# Template of per-tile metadata; copied (and partially overwritten with the
# tile's actual S2_type) for each tile when building the tile-info JSON file.
meta_dummy_dict = {
    'S2_DATE': '2018-07-16',
    'S2_type': 'l1c',
    'CRS': 'EPSG:32721',
    'lat': -14.499441524746077,
    'lon': -56.98355999998649,
}

# Number of dummy tiles written to each generated HDF5 test file.
num_tiles = 10

# String ids ('0'..'9') matching the 'meta_id' field stored per tile.
meta_id_strings = [str(i) for i in range(num_tiles)]

# Modalities stored as HDF5 datasets: band count and on-disk dtype for each.
# 'era5', 'eco_region', and 'biome' hold per-tile vectors; all others hold
# per-tile images of shape (bands, H, W).
modalities = {
    'aster': {'bands': 2, 'dtype': np.int16},
    'biome': {'bands': 14, 'dtype': np.uint8},
    'canopy_height_eth': {'bands': 2, 'dtype': np.int8},
    'dynamic_world': {'bands': 1, 'dtype': np.uint8},
    'eco_region': {'bands': 846, 'dtype': np.uint16},
    'era5': {'bands': 12, 'dtype': np.float32},
    'esa_worldcover': {'bands': 1, 'dtype': np.uint8},
    'sentinel1': {'bands': 8, 'dtype': np.float32},
    'sentinel2': {'bands': 13, 'dtype': np.uint16},
    'sentinel2_cloudmask': {'bands': 1, 'dtype': np.uint16},
    'sentinel2_cloudprod': {'bands': 1, 'dtype': np.uint16},
    'sentinel2_scl': {'bands': 1, 'dtype': np.uint16},
}

# Band names per modality, stored verbatim under the 'BANDS' key of every
# tile's entry in the tile-info JSON. Note: keys here do not map 1:1 onto the
# HDF5 datasets above (e.g. sentinel1_asc/sentinel1_desc, lat, lon, month).
all_modality_bands = {
    'sentinel2': [
        'B1',
        'B2',
        'B3',
        'B4',
        'B5',
        'B6',
        'B7',
        'B8A',
        'B8',
        'B9',
        'B10',
        'B11',
        'B12',
    ],
    'sentinel2_cloudmask': ['QA60'],
    'sentinel2_cloudprod': ['MSK_CLDPRB'],
    'sentinel2_scl': ['SCL'],
    'sentinel1_asc': ['VV', 'VH', 'HH', 'HV'],
    'sentinel1_desc': ['VV', 'VH', 'HH', 'HV'],
    'aster': ['elevation', 'slope'],
    'era5': [
        'prev_month_avg_temp',
        'prev_month_min_temp',
        'prev_month_max_temp',
        'prev_month_total_precip',
        'curr_month_avg_temp',
        'curr_month_min_temp',
        'curr_month_max_temp',
        'curr_month_total_precip',
        'year_avg_temp',
        'year_min_temp',
        'year_max_temp',
        'year_total_precip',
    ],
    'dynamic_world': ['landcover'],
    'canopy_height_eth': ['height', 'std'],
    'lat': ['sin', 'cos'],
    'lon': ['sin', 'cos'],
    'biome': ['biome'],
    'eco_region': ['eco_region'],
    'month': ['sin_month', 'cos_month'],
    'esa_worldcover': ['map'],
}


def create_hd5f(
    dataset_name: str, px_dim: tuple[int, int]
) -> dict[str, dict[str, object]]:
    """Create a dummy MMEarth HDF5 file and return its per-tile metadata.

    Args:
        dataset_name: output path of the HDF5 file, without the ``.h5`` suffix.
        px_dim: spatial size (height, width) of the image modalities.

    Returns:
        Mapping from tile id (as a string) to that tile's metadata dict,
        ready to be dumped to the ``*_tile_info.json`` file.
    """
    # Create the HDF5 file
    with h5py.File(f'{dataset_name}.h5', 'w') as h5file:
        # Create datasets for each modality: vector modalities are
        # (num_tiles, bands); image modalities are (num_tiles, bands, H, W).
        for modality, modal_info in modalities.items():
            bands = modal_info['bands']
            if modality in ['era5', 'eco_region', 'biome']:
                h5file.create_dataset(
                    modality, (num_tiles, bands), dtype=modal_info['dtype']
                )
            else:
                h5file.create_dataset(
                    modality, (num_tiles, bands, *px_dim), dtype=modal_info['dtype']
                )

        # Create datasets for metadata
        h5file.create_dataset('lat', (num_tiles, 2), dtype=np.float32)
        h5file.create_dataset('lon', (num_tiles, 2), dtype=np.float32)
        h5file.create_dataset('month', (num_tiles, 2), dtype=np.int32)
        h5file.create_dataset(
            'metadata',
            (num_tiles,),
            dtype=np.dtype([('meta_id', 'S10'), ('S2_type', 'S3')]),
        )

        # Populate the datasets with sample data
        tile_info = {}
        for i in range(num_tiles):
            for modality in modalities:
                # Look up this modality's band count and dtype explicitly.
                # BUGFIX: the loop previously reused the stale `bands` and
                # `modal_info` left over from the creation loop above (i.e.
                # the *last* modality's values), so every modality's data was
                # generated with the wrong band count and cast to the wrong
                # dtype.
                modal_info = modalities[modality]
                bands = modal_info['bands']
                if modality == 'dynamic_world':
                    old_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
                    data = np.random.choice(old_values, size=(bands, *px_dim))
                elif modality == 'esa_worldcover':
                    old_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100, 255]
                    data = np.random.choice(old_values, size=(bands, *px_dim))
                elif modality == 'era5':
                    # only vector not image data
                    data = np.random.random(size=(bands,))
                elif modality in ['biome', 'eco_region']:
                    data = np.random.randint(0, 2, size=(bands,))
                elif modality == 'sentinel2':
                    data = np.random.randint(0, 65535, size=(bands, *px_dim))
                elif modality in ['aster', 'canopy_height_eth', 'sentinel1']:
                    data = np.random.random(size=(bands, *px_dim))
                elif modality in [
                    'sentinel2_cloudmask',
                    'sentinel2_cloudprod',
                    'sentinel2_scl',
                ]:
                    data = np.random.randint(0, 2, size=(bands, *px_dim))

                data = data.astype(modal_info['dtype'])
                h5file[modality][i] = data

            # add other data for lat, lon, month
            h5file['lat'][i] = np.random.random(size=(2,))
            h5file['lon'][i] = np.random.random(size=(2,))
            h5file['month'][i] = np.random.random(size=(2,))

            # Assign S2_type and store in metadata
            S2_type = np.random.choice(['l1c', 'l2a']).encode('utf-8')
            meta_id = str(i).encode('utf-8')
            h5file['metadata'][i] = (meta_id, S2_type)

            # Collect tile info for JSON file
            tile_meta = meta_dummy_dict.copy()
            tile_meta['S2_type'] = S2_type.decode('utf-8')
            tile_meta['BANDS'] = all_modality_bands
            tile_info[str(i)] = tile_meta

    return tile_info


# Extra keys that appear in the band-stats JSON but not in `modalities`:
# sentinel2 is split into l1c/l2a variants, and lat/lon/month vectors get
# their own stats entries.
extra_band_stats = {
    'sentinel2_l1c': {'bands': 13, 'dtype': np.uint16},
    'sentinel2_l2a': {'bands': 13, 'dtype': np.uint16},
    'lat': {'bands': 2, 'dtype': np.float32},
    'lon': {'bands': 2, 'dtype': np.float32},
    'month': {'bands': 2, 'dtype': np.float32},
}

# Modalities that get band statistics: all HDF5 modalities plus the extras
# above, minus the categorical ones (which have no meaningful mean/std).
band_modalities = {
    k: v
    for k, v in {**modalities, **extra_band_stats}.items()
    if k not in {'biome', 'eco_region', 'dynamic_world', 'esa_worldcover'}
}

# Create JSON files for band stats and splits
# sentinel 2 has l1c and l2a but there is only a common sentinel 2 data entry
band_stats = {
    modality: {
        'mean': np.random.random(size=(mod_info['bands'])).tolist(),
        'std': np.random.random(size=(mod_info['bands'])).tolist(),
        'min': np.random.random(size=(mod_info['bands'])).tolist(),
        'max': np.random.random(size=(mod_info['bands'])).tolist(),
    }
    for modality, mod_info in band_modalities.items()
}

# All tiles go to the train split; val and test are intentionally empty.
train_split = num_tiles
val_split = 0
test_split = 0

# Tile indices per split, written to the `*_splits.json` file.
splits = {
    'train': list(range(train_split)),
    'val': list(range(train_split, train_split + val_split)),
    'test': list(range(train_split + val_split, num_tiles)),
}

if __name__ == '__main__':
    # Dataset-class name -> output directory and image size of the dummy data.
    filenames = {
        'MMEarth': {'dirname': 'data_1M_v001', 'px_dim': (128, 128)},
        'MMEarth64': {'dirname': 'data_1M_v001_64', 'px_dim': (64, 64)},
        'MMEarth100k': {'dirname': 'data_100k_v001', 'px_dim': (128, 128)},
    }
    for key, vals in filenames.items():
        dirname = vals['dirname']
        # remove existing files so each run starts from a clean directory
        if os.path.exists(dirname):
            shutil.rmtree(dirname)

        # create directory
        os.makedirs(dirname)

        # HDF5 file is written to <dirname>/<dirname>.h5
        tile_info = create_hd5f(os.path.join(dirname, dirname), vals['px_dim'])

        with open(os.path.join(dirname, f'{dirname}_splits.json'), 'w') as f:
            json.dump(splits, f, indent=4)

        with open(os.path.join(dirname, f'{dirname}_band_stats.json'), 'w') as f:
            json.dump(band_stats, f, indent=4)

        with open(os.path.join(dirname, f'{dirname}_tile_info.json'), 'w') as f:
            json.dump(tile_info, f, indent=4)

        # BUGFIX: report success only after all files for this subset exist
        # (previously printed before the JSON files were written).
        print(f'{key} data file and JSON files created successfully.')
Binary file not shown.
Loading