From 8c78f709f9db28553b02f461368a30db896aff33 Mon Sep 17 00:00:00 2001 From: LI <1569910942@qq.com> Date: Mon, 17 Jun 2024 16:28:29 -0400 Subject: [PATCH 1/3] Create the_last_metric.py An information-based metric comparing the recoverability of redshift information of simulated OpSims runs. It returns a single number as the Figure of Merit for an OpSim. --- rubin_sim/maf/maf_contrib/the_last_metric.py | 557 +++++++++++++++++++ 1 file changed, 557 insertions(+) create mode 100644 rubin_sim/maf/maf_contrib/the_last_metric.py diff --git a/rubin_sim/maf/maf_contrib/the_last_metric.py b/rubin_sim/maf/maf_contrib/the_last_metric.py new file mode 100644 index 00000000..02f06106 --- /dev/null +++ b/rubin_sim/maf/maf_contrib/the_last_metric.py @@ -0,0 +1,557 @@ +__all__ = ("TheLastMetric") + +""" +TheLastMetric +An information-based metric comparing the recoverability of redshift information +of simulated OpSims runs. It returns a single number as the Figure of Merit for an OpSim. + +Dependencies: +- jax +- pzflow + +Catalog file: +https://github.com/dirac-institute/CMNN_Photoz_Estimator/blob/master/mock_catalog.dat + +Demonstration: +https://colab.research.google.com/drive/1aJjgYS9XvWlyK_qIKbYXz2Rh4IbCwfh7?usp=sharing + +Reference: +Alex M, et al. An information-based metric for observing strategy optimization, +demonstrated in the context of photometric redshifts with applications to cosmology +https://arxiv.org/abs/2104.08229 +""" + +# Install required libraries (uncomment if needed) +# !pip install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +# !pip install astropy pzflow corner + + +import os +import numpy as np +import pandas as pd +import matplotlib +import matplotlib.pyplot as plt +#%matplotlib inline +#import seaborn as sn + +import healpy as hp +import rubin_sim.maf.metrics as metrics +import rubin_sim.maf.slicers as slicers +import rubin_sim.maf.metric_bundles as metricBundles +import rubin_sim.maf.db as db + +import jax.numpy as jnp +import pickle +import corner +from astropy.table import Table +from pzflow import Flow, FlowEnsemble +from pzflow.distributions import Uniform +from pzflow.bijectors import Chain, StandardScaler, NeuralSplineCoupling, ColorTransform, InvSoftplus, RollingSplineCoupling, ShiftBounds +from collections import namedtuple + +import scipy.stats as sps +import datetime + +class TheLastMetric(metrics.BaseMetric): + """ + Parameters: + colname: list, ['observationStartMJD', 'filter', 'fiveSigmaDepth'] + nside: nside of healpix + input_data_dir: string, directory to input catalog data, zphot.cat, test.cat + flowfile: pickle, pre-trained flow model + + Returns: + tlm: the last metric value, tlm = flow.log_prob + entropy, + reduce_tlm: reduce to single number, mean + + """ + + def __init__(self, colname=['observationStartMJD', 'fieldRA', 'fieldDec', 'filter', 'fiveSigmaDepth'], + nside=16, opsdb='/drive/Shareddrives/MAF/TheLastMetric/baseline_v2.0_10yrs.db', + catalog = './mock_catalog.dat', + outDir='./outDir', + input_data_dir = '/drive/Shareddrives/MAF/TheLastMetric/', + flowfile = None, + **kwargs): + self.colname = colname + self.nside = nside + self.opsdb = opsdb + self.outDir = outDir + + self.catalog = catalog + + self.input_data_dir = input_data_dir + self.flowfile = flowfile + super().__init__(col=self.colname, **kwargs) + + + def make_test_and_train(self, verbose=True, runid='1', filtmask=[1,1,1,1,1,1], yfilt=0, outDir='output/', + catalog='mock_catalog.dat', roman_exp=0, + test_m5=[26.1, 27.4, 27.5, 26.8, 26.1, 
24.9, 30, 30, 30], train_m5=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], + test_mcut=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], train_mcut=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], + force_idet=True, force_gridet=True, + test_N=5000, train_N=20000, cmnn_minNc=3): + + ''' + from https://github.com/dirac-institute/CMNN_Photoz_Estimator/tree/master + Create the test and training set based on user specifications. + + Inputs described in cmnn_run.main. + + Outputs: output/run_/test.cat and train.cat + ''' + + if verbose: + print('Starting cmnn_catalog.make_test_and_train: ',datetime.datetime.now()) + + # read galaxy data from the catalog + # recall use of the yfilt parameter is + # yfilt = 0 : use PanSTARRs y-band (default, column 7) + # yfilt = 1 : use Euclid y-band (column 8) + all_id = np.loadtxt(catalog, dtype='float', usecols=(0)) + all_tz = np.loadtxt(catalog, dtype='float', usecols=(1)) + if yfilt == 0: + all_tm = np.loadtxt(catalog, dtype='float', usecols=(2, 3, 4, 5, 6, 7, 9, 10, 11)) + elif yfilt == 1: + all_tm = np.loadtxt(catalog, dtype='float', usecols=(2, 3, 4, 5, 6, 8, 9, 10, 11)) + + # convert user-specified magnitude limits to numpy arrays + np_test_m5 = np.asarray(test_m5, dtype='float') + np_train_m5 = np.asarray(train_m5, dtype='float') + np_test_mcut = np.asarray(test_mcut, dtype='float') + np_train_mcut = np.asarray(train_mcut, dtype='float') + + # gamma sets the impact of sky brightness in magnitude error estimates + gamma = np.asarray( [0.037, 0.038, 0.039, 0.039, 0.04, 0.04, 0.04, 0.04, 0.04], dtype='float' ) + + # apply user-specified m5 depths to calculate magnitude errors for all galaxies + all_test_me = np.sqrt((0.04 - gamma) * (np.power(10.0, 0.4 * (all_tm[:] - np_test_m5))) + \ + gamma * (np.power(10.0, 0.4*(all_tm[:] - np_test_m5))**2)) + all_train_me = np.sqrt((0.04 - gamma) * (np.power(10.0, 0.4 * (all_tm[:] - np_train_m5))) + \ + gamma * (np.power(10.0, 0.4 * (all_tm[:] - np_train_m5))**2)) + + # apply the uncertainty floor of 0.005 mag + for f in range(9): + tex = np.where( all_test_me[:,f] < 0.0050)[0] + all_test_me[tex,f] = float(0.0050) + trx = np.where( all_train_me[:,f] < 0.0050)[0] + all_train_me[trx,f] = float(0.0050) + + # use the errors to calculate apparent observed magnitudes + all_test_m = all_tm + all_test_me * np.random.normal(size=(len(all_tm), 9)) + all_train_m = all_tm + all_train_me * np.random.normal(size=(len(all_tm), 9)) + + # apply 17 mag as the saturation limit for all filters + for f in range(9): + tx = np.where(all_tm[:,f] < 17.0000)[0] + all_test_me[tx] = np.nan + all_test_m[tx] = np.nan + all_train_me[tx] = np.nan + all_train_m[tx] = np.nan + del tx + + # do not allow "upscattering" of > 0.2 mag + for f in range(9): + tx = np.where(all_tm[:,f] > np_test_m5[f] + 0.20)[0] + all_test_me[tx] = np.nan + all_test_m[tx] = np.nan + del tx + tx = np.where(all_tm[:,f] > np_train_m5[f] + 0.20)[0] + all_train_me[tx] = np.nan + all_train_m[tx] = np.nan + del tx + + # apply the user-specified magnitude cuts + for f in range(9): + te_x = np.where(all_test_m[:,f] > np_test_mcut[f])[0] + if len(te_x) > 0: + all_test_m[te_x, f] = np.nan + all_test_me[te_x, f] = np.nan + if (force_idet == True) & (f == 3): + all_test_m[te_x, :] = np.nan + all_test_me[te_x, :] = np.nan + if (force_gridet == True) & ((f == 1) | (f == 2) | (f == 3)): + all_test_m[te_x, :] = np.nan + all_test_me[te_x, :] = np.nan + tr_x = np.where(all_train_m[:,f] > np_train_mcut[f])[0] + if len(tr_x) > 0: + all_train_m[tr_x, f] = np.nan + all_train_me[tr_x, f] = 
np.nan + if (force_idet == True) & (f == 3): + all_train_m[tr_x, :] = np.nan + all_train_me[tr_x, :] = np.nan + if (force_gridet == True) & ((f == 1) | (f == 2) | (f == 3)): + all_train_m[tr_x, :] = np.nan + all_train_me[tr_x, :] = np.nan + del te_x,tr_x + + # Roman special experiements + # 0 : fifth color is z-y; do nothing + # 1 : fifth color is z-J; put J into y + # 2 : fifth color is z-H; put H into y + # 3 : fifth color is z-K; put K into y + # 4 : sixth color is y-J; do nothing + # 5 : sixth color is y-H; put H into J + # 6 : sixth color is y-K; put K into J + if roman_exp == 1: + all_test_m[:, 5] = all_test_m[:, 6] + all_test_me[:, 5] = all_test_me[:, 6] + all_train_m[:, 5] = all_train_m[:, 6] + all_train_me[:, 5] = all_train_me[:, 6] + if roman_exp == 2: + all_test_m[:, 5] = all_test_m[:, 7] + all_test_me[:, 5] = all_test_me[:, 7] + all_train_m[:, 5] = all_train_m[:, 7] + all_train_me[:, 5] = all_train_me[:, 7] + if roman_exp == 3: + all_test_m[:, 5] = all_test_m[:, 8] + all_test_me[:, 5] = all_test_me[:, 8] + all_train_m[:, 5] = all_train_m[:, 8] + all_train_me[:, 5] = all_train_me[:, 8] + if roman_exp == 5: + all_test_m[:, 6] = all_test_m[:, 7] + all_test_me[:, 6] = all_test_me[:, 7] + all_train_m[:, 6] = all_train_m[:, 7] + all_train_me[:, 6] = all_train_me[:, 7] + if roman_exp == 6: + all_test_m[:, 6] = all_test_m[:, 8] + all_test_me[:, 6] = all_test_me[:, 8] + all_train_m[:, 6] = all_train_m[:, 8] + all_train_me[:, 6] = all_train_me[:, 8] + + # apply filtmask + for f, fm in enumerate(filtmask): + if fm == 0: + all_test_m[:, f] = np.nan + all_test_me[:, f] = np.nan + all_train_m[:, f] = np.nan + all_train_me[:, f] = np.nan + + # calculate colors, color errors, and number of colors + all_test_c = np.zeros((len(all_tm), 8), dtype='float') + all_test_ce = np.zeros((len(all_tm), 8), dtype='float') + all_train_c = np.zeros((len(all_tm), 8), dtype='float') + all_train_ce = np.zeros((len(all_tm), 8), dtype='float') + for c in range(8): + all_test_c[:, c] = all_test_m[:, c] - all_test_m[:, c+1] + all_train_c[:, c] = all_train_m[:, c] - all_train_m[:, c+1] + all_test_ce[:, c] = np.sqrt( all_test_me[:, c]**2 + all_test_me[:, c+1]**2 ) + all_train_ce[:, c] = np.sqrt( all_train_me[:, c]**2 + all_train_me[:, c+1]**2 ) + all_test_Nc = np.nansum(all_test_c/all_test_c, axis=1) + all_train_Nc = np.nansum(all_train_c/all_train_c, axis=1) + + # create test and training sets + te_x = np.where( all_test_Nc >= cmnn_minNc )[0] + tr_x = np.where( all_train_Nc >= cmnn_minNc )[0] + + if (len(te_x) < test_N) | (len(tr_x) < train_N): + print('Error. 
Desired number of test/training galaxies higher than what is available.') + print(' test number desired, available: %i %i' % (test_N, len(te_x))) + print(' train number desired, available: %i %i' % (train_N, len(tr_x))) + print('Exit (inputs too constraining to build test/train set).') + exit() + + else: + te_rx = np.random.choice(te_x, size=test_N, replace=False) + test_fout = open(outDir + '/run_'+runid+'_test.cat', 'w') + for i in te_rx: + test_fout.write('%10i %10.8f ' % (all_id[i], all_tz[i])) + for f in range(9): + test_fout.write('%9.6f %9.6f ' % (all_test_m[i, f], all_test_me[i, f])) + for c in range(8): + test_fout.write('%9.6f %9.6f ' % (all_test_c[i, c], all_test_ce[i, c])) + test_fout.write('\n') + test_fout.close() + del te_rx,test_fout + + tr_rx = np.random.choice(tr_x, size=train_N, replace=False) + train_fout = open(outDir + '/run_'+runid+'_train.cat','w') + for i in tr_rx: + train_fout.write('%10i %10.8f ' % (all_id[i], all_tz[i])) + for f in range(9): + train_fout.write('%9.6f %9.6f ' % (all_train_m[i, f], all_train_me[i, f])) + for c in range(8): + train_fout.write('%9.6f %9.6f ' % (all_train_c[i, c], all_train_ce[i, c])) + train_fout.write('\n') + train_fout.close() + del tr_rx,train_fout + + if verbose: + print('Wrote ', outDir+'run_'+runid+'_test.cat' + outDir + 'run_'+runid+'_train.cat') + print('Finished cmnn_catalog.make_test_and_train: ',datetime.datetime.now()) + train_cat = outDir + 'run_'+runid + '_train.cat' + test_cat = outDir + 'run_'+runid + '_test.cat' + return train_cat, test_cat + + + + def get_coaddM5(self, outDir='./outDir', colname=['observationStartMJD', 'filter', 'fiveSigmaDepth'], + nside=16, opsdb = '/drive/Shareddrives/MAF/TheLastMetric/baseline_v2.0_10yrs.db' + ): + + """run a ExGalCoadd to get the median of M5""" + resultsDb = db.ResultsDb(out_dir=outDir) + # metric, slicer, constraint + metric = ExGalCoaddM5Metric( colname=colname, ) + + slicer = slicers.HealpixSlicer(nside=nside) + + # bundle + metricSky = metricBundles.MetricBundle(metric, slicer, sqlstr) + + # group bundle + bundleDict = {'metricSky':metricSky} + + # table names='observations' for v2.0 + #if 'observations' in opsdb.get_table_names(): + # dbTable='observations' + #else: + # dbTable = 'SummaryAllProps' + + dbTable = 'observations' + + #group = metricBundles.MetricBundleGroup(bundleDict, opsdb, + # outDir=outDir, + # resultsDb=resultsDb, + # dbTable=dbTable ) + + group = metricBundles.MetricBundleGroup(bundleDict, opsdb, + out_dir=outDir, + results_db=resultsDb, + db_table=dbTable ) + + group.run_all() + + data = metricSky.metric_values.data[ ~metricSky.metric_values.mask] + df = pd.DataFrame.from_records(data) + coaddM5 = df[df['ebv']<0.2].dropna().median()[['coaddm5_u', 'coaddm5_g','coaddm5_r', + 'coaddm5_i', 'coaddm5_z','coaddm5_y',]].values + + return coaddM5 + + + def run(self, dataSlice, slice_point=None): + # sort the dataSlice in order of time. 
+ # slicePoint {'ra':, 'dec':, 'sid':} + #dataSlice.sort(order='observationStartMJD') + #result = {} + #result['fieldRA'] = slicePoint['fieldRA'] + #print(slicePoint) + #slice_point['Nv'] = len(dataSlice) + + #----------------- + # load catalog data + + #input_data_dir = '/drive/Shareddrives/MAF/TheLastMetric/' + + #names_z=('ID', 'z_true', 'z_phot', 'dz_phot', 'NN', 'N_train') + + #names_phot=('ID', 'z_true', + # 'u', 'err_u', 'g', 'err_g', 'r', 'err_r', 'i', 'err_i', 'z', 'err_z', 'y', 'err_y', + # 'u-g', 'err_u-g', 'g-r', 'err_g-r', 'r-i', 'err_r-i', 'i-z', 'err_i-z', 'z-y', 'err_z-y') + + #z_cat = Table.read(self.input_data_dir + 'zphot.cat', + # format='ascii', + # names=names_z) + + #phot_cat = Table.read(self.input_data_dir + 'test.cat', + # format='ascii', + # names=names_phot) + + #phot_cat = Table.from_pandas(phot_cat.to_pandas().dropna()) + + #cat = phot_cat.to_pandas().merge(z_cat.to_pandas()) + + ## --------cut by median m5-------------- + + # ----------------------- + + # get M5 median + coaddM5 = self.get_coaddM5(outDir = self.outDir, colname = self.colname, + nside = self.nside, opsdb=self.opsdb) + + # append cut for J, H, K used in make_test_and_train + coaddM5 = np.append(coaddM5, [30, 30, 30]) + + print('coaddM5', coaddM5) + _, test_cat_file = self.make_test_and_train(runid='1', catalog=self.catalog, + train_mcut=coaddM5, + test_mcut=coaddM5) + + names_phot=('ID', 'z_true', + 'u', 'err_u', 'g', 'err_g', 'r', 'err_r', 'i', 'err_i', 'z', 'err_z', 'y', 'err_y', + 'u-g', 'err_u-g', 'g-r', 'err_g-r', 'r-i', 'err_r-i', 'i-z', 'err_i-z', 'z-y', 'err_z-y') + + usecols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, #14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, #30, 31, 32, 33, 34, 35 + ] + + df_cat = pd.read_csv(test_cat_file, delim_whitespace=True, names=names_phot, usecols=usecols).dropna() + print('loaded df_cat') + + data_columns = ["z_true"] + drop_cols = ['ID', 'z_true', 'u', 'g', 'i', 'z', 'y', + 'err_u', 'err_g', 'err_r', 'err_i', 'err_z', 'err_y', + 'err_u-g', 'err_g-r', 'err_r-i', 'err_i-z', 'err_z-y'] + + conditional_columns = df_cat.drop(drop_cols, axis = 1) + # data_columns: z_true + # conditional_columns: r, u-g, g-r, r-i, i-z, z-y + + if self.flowfile!=None: + flow = FlowEnsemble(file=self.flowfile) + else: + # train flow model + ndcol = len(data_columns) + ncond = len(conditional_columns) + ndim = ndcol+ncond + + K = 16 + bijector = Chain( + StandardScaler(np.atleast_1d(1.6), np.atleast_1d(0.32)), + NeuralSplineCoupling(B=5, n_conditions=6, K=K) + ) + + latent = Uniform(input_dim=ndcol, B=7) # this has changed. + + info = f"Models z_true conditioned on galaxy colors and r mag from {test_cat_file}. 
K = 16" + + flow = FlowEnsemble(data_columns = data_columns, + conditional_columns = conditional_columns, + bijector = bijector, + latent = latent, + info = info, + N = 10 + ) + + loss = flow.train(df_cat, #conditional_columns], + convolve_errs=False, + epochs=150, verbose=True); + + #IDS = phot_cat['ID'] + #z_cat_no_nan = z_cat.to_pandas()[z_cat.to_pandas()['ID'].isin(IDS)] + + b = sps.mstats.mquantiles(df_cat['z_true'], np.linspace(0,1, 101, endpoint=True)) + # print(len(z_cats_no_nan[which_os]['z_true'])) + # print(b) + + #b_centers = 0.5*(b[1:] + b[:-1]) + + # Computing the entropy H(z) + pz = sps.rv_histogram(np.histogram(df_cat['z_true'], bins=b)) + entropy = pz.entropy() + + # mutual information lower bound + milb = flow.log_prob(df_cat, returnEnsemble=True, err_samples=10)# + entropy + + tlm = milb.mean(axis=0) + entropy + print("calculated tlm") + + #with open('test_tlm.pkl', 'wb') as outfile: + # pickle.dump(tlm, outfile) + + return { #'dataSlice': dataSlice, + 'entropy': entropy, + 'milb': milb, + 'tlm': tlm + } + + def reduce_tlm(self, metric): + """get number of visits""" + return metric['tlm'].mean() + + + +class ExGalCoaddM5Metric(metrics.BaseMetric): + """ + Calculate coadd M5 for all filters used to cut simulated catalog in theLastMetric + Parameters: + colname: list, ['observationStartMJD', 'filter', 'fiveSigmaDepth'] + nside: nside of healpix + + Returns: + return coaddedM5 # if Ebv<0.2 mag + + """ + + def __init__(self, colname=['observationStartMJD', 'fieldRA', + 'fieldDec', 'filter', 'fiveSigmaDepth'], + maps=['DustMap'], + **kwargs): + self.colname = colname + #self.nside = nside + + super().__init__(col=self.colname, maps=['DustMap'], **kwargs) + + def run(self, data_slice, slice_point=None): + # sort the dataSlice in order of time. 
+ # slicePoint {'ra':, 'dec':, 'sid':} + data_slice.sort(order='observationStartMJD') + #result = {} + #result['fieldRA'] = slicePoint['fieldRA'] + #print(slicePoint) + slice_point['Nv'] = len(data_slice) + + for f in np.unique(data_slice['filter']): + data_slice_f = data_slice[data_slice['filter']==f] + slice_point[f'Nv_{f}'] = len(data_slice_f) + slice_point[f'coaddm5_{f}'] = metrics.Coaddm5Metric().run(data_slice_f) + + return slice_point + +# def reduce_m5(self, metric): +# """return coaddM5""" +# return metric['coaddm5'] + + def reduce_ebv(self, metric): + return metric['ebv'] + + + +if __name__=='__main__': + + outDir = './outDir' + # outDir = '/drive/Shareddrives/MAF/TheLastMetric/outDir' + resultsDb = db.ResultsDb(out_dir=outDir) + + colname = ['observationStartMJD', 'filter', 'fiveSigmaDepth'] + sqlstr = 'night<400' + + nside = 16 + + opsdb = '/drive/Shareddrives/MAF/TheLastMetric/baseline_v2.0_10yrs.db' + # opsdb = './baseline_v2.0_10yrs.db' + #opsdb = db.Database(dbpath_v30+'baseline_v3.0_10yrs.db') + + # metric, slicer, constraint + metric = TheLastMetric( colname=colname, nside=nside, opsdb=opsdb) + + #slicer = slicers.HealpixSlicer(nside=nside) + + slicer = slicers.UniSlicer() + # bundle + metricSky = metricBundles.MetricBundle(metric, slicer, sqlstr) + + # group bundle + bundleDict = {'metricSky':metricSky} + + # table names='observations' for v2.0 + #if 'observations' in opsdb.get_table_names(): + # dbTable='observations' + #else: + # dbTable = 'SummaryAllProps' + + dbTable = 'observations' + + #group = metricBundles.MetricBundleGroup(bundleDict, opsdb, + # outDir=outDir, + # resultsDb=resultsDb, + # dbTable=dbTable ) + + group = metricBundles.MetricBundleGroup(bundleDict, opsdb, + out_dir=outDir, + results_db=resultsDb, + db_table=dbTable ) + + group.run_all() + From dcf183ae8e4786b2b9c7973e979b8d9d04af3e2d Mon Sep 17 00:00:00 2001 From: Lynne Jones Date: Thu, 17 Oct 2024 21:18:46 -0700 Subject: [PATCH 2/3] Modify TheLastMetric to MAF api --- requirements.txt | 1 + rubin_sim/maf/maf_contrib/__init__.py | 1 + rubin_sim/maf/maf_contrib/the_last_metric.py | 820 +++++++++--------- rubin_sim/maf/metrics/exgal_m5.py | 4 + .../weak_lensing_systematics_metric.py | 112 ++- 5 files changed, 508 insertions(+), 430 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8f8f882a..2a05f8f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ astroplan colorcet cycler george +pzflow scikit-learn shapely skyfield diff --git a/rubin_sim/maf/maf_contrib/__init__.py b/rubin_sim/maf/maf_contrib/__init__.py index 251546c8..238adf0a 100644 --- a/rubin_sim/maf/maf_contrib/__init__.py +++ b/rubin_sim/maf/maf_contrib/__init__.py @@ -18,6 +18,7 @@ from .star_count_metric import * from .static_probes_fom_summary_metric import * from .tdes_pop_metric import * +from .the_last_metric import * from .triplet_metric import * from .var_depth_metric import * from .xrb_metrics import * diff --git a/rubin_sim/maf/maf_contrib/the_last_metric.py b/rubin_sim/maf/maf_contrib/the_last_metric.py index 02f06106..d4f29a00 100644 --- a/rubin_sim/maf/maf_contrib/the_last_metric.py +++ b/rubin_sim/maf/maf_contrib/the_last_metric.py @@ -1,141 +1,201 @@ -__all__ = ("TheLastMetric") - -""" -TheLastMetric -An information-based metric comparing the recoverability of redshift information -of simulated OpSims runs. It returns a single number as the Figure of Merit for an OpSim. 
- -Dependencies: -- jax -- pzflow - -Catalog file: -https://github.com/dirac-institute/CMNN_Photoz_Estimator/blob/master/mock_catalog.dat - -Demonstration: -https://colab.research.google.com/drive/1aJjgYS9XvWlyK_qIKbYXz2Rh4IbCwfh7?usp=sharing - -Reference: -Alex M, et al. An information-based metric for observing strategy optimization, -demonstrated in the context of photometric redshifts with applications to cosmology -https://arxiv.org/abs/2104.08229 -""" - -# Install required libraries (uncomment if needed) -# !pip install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -# !pip install astropy pzflow corner - +__all__ = ("TheLastMetric", "theLastMetricBatch") +import datetime import os + import numpy as np import pandas as pd -import matplotlib -import matplotlib.pyplot as plt -#%matplotlib inline -#import seaborn as sn - -import healpy as hp -import rubin_sim.maf.metrics as metrics -import rubin_sim.maf.slicers as slicers -import rubin_sim.maf.metric_bundles as metricBundles -import rubin_sim.maf.db as db - -import jax.numpy as jnp -import pickle -import corner -from astropy.table import Table -from pzflow import Flow, FlowEnsemble -from pzflow.distributions import Uniform -from pzflow.bijectors import Chain, StandardScaler, NeuralSplineCoupling, ColorTransform, InvSoftplus, RollingSplineCoupling, ShiftBounds -from collections import namedtuple - import scipy.stats as sps -import datetime +from pzflow import FlowEnsemble +from pzflow.bijectors import Chain, NeuralSplineCoupling, StandardScaler +from pzflow.distributions import Uniform -class TheLastMetric(metrics.BaseMetric): +from rubin_sim.data import get_data_dir + +from ..batches import col_map_dict +from ..metric_bundles import MetricBundle, MetricBundleGroup +from ..metrics import BaseMetric, ExgalM5WithCuts +from ..slicers import HealpixSlicer, UniSlicer + +# To run with GPU, install jax with cuda +# pip install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + + +class TheLastMetric(BaseMetric): + """Calculate an information-based metric comparing the recoverability + of redshift information based on exposure metadata. + + Parameters + ---------- + catalog : `str` or None + Path to the mock catalog. Default None uses + $RUBIN_SIM_DATA_DIR/maf/mlg_mock/mock_catalog.dat + flowfile : `str` or None + Path to the XXXX + m5_col : `str` + Name of the column with m5 information + filter_col : `str` + Name of the column with filter information + additional_cols : `list` [`str`] + Additional columns necessary to run the metric. + metric_name : `str` + Name to apply to the metric (for labels and outputs). + + Returns + ------- + entropy, milb, tlm + tlm: the last metric value, tlm = flow.log_prob + entropy + + Notes + ----- + An information-based metric comparing the recoverability of + redshift information of simulated OpSims runs. + It returns a single number as the Figure of Merit for an OpSim. + + Catalog file: + https://github.com/dirac-institute/CMNN_Photoz_Estimator/blob/master/mock_catalog.dat + + Demonstration: + https://colab.research.google.com/drive/1aJjgYS9XvWlyK_qIKbYXz2Rh4IbCwfh7?usp=sharing + + Reference: + Alex M, et al. 
An information-based metric for observing strategy + optimization, demonstrated in the context of photometric redshifts + with applications to cosmology + https://arxiv.org/abs/2104.08229 """ - Parameters: - colname: list, ['observationStartMJD', 'filter', 'fiveSigmaDepth'] - nside: nside of healpix - input_data_dir: string, directory to input catalog data, zphot.cat, test.cat - flowfile: pickle, pre-trained flow model - - Returns: - tlm: the last metric value, tlm = flow.log_prob + entropy, - reduce_tlm: reduce to single number, mean - """ + def __init__( + self, + catalog=None, + flowfile=None, + m5_col="fiveSigmaDepth", + filter_col="filter", + additional_cols=["fieldRA", "fieldDec", "rotSkyPos"], + metric_name="TheLastMetric", + scratch_dir=None, + **kwargs, + ): + self.m5_col = m5_col + self.filter_col = filter_col + + if catalog is not None: + self.catalog = catalog + else: + self.catalog = os.path.join(get_data_dir(), "maf", "mlg_mock", "mock_catalog.dat") - def __init__(self, colname=['observationStartMJD', 'fieldRA', 'fieldDec', 'filter', 'fiveSigmaDepth'], - nside=16, opsdb='/drive/Shareddrives/MAF/TheLastMetric/baseline_v2.0_10yrs.db', - catalog = './mock_catalog.dat', - outDir='./outDir', - input_data_dir = '/drive/Shareddrives/MAF/TheLastMetric/', - flowfile = None, - **kwargs): - self.colname = colname - self.nside = nside - self.opsdb = opsdb - self.outDir = outDir - - self.catalog = catalog - - self.input_data_dir = input_data_dir self.flowfile = flowfile - super().__init__(col=self.colname, **kwargs) - - def make_test_and_train(self, verbose=True, runid='1', filtmask=[1,1,1,1,1,1], yfilt=0, outDir='output/', - catalog='mock_catalog.dat', roman_exp=0, - test_m5=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], train_m5=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], - test_mcut=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], train_mcut=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], - force_idet=True, force_gridet=True, - test_N=5000, train_N=20000, cmnn_minNc=3): - - ''' - from https://github.com/dirac-institute/CMNN_Photoz_Estimator/tree/master - Create the test and training set based on user specifications. - - Inputs described in cmnn_run.main. - - Outputs: output/run_/test.cat and train.cat - ''' + cols = [self.m5_col, self.filter_col] + additional_cols + super().__init__(col=cols, metric_name=metric_name, **kwargs) + + if scratch_dir is None: + scratch_dir = os.getcwd() + self.scratch_dir = scratch_dir + + def make_test_and_train( + self, + verbose=False, + filtmask=[1, 1, 1, 1, 1, 1], + yfilt=0, + roman_exp=0, + test_m5=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], + train_m5=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], + test_mcut=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], + train_mcut=[26.1, 27.4, 27.5, 26.8, 26.1, 24.9, 30, 30, 30], + force_idet=True, + force_gridet=True, + test_N=5000, + train_N=20000, + cmnn_minNc=3, + ): + """Create the test and training set based on user specifications. + + This is based on work in + https://github.com/dirac-institute/CMNN_Photoz_Estimator + Parameters are further described in the argparser for + `cmnn_run` in that repository. + + The input catalog (default is mock_catalog.dat) includes galaxy + measurements in u g r i z y J H K -- Rubin supplemented with Roman. + This method sets m5 and mcut values in each bandpass, for creating + testing and training catalogs from the input catalog. 
+ + Parameters + ---------- + filtmask : `list` [`int`] + Set mask for use of filters u g r i z y J H K + yfilt : `int` + 0 - use PanStarrs y band, 1 - use Euclid y-band + roman_exp : `int` + Set the 5th or 6th color to use Roman mags (0 = no Roman?) + test_m5 : `list` [`float`] + 5-sigma magnitude limits (depths) for test-set galaxies + train_m5 : `list` [`float`] + 5-sigma magnitude limits (depths) for training-set galaxies + test_mcut : `list` [`float`] + Magnitude cut-off to apply to the test-set galaxies + train_mcut : `list` [`float`] + Magnitude cut-off to apply to the training-set galaxies + force_idet : `bool` + True - force i-band detection for all galaxies + force_gridet : `bool` + True - force g+r+i-band detection for all galaxies + test_N : `int` + Number of test-set galaxies + train_N : `int` + Number of training-set galaxies + cmnn_minNc : `int` + Minimum number of colors for galaxies (2 to 8) + + Returns + ------- + train_cat, test_cat : `np.ndarray` (N, M), `np.ndarray` (N, M) + Training and testing sub-selected catalogs. + """ if verbose: - print('Starting cmnn_catalog.make_test_and_train: ',datetime.datetime.now()) + print("Starting cmnn_catalog.make_test_and_train: ", datetime.datetime.now()) # read galaxy data from the catalog # recall use of the yfilt parameter is # yfilt = 0 : use PanSTARRs y-band (default, column 7) # yfilt = 1 : use Euclid y-band (column 8) - all_id = np.loadtxt(catalog, dtype='float', usecols=(0)) - all_tz = np.loadtxt(catalog, dtype='float', usecols=(1)) + all_id = np.loadtxt(self.catalog, dtype="float", usecols=(0)) + all_tz = np.loadtxt(self.catalog, dtype="float", usecols=(1)) if yfilt == 0: - all_tm = np.loadtxt(catalog, dtype='float', usecols=(2, 3, 4, 5, 6, 7, 9, 10, 11)) + all_tm = np.loadtxt(self.catalog, dtype="float", usecols=(2, 3, 4, 5, 6, 7, 9, 10, 11)) elif yfilt == 1: - all_tm = np.loadtxt(catalog, dtype='float', usecols=(2, 3, 4, 5, 6, 8, 9, 10, 11)) + all_tm = np.loadtxt(self.catalog, dtype="float", usecols=(2, 3, 4, 5, 6, 8, 9, 10, 11)) # convert user-specified magnitude limits to numpy arrays - np_test_m5 = np.asarray(test_m5, dtype='float') - np_train_m5 = np.asarray(train_m5, dtype='float') - np_test_mcut = np.asarray(test_mcut, dtype='float') - np_train_mcut = np.asarray(train_mcut, dtype='float') + np_test_m5 = np.asarray(test_m5, dtype="float") + np_train_m5 = np.asarray(train_m5, dtype="float") + np_test_mcut = np.asarray(test_mcut, dtype="float") + np_train_mcut = np.asarray(train_mcut, dtype="float") # gamma sets the impact of sky brightness in magnitude error estimates - gamma = np.asarray( [0.037, 0.038, 0.039, 0.039, 0.04, 0.04, 0.04, 0.04, 0.04], dtype='float' ) - - # apply user-specified m5 depths to calculate magnitude errors for all galaxies - all_test_me = np.sqrt((0.04 - gamma) * (np.power(10.0, 0.4 * (all_tm[:] - np_test_m5))) + \ - gamma * (np.power(10.0, 0.4*(all_tm[:] - np_test_m5))**2)) - all_train_me = np.sqrt((0.04 - gamma) * (np.power(10.0, 0.4 * (all_tm[:] - np_train_m5))) + \ - gamma * (np.power(10.0, 0.4 * (all_tm[:] - np_train_m5))**2)) + # Gamma can be double-checked against the values in + # rubin_scheduler.utils.SysEngVals, and is here extended further red + gamma = np.asarray([0.037, 0.038, 0.039, 0.039, 0.04, 0.04, 0.04, 0.04, 0.04], dtype="float") + + # apply user-specified m5 depths to calculate magnitude errors + # for all galaxies + all_test_me = np.sqrt( + (0.04 - gamma) * (np.power(10.0, 0.4 * (all_tm[:] - np_test_m5))) + + gamma * (np.power(10.0, 0.4 * (all_tm[:] - np_test_m5)) ** 2) + ) 
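+        # Both error arrays follow the LSST random photometric error model
+        # of Ivezic et al. (2019): sigma_rand**2 = (0.04 - gamma) * x + gamma * x**2,
+        # with x = 10**(0.4 * (m - m5)); the test-set errors above use np_test_m5
+        # and the training-set errors below use np_train_m5.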
+ all_train_me = np.sqrt( + (0.04 - gamma) * (np.power(10.0, 0.4 * (all_tm[:] - np_train_m5))) + + gamma * (np.power(10.0, 0.4 * (all_tm[:] - np_train_m5)) ** 2) + ) # apply the uncertainty floor of 0.005 mag for f in range(9): - tex = np.where( all_test_me[:,f] < 0.0050)[0] - all_test_me[tex,f] = float(0.0050) - trx = np.where( all_train_me[:,f] < 0.0050)[0] - all_train_me[trx,f] = float(0.0050) + tex = np.where(all_test_me[:, f] < 0.0050)[0] + all_test_me[tex, f] = float(0.0050) + trx = np.where(all_train_me[:, f] < 0.0050)[0] + all_train_me[trx, f] = float(0.0050) # use the errors to calculate apparent observed magnitudes all_test_m = all_tm + all_test_me * np.random.normal(size=(len(all_tm), 9)) @@ -143,7 +203,7 @@ def make_test_and_train(self, verbose=True, runid='1', filtmask=[1,1,1,1,1,1], y # apply 17 mag as the saturation limit for all filters for f in range(9): - tx = np.where(all_tm[:,f] < 17.0000)[0] + tx = np.where(all_tm[:, f] < 17.0000)[0] all_test_me[tx] = np.nan all_test_m[tx] = np.nan all_train_me[tx] = np.nan @@ -152,18 +212,18 @@ def make_test_and_train(self, verbose=True, runid='1', filtmask=[1,1,1,1,1,1], y # do not allow "upscattering" of > 0.2 mag for f in range(9): - tx = np.where(all_tm[:,f] > np_test_m5[f] + 0.20)[0] + tx = np.where(all_tm[:, f] > np_test_m5[f] + 0.20)[0] all_test_me[tx] = np.nan all_test_m[tx] = np.nan del tx - tx = np.where(all_tm[:,f] > np_train_m5[f] + 0.20)[0] + tx = np.where(all_tm[:, f] > np_train_m5[f] + 0.20)[0] all_train_me[tx] = np.nan all_train_m[tx] = np.nan del tx # apply the user-specified magnitude cuts for f in range(9): - te_x = np.where(all_test_m[:,f] > np_test_mcut[f])[0] + te_x = np.where(all_test_m[:, f] > np_test_mcut[f])[0] if len(te_x) > 0: all_test_m[te_x, f] = np.nan all_test_me[te_x, f] = np.nan @@ -173,7 +233,7 @@ def make_test_and_train(self, verbose=True, runid='1', filtmask=[1,1,1,1,1,1], y if (force_gridet == True) & ((f == 1) | (f == 2) | (f == 3)): all_test_m[te_x, :] = np.nan all_test_me[te_x, :] = np.nan - tr_x = np.where(all_train_m[:,f] > np_train_mcut[f])[0] + tr_x = np.where(all_train_m[:, f] > np_train_mcut[f])[0] if len(tr_x) > 0: all_train_m[tr_x, f] = np.nan all_train_me[tr_x, f] = np.nan @@ -183,7 +243,7 @@ def make_test_and_train(self, verbose=True, runid='1', filtmask=[1,1,1,1,1,1], y if (force_gridet == True) & ((f == 1) | (f == 2) | (f == 3)): all_train_m[tr_x, :] = np.nan all_train_me[tr_x, :] = np.nan - del te_x,tr_x + del te_x, tr_x # Roman special experiements # 0 : fifth color is z-y; do nothing @@ -228,330 +288,304 @@ def make_test_and_train(self, verbose=True, runid='1', filtmask=[1,1,1,1,1,1], y all_train_me[:, f] = np.nan # calculate colors, color errors, and number of colors - all_test_c = np.zeros((len(all_tm), 8), dtype='float') - all_test_ce = np.zeros((len(all_tm), 8), dtype='float') - all_train_c = np.zeros((len(all_tm), 8), dtype='float') - all_train_ce = np.zeros((len(all_tm), 8), dtype='float') + all_test_c = np.zeros((len(all_tm), 8), dtype="float") + all_test_ce = np.zeros((len(all_tm), 8), dtype="float") + all_train_c = np.zeros((len(all_tm), 8), dtype="float") + all_train_ce = np.zeros((len(all_tm), 8), dtype="float") for c in range(8): - all_test_c[:, c] = all_test_m[:, c] - all_test_m[:, c+1] - all_train_c[:, c] = all_train_m[:, c] - all_train_m[:, c+1] - all_test_ce[:, c] = np.sqrt( all_test_me[:, c]**2 + all_test_me[:, c+1]**2 ) - all_train_ce[:, c] = np.sqrt( all_train_me[:, c]**2 + all_train_me[:, c+1]**2 ) - all_test_Nc = np.nansum(all_test_c/all_test_c, axis=1) - 
all_train_Nc = np.nansum(all_train_c/all_train_c, axis=1) + all_test_c[:, c] = all_test_m[:, c] - all_test_m[:, c + 1] + all_train_c[:, c] = all_train_m[:, c] - all_train_m[:, c + 1] + all_test_ce[:, c] = np.sqrt(all_test_me[:, c] ** 2 + all_test_me[:, c + 1] ** 2) + all_train_ce[:, c] = np.sqrt(all_train_me[:, c] ** 2 + all_train_me[:, c + 1] ** 2) + all_test_Nc = np.nansum(all_test_c / all_test_c, axis=1) + all_train_Nc = np.nansum(all_train_c / all_train_c, axis=1) # create test and training sets - te_x = np.where( all_test_Nc >= cmnn_minNc )[0] - tr_x = np.where( all_train_Nc >= cmnn_minNc )[0] + te_x = np.where(all_test_Nc >= cmnn_minNc)[0] + tr_x = np.where(all_train_Nc >= cmnn_minNc)[0] if (len(te_x) < test_N) | (len(tr_x) < train_N): - print('Error. Desired number of test/training galaxies higher than what is available.') - print(' test number desired, available: %i %i' % (test_N, len(te_x))) - print(' train number desired, available: %i %i' % (train_N, len(tr_x))) - print('Exit (inputs too constraining to build test/train set).') - exit() + print("Error. Desired number of test/training galaxies higher than what is available.") + print(" test number desired, available: %i %i" % (test_N, len(te_x))) + print(" train number desired, available: %i %i" % (train_N, len(tr_x))) + return None, None else: + # I would like this to return the actual catalogs, not filenames + # Can you build these as a subselection from the arrays and just + # pass them back? te_rx = np.random.choice(te_x, size=test_N, replace=False) - test_fout = open(outDir + '/run_'+runid+'_test.cat', 'w') - for i in te_rx: - test_fout.write('%10i %10.8f ' % (all_id[i], all_tz[i])) - for f in range(9): - test_fout.write('%9.6f %9.6f ' % (all_test_m[i, f], all_test_me[i, f])) - for c in range(8): - test_fout.write('%9.6f %9.6f ' % (all_test_c[i, c], all_test_ce[i, c])) - test_fout.write('\n') - test_fout.close() - del te_rx,test_fout + test_cat = os.path.join(self.scratch_dir, "run_test.cat") + with open(test_cat, "w") as test_fout: + for i in te_rx: + test_fout.write("%10i %10.8f " % (all_id[i], all_tz[i])) + for f in range(9): + test_fout.write("%9.6f %9.6f " % (all_test_m[i, f], all_test_me[i, f])) + for c in range(8): + test_fout.write("%9.6f %9.6f " % (all_test_c[i, c], all_test_ce[i, c])) + test_fout.write("\n") + del te_rx tr_rx = np.random.choice(tr_x, size=train_N, replace=False) - train_fout = open(outDir + '/run_'+runid+'_train.cat','w') - for i in tr_rx: - train_fout.write('%10i %10.8f ' % (all_id[i], all_tz[i])) - for f in range(9): - train_fout.write('%9.6f %9.6f ' % (all_train_m[i, f], all_train_me[i, f])) - for c in range(8): - train_fout.write('%9.6f %9.6f ' % (all_train_c[i, c], all_train_ce[i, c])) - train_fout.write('\n') - train_fout.close() - del tr_rx,train_fout + train_cat = os.path.join(self.scratch_dir, "run_train.cat") + with open(train_cat, "w") as train_fout: + for i in tr_rx: + train_fout.write("%10i %10.8f " % (all_id[i], all_tz[i])) + for f in range(9): + train_fout.write("%9.6f %9.6f " % (all_train_m[i, f], all_train_me[i, f])) + for c in range(8): + train_fout.write("%9.6f %9.6f " % (all_train_c[i, c], all_train_ce[i, c])) + train_fout.write("\n") + del tr_rx if verbose: - print('Wrote ', outDir+'run_'+runid+'_test.cat' + outDir + 'run_'+runid+'_train.cat') - print('Finished cmnn_catalog.make_test_and_train: ',datetime.datetime.now()) - train_cat = outDir + 'run_'+runid + '_train.cat' - test_cat = outDir + 'run_'+runid + '_test.cat' - return train_cat, test_cat - - - - def get_coaddM5(self, 
outDir='./outDir', colname=['observationStartMJD', 'filter', 'fiveSigmaDepth'], - nside=16, opsdb = '/drive/Shareddrives/MAF/TheLastMetric/baseline_v2.0_10yrs.db' - ): - - """run a ExGalCoadd to get the median of M5""" - resultsDb = db.ResultsDb(out_dir=outDir) - # metric, slicer, constraint - metric = ExGalCoaddM5Metric( colname=colname, ) - - slicer = slicers.HealpixSlicer(nside=nside) - - # bundle - metricSky = metricBundles.MetricBundle(metric, slicer, sqlstr) - - # group bundle - bundleDict = {'metricSky':metricSky} - - # table names='observations' for v2.0 - #if 'observations' in opsdb.get_table_names(): - # dbTable='observations' - #else: - # dbTable = 'SummaryAllProps' - - dbTable = 'observations' + print(f"Wrote {test_cat} and {train_cat}") + print("Finished cmnn_catalog.make_test_and_train: ", datetime.datetime.now()) - #group = metricBundles.MetricBundleGroup(bundleDict, opsdb, - # outDir=outDir, - # resultsDb=resultsDb, - # dbTable=dbTable ) - - group = metricBundles.MetricBundleGroup(bundleDict, opsdb, - out_dir=outDir, - results_db=resultsDb, - db_table=dbTable ) - - group.run_all() - - data = metricSky.metric_values.data[ ~metricSky.metric_values.mask] - df = pd.DataFrame.from_records(data) - coaddM5 = df[df['ebv']<0.2].dropna().median()[['coaddm5_u', 'coaddm5_g','coaddm5_r', - 'coaddm5_i', 'coaddm5_z','coaddm5_y',]].values - - return coaddM5 + return train_cat, test_cat + def get_median_coaddM5(self, dataSlice): + """Run ExgalM5WithCuts over the sky, return median per filter.""" + lsst_filters = ["u", "g", "r", "i", "z", "y"] + metric = ExgalM5WithCuts(m5_col=self.m5_col, filter_col=self.filter_col, lsst_filter=lsst_filters) + slicer = HealpixSlicer(nside=64, use_cache=False) + exgal_bundle = MetricBundle(metric=metric, slicer=slicer, constraint="") + + g = MetricBundleGroup( + {"exgal": exgal_bundle}, + db_con=None, + out_dir=".", + save_early=False, + results_db=None, + verbose=False, + ) + g.run_current(constraint="", sim_data=dataSlice) + + median_coadd_depths = {} + for i, f in enumerate(lsst_filters): + median_coadd_depths[f] = np.median(exgal_bundle.metric_values[:, i].compressed()) + + return median_coadd_depths def run(self, dataSlice, slice_point=None): - # sort the dataSlice in order of time. 
- # slicePoint {'ra':, 'dec':, 'sid':} - #dataSlice.sort(order='observationStartMJD') - #result = {} - #result['fieldRA'] = slicePoint['fieldRA'] - #print(slicePoint) - #slice_point['Nv'] = len(dataSlice) - - #----------------- - # load catalog data - - #input_data_dir = '/drive/Shareddrives/MAF/TheLastMetric/' - - #names_z=('ID', 'z_true', 'z_phot', 'dz_phot', 'NN', 'N_train') - - #names_phot=('ID', 'z_true', - # 'u', 'err_u', 'g', 'err_g', 'r', 'err_r', 'i', 'err_i', 'z', 'err_z', 'y', 'err_y', - # 'u-g', 'err_u-g', 'g-r', 'err_g-r', 'r-i', 'err_r-i', 'i-z', 'err_i-z', 'z-y', 'err_z-y') - - #z_cat = Table.read(self.input_data_dir + 'zphot.cat', - # format='ascii', - # names=names_z) - - #phot_cat = Table.read(self.input_data_dir + 'test.cat', - # format='ascii', - # names=names_phot) - - #phot_cat = Table.from_pandas(phot_cat.to_pandas().dropna()) - - #cat = phot_cat.to_pandas().merge(z_cat.to_pandas()) - - ## --------cut by median m5-------------- - - # ----------------------- - # get M5 median - coaddM5 = self.get_coaddM5(outDir = self.outDir, colname = self.colname, - nside = self.nside, opsdb=self.opsdb) + # Get median m5 depths across the DESC footprint + median_coadd_depths = self.get_median_coaddM5(dataSlice) + coaddM5 = list(median_coadd_depths.values()) # append cut for J, H, K used in make_test_and_train coaddM5 = np.append(coaddM5, [30, 30, 30]) - print('coaddM5', coaddM5) - _, test_cat_file = self.make_test_and_train(runid='1', catalog=self.catalog, - train_mcut=coaddM5, - test_mcut=coaddM5) - - names_phot=('ID', 'z_true', - 'u', 'err_u', 'g', 'err_g', 'r', 'err_r', 'i', 'err_i', 'z', 'err_z', 'y', 'err_y', - 'u-g', 'err_u-g', 'g-r', 'err_g-r', 'r-i', 'err_r-i', 'i-z', 'err_i-z', 'z-y', 'err_z-y') - - usecols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, #14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, #30, 31, 32, 33, 34, 35 - ] + _, test_cat_file = self.make_test_and_train(train_mcut=coaddM5, test_mcut=coaddM5) + if test_cat_file is None: + return self.badval + + names_phot = ( + "ID", + "z_true", + "u", + "err_u", + "g", + "err_g", + "r", + "err_r", + "i", + "err_i", + "z", + "err_z", + "y", + "err_y", + "u-g", + "err_u-g", + "g-r", + "err_g-r", + "r-i", + "err_r-i", + "i-z", + "err_i-z", + "z-y", + "err_z-y", + ) + + usecols = [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, # 14, 15, 16, 17, 18, 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, # 30, 31, 32, 33, 34, 35 + ] df_cat = pd.read_csv(test_cat_file, delim_whitespace=True, names=names_phot, usecols=usecols).dropna() - print('loaded df_cat') + print("loaded df_cat") data_columns = ["z_true"] - drop_cols = ['ID', 'z_true', 'u', 'g', 'i', 'z', 'y', - 'err_u', 'err_g', 'err_r', 'err_i', 'err_z', 'err_y', - 'err_u-g', 'err_g-r', 'err_r-i', 'err_i-z', 'err_z-y'] - - conditional_columns = df_cat.drop(drop_cols, axis = 1) + drop_cols = [ + "ID", + "z_true", + "u", + "g", + "i", + "z", + "y", + "err_u", + "err_g", + "err_r", + "err_i", + "err_z", + "err_y", + "err_u-g", + "err_g-r", + "err_r-i", + "err_i-z", + "err_z-y", + ] + + conditional_columns = df_cat.drop(drop_cols, axis=1) # data_columns: z_true # conditional_columns: r, u-g, g-r, r-i, i-z, z-y - if self.flowfile!=None: + if self.flowfile is not None: flow = FlowEnsemble(file=self.flowfile) else: # train flow model ndcol = len(data_columns) - ncond = len(conditional_columns) - ndim = ndcol+ncond K = 16 bijector = Chain( StandardScaler(np.atleast_1d(1.6), np.atleast_1d(0.32)), - NeuralSplineCoupling(B=5, 
n_conditions=6, K=K) - ) + NeuralSplineCoupling(B=5, n_conditions=6, K=K), + ) - latent = Uniform(input_dim=ndcol, B=7) # this has changed. + latent = Uniform(input_dim=ndcol, B=7) info = f"Models z_true conditioned on galaxy colors and r mag from {test_cat_file}. K = 16" - flow = FlowEnsemble(data_columns = data_columns, - conditional_columns = conditional_columns, - bijector = bijector, - latent = latent, - info = info, - N = 10 - ) - - loss = flow.train(df_cat, #conditional_columns], - convolve_errs=False, - epochs=150, verbose=True); + flow = FlowEnsemble( + data_columns=data_columns, + conditional_columns=conditional_columns, + bijector=bijector, + latent=latent, + info=info, + N=10, + ) - #IDS = phot_cat['ID'] - #z_cat_no_nan = z_cat.to_pandas()[z_cat.to_pandas()['ID'].isin(IDS)] + _ = flow.train(df_cat, convolve_errs=False, epochs=150, verbose=True) - b = sps.mstats.mquantiles(df_cat['z_true'], np.linspace(0,1, 101, endpoint=True)) - # print(len(z_cats_no_nan[which_os]['z_true'])) - # print(b) - - #b_centers = 0.5*(b[1:] + b[:-1]) + b = sps.mstats.mquantiles(df_cat["z_true"], np.linspace(0, 1, 101, endpoint=True)) # Computing the entropy H(z) - pz = sps.rv_histogram(np.histogram(df_cat['z_true'], bins=b)) + pz = sps.rv_histogram(np.histogram(df_cat["z_true"], bins=b)) entropy = pz.entropy() # mutual information lower bound - milb = flow.log_prob(df_cat, returnEnsemble=True, err_samples=10)# + entropy + milb = flow.log_prob(df_cat, returnEnsemble=True, err_samples=10) tlm = milb.mean(axis=0) + entropy - print("calculated tlm") - - #with open('test_tlm.pkl', 'wb') as outfile: - # pickle.dump(tlm, outfile) - - return { #'dataSlice': dataSlice, - 'entropy': entropy, - 'milb': milb, - 'tlm': tlm - } - - def reduce_tlm(self, metric): - """get number of visits""" - return metric['tlm'].mean() - - - -class ExGalCoaddM5Metric(metrics.BaseMetric): + # These should all be saved into npz file although format + # may have trouble - let's see, if the rest works. + return {"entropy": entropy, "milb": milb, "tlm": tlm} + + def reduce_tlm_mean(self, metric): + """Return mean tlm as single number - will save into resultsDb""" + return metric["tlm"].mean() + + +# I will move this over to the "batches" directory -- it'll probably +# be incorporated with some other batches, but this is how we specify +# how to run a given metric +def theLastMetricBatch( + colmap=None, + run_name="opsim", + extra_sql=None, + extra_info_label=None, + display_group="Cosmology", + subgroup="TheLastMetric", + out_dir=".", +): + """Set up TheLastMetric + within a night. + + Parameters + ---------- + colmap : `dict` or None, optional + A dictionary with a mapping of column names. + run_name : `str`, optional + The name of the simulated survey. + extra_sql : `str` or None, optional + Additional sql constraint to apply to all metrics. + extra_info_label : `str` or None, optional + Additional info_label to apply to all results. + display_group : `str` or None, optional + In show_maf pages, the division where the metric will appear. + subgroup : `str` or None, optional + In show_maf pages, the section within the division for the metrics. 
+ out_dir : `str` + To be used (temporarily) for the scratch directory for the metric + BUT the metric should just pass back the test and train catalogs + from the function "make_test_and_train" + instead of writing and then reading them from disk + + Returns + ------- + metric_bundleDict : `dict` of `maf.MetricBundle` """ - Calculate coadd M5 for all filters used to cut simulated catalog in theLastMetric - Parameters: - colname: list, ['observationStartMJD', 'filter', 'fiveSigmaDepth'] - nside: nside of healpix - - Returns: - return coaddedM5 # if Ebv<0.2 mag - - """ - - def __init__(self, colname=['observationStartMJD', 'fieldRA', - 'fieldDec', 'filter', 'fiveSigmaDepth'], - maps=['DustMap'], - **kwargs): - self.colname = colname - #self.nside = nside - - super().__init__(col=self.colname, maps=['DustMap'], **kwargs) - - def run(self, data_slice, slice_point=None): - # sort the dataSlice in order of time. - # slicePoint {'ra':, 'dec':, 'sid':} - data_slice.sort(order='observationStartMJD') - #result = {} - #result['fieldRA'] = slicePoint['fieldRA'] - #print(slicePoint) - slice_point['Nv'] = len(data_slice) - - for f in np.unique(data_slice['filter']): - data_slice_f = data_slice[data_slice['filter']==f] - slice_point[f'Nv_{f}'] = len(data_slice_f) - slice_point[f'coaddm5_{f}'] = metrics.Coaddm5Metric().run(data_slice_f) - - return slice_point - -# def reduce_m5(self, metric): -# """return coaddM5""" -# return metric['coaddm5'] - - def reduce_ebv(self, metric): - return metric['ebv'] - - - -if __name__=='__main__': - - outDir = './outDir' - # outDir = '/drive/Shareddrives/MAF/TheLastMetric/outDir' - resultsDb = db.ResultsDb(out_dir=outDir) - - colname = ['observationStartMJD', 'filter', 'fiveSigmaDepth'] - sqlstr = 'night<400' - - nside = 16 - - opsdb = '/drive/Shareddrives/MAF/TheLastMetric/baseline_v2.0_10yrs.db' - # opsdb = './baseline_v2.0_10yrs.db' - #opsdb = db.Database(dbpath_v30+'baseline_v3.0_10yrs.db') - - # metric, slicer, constraint - metric = TheLastMetric( colname=colname, nside=nside, opsdb=opsdb) - - #slicer = slicers.HealpixSlicer(nside=nside) - - slicer = slicers.UniSlicer() - # bundle - metricSky = metricBundles.MetricBundle(metric, slicer, sqlstr) - - # group bundle - bundleDict = {'metricSky':metricSky} - - # table names='observations' for v2.0 - #if 'observations' in opsdb.get_table_names(): - # dbTable='observations' - #else: - # dbTable = 'SummaryAllProps' - - dbTable = 'observations' - - #group = metricBundles.MetricBundleGroup(bundleDict, opsdb, - # outDir=outDir, - # resultsDb=resultsDb, - # dbTable=dbTable ) - - group = metricBundles.MetricBundleGroup(bundleDict, opsdb, - out_dir=outDir, - results_db=resultsDb, - db_table=dbTable ) - - group.run_all() + if colmap is None: + colmap = col_map_dict() + + info_label = extra_info_label + if extra_sql is not None and len(extra_sql) > 0: + if info_label is None: + info_label = extra_sql + + bundleList = [] + + display_dict = { + "group": display_group, + "subgroup": subgroup, + "caption": None, + "order": 0, + } + + metric = TheLastMetric( + m5_col=colmap["fiveSigmaDepth"], + filter_col=colmap["filter"], + additional_cols=[colmap["ra"], colmap["dec"], "rotSkyPos"], + scratch_dir=out_dir, + ) + slicer = UniSlicer() + + display_dict["caption"] = ( + "The mean value of TheLastMetric, calculated using the median " + "extinction-corrected m5 depths over the DESC footprint." 
+ ) + bundle = MetricBundle( + metric, + slicer, + extra_sql, + info_label=info_label, + display_dict=display_dict, + run_name=run_name, + ) + bundleList.append(bundle) + + return bundleList diff --git a/rubin_sim/maf/metrics/exgal_m5.py b/rubin_sim/maf/metrics/exgal_m5.py index d34af24e..769a2917 100644 --- a/rubin_sim/maf/metrics/exgal_m5.py +++ b/rubin_sim/maf/metrics/exgal_m5.py @@ -1,5 +1,7 @@ __all__ = ("ExgalM5",) +import warnings + from rubin_sim.phot_utils import DustValues from .base_metric import BaseMetric @@ -48,6 +50,8 @@ def run(self, data_slice, slice_point): """Compute the co-added m5 depth and then apply dust extinction to that magnitude. """ + if len(set(data_slice[self.filter_col])) > 1: + warnings.warn("Called coadd m5 with more than one filter in data.") m5 = self.coaddm5_metric.run(data_slice) if m5 == self.coaddm5_metric.badval: return self.badval diff --git a/rubin_sim/maf/metrics/weak_lensing_systematics_metric.py b/rubin_sim/maf/metrics/weak_lensing_systematics_metric.py index 3accd323..ebbcebaa 100644 --- a/rubin_sim/maf/metrics/weak_lensing_systematics_metric.py +++ b/rubin_sim/maf/metrics/weak_lensing_systematics_metric.py @@ -8,16 +8,37 @@ class ExgalM5WithCuts(BaseMetric): """ - Calculate co-added five-sigma limiting depth, but apply dust extinction and depth cuts. - This means that places on the sky that don't meet the dust extinction, coadded depth, or filter coverage - cuts will have masked values on those places. + Calculate co-added five-sigma limiting depth, + but apply dust extinction and depth cuts. + This means that places on the sky that don't meet the dust extinction, + coadded depth, or filter coverage cuts will have masked values + on those places, providing a way to limit the footprint to extragalactic + useful areas. - This metric is useful for DESC static science and weak lensing metrics. - In particular, it is required as input for the StaticProbesFoMEmulatorMetricSimple - (a summary metric to emulate a 3x2pt FOM). - - Note: this metric calculates the depth after dust extinction in band 'lsst_filter', but because - it looks for coverage in all bands, there should generally be no filter-constraint on the sql query. + Parameters + ---------- + m5_col : `str` + Name of the fiveSigmaDepth column. + filter_col : `str` + Name of the filter column. + metric_name : `str` + Name for the metric. + units : `str` + Units for the metric (for plot labels). + lsst_filter : `str` or `list` [`str`] + The filter or filters in which to calculate dust-extincted m5 values. + extinction_cut : `float` + The maximum E(B-V) value to allow. + depth_cut : `float` + Minimum coadded depth to allow. + n_filters : `int` + Number of filters required in coverage. + + Notes + ----- + This metric calculates the depth after dust extinction in band(s) + 'lsst_filter', but because it looks for coverage in all bands, there + should generally be no filter-constraint on the sql query. 
""" def __init__( @@ -38,7 +59,8 @@ def __init__( self.depth_cut = depth_cut self.n_filters = n_filters self.lsst_filter = lsst_filter - # I thought about inheriting from ExGalM5 instead, but the columns specification is more complicated + # I thought about inheriting from ExGalM5 instead, + # but the columns specification is more complicated self.exgal_m5 = ExgalM5(m5_col=m5_col, units=units) super().__init__( col=[self.m5_col, self.filter_col], @@ -47,37 +69,53 @@ def __init__( maps=self.exgal_m5.maps, **kwargs, ) + self.shape = len(self.lsst_filter) def run(self, data_slice, slice_point): # exclude areas with high extinction if slice_point["ebv"] > self.extinction_cut: return self.badval - # check to make sure there is at least some coverage in the required number of bands + # check to make sure there is coverage in the required number of bands n_filters = len(set(data_slice[self.filter_col])) if n_filters < self.n_filters: return self.badval - # if coverage and dust criteria are valid, move forward with only lsstFilter-band visits - d_s = data_slice[data_slice[self.filter_col] == self.lsst_filter] - # calculate the lsstFilter-band coadded depth - dustdepth = self.exgal_m5.run(d_s, slice_point) - - # exclude areas that are shallower than the depth cut - if dustdepth < self.depth_cut: - return self.badval + # if coverage and dust criteria are valid, + # move forward with calculating depth in lsstFilter-band visits + dustdepths = [] + if not isinstance(self.lsst_filter, list): + lsst_filter = [self.lsst_filter] else: - return dustdepth + lsst_filter = self.lsst_filter + + for f in lsst_filter: + d_s = data_slice[data_slice[self.filter_col] == f] + # calculate the lsstFilter-band coadded depth + dustdepth = self.exgal_m5.run(d_s, slice_point) + # exclude areas that are shallower than the depth cut + if dustdepth < self.depth_cut: + return self.badval + else: + dustdepths.append(dustdepth) + + # Return a single number, not a list, if only asked for single filter + if len(dustdepths) == 1: + dustdepths = dustdepths[0] + + return dustdepths class WeakLensingNvisits(BaseMetric): - """A proxy metric for WL systematics. Higher values indicate better systematics mitigation. + """A proxy metric for WL systematics. + Higher values indicate better systematics mitigation. - Weak Lensing systematics metric : Computes the average number of visits per point on a HEALPix grid + Weak Lensing systematics metric : Computes the average number of + visits per point on a HEALPix grid after a maximum E(B-V) cut and a minimum co-added depth cut. - Intended to be used to count visits in gri, but can be any filter combination as long as it + Intended to be used to count visits in gri, but can be any filter + combination as long as it includes `lsst_filter` band visits. - """ def __init__( @@ -130,34 +168,34 @@ class RIZDetectionCoaddExposureTime(BaseMetric): circular (and thus confuses MRB's feeble mind :/). TODO maybe: - - apply some sort of inverse variance weighting to the coadd based on sky - level? - - use some sort of effective exposure time that accounts for the PSF? - - @rhiannonlynne nicely suggested this Teff computation: - rubin_sim/rubin_sim/maf/metrics/technicalMetrics.py + - apply some sort of inverse variance weighting to the coadd based on sky + level? + - use some sort of effective exposure time that accounts for the PSF? 
+ - @rhiannonlynne nicely suggested this Teff computation: + rubin_sim/rubin_sim/maf/metrics/technicalMetrics.py However, given the unknown nature of detection will look like in LSST, a simple sum of exposure time is probably ok. Parameters ---------- - bins : list of float + bins : `list` [`float`] The bin edges. Typically this will be a list of nights for which to compute the riz coadd exposure times. - bin_col : str, optional + bin_col : `str`, optional The column to bin on. The default is 'night'. - exp_time_col : str, optional + exp_time_col : `str`, optional The column name for the exposure time. - filter_col : str, optional + filter_col : `str`, optional The column name for the filter name. - ebvlim : float, optional + ebvlim : `float`, optional The upper limit on E(B-V). Regions with E(B-V) greater than this limit are excluded. - min_exp_time : float, optional + min_exp_time : `float`, optional The minimal exposure time for a visit to contribute to a coadd. - det_bands : list of str, optional + det_bands : `list` [`str`], optional If not None, the bands to use for detection. If None, defaults to riz. - min_bands : list of str, optional + min_bands : `list` [`str`], optional If not None, the bands whose presence is used to cut the survey data. If None, defaults to ugrizY. """ From f4f615cc877b4c33f4172cfaba7795db47466122 Mon Sep 17 00:00:00 2001 From: Lynne Jones Date: Thu, 17 Oct 2024 21:23:33 -0700 Subject: [PATCH 3/3] Modify depth cut in ExgalM5WithCuts --- rubin_sim/maf/maf_contrib/the_last_metric.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rubin_sim/maf/maf_contrib/the_last_metric.py b/rubin_sim/maf/maf_contrib/the_last_metric.py index d4f29a00..687299e0 100644 --- a/rubin_sim/maf/maf_contrib/the_last_metric.py +++ b/rubin_sim/maf/maf_contrib/the_last_metric.py @@ -347,7 +347,8 @@ def make_test_and_train( def get_median_coaddM5(self, dataSlice): """Run ExgalM5WithCuts over the sky, return median per filter.""" lsst_filters = ["u", "g", "r", "i", "z", "y"] - metric = ExgalM5WithCuts(m5_col=self.m5_col, filter_col=self.filter_col, lsst_filter=lsst_filters) + metric = ExgalM5WithCuts(m5_col=self.m5_col, filter_col=self.filter_col, + depth_cut=20, lsst_filter=lsst_filters) slicer = HealpixSlicer(nside=64, use_cache=False) exgal_bundle = MetricBundle(metric=metric, slicer=slicer, constraint="")
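
For reference, a minimal usage sketch of the batch interface introduced above. It assumes
theLastMetricBatch is importable from rubin_sim.maf.maf_contrib (as exported in patch 2/3)
and that a baseline opsim database is available locally; the database path, run name, and
output directory below are placeholders.

    import rubin_sim.maf.db as db
    import rubin_sim.maf.metric_bundles as metric_bundles
    from rubin_sim.maf.maf_contrib import theLastMetricBatch

    out_dir = "./tlm_out"                  # placeholder output/scratch directory
    opsdb = "./baseline_v2.0_10yrs.db"     # placeholder opsim database path
    results_db = db.ResultsDb(out_dir=out_dir)

    # theLastMetricBatch returns a list of MetricBundles (here a single
    # TheLastMetric + UniSlicer bundle); key them by position for the group.
    bundle_list = theLastMetricBatch(run_name="baseline_v2.0_10yrs", out_dir=out_dir)
    bundle_dict = {f"tlm_{i}": b for i, b in enumerate(bundle_list)}

    group = metric_bundles.MetricBundleGroup(
        bundle_dict,
        opsdb,
        out_dir=out_dir,
        results_db=results_db,
        db_table="observations",
    )
    group.run_all()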