From bf28785160e45a50f8f3cf33d2eab49e41c2502a Mon Sep 17 00:00:00 2001
From: justin-richling
Date: Tue, 24 Sep 2024 12:10:25 -0600
Subject: [PATCH 1/2] Add global attrs to all ADF generated files

This will add attributes for:
- timeseries files: ADF user, history files location, and history file name(s)
- climo files: ADF user, climo years, and time series file name(s)
- regridded files: ADF user, climo years, and climo file name(s)
---
 lib/adf_diag.py                              | 56 +++++++++++++++++---
 lib/adf_info.py                              | 12 ++++-
 scripts/averaging/create_climo_files.py      | 17 ++++--
 scripts/regridding/regrid_and_vert_interp.py | 40 +++++++++++++-
 4 files changed, 114 insertions(+), 11 deletions(-)

diff --git a/lib/adf_diag.py b/lib/adf_diag.py
index bf87cb498..de4d3228d 100644
--- a/lib/adf_diag.py
+++ b/lib/adf_diag.py
@@ -514,6 +514,8 @@ def call_ncrcat(cmd):

             # Loop over CAM history variables:
             list_of_commands = []
+            list_of_ncattend_commands = []
+            list_of_hist_commands = []
             vars_to_derive = []
             # create copy of var list that can be modified for derivable variables
             diag_var_list = self.diag_var_list
@@ -691,20 +693,62 @@ def call_ncrcat(cmd):
                    + ["-o", ts_outfil_str]
                )

+                # Example ncatted command (you can modify it with the specific attribute changes you need)
+                #cmd_ncatted = ["ncatted", "-O", "-a", f"adf_user,global,a,c,{self.user}", ts_outfil_str]
+                # Step 1: Convert Path objects to strings and concatenate the list of history files into a single string
+                hist_files_str = ', '.join(str(f.name) for f in hist_files)
+                #3parent
+                #hist_locs = []
+                #for f in hist_files:
+                hist_locs_str = ', '.join(str(loc) for loc in cam_hist_locs)
+
+                # Step 2: Create the ncatted command to add the global attributes
+                cmd_ncatted = [
+                    "ncatted", "-O",
+                    "-a", "adf_user,global,a,c," + f"{self.user}",
+                    "-a", "hist_file_locs,global,a,c," + f"{hist_locs_str}",
+                    "-a", "hist_file_list,global,a,c," + f"{hist_files_str}",
+                    ts_outfil_str
+                ]
+
+                # Step 3: Create the ncatted command to remove the history attribute
+                cmd_remove_history = [
+                    "ncatted", "-O", "-h",
+                    "-a", "history,global,d,,",
+                    ts_outfil_str
+                ]
+
                 # Add to command list for use in multi-processing pool:
+                # -----------------------------------------------------
+                # generate time series files
                 list_of_commands.append(cmd)
+                # Add global attributes: user, original hist file loc(s) and all filenames
+                list_of_ncattend_commands.append(cmd_ncatted)
+                # Remove the `history` attr that gets tacked on (for clean up)
+                # NOTE: this may not be best practice, but the history attr repeats
+                #       the file attrs, so the global attrs become obtrusive...
+                list_of_hist_commands.append(cmd_remove_history)

             # End variable loop

             # Now run the "ncrcat" subprocesses in parallel:
             with mp.Pool(processes=self.num_procs) as mpool:
                 _ = mpool.map(call_ncrcat, list_of_commands)
+            # End with

-            if vars_to_derive:
-                self.derive_variables(
-                    res=res, hist_str=hist_str, vars_to_derive=vars_to_derive,
-                    constit_dict=constit_dict, ts_dir=ts_dir[case_idx]
-                )
+            # Run ncatted commands after ncrcat is done
+            with mp.Pool(processes=self.num_procs) as mpool:
+                _ = mpool.map(call_ncrcat, list_of_ncattend_commands)
+
+            # Run ncatted command to remove history attribute after the global attributes are set
+            with mp.Pool(processes=self.num_procs) as mpool:
+                _ = mpool.map(call_ncrcat, list_of_hist_commands)
+
+            if vars_to_derive:
+                self.derive_variables(
+                    res=res, hist_str=hist_str, vars_to_derive=vars_to_derive,
+                    constit_dict=constit_dict, ts_dir=ts_dir[case_idx]
+                )
             # End with
         # End for hist_str
     # End cases loop
@@ -1481,4 +1525,4 @@ def my_formatwarning(msg, *args, **kwargs):
         return xr.open_dataset(fils[0])
     #End if
 # End def
-########
+########
\ No newline at end of file

diff --git a/lib/adf_info.py b/lib/adf_info.py
index f5b667b09..e8f285f3c 100644
--- a/lib/adf_info.py
+++ b/lib/adf_info.py
@@ -31,6 +31,7 @@
 from pathlib import Path
 import copy
 import os
+import getpass

 #+++++++++++++++++++++++++++++++++++++++++++++++++
 #import non-standard python modules, including ADF
@@ -94,6 +95,9 @@ def __init__(self, config_file, debug=False):
             self.expand_references(self.__mdtf_info)
         # End if

+        # Get the current system user
+        self.__user = getpass.getuser()
+
         # Check if inputs are of the correct type:
         # -------------------------------------------
@@ -569,6 +573,12 @@ def hist_str_to_list(self, conf_var, conf_val):

     #########

+    # Create property needed to return "user" name to user:
+    @property
+    def user(self):
+        """Return the "user" name if requested."""
+        return self.__user
+
     # Create property needed to return "compare_obs" logical to user:
     @property
     def compare_obs(self):
@@ -832,4 +842,4 @@ def get_climo_yrs_from_ts(self, input_ts_loc, case_name):

 #++++++++++++++++++++
 #End Class definition
-#++++++++++++++++++++
+#++++++++++++++++++++
\ No newline at end of file

diff --git a/scripts/averaging/create_climo_files.py b/scripts/averaging/create_climo_files.py
index 11844e189..d90bfbe52 100644
--- a/scripts/averaging/create_climo_files.py
+++ b/scripts/averaging/create_climo_files.py
@@ -178,7 +178,7 @@ def create_climo_files(adf, clobber=False, search=None):
                     warnings.warn(errmsg)
                     continue

-                list_of_arguments.append((ts_files, syr, eyr, output_file))
+                list_of_arguments.append((adf, ts_files, syr, eyr, output_file))

             #End of var_list loop

@@ -198,7 +198,7 @@ def create_climo_files(adf, clobber=False, search=None):
 #
 # Local functions
 #
-def process_variable(ts_files, syr, eyr, output_file):
+def process_variable(adf, ts_files, syr, eyr, output_file):
     '''
     Compute and save the climatology file.
     '''
@@ -227,6 +227,17 @@ def process_variable(ts_files, syr, eyr, output_file):
     enc_c = {xname: {'_FillValue': None} for xname in cam_climo_data.coords}
     enc = {**enc_c, **enc_dv}

+    # Create a dictionary of attributes
+    # Convert the list to a string (join with commas)
+    ts_files_str = [str(path) for path in ts_files]
+    ts_files_str = ', '.join(ts_files_str)
+    attrs_dict = {
+        "adf_user": adf.user,
+        "climo_yrs": f"{syr}-{eyr}",
+        "time_series_files": ts_files_str,
+    }
+    cam_climo_data = cam_climo_data.assign_attrs(attrs_dict)
+
     #Output variable climatology to NetCDF-4 file:
     cam_climo_data.to_netcdf(output_file, format='NETCDF4', encoding=enc)
     return 1  # All funcs return something. Could do error checking with this if needed.
@@ -273,4 +284,4 @@ def check_averaging_interval(syear_in, eyear_in):
     else:
         eyr = None
     #End if
-    return syr, eyr
+    return syr, eyr
\ No newline at end of file

diff --git a/scripts/regridding/regrid_and_vert_interp.py b/scripts/regridding/regrid_and_vert_interp.py
index a501de16d..fab356b0e 100644
--- a/scripts/regridding/regrid_and_vert_interp.py
+++ b/scripts/regridding/regrid_and_vert_interp.py
@@ -59,6 +59,10 @@ def regrid_and_vert_interp(adf):
     case_names = adf.get_cam_info("cam_case_name", required=True)
     input_climo_locs = adf.get_cam_info("cam_climo_loc", required=True)

+    #Grab case years
+    syear_cases = adf.climo_yrs["syears"]
+    eyear_cases = adf.climo_yrs["eyears"]
+
     #Check if mid-level pressure, ocean fraction or land fraction exist
     #in the variable list:
     for var in ["PMID", "OCNFRAC", "LANDFRAC"]:
@@ -91,6 +95,9 @@ def regrid_and_vert_interp(adf):

     #Regrid target variables (either obs or a baseline run):
     if adf.compare_obs:
+        #Set obs name to match baseline (non-obs)
+        target_list = ["Obs"]
+
         #Extract variable-obs dictionary:
         var_obs_dict = adf.var_obs_dict
@@ -108,6 +115,13 @@ def regrid_and_vert_interp(adf):
         target_list = [adf.get_baseline_info("cam_case_name", required=True)]
     #End if

+    #Grab baseline years (which may be empty strings if using Obs):
+    syear_baseline = adf.climo_yrs["syear_baseline"]
+    eyear_baseline = adf.climo_yrs["eyear_baseline"]
+
+    #Set climo-years attribute string to save in the file attributes
+    base_climo_yrs_attr = f"{target_list[0]}: {syear_baseline}-{eyear_baseline}"
+
     #-----------------------------------------

     #Set output/target data path variables:
@@ -137,6 +151,10 @@ def regrid_and_vert_interp(adf):
         ps_loc_dict = {}
         pmid_loc_dict = {}

+        #Get climo years for case
+        syear = syear_cases[case_idx]
+        eyear = eyear_cases[case_idx]
+
         # probably want to do this one variable at a time:
         for var in var_list:

@@ -274,6 +292,15 @@ def regrid_and_vert_interp(adf):
                 #End if

                 #Finally, write re-gridded data to output file:
+                #Convert the list of Path objects to a list of strings
+                climatology_files_str = [str(path) for path in mclim_fils]
+                climatology_files_str = ', '.join(climatology_files_str)
+                test_attrs_dict = {
+                    "adf_user": adf.user,
+                    "climo_yrs": f"{case_name}: {syear}-{eyear}",
+                    "climatology_files": climatology_files_str,
+                }
+                rgdata_interp = rgdata_interp.assign_attrs(test_attrs_dict)
                 save_to_nc(rgdata_interp, regridded_file_loc)
                 rgdata_interp.close()  # bpm: we are completely done with this data

@@ -339,6 +366,17 @@ def regrid_and_vert_interp(adf):
                     #End if
                 #End if

+                # Convert the list to a string (join with commas or another separator)
+                climatology_files_str = [str(path) for path in tclim_fils]
+                climatology_files_str = ', '.join(climatology_files_str)
+                # Create a dictionary of attributes
+                base_attrs_dict = {
"adf_user": adf.user, + "climo_yrs": f"{case_name}: {syear}-{eyear}; {base_climo_yrs_attr}", + "climatology_files": climatology_files_str, + } + tgdata_interp = tgdata_interp.assign_attrs(base_attrs_dict) + #Write interpolated baseline climatology to file: save_to_nc(tgdata_interp, interp_bl_file) #End if @@ -658,4 +696,4 @@ def regrid_data(fromthis, tothis, method=1): return result #End if -##### +##### \ No newline at end of file From 167db5ea8b8cf4c88e10be34417d2746c83de3b5 Mon Sep 17 00:00:00 2001 From: justin-richling Date: Tue, 24 Sep 2024 12:21:31 -0600 Subject: [PATCH 2/2] github clean up --- lib/adf_diag.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/adf_diag.py b/lib/adf_diag.py index de4d3228d..27d7866f6 100644 --- a/lib/adf_diag.py +++ b/lib/adf_diag.py @@ -523,7 +523,8 @@ def call_ncrcat(cmd): # Aerosol Calcs # -------------- # Always make sure PMID is made if aerosols are desired in config file - # Since there's no requirement for `aerosol_zonal_list` to be included, allow it to be absent: + # Since there's no requirement for `aerosol_zonal_list` to be included, + # allow it to be absent: azl = res.get("aerosol_zonal_list", []) if "PMID" not in diag_var_list: @@ -569,7 +570,7 @@ def call_ncrcat(cmd): constit_list = vres["derivable_from_cam_chem"] if constit_list: if all(item in hist_file_ds.data_vars for item in constit_list): - # Set check to look for regular CAM constituents in variable defaults + # Set check to look for regular CAM constituents try_cam_constits = False derive = True msg = f"create time series for {case_name}:" @@ -608,12 +609,12 @@ def call_ncrcat(cmd): # Add constituent list to variable key in dictionary constit_dict[var] = constit_list continue - # Log if this variable can be derived but is missing list of constituents + # Log if variable can be derived but is missing list of constituents elif (derive) and (not constit_list): self.debug_log(constit_errmsg) continue - # Lastly, raise error if the variable is not a derived quanitity but is also not - # in the history file(s) + # Lastly, raise error if the variable is not a derived quanitity + # but is also not in the history file(s) else: msg = f"WARNING: {var} is not in the file {hist_files[0]} " msg += "nor can it be derived.\n" @@ -656,7 +657,7 @@ def call_ncrcat(cmd): if has_lev and vert_coord_type: # For now, only add these variables if using CAM: if "cam" in hist_str: - # PS might be in a different history file. If so, continue without error. + # PS might be in a different history file. If so, continue w/o error. ncrcat_var_list = ncrcat_var_list + ",hyam,hybm,hyai,hybi" if "PS" in hist_file_var_list: @@ -1377,7 +1378,6 @@ def move_tsfiles_for_mdtf(self, verbose): freq_string_options = ["month", "day", "6hr", "3hr", "1hr"] #values freq_string_dict = dict(zip(freq_string_cesm,freq_string_options)) #make dict - hist_str_list = self.get_cam_info("hist_str") case_names = self.get_cam_info("cam_case_name", required=True) var_list = self.diag_var_list @@ -1454,7 +1454,7 @@ def move_tsfiles_for_mdtf(self, verbose): continue freq = freq_string_dict.get(found_strings[0]) print(f"Translated {found_strings[0]} to {freq}") - + # # Destination file is MDTF directory and name structure #