update year handling logic

singularity-energy · Aug 21, 2024 · cef341e · cef341e
1 parent 442cf37
commit cef341e
Show file tree

Hide file tree

Showing 10 changed files with 77 additions and 60 deletions.
diff --git a/README.md b/README.md
@@ -57,7 +57,7 @@ Parts of the input data used for the Open Grid Emissions dataset is released by
 Updated datasets will also be published whenever a new version of the open-grid-emissions repository is released.
 
 ### Running the pipeline with early release data
-The OGE pipeline can be used to generate data using Early Release EIA data as soon as it is integrated into the PUDL nightly builds. In order to do that, `constants.latest_validated_year` must be changed to match `constants.current_early_release_year` before running the pipeline.
+The OGE pipeline can be used to generate data using Early Release EIA data as soon as it is integrated into the PUDL nightly builds. In order to do that, `constants.current_early_release_year` must be updated to the current early release year (such that `current_early_release_year` is 1 year greater than `latest_validated_year`). Early release data is typically available from EIA in June/July of the following year, and is integrated into PUDL shortly thereafter.
 
 In addition, you will need to download and use the pudl nightly build data until the data becomes available through a stable release. To do so, you need to set your `PUDL_BUILD` environment variable to "nightly". You can do this through the command line using `set PUDL_BUILD=nightly` (for Windows), or by adding the following to the `__init__.py` file in `src/oge`:
 ```python

diff --git a/notebooks/work_in_progress/sandbox.ipynb b/notebooks/work_in_progress/sandbox.ipynb
@@ -40,34 +40,6 @@
     "year = 2022\n",
     "path_prefix = f\"{year}/\""
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = pd.read_csv(results_folder(\"2023/plant_data/hourly/us_units/AZPS.csv\"))\n",
-    "data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "px.line(data, x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"plant_id_eia\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "px.line(data, x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"plant_id_eia\")"
-   ]
   }
  ],
  "metadata": {

diff --git a/src/oge/constants.py b/src/oge/constants.py
@@ -14,6 +14,11 @@
 latest_validated_year = 2022
 # current_early_release_year is the year for which non-final (early-release) data
 # is available from the EIA. This enables running the OGE pipeline for this year.
+# EIA-860ER data is generally available in June and EIA-923ER data is generally
+# available in July of the following year. This constant should not be updated to the
+# next year until ER data is available, so for part of the year,
+# latest_validated_year will equal current_early_release_year.
+# TODO: Change this to 2024 around July 2025 (check PUDL to see when integrated)
 current_early_release_year = 2023
 
 # specify the energy_source_codes that are considered clean/carbon-free

diff --git a/src/oge/data_pipeline.py b/src/oge/data_pipeline.py
@@ -30,6 +30,7 @@
 from oge.constants import (
     TIME_RESOLUTIONS,
     latest_validated_year,
+    current_early_release_year,
     earliest_hourly_data_year,
 )
 
@@ -161,7 +162,9 @@ def main(args):
     # integrated into pudl
     download_data.download_raw_eia860(year)
     # download eia860 from the most recent available year (the later of the latest
     # validated year and the current early release year) for subplant identification
-    download_data.download_raw_eia860(latest_validated_year)
+    download_data.download_raw_eia860(
+        max(latest_validated_year, current_early_release_year)
+    )
     download_data.download_raw_eia923(year)
 
     # 2. Identify subplants

diff --git a/src/oge/download_data.py b/src/oge/download_data.py
@@ -9,7 +9,7 @@
 
 from oge.filepaths import downloads_folder, data_folder, get_pudl_build_version
 from oge.logging_util import get_logger
-from oge.constants import current_early_release_year
+from oge.constants import current_early_release_year, latest_validated_year
 
 logger = get_logger(__name__)
 
@@ -341,9 +341,12 @@ def download_raw_eia923(year: int):
         download_raw_eia_906_920(year)
     else:
         os.makedirs(downloads_folder("eia923"), exist_ok=True)
-        url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip"
-        if year == current_early_release_year:
+        if (year == current_early_release_year) and (
+            current_early_release_year != latest_validated_year
+        ):
             url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}er.zip"
+        else:
+            url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip"
         archive_url = (
             f"https://www.eia.gov/electricity/data/eia923/archive/xls/f923_{year}.zip"
         )
@@ -402,9 +405,12 @@ def download_raw_eia860(year: int):
     if year < 2005:
         raise NotImplementedError(f"We haven't tested EIA-860 for '{year}'.")
     os.makedirs(downloads_folder("eia860"), exist_ok=True)
-    url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip"
-    if year == current_early_release_year:
+    if (year == current_early_release_year) and (
+        current_early_release_year != latest_validated_year
+    ):
         url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}ER.zip"
+    else:
+        url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip"
     archive_url = (
         f"https://www.eia.gov/electricity/data/eia860/archive/xls/eia860{year}.zip"
     )

diff --git a/src/oge/helpers.py b/src/oge/helpers.py
@@ -6,7 +6,11 @@
 from urllib3.exceptions import ReadTimeoutError
 
 from oge.column_checks import get_dtypes, apply_dtypes
-from oge.constants import earliest_data_year, latest_validated_year
+from oge.constants import (
+    earliest_data_year,
+    latest_validated_year,
+    current_early_release_year,
+)
 from oge.filepaths import reference_table_folder, outputs_folder
 
 import oge.load_data as load_data
@@ -385,7 +389,7 @@ def add_plant_operating_and_retirement_dates(df: pd.DataFrame) -> pd.DataFrame:
     generator_dates = load_data.load_pudl_table(
         "out_eia__yearly_generators",
         year=earliest_data_year,
-        end_year=latest_validated_year,
+        end_year=max(latest_validated_year, current_early_release_year),
         columns=[
             "plant_id_eia",
             "generator_id",
@@ -456,7 +460,7 @@ def add_plant_nameplate_capacity(year: int, df: pd.DataFrame) -> pd.DataFrame:
     generator_capacity = load_data.load_pudl_table(
         "core_eia860__scd_generators",
         year=earliest_data_year,
-        end_year=latest_validated_year,
+        end_year=max(latest_validated_year, current_early_release_year),
         columns=[
             "plant_id_eia",
             "generator_id",
@@ -687,7 +691,7 @@ def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame:
         columns=["plant_id_eia", "timezone"] + eia860_info,
     )
     plants_entity_from_eia860 = load_data.load_raw_eia860_plant_geographical_info(
-        latest_validated_year
+        max(latest_validated_year, current_early_release_year)
     )
     complete_plants_entity = plants_entity.merge(
         plants_entity_from_eia860,

diff --git a/src/oge/load_data.py b/src/oge/load_data.py
@@ -135,7 +135,9 @@ def load_cems_ids() -> pd.DataFrame:
     # duplicates before concatenating the next year to the dataframe
     cems_ids = []
     # The `constants.earliest_data_year` is 2005
-    for year in range(earliest_data_year, latest_validated_year + 1):
+    for year in range(
+        earliest_data_year, max(latest_validated_year, current_early_release_year) + 1
+    ):
         cems_id_year = pd.read_parquet(
             pudl_folder("core_epacems__hourly_emissions.parquet"),
             filters=[["year", "==", year]],
@@ -195,7 +197,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame:
     # this avoids using potentially preliminary early-release data
     complete_gens = complete_gens[
         (complete_gens["report_date"].dt.year >= earliest_data_year)
-        & (complete_gens["report_date"].dt.year <= latest_validated_year)
+        & (
+            complete_gens["report_date"].dt.year
+            <= max(latest_validated_year, current_early_release_year)
+        )
     ]
 
     # for any retired gens, forward fill the most recently available unit_id_pudl to
@@ -231,7 +236,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame:
     under_construction_status_codes = ["U", "V", "TS"]
     complete_gens = complete_gens[
         ~(
-            (complete_gens["report_date"].dt.year < latest_validated_year)
+            (
+                complete_gens["report_date"].dt.year
+                < max(latest_validated_year, current_early_release_year)
+            )
             & (
                 complete_gens["operational_status_code"].isin(
                     under_construction_status_codes
@@ -257,15 +265,18 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame:
         ~(
             (complete_gens["generator_operating_date"].isna())
             & (complete_gens["generator_retirement_date"].isna())
-            & (complete_gens["report_date"].dt.year < latest_validated_year)
+            & (
+                complete_gens["report_date"].dt.year
+                < max(latest_validated_year, current_early_release_year)
+            )
             & (complete_gens["operational_status_code"] != "TS")
         )
     ]
 
     ####################
     # merge into complete_gens and fill missing operating dates with the EIA-860 data
     generator_data_from_eia860 = load_raw_eia860_generator_dates_and_unit_ids(
-        latest_validated_year
+        max(latest_validated_year, current_early_release_year)
     )
     complete_gens = complete_gens.merge(
         generator_data_from_eia860,
@@ -307,11 +318,14 @@ def load_raw_eia860_plant_geographical_info(year: int) -> pd.DataFrame:
     """
     # load geographic information from the raw EIA-860 file to supplement missing
     # information from pudl
-    filepath = f"eia860/eia860{year}/2___Plant_Y{year}.xlsx"
-    header_row = 1
-    if year == current_early_release_year:
+    if (year == current_early_release_year) and (
+        current_early_release_year != latest_validated_year
+    ):
         filepath = f"eia860/eia860{year}ER/2___Plant_Y{year}_Early_Release.xlsx"
         header_row = 2
+    else:
+        filepath = f"eia860/eia860{year}/2___Plant_Y{year}.xlsx"
+        header_row = 1
     plant_geographical_eia860 = pd.read_excel(
         downloads_folder(filepath),
         header=header_row,
@@ -362,11 +376,14 @@ def load_raw_eia860_generator_dates_and_unit_ids(year: int) -> pd.DataFrame:
     """
     # load operating dates from the raw EIA-860 file to supplement missing operating
     # dates from pudl
-    filepath = f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx"
-    header_row = 1
-    if year == current_early_release_year:
+    if (year == current_early_release_year) and (
+        current_early_release_year != latest_validated_year
+    ):
         filepath = f"eia860/eia860{year}ER/3_1_Generator_Y{year}_Early_Release.xlsx"
         header_row = 2
+    else:
+        filepath = f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx"
+        header_row = 1
     generator_op_dates_eia860 = pd.read_excel(
         downloads_folder(filepath),
         header=header_row,
@@ -1168,7 +1185,7 @@ def load_emissions_controls_eia923(year: int) -> pd.DataFrame:
         )
 
     if year >= 2012:
-        if year < current_early_release_year:
+        if year <= latest_validated_year:
             # Handle filename changes across years.
             schedule_8_filename = {
                 2012: downloads_folder(
@@ -1206,7 +1223,9 @@ def load_emissions_controls_eia923(year: int) -> pd.DataFrame:
                 ),
             }[year]
             header_row = 4
-        elif year == current_early_release_year:
+        elif (year == current_early_release_year) and (
+            current_early_release_year != latest_validated_year
+        ):
             schedule_8_filename = downloads_folder(
                 f"eia923/f923_{year}er/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Early_Release.xlsx"
             )

diff --git a/src/oge/output_data.py b/src/oge/output_data.py
@@ -14,6 +14,7 @@
     earliest_validated_year,
     earliest_hourly_data_year,
     latest_validated_year,
+    current_early_release_year,
 )
 
 logger = get_logger(__name__)
@@ -135,7 +136,7 @@ def zip_results_for_s3():
         root_dir=data_folder(f"s3_upload/{year_range}_plant_attributes"),
     )
     shutil.rmtree(data_folder(f"s3_upload/{year_range}_plant_attributes"))
-    for year in range(2019, latest_validated_year + 1):
+    for year in range(2019, max(latest_validated_year, current_early_release_year) + 1):
         for data_type in ["power_sector_data", "carbon_accounting", "plant_data"]:
             for aggregation in ["hourly", "monthly", "annual"]:
                 for unit in ["metric_units", "us_units"]:

diff --git a/src/oge/subplant_identification.py b/src/oge/subplant_identification.py
@@ -5,7 +5,7 @@
 
 import oge.load_data as load_data
 import oge.validation as validation
-from oge.constants import latest_validated_year
+from oge.constants import latest_validated_year, current_early_release_year
 from oge.logging_util import get_logger
 
 logger = get_logger(__name__)
@@ -33,7 +33,9 @@ def generate_subplant_ids() -> pd.DataFrame:
     cems_ids = load_data.load_cems_ids()
 
     # load the crosswalk and filter it by the data that actually exists in cems
-    crosswalk = load_data.load_epa_eia_crosswalk(latest_validated_year)
+    crosswalk = load_data.load_epa_eia_crosswalk(
+        max(latest_validated_year, current_early_release_year)
+    )
 
     # filter the crosswalk to drop any units that don't exist in CEMS
     filtered_crosswalk = epacamd_eia.filter_crosswalk(crosswalk, cems_ids)
@@ -161,7 +163,8 @@ def generate_subplant_ids() -> pd.DataFrame:
 
     # validate that there are no orphaned combined cycle plant parts in a subplant
     validation.check_for_orphaned_cc_part_in_subplant(
-        subplant_crosswalk_complete, latest_validated_year
+        subplant_crosswalk_complete,
+        max(latest_validated_year, current_early_release_year),
     )
 
     return subplant_crosswalk_complete

diff --git a/src/oge/validation.py b/src/oge/validation.py
@@ -41,13 +41,17 @@ def validate_year(year):
     Input data for {end+1} should be available from the EIA in Fall {end+2} and we 
     will work to validate that the pipeline works with {end+1} data as soon as 
     possible after the data is released.
+
+    If you are looking to run the pipeline with Early Release data, check that
+    this data is available and integrated into PUDL, then update 
+    `constants.current_early_release_year`
     #########################################################################
     """
-    if year == current_early_release_year:
-        raise UserWarning(
-            "To run the pipeline with Early Release data, change `constants.latest_validated_year` to match `constants.current_early_release_year`"
+    if (year == current_early_release_year) and (year != latest_validated_year):
+        logger.warning(
+            f"Running pipeline with unvalidated Early Release data for {year}"
         )
-    if year < earliest_data_year or year > latest_validated_year:
+    if year < earliest_data_year or year > current_early_release_year:
         raise UserWarning(year_warning)