diff --git a/README.md b/README.md index 65fbe4c..3e3538f 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Parts of the input data used for the Open Grid Emissions dataset is released by Updated datasets will also be published whenever a new version of the open-grid-emissions repository is released. ### Running the pipeline with early release data -The OGE pipeline can be used to generate data using Early Release EIA data as soon as it is integrated into the PUDL nightly builds. In order to do that, `constants.latest_validated_year` must be changed to match `constants.current_early_release_year` before running the pipeline. +The OGE pipeline can be used to generate data using Early Release EIA data as soon as it is integrated into the PUDL nightly builds. In order to do that, `constants.current_early_release_year` must be updated to the current early release year (such that `current_early_release_year` is 1 year greater than `latest_validated_year`). Early release data for a given data year is typically available from EIA in June/July of the following year, and is integrated into PUDL shortly thereafter. In addition, you will need to download and use the pudl nightly build data until the data becomes available through a stable release. To do so, you need to set your `PUDL_BUILD` environment variable to "nightly". 
You can do this through the command line using `set PUDL_BUILD=nightly` (for Windows), or by adding the following to the `__init__.py` file in `src/oge`: ```python diff --git a/notebooks/work_in_progress/sandbox.ipynb b/notebooks/work_in_progress/sandbox.ipynb index e13fc39..3bf99ab 100644 --- a/notebooks/work_in_progress/sandbox.ipynb +++ b/notebooks/work_in_progress/sandbox.ipynb @@ -40,34 +40,6 @@ "year = 2022\n", "path_prefix = f\"{year}/\"" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv(results_folder(\"2023/plant_data/hourly/us_units/AZPS.csv\"))\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "px.line(data, x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"plant_id_eia\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "px.line(data, x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"plant_id_eia\")" - ] } ], "metadata": { diff --git a/src/oge/constants.py b/src/oge/constants.py index 43cc155..63f3402 100644 --- a/src/oge/constants.py +++ b/src/oge/constants.py @@ -14,6 +14,11 @@ latest_validated_year = 2022 # current_early_release_year is the year for which non-final (early-release) data # is available from the EIA. This enables running the OGE pipeline for this year +# EIA-860ER data is generally available in June and EIA-923ER data is generally +# available in July of the following year. 
This should not be updated to the next year +# until ER data is available, so for part of the year, latest_validated_year will equal +# current_early_release_year +# TODO: Change this to 2024 around July 2025 (check PUDL to see when integrated) current_early_release_year = 2023 # specify the energy_source_codes that are considered clean/carbon-free diff --git a/src/oge/data_pipeline.py b/src/oge/data_pipeline.py index 48e004d..45d7ed4 100644 --- a/src/oge/data_pipeline.py +++ b/src/oge/data_pipeline.py @@ -30,6 +30,7 @@ from oge.constants import ( TIME_RESOLUTIONS, latest_validated_year, + current_early_release_year, earliest_hourly_data_year, ) @@ -161,7 +162,9 @@ def main(args): # integrated into pudl download_data.download_raw_eia860(year) - # download eia860 from the latest validated year for use in subplant identification - download_data.download_raw_eia860(latest_validated_year) + # download eia860 from the most recent available year for subplant identification + download_data.download_raw_eia860( + max(latest_validated_year, current_early_release_year) + ) download_data.download_raw_eia923(year) # 2. 
Identify subplants diff --git a/src/oge/download_data.py b/src/oge/download_data.py index 88ec82a..20da070 100644 --- a/src/oge/download_data.py +++ b/src/oge/download_data.py @@ -9,7 +9,7 @@ from oge.filepaths import downloads_folder, data_folder, get_pudl_build_version from oge.logging_util import get_logger -from oge.constants import current_early_release_year +from oge.constants import current_early_release_year, latest_validated_year logger = get_logger(__name__) @@ -341,9 +341,12 @@ def download_raw_eia923(year: int): download_raw_eia_906_920(year) else: os.makedirs(downloads_folder("eia923"), exist_ok=True) - url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip" - if year == current_early_release_year: + if (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}er.zip" + else: + url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip" archive_url = ( f"https://www.eia.gov/electricity/data/eia923/archive/xls/f923_{year}.zip" ) @@ -402,9 +405,12 @@ def download_raw_eia860(year: int): if year < 2005: raise NotImplementedError(f"We haven't tested EIA-860 for '{year}'.") os.makedirs(downloads_folder("eia860"), exist_ok=True) - url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip" - if year == current_early_release_year: + if (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}ER.zip" + else: + url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip" archive_url = ( f"https://www.eia.gov/electricity/data/eia860/archive/xls/eia860{year}.zip" ) diff --git a/src/oge/helpers.py b/src/oge/helpers.py index 9b2c1bd..0437fdc 100644 --- a/src/oge/helpers.py +++ b/src/oge/helpers.py @@ -6,7 +6,11 @@ from urllib3.exceptions import ReadTimeoutError from oge.column_checks import 
get_dtypes, apply_dtypes -from oge.constants import earliest_data_year, latest_validated_year +from oge.constants import ( + earliest_data_year, + latest_validated_year, + current_early_release_year, +) from oge.filepaths import reference_table_folder, outputs_folder import oge.load_data as load_data @@ -385,7 +389,7 @@ def add_plant_operating_and_retirement_dates(df: pd.DataFrame) -> pd.DataFrame: generator_dates = load_data.load_pudl_table( "out_eia__yearly_generators", year=earliest_data_year, - end_year=latest_validated_year, + end_year=max(latest_validated_year, current_early_release_year), columns=[ "plant_id_eia", "generator_id", @@ -456,7 +460,7 @@ def add_plant_nameplate_capacity(year: int, df: pd.DataFrame) -> pd.DataFrame: generator_capacity = load_data.load_pudl_table( "core_eia860__scd_generators", year=earliest_data_year, - end_year=latest_validated_year, + end_year=max(latest_validated_year, current_early_release_year), columns=[ "plant_id_eia", "generator_id", @@ -687,7 +691,7 @@ def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame: columns=["plant_id_eia", "timezone"] + eia860_info, ) plants_entity_from_eia860 = load_data.load_raw_eia860_plant_geographical_info( - latest_validated_year + max(latest_validated_year, current_early_release_year) ) complete_plants_entity = plants_entity.merge( plants_entity_from_eia860, diff --git a/src/oge/load_data.py b/src/oge/load_data.py index 04ba4a7..46a85bc 100644 --- a/src/oge/load_data.py +++ b/src/oge/load_data.py @@ -135,7 +135,9 @@ def load_cems_ids() -> pd.DataFrame: # duplicates before concatenating the next year to the dataframe cems_ids = [] # The `constants.earliest_data_year` is 2005 - for year in range(earliest_data_year, latest_validated_year + 1): + for year in range( + earliest_data_year, max(latest_validated_year, current_early_release_year) + 1 + ): cems_id_year = pd.read_parquet( pudl_folder("core_epacems__hourly_emissions.parquet"), filters=[["year", "==", year]], @@ -195,7 +197,10 @@ def 
load_complete_eia_generators_for_subplants() -> pd.DataFrame: - # this avoids using potentially preliminary early-release data + # NOTE: the upper bound may include preliminary early-release data complete_gens = complete_gens[ (complete_gens["report_date"].dt.year >= earliest_data_year) - & (complete_gens["report_date"].dt.year <= latest_validated_year) + & ( + complete_gens["report_date"].dt.year + <= max(latest_validated_year, current_early_release_year) + ) ] # for any retired gens, forward fill the most recently available unit_id_pudl to @@ -231,7 +236,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame: under_construction_status_codes = ["U", "V", "TS"] complete_gens = complete_gens[ ~( - (complete_gens["report_date"].dt.year < latest_validated_year) + ( + complete_gens["report_date"].dt.year + < max(latest_validated_year, current_early_release_year) + ) & ( complete_gens["operational_status_code"].isin( under_construction_status_codes @@ -257,7 +265,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame: ~( (complete_gens["generator_operating_date"].isna()) & (complete_gens["generator_retirement_date"].isna()) - & (complete_gens["report_date"].dt.year < latest_validated_year) + & ( + complete_gens["report_date"].dt.year + < max(latest_validated_year, current_early_release_year) + ) & (complete_gens["operational_status_code"] != "TS") ) ] @@ -265,7 +276,7 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame: #################### # merge into complete_gens and fill missing operating dates with the EIA-860 data generator_data_from_eia860 = load_raw_eia860_generator_dates_and_unit_ids( - latest_validated_year + max(latest_validated_year, current_early_release_year) ) complete_gens = complete_gens.merge( generator_data_from_eia860, @@ -307,11 +318,14 @@ def load_raw_eia860_plant_geographical_info(year: int) -> pd.DataFrame: """ # load geographic information from the raw EIA-860 file to supplement missing # information from pudl - filepath = 
f"eia860/eia860{year}/2___Plant_Y{year}.xlsx" - header_row = 1 - if year == current_early_release_year: + if (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): filepath = f"eia860/eia860{year}ER/2___Plant_Y{year}_Early_Release.xlsx" header_row = 2 + else: + filepath = f"eia860/eia860{year}/2___Plant_Y{year}.xlsx" + header_row = 1 plant_geographical_eia860 = pd.read_excel( downloads_folder(filepath), header=header_row, @@ -362,11 +376,14 @@ def load_raw_eia860_generator_dates_and_unit_ids(year: int) -> pd.DataFrame: """ # load operating dates from the raw EIA-860 file to supplement missing operating # dates from pudl - filepath = f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx" - header_row = 1 - if year == current_early_release_year: + if (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): filepath = f"eia860/eia860{year}ER/3_1_Generator_Y{year}_Early_Release.xlsx" header_row = 2 + else: + filepath = f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx" + header_row = 1 generator_op_dates_eia860 = pd.read_excel( downloads_folder(filepath), header=header_row, @@ -1168,7 +1185,7 @@ def load_emissions_controls_eia923(year: int) -> pd.DataFrame: ) if year >= 2012: - if year < current_early_release_year: + if year <= latest_validated_year: # Handle filename changes across years. 
schedule_8_filename = { 2012: downloads_folder( @@ -1206,7 +1223,9 @@ def load_emissions_controls_eia923(year: int) -> pd.DataFrame: ), }[year] header_row = 4 - elif year == current_early_release_year: + elif (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): schedule_8_filename = downloads_folder( f"eia923/f923_{year}er/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Early_Release.xlsx" ) diff --git a/src/oge/output_data.py b/src/oge/output_data.py index e7ba3d3..bbe43e9 100644 --- a/src/oge/output_data.py +++ b/src/oge/output_data.py @@ -14,6 +14,7 @@ earliest_validated_year, earliest_hourly_data_year, latest_validated_year, + current_early_release_year, ) logger = get_logger(__name__) @@ -135,7 +136,7 @@ def zip_results_for_s3(): root_dir=data_folder(f"s3_upload/{year_range}_plant_attributes"), ) shutil.rmtree(data_folder(f"s3_upload/{year_range}_plant_attributes")) - for year in range(2019, latest_validated_year + 1): + for year in range(2019, max(latest_validated_year, current_early_release_year) + 1): for data_type in ["power_sector_data", "carbon_accounting", "plant_data"]: for aggregation in ["hourly", "monthly", "annual"]: for unit in ["metric_units", "us_units"]: diff --git a/src/oge/subplant_identification.py b/src/oge/subplant_identification.py index b5c20dc..2beeef5 100644 --- a/src/oge/subplant_identification.py +++ b/src/oge/subplant_identification.py @@ -5,7 +5,7 @@ import oge.load_data as load_data import oge.validation as validation -from oge.constants import latest_validated_year +from oge.constants import latest_validated_year, current_early_release_year from oge.logging_util import get_logger logger = get_logger(__name__) @@ -33,7 +33,9 @@ def generate_subplant_ids() -> pd.DataFrame: cems_ids = load_data.load_cems_ids() # load the crosswalk and filter it by the data that actually exists in cems - crosswalk = load_data.load_epa_eia_crosswalk(latest_validated_year) + crosswalk = 
load_data.load_epa_eia_crosswalk( + max(latest_validated_year, current_early_release_year) + ) # filter the crosswalk to drop any units that don't exist in CEMS filtered_crosswalk = epacamd_eia.filter_crosswalk(crosswalk, cems_ids) @@ -161,7 +163,8 @@ def generate_subplant_ids() -> pd.DataFrame: # validate that there are no orphaned combined cycle plant parts in a subplant validation.check_for_orphaned_cc_part_in_subplant( - subplant_crosswalk_complete, latest_validated_year + subplant_crosswalk_complete, + max(latest_validated_year, current_early_release_year), ) return subplant_crosswalk_complete diff --git a/src/oge/validation.py b/src/oge/validation.py index 0f494c7..922ed3e 100644 --- a/src/oge/validation.py +++ b/src/oge/validation.py @@ -41,13 +41,17 @@ def validate_year(year): Input data for {end+1} should be available from the EIA in Fall {end+2} and we will work to validate that the pipeline works with {end+1} data as soon as possible after the data is released. + + If you are looking to run the pipeline with Early Release data, check that + this data is available and integrated into PUDL, then update + `constants.current_early_release_year` ######################################################################### """ - if year == current_early_release_year: - raise UserWarning( - "To run the pipeline with Early Release data, change `constants.latest_validated_year` to match `constants.current_early_release_year`" + if (year == current_early_release_year) and (year != latest_validated_year): + logger.warning( + f"Running pipeline with unvalidated Early Release data for {year}" ) - if year < earliest_data_year or year > latest_validated_year: + if year < earliest_data_year or year > current_early_release_year: raise UserWarning(year_warning)