Skip to content

Commit

Permalink
update year handling logic
Browse files Browse the repository at this point in the history
  • Loading branch information
grgmiller committed Aug 21, 2024
1 parent 442cf37 commit cef341e
Show file tree
Hide file tree
Showing 10 changed files with 77 additions and 60 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ Parts of the input data used for the Open Grid Emissions dataset is released by
Updated datasets will also be published whenever a new version of the open-grid-emissions repository is released.

### Running the pipeline with early release data
The OGE pipeline can be used to generate data using Early Release EIA data as soon as it is integrated into the PUDL nightly builds. In order to do that, `constants.latest_validated_year` must be changed to match `constants.current_early_release_year` before running the pipeline.
The OGE pipeline can be used to generate data using Early Release EIA data as soon as it is integrated into the PUDL nightly builds. In order to do that, `constants.current_early_release_year` must be updated to the current early release year (such that `current_early_release_year` is 1 year greater than `latest_validated_year`). Early release data is typically available from EIA in June/July of the following year, and is integrated into PUDL shortly thereafter.

In addition, you will need to download and use the PUDL nightly build data until the data becomes available through a stable release. To do so, you need to set your `PUDL_BUILD` environment variable to "nightly". You can do this through the command line using `set PUDL_BUILD=nightly` (for Windows), or by adding the following to the `__init__.py` file in `src/oge`:
```python
Expand Down
28 changes: 0 additions & 28 deletions notebooks/work_in_progress/sandbox.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,34 +40,6 @@
"year = 2022\n",
"path_prefix = f\"{year}/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(results_folder(\"2023/plant_data/hourly/us_units/AZPS.csv\"))\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"px.line(data, x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"plant_id_eia\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"px.line(data, x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"plant_id_eia\")"
]
}
],
"metadata": {
Expand Down
5 changes: 5 additions & 0 deletions src/oge/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
latest_validated_year = 2022
# current_early_release_year is the year for which non-final (early-release) data
# is available from the EIA. This enables running the OGE pipeline for this year.
# EIA-860ER data is generally available in June and EIA-923ER data is generally
# available in July of the following year. This should not be updated to the next year
# until ER data is available, so for part of the year, latest_validated_year will equal
# current_early_release_year.
# TODO: Change this to 2024 around July 2025 (check PUDL to see when integrated)
current_early_release_year = 2023

# specify the energy_source_codes that are considered clean/carbon-free
Expand Down
5 changes: 4 additions & 1 deletion src/oge/data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from oge.constants import (
TIME_RESOLUTIONS,
latest_validated_year,
current_early_release_year,
earliest_hourly_data_year,
)

Expand Down Expand Up @@ -161,7 +162,9 @@ def main(args):
# integrated into pudl
download_data.download_raw_eia860(year)
# download eia860 from the most recent available year (the later of the latest
# validated year and the current early release year) for use in subplant identification
download_data.download_raw_eia860(latest_validated_year)
download_data.download_raw_eia860(
max(latest_validated_year, current_early_release_year)
)
download_data.download_raw_eia923(year)

# 2. Identify subplants
Expand Down
16 changes: 11 additions & 5 deletions src/oge/download_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from oge.filepaths import downloads_folder, data_folder, get_pudl_build_version
from oge.logging_util import get_logger
from oge.constants import current_early_release_year
from oge.constants import current_early_release_year, latest_validated_year

logger = get_logger(__name__)

Expand Down Expand Up @@ -341,9 +341,12 @@ def download_raw_eia923(year: int):
download_raw_eia_906_920(year)
else:
os.makedirs(downloads_folder("eia923"), exist_ok=True)
url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip"
if year == current_early_release_year:
if (year == current_early_release_year) and (
current_early_release_year != latest_validated_year
):
url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}er.zip"
else:
url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip"
archive_url = (
f"https://www.eia.gov/electricity/data/eia923/archive/xls/f923_{year}.zip"
)
Expand Down Expand Up @@ -402,9 +405,12 @@ def download_raw_eia860(year: int):
if year < 2005:
raise NotImplementedError(f"We haven't tested EIA-860 for '{year}'.")
os.makedirs(downloads_folder("eia860"), exist_ok=True)
url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip"
if year == current_early_release_year:
if (year == current_early_release_year) and (
current_early_release_year != latest_validated_year
):
url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}ER.zip"
else:
url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip"
archive_url = (
f"https://www.eia.gov/electricity/data/eia860/archive/xls/eia860{year}.zip"
)
Expand Down
12 changes: 8 additions & 4 deletions src/oge/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
from urllib3.exceptions import ReadTimeoutError

from oge.column_checks import get_dtypes, apply_dtypes
from oge.constants import earliest_data_year, latest_validated_year
from oge.constants import (
earliest_data_year,
latest_validated_year,
current_early_release_year,
)
from oge.filepaths import reference_table_folder, outputs_folder

import oge.load_data as load_data
Expand Down Expand Up @@ -385,7 +389,7 @@ def add_plant_operating_and_retirement_dates(df: pd.DataFrame) -> pd.DataFrame:
generator_dates = load_data.load_pudl_table(
"out_eia__yearly_generators",
year=earliest_data_year,
end_year=latest_validated_year,
end_year=max(latest_validated_year, current_early_release_year),
columns=[
"plant_id_eia",
"generator_id",
Expand Down Expand Up @@ -456,7 +460,7 @@ def add_plant_nameplate_capacity(year: int, df: pd.DataFrame) -> pd.DataFrame:
generator_capacity = load_data.load_pudl_table(
"core_eia860__scd_generators",
year=earliest_data_year,
end_year=latest_validated_year,
end_year=max(latest_validated_year, current_early_release_year),
columns=[
"plant_id_eia",
"generator_id",
Expand Down Expand Up @@ -687,7 +691,7 @@ def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame:
columns=["plant_id_eia", "timezone"] + eia860_info,
)
plants_entity_from_eia860 = load_data.load_raw_eia860_plant_geographical_info(
latest_validated_year
max(latest_validated_year, current_early_release_year)
)
complete_plants_entity = plants_entity.merge(
plants_entity_from_eia860,
Expand Down
45 changes: 32 additions & 13 deletions src/oge/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ def load_cems_ids() -> pd.DataFrame:
# duplicates before concatenating the next year to the dataframe
cems_ids = []
# The `constants.earliest_data_year` is 2005
for year in range(earliest_data_year, latest_validated_year + 1):
for year in range(
earliest_data_year, max(latest_validated_year, current_early_release_year) + 1
):
cems_id_year = pd.read_parquet(
pudl_folder("core_epacems__hourly_emissions.parquet"),
filters=[["year", "==", year]],
Expand Down Expand Up @@ -195,7 +197,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame:
# this avoids using potentially preliminary early-release data
complete_gens = complete_gens[
(complete_gens["report_date"].dt.year >= earliest_data_year)
& (complete_gens["report_date"].dt.year <= latest_validated_year)
& (
complete_gens["report_date"].dt.year
<= max(latest_validated_year, current_early_release_year)
)
]

# for any retired gens, forward fill the most recently available unit_id_pudl to
Expand Down Expand Up @@ -231,7 +236,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame:
under_construction_status_codes = ["U", "V", "TS"]
complete_gens = complete_gens[
~(
(complete_gens["report_date"].dt.year < latest_validated_year)
(
complete_gens["report_date"].dt.year
< max(latest_validated_year, current_early_release_year)
)
& (
complete_gens["operational_status_code"].isin(
under_construction_status_codes
Expand All @@ -257,15 +265,18 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame:
~(
(complete_gens["generator_operating_date"].isna())
& (complete_gens["generator_retirement_date"].isna())
& (complete_gens["report_date"].dt.year < latest_validated_year)
& (
complete_gens["report_date"].dt.year
< max(latest_validated_year, current_early_release_year)
)
& (complete_gens["operational_status_code"] != "TS")
)
]

####################
# merge into complete_gens and fill missing operating dates with the EIA-860 data
generator_data_from_eia860 = load_raw_eia860_generator_dates_and_unit_ids(
latest_validated_year
max(latest_validated_year, current_early_release_year)
)
complete_gens = complete_gens.merge(
generator_data_from_eia860,
Expand Down Expand Up @@ -307,11 +318,14 @@ def load_raw_eia860_plant_geographical_info(year: int) -> pd.DataFrame:
"""
# load geographic information from the raw EIA-860 file to supplement missing
# information from pudl
filepath = f"eia860/eia860{year}/2___Plant_Y{year}.xlsx"
header_row = 1
if year == current_early_release_year:
if (year == current_early_release_year) and (
current_early_release_year != latest_validated_year
):
filepath = f"eia860/eia860{year}ER/2___Plant_Y{year}_Early_Release.xlsx"
header_row = 2
else:
filepath = f"eia860/eia860{year}/2___Plant_Y{year}.xlsx"
header_row = 1
plant_geographical_eia860 = pd.read_excel(
downloads_folder(filepath),
header=header_row,
Expand Down Expand Up @@ -362,11 +376,14 @@ def load_raw_eia860_generator_dates_and_unit_ids(year: int) -> pd.DataFrame:
"""
# load operating dates from the raw EIA-860 file to supplement missing operating
# dates from pudl
filepath = f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx"
header_row = 1
if year == current_early_release_year:
if (year == current_early_release_year) and (
current_early_release_year != latest_validated_year
):
filepath = f"eia860/eia860{year}ER/3_1_Generator_Y{year}_Early_Release.xlsx"
header_row = 2
else:
filepath = f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx"
header_row = 1
generator_op_dates_eia860 = pd.read_excel(
downloads_folder(filepath),
header=header_row,
Expand Down Expand Up @@ -1168,7 +1185,7 @@ def load_emissions_controls_eia923(year: int) -> pd.DataFrame:
)

if year >= 2012:
if year < current_early_release_year:
if year <= latest_validated_year:
# Handle filename changes across years.
schedule_8_filename = {
2012: downloads_folder(
Expand Down Expand Up @@ -1206,7 +1223,9 @@ def load_emissions_controls_eia923(year: int) -> pd.DataFrame:
),
}[year]
header_row = 4
elif year == current_early_release_year:
elif (year == current_early_release_year) and (
current_early_release_year != latest_validated_year
):
schedule_8_filename = downloads_folder(
f"eia923/f923_{year}er/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Early_Release.xlsx"
)
Expand Down
3 changes: 2 additions & 1 deletion src/oge/output_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
earliest_validated_year,
earliest_hourly_data_year,
latest_validated_year,
current_early_release_year,
)

logger = get_logger(__name__)
Expand Down Expand Up @@ -135,7 +136,7 @@ def zip_results_for_s3():
root_dir=data_folder(f"s3_upload/{year_range}_plant_attributes"),
)
shutil.rmtree(data_folder(f"s3_upload/{year_range}_plant_attributes"))
for year in range(2019, latest_validated_year + 1):
for year in range(2019, max(latest_validated_year, current_early_release_year) + 1):
for data_type in ["power_sector_data", "carbon_accounting", "plant_data"]:
for aggregation in ["hourly", "monthly", "annual"]:
for unit in ["metric_units", "us_units"]:
Expand Down
9 changes: 6 additions & 3 deletions src/oge/subplant_identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import oge.load_data as load_data
import oge.validation as validation
from oge.constants import latest_validated_year
from oge.constants import latest_validated_year, current_early_release_year
from oge.logging_util import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -33,7 +33,9 @@ def generate_subplant_ids() -> pd.DataFrame:
cems_ids = load_data.load_cems_ids()

# load the crosswalk and filter it by the data that actually exists in cems
crosswalk = load_data.load_epa_eia_crosswalk(latest_validated_year)
crosswalk = load_data.load_epa_eia_crosswalk(
max(latest_validated_year, current_early_release_year)
)

# filter the crosswalk to drop any units that don't exist in CEMS
filtered_crosswalk = epacamd_eia.filter_crosswalk(crosswalk, cems_ids)
Expand Down Expand Up @@ -161,7 +163,8 @@ def generate_subplant_ids() -> pd.DataFrame:

# validate that there are no orphaned combined cycle plant parts in a subplant
validation.check_for_orphaned_cc_part_in_subplant(
subplant_crosswalk_complete, latest_validated_year
subplant_crosswalk_complete,
max(latest_validated_year, current_early_release_year),
)

return subplant_crosswalk_complete
Expand Down
12 changes: 8 additions & 4 deletions src/oge/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,17 @@ def validate_year(year):
Input data for {end+1} should be available from the EIA in Fall {end+2} and we
will work to validate that the pipeline works with {end+1} data as soon as
possible after the data is released.
If you are looking to run the pipeline with Early Release data, check that
this data is available and integrated into PUDL, then update
`constants.current_early_release_year`
#########################################################################
"""
if year == current_early_release_year:
raise UserWarning(
"To run the pipeline with Early Release data, change `constants.latest_validated_year` to match `constants.current_early_release_year`"
if (year == current_early_release_year) and (year != latest_validated_year):
logger.warning(
f"Running pipeline with unvalidated Early Release data for {year}"
)
if year < earliest_data_year or year > latest_validated_year:
if year < earliest_data_year or year > current_early_release_year:
raise UserWarning(year_warning)


Expand Down

0 comments on commit cef341e

Please sign in to comment.