Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update data export notebook and small bug fixes #391

Merged
merged 3 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions notebooks/explore_data/export_generator_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import packages\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"# # Tell python where to look for modules.\n",
"import sys\n",
"\n",
"sys.path.append(\"../../../open-grid-emissions/src/oge\")\n",
"\n",
"\n",
"import load_data\n",
"import helpers\n",
"from filepaths import *\n",
"\n",
"year = 2023\n",
"path_prefix = f\"{year}/\"\n",
"\n",
"os.environ[\"PUDL_BUILD\"] = \"nightly\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# load data for each generator\n",
"gens = load_data.load_pudl_table(\n",
" \"core_eia860__scd_generators\",\n",
" year,\n",
" columns=[\n",
" \"report_date\",\n",
" \"plant_id_eia\",\n",
" \"generator_id\",\n",
" \"prime_mover_code\",\n",
" \"capacity_mw\",\n",
" \"generator_retirement_date\",\n",
" \"operational_status\",\n",
" \"operational_status_code\",\n",
" \"current_planned_generator_operating_date\",\n",
" \"planned_generator_retirement_date\",\n",
" ],\n",
")\n",
"\n",
"gens_entity = load_data.load_pudl_table(\n",
" \"core_eia__entity_generators\",\n",
" columns=[\n",
" \"plant_id_eia\",\n",
" \"generator_id\",\n",
" \"generator_operating_date\",\n",
" \"original_planned_generator_operating_date\",\n",
" ],\n",
")\n",
"\n",
"plants_entity = load_data.load_pudl_table(\n",
" \"core_eia__entity_plants\",\n",
" columns=[\"plant_id_eia\", \"latitude\", \"longitude\", \"city\", \"county\", \"state\"],\n",
")\n",
"\n",
"plant_ba = load_data.load_pudl_table(\n",
" \"core_eia860__scd_plants\",\n",
" year,\n",
" columns=[\"plant_id_eia\", \"balancing_authority_code_eia\"],\n",
").rename(columns={\"balancing_authority_code_eia\": \"ba_code\"})\n",
"\n",
"# merge the ba code into the gens data\n",
"gens = (\n",
" gens.merge(\n",
" gens_entity, how=\"left\", on=[\"plant_id_eia\", \"generator_id\"], validate=\"m:1\"\n",
" )\n",
" .merge(plant_ba, how=\"left\", on=\"plant_id_eia\", validate=\"m:1\")\n",
" .merge(plants_entity, how=\"left\", on=\"plant_id_eia\", validate=\"m:1\")\n",
")\n",
"\n",
"# drop data for generators outside of continental US\n",
"gens = gens[~gens[\"state\"].isin([\"AK\", \"HI\"])]\n",
"\n",
"# if there are longitudes in the wrong hemisphere, try reversing the sign\n",
"gens.loc[gens[\"longitude\"] > 0, \"longitude\"] = (\n",
" -1 * gens.loc[gens[\"longitude\"] > 0, \"longitude\"]\n",
")\n",
"\n",
"# fix coordinates\n",
"# NOTE: This is the bounding box for the continental US and the southern canadian provinces\n",
"LONGITUDE_MIN = -125.25\n",
"LONGITUDE_MAX = -67\n",
"LATITUDE_MIN = 24.5\n",
"LATITUDE_MAX = 49.25\n",
"\n",
"# are there any old station locations that are outside of North America?\n",
"gens.loc[\n",
" (gens[\"longitude\"] < LONGITUDE_MIN)\n",
" | (gens[\"longitude\"] > LONGITUDE_MAX)\n",
" | (gens[\"latitude\"] < LATITUDE_MIN)\n",
" | (gens[\"latitude\"] > LATITUDE_MAX),\n",
" [\"latitude\", \"longitude\"],\n",
"] = np.NaN\n",
"\n",
"# fill in missing locations\n",
"# get lat/lon\n",
"# loop this process twice to try and address any geocoder errors\n",
"loop_count = 1\n",
"missing_coord = gens[gens[\"longitude\"].isna() | gens[\"latitude\"].isna()]\n",
"while loop_count <= 2 and len(missing_coord) > 0:\n",
" if len(missing_coord) > 0:\n",
" print(f\"Finding coordinates for {len(missing_coord)} missing locations\")\n",
" # only get coordinates when state, county and city are available\n",
" for i in missing_coord.index:\n",
" state = gens.loc[i, \"state\"]\n",
" county = gens.loc[i, \"county\"]\n",
" city = gens.loc[i, \"city\"]\n",
"\n",
" lat, lon = helpers.get_coordinates_of_location(state, county, city)\n",
" gens.loc[i, \"latitude\"] = lat\n",
" gens.loc[i, \"longitude\"] = lon\n",
" missing_coord = gens[gens[\"longitude\"].isna() | gens[\"latitude\"].isna()]\n",
" loop_count += 1\n",
"\n",
"# load wind data\n",
"wind = pd.read_excel(\n",
" downloads_folder(f\"eia860/eia860{year}/3_2_Wind_Y{year}.xlsx\"),\n",
" sheet_name=\"Operable\",\n",
" header=1,\n",
").rename(\n",
" columns={\n",
" \"Plant Code\": \"plant_id_eia\",\n",
" \"Generator ID\": \"generator_id\",\n",
" \"Design Wind Speed (mph)\": \"rated_speed_mph\",\n",
" \"Turbine Hub Height (Feet)\": \"hub_height_ft\",\n",
" }\n",
")\n",
"\n",
"# convert to metric units\n",
"wind[\"rated_speed_m_per_s\"] = (wind[\"rated_speed_mph\"] * 0.44704).round(1)\n",
"wind[\"hub_height_m\"] = (wind[\"hub_height_ft\"] * 0.3048).round(0)\n",
"\n",
"wind = wind[[\"plant_id_eia\", \"generator_id\", \"rated_speed_m_per_s\", \"hub_height_m\"]]\n",
"\n",
"# merge the wind data into the gens data\n",
"gens = gens.merge(wind, how=\"left\", on=[\"plant_id_eia\", \"generator_id\"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"gens.to_csv(outputs_folder(f\"gens_{year}.csv\"), index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "open-grid-emissions-QkuIZ37I",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 1 addition & 1 deletion src/oge/data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def main(args):
logger.info("1. Downloading data")
# PUDL
download_data.download_pudl_data(source="aws")
logger.info(f"Using {os.getenv('PUDL_BUILD', default="stable")} PUDL build")
logger.info(f"Using {os.getenv('PUDL_BUILD', default='stable')} PUDL build")
# eGRID
download_data.download_egrid_files()
# EIA-930
Expand Down
10 changes: 8 additions & 2 deletions src/oge/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd

from geopy.geocoders import Nominatim
from geopy.exc import GeocoderUnavailable
from timezonefinder import TimezoneFinder
from urllib3.exceptions import ReadTimeoutError

Expand Down Expand Up @@ -849,8 +850,8 @@ def get_coordinates_of_location(state: str, county: str, city: str) -> tuple[flo
"""
if pd.isna(state):
return np.NaN, np.NaN
if pd.isna(city):
if not pd.isna(county):
if pd.isna(city) | (city == "unsited"):
if not (pd.isna(county) | (county == "NOT IN FILE")):
query = f"{county} county, {state}, USA"
else:
query = f"{state}, USA"
Expand All @@ -860,10 +861,15 @@ def get_coordinates_of_location(state: str, county: str, city: str) -> tuple[flo
try:
location = geolocator.geocode(query, country_codes="us")
if location is None:
logger.warning(f"No location returned for {query}")
return np.NaN, np.NaN
else:
return float(location.raw["lat"]), float(location.raw["lon"])
except ReadTimeoutError:
logger.warning(f"ReadTimeoutError for {query}")
return np.NaN, np.NaN
except GeocoderUnavailable:
logger.warning(f"GeocoderUnavailable for {query}")
return np.NaN, np.NaN
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for fixing that.



Expand Down
4 changes: 2 additions & 2 deletions src/oge/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def graph_hourly_data_by_fuel_category(
"biomass",
"petroleum",
"waste",
"solar",
"wind",
"natural_gas",
"wind",
"solar",
]

hourly_data = hourly_data[hourly_data[fuel_category_name] != "total"]
Expand Down
Loading