# %%
# Export one row per EIA-860 generator for `year`, enriched with entity data,
# balancing authority, plant location (geocoded where missing or implausible)
# and wind turbine characteristics.

# import packages
import os
import sys

import numpy as np
import pandas as pd

# Notebook-only: automatically reload edited project modules. These are the
# call-form equivalents of `%reload_ext autoreload` and `%autoreload 2`;
# `get_ipython` is provided by IPython/Jupyter at runtime.
ipython = get_ipython()  # noqa: F821
ipython.run_line_magic("reload_ext", "autoreload")
ipython.run_line_magic("autoreload", "2")

# Tell python where to look for modules.
sys.path.append("../../../open-grid-emissions/src/oge")

import load_data  # noqa: E402
import helpers  # noqa: E402

# Import only the names this script uses instead of `import *`, so it is
# clear where `downloads_folder` and `outputs_folder` come from.
from filepaths import downloads_folder, outputs_folder  # noqa: E402

year = 2023
path_prefix = f"{year}/"

os.environ["PUDL_BUILD"] = "nightly"

# %%
# load data for each generator
gens = load_data.load_pudl_table(
    "core_eia860__scd_generators",
    year,
    columns=[
        "report_date",
        "plant_id_eia",
        "generator_id",
        "prime_mover_code",
        "capacity_mw",
        "generator_retirement_date",
        "operational_status",
        "operational_status_code",
        "current_planned_generator_operating_date",
        "planned_generator_retirement_date",
    ],
)

gens_entity = load_data.load_pudl_table(
    "core_eia__entity_generators",
    columns=[
        "plant_id_eia",
        "generator_id",
        "generator_operating_date",
        "original_planned_generator_operating_date",
    ],
)

plants_entity = load_data.load_pudl_table(
    "core_eia__entity_plants",
    columns=["plant_id_eia", "latitude", "longitude", "city", "county", "state"],
)

plant_ba = load_data.load_pudl_table(
    "core_eia860__scd_plants",
    year,
    columns=["plant_id_eia", "balancing_authority_code_eia"],
).rename(columns={"balancing_authority_code_eia": "ba_code"})

# merge the generator entity data, ba code and plant location into the gens data
gens = (
    gens.merge(
        gens_entity, how="left", on=["plant_id_eia", "generator_id"], validate="m:1"
    )
    .merge(plant_ba, how="left", on="plant_id_eia", validate="m:1")
    .merge(plants_entity, how="left", on="plant_id_eia", validate="m:1")
)

# drop data for generators in Alaska and Hawaii
# copy so that the in-place coordinate fixes below modify an independent frame
# rather than a slice of the merged data
gens = gens[~gens["state"].isin(["AK", "HI"])].copy()

# if there are longitudes in the wrong hemisphere, try reversing the sign
wrong_hemisphere = gens["longitude"] > 0
gens.loc[wrong_hemisphere, "longitude"] = -1 * gens.loc[wrong_hemisphere, "longitude"]

# fix coordinates
# NOTE: This is the bounding box for the continental US and the southern canadian provinces
LONGITUDE_MIN = -125.25
LONGITUDE_MAX = -67
LATITUDE_MIN = 24.5
LATITUDE_MAX = 49.25

# blank out any coordinates that fall outside of the bounding box so that they
# are re-derived by the geocoder below
# (`np.nan` rather than `np.NaN`: the latter alias was removed in NumPy 2.0)
out_of_bounds = (
    (gens["longitude"] < LONGITUDE_MIN)
    | (gens["longitude"] > LONGITUDE_MAX)
    | (gens["latitude"] < LATITUDE_MIN)
    | (gens["latitude"] > LATITUDE_MAX)
)
gens.loc[out_of_bounds, ["latitude", "longitude"]] = np.nan

# fill in missing locations
# run this process up to twice to try and address any geocoder errors
MAX_GEOCODING_PASSES = 2
for _ in range(MAX_GEOCODING_PASSES):
    missing_coord = gens[gens["longitude"].isna() | gens["latitude"].isna()]
    if missing_coord.empty:
        break
    print(f"Finding coordinates for {len(missing_coord)} missing locations")

    # Many generators share a plant location, so query the geocoder only once
    # per distinct (state, county, city) within a pass. The cache is rebuilt on
    # every pass so that failed lookups are actually retried.
    coordinates_by_location = {}
    for i, state, county, city in missing_coord[
        ["state", "county", "city"]
    ].itertuples(name=None):
        # missing values are normalized to None so they compare equal as keys
        location = tuple(
            None if pd.isna(value) else value for value in (state, county, city)
        )
        if location not in coordinates_by_location:
            # the helper receives whatever location fields are available,
            # including missing ones
            coordinates_by_location[location] = helpers.get_coordinates_of_location(
                state, county, city
            )
        lat, lon = coordinates_by_location[location]
        gens.loc[i, "latitude"] = lat
        gens.loc[i, "longitude"] = lon

# load wind data
wind = pd.read_excel(
    downloads_folder(f"eia860/eia860{year}/3_2_Wind_Y{year}.xlsx"),
    sheet_name="Operable",
    header=1,
).rename(
    columns={
        "Plant Code": "plant_id_eia",
        "Generator ID": "generator_id",
        "Design Wind Speed (mph)": "rated_speed_mph",
        "Turbine Hub Height (Feet)": "hub_height_ft",
    }
)

# convert to metric units
MPH_TO_M_PER_S = 0.44704
FEET_TO_M = 0.3048
wind["rated_speed_m_per_s"] = (wind["rated_speed_mph"] * MPH_TO_M_PER_S).round(1)
wind["hub_height_m"] = (wind["hub_height_ft"] * FEET_TO_M).round(0)

wind = wind[["plant_id_eia", "generator_id", "rated_speed_m_per_s", "hub_height_m"]]

# merge the wind data into the gens data
# validate like the merges above so that duplicate wind records raise instead
# of silently duplicating generator rows in the export
gens = gens.merge(
    wind, how="left", on=["plant_id_eia", "generator_id"], validate="m:1"
)

# %%
gens.to_csv(outputs_folder(f"gens_{year}.csv"), index=False)
--git a/src/oge/data_pipeline.py b/src/oge/data_pipeline.py index 4be11e1..48f064a 100644 --- a/src/oge/data_pipeline.py +++ b/src/oge/data_pipeline.py @@ -142,7 +142,7 @@ def main(args): logger.info("1. Downloading data") # PUDL download_data.download_pudl_data(source="aws") - logger.info(f"Using {os.getenv('PUDL_BUILD', default="stable")} PUDL build") + logger.info(f"Using {os.getenv('PUDL_BUILD', default='stable')} PUDL build") # eGRID download_data.download_egrid_files() # EIA-930 diff --git a/src/oge/helpers.py b/src/oge/helpers.py index 0437fdc..c4a844a 100644 --- a/src/oge/helpers.py +++ b/src/oge/helpers.py @@ -2,6 +2,7 @@ import pandas as pd from geopy.geocoders import Nominatim +from geopy.exc import GeocoderUnavailable from timezonefinder import TimezoneFinder from urllib3.exceptions import ReadTimeoutError @@ -849,8 +850,8 @@ def get_coordinates_of_location(state: str, county: str, city: str) -> tuple[flo """ if pd.isna(state): return np.NaN, np.NaN - if pd.isna(city): - if not pd.isna(county): + if pd.isna(city) | (city == "unsited"): + if not (pd.isna(county) | (county == "NOT IN FILE")): query = f"{county} county, {state}, USA" else: query = f"{state}, USA" @@ -860,10 +861,15 @@ def get_coordinates_of_location(state: str, county: str, city: str) -> tuple[flo try: location = geolocator.geocode(query, country_codes="us") if location is None: + logger.warning(f"No location returned for {query}") return np.NaN, np.NaN else: return float(location.raw["lat"]), float(location.raw["lon"]) except ReadTimeoutError: + logger.warning(f"ReadTimeoutError for {query}") + return np.NaN, np.NaN + except GeocoderUnavailable: + logger.warning(f"GeocoderUnavailable for {query}") return np.NaN, np.NaN diff --git a/src/oge/visualization.py b/src/oge/visualization.py index 1859516..9591149 100644 --- a/src/oge/visualization.py +++ b/src/oge/visualization.py @@ -60,9 +60,9 @@ def graph_hourly_data_by_fuel_category( "biomass", "petroleum", "waste", - "solar", - "wind", 
"natural_gas", + "wind", + "solar", ] hourly_data = hourly_data[hourly_data[fuel_category_name] != "total"]