Skip to content

Commit

Permalink
Merge pull request #368 from singularity-energy/ben/tz
Browse files Browse the repository at this point in the history
Fix and add information to plant static attributes
  • Loading branch information
rouille authored Jun 25, 2024
2 parents 8f9ab43 + fe24a67 commit 4d39b75
Show file tree
Hide file tree
Showing 6 changed files with 2,200 additions and 2,048 deletions.
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ sqlalchemy = "*"
statsmodels = "*"
coloredlogs = "*"
s3fs = "*"
timezonefinder = "*"
geopy = "*"
"catalystcoop.pudl" = {git = "git+https://github.com/singularity-energy/pudl.git@oge_release"}
gridemissions = {git = "git+https://github.com/singularity-energy/gridemissions"}

Expand Down
4,053 changes: 2,015 additions & 2,038 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/oge/column_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,7 @@ def get_dtypes() -> dict:
"n2o_mass_lb_for_electricity": "float64",
"n2o_mass_lb_for_electricity_adjusted": "float64",
"nox_mass_lb": "float64",
"controlled_nox_mass_lb": "float64",
"nox_mass_lb_adjusted": "float64",
"nox_mass_lb_for_electricity": "float64",
"nox_mass_lb_for_electricity_adjusted": "float64",
Expand Down
5 changes: 5 additions & 0 deletions src/oge/data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,11 @@ def main(args):
year,
args.skip_outputs,
)
if not args.skip_outputs:
plant_attributes.assign(shaped_plant_id=pd.NA).to_csv(
results_folder(f"{path_prefix}plant_data/plant_static_attributes.csv"),
index=False,
)


if __name__ == "__main__":
Expand Down
20 changes: 12 additions & 8 deletions src/oge/emissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,11 +620,13 @@ def calculate_nox_from_fuel_consumption(
)
# if there were no season-specific rates, fill in using the annual rate if
# available
gen_fuel_allocated["controlled_nox_mass_lb"] = gen_fuel_allocated[
"controlled_nox_mass_lb"
].fillna(
gen_fuel_allocated["fuel_consumed_mmbtu"]
* gen_fuel_allocated["controlled_annual_nox_ef_lb_per_mmbtu"]
gen_fuel_allocated["controlled_nox_mass_lb"] = (
gen_fuel_allocated["controlled_nox_mass_lb"]
.astype(get_dtypes()["controlled_nox_mass_lb"])
.fillna(
gen_fuel_allocated["fuel_consumed_mmbtu"]
* gen_fuel_allocated["controlled_annual_nox_ef_lb_per_mmbtu"]
)
)
# update the emission total using the controlled mass if available
gen_fuel_allocated.update(
Expand Down Expand Up @@ -1365,9 +1367,11 @@ def calculate_so2_from_fuel_consumption(gen_fuel_allocated, year):
validate="m:1",
)
# assume all other generators have uncontrolled emissions
gen_fuel_allocated["so2_removal_efficiency_annual"] = gen_fuel_allocated[
"so2_removal_efficiency_annual"
].fillna(0)
gen_fuel_allocated["so2_removal_efficiency_annual"] = (
gen_fuel_allocated["so2_removal_efficiency_annual"]
.astype(get_dtypes()["so2_removal_efficiency_annual"])
.fillna(0)
)

# calculate controlled so2 emissions
gen_fuel_allocated["so2_mass_lb"] = gen_fuel_allocated["so2_mass_lb"] * (
Expand Down
167 changes: 165 additions & 2 deletions src/oge/helpers.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
import numpy as np
import pandas as pd

from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder
from urllib3.exceptions import ReadTimeoutError

from oge.column_checks import get_dtypes, apply_dtypes
from oge.constants import earliest_data_year, latest_validated_year
from oge.filepaths import reference_table_folder, outputs_folder

import oge.load_data as load_data
from oge.logging_util import get_logger
import oge.validation as validation

logger = get_logger(__name__)

tf = TimezoneFinder()
geolocator = Nominatim(user_agent="oge")


def create_plant_attributes_table(
cems: pd.DataFrame,
Expand Down Expand Up @@ -120,6 +128,9 @@ def create_plant_attributes_table(
# add geographical info
plant_attributes = add_plant_entity(plant_attributes)

# fill out missing location/coordinates
plant_attributes = add_missing_location(plant_attributes)

# add nameplate capacity
plant_attributes = add_plant_nameplate_capacity(year, plant_attributes)

Expand Down Expand Up @@ -259,7 +270,7 @@ def create_plant_ba_table(year: int) -> pd.DataFrame:
"core_eia__entity_plants", columns=["plant_id_eia", "state"]
)
plant_ba = plant_ba.merge(
plant_states, how="left", on="plant_id_eia", validate="m:1"
plant_states, how="outer", on="plant_id_eia", validate="m:1"
)

# load the ba name reference
Expand Down Expand Up @@ -568,11 +579,35 @@ def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame:
)

for c in eia860_info:
# Handle NAs
if complete_plants_entity[c].isna().sum() > 0:
complete_plants_entity[c] = complete_plants_entity[c].fillna(
complete_plants_entity[f"{c}_eia"]
)
complete_plants_entity = complete_plants_entity.drop(columns=f"{c}_eia")
# Handle positive longitude
if c == "longitude" and (complete_plants_entity[c] > 0).any():
# Replace if EIA-860 longitude is negative, otherwise flip the sign.
for i in complete_plants_entity[complete_plants_entity[c] > 0].index:
lat_eia = complete_plants_entity.loc[i, "latitude_eia"]
lon_eia = complete_plants_entity.loc[i, "longitude_eia"]
if lon_eia < 0:
complete_plants_entity.loc[i, "latitude"] = lat_eia
complete_plants_entity.loc[i, "longitude"] = lon_eia
# Otherwise flip the sign of longitude and keep PUDL latitude
else:
complete_plants_entity.loc[
i, "longitude"
] = -complete_plants_entity.loc[i, "longitude"]
# Get new timezone
complete_plants_entity.loc[i, "timezone"] = tf.timezone_at(
lng=complete_plants_entity.loc[i, "longitude"],
lat=complete_plants_entity.loc[i, "latitude"],
)

# Clean data frame
complete_plants_entity = complete_plants_entity.drop(
columns=[f"{c}_eia" for c in eia860_info]
)

df = df.merge(
complete_plants_entity, how="left", on=["plant_id_eia"], validate="m:1"
Expand All @@ -581,6 +616,134 @@ def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame:
return df


def add_missing_location(df: pd.DataFrame) -> pd.DataFrame:
    """Fill in missing coordinates and missing state/county/city where possible.

    The table is processed in two passes:

    1. Rows lacking a latitude or a longitude are geocoded from their 'state',
       'county' and 'city' values via `get_coordinates_of_location`.
    2. Rows lacking a state, county or city, but having both coordinates, are
       reverse geocoded via `search_location_from_coordinates`. Only the fields
       that are actually missing are overwritten.

    The data frame is modified in place and also returned.

    Args:
        df (pd.DataFrame): table with 'latitude', 'longitude', 'state', 'county' and
            'city' columns.

    Returns:
        pd.DataFrame: the same data frame with missing 'latitude', 'longitude',
            'state', 'county' and 'city' filled out when possible.
    """
    # Pass 1: derive coordinates from whatever place information the row has.
    # The lookup helper decides which combination of fields is usable.
    lacks_coordinates = df["longitude"].isna() | df["latitude"].isna()
    for row in df.index[lacks_coordinates.to_numpy()]:
        found_lat, found_lon = get_coordinates_of_location(
            df.loc[row, "state"],
            df.loc[row, "county"],
            df.loc[row, "city"],
        )
        df.loc[row, "latitude"] = found_lat
        df.loc[row, "longitude"] = found_lon

    # Pass 2: derive place information from coordinates. This runs after pass 1
    # so that freshly geocoded coordinates can be used as well.
    place_columns = ("state", "county", "city")
    lacks_place = df["state"].isna() | df["county"].isna() | df["city"].isna()
    for row in df.index[lacks_place.to_numpy()]:
        row_lat = df.loc[row, "latitude"]
        row_lon = df.loc[row, "longitude"]
        # Reverse geocoding needs both coordinates.
        if pd.isna(row_lat) or pd.isna(row_lon):
            continue

        looked_up = search_location_from_coordinates(row_lat, row_lon)
        # Never overwrite a value that is already known.
        for column, value in zip(place_columns, looked_up):
            if pd.isna(df.loc[row, column]):
                df.loc[row, column] = value

    return df


def search_location_from_coordinates(
    latitude: float, longitude: float
) -> tuple[str, str, str]:
    """Get state, county, city at latitude/longitude.

    The lookup is best effort: when the geocoding service times out or is
    unavailable, finds nothing at the coordinates, or places them outside the
    US, all three values are returned as `pd.NA`. Individual values are also
    `pd.NA` when the returned address does not carry them.

    Example:
        >>> latitude = 33.458665
        >>> longitude = -87.35682
        >>> location = geolocator.reverse(f"{latitude}, {longitude}").raw
        >>> location
        {'place_id': 149439, 'licence': 'Data © OpenStreetMap contributors,
        ODbL 1.0. http://osm.org/copyright', 'osm_type': 'way', 'osm_id': 8885591,
        'lat': '33.460586', 'lon': '-87.359444', 'class': 'highway',
        'type': 'unclassified', 'place_rank': 26, 'importance': 0.10000999999999993,
        'addresstype': 'road', 'name': 'County Road 38', 'display_name':
        'County Road 38, Tuscaloosa County, Alabama, United States', 'address':
        {'road': 'County Road 38', 'county': 'Tuscaloosa County', 'state': 'Alabama',
        'ISO3166-2-lvl4': 'US-AL', 'country': 'United States', 'country_code': 'us'},
        'boundingbox': ['33.4552300', '33.4758370', '-87.3873120', '-87.3589650']}

    Args:
        latitude (float): latitude of the location.
        longitude (float): longitude of the location.

    Returns:
        tuple[str, str, str]: state (the subdivision part of the ISO 3166-2 code,
            e.g. 'AL'), county (without its trailing designation, e.g.
            'Tuscaloosa') and city of the location.
    """
    # geopy reports transport problems through its own exception types rather
    # than urllib3's, so those must be caught as well. Imported locally because
    # the module-level imports only pull in the geocoder class.
    from geopy.exc import GeocoderTimedOut, GeocoderUnavailable

    not_found = (pd.NA, pd.NA, pd.NA)

    try:
        location = geolocator.reverse(f"{latitude}, {longitude}")
    except (ReadTimeoutError, GeocoderTimedOut, GeocoderUnavailable):
        return not_found

    # reverse() yields None when nothing is found at the coordinates.
    if location is None:
        return not_found

    address = location.raw.get("address", {})
    if address.get("country_code") != "us":
        return not_found

    # State: second component of the ISO 3166-2 code, e.g. 'US-AL' -> 'AL'.
    iso_parts = address.get("ISO3166-2-lvl4", "").split("-")
    state = iso_parts[1] if len(iso_parts) > 1 else pd.NA

    # County: drop only the trailing designation so that multi-word names such
    # as 'Los Angeles County' are kept whole instead of being cut at the first
    # space.
    county = address.get("county", pd.NA)
    if isinstance(county, str):
        for designation in (
            " County",
            " Parish",
            " Borough",
            " Census Area",
            " Municipality",
        ):
            if county.endswith(designation):
                county = county[: -len(designation)]
                break

    city = address.get("city", pd.NA)
    return state, county, city


def get_coordinates_of_location(
    state: str, county: str, city: str
) -> tuple[float, float]:
    """Use state, county and city information to get coordinates.

    The most specific query available is sent to the geocoder: city and state
    when the city is known, otherwise county and state, otherwise the state
    alone. Without a state no lookup is attempted.

    Example:
        >>> location = geolocator.geocode("Bucks, Mobile county, AL").raw
        >>> location
        {'place_id': 379392554, 'licence': 'Data © OpenStreetMap contributors,
        ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation',
        'osm_id': 17019769, 'lat': '31.01630685', 'lon': '-88.02448016014876',
        'class': 'boundary', 'type': 'census', 'place_rank': 25,
        'importance': 0.4106648553911631, 'addresstype': 'census', 'name': 'Bucks',
        'display_name': 'Bucks, Mobile County, Alabama, United States',
        'boundingbox': ['31.0072629', '31.0244919', '-88.0286929', '-88.0198599']}

    Args:
        state (str): state of the location.
        county (str): county of the location.
        city (str): city of the location.

    Returns:
        tuple[float, float]: the latitude and longitude. Both are NaN when the
            state is missing, when the geocoder finds no match, or when the
            geocoding service times out or is unavailable.
    """
    # `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
    no_coordinates = (np.nan, np.nan)

    if pd.isna(state):
        return no_coordinates

    if not pd.isna(city):
        query = f"{city}, {state}, USA"
    elif not pd.isna(county):
        query = f"{county} county, {state}, USA"
    else:
        query = f"{state}, USA"

    # geopy reports transport problems through its own exception types rather
    # than urllib3's, so those must be caught as well. Imported here, right
    # before the only code path that talks to the service.
    from geopy.exc import GeocoderTimedOut, GeocoderUnavailable

    try:
        location = geolocator.geocode(query, country_codes="us")
    except (ReadTimeoutError, GeocoderTimedOut, GeocoderUnavailable):
        return no_coordinates

    # geocode() yields None when the query has no match.
    if location is None:
        return no_coordinates
    return float(location.raw["lat"]), float(location.raw["lon"])


def add_subplant_ids_to_df(
df: pd.DataFrame,
year: int,
Expand Down

0 comments on commit 4d39b75

Please sign in to comment.