from __future__ import annotations
import pandas as pd
import numpy as np
import re
def convert_lat_lon(
    coord: str
) -> str:
    """Convert a hemisphere-suffixed coordinate string to a signed one.

    Parameters
    ----------
    coord : str
        Coordinate text carrying a hemisphere letter, e.g. ``'28.0N'`` or
        ``' 94.8W'``.

    Returns
    -------
    str
        The coordinate with the hemisphere letter removed and surrounding
        whitespace stripped. Values containing ``S`` or ``W`` are prefixed
        with ``'-'``; all others only have ``N``/``E`` removed.
    """
    if 'S' in coord or 'W' in coord:
        return '-' + coord.translate(str.maketrans('', '', 'SW')).strip()
    # Strip here as well so both hemispheres yield identically formatted
    # output (previously only the S/W branch removed padding).
    return coord.translate(str.maketrans('', '', 'NE')).strip()
def preprocess_noaa_textfile(
    data: list
) -> list:
    """Attach the owning event header to every data row of a NOAA text file.

    Header rows are dropped from the output; each data row is emitted with
    the most recent header prepended and its latitude/longitude (fields 4
    and 5) converted from hemisphere-suffixed to signed form.

    Parameters
    ----------
    data : list
        Lines of the file as strings, in file order.

    Returns
    -------
    list
        One string per data row: the stripped header fields joined by
        commas, immediately followed by the comma-joined data fields.

    Raises
    ------
    ValueError
        If a data row is encountered before any header row.
    IndexError
        If a non-blank data row has fewer than six comma-separated fields.

    Notes
    -----
    The header and the data row are concatenated with no separator added,
    so the header is expected to end with a comma of its own (presumably
    true for the standard NOAA format -- confirm against the input files).
    """
    reformatted_data = []
    header = None
    for line in data:
        # Blank lines (e.g. a trailing newline at end of file) hold no
        # record; without this guard they would hit the data branch and
        # fail on the missing lat/lon fields.
        if not line.strip():
            continue
        split_line = line.split(',')
        # A row is treated as a header when its first field contains letters.
        if re.search('[a-z]', split_line[0].lower()):
            header = ','.join([el.strip() for el in split_line])
            continue
        if header is None:
            raise ValueError(
                'data row encountered before any header row: {!r}'.format(line))
        split_line[4] = convert_lat_lon(split_line[4])
        split_line[5] = convert_lat_lon(split_line[5])
        reformatted_data.append(''.join((header, ','.join(split_line))))
    return reformatted_data
def windspeed_to_strength_category(
    val: float | int
) -> float | int | None:
    """Assign an intensity category based on maximum sustained wind speed.

    Parameters
    ----------
    val : float | int
        Wind speed to categorise.

    Returns
    -------
    float | int | None
        ``nan`` when ``val`` is NaN; otherwise the index (0-5) of the highest
        threshold in ``(0, 64, 83, 96, 113, 137)`` that ``val`` reaches;
        ``None`` when ``val`` is below every threshold (i.e. negative).

    Notes
    -----
    The thresholds look like the Saffir-Simpson category boundaries in
    knots -- confirm the units of the data being passed in.
    """
    # Lower bound of each category; the position in the tuple is the category.
    wind_thresholds = (0, 64, 83, 96, 113, 137)
    if np.isnan(val):
        # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works on all versions.
        return np.nan
    # Scan from the strongest category down so the first hit is the highest.
    for category in range(len(wind_thresholds) - 1, -1, -1):
        if val >= wind_thresholds[category]:
            return category
    # Negative speeds match no category; kept as None for compatibility.
    return None
def return_most_recent_events_by_name(
    df: pd.DataFrame,
    event_names: list[str]
) -> pd.DataFrame:
    """Return the rows of the most recent occurrence of each named event.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``'name'``, ``'date'`` and ``'tag'`` columns. ``'date'``
        is used to decide which occurrence is most recent; ``'tag'`` is
        used to select all rows of that occurrence.
    event_names : list[str]
        Values of the ``'name'`` column to keep.

    Returns
    -------
    pd.DataFrame
        Rows of ``df`` whose ``'tag'`` matches the tag of the latest-dated
        row for each requested name, sorted by name (ascending) and then
        date (descending).

    TODO: make this more flexible for selecting events
    """
    # restrict to requested names only
    df_lim = df.loc[df['name'].isin(event_names)]
    # newest row of every name comes first within its group
    df_sorted = df_lim.sort_values(['name', 'date'], ascending=[True, False])
    # tag of the newest row per name identifies the most recent event
    recent_tags = df_sorted.groupby('name')['tag'].first()
    # Build the mask from df_sorted itself: a mask taken from the unfiltered
    # df relies on index alignment and breaks on duplicate index labels.
    return df_sorted.loc[df_sorted['tag'].isin(recent_tags)]