Source code for h3.dataloading.noaa_six_hourly_processing

from __future__ import annotations

import pandas as pd
import numpy as np
import re


[docs]def convert_lat_lon(
	coord: str
) -> str:
	"""Convert lat/long of type 00N/S to +/-"""

	if 'S' in coord or 'W' in coord:
		val = '-' + (coord.translate({ord(i): '' for i in 'SW'})).strip()
		return val
	else:
		return coord.translate({ord(i): '' for i in 'NE'})


[docs]def preprocess_noaa_textfile(
	data: list
) -> list:
	"""Some data preprocessing before reading into pandas df.
	assigning event to each row, deleting headers, reformatting lat/long.
	Must have been read in from standard new NOAA .txt file format."""
	reformatted_data = []
	for i, line in enumerate(data):
		split_line = line.split(',')
		if re.search('[a-z]', split_line[0].lower()):
			line = ','.join([el.strip() for el in split_line])
			header = line
		else:
			split_line[4], split_line[5] = convert_lat_lon(
				split_line[4]), convert_lat_lon(split_line[5])
			reformatted_data.append(''.join((header, ','.join(split_line))))

	return reformatted_data


[docs]def reformat_noaa_df(
	df: pd.DataFrame
) -> pd.DataFrame:
	"""Tidy up data types in pd.DataFrame"""

	# convert columns to correct data type
	numeric_cols = df.columns.drop(
		['tag', 'name', 'date', 'time', 'record_id', 'sys_status'])
	df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)
	# calculate storm intensity
	df['strength'] = df['max_sust_wind'].apply(windspeed_to_strength_category).astype('Int64')
	# combine date and time and correct format
	df['date'] = (df[['date', 'time']].agg(' '.join, axis=1)).apply(
		pd.to_datetime)
	# then drop time column
	df.drop('time', axis=1, inplace=True)
	# replace -999 values (shorthand for no data) with NaNs
	df.replace(-999, np.NaN, inplace=True)

	return df


[docs]def windspeed_to_strength_category(
	val: float | int
) -> bool | int:
	"""Assign an intensity value based on maximum sustained wind speed

	Parameters
	----------
		val : float | int
			numerical value to be compared
	Returns
	-------
		int
			storm categorisation
	"""
	wind_thresholds = [0, 64, 83, 96, 113, 137][::-1]
	cats = [0, 1, 2, 3, 4, 5][::-1]

	if np.isnan(val):
		return np.NaN
	else:
		for i, thresh in enumerate(wind_thresholds):
			if val >= thresh:
				return cats[i]


[docs]def return_most_recent_events_by_name(
	df: pd.DataFrame,
	event_names: list[str]
) -> pd.DataFrame:
	"""Returns the df containing the data for the most recent occurence of each
	event included in 'names'. df must have a 'date' column to judge most recent

	Parameters
	----------
	Returns
	-------
	restricted pd.DataFrame

	TODO: make this more flexible for selecting events
	"""
	# restrict to requested names only
	df_lim = df.loc[df['name'].isin(event_names)]
	# order df by date
	df_sorted = df_lim.sort_values(['name', 'date'], ascending=[True, False])
	# extract unique tags for most recent events
	recent_tags = df_sorted.groupby('name').first().tag

	return df_sorted.loc[df['tag'].isin(recent_tags)]