Source code for h3.dataprocessing.extract_metadata
from __future__ import annotations
import os
import fnmatch
import json
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely import wkt
from tqdm import tqdm
from typing import Literal
from h3.constants import DMG_CLASSES_DICT
from h3.utils.directories import get_metadata_pickle_dir, get_xbd_hlabel_dir, \
get_xbd_dir
# extract pre-event hurricane imagery
def filter_files(files: list, filepath: str, search_criteria: str) -> list:
    """Filter a list of json label files and return the files matching the
    search criteria, e.g. pre- or post-event hurricane files.
    Parameters
    ----------
    files : list
        list of json files in the label directory
    filepath : str
        path to the label directory, used to build the search pattern
    search_criteria : str
        glob pattern to filter for hurricane pre- or post-event imagery in
        json format, e.g. `hurricane*pre*.json`; supports the glob
        wildcard `*`.
    Returns
    -------
    list
        list of files matching the search criteria.
"""
list_of_files = []
search_path = os.path.join(filepath, search_criteria)
for f in files:
if fnmatch.fnmatch(f, search_path):
list_of_files.append(f)
return list_of_files
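# Hedged usage sketch (not part of the original module): narrow a directory
# listing down to post-event hurricane label files. The directory path and
# filenames are hypothetical.
#
#   labels_dir = "/path/to/xBD/geotiffs/hold/labels"
#   files = [os.path.join(labels_dir, f) for f in os.listdir(labels_dir)]
#   post_files = filter_files(files, labels_dir, "hurricane*post*.json")
#   # e.g. [".../hurricane-harvey_00000001_post_disaster.json", ...]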
def extract_point(building):
    """Extract the polygon of a building feature and convert it to a
    centroid point.
    Parameters
    ----------
    building : dict
        feature dictionary containing the building polygon as a WKT string
        under the "wkt" key.
    Returns
    -------
    str
        centroid of the polygon as a WKT point string.
    """
building_polygon = building["wkt"]
building_coordinates = wkt.loads(building_polygon).centroid.wkt
return building_coordinates
def extract_polygon(building):
    """Extract the building polygon as a WKT string.
    Parameters
    ----------
    building : dict
        feature dictionary containing the building polygon as a WKT string
        under the "wkt" key.
    Returns
    -------
    str
        polygon as a WKT string.
    """
building_polygon = building["wkt"]
return building_polygon
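# Hedged usage sketch (not part of the original module): the feature dict
# below is a hypothetical, simplified xBD building entry.
#
#   building = {"wkt": "POLYGON ((0 0, 2 0, 2 2, 0 2, 0 0))"}
#   extract_polygon(building)  # -> "POLYGON ((0 0, 2 0, 2 2, 0 2, 0 0))"
#   extract_point(building)    # -> "POINT (1 1)"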
def extract_metadata(json_link: str, CLASSES_DICT: dict, crs: str,
                     event_type: str):
    """
    Extracts the building locations in xy and long-lat format, together with
    the damage class, disaster name, image name and capture date.
    Parameters
    ----------
    json_link : str
        path to the json file containing location and metadata.
    CLASSES_DICT : dict
        dictionary mapping damage classes (str) to damage numbers (int).
    crs : str
        coordinate set to use as the geometry column of the geodataframe,
        either "xy" (image coordinates) or long-lat.
    event_type : str
        "pre" or "post", the type of event json file being read.
    Returns
    -------
    geopandas.GeoDataFrame
        polygons of the json file with the corresponding metadata.
    """
    with open(json_link, "r") as j:
        json_data = json.load(j)
meta_data = json_data["metadata"]
disaster_type = meta_data["disaster"]
image_name = meta_data["img_name"]
capture_date = meta_data["capture_date"]
# for plotting on maps and finding environmental factors, use lng lat
coordinates_lnlat = json_data["features"]["lng_lat"]
# for coordination with imagery, use xy coordinates
coordinates_xy = json_data["features"]["xy"]
damage_location = []
for (building_lnglat, building_xy) in zip(coordinates_lnlat,
coordinates_xy):
lnglat_point = extract_point(building_lnglat)
lnglat_polygon = extract_polygon(building_lnglat)
xy_point = extract_point(building_xy)
xy_polygon = extract_polygon(building_xy)
        # it is arbitrary whether the damage subtype is taken from the xy
        # or the long-lat features
        if event_type == "pre":
            damage_num = np.nan
else:
damage_class = building_lnglat["properties"]["subtype"]
damage_num = CLASSES_DICT[damage_class]
damage_location.append([
lnglat_point, lnglat_polygon, xy_point,
xy_polygon, damage_num, disaster_type, image_name, capture_date,
json_link])
if crs == "xy":
df = gpd.GeoDataFrame(
damage_location,
columns=["point_lnglat", "polygon_lnglat", "point_xy", "geometry",
"damage_class", "disaster_name", "image_name",
"capture_date", "json_link"])
else:
df = gpd.GeoDataFrame(
damage_location,
columns=["geometry", "polygon_lnglat", "point_xy", "polygon_xy",
"damage_class", "disaster_name", "image_name",
"capture_date", "json_link"])
df["capture_date"] = pd.to_datetime(df["capture_date"])
df["geometry"] = df["geometry"].apply(wkt.loads)
return df
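# Hedged usage sketch (not part of the original module): the file path below
# is hypothetical.
#
#   label_path = "/path/to/xBD/geotiffs/hold/labels/hurricane-michael_00000366_post_disaster.json"
#   gdf = extract_metadata(label_path, DMG_CLASSES_DICT, crs="xy", event_type="post")
#   # with crs="xy" the geometry column holds the xy polygons:
#   # columns: point_lnglat, polygon_lnglat, point_xy, geometry,
#   #          damage_class, disaster_name, image_name, capture_date, json_link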
def extract_damage_allfiles_separate(filepaths_dict: dict,
                                     crs: str, event: Literal["pre", "post"]):
    """
    Filters the label files for hurricanes, extracts the metadata and
    concatenates all files of the requested event type (pre or post).
    Parameters
    ----------
    filepaths_dict : dict
        .json files in the xBD data folder to filter, organised by folder.
        The files are the values; the holdout, tier1, tier3 and test folders
        are the keys.
    crs : str
        coordinate set to use as the geometry column of the geodataframe.
    event : str
        "pre" or "post", the type of event json files to filter.
    Returns
    -------
    geopandas.GeoDataFrame
        a summary of metadata for all labelled hurricane events of the
        requested event type.
    """
if event == "pre":
search_criterium = "hurricane*pre*.json"
if event == "post":
search_criterium = "hurricane*post*.json"
dataframes_list = []
for directory in filepaths_dict:
filepath_list = (filepaths_dict[directory])
full_hurr_json_files = filter_files(filepath_list,
directory,
search_criterium)
if len(full_hurr_json_files) > 0:
            for file in tqdm(full_hurr_json_files,
                             desc=f"Extracting metadata for {event} "
                                  "hurricane"):
loc_and_damage_df = extract_metadata(file, DMG_CLASSES_DICT,
crs, event)
dataframes_list.append(loc_and_damage_df)
rdf = gpd.GeoDataFrame(pd.concat(dataframes_list,
ignore_index=True))
return rdf
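# Hedged usage sketch (not part of the original module): paths are hypothetical.
#
#   labels_dir = "/path/to/xBD/geotiffs/hold/labels"
#   filepaths_dict = {
#       labels_dir: [os.path.join(labels_dir, f) for f in os.listdir(labels_dir)]
#   }
#   post_gdf = extract_damage_allfiles_separate(filepaths_dict, crs="xy",
#                                               event="post")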
# check if polygons from pre and post overlap
def overlapping_polygons(geoms, p):
    """Checks if polygons from pre- and post-event imagery overlap.
    If they do, the damage class from the post-event dataframe can be
    allocated to the pre-event polygon.
    Parameters
    ----------
    geoms : series
        post-event geodataframe geometry column containing the polygons
    p : object
        pre-event polygon extracted from the geodataframe
    Returns
    -------
    series
        indices of the post-event polygons whose intersection with the
        pre-event polygon covers more than 70 % of its area.
    """
overlap = (geoms.intersection(p).area / p.area) > 0.7
return pd.Series(overlap.index[overlap])
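# Hedged usage sketch (not part of the original module), using two toy
# polygons rather than real xBD footprints:
#
#   from shapely.geometry import Polygon
#   post_geoms = gpd.GeoSeries([Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
#                               Polygon([(5, 5), (6, 5), (6, 6), (5, 6)])])
#   pre_poly = Polygon([(0.1, 0), (1, 0), (1, 1), (0.1, 1)])
#   overlapping_polygons(post_geoms, pre_poly)  # -> Series containing index 0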
def extract_damage_allfiles_ensemble(
        filepaths_dict: dict,
        crs: str,
):
    """
    Filters all pre- and post-event label files for hurricanes and extracts
    the metadata from both. The damage information from the post-event files
    is added to the pre-event metadata dataframe.
    Parameters
    ----------
    filepaths_dict : dict
        .json files in the xBD data folder to filter, organised by folder.
        The files are the values; the holdout, tier1, tier3 and test folders
        are the keys.
    crs : str
        coordinate set to use as the geometry column of the geodataframe.
    Returns
    -------
    geopandas.GeoDataFrame
        a summary of metadata for all pre-event hurricane imagery with
        post-event damage labels.
    """
full_pre_dataframes_list = []
for directory in filepaths_dict:
full_pre_hurr_json_files = filter_files(filepaths_dict[directory],
directory,
"hurricane*pre*.json")
        for pre_json_name in tqdm(
                full_pre_hurr_json_files,
                desc="Extracting metadata for pre event and post damage "
                     "label hurricane"):
post_json_name = pre_json_name.replace("pre", "post")
pre_metadata = extract_metadata(
pre_json_name, DMG_CLASSES_DICT, crs, "pre")
post_metadata = extract_metadata(
post_json_name, DMG_CLASSES_DICT, crs, "post")
# which row matches with which?
# post_metadata["match_num"] = pre_metadata.geometry.apply(
# lambda x: overlapping_polygons(post_metadata, x))
post_metadata["match_num"] = post_metadata.index
merge_post_metadata = post_metadata[["damage_class", "match_num"]]
pre_metadata["match_num"] = pre_metadata.index
pre_metadata = pre_metadata.drop("damage_class", axis=1)
            # Assume the order of polygons in the pre and post json data is
            # the same; otherwise, use the overlapping_polygons function,
            # which indicates which polygons overlap and are therefore
            # pre and post pairs
polygons_pre = pre_metadata.merge(merge_post_metadata,
on="match_num")
polygons_pre = polygons_pre.drop(["match_num"], axis=1)
full_pre_dataframes_list.append(polygons_pre)
pre_rdf = gpd.GeoDataFrame(pd.concat(full_pre_dataframes_list,
ignore_index=True))
return pre_rdf
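# Hedged usage sketch (not part of the original module): paths are hypothetical,
# and matching "pre"/"post" json pairs are assumed to exist in the folder.
#
#   labels_dir = "/path/to/xBD/geotiffs/hold/labels"
#   filepaths_dict = {
#       labels_dir: [os.path.join(labels_dir, f) for f in os.listdir(labels_dir)]
#   }
#   pre_with_damage = extract_damage_allfiles_ensemble(filepaths_dict, crs="xy")
#   # pre-event polygons with the post-event "damage_class" column merged in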
def load_and_save_df(filepaths_dict: dict, output_dir: str,
                     reload_pickle: bool = False):
    """
    Loads the json label files for all hurricanes in the given xBD folders,
    extracts the points and polygons in xy coordinates, referring to the
    corresponding imagery file, and in longitude and latitude.
    Parameters
    ----------
    filepaths_dict : dict
        pathnames in a dictionary for the holdout, tier1, tier3 and test
        folders.
    output_dir : str
        directory in which the pickled geodataframes are saved and reloaded.
    reload_pickle : bool, optional
        If True, recreate the pickle files as if they did not exist.
        The default is False.
    Returns
    -------
    geopandas.GeoDataFrame
        all metadata and locations in geodataframes saved as pickles in
        `output_dir`. The pre-event polygons with post-event damage classes
        are returned in both the xy and the long-lat coordinate systems,
        as these are most useful for choosing the EFs.
    """
# update filepaths_dictionary
for filepath in filepaths_dict:
directory_files = [os.path.join(filepath, file)
for file in os.listdir(filepath)]
filepaths_dict[filepath] = directory_files
    path_save_pre_polygon = os.path.join(output_dir, "pre_polygon.pkl")
    if not os.path.exists(path_save_pre_polygon) or reload_pickle:
        df_polygons_pre_hurr = extract_damage_allfiles_separate(
            filepaths_dict=filepaths_dict,
            crs="xy",
            event="pre"
        )
        df_polygons_pre_hurr.to_pickle(path_save_pre_polygon)
    else:
        df_polygons_pre_hurr = pd.read_pickle(path_save_pre_polygon)  # TODO: validate this
path_save_pre = os.path.join(output_dir, "xy_pre_pol_post_damage.pkl")
if not os.path.exists(path_save_pre) or reload_pickle:
df_pre_post_hurr_xy = extract_damage_allfiles_ensemble(
filepaths_dict=filepaths_dict,
crs="xy"
)
df_pre_post_hurr_xy.to_pickle(path_save_pre)
else:
df_pre_post_hurr_xy = pd.read_pickle(path_save_pre)
path_save_pre_longlat = os.path.join(output_dir, "lnglat_pre_pol_post_damage.pkl")
if not os.path.exists(path_save_pre_longlat) or reload_pickle:
df_pre_post_hurr_ll = extract_damage_allfiles_ensemble(
filepaths_dict=filepaths_dict,
crs="lng_lat"
)
df_pre_post_hurr_ll.to_pickle(path_save_pre_longlat)
else:
df_pre_post_hurr_ll = pd.read_pickle(path_save_pre_longlat)
return df_pre_post_hurr_xy, df_pre_post_hurr_ll
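# Hedged usage sketch (not part of the original module): paths are hypothetical.
#
#   labels_dir = "/path/to/xBD/geotiffs/hold/labels"
#   filepaths_dict = dict.fromkeys([labels_dir])
#   df_xy, df_ll = load_and_save_df(filepaths_dict, "/path/to/output",
#                                   reload_pickle=True)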
def main():
    xbd_dir = get_xbd_dir()
    output_dir = get_metadata_pickle_dir()
# hold_filepath = get_xbd_hlabel_dir()
hold_filepath = os.path.join(xbd_dir, "geotiffs/hold/labels")
tier1_filepath = os.path.join(xbd_dir, "geotiffs/tier1/labels")
tier3_filepath = os.path.join(xbd_dir, "geotiffs/tier3/labels")
test_filepath = os.path.join(xbd_dir, "geotiffs/test/labels")
filepaths_dict = dict.fromkeys([hold_filepath, tier1_filepath,
tier3_filepath, test_filepath])
df_pre_post_hurr_xy, df_pre_post_hurr_ll = load_and_save_df(filepaths_dict, output_dir)
return df_pre_post_hurr_xy, df_pre_post_hurr_ll
if __name__ == "__main__":
main()