# Source code for h3.dataprocessing.extract_metadata

from __future__ import annotations

import os
import fnmatch
import json

import geopandas as gpd
import numpy as np
import pandas as pd

from shapely import wkt
from tqdm import tqdm

from typing import Literal
from h3.constants import DMG_CLASSES_DICT
from h3.utils.directories import get_metadata_pickle_dir, get_xbd_hlabel_dir, \
    get_xbd_dir


# filter hurricane label files (pre- or post-event) from the xBD label directories
def filter_files(files: list, filepath: str, search_criteria: str) -> list:
    """Filter the json label files and return the ones matching the search
    criteria, e.g. pre-event hurricane files.

    Parameters
    ----------
    files : list
        list of json files in the label directory.
    filepath : str
        path to the label directory, used to build the search pattern.
    search_criteria : str
        glob-style pattern selecting the hurricane event imagery in json
        format, e.g. `hurricane*pre*.json`; supports the wildcard `*`.

    Returns
    -------
    list
        list of files matching the search criteria.
    """
    list_of_files = []
    search_path = os.path.join(filepath, search_criteria)
    for f in files:
        if fnmatch.fnmatch(f, search_path):
            list_of_files.append(f)
    return list_of_files
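
# Illustrative sketch (not part of the original module): the file names below
# are assumptions chosen to mimic the xBD label naming convention, to show
# how filter_files matches a glob pattern against full paths.
def _example_filter_files() -> list:
    labels_dir = "geotiffs/hold/labels"
    files = [
        os.path.join(labels_dir, "hurricane-harvey_00000000_pre_disaster.json"),
        os.path.join(labels_dir, "hurricane-harvey_00000000_post_disaster.json"),
        os.path.join(labels_dir, "midwest-flooding_00000001_pre_disaster.json"),
    ]
    # keeps only the first entry: a hurricane, pre-event, json file
    return filter_files(files, labels_dir, "hurricane*pre*.json")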
def extract_point(building):
    """Extract the coordinate information from a building polygon and
    convert it to a centroid point.

    Parameters
    ----------
    building : dict
        building feature containing the polygon as a WKT string.

    Returns
    -------
    str
        centroid point of the polygon as a WKT string.
    """
    building_polygon = building["wkt"]
    building_coordinates = wkt.loads(building_polygon).centroid.wkt
    return building_coordinates
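
# Quick hedged illustration of extract_point on a made-up building feature
# (the WKT string is an assumption, not xBD data): the centroid of a
# 2 x 2 square at the origin is the point (1, 1).
def _example_extract_point() -> str:
    building = {"wkt": "POLYGON ((0 0, 2 0, 2 2, 0 2, 0 0))"}
    return extract_point(building)  # "POINT (1 1)"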
def extract_polygon(building):
    """Extract the polygon coordinate information of a building.

    Parameters
    ----------
    building : dict
        building feature containing the polygon as a WKT string.

    Returns
    -------
    str
        polygon with spatial coordinate information as a WKT string.
    """
    building_polygon = building["wkt"]
    return building_polygon
def extract_metadata(json_link: str, CLASSES_DICT: dict, crs: str,
                     event_type: str):
    """Extract the location in xy and long-lat format, the damage class,
    the disaster name and the capture date from a label json file.

    Parameters
    ----------
    json_link : str
        path to the json file containing the location and metadata.
    CLASSES_DICT : dict
        dictionary mapping damage classes (str) to damage numbers (int).
    crs : str
        coordinate reference system to put as geometry in the geodataframe,
        either "xy" or "lng_lat".
    event_type : str
        "pre" or "post", the type of event json file to read.

    Returns
    -------
    GeoDataFrame
        contains the polygons of the json file and the corresponding
        metadata.
    """
    with open(json_link, "r") as j:
        json_data = json.load(j)
    meta_data = json_data["metadata"]
    disaster_type = meta_data["disaster"]
    image_name = meta_data["img_name"]
    capture_date = meta_data["capture_date"]
    # for plotting on maps and finding environmental factors, use lng lat
    coordinates_lnglat = json_data["features"]["lng_lat"]
    # for coordination with imagery, use xy coordinates
    coordinates_xy = json_data["features"]["xy"]
    damage_location = []
    for building_lnglat, building_xy in zip(coordinates_lnglat,
                                            coordinates_xy):
        lnglat_point = extract_point(building_lnglat)
        lnglat_polygon = extract_polygon(building_lnglat)
        xy_point = extract_point(building_xy)
        xy_polygon = extract_polygon(building_xy)
        # pre-event labels carry no damage class; post-event labels do
        # (arbitrary whether the subtype is taken from xy or lng_lat features)
        if event_type == "pre":
            damage_num = np.nan
        else:
            damage_class = building_lnglat["properties"]["subtype"]
            damage_num = CLASSES_DICT[damage_class]
        damage_location.append([
            lnglat_point, lnglat_polygon, xy_point, xy_polygon, damage_num,
            disaster_type, image_name, capture_date, json_link])
    if crs == "xy":
        df = gpd.GeoDataFrame(
            damage_location,
            columns=["point_lnglat", "polygon_lnglat", "point_xy", "geometry",
                     "damage_class", "disaster_name", "image_name",
                     "capture_date", "json_link"])
    else:
        df = gpd.GeoDataFrame(
            damage_location,
            columns=["geometry", "polygon_lnglat", "point_xy", "polygon_xy",
                     "damage_class", "disaster_name", "image_name",
                     "capture_date", "json_link"])
    df["capture_date"] = pd.to_datetime(df["capture_date"])
    df["geometry"] = df["geometry"].apply(wkt.loads)
    return df
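
# A minimal usage sketch for extract_metadata. The path below is an
# assumption following the xBD naming convention, not a file shipped with
# this repository; it only illustrates the expected call.
def _example_extract_metadata() -> gpd.GeoDataFrame:
    label_file = ("geotiffs/hold/labels/"
                  "hurricane-florence_00000001_post_disaster.json")
    # with crs="lng_lat" the geometry column holds the lng-lat centroid
    # points and damage_class holds the mapped damage numbers
    return extract_metadata(label_file, DMG_CLASSES_DICT,
                            crs="lng_lat", event_type="post")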
def extract_damage_allfiles_separate(filepaths_dict: dict, crs: str,
                                     event: Literal["pre", "post"]):
    """Filter all label files for hurricanes, extract the metadata and
    concatenate the files, keeping post- and pre-event images separate.

    Parameters
    ----------
    filepaths_dict : dict
        .json files in the xBD data folder to filter, organised by folder.
        The files are the values for the holdout, tier1, tier3 and test
        folder keys.
    crs : str
        coordinate reference system to put as geometry in the geodataframe.
    event : {"pre", "post"}
        post- or pre-event json files to filter out.

    Returns
    -------
    GeoDataFrame
        a geodataframe with a summary of metadata for all hurricane events
        with labels of the requested event type.
    """
    if event == "pre":
        search_criterium = "hurricane*pre*.json"
    if event == "post":
        search_criterium = "hurricane*post*.json"
    dataframes_list = []
    for directory in filepaths_dict:
        filepath_list = filepaths_dict[directory]
        full_hurr_json_files = filter_files(filepath_list, directory,
                                            search_criterium)
        if len(full_hurr_json_files) > 0:
            for file in tqdm(full_hurr_json_files,
                             desc=f"Extracting metadata for {event} "
                                  "hurricane"):
                loc_and_damage_df = extract_metadata(file, DMG_CLASSES_DICT,
                                                     crs, event)
                dataframes_list.append(loc_and_damage_df)
    rdf = gpd.GeoDataFrame(pd.concat(dataframes_list, ignore_index=True))
    return rdf
# check if polygons from pre and post overlap
def overlapping_polygons(geoms, p):
    """Check which polygons from post-event imagery overlap a pre-event
    polygon. If they overlap, the damage class from the post-event
    dataframe can be allocated to the pre-event polygon.

    Parameters
    ----------
    geoms : GeoSeries
        post-event geodataframe geometry column containing the polygons.
    p : Polygon
        pre-event polygon extracted from the geodataframe.

    Returns
    -------
    Series
        row numbers of the post-event dataframe whose polygon matches the
        pre-event polygon (more than 70% overlap).
    """
    overlap = (geoms.intersection(p).area / p.area) > 0.7
    return pd.Series(overlap.index[overlap])
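
# A self-contained sketch of the 70% overlap criterion used above, with
# made-up square polygons rather than xBD geometries.
def _example_overlapping_polygons() -> pd.Series:
    from shapely.geometry import box
    post_geoms = gpd.GeoSeries([box(0, 0, 1, 1), box(5, 5, 6, 6)])
    pre_polygon = box(0.1, 0.1, 1.1, 1.1)
    # only the first post polygon covers more than 70% of the pre polygon's
    # area (0.81), so the returned Series contains only index 0
    return overlapping_polygons(post_geoms, pre_polygon)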
def extract_damage_allfiles_ensemble(filepaths_dict: dict, crs: str):
    """Filter all pre- and post-event label files for hurricanes and extract
    the metadata from both. The damage information from the post-event file
    is added to the pre-event metadata dataframe.

    Parameters
    ----------
    filepaths_dict : dict
        .json files in the xBD data folder to filter, organised by folder.
        The files are the values for the holdout, tier1, tier3 and test
        folder keys.
    crs : str
        coordinate reference system to put as geometry in the geodataframe.

    Returns
    -------
    GeoDataFrame
        geodataframe with a summary of metadata for all pre-event hurricane
        events with post-event damage labels.
    """
    full_pre_dataframes_list = []
    for directory in filepaths_dict:
        full_pre_hurr_json_files = filter_files(filepaths_dict[directory],
                                                directory,
                                                "hurricane*pre*.json")
        for pre_json_name in tqdm(
                full_pre_hurr_json_files,
                desc="Extracting metadata for pre event and "
                     "post damage label hurricane"):
            post_json_name = pre_json_name.replace("pre", "post")
            pre_metadata = extract_metadata(
                pre_json_name, DMG_CLASSES_DICT, crs, "pre")
            post_metadata = extract_metadata(
                post_json_name, DMG_CLASSES_DICT, crs, "post")
            # Assume the order of the polygons in the pre and post json data
            # is the same; otherwise, use the overlapping_polygons function,
            # which indicates which polygons overlap and are therefore
            # pre and post pairs:
            # post_metadata["match_num"] = pre_metadata.geometry.apply(
            #     lambda x: overlapping_polygons(post_metadata, x))
            post_metadata["match_num"] = post_metadata.index
            merge_post_metadata = post_metadata[["damage_class", "match_num"]]
            pre_metadata["match_num"] = pre_metadata.index
            pre_metadata = pre_metadata.drop("damage_class", axis=1)
            polygons_pre = pre_metadata.merge(merge_post_metadata,
                                              on="match_num")
            polygons_pre = polygons_pre.drop(["match_num"], axis=1)
            full_pre_dataframes_list.append(polygons_pre)
    pre_rdf = gpd.GeoDataFrame(pd.concat(full_pre_dataframes_list,
                                         ignore_index=True))
    return pre_rdf
def load_and_save_df(filepaths_dict: dict, output_dir: str,
                     reload_pickle: bool = False):
    """Load the json label files for all hurricanes in the xBD data, extract
    the points and polygons both in xy coordinates, referring to the
    corresponding imagery file, and in longitude and latitude, and pickle
    the resulting geodataframes.

    Parameters
    ----------
    filepaths_dict : dict
        pathnames in a dictionary for the holdout, tier1, tier3 and test
        folders.
    output_dir : str
        directory in which the pickled geodataframes are saved.
    reload_pickle : bool, optional
        If True, recreate the pickle files as if they did not exist.
        The default is False.

    Returns
    -------
    tuple of GeoDataFrame
        all metadata and locations in geodataframes saved in the
        data/datasets/EFs directory. The geodataframes with pre-event
        polygons and post-event damage are returned, in xy and in long-lat
        coordinates, as these are the most useful for choosing the EFs.
    """
    # update the filepaths dictionary with the files in each directory
    for filepath in filepaths_dict:
        directory_files = [os.path.join(filepath, file)
                           for file in os.listdir(filepath)]
        filepaths_dict[filepath] = directory_files

    path_save_post = os.path.join(output_dir, "pre_polygon.pkl")
    if not os.path.exists(path_save_post) or reload_pickle:
        df_points_post_hurr = extract_damage_allfiles_separate(
            filepaths_dict=filepaths_dict,
            crs="xy",
            event="pre"
        )
        df_points_post_hurr.to_pickle(path_save_post)
    else:
        df_points_post_hurr = pd.read_pickle(path_save_post)

    # TODO: validate this
    path_save_pre = os.path.join(output_dir, "xy_pre_pol_post_damage.pkl")
    if not os.path.exists(path_save_pre) or reload_pickle:
        df_pre_post_hurr_xy = extract_damage_allfiles_ensemble(
            filepaths_dict=filepaths_dict,
            crs="xy"
        )
        df_pre_post_hurr_xy.to_pickle(path_save_pre)
    else:
        df_pre_post_hurr_xy = pd.read_pickle(path_save_pre)

    path_save_pre_longlat = os.path.join(output_dir,
                                         "lnglat_pre_pol_post_damage.pkl")
    if not os.path.exists(path_save_pre_longlat) or reload_pickle:
        df_pre_post_hurr_ll = extract_damage_allfiles_ensemble(
            filepaths_dict=filepaths_dict,
            crs="lng_lat"
        )
        df_pre_post_hurr_ll.to_pickle(path_save_pre_longlat)
    else:
        df_pre_post_hurr_ll = pd.read_pickle(path_save_pre_longlat)
    return df_pre_post_hurr_xy, df_pre_post_hurr_ll
def main():
    xbd_dir = get_xbd_dir()
    output_dir = get_metadata_pickle_dir()
    # hold_filepath = get_xbd_hlabel_dir()
    hold_filepath = os.path.join(xbd_dir, "geotiffs/hold/labels")
    tier1_filepath = os.path.join(xbd_dir, "geotiffs/tier1/labels")
    tier3_filepath = os.path.join(xbd_dir, "geotiffs/tier3/labels")
    test_filepath = os.path.join(xbd_dir, "geotiffs/test/labels")
    filepaths_dict = dict.fromkeys([hold_filepath, tier1_filepath,
                                    tier3_filepath, test_filepath])
    df_pre_post_hurr_xy, df_pre_post_hurr_ll = load_and_save_df(
        filepaths_dict, output_dir)
    return df_pre_post_hurr_xy, df_pre_post_hurr_ll
if __name__ == "__main__":
    main()