# Source code for h3.dataprocessing.pre_processing

import numpy as np
import pandas as pd

import os
import rasterio as rio
# import cv2
# import scipy.ndimage
# from os.path import exists

from tqdm import tqdm
# from threading import Thread
# from h3 import logger
# from h3.constants import DMG_CLASSES_DICT
from h3.utils.directories import get_metadata_pickle_dir, get_xbd_hlabel_dir
from h3.utils.directories import get_xbd_dir, get_data_dir, get_processed_data_dir
from h3.dataprocessing.extract_metadata import load_and_save_df
from h3.dataprocessing.crop_images_img import crop_images


def _image_path_for_label(json_path):
    """Return the GeoTIFF path that corresponds to a label JSON path.

    Only the file extension and the directory that directly contains the
    label file are rewritten, so a "json" or "labels" substring elsewhere in
    the path (for example in a user or project folder name) is left alone.
    """
    label_dir, json_name = os.path.split(json_path)
    stem = os.path.splitext(json_name)[0]
    parent_dir, label_dir_name = os.path.split(label_dir)
    if label_dir_name == "labels":
        image_dir = os.path.join(parent_dir, "images")
    else:
        # Unexpected layout: keep the legacy textual substitution.
        image_dir = label_dir.replace("labels", "images")
    return os.path.join(image_dir, stem + ".tif")


def image_loading(polygons_df, zoom_levels: list, pixel_num: int,
                  zoomdir_dict: dict):
    """Load each image once and crop every building at every zoom level.

    For every distinct ``image_name`` the matching ``.tif`` file is opened
    and ``crop_images`` is called once per building polygon and zoom level.
    The output file is named ``<index>.png`` and placed in the directory
    registered for that zoom level in ``zoomdir_dict``.

    Parameters
    ----------
    polygons_df : geopandas dataframe
        Dataframe containing metadata information about the pre-event
        polygons, combined with the damage class from the post-event data.
        The reference system is "xy" referring to the corresponding image
        file. The columns ``image_name``, ``json_link`` and ``geometry`` are
        read. An ``index`` column holding the row position is added to (or
        overwritten in) this dataframe in place.
    zoom_levels : list
        List containing all required zoom levels.
    pixel_num : int
        Value of the sides of the squared cropped image as input for the
        model.
    zoomdir_dict : dict
        Dictionary mapping each zoom level to the directory in which the
        crops for that zoom level are written.

    Returns
    -------
    None
    """
    # The row position doubles as the output file name of each building.
    polygons_df["index"] = np.arange(len(polygons_df))
    if polygons_df.empty:
        # Nothing to crop; avoid setting up progress bars for no work.
        return

    # Value forwarded to crop_images as ``im_size``; presumably the side
    # length in pixels of the source images -- TODO confirm.
    image_size = 1024

    unique_images = polygons_df[["image_name", "json_link"]].drop_duplicates(
        subset=["image_name"])
    for json_path in tqdm(unique_images["json_link"],
                          desc="Processing images", position=0, leave=False):
        image_path = _image_path_for_label(json_path)
        # Rows are matched on the label file name with "json" swapped for
        # "png"; kept as a plain substitution so it stays in step with
        # however the ``image_name`` column was produced.
        name_img = os.path.basename(json_path).replace("json", "png")
        polygons_image = polygons_df.loc[
            polygons_df["image_name"] == name_img, ["geometry", "index"]]

        # Read the raster once; it is reused for every building and zoom.
        with rio.open(image_path) as img:
            img_metadata = img.meta
            img_array = img.read()

        buildings = zip(polygons_image["geometry"], polygons_image["index"])
        for polygon_for_img, polygon_num in tqdm(
                buildings, total=len(polygons_image), desc="Buildings",
                position=1, leave=False):
            img_num = f"{polygon_num}.png"
            for zoom_level in tqdm(zoom_levels, desc="zoom", position=2,
                                   leave=False):
                output_path = os.path.join(zoomdir_dict[zoom_level], img_num)
                crop_images(
                    image_array=img_array,
                    img_metadata=img_metadata,
                    polygon_df=polygon_for_img,
                    zoom_level=zoom_level,
                    pixel_num=pixel_num,
                    im_size=image_size,
                    output_path=output_path
                )


def main():
    """Build the zoomed and cropped building images for the xBD dataset.

    Calls ``load_and_save_df`` with the ``labels`` directories of the
    ``hold``, ``tier1``, ``tier3`` and ``test`` splits found under
    ``<xbd_dir>/geotiffs``, creates one ``zoom_<level>`` output directory per
    zoom level, and passes the "xy" dataframe to ``image_loading``.
    """
    zoom_levels = [1, 2, 4, 0.5]
    pixel_num = 224

    # TODO: look/fix the geotiffs.old and all
    output_dir = get_metadata_pickle_dir()
    data_dir = get_data_dir()
    xbd_dir = get_xbd_dir()

    # Keys are the label directories; every value starts out as None.
    filepaths_dict = dict.fromkeys(
        os.path.join(xbd_dir, "geotiffs", split, "labels")
        for split in ("hold", "tier1", "tier3", "test")
    )

    # The second dataframe returned (longitude & latitude) is meant for the
    # environmental factors; only the "xy" one is needed for image cropping.
    df_pre_post_hurr_xy, _ = load_and_save_df(filepaths_dict, output_dir)

    # where to save zoomed and cropped images
    save_dir_path = os.path.join(
        get_processed_data_dir(), "processed_xbd", "geotiffs_zoom", "images")

    zoomdir_dict = {}
    for zoom_num in zoom_levels:
        # NOTE(review): os.path.join drops ``data_dir`` whenever
        # ``save_dir_path`` is absolute -- confirm which root is intended.
        zoom_path = os.path.join(data_dir, save_dir_path, f"zoom_{zoom_num}")
        # exist_ok avoids the race between checking for and creating the
        # directory.
        os.makedirs(zoom_path, exist_ok=True)
        zoomdir_dict[zoom_num] = zoom_path

    image_loading(df_pre_post_hurr_xy, zoom_levels, pixel_num, zoomdir_dict)


# Run the pre-processing pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()