# -*- coding: utf-8 -*-
"""
===============================================================================
HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
Contact: lpdaac@usgs.gov
Editor: Hong Xie
Last Updated: 2025-01-06
===============================================================================
"""
# Possible Future Improvements:
# TODO Improve CF-1.6 NetCDF Compliance
# TODO Improve behavior around deletion of cogs when a netcdf is requested
# TODO Add ZARR as output option
import argparse
import sys
import os
import shutil
import logging
import time
import json
import earthaccess
from shapely.geometry import Polygon, box
import geopandas as gpd
from datetime import datetime as dt
import dask.distributed
from HLS_Su import hls_search
from HLS_PER import process_granule, create_timeseries_dataset
def parse_arguments():
"""
Function to parse command line input arguments.
"""
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Performs Spatial/Temporal/Band Subsetting, Processing, and Customized Exporting for HLS V2.0 files",
)
# roi: Region of interest as shapefile, geojson, or comma separated LL Lon, LL Lat, UR Lon, UR Lat
parser.add_argument(
"-roi",
type=str,
required=True,
help="(Required) Region of Interest (ROI) for spatial subset. \
Valid inputs are: (1) a geojson or shapefile (absolute path to file required if not in same directory as this script), or \
(2) bounding box coordinates: 'LowerLeft_lon,LowerLeft_lat,UpperRight_lon,UpperRight_lat'\
NOTE: Negative coordinates MUST be written in single quotation marks '-120,43,-118,48'\
        NOTE 2: If providing an absolute path with spaces in directory names, please wrap the path in double quotation marks.",
)
    # (Add) clip: Clipping flag. Defaults to False (no clipping); if True, outputs are clipped to the ROI.
parser.add_argument(
"-clip",
choices=["True", "False"],
required=False,
help="(Optional) If provided, the script will clip the output to the ROI.",
default="False",
)
    # (Add) tile: HLS tile ID
parser.add_argument(
"-tile",
type=str,
required=False,
help="(Optional) Tile ID for spatial subset. If provided, the script will search for the tile ID based on the ROI.",
)
# dir: Directory to save the files to
parser.add_argument(
"-dir",
required=False,
help="Directory to export output HLS files to.",
default=os.getcwd(),
)
# start: Start Date
parser.add_argument(
"-start",
required=False,
help="Start date for time period of interest: valid format is yyyy-mm-dd (e.g. 2020-10-20).",
default="2014-04-03",
)
# end: End Date
parser.add_argument(
"-end",
required=False,
help="Start date for time period of interest: valid format is yyyy-mm-dd (e.g. 2022-10-24).",
default=dt.today().strftime("%Y-%m-%d"),
)
# prod: product(s) desired to be downloaded
parser.add_argument(
"-prod",
choices=["HLSS30", "HLSL30", "both"],
required=False,
help="Desired product(s) to be subset and processed.",
default="both",
)
# layers: layers desired to be processed within the products selected
parser.add_argument(
"-bands",
required=False,
help="Desired layers to be processed. Valid inputs are ALL, COASTAL-AEROSOL, BLUE, GREEN, RED, RED-EDGE1, RED-EDGE2, RED-EDGE3, NIR1, SWIR1, SWIR2, CIRRUS, TIR1, TIR2, WATER-VAPOR, FMASK, VZA, VAA, SZA, SAA. To request multiple layers, provide them in comma separated format with no spaces. Unsure of the names for your bands?--check out the README which contains a table of all bands and band names.",
default="ALL",
)
# cc: maximum cloud cover (%) allowed to be returned (by scene)
parser.add_argument(
"-cc",
required=False,
help="Maximum (scene-level) cloud cover (percent) allowed for returned observations (e.g. 35). Valid range: 0 to 100 (integers only)",
default="100",
)
# qf: quality filter flag: filter out poor quality data yes/no
parser.add_argument(
"-qf",
choices=["True", "False"],
required=False,
help="Flag to quality filter before exporting output files (see README for quality filtering performed).",
default="True",
)
# sf: scale factor flag: Scale data or leave unscaled yes/no
parser.add_argument(
"-scale",
choices=["True", "False"],
required=False,
help="Flag to apply scale factor to layers before exporting output files. This is generally unecessary as most applications will scale automatically.",
default="True",
)
# of: output file format
parser.add_argument(
"-of",
choices=["COG", "NC4", "ZARR"],
required=False,
help="Define the desired output file format",
default="COG",
)
# chunksize: chunk size for processing with dask
parser.add_argument(
"-cs",
type=str,
help="Chunksize for processing scenes with dask in format 'band,x,y'. This is used to provide chunk_size argument to rioxarray.open_rasterio to improve processing speed.\
For example: '1,512,512' (native hls chunk size) provides better performance for ROIs that fall within a single scene, while '1,3600,3600' (full HLS scene) provides better performance for \
larger ROIs that span multiple scenes. The default is '1,512,512', but this can lead to a very large task list for large ROIs.",
default="1,512,512",
)
# logfile: Optional logfile path
parser.add_argument(
"-logfile",
required=False,
help="Optional path to output logfile. If not provided, logging will only be to the console.",
)
return parser.parse_args()
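
# Illustrative command-line usage (hypothetical paths and values, assuming this
# script is saved as HLS_SuPER.py; shown for reference only, not executed):
#
#   python HLS_SuPER.py -roi '-120,43,-118,48' -dir ./hls_out \
#       -start 2021-06-01 -end 2021-09-30 -prod HLSS30 \
#       -bands RED,GREEN,BLUE,NIR1,FMASK -cc 30 -clip True -of COG
#
# Per the -roi help text, bounding boxes with negative coordinates are wrapped in
# single quotation marks; all other arguments fall back to the defaults above.
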
def format_roi(roi):
"""
Determines if submitted ROI is a file or bbox coordinates.
If a file, opens a GeoJSON or shapefile and creates a list of polygon vertices in the correct order. If the file has multiple polygons it will use a unary union convex hull of the external bounds.
If bbox coordinates, creates a geodataframe with a single Polygon geometry.
Returns a geopandas dataframe for clipping and a list of vertices for searching.
"""
if os.path.isfile(roi): # and roi.endswith(("geojson", "shp")):
print(roi)
try:
# Open ROI if file
roi = gpd.read_file(roi)
if len(roi) > 1:
# Merge all Polygon geometries and create external boundary
logging.info(
"Multiple polygons detected. Creating single geometry of external coordinates."
)
single_geometry = roi.unary_union.convex_hull
roi = gpd.GeoDataFrame(geometry=[single_geometry], crs=roi.crs)
logging.info(roi)
# Check if ROI is in Geographic CRS, if not, convert to it
if roi.crs.is_geographic:
# List Vertices in correct order for search
vertices_list = list(roi.geometry[0].exterior.coords)
else:
roi_geographic = roi.to_crs("EPSG:4326")
logging.info(
"Note: ROI submitted is being converted to Geographic CRS (EPSG:4326)"
)
vertices_list = list(roi_geographic.geometry[0].exterior.coords)
except (FileNotFoundError, ValueError):
sys.exit(
f"The GeoJSON/shapefile is either not valid or could not be found.\nPlease double check the name and provide the absolute path to the file or make sure that it is located in {os.getcwd()}"
)
else:
# If bbox coordinates are submitted
bbox = tuple(map(float, roi.strip("'\"").split(",")))
print(bbox)
# Convert bbox to a geodataframe for clipping
roi = gpd.GeoDataFrame(geometry=[box(*bbox)], crs="EPSG:4326")
vertices_list = list(roi.geometry[0].exterior.coords)
return (roi, vertices_list)
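
# Illustrative behavior of format_roi() for a bounding-box string (values
# hypothetical, not executed):
#
#   roi_gdf, vertices = format_roi("-120,43,-118,48")
#   # roi_gdf  -> single-row GeoDataFrame with a Polygon built by shapely's box(), CRS EPSG:4326
#   # vertices -> list of (lon, lat) tuples from the polygon exterior, e.g. [(-118.0, 43.0), ...]
#
# The vertices list is passed to hls_search() as the spatial constraint, while the
# GeoDataFrame is kept for optional clipping of the outputs.
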
def format_dates(start, end):
# Strip Quotes
start = start.strip("'").strip('"')
end = end.strip("'").strip('"')
# Convert to datetime
try:
start = dt.strptime(start, "%Y-%m-%d")
end = dt.strptime(end, "%Y-%m-%d")
except ValueError:
sys.exit(
"A date format is not valid. The valid format is ISO 8601: YYYY-MM-DD (e.g. 2020-10-20)"
)
if start > end:
sys.exit(
f"The Start Date requested: {start} is after the End Date Requested: {end}."
)
else:
dates = (start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
return dates
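
# Illustrative use of format_dates() (not executed):
#
#   format_dates("2021-06-01", "2021-09-30")  # -> ("2021-06-01", "2021-09-30")
#   format_dates("2021-09-30", "2021-06-01")  # exits: start date is after end date
#   format_dates("06/01/2021", "2021-09-30")  # exits: not ISO 8601 (YYYY-MM-DD)
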
def format_tile_id(tile_id):
    """
    (Add) Format the tile_id argument. Returns None when no tile ID is provided.
    """
    if tile_id is None:
        return None
    tile_id = tile_id.strip("'").strip('"')
    return str(tile_id)
def format_cloud_cover(cc):
try:
cc = int(cc.strip("'").strip('"'))
except ValueError:
sys.exit(
f"{cc} is not a valid input for filtering by cloud cover (e.g. 35). Valid range: 0 to 100 (integers only)"
)
# Validate that cc is in the valid range (0-100)
if cc < 0 or cc > 100:
sys.exit(
f"{cc} is not a valid input option for filtering by cloud cover (e.g. 35). Valid range: 0 to 100 (integers only)"
)
return cc
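
# Illustrative use of format_cloud_cover() (not executed):
#
#   format_cloud_cover("35")    # -> 35
#   format_cloud_cover("'35'")  # -> 35 (surrounding quotes are stripped)
#   format_cloud_cover("150")   # exits: outside the valid 0-100 range
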
def str_to_bool(value):
"""
Converts a string to a boolean.
Accepts 'True', 'true', '1' as True.
Accepts 'False', 'false', '0' as False.
"""
if isinstance(value, str):
if value.lower() in ("true", "1"):
return True
elif value.lower() in ("false", "0"):
return False
raise ValueError(f"Cannot convert {value} to boolean.")
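
# Illustrative use of str_to_bool() (not executed). argparse delivers the
# -clip/-qf/-scale flags as strings, so they are converted explicitly here:
#
#   str_to_bool("True")   # -> True
#   str_to_bool("false")  # -> False
#   str_to_bool("maybe")  # raises ValueError
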
def create_band_dict(prod, bands):
"""
Creates a dictionary of bands and common band names for each collection requested.
"""
shortname = {"HLSS30": "HLSS30.v2.0", "HLSL30": "HLSL30.v2.0"}
# Create a dictionary with product name and shortname
if prod == "both":
prods = shortname
else:
prods = {prod: shortname[prod]}
# Strip spacing, quotes, make all upper case and create a list
bands = bands.strip(" ").strip("'").strip('"').upper()
band_list = bands.split(",")
# Create a LUT dict including the HLS product bands mapped to names
lut = {
"HLSS30": {
"COASTAL-AEROSOL": "B01",
"BLUE": "B02",
"GREEN": "B03",
"RED": "B04",
"RED-EDGE1": "B05",
"RED-EDGE2": "B06",
"RED-EDGE3": "B07",
"NIR-Broad": "B08",
"NIR1": "B8A",
"WATER-VAPOR": "B09",
"CIRRUS": "B10",
"SWIR1": "B11",
"SWIR2": "B12",
"FMASK": "Fmask",
"VZA": "VZA",
"VAA": "VAA",
"SZA": "SZA",
"SAA": "SAA",
},
"HLSL30": {
"COASTAL-AEROSOL": "B01",
"BLUE": "B02",
"GREEN": "B03",
"RED": "B04",
"NIR1": "B05",
"SWIR1": "B06",
"SWIR2": "B07",
"CIRRUS": "B09",
"TIR1": "B10",
"TIR2": "B11",
"FMASK": "Fmask",
"VZA": "VZA",
"VAA": "VAA",
"SZA": "SZA",
"SAA": "SAA",
},
}
# List of all available/acceptable band names
all_bands = [
"ALL",
"COASTAL-AEROSOL",
"BLUE",
"GREEN",
"RED",
"RED-EDGE1",
"RED-EDGE2",
"RED-EDGE3",
"NIR1",
"SWIR1",
"SWIR2",
"CIRRUS",
"TIR1",
"TIR2",
"WATER-VAPOR",
"FMASK",
"VZA",
"VAA",
"SZA",
"SAA",
]
# Validate that bands are named correctly
for b in band_list:
if b not in all_bands:
sys.exit(
f"Band: {b} is not a valid input option. Valid inputs are {all_bands}. To request multiple layers, provide them in comma separated format with no spaces. Unsure of the names for your bands?--check out the README which contains a table of all bands and band names."
)
# Set up a dictionary of band names and numbers by product
band_dict = {}
for p in prods:
band_dict[p] = {}
for b in band_list:
if b == "ALL":
band_dict[p] = lut[p]
else:
try:
band_dict[p][b] = lut[p][b]
                except KeyError:
print(f"Product {p} does not contain band {b}")
return band_dict
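
# Illustrative structure returned by create_band_dict() (not executed):
#
#   create_band_dict("HLSL30", "RED,NIR1,FMASK")
#   # -> {"HLSL30": {"RED": "B04", "NIR1": "B05", "FMASK": "Fmask"}}
#
# With -prod both, the outer dict gains one entry per product, and the same common
# band name can map to different file suffixes (e.g. NIR1 is B8A for HLSS30 but B05
# for HLSL30).
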
def format_chunksize(chunksize):
"""
Converts comma-separated chunksize string to dictionary.
"""
keys = ["band", "x", "y"]
values = list(map(int, chunksize.strip("'\"").split(",")))
if len(values) != len(keys):
raise ValueError(
"Chunksize must provide band, x and y (3) values separated by commas."
)
return dict(zip(keys, values))
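
# Illustrative use of format_chunksize() (not executed):
#
#   format_chunksize("1,512,512")    # -> {"band": 1, "x": 512, "y": 512}
#   format_chunksize("1,3600,3600")  # -> {"band": 1, "x": 3600, "y": 3600}
#   format_chunksize("512,512")      # raises ValueError (three values required)
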
def confirm_action(prompt):
"""
Prompts the user to confirm an action.
"""
while True:
response = input(prompt).lower()
if response in ["y", "yes"]:
return True
elif response in ["n", "no"]:
return False
else:
print("Invalid input. Please enter 'y' or 'n'.")
def setup_dask_environment():
"""
Passes RIO environment variables to dask workers for authentication.
"""
import os
import rasterio
cookie_file_path = os.path.expanduser("~/cookies.txt")
global env
gdal_config = {
"GDAL_HTTP_UNSAFESSL": "YES",
"GDAL_HTTP_COOKIEFILE": cookie_file_path,
"GDAL_HTTP_COOKIEJAR": cookie_file_path,
"GDAL_DISABLE_READDIR_ON_OPEN": "YES",
"CPL_VSIL_CURL_ALLOWED_EXTENSIONS": "TIF",
"GDAL_HTTP_MAX_RETRY": "10",
"GDAL_HTTP_RETRY_DELAY": "0.5",
"GDAL_HTTP_TIMEOUT": "300",
}
env = rasterio.Env(**gdal_config)
env.__enter__()
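
# Note: setup_dask_environment() is meant to be executed on every worker via
# client.run(setup_dask_environment) (see main()). A minimal sketch of the idea,
# assuming a local cluster (not executed here):
#
#   client = dask.distributed.Client()
#   client.run(setup_dask_environment)  # each worker enters its own rasterio.Env
#
# The rasterio.Env is entered and intentionally left open so that subsequent
# dask.delayed(process_granule) tasks on that worker run with these GDAL/HTTP
# settings, including the Earthdata cookie file used for authentication.
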
def files_collection(output_dir):
    """
    (Add) Archive downloaded HLS imagery into subdirectories named after the
    acquisition date (year + day-of-year) embedded in each file name.
    """
    # List all files in the output directory
    files = os.listdir(output_dir)
    # Iterate over the files
    for file in files:
        # Only process GeoTIFF outputs
        if file.endswith(".tif"):
            # Extract the acquisition date (YYYYDDD) from the file name
            doy = file.split(".")[3][:7]
            # Build the target directory path
            target_dir = os.path.join(output_dir, doy)
            # Create the target directory if it does not exist
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            # Move the file into the target directory
            source_path = os.path.join(output_dir, file)
            target_path = os.path.join(target_dir, file)
            shutil.move(source_path, target_path)
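
# Illustrative effect of files_collection() (hypothetical file name, not executed):
#
#   HLS.S30.T11SPS.2021183T184919.v2.0.B04.tif
#   # file.split(".")[3][:7] -> "2021183" (year + day of year)
#   # so the file is moved into <output_dir>/2021183/
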
def main():
"""
Main function to run the HLS SuPER script.
"""
# Parse arguments
args = parse_arguments()
# Configure logging
log_handlers = [logging.StreamHandler(sys.stdout)]
if args.logfile:
log_handlers.append(logging.FileHandler(args.logfile))
logging.basicConfig(
level=logging.INFO,
format="%(levelname)s:%(asctime)s ||| %(message)s",
handlers=log_handlers,
)
# Handle Login Credentials with earthaccess
earthaccess.login(persist=True)
# Start Log
logging.info("HLS SuPER script started")
# Format ROI
roi, vl = format_roi(args.roi)
logging.info("Region of Interest formatted successfully")
    # (Add) Format the clip flag
clip = str_to_bool(args.clip)
logging.info(f"Clip to ROI: {clip}")
# Set Output Directory
if args.dir is not None:
output_dir = os.path.normpath(args.dir.strip("'").strip('"')) + os.sep
else:
# Defaults to the current directory
output_dir = os.getcwd() + os.sep
logging.info(f"Output directory set to: {output_dir}")
# Format/Validate Dates
dates = format_dates(args.start, args.end)
logging.info(f"Date Parameters: {dates}")
# Create Product/Band Dictionary
band_dict = create_band_dict(args.prod, args.bands)
logging.info(f"Products/Bands Selected: {band_dict}")
# Format Cloud Cover
cc = format_cloud_cover(args.cc)
logging.info(f"Cloud Cover Filter <= {cc}")
    # (Add) Format the Tile ID argument
tile = format_tile_id(args.tile)
logging.info(f"Tile ID: {tile}")
# Quality Filtering
qf = str_to_bool(args.qf)
logging.info(f"Quality Filtering: {qf}")
# Scale Factor
scale = str_to_bool(args.scale)
logging.info(f"Apply Scale Factor: {scale}")
# Chunk Size
chunk_size = format_chunksize(args.cs)
logging.info(f"Chunk Size: {chunk_size}")
# Output File Type
    if args.of not in ["COG", "NC4"]:
        sys.exit(
            f"Output format {args.of} is not yet supported. Please choose from 'COG', 'NC4'. (ZARR output is planned but not yet implemented.)"
        )
logging.info(f"Output format: {args.of}")
# Search for Data and Save Results
results_urls_file = os.path.join(output_dir, "hls_super_results_urls.json")
use_existing_file = False
if os.path.isfile(results_urls_file):
logging.info(f"Results url list already exists in {output_dir}.")
# Confirm if user wants to use existing file.
if confirm_action(
f"Do you want to use the existing results file ({results_urls_file})? (y/n)"
):
use_existing_file = True
else:
if not confirm_action(
"Do you want to overwrite the existing results file? (y/n)"
):
sys.exit(
f"Processing aborted. Please move, rename, or remove existing file: {results_urls_file}."
)
if use_existing_file:
logging.info("Using existing results file.")
with open(results_urls_file, "r") as file:
results_urls = json.load(file)
else:
logging.info("Searching for data...")
results_urls = hls_search(
roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc, tile_id=tile
)
logging.info(f"Writing search results to {results_urls_file}")
with open(results_urls_file, "w") as file:
json.dump(results_urls, file)
results_count = len(results_urls)
total_assets = sum(len(sublist) for sublist in results_urls)
filter_descriptions = []
if cc:
filter_descriptions.append("cloud")
if tile:
filter_descriptions.append("tile")
if filter_descriptions:
filter_log = f"{results_count} granules remain after {' and '.join(filter_descriptions)} filtering. {total_assets} assets will be processed."
else:
filter_log = (
f"{results_count} granules remain. {total_assets} assets will be processed."
)
logging.info(filter_log)
# Confirm Processing
if not confirm_action("Do you want to proceed with processing? (y/n)"):
sys.exit("Processing aborted.")
# Initialize Dask Cluster
client = dask.distributed.Client()
# Setup Dask Environment (GDAL Configs)
client.run(setup_dask_environment)
logging.info(
f"Dask environment setup successfully. View dashboard: {client.dashboard_link}."
)
    # Scatter results URLs to workers
client.scatter(results_urls)
# If NC4, create a temporary directory to store COGs
if args.of == "NC4":
cog_dir = os.path.join(output_dir, "temp")
if not os.path.exists(cog_dir):
os.makedirs(cog_dir, exist_ok=True)
else:
if not confirm_action(
"Temporary directory to store COGs already exists. Use these files to create NC4 outputs? (y/n)"
):
sys.exit(
f"Processing aborted. Please remove existing directory: {cog_dir}."
)
else:
cog_dir = output_dir
# Process Granules
start_time = time.time()
logging.info("Processing...")
tasks = [
dask.delayed(process_granule)(
granule_url,
roi=roi,
clip=clip,
quality_filter=qf,
scale=scale,
output_dir=cog_dir,
band_dict=band_dict,
bit_nums=[1, 3],
chunk_size=chunk_size,
)
for granule_url in results_urls
]
dask.compute(*tasks)
# Create Timeseries Dataset if NC4
if args.of == "NC4":
logging.info("Creating timeseries dataset...")
create_timeseries_dataset(cog_dir, output_type=args.of, output_dir=output_dir)
# Close Dask Client
client.close()
# Remove Temporary COGs if NC4
if args.of == "NC4":
logging.info("Timeseries Dataset Created. Removing Temporary Files...")
shutil.rmtree(cog_dir)
    # (Add) Archive downloaded imagery into DOY subdirectories
    logging.info("Archiving downloaded HLS imagery into DOY subdirectories")
    files_collection(output_dir)
    logging.info("Archiving complete.")
# End Timer
total_time = time.time() - start_time
logging.info(f"Processing complete. Total time: {round(total_time,2)}s, ")
if __name__ == "__main__":
main()