
# -*- coding: utf-8 -*-
"""
===============================================================================
HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
Contact: lpdaac@usgs.gov
Editor: Hong Xie
Last Updated: 2025-10-16
===============================================================================
"""
# Possible Future Improvements:
# TODO Improve CF-1.6 NetCDF Compliance
# TODO Improve behavior around deletion of cogs when a netcdf is requested
# TODO Add ZARR as output option
import os
import sys
import argparse
import shutil
import logging
import time
import json
from datetime import datetime as dt

import earthaccess
import dask.distributed

# Ensure the repository root is on sys.path before importing local modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from HLS_PER import process_granule, create_timeseries_dataset
from HLS_Su import hls_search, format_roi
from utils.common_utils import setup_dask_environment


def parse_arguments():
"""
Function to parse command line input arguments.
"""
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Performs Spatial/Temporal/Band Subsetting, Processing, and Customized Exporting for HLS V2.0 files",
)
# roi: Region of interest as shapefile, geojson, or comma separated LL Lon, LL Lat, UR Lon, UR Lat
parser.add_argument(
"-roi",
type=str,
required=True,
help="(Required) Region of Interest (ROI) for spatial subset. \
Valid inputs are: (1) a geojson or shapefile (absolute path to file required if not in same directory as this script), or \
(2) bounding box coordinates: 'LowerLeft_lon,LowerLeft_lat,UpperRight_lon,UpperRight_lat'\
NOTE: Negative coordinates MUST be written in single quotation marks '-120,43,-118,48'\
NOTE 2: If providing an absolute path with spaces in directory names, please use double quotation marks "
" ",
)
    # (Add) clip: clipping flag; defaults to False (no clipping). Set to True to clip outputs to the ROI.
parser.add_argument(
"-clip",
choices=["True", "False"],
required=False,
help="(Optional) If provided, the script will clip the output to the ROI.",
default="False",
)
    # (Add) tile: HLS tile ID
parser.add_argument(
"-tile",
type=str,
required=False,
help="(Optional) Tile ID for spatial subset. If provided, the script will search for the tile ID based on the ROI.",
)
# dir: Directory to save the files to
parser.add_argument(
"-dir",
required=False,
help="Directory to export output HLS files to.",
default=os.getcwd(),
)
# start: Start Date
parser.add_argument(
"-start",
required=False,
help="Start date for time period of interest: valid format is yyyy-mm-dd (e.g. 2020-10-20).",
default="2014-04-03",
)
# end: End Date
parser.add_argument(
"-end",
required=False,
help="Start date for time period of interest: valid format is yyyy-mm-dd (e.g. 2022-10-24).",
default=dt.today().strftime("%Y-%m-%d"),
)
# prod: product(s) desired to be downloaded
parser.add_argument(
"-prod",
choices=["HLSS30", "HLSL30", "both"],
required=False,
help="Desired product(s) to be subset and processed.",
default="both",
)
# layers: layers desired to be processed within the products selected
parser.add_argument(
"-bands",
required=False,
help="Desired layers to be processed. Valid inputs are ALL, COASTAL-AEROSOL, BLUE, GREEN, RED, RED-EDGE1, RED-EDGE2, RED-EDGE3, NIR1, SWIR1, SWIR2, CIRRUS, TIR1, TIR2, WATER-VAPOR, FMASK, VZA, VAA, SZA, SAA. To request multiple layers, provide them in comma separated format with no spaces. Unsure of the names for your bands?--check out the README which contains a table of all bands and band names.",
default="ALL",
)
# cc: maximum cloud cover (%) allowed to be returned (by scene)
parser.add_argument(
"-cc",
required=False,
help="Maximum (scene-level) cloud cover (percent) allowed for returned observations (e.g. 35). Valid range: 0 to 100 (integers only)",
default="100",
)
# qf: quality filter flag: filter out poor quality data yes/no
parser.add_argument(
"-qf",
choices=["True", "False"],
required=False,
help="Flag to quality filter before exporting output files (see README for quality filtering performed).",
default="True",
)
# sf: scale factor flag: Scale data or leave unscaled yes/no
parser.add_argument(
"-scale",
choices=["True", "False"],
required=False,
help="Flag to apply scale factor to layers before exporting output files. This is generally unecessary as most applications will scale automatically.",
default="True",
)
# of: output file format
parser.add_argument(
"-of",
choices=["COG", "NC4", "ZARR"],
required=False,
help="Define the desired output file format",
default="COG",
)
# chunksize: chunk size for processing with dask
parser.add_argument(
"-cs",
type=str,
help="Chunksize for processing scenes with dask in format 'band,x,y'. This is used to provide chunk_size argument to rioxarray.open_rasterio to improve processing speed.\
For example: '1,512,512' (native hls chunk size) provides better performance for ROIs that fall within a single scene, while '1,3600,3600' (full HLS scene) provides better performance for \
larger ROIs that span multiple scenes. The default is '1,512,512', but this can lead to a very large task list for large ROIs.",
default="1,512,512",
)
# logfile: Optional logfile path
parser.add_argument(
"-logfile",
required=False,
help="Optional path to output logfile. If not provided, logging will only be to the console.",
)
return parser.parse_args()
def format_dates(start, end):
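    """
    Strips quotes from the start/end date strings, validates them as ISO 8601
    dates, and returns them as a (start, end) tuple of 'YYYY-MM-DD' strings.

    Example:
        >>> format_dates("2020-10-20", "2022-10-24")
        ('2020-10-20', '2022-10-24')
    """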
# Strip Quotes
start = start.strip("'").strip('"')
end = end.strip("'").strip('"')
# Convert to datetime
try:
start = dt.strptime(start, "%Y-%m-%d")
end = dt.strptime(end, "%Y-%m-%d")
except ValueError:
        sys.exit(
            "Invalid date format. The valid format is ISO 8601: YYYY-MM-DD (e.g. 2020-10-20)."
        )
if start > end:
sys.exit(
f"The Start Date requested: {start} is after the End Date Requested: {end}."
)
else:
dates = (start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
return dates
def format_tile_id(tile_id):
"""
    (Add) Format the tile_id argument: strip quotes and return it as a string.
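
    Example (assuming a typical HLS/MGRS-style tile ID):
        >>> format_tile_id("'T11SPS'")
        'T11SPS'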
"""
if tile_id is None:
return None
tile_id = tile_id.strip("'").strip('"')
return str(tile_id)
def format_cloud_cover(cc):
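    """
    Converts the cloud cover argument to an integer and validates that it
    falls within the 0-100 range.

    Example:
        >>> format_cloud_cover("35")
        35
    """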
try:
cc = int(cc.strip("'").strip('"'))
except ValueError:
sys.exit(
f"{cc} is not a valid input for filtering by cloud cover (e.g. 35). Valid range: 0 to 100 (integers only)"
)
# Validate that cc is in the valid range (0-100)
if cc < 0 or cc > 100:
sys.exit(
f"{cc} is not a valid input option for filtering by cloud cover (e.g. 35). Valid range: 0 to 100 (integers only)"
)
return cc
def str_to_bool(value):
"""
Converts a string to a boolean.
Accepts 'True', 'true', '1' as True.
Accepts 'False', 'false', '0' as False.
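
    Example:
        >>> str_to_bool("True")
        True
        >>> str_to_bool("0")
        False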
"""
if isinstance(value, str):
if value.lower() in ("true", "1"):
return True
elif value.lower() in ("false", "0"):
return False
raise ValueError(f"Cannot convert {value} to boolean.")
def create_band_dict(prod, bands):
"""
Creates a dictionary of bands and common band names for each collection requested.
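
    Example (using the HLSL30 band look-up table defined below):
        >>> create_band_dict("HLSL30", "RED,NIR1")
        {'HLSL30': {'RED': 'B04', 'NIR1': 'B05'}}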
"""
shortname = {"HLSS30": "HLSS30.v2.0", "HLSL30": "HLSL30.v2.0"}
# Create a dictionary with product name and shortname
if prod == "both":
prods = shortname
else:
prods = {prod: shortname[prod]}
# Strip spacing, quotes, make all upper case and create a list
bands = bands.strip(" ").strip("'").strip('"').upper()
band_list = bands.split(",")
# Create a LUT dict including the HLS product bands mapped to names
lut = {
"HLSS30": {
"COASTAL-AEROSOL": "B01",
"BLUE": "B02",
"GREEN": "B03",
"RED": "B04",
"RED-EDGE1": "B05",
"RED-EDGE2": "B06",
"RED-EDGE3": "B07",
"NIR-Broad": "B08",
"NIR1": "B8A",
"WATER-VAPOR": "B09",
"CIRRUS": "B10",
"SWIR1": "B11",
"SWIR2": "B12",
"FMASK": "Fmask",
"VZA": "VZA",
"VAA": "VAA",
"SZA": "SZA",
"SAA": "SAA",
},
"HLSL30": {
"COASTAL-AEROSOL": "B01",
"BLUE": "B02",
"GREEN": "B03",
"RED": "B04",
"NIR1": "B05",
"SWIR1": "B06",
"SWIR2": "B07",
"CIRRUS": "B09",
"TIR1": "B10",
"TIR2": "B11",
"FMASK": "Fmask",
"VZA": "VZA",
"VAA": "VAA",
"SZA": "SZA",
"SAA": "SAA",
},
}
# List of all available/acceptable band names
all_bands = [
"ALL",
"COASTAL-AEROSOL",
"BLUE",
"GREEN",
"RED",
"RED-EDGE1",
"RED-EDGE2",
"RED-EDGE3",
"NIR1",
"SWIR1",
"SWIR2",
"CIRRUS",
"TIR1",
"TIR2",
"WATER-VAPOR",
"FMASK",
"VZA",
"VAA",
"SZA",
"SAA",
]
# Validate that bands are named correctly
for b in band_list:
if b not in all_bands:
            sys.exit(
                f"Band: {b} is not a valid input option. Valid inputs are {all_bands}. To request multiple layers, provide them in comma-separated format with no spaces. Unsure of the band names? Check the README, which contains a table of all bands and band names."
            )
# Set up a dictionary of band names and numbers by product
band_dict = {}
for p in prods:
band_dict[p] = {}
for b in band_list:
if b == "ALL":
band_dict[p] = lut[p]
else:
try:
band_dict[p][b] = lut[p][b]
                except KeyError:
                    # A KeyError (not ValueError) is raised when the band is absent from the product LUT
                    print(f"Product {p} does not contain band {b}")
return band_dict
def format_chunksize(chunksize):
"""
Converts comma-separated chunksize string to dictionary.
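
    Example:
        >>> format_chunksize("1,512,512")
        {'band': 1, 'x': 512, 'y': 512}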
"""
keys = ["band", "x", "y"]
values = list(map(int, chunksize.strip("'\"").split(",")))
if len(values) != len(keys):
raise ValueError(
"Chunksize must provide band, x and y (3) values separated by commas."
)
return dict(zip(keys, values))
def confirm_action(prompt):
"""
Prompts the user to confirm an action.
"""
while True:
response = input(prompt).lower()
if response in ["y", "yes"]:
return True
elif response in ["n", "no"]:
return False
else:
print("Invalid input. Please enter 'y' or 'n'.")
def files_collection(output_dir):
"""
    (Add) Archives downloaded HLS imagery into subdirectories named by the
    acquisition date (year + day-of-year) parsed from each filename.
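
    Example (hypothetical filename following the HLS v2.0 naming convention):
        'HLS.S30.T11SPS.2024183T030551.v2.0.B04.tif' is moved to subdirectory '2024183'.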
"""
    # List all files in the output directory
    files = os.listdir(output_dir)
    # Iterate over the files
    for file in files:
        # Only process GeoTIFF outputs
        if file.endswith(".tif"):
            # Extract the acquisition date (YYYYDDD) from the filename
            doy = file.split(".")[3][:7]
            # Build the target directory path
            target_dir = os.path.join(output_dir, doy)
            # Create the target directory if it does not exist
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            # Move the file into the target directory
            source_path = os.path.join(output_dir, file)
            target_path = os.path.join(target_dir, file)
            shutil.move(source_path, target_path)
def main():
"""
Main function to run the HLS SuPER script.
"""
# Parse arguments
args = parse_arguments()
# Configure logging
log_handlers = [logging.StreamHandler(sys.stdout)]
if args.logfile:
log_handlers.append(logging.FileHandler(args.logfile))
logging.basicConfig(
level=logging.INFO,
format="%(levelname)s:%(asctime)s ||| %(message)s",
handlers=log_handlers,
)
# Handle Login Credentials with earthaccess
earthaccess.login(persist=True)
# Start Log
logging.info("HLS SuPER script started")
# Format ROI
roi, vl = format_roi(args.roi)
logging.info("Region of Interest formatted successfully")
    # (Add) Format the clip argument
clip = str_to_bool(args.clip)
logging.info(f"Clip to ROI: {clip}")
# Set Output Directory
if args.dir is not None:
output_dir = os.path.normpath(args.dir.strip("'").strip('"')) + os.sep
else:
# Defaults to the current directory
output_dir = os.getcwd() + os.sep
os.makedirs(output_dir, exist_ok=True)
logging.info(f"Output directory set to: {output_dir}")
# Format/Validate Dates
dates = format_dates(args.start, args.end)
logging.info(f"Date Parameters: {dates}")
# Create Product/Band Dictionary
band_dict = create_band_dict(args.prod, args.bands)
logging.info(f"Products/Bands Selected: {band_dict}")
# Format Cloud Cover
cc = format_cloud_cover(args.cc)
logging.info(f"Cloud Cover Filter <= {cc}")
    # (Add) Format the Tile ID argument
tile = format_tile_id(args.tile)
logging.info(f"Tile ID: {tile}")
# Quality Filtering
qf = str_to_bool(args.qf)
logging.info(f"Quality Filtering: {qf}")
# Scale Factor
scale = str_to_bool(args.scale)
logging.info(f"Apply Scale Factor: {scale}")
# Chunk Size
chunk_size = format_chunksize(args.cs)
logging.info(f"Chunk Size: {chunk_size}")
# Output File Type
if args.of not in ["COG", "NC4"]:
sys.exit(
f"Output format {args.of} is not a valid output format. Please choose from 'COG', 'NC4'."
)
logging.info(f"Output format: {args.of}")
# Search for Data and Save Results
results_urls_file = os.path.join(output_dir, "hls_super_results_urls.json")
use_existing_file = False
if os.path.isfile(results_urls_file):
logging.info(f"Results url list already exists in {output_dir}.")
# Confirm if user wants to use existing file.
if confirm_action(
f"Do you want to use the existing results file ({results_urls_file})? (y/n)"
):
use_existing_file = True
else:
if not confirm_action(
"Do you want to overwrite the existing results file? (y/n)"
):
sys.exit(
f"Processing aborted. Please move, rename, or remove existing file: {results_urls_file}."
)
if use_existing_file:
logging.info("Using existing results file.")
with open(results_urls_file, "r") as file:
results_urls = json.load(file)
else:
logging.info("Searching for data...")
results_urls = hls_search(
roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc, tile_id=tile
)
logging.info(f"Writing search results to {results_urls_file}")
with open(results_urls_file, "w") as file:
json.dump(results_urls, file)
results_count = len(results_urls)
total_assets = sum(len(sublist) for sublist in results_urls)
filter_descriptions = []
if cc:
filter_descriptions.append("cloud")
if tile:
filter_descriptions.append("tile")
if filter_descriptions:
filter_log = f"{results_count} granules remain after {' and '.join(filter_descriptions)} filtering. {total_assets} assets will be processed."
else:
filter_log = (
f"{results_count} granules remain. {total_assets} assets will be processed."
)
logging.info(filter_log)
# Confirm Processing
if not confirm_action("Do you want to proceed with processing? (y/n)"):
sys.exit("Processing aborted.")
# Initialize Dask Cluster
client = dask.distributed.Client()
# Setup Dask Environment (GDAL Configs)
client.run(setup_dask_environment)
logging.info(
f"Dask environment setup successfully. View dashboard: {client.dashboard_link}."
)
    # Scatter results URLs to workers
client.scatter(results_urls)
# If NC4, create a temporary directory to store COGs
if args.of == "NC4":
cog_dir = os.path.join(output_dir, "temp")
if not os.path.exists(cog_dir):
os.makedirs(cog_dir, exist_ok=True)
else:
if not confirm_action(
"Temporary directory to store COGs already exists. Use these files to create NC4 outputs? (y/n)"
):
sys.exit(
f"Processing aborted. Please remove existing directory: {cog_dir}."
)
else:
cog_dir = output_dir
# Process Granules
start_time = time.time()
logging.info("Processing...")
tasks = [
dask.delayed(process_granule)(
granule_url,
roi=roi,
clip=clip,
quality_filter=qf,
scale=scale,
output_dir=cog_dir,
band_dict=band_dict,
bit_nums=[1, 3],
chunk_size=chunk_size,
)
for granule_url in results_urls
]
dask.compute(*tasks)
# Create Timeseries Dataset if NC4
if args.of == "NC4":
logging.info("Creating timeseries dataset...")
create_timeseries_dataset(
cog_dir, output_type=args.of, output_dir=output_dir)
# Close Dask Client
client.close()
# Remove Temporary COGs if NC4
if args.of == "NC4":
logging.info("Timeseries Dataset Created. Removing Temporary Files...")
shutil.rmtree(cog_dir)
    # (Add) Archive downloaded imagery into DOY subdirectories
    logging.info("Archiving downloaded HLS imagery into DOY subdirectories")
    files_collection(output_dir)
    logging.info("Archiving complete!")
# End Timer
total_time = time.time() - start_time
logging.info(f"Processing complete. Total time: {round(total_time, 2)}s, ")
if __name__ == "__main__":
main()