
# -*- coding: utf-8 -*-
"""
===============================================================================
HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
Contact: lpdaac@usgs.gov
Editor: Hong Xie
Last Updated: 2025-10-16
===============================================================================
"""
# Possible Future Improvements:
# TODO Improve CF-1.6 NetCDF Compliance
# TODO Improve behavior around deletion of cogs when a netcdf is requested
# TODO Add ZARR as output option
import os
import sys
import argparse
import shutil
import logging
import time
import json
from datetime import datetime as dt

import earthaccess
import dask.distributed

# Ensure the repository root is on sys.path before importing local modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from HLS_PER import process_granule, create_timeseries_dataset
from HLS_Su import hls_search, format_roi
from utils.common_utils import setup_dask_environment


def parse_arguments():
"""
Function to parse command line input arguments.
"""
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Performs Spatial/Temporal/Band Subsetting, Processing, and Customized Exporting for HLS V2.0 files",
)
# roi: Region of interest as shapefile, geojson, or comma separated LL Lon, LL Lat, UR Lon, UR Lat
parser.add_argument(
"-roi",
type=str,
required=True,
help="(Required) Region of Interest (ROI) for spatial subset. \
Valid inputs are: (1) a geojson or shapefile (absolute path to file required if not in same directory as this script), or \
(2) bounding box coordinates: 'LowerLeft_lon,LowerLeft_lat,UpperRight_lon,UpperRight_lat'\
NOTE: Negative coordinates MUST be written in single quotation marks '-120,43,-118,48'\
NOTE 2: If providing an absolute path with spaces in directory names, please use double quotation marks "
" ",
)
    # (Add) clip: clipping flag; defaults to False (no clipping). Set to True to clip outputs to the ROI.
parser.add_argument(
"-clip",
choices=["True", "False"],
required=False,
help="(Optional) If provided, the script will clip the output to the ROI.",
default="False",
)
    # (Add) tile: HLS tile ID
parser.add_argument(
"-tile",
type=str,
required=False,
help="(Optional) Tile ID for spatial subset. If provided, the script will search for the tile ID based on the ROI.",
)
# dir: Directory to save the files to
parser.add_argument(
"-dir",
required=False,
help="Directory to export output HLS files to.",
default=os.getcwd(),
)
# start: Start Date
parser.add_argument(
"-start",
required=False,
help="Start date for time period of interest: valid format is yyyy-mm-dd (e.g. 2020-10-20).",
default="2014-04-03",
)
# end: End Date
parser.add_argument(
"-end",
required=False,
help="Start date for time period of interest: valid format is yyyy-mm-dd (e.g. 2022-10-24).",
default=dt.today().strftime("%Y-%m-%d"),
)
# prod: product(s) desired to be downloaded
parser.add_argument(
"-prod",
choices=["HLSS30", "HLSL30", "both"],
required=False,
help="Desired product(s) to be subset and processed.",
default="both",
)
# layers: layers desired to be processed within the products selected
parser.add_argument(
"-bands",
required=False,
help="Desired layers to be processed. Valid inputs are ALL, COASTAL-AEROSOL, BLUE, GREEN, RED, RED-EDGE1, RED-EDGE2, RED-EDGE3, NIR1, SWIR1, SWIR2, CIRRUS, TIR1, TIR2, WATER-VAPOR, FMASK, VZA, VAA, SZA, SAA. To request multiple layers, provide them in comma separated format with no spaces. Unsure of the names for your bands?--check out the README which contains a table of all bands and band names.",
default="ALL",
)
# cc: maximum cloud cover (%) allowed to be returned (by scene)
parser.add_argument(
"-cc",
required=False,
help="Maximum (scene-level) cloud cover (percent) allowed for returned observations (e.g. 35). Valid range: 0 to 100 (integers only)",
default="100",
)
# qf: quality filter flag: filter out poor quality data yes/no
parser.add_argument(
"-qf",
choices=["True", "False"],
required=False,
help="Flag to quality filter before exporting output files (see README for quality filtering performed).",
default="True",
)
# sf: scale factor flag: Scale data or leave unscaled yes/no
parser.add_argument(
"-scale",
choices=["True", "False"],
required=False,
help="Flag to apply scale factor to layers before exporting output files. This is generally unecessary as most applications will scale automatically.",
default="True",
)
# of: output file format
parser.add_argument(
"-of",
choices=["COG", "NC4", "ZARR"],
required=False,
help="Define the desired output file format",
default="COG",
)
# chunksize: chunk size for processing with dask
parser.add_argument(
"-cs",
type=str,
help="Chunksize for processing scenes with dask in format 'band,x,y'. This is used to provide chunk_size argument to rioxarray.open_rasterio to improve processing speed.\
For example: '1,512,512' (native hls chunk size) provides better performance for ROIs that fall within a single scene, while '1,3600,3600' (full HLS scene) provides better performance for \
larger ROIs that span multiple scenes. The default is '1,512,512', but this can lead to a very large task list for large ROIs.",
default="1,512,512",
)
# logfile: Optional logfile path
parser.add_argument(
"-logfile",
required=False,
help="Optional path to output logfile. If not provided, logging will only be to the console.",
)
return parser.parse_args()
def format_dates(start, end):
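    """
    Strips quotes from the start/end date strings, validates them as ISO 8601
    dates, and returns them as a (start, end) tuple of 'YYYY-MM-DD' strings.

    Example:
        >>> format_dates("2020-10-20", "2022-10-24")
        ('2020-10-20', '2022-10-24')
    """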
# Strip Quotes
start = start.strip("'").strip('"')
end = end.strip("'").strip('"')
# Convert to datetime
try:
start = dt.strptime(start, "%Y-%m-%d")
end = dt.strptime(end, "%Y-%m-%d")
except ValueError:
        sys.exit(
            "Invalid date format. The valid format is ISO 8601: YYYY-MM-DD (e.g. 2020-10-20)."
        )
if start > end:
sys.exit(
f"The Start Date requested: {start} is after the End Date Requested: {end}."
)
else:
dates = (start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
return dates
def format_tile_id(tile_id):
"""
    (Add) Format the tile_id argument: strip quotes and return it as a string.
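
    Example (assuming a typical HLS/MGRS-style tile ID):
        >>> format_tile_id("'T11SPS'")
        'T11SPS'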
"""
if tile_id is None:
return None
tile_id = tile_id.strip("'").strip('"')
return str(tile_id)
def format_cloud_cover(cc):
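    """
    Converts the cloud cover argument to an integer and validates that it
    falls within the 0-100 range.

    Example:
        >>> format_cloud_cover("35")
        35
    """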
try:
cc = int(cc.strip("'").strip('"'))
except ValueError:
sys.exit(
f"{cc} is not a valid input for filtering by cloud cover (e.g. 35). Valid range: 0 to 100 (integers only)"
)
# Validate that cc is in the valid range (0-100)
if cc < 0 or cc > 100:
sys.exit(
f"{cc} is not a valid input option for filtering by cloud cover (e.g. 35). Valid range: 0 to 100 (integers only)"
)
return cc
def str_to_bool(value):
"""
Converts a string to a boolean.
Accepts 'True', 'true', '1' as True.
Accepts 'False', 'false', '0' as False.
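
    Example:
        >>> str_to_bool("True")
        True
        >>> str_to_bool("0")
        False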
"""
if isinstance(value, str):
if value.lower() in ("true", "1"):
return True
elif value.lower() in ("false", "0"):
return False
raise ValueError(f"Cannot convert {value} to boolean.")
def create_band_dict(prod, bands):
"""
Creates a dictionary of bands and common band names for each collection requested.
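
    Example (using the HLSL30 band look-up table defined below):
        >>> create_band_dict("HLSL30", "RED,NIR1")
        {'HLSL30': {'RED': 'B04', 'NIR1': 'B05'}}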
"""
shortname = {"HLSS30": "HLSS30.v2.0", "HLSL30": "HLSL30.v2.0"}
# Create a dictionary with product name and shortname
if prod == "both":
prods = shortname
else:
prods = {prod: shortname[prod]}
# Strip spacing, quotes, make all upper case and create a list
bands = bands.strip(" ").strip("'").strip('"').upper()
band_list = bands.split(",")
# Create a LUT dict including the HLS product bands mapped to names
lut = {
"HLSS30": {
"COASTAL-AEROSOL": "B01",
"BLUE": "B02",
"GREEN": "B03",
"RED": "B04",
"RED-EDGE1": "B05",
"RED-EDGE2": "B06",
"RED-EDGE3": "B07",
"NIR-Broad": "B08",
"NIR1": "B8A",
"WATER-VAPOR": "B09",
"CIRRUS": "B10",
"SWIR1": "B11",
"SWIR2": "B12",
"FMASK": "Fmask",
"VZA": "VZA",
"VAA": "VAA",
"SZA": "SZA",
"SAA": "SAA",
},
"HLSL30": {
"COASTAL-AEROSOL": "B01",
"BLUE": "B02",
"GREEN": "B03",
"RED": "B04",
"NIR1": "B05",
"SWIR1": "B06",
"SWIR2": "B07",
"CIRRUS": "B09",
"TIR1": "B10",
"TIR2": "B11",
"FMASK": "Fmask",
"VZA": "VZA",
"VAA": "VAA",
"SZA": "SZA",
"SAA": "SAA",
},
}
# List of all available/acceptable band names
all_bands = [
"ALL",
"COASTAL-AEROSOL",
"BLUE",
"GREEN",
"RED",
"RED-EDGE1",
"RED-EDGE2",
"RED-EDGE3",
"NIR1",
"SWIR1",
"SWIR2",
"CIRRUS",
"TIR1",
"TIR2",
"WATER-VAPOR",
"FMASK",
"VZA",
"VAA",
"SZA",
"SAA",
]
# Validate that bands are named correctly
for b in band_list:
if b not in all_bands:
            sys.exit(
                f"Band: {b} is not a valid input option. Valid inputs are {all_bands}. To request multiple layers, provide them in comma-separated format with no spaces. Unsure of the band names? Check the README, which contains a table of all bands and band names."
            )
# Set up a dictionary of band names and numbers by product
band_dict = {}
for p in prods:
band_dict[p] = {}
for b in band_list:
if b == "ALL":
band_dict[p] = lut[p]
else:
try:
band_dict[p][b] = lut[p][b]
                except KeyError:
                    # A KeyError (not ValueError) is raised when the band is absent from the product LUT
                    print(f"Product {p} does not contain band {b}")
return band_dict
def format_chunksize(chunksize):
"""
Converts comma-separated chunksize string to dictionary.
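
    Example:
        >>> format_chunksize("1,512,512")
        {'band': 1, 'x': 512, 'y': 512}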
"""
keys = ["band", "x", "y"]
values = list(map(int, chunksize.strip("'\"").split(",")))
if len(values) != len(keys):
raise ValueError(
"Chunksize must provide band, x and y (3) values separated by commas."
)
return dict(zip(keys, values))
def confirm_action(prompt):
"""
Prompts the user to confirm an action.
"""
while True:
response = input(prompt).lower()
if response in ["y", "yes"]:
return True
elif response in ["n", "no"]:
return False
else:
print("Invalid input. Please enter 'y' or 'n'.")
def files_collection(output_dir):
"""
    (Add) Archives downloaded HLS imagery into subdirectories named by the
    acquisition date (year + day-of-year) parsed from each filename.
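
    Example (hypothetical filename following the HLS v2.0 naming convention):
        'HLS.S30.T11SPS.2024183T030551.v2.0.B04.tif' is moved to subdirectory '2024183'.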
"""
    # List all files in the output directory
    files = os.listdir(output_dir)
    # Iterate over the files
    for file in files:
        # Only process GeoTIFF outputs
        if file.endswith(".tif"):
            # Extract the acquisition date (YYYYDDD) from the filename
            doy = file.split(".")[3][:7]
            # Build the target directory path
            target_dir = os.path.join(output_dir, doy)
            # Create the target directory if it does not exist
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            # Move the file into the target directory
            source_path = os.path.join(output_dir, file)
            target_path = os.path.join(target_dir, file)
            shutil.move(source_path, target_path)
def main():
"""
Main function to run the HLS SuPER script.
"""
# Parse arguments
args = parse_arguments()
# Configure logging
log_handlers = [logging.StreamHandler(sys.stdout)]
if args.logfile:
log_handlers.append(logging.FileHandler(args.logfile))
logging.basicConfig(
level=logging.INFO,
format="%(levelname)s:%(asctime)s ||| %(message)s",
handlers=log_handlers,
)
# Handle Login Credentials with earthaccess
earthaccess.login(persist=True)
# Start Log
logging.info("HLS SuPER script started")
# Format ROI
roi, vl = format_roi(args.roi)
logging.info("Region of Interest formatted successfully")
    # (Add) Format the clip argument
clip = str_to_bool(args.clip)
logging.info(f"Clip to ROI: {clip}")
# Set Output Directory
if args.dir is not None:
output_dir = os.path.normpath(args.dir.strip("'").strip('"')) + os.sep
else:
# Defaults to the current directory
output_dir = os.getcwd() + os.sep
os.makedirs(output_dir, exist_ok=True)
logging.info(f"Output directory set to: {output_dir}")
# Format/Validate Dates
dates = format_dates(args.start, args.end)
logging.info(f"Date Parameters: {dates}")
# Create Product/Band Dictionary
band_dict = create_band_dict(args.prod, args.bands)
logging.info(f"Products/Bands Selected: {band_dict}")
# Format Cloud Cover
cc = format_cloud_cover(args.cc)
logging.info(f"Cloud Cover Filter <= {cc}")
    # (Add) Format the Tile ID argument
tile = format_tile_id(args.tile)
logging.info(f"Tile ID: {tile}")
# Quality Filtering
qf = str_to_bool(args.qf)
logging.info(f"Quality Filtering: {qf}")
# Scale Factor
scale = str_to_bool(args.scale)
logging.info(f"Apply Scale Factor: {scale}")
# Chunk Size
chunk_size = format_chunksize(args.cs)
logging.info(f"Chunk Size: {chunk_size}")
# Output File Type
if args.of not in ["COG", "NC4"]:
sys.exit(
f"Output format {args.of} is not a valid output format. Please choose from 'COG', 'NC4'."
)
logging.info(f"Output format: {args.of}")
# Search for Data and Save Results
results_urls_file = os.path.join(output_dir, "hls_super_results_urls.json")
use_existing_file = False
if os.path.isfile(results_urls_file):
logging.info(f"Results url list already exists in {output_dir}.")
# Confirm if user wants to use existing file.
if confirm_action(
f"Do you want to use the existing results file ({results_urls_file})? (y/n)"
):
use_existing_file = True
else:
if not confirm_action(
"Do you want to overwrite the existing results file? (y/n)"
):
sys.exit(
f"Processing aborted. Please move, rename, or remove existing file: {results_urls_file}."
)
if use_existing_file:
logging.info("Using existing results file.")
with open(results_urls_file, "r") as file:
results_urls = json.load(file)
else:
logging.info("Searching for data...")
results_urls = hls_search(
roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc, tile_id=tile
)
logging.info(f"Writing search results to {results_urls_file}")
with open(results_urls_file, "w") as file:
json.dump(results_urls, file)
results_count = len(results_urls)
total_assets = sum(len(sublist) for sublist in results_urls)
filter_descriptions = []
if cc:
filter_descriptions.append("cloud")
if tile:
filter_descriptions.append("tile")
if filter_descriptions:
filter_log = f"{results_count} granules remain after {' and '.join(filter_descriptions)} filtering. {total_assets} assets will be processed."
else:
filter_log = (
f"{results_count} granules remain. {total_assets} assets will be processed."
)
logging.info(filter_log)
# Confirm Processing
if not confirm_action("Do you want to proceed with processing? (y/n)"):
sys.exit("Processing aborted.")
# Initialize Dask Cluster
client = dask.distributed.Client()
# Setup Dask Environment (GDAL Configs)
client.run(setup_dask_environment)
logging.info(
f"Dask environment setup successfully. View dashboard: {client.dashboard_link}."
)
    # Scatter results URLs to workers
client.scatter(results_urls)
# If NC4, create a temporary directory to store COGs
if args.of == "NC4":
cog_dir = os.path.join(output_dir, "temp")
if not os.path.exists(cog_dir):
os.makedirs(cog_dir, exist_ok=True)
else:
if not confirm_action(
"Temporary directory to store COGs already exists. Use these files to create NC4 outputs? (y/n)"
):
sys.exit(
f"Processing aborted. Please remove existing directory: {cog_dir}."
)
else:
cog_dir = output_dir
# Process Granules
start_time = time.time()
logging.info("Processing...")
tasks = [
dask.delayed(process_granule)(
granule_url,
roi=roi,
clip=clip,
quality_filter=qf,
scale=scale,
output_dir=cog_dir,
band_dict=band_dict,
bit_nums=[1, 3],
chunk_size=chunk_size,
)
for granule_url in results_urls
]
dask.compute(*tasks)
# Create Timeseries Dataset if NC4
if args.of == "NC4":
logging.info("Creating timeseries dataset...")
create_timeseries_dataset(
cog_dir, output_type=args.of, output_dir=output_dir)
# Close Dask Client
client.close()
# Remove Temporary COGs if NC4
if args.of == "NC4":
logging.info("Timeseries Dataset Created. Removing Temporary Files...")
shutil.rmtree(cog_dir)
    # (Add) Archive downloaded imagery into DOY subdirectories
    logging.info("Archiving downloaded HLS imagery into DOY subdirectories")
    files_collection(output_dir)
    logging.info("Archiving complete!")
# End Timer
total_time = time.time() - start_time
logging.info(f"Processing complete. Total time: {round(total_time, 2)}s, ")
if __name__ == "__main__":
main()