diff --git a/HLS_SuPER/HLS_PER.py b/HLS_SuPER/HLS_PER.py
index 7b2a6b7..580f84a 100644
--- a/HLS_SuPER/HLS_PER.py
+++ b/HLS_SuPER/HLS_PER.py
@@ -7,7 +7,8 @@
 This module contains functions to conduct subsetting and quality filtering of
 search results.
 -------------------------------------------------------------------------------
 Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
-Last Updated: 2024-09-18
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
@@ -21,6 +22,7 @@
 import xarray as xr
 import rioxarray as rxr
 import dask.distributed
+
 
 def create_output_name(url, band_dict):
     """
     Uses HLS default naming scheme to generate an output name with common band names.
     """
@@ -42,7 +44,9 @@ def create_output_name(url, band_dict):
     return output_name
 
 
-def open_hls(url, roi=None, scale=True, chunk_size=dict(band=1, x=512, y=512)):
+def open_hls(
+    url, roi=None, clip=False, scale=True, chunk_size=dict(band=1, x=512, y=512)
+):
     """
     Generic Function to open an HLS COG and clip to ROI. For consistent scaling, this must be done manually.
     Some HLS Landsat scenes have the metadata in the wrong location.
     """
@@ -52,23 +56,40 @@ def open_hls(
         "band", drop=True
     )
 
-    # Reproject ROI and Clip if ROI is provided
-    if roi is not None:
+    # (Add) Parse the asset and band names from the URL
+    split_asset = url.split("/")[-1].split(".")
+    asset_name = split_asset[1]
+    band_name = split_asset[-2]
+
+    # Reproject ROI and Clip if ROI is provided and clip is True
+    if roi is not None and clip:
         roi = roi.to_crs(da.spatial_ref.crs_wkt)
         da = da.rio.clip(roi.geometry.values, roi.crs, all_touched=True)
 
+    # (Add) Most granules arrive already scaled with fill values applied, but some may still require manual scaling and fill-value handling in local GIS software
     # Apply Scale Factor if desired for non-quality layer
-    if scale and "Fmask" not in url:
-        # Mask Fill Values
-        da = xr.where(da == -9999, np.nan, da)
-        # Scale Data
-        da = da * 0.0001
-        # Remove Scale Factor After Scaling - Prevents Double Scaling
-        da.attrs["scale_factor"] = 1.0
+    if band_name != "Fmask":
+        if scale:
+            # Mask Fill Values
+            da = xr.where(da == -9999, np.nan, da)
+            # Scale Data
+            # (Add) Except for the quality layer and the two L30 thermal bands, every spectral band uses a scale factor of 0.0001
+            # (Add) The two L30 thermal bands use a scale factor of 0.01
+            # NOTE: the thermal bands have not been renamed at this point
+            if (band_name == "B10" or band_name == "B11") and (asset_name == "L30"):
+                da = da * 0.01
+            else:
+                da = da * 0.0001
+            # Remove Scale Factor After Scaling - Prevents Double Scaling
+            da.attrs["scale_factor"] = 1.0
 
-    # Add Scale Factor to Attributes Manually - This will overwrite/add if the data is missing.
-    if not scale and "Fmask" not in url:
-        da.attrs["scale_factor"] = 0.0001
+        # Add Scale Factor to Attributes Manually - This will overwrite/add if the data is missing.
+        # (Add) When scaling is left to the user, the scale factor must be written into the attributes by hand
+        else:
+            if (band_name == "B10" or band_name == "B11") and (asset_name == "L30"):
+                da.attrs["scale_factor"] = 0.01
+            else:
+                da.attrs["scale_factor"] = 0.0001
 
     return da
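For reference, a minimal sketch (outside the patch) of how the scale-factor selection in `open_hls` behaves; the URL is hypothetical and only the HLS v2.0 naming pattern `HLS.<product>.<tile>.<datetime>.v2.0.<band>.tif` matters:

```python
# Minimal sketch of the scale-factor selection in open_hls above.
# The URL is made up; only the filename structure is meaningful.
url = "https://example.com/HLS.L30.T49RGQ.2024011T030341.v2.0.B10.tif"

split_asset = url.split("/")[-1].split(".")
asset_name = split_asset[1]   # "L30"
band_name = split_asset[-2]   # "B10"

if band_name != "Fmask":
    if band_name in ("B10", "B11") and asset_name == "L30":
        scale_factor = 0.01    # L30 thermal infrared bands
    else:
        scale_factor = 0.0001  # all other spectral bands
    print(asset_name, band_name, scale_factor)  # -> L30 B10 0.01
```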
@@ -91,6 +112,7 @@ def process_granule(
     granule_urls,
     roi,
+    clip,
     quality_filter,
     scale,
     output_dir,
@@ -100,6 +122,31 @@
 ):
     """
     Processes a list of HLS asset urls for a single granule.
+
+    args:
+        granule_urls (list): List of HLS asset urls to process.
+
+        roi (geopandas.GeoDataFrame): ROI to filter data.
+
+        clip (bool): If True, the image will be clipped to the ROI.
+
+        quality_filter (bool): If True, the quality layer will be used to mask data.
+
+        scale (bool): If True, data will be scaled to reflectance.
+
+        output_dir (str): Directory to save output files.
+
+        band_dict (dict): Dictionary of band names and asset names.
+
+        bit_nums (list): List of bit numbers to use for quality mask.
+            - 0: Cirrus
+            - 1: Cloud
+            - 2: Adjacent to cloud/shadow
+            - 3: Cloud shadow
+            - 4: Snow/ice
+            - 5: Water
+
+        chunk_size (dict): Dictionary of chunk sizes for dask.
     """
 
     # Setup Logging
@@ -116,6 +163,7 @@
     ):
 
         # First Handle Quality Layer
+        # (Add) Simplified the original redundant handling: process only the quality layer here, then drop its URL from the asset list
        if quality_filter:
            # Generate Quality Layer URL
            split_asset = granule_urls[0].split("/")[-1].split(".")
@@ -123,31 +171,26 @@
            quality_url = (
                f"{'/'.join(granule_urls[0].split('/')[:-1])}/{'.'.join(split_asset)}"
            )
-            # Check if File exists in Output Directory
-            output_name = create_output_name(quality_url, band_dict)
-            output_file = f"{output_dir}/{output_name}"
-
-            # Open Quality Layer
-            qa_da = open_hls(quality_url, roi, scale, chunk_size)
-
+            quality_output_name = create_output_name(quality_url, band_dict)
+            quality_output_file = f"{output_dir}/{quality_output_name}"
            # Check if quality asset is already processed
-            if not os.path.isfile(output_file):
+            if not os.path.isfile(quality_output_file):
                # Write Output
-                qa_da.rio.to_raster(raster_path=output_file, driver="COG")
+                # Open Quality Layer
+                qa_da = open_hls(quality_url, roi, clip, scale, chunk_size)
+                qa_da.rio.to_raster(raster_path=quality_output_file, driver="COG")
            else:
                logging.info(
-                    f"Existing file {output_name} found in {output_dir}. Skipping."
+                    f"Existing quality file {quality_output_name} found in {output_dir}."
                )
-
-            # Remove Quality Layer from Granule Asset List if Present
-            granule_urls = [asset for asset in granule_urls if asset != quality_url]
-
            # Create Quality Mask
+            # TODO: The mask array may cause Dask to run out of memory; needs optimization
            qa_mask = create_quality_mask(qa_da, bit_nums=bit_nums)
+            # (Add) With quality_filter=True, remove the quality layer once the mask is created to avoid processing it again below
+            granule_urls = [url for url in granule_urls if "Fmask" not in url]
 
        # Process Remaining Assets
-
        for url in granule_urls:
            # Check if File exists in Output Directory
            output_name = create_output_name(url, band_dict)
            output_file = f"{output_dir}/{output_name}"
 
@@ -156,14 +199,20 @@
            # Check if scene is already processed
            if not os.path.isfile(output_file):
                # Open Asset
-                da = open_hls(url, roi, scale, chunk_size)
+                da = open_hls(url, roi, clip, scale, chunk_size)
 
                # Apply Quality Mask if Desired
                if quality_filter:
                    da = da.where(~qa_mask)
 
                # Write Output
-                da.rio.to_raster(raster_path=output_file, driver="COG")
+                if "FMASK" in output_name:
+                    da.rio.to_raster(raster_path=output_file, driver="COG")
+                else:
+                    # (Add) Force float32 output; otherwise it defaults to float64
+                    da.rio.to_raster(
+                        raster_path=output_file, driver="COG", dtype="float32"
+                    )
            else:
                logging.info(
                    f"Existing file {output_name} found in {output_dir}. Skipping."
                )
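`create_quality_mask` itself is untouched by the patch, so its body is not shown above. The sketch below illustrates the kind of bit test it is assumed to perform, using the Fmask bit layout documented in the new `process_granule` docstring (0 = Cirrus, 1 = Cloud, 2 = Adjacent, 3 = Cloud shadow, 4 = Snow/ice, 5 = Water); `quality_bits_set` is a hypothetical stand-in, not the repository function:

```python
import numpy as np

def quality_bits_set(fmask: np.ndarray, bit_nums=(1, 3)) -> np.ndarray:
    """Hypothetical stand-in for create_quality_mask: True wherever any of
    the requested Fmask bits (here 1 = Cloud, 3 = Cloud shadow) are set."""
    mask = np.zeros(fmask.shape, dtype=bool)
    for bit in bit_nums:
        # Shift the target bit into position 0 and test it
        mask |= ((fmask >> bit) & 1).astype(bool)
    return mask

# 0b0010 has bit 1 (Cloud) set; 0b1000 has bit 3 (Cloud shadow) set;
# 0b100000 has only bit 5 (Water) set, so it is not masked with bits (1, 3).
fmask = np.array([[0b0010, 0b1000], [0b100000, 0]], dtype=np.uint8)
print(quality_bits_set(fmask))  # [[ True  True]
                                #  [False False]]
```

`process_granule` then keeps only the unmasked pixels via `da.where(~qa_mask)`.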
diff --git a/HLS_SuPER/HLS_Su.py b/HLS_SuPER/HLS_Su.py
index 6e7bdcb..02f63f5 100644
--- a/HLS_SuPER/HLS_Su.py
+++ b/HLS_SuPER/HLS_Su.py
@@ -5,8 +5,9 @@
 This module contains functions related to searching and preprocessing HLS data.
 -------------------------------------------------------------------------------
 Authors: Mahsa Jami, Cole Krehbiel, and Erik Bolch
-Contact: lpdaac@usgs.gov
-Last Updated: 2024-09-18
+Contact: lpdaac@usgs.gov
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
 
@@ -16,7 +17,9 @@
 import earthaccess
 
 
 # Main function to search and filter HLS data
-def hls_search(roi: list, band_dict: dict, dates=None, cloud_cover=None, log=False):
+def hls_search(
+    roi: list, band_dict: dict, dates=None, cloud_cover=None, tile_id=None, log=False
+):
     """
     This function uses earthaccess to search for HLS data using an ROI and temporal parameters, filters by cloud cover, and delivers a list of result urls for the selected bands.
     """
@@ -27,6 +30,10 @@
         temporal=dates,
     )
 
+    # (Add) Filter granules by tile ID
+    if tile_id:
+        results = hls_tileid_filter(results, tile_id)
+
     # Filter by cloud cover
     if cloud_cover:
         results = hls_cc_filter(results, cloud_cover)
@@ -45,6 +52,23 @@
     return selected_results_urls
 
 
+def hls_tileid_filter(results, tile_id):
+    """
+    (Add) Filter earthaccess search results by the given tile ID
+    """
+
+    tile_ids = []
+    for result in results:
+        # Extract the tile ID from the result json, cast it to a string, and collect it
+        tmp_id = str(result["meta"]["native-id"].split(".")[2])
+        tile_ids.append(tmp_id)
+    tile_ids = np.array(tile_ids)
+    # Find the indices that match the tile ID
+    tile_id_indices = np.where(tile_ids == tile_id)
+    # Filter the results by those indices
+    return [results[i] for i in tile_id_indices[0]]
+
+
 # Filter earthaccess results based on cloud cover threshold
 def hls_cc_filter(results, cc_threshold):
     """
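Note that `hls_tileid_filter` calls `np.array` and `np.where`, but the hunks above do not show an `import numpy as np` being added to HLS_Su.py, so one presumably must already be present. A dependency-free sketch of the same filtering, assuming `native-id`s of the form `HLS.L30.T49RGQ.2024011T030341.v2.0` (field 2 is the tile ID, exactly as indexed in the function above):

```python
def hls_tileid_filter_simple(results, tile_id):
    # Keep only results whose native-id tile field matches, e.g.
    # "HLS.L30.T49RGQ.2024011T030341.v2.0".split(".")[2] -> "T49RGQ"
    return [r for r in results if r["meta"]["native-id"].split(".")[2] == tile_id]
```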
diff --git a/HLS_SuPER/HLS_SuPER.py b/HLS_SuPER/HLS_SuPER.py
index 2bba32a..53deb7c 100644
--- a/HLS_SuPER/HLS_SuPER.py
+++ b/HLS_SuPER/HLS_SuPER.py
@@ -4,7 +4,8 @@
 HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
 
 Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
 Contact: lpdaac@usgs.gov
-Last Updated: 2024-09-18
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
 
@@ -53,6 +54,23 @@
         " ",
     )
 
+    # (Add) clip: clipping flag; defaults to False (no clipping), True clips the output to the ROI
+    parser.add_argument(
+        "-clip",
+        choices=["True", "False"],
+        required=False,
+        help="(Optional) If provided, the script will clip the output to the ROI.",
+        default="False",
+    )
+
+    # (Add) tile: HLS tile ID
+    parser.add_argument(
+        "-tile",
+        type=str,
+        required=False,
+        help="(Optional) Tile ID for spatial subset. If provided, the script will search for the tile ID based on the ROI.",
+    )
+
     # dir: Directory to save the files to
     parser.add_argument(
         "-dir",
@@ -117,7 +135,7 @@
         choices=["True", "False"],
         required=False,
         help="Flag to apply scale factor to layers before exporting output files. This is generally unnecessary as most applications will scale automatically.",
-        default="False",
+        default="True",
     )
 
     # of: output file format
@@ -221,6 +239,14 @@
     return dates
 
 
+def format_tile_id(tile_id):
+    """
+    (Add) Format the tile_id argument
+    """
+    tile_id = tile_id.strip("'").strip('"')
+    return str(tile_id)
+
+
 def format_cloud_cover(cc):
     try:
         cc = int(cc.strip("'").strip('"'))
@@ -464,6 +490,10 @@
     roi, vl = format_roi(args.roi)
     logging.info("Region of Interest formatted successfully")
 
+    # (Add) Format the clip argument
+    clip = str_to_bool(args.clip)
+    logging.info(f"Clip to ROI: {clip}")
+
     # Set Output Directory
     if args.dir is not None:
         output_dir = os.path.normpath(args.dir.strip("'").strip('"')) + os.sep
@@ -485,6 +515,10 @@
     cc = format_cloud_cover(args.cc)
     logging.info(f"Cloud Cover Filter <= {cc}")
 
+    # (Add) Format the Tile ID argument
+    tile = format_tile_id(args.tile)
+    logging.info(f"Tile ID: {tile}")
+
     # Quality Filtering
     qf = str_to_bool(args.qf)
     logging.info(f"Quality Filtering: {qf}")
@@ -533,20 +567,27 @@
     else:
         logging.info("Searching for data...")
         results_urls = hls_search(
-            roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc
+            roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc, tile_id=tile
         )
         logging.info(f"Writing search results to {results_urls_file}")
         with open(results_urls_file, "w") as file:
             json.dump(results_urls, file)
 
+    results_count = len(results_urls)
     total_assets = sum(len(sublist) for sublist in results_urls)
+    filter_descriptions = []
     if cc:
-        logging.info(
-            f"{len(results_urls)} granules remain after cloud filtering. {total_assets} assets will be processed."
-        )
+        filter_descriptions.append("cloud")
+    if tile:
+        filter_descriptions.append("tile")
+    if filter_descriptions:
+        filter_log = f"{results_count} granules remain after {' and '.join(filter_descriptions)} filtering. {total_assets} assets will be processed."
     else:
-        logging.info(f"{total_assets} assets will be processed.")
+        filter_log = (
+            f"{results_count} granules remain. {total_assets} assets will be processed."
+        )
+    logging.info(filter_log)
 
     # Confirm Processing
     if not confirm_action("Do you want to proceed with processing? (y/n)"):
@@ -588,11 +629,12 @@
             dask.delayed(process_granule)(
                 granule_url,
                 roi=roi,
+                clip=clip,
                 quality_filter=qf,
                 scale=scale,
                 output_dir=cog_dir,
                 band_dict=band_dict,
-                bit_nums=[0, 1, 2, 3, 4, 5],
+                bit_nums=[1, 3],
                 chunk_size=chunk_size,
             )
             for granule_url in results_urls
@@ -612,8 +654,8 @@
             logging.info("Timeseries Dataset Created. Removing Temporary Files...")
             shutil.rmtree(cog_dir)
 
-    # Add: Archive the downloaded imagery by DOY
-    logging.info("Archiving downloaded imagery by DOY date.")
+    # (Add) Archive the downloaded imagery by DOY
+    logging.info("Archiving the downloaded HLS imagery by DOY date.")
     files_collection(output_dir)
     logging.info("Archiving complete!")
diff --git a/README.md b/README.md
index bb18d97..91d1eca 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 ### 1.2 Configure environment variables
 
-- 为了在控制台中直接使用conda命令, 需要将安装的相关目录配置到Path环境变量中。
+- 为了在控制台中直接使用conda命令，需要将安装的相关目录配置到Path环境变量中。
 
 ```
 D:\program\miniforge3
@@ -98,9 +98,11 @@ mamba activate lpdaac_windows
 
 - Note that the password should not contain symbols such as `@/#/$/%`, which may cause errors during downloading.
 - A single user is limited to at most 100 requests per second; see: https://forum.earthdata.nasa.gov/viewtopic.php?t=3734
 
-### 3.2 Download cloud-hosted data and preprocess it in memory
+### 3.2 Available script parameters
 
 - `-roi`: Region of interest; point coordinates must be given counterclockwise in **lower-left, upper-right** order
+- `-clip`: Whether to clip the imagery to the ROI, default `False`
+- `-tile`: HLS tile ID, e.g. `T49RGQ`
 - `-dir`: Output directory; must be an existing directory
 - `-start`: Start date, format `YYYY-MM-DD`
 - `-end`: End date, format `YYYY-MM-DD`
@@ -110,14 +112,28 @@
 - `-qf`: Whether to use the quality band to filter cloud/cloud-shadow pixels, default `True`
 - `-scale`: Whether to apply the scale factor to the imagery, default `True`
 
-- Download all spectral bands
+### 3.3 Examples: download cloud-hosted data and preprocess it in memory
+
+- Download the core spectral bands of L30 and S30: filter granules only by ROI, tile ID, date range, and product name, with no cloud-cover filtering; apply cloud masking to the imagery
 
 ```sh
-python .\\HLS_SuPER\\HLS_SuPER.py -roi '113.10114,30.62845,114.24349,31.59081' -dir .\\data\\HLS\\L30 -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands COASTAL-AEROSOL,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,CIRRUS,TIR1,TIR2,Fmask -cc 70 -qf True -scale True
+python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\ALL -start 2024-01-01 -end 2024-01-31 -prod both -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,Fmask -scale True
 ```
 
-- Download only the essential core bands
+- Download all L30 bands: filter granules by ROI, tile ID, date range, and product name, keeping only granules with cloud cover below 70%; apply cloud masking to the imagery
 
 ```sh
-python .\\HLS_SuPER\\HLS_SuPER.py -roi '113.10114,30.62845,114.24349,31.59081' -dir .\\data\\HLS\\L30\\subset -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2 -cc 70 -qf True -scale True
-```
\ No newline at end of file
+python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\subset -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands COASTAL-AEROSOL,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,CIRRUS,TIR1,TIR2,Fmask -cc 70 -scale True
+```
+
+- Download only the L30 thermal infrared bands: filter granules only by ROI, tile ID, date range, and product name, with no cloud-cover filtering; apply cloud masking to the imagery
+
+```sh
+python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\TIR -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands TIR1,TIR2,Fmask -scale True
+```
+
+- [For testing] Download the L30 spectral and thermal infrared bands for summer 2024 directly, without cloud-cover filtering
+
+```sh
+python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\subset\\2024 -start 2024-06-01 -end 2024-08-31 -prod HLSL30 -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,TIR1,TIR2,Fmask -scale True
+```