feat: Update to the stable version after multiple rounds of testing and use.

谢泓 2025-01-09 11:31:41 +08:00
parent 6d16e32022
commit ac1a0a96f2
4 changed files with 182 additions and 51 deletions

View File

@@ -7,7 +7,8 @@ This module contains functions to conduct subsetting and quality filtering of
 search results.
 -------------------------------------------------------------------------------
 Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
-Last Updated: 2024-09-18
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
@@ -21,6 +22,7 @@ import xarray as xr
 import rioxarray as rxr
 import dask.distributed
 def create_output_name(url, band_dict):
     """
     Uses HLS default naming scheme to generate an output name with common band names.
@@ -42,7 +44,9 @@ def create_output_name(url, band_dict):
     return output_name
-def open_hls(url, roi=None, scale=True, chunk_size=dict(band=1, x=512, y=512)):
+def open_hls(
+    url, roi=None, clip=False, scale=True, chunk_size=dict(band=1, x=512, y=512)
+):
     """
     Generic Function to open an HLS COG and clip to ROI. For consistent scaling, this must be done manually.
     Some HLS Landsat scenes have the metadata in the wrong location.
@@ -52,23 +56,40 @@ def open_hls(url, roi=None, scale=True, chunk_size=dict(band=1, x=512, y=512)):
         "band", drop=True
     )
-    # Reproject ROI and Clip if ROI is provided
-    if roi is not None:
+    # (Add) Read the asset and band names from the URL
+    split_asset = url.split("/")[-1].split(".")
+    asset_name = split_asset[1]
+    band_name = split_asset[-2]
+    # Reproject ROI and Clip if ROI is provided and clip is True
+    if roi is not None and clip:
         roi = roi.to_crs(da.spatial_ref.crs_wkt)
         da = da.rio.clip(roi.geometry.values, roi.crs, all_touched=True)
+    # (Add) Although most scenes are already scaled and have fill values applied,
+    # some may still need manual scaling and fill-value handling in local GIS software
     # Apply Scale Factor if desired for non-quality layer
-    if scale and "Fmask" not in url:
-        # Mask Fill Values
-        da = xr.where(da == -9999, np.nan, da)
-        # Scale Data
-        da = da * 0.0001
-        # Remove Scale Factor After Scaling - Prevents Double Scaling
-        da.attrs["scale_factor"] = 1.0
-    # Add Scale Factor to Attributes Manually - This will overwrite/add if the data is missing.
-    if not scale and "Fmask" not in url:
-        da.attrs["scale_factor"] = 0.0001
+    if band_name != "Fmask":
+        if scale:
+            # Mask Fill Values
+            da = xr.where(da == -9999, np.nan, da)
+            # Scale Data
+            # (Add) Apart from the quality layer and the two L30 thermal bands,
+            # every spectral band uses a scale factor of 0.0001;
+            # the two L30 thermal bands use a scale factor of 0.01
+            # NOTE: the thermal bands have not been renamed at this point
+            if (band_name == "B10" or band_name == "B11") and (asset_name == "L30"):
+                da = da * 0.01
+            else:
+                da = da * 0.0001
+            # Remove Scale Factor After Scaling - Prevents Double Scaling
+            da.attrs["scale_factor"] = 1.0
+        # Add Scale Factor to Attributes Manually - This will overwrite/add if the data is missing.
+        # (Add) When scaling manually later, the scale factor must be added to the attributes here
+        else:
+            if (band_name == "B10" or band_name == "B11") and (asset_name == "L30"):
+                da.attrs["scale_factor"] = 0.01
+            else:
+                da.attrs["scale_factor"] = 0.0001
     return da
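For reference, the band-dependent scaling introduced above reduces to a small selection rule. A standalone sketch (the example filename is illustrative; asset and band names are parsed exactly as in `open_hls`):

```python
# Sketch of the scale-factor rule: HLS L30 thermal bands B10/B11 are scaled
# by 0.01, all other spectral bands by 0.0001 (the Fmask layer is never scaled).
def get_scale_factor(asset_name: str, band_name: str) -> float:
    if asset_name == "L30" and band_name in ("B10", "B11"):
        return 0.01
    return 0.0001

# Asset/band names come from the dot-separated asset filename, as in open_hls:
url = "HLS.L30.T49RGQ.2024001T030000.v2.0.B10.tif"  # illustrative name
split_asset = url.split("/")[-1].split(".")
asset_name, band_name = split_asset[1], split_asset[-2]  # "L30", "B10"
assert get_scale_factor(asset_name, band_name) == 0.01
```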
@@ -91,6 +112,7 @@ def create_quality_mask(quality_data, bit_nums: list = [0, 1, 2, 3, 4, 5]):
 def process_granule(
     granule_urls,
     roi,
+    clip,
     quality_filter,
     scale,
     output_dir,
@@ -100,6 +122,31 @@ def process_granule(
 ):
     """
     Processes a list of HLS asset urls for a single granule.
+    args:
+        granule_urls (list): List of HLS asset urls to process.
+        roi (geopandas.GeoDataFrame): ROI to filter data.
+        clip (bool): If True, the output will be clipped to the ROI.
+        quality_filter (bool): If True, the quality layer will be used to mask data.
+        scale (bool): If True, data will be scaled to reflectance.
+        output_dir (str): Directory to save output files.
+        band_dict (dict): Dictionary of band names and asset names.
+        bit_nums (list): List of bit numbers to use for the quality mask.
+            - 0: Cirrus
+            - 1: Cloud
+            - 2: Adjacent to cloud/shadow
+            - 3: Cloud shadow
+            - 4: Snow/ice
+            - 5: Water
+        chunk_size (dict): Dictionary of chunk sizes for dask.
     """
     # Setup Logging
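The body of `create_quality_mask` is not shown in this diff. A minimal sketch consistent with the docstring above (mask a pixel wherever any requested Fmask bit is set) could look like:

```python
import numpy as np

def create_quality_mask(quality_data, bit_nums=[0, 1, 2, 3, 4, 5]):
    """Return True wherever any of the given Fmask bits is set (pixel is masked)."""
    # Replace NaN fill with 0 so integer bit operations are defined
    qa = np.nan_to_num(np.asarray(quality_data), nan=0).astype(np.int16)
    mask_array = np.zeros(qa.shape, dtype=bool)
    for bit in bit_nums:
        # Shift the requested bit into position 0 and test it
        mask_array |= ((qa >> bit) & 1) == 1
    return mask_array
```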
@@ -116,6 +163,7 @@ def process_granule(
     ):
         # First Handle Quality Layer
+        # (Add) Simplified the redundant original handling: process only the quality layer, then drop its download URL from the asset list
         if quality_filter:
             # Generate Quality Layer URL
             split_asset = granule_urls[0].split("/")[-1].split(".")
@@ -123,31 +171,26 @@ def process_granule(
             quality_url = (
                 f"{'/'.join(granule_urls[0].split('/')[:-1])}/{'.'.join(split_asset)}"
             )
             # Check if File exists in Output Directory
-            output_name = create_output_name(quality_url, band_dict)
-            output_file = f"{output_dir}/{output_name}"
-            # Open Quality Layer
-            qa_da = open_hls(quality_url, roi, scale, chunk_size)
+            quality_output_name = create_output_name(quality_url, band_dict)
+            quality_output_file = f"{output_dir}/{quality_output_name}"
+            # Open Quality Layer (needed below for the mask even when the file already exists)
+            qa_da = open_hls(quality_url, roi, clip, scale, chunk_size)
             # Check if quality asset is already processed
-            if not os.path.isfile(output_file):
+            if not os.path.isfile(quality_output_file):
                 # Write Output
-                qa_da.rio.to_raster(raster_path=output_file, driver="COG")
+                qa_da.rio.to_raster(raster_path=quality_output_file, driver="COG")
             else:
                 logging.info(
-                    f"Existing file {output_name} found in {output_dir}. Skipping."
+                    f"Existing quality file {quality_output_name} found in {output_dir}."
                 )
-            # Remove Quality Layer from Granule Asset List if Present
-            granule_urls = [asset for asset in granule_urls if asset != quality_url]
             # Create Quality Mask
+            # TODO: the mask array may cause Dask to run out of memory; needs optimization
             qa_mask = create_quality_mask(qa_da, bit_nums=bit_nums)
+            # (Add) With quality_filter=True, drop the quality layer from the asset list
+            # once the mask is built, so it is not processed again below
+            granule_urls = [url for url in granule_urls if "Fmask" not in url]
         # Process Remaining Assets
         for url in granule_urls:
             # Check if File exists in Output Directory
             output_name = create_output_name(url, band_dict)
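The line that swaps the band segment for `Fmask` sits outside this hunk; presumably it rewrites the first asset's filename, along these lines (URL is illustrative):

```python
# Hypothetical reconstruction of the quality-URL derivation used above
url = "https://example.com/HLS.L30.T49RGQ.2024001T030000.v2.0.B04.tif"
split_asset = url.split("/")[-1].split(".")
split_asset[-2] = "Fmask"  # replace the band segment with the quality layer
quality_url = f"{'/'.join(url.split('/')[:-1])}/{'.'.join(split_asset)}"
print(quality_url)  # https://example.com/HLS.L30.T49RGQ.2024001T030000.v2.0.Fmask.tif
```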
@@ -156,14 +199,20 @@ def process_granule(
             # Check if scene is already processed
             if not os.path.isfile(output_file):
                 # Open Asset
-                da = open_hls(url, roi, scale, chunk_size)
+                da = open_hls(url, roi, clip, scale, chunk_size)
                 # Apply Quality Mask if Desired
                 if quality_filter:
                     da = da.where(~qa_mask)
                 # Write Output
-                da.rio.to_raster(raster_path=output_file, driver="COG")
+                if "FMASK" in output_name:
+                    da.rio.to_raster(raster_path=output_file, driver="COG")
+                else:
+                    # (Add) Pin the output to float32; it would otherwise default to float64
+                    da.rio.to_raster(
+                        raster_path=output_file, driver="COG", dtype="float32"
+                    )
             else:
                 logging.info(
                     f"Existing file {output_name} found in {output_dir}. Skipping."

View File

@@ -6,7 +6,8 @@ This module contains functions related to searching and preprocessing HLS data.
 -------------------------------------------------------------------------------
 Authors: Mahsa Jami, Cole Krehbiel, and Erik Bolch
 Contact: lpdaac@usgs.gov
-Last Updated: 2024-09-18
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
@@ -16,7 +17,9 @@ import earthaccess
 # Main function to search and filter HLS data
-def hls_search(roi: list, band_dict: dict, dates=None, cloud_cover=None, log=False):
+def hls_search(
+    roi: list, band_dict: dict, dates=None, cloud_cover=None, tile_id=None, log=False
+):
     """
     This function uses earthaccess to search for HLS data using an roi and temporal parameter, filter by cloud cover and delivers a list of results urls for the selected bands.
     """
@@ -27,6 +30,10 @@ def hls_search(roi: list, band_dict: dict, dates=None, cloud_cover=None, log=False):
         temporal=dates,
     )
+    # (Add) Filter scenes by tile ID
+    if tile_id:
+        results = hls_tileid_filter(results, tile_id)
     # Filter by cloud cover
     if cloud_cover:
         results = hls_cc_filter(results, cloud_cover)
@@ -45,6 +52,23 @@ def hls_search(roi: list, band_dict: dict, dates=None, cloud_cover=None, log=False):
     return selected_results_urls
+def hls_tileid_filter(results, tile_id):
+    """
+    (Add) Filter earthaccess search results by a given tile ID.
+    """
+    tile_ids = []
+    for result in results:
+        # Read each tile ID from the result metadata JSON and collect as strings
+        tmp_id = str(result["meta"]["native-id"].split(".")[2])
+        tile_ids.append(tmp_id)
+    tile_ids = np.array(tile_ids)
+    # Find the indices that match the requested tile ID
+    tile_id_indices = np.where(tile_ids == tile_id)
+    # Filter the results by those indices
+    return [results[i] for i in tile_id_indices[0]]
 # Filter earthaccess results based on cloud cover threshold
 def hls_cc_filter(results, cc_threshold):
     """

View File

@@ -4,7 +4,8 @@
 HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
 Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
 Contact: lpdaac@usgs.gov
-Last Updated: 2024-09-18
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
@@ -53,6 +54,23 @@ def parse_arguments():
         " ",
     )
+    # (Add) clip: whether to clip the output to the ROI; defaults to False (no clipping)
+    parser.add_argument(
+        "-clip",
+        choices=["True", "False"],
+        required=False,
+        help="(Optional) If provided, the script will clip the output to the ROI.",
+        default="False",
+    )
+    # (Add) tile: HLS tile ID
+    parser.add_argument(
+        "-tile",
+        type=str,
+        required=False,
+        help="(Optional) HLS tile ID for spatial subsetting, e.g. T49RGQ. If provided, search results within the ROI are filtered to this tile.",
+    )
     # dir: Directory to save the files to
     parser.add_argument(
         "-dir",
@@ -117,7 +135,7 @@ def parse_arguments():
         choices=["True", "False"],
         required=False,
         help="Flag to apply scale factor to layers before exporting output files. This is generally unnecessary as most applications will scale automatically.",
-        default="False",
+        default="True",
     )
 # of: output file format
@@ -221,6 +239,14 @@ def format_dates(start, end):
     return dates
+def format_tile_id(tile_id):
+    """
+    (Add) Normalize the tile_id argument by stripping surrounding quotes.
+    """
+    if tile_id is None:
+        # -tile is optional and has no default; pass None through unchanged
+        return None
+    return str(tile_id.strip("'").strip('"'))
 def format_cloud_cover(cc):
     try:
         cc = int(cc.strip("'").strip('"'))
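A usage sketch for the quote-stripping helper; the `None` guard matters because `-tile` is optional and has no default:

```python
assert format_tile_id("'T49RGQ'") == "T49RGQ"
assert format_tile_id('"T49RGQ"') == "T49RGQ"
assert format_tile_id(None) is None  # -tile omitted: no tile filtering applied
```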
@@ -464,6 +490,10 @@ def main():
     roi, vl = format_roi(args.roi)
     logging.info("Region of Interest formatted successfully")
+    # (Add) Parse the clip flag
+    clip = str_to_bool(args.clip)
+    logging.info(f"Clip to ROI: {clip}")
     # Set Output Directory
     if args.dir is not None:
         output_dir = os.path.normpath(args.dir.strip("'").strip('"')) + os.sep
@@ -485,6 +515,10 @@ def main():
     cc = format_cloud_cover(args.cc)
     logging.info(f"Cloud Cover Filter <= {cc}")
+    # (Add) Format the Tile ID argument
+    tile = format_tile_id(args.tile)
+    logging.info(f"Tile ID: {tile}")
     # Quality Filtering
     qf = str_to_bool(args.qf)
     logging.info(f"Quality Filtering: {qf}")
@@ -533,20 +567,27 @@ def main():
     else:
         logging.info("Searching for data...")
         results_urls = hls_search(
-            roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc
+            roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc, tile_id=tile
         )
         logging.info(f"Writing search results to {results_urls_file}")
         with open(results_urls_file, "w") as file:
             json.dump(results_urls, file)
+    results_count = len(results_urls)
     total_assets = sum(len(sublist) for sublist in results_urls)
+    filter_descriptions = []
     if cc:
-        logging.info(
-            f"{len(results_urls)} granules remain after cloud filtering. {total_assets} assets will be processed."
-        )
+        filter_descriptions.append("cloud")
+    if tile:
+        filter_descriptions.append("tile")
+    if filter_descriptions:
+        filter_log = f"{results_count} granules remain after {' and '.join(filter_descriptions)} filtering. {total_assets} assets will be processed."
     else:
-        logging.info(f"{total_assets} assets will be processed.")
+        filter_log = (
+            f"{results_count} granules remain. {total_assets} assets will be processed."
+        )
+    logging.info(filter_log)
     # Confirm Processing
     if not confirm_action("Do you want to proceed with processing? (y/n)"):
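For illustration, with both filters active the combined message reads something like (counts are hypothetical):

```
12 granules remain after cloud and tile filtering. 84 assets will be processed.
```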
@@ -588,11 +629,12 @@ def main():
             dask.delayed(process_granule)(
                 granule_url,
                 roi=roi,
+                clip=clip,
                 quality_filter=qf,
                 scale=scale,
                 output_dir=cog_dir,
                 band_dict=band_dict,
-                bit_nums=[0, 1, 2, 3, 4, 5],
+                bit_nums=[1, 3],
                 chunk_size=chunk_size,
             )
             for granule_url in results_urls
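Narrowing `bit_nums` from `[0, 1, 2, 3, 4, 5]` to `[1, 3]` masks only cloud and cloud-shadow pixels, leaving cirrus, adjacent, snow/ice, and water pixels untouched. A quick bit-level check:

```python
# Fmask bit layout (see the process_granule docstring): 0 cirrus, 1 cloud,
# 2 adjacent to cloud/shadow, 3 cloud shadow, 4 snow/ice, 5 water
fmask_value = 0b00100010  # cloud (bit 1) and water (bit 5) are set
masked = any(((fmask_value >> b) & 1) == 1 for b in [1, 3])
print(masked)  # True: the cloud bit alone triggers masking; water no longer does
```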
@@ -612,8 +654,8 @@ def main():
     logging.info("Timeseries Dataset Created. Removing Temporary Files...")
     shutil.rmtree(cog_dir)
-    # Add: Archive the downloaded imagery by DOY
-    logging.info("Start archiving the downloaded imagery by DOY date.")
+    # (Add) Archive the downloaded imagery by DOY
+    logging.info("Start archiving the downloaded HLS imagery by DOY date.")
     files_collection(output_dir)
     logging.info("Archiving complete!")

View File

@@ -13,7 +13,7 @@
 ### 1.2 Configure environment variables
-- To use conda commands directly in the console, the relevant installation directories need to be configured into the Path environment variable.
+- To use conda commands directly in the console, add the relevant installation directories to the Path environment variable.
 ```
 D:\program\miniforge3
@@ -98,9 +98,11 @@ mamba activate lpdaac_windows
 - Note that the password should preferably not contain symbols such as `@/#/$/%`, which may cause errors when fetching data.
 - A single user is limited to at most 100 requests per second (see https://forum.earthdata.nasa.gov/viewtopic.php?t=3734).
-### 3.2 Fetch cloud-hosted data and preprocess it in memory
+### 3.2 Available script parameters
 - `-roi`: Region of interest; the point coordinates must be given counterclockwise as **lower-left, upper-right** corners
+- `-clip`: Whether to clip the imagery to the ROI; defaults to `False`
+- `-tile`: HLS tile ID, e.g. `T49RGQ`
 - `-dir`: Output directory; must be an existing directory
 - `-start`: Start date, in `YYYY-MM-DD` format
 - `-end`: End date, in `YYYY-MM-DD` format
@@ -110,14 +112,28 @@ mamba activate lpdaac_windows
 - `-qf`: Whether to use the quality band to filter cloud/cloud-shadow pixels; defaults to `True`
 - `-scale`: Whether to apply the scale factor to the imagery; defaults to `True`
-- Fetch all spectral bands
+### 3.3 Examples: fetch cloud-hosted data and preprocess it in memory
+- Fetch the core spectral bands of L30 and S30; select scenes only by ROI, tile ID, date range, and product name (no cloud-cover filtering); apply cloud masking
 ```sh
-python .\\HLS_SuPER\\HLS_SuPER.py -roi '113.10114,30.62845,114.24349,31.59081' -dir .\\data\\HLS\\L30 -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands COASTAL-AEROSOL,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,CIRRUS,TIR1,TIR2,Fmask -cc 70 -qf True -scale True
+python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\ALL -start 2024-01-01 -end 2024-01-31 -prod both -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,Fmask -scale True
 ```
-- Fetch only the necessary core bands
+- Fetch all L30 bands; select scenes by ROI, tile ID, date range, and product name; keep only scenes with cloud cover below 70%; apply cloud masking
 ```sh
-python .\\HLS_SuPER\\HLS_SuPER.py -roi '113.10114,30.62845,114.24349,31.59081' -dir .\\data\\HLS\\L30\\subset -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2 -cc 70 -qf True -scale True
+python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\subset -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands COASTAL-AEROSOL,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,CIRRUS,TIR1,TIR2,Fmask -cc 70 -scale True
 ```
+- Fetch only the L30 thermal infrared bands; select scenes only by ROI, tile ID, date range, and product name (no cloud-cover filtering); apply cloud masking
+```sh
+python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\TIR -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands TIR1,TIR2,Fmask -scale True
+```
+- [For testing] Without cloud-cover filtering, directly fetch the L30 spectral and thermal bands for summer 2024
+```sh
+python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\subset\\2024 -start 2024-06-01 -end 2024-08-31 -prod HLSL30 -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,TIR1,TIR2,Fmask -scale True
+```