feat: Update to the stable version after multiple rounds of testing and use.
parent 6d16e32022
commit ac1a0a96f2
```diff
@@ -7,7 +7,8 @@ This module contains functions to conduct subsetting and quality filtering of
 search results.
 -------------------------------------------------------------------------------
 Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
-Last Updated: 2024-09-18
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
@@ -21,6 +22,7 @@ import xarray as xr
 import rioxarray as rxr
 import dask.distributed


 def create_output_name(url, band_dict):
     """
     Uses HLS default naming scheme to generate an output name with common band names.
```
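For readers unfamiliar with the naming scheme: `create_output_name` swaps the HLS band code in the default file name for a common band name from `band_dict`. A minimal sketch of the idea; the URL and the `band_dict` layout are illustrative assumptions, not taken from the repo:

```python
# Sketch: map the HLS band code in the asset name to a common band name.
url = "https://example.com/HLS.L30.T49RGQ.2024001T030136.v2.0.B04.tif"  # hypothetical URL
band_dict = {"RED": "B04"}  # common name -> HLS band code (assumed structure)

split = url.split("/")[-1].split(".")          # ['HLS','L30','T49RGQ','2024001T030136','v2','0','B04','tif']
common = {v: k for k, v in band_dict.items()}  # invert: band code -> common name
split[-2] = common[split[-2]]                  # 'B04' -> 'RED'
output_name = ".".join(split)                  # 'HLS.L30.T49RGQ.2024001T030136.v2.0.RED.tif'
```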
```diff
@@ -42,7 +44,9 @@ def create_output_name(url, band_dict):
     return output_name


-def open_hls(url, roi=None, scale=True, chunk_size=dict(band=1, x=512, y=512)):
+def open_hls(
+    url, roi=None, clip=False, scale=True, chunk_size=dict(band=1, x=512, y=512)
+):
     """
     Generic Function to open an HLS COG and clip to ROI. For consistent scaling, this must be done manually.
     Some HLS Landsat scenes have the metadata in the wrong location.
```
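The next hunk opens mid-statement at `"band", drop=True`; the elided opening call above it is presumably the usual rioxarray pattern, something like this sketch (assumed, not shown in the diff):

```python
import rioxarray as rxr

url = "HLS.L30.T49RGQ.2024001T030136.v2.0.B04.tif"  # hypothetical local path or URL
chunk_size = dict(band=1, x=512, y=512)

# Lazily read the COG in dask chunks, then drop the length-1 band dimension
# so the result is a 2-D (y, x) DataArray.
da = rxr.open_rasterio(url, chunks=chunk_size).squeeze("band", drop=True)
```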
```diff
@@ -52,22 +56,39 @@ def open_hls(url, roi=None, scale=True, chunk_size=dict(band=1, x=512, y=512)):
         "band", drop=True
     )

-    # Reproject ROI and Clip if ROI is provided
-    if roi is not None:
+    # (Add) Read the asset and band names from the url
+    split_asset = url.split("/")[-1].split(".")
+    asset_name = split_asset[1]
+    band_name = split_asset[-2]
+
+    # Reproject ROI and Clip if ROI is provided and clip is True
+    if roi is not None and clip:
         roi = roi.to_crs(da.spatial_ref.crs_wkt)
         da = da.rio.clip(roi.geometry.values, roi.crs, all_touched=True)

+    # (Add) Even though most imagery is already scaled and gap-filled, some scenes
+    # may still need manual scaling and fill-value handling in local GIS software
     # Apply Scale Factor if desired for non-quality layer
-    if scale and "Fmask" not in url:
-        # Mask Fill Values
-        da = xr.where(da == -9999, np.nan, da)
-        # Scale Data
-        da = da * 0.0001
-        # Remove Scale Factor After Scaling - Prevents Double Scaling
-        da.attrs["scale_factor"] = 1.0
+    if band_name != "Fmask":
+        if scale:
+            # Mask Fill Values
+            da = xr.where(da == -9999, np.nan, da)
+            # Scale Data
+            # (Add) Except for the quality layer and the two L30 thermal infrared
+            # bands, the spectral bands all use a scale factor of 0.0001
+            # (Add) The two L30 thermal infrared bands use a scale factor of 0.01
+            # NOTE: the thermal bands have not been renamed at this point
+            if (band_name == "B10" or band_name == "B11") and (asset_name == "L30"):
+                da = da * 0.01
+            else:
+                da = da * 0.0001
+            # Remove Scale Factor After Scaling - Prevents Double Scaling
+            da.attrs["scale_factor"] = 1.0

-    # Add Scale Factor to Attributes Manually - This will overwrite/add if the data is missing.
-    if not scale and "Fmask" not in url:
-        da.attrs["scale_factor"] = 0.0001
+        # Add Scale Factor to Attributes Manually - This will overwrite/add if the data is missing.
+        # (Add) If scaling is deferred, the scale factor must be added manually
+        else:
+            if (band_name == "B10" or band_name == "B11") and (asset_name == "L30"):
+                da.attrs["scale_factor"] = 0.01
+            else:
+                da.attrs["scale_factor"] = 0.0001

     return da
```
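The scale-factor logic above reduces to a small decision table: Fmask is never scaled, L30 B10/B11 (thermal infrared) use 0.01, and all other spectral bands use 0.0001. A sketch of that rule in isolation (the helper name is mine, not the repo's):

```python
def hls_scale_factor(asset_name: str, band_name: str):
    """Sketch of the rule encoded above (None = quality layer, never scaled)."""
    if band_name == "Fmask":
        return None
    if asset_name == "L30" and band_name in ("B10", "B11"):
        return 0.01   # L30 thermal infrared bands
    return 0.0001     # all other spectral bands

assert hls_scale_factor("L30", "B10") == 0.01
assert hls_scale_factor("S30", "B8A") == 0.0001
```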
```diff
@@ -91,6 +112,7 @@ def create_quality_mask(quality_data, bit_nums: list = [0, 1, 2, 3, 4, 5]):
 def process_granule(
     granule_urls,
     roi,
+    clip,
     quality_filter,
     scale,
     output_dir,
```
```diff
@@ -100,6 +122,31 @@ def process_granule(
 ):
     """
     Processes a list of HLS asset urls for a single granule.
+
+    args:
+        granule_urls (list): List of HLS asset urls to process.
+        roi (geopandas.GeoDataFrame): ROI to filter data.
+        clip (bool): If True, ROI will be clipped to the image.
+        quality_filter (bool): If True, quality layer will be used to mask data.
+        scale (bool): If True, data will be scaled to reflectance.
+        output_dir (str): Directory to save output files.
+        band_dict (dict): Dictionary of band names and asset names.
+        bit_nums (list): List of bit numbers to use for quality mask.
+            - 0: Cirrus
+            - 1: Cloud
+            - 2: Adjacent to cloud/shadow
+            - 3: Cloud shadow
+            - 4: Snow/ice
+            - 5: Water
+        chunk_size (dict): Dictionary of chunk sizes for dask.
     """

     # Setup Logging
```
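The Fmask quality layer packs these flags into individual bits, so masking means testing whether a given bit is set. A minimal sketch of that test on toy values (the underlying idea only, not the repo's `create_quality_mask` itself):

```python
import numpy as np

fmask = np.array([0b000010, 0b001000, 0b000000])  # toy Fmask pixel values
mask = np.zeros(fmask.shape, dtype=bool)
for bit in [1, 3]:  # 1 = Cloud, 3 = Cloud shadow
    mask |= ((fmask >> bit) & 1).astype(bool)
print(mask)  # [ True  True False]
```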
```diff
@@ -116,6 +163,7 @@ def process_granule(
 ):

     # First Handle Quality Layer
+    # (Add) Simplified from the original redundant handling: process only the quality layer here, then remove its download url at the end
     if quality_filter:
         # Generate Quality Layer URL
         split_asset = granule_urls[0].split("/")[-1].split(".")
```
```diff
@@ -123,31 +171,26 @@ def process_granule(
         quality_url = (
             f"{'/'.join(granule_urls[0].split('/')[:-1])}/{'.'.join(split_asset)}"
         )

         # Check if File exists in Output Directory
-        output_name = create_output_name(quality_url, band_dict)
-        output_file = f"{output_dir}/{output_name}"
-
-        # Open Quality Layer
-        qa_da = open_hls(quality_url, roi, scale, chunk_size)
+        quality_output_name = create_output_name(quality_url, band_dict)
+        quality_output_file = f"{output_dir}/{quality_output_name}"

         # Check if quality asset is already processed
-        if not os.path.isfile(output_file):
-            # Write Output
-            qa_da.rio.to_raster(raster_path=output_file, driver="COG")
+        if not os.path.isfile(quality_output_file):
+            # Open Quality Layer and Write Output
+            qa_da = open_hls(quality_url, roi, clip, scale, chunk_size)
+            qa_da.rio.to_raster(raster_path=quality_output_file, driver="COG")
         else:
             logging.info(
-                f"Existing file {output_name} found in {output_dir}. Skipping."
+                f"Existing quality file {quality_output_name} found in {output_dir}."
             )

-        # Remove Quality Layer from Granule Asset List if Present
-        granule_urls = [asset for asset in granule_urls if asset != quality_url]
-
         # Create Quality Mask
+        # TODO: the mask array may overflow Dask worker memory; needs optimization
         qa_mask = create_quality_mask(qa_da, bit_nums=bit_nums)
+        # (Add) With quality_filter=True, remove the quality layer from the list once
+        # the mask is built, so it is not downloaded and processed again below
+        granule_urls = [url for url in granule_urls if "Fmask" not in url]

     # Process Remaining Assets
     for url in granule_urls:
         # Check if File exists in Output Directory
         output_name = create_output_name(url, band_dict)
```
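One subtlety in the hunk above: `qa_da` is now assigned only inside the `if not os.path.isfile(...)` branch, yet `create_quality_mask(qa_da, ...)` runs either way, so a pre-existing quality COG would raise a `NameError`. A sketch of one way to keep both paths working, using the functions as defined above (this is a suggestion, not the commit's code):

```python
# Open the quality layer unconditionally; only write it out when missing.
qa_da = open_hls(quality_url, roi, clip, scale, chunk_size)
if not os.path.isfile(quality_output_file):
    qa_da.rio.to_raster(raster_path=quality_output_file, driver="COG")
else:
    logging.info(f"Existing quality file {quality_output_name} found in {output_dir}.")
qa_mask = create_quality_mask(qa_da, bit_nums=bit_nums)
```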
```diff
@@ -156,14 +199,20 @@ def process_granule(
         # Check if scene is already processed
         if not os.path.isfile(output_file):
             # Open Asset
-            da = open_hls(url, roi, scale, chunk_size)
+            da = open_hls(url, roi, clip, scale, chunk_size)

             # Apply Quality Mask if Desired
             if quality_filter:
                 da = da.where(~qa_mask)

             # Write Output
-            da.rio.to_raster(raster_path=output_file, driver="COG")
+            if "FMASK" in output_name:
+                da.rio.to_raster(raster_path=output_file, driver="COG")
+            else:
+                # (Add) Force float32 output; otherwise it defaults to float64
+                da.rio.to_raster(
+                    raster_path=output_file, driver="COG", dtype="float32"
+                )
         else:
             logging.info(
                 f"Existing file {output_name} found in {output_dir}. Skipping."
```
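The float32 cast matters because masking fill values with NaN promotes the integer DN arrays to float64, doubling output size for no extra precision. A quick demonstration of the promotion (plain NumPy/xarray behavior, independent of this repo):

```python
import numpy as np
import xarray as xr

da = xr.DataArray(np.array([100, -9999], dtype=np.int16))
masked = xr.where(da == -9999, np.nan, da)
print(masked.dtype)                    # float64 -- NaN forces float promotion
print(masked.astype("float32").dtype)  # float32 -- half the storage on disk
```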
```diff
@@ -6,7 +6,8 @@ This module contains functions related to searching and preprocessing HLS data.
 -------------------------------------------------------------------------------
 Authors: Mahsa Jami, Cole Krehbiel, and Erik Bolch
 Contact: lpdaac@usgs.gov
-Last Updated: 2024-09-18
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
```
```diff
@@ -16,7 +17,9 @@ import earthaccess


 # Main function to search and filter HLS data
-def hls_search(roi: list, band_dict: dict, dates=None, cloud_cover=None, log=False):
+def hls_search(
+    roi: list, band_dict: dict, dates=None, cloud_cover=None, tile_id=None, log=False
+):
     """
     This function uses earthaccess to search for HLS data using an roi and temporal parameter, filters by cloud cover, and delivers a list of result urls for the selected bands.
     """
@@ -27,6 +30,10 @@ def hls_search(
         temporal=dates,
     )

+    # (Add) Filter scenes by tile ID
+    if tile_id:
+        results = hls_tileid_filter(results, tile_id)
+
     # Filter by cloud cover
     if cloud_cover:
         results = hls_cc_filter(results, cloud_cover)
```
```diff
@@ -45,6 +52,23 @@ def hls_search(
     return selected_results_urls


+def hls_tileid_filter(results, tile_id):
+    """
+    (Add) Filter earthaccess search results by the given tile ID.
+    """
+    tile_ids = []
+    for result in results:
+        # Read the tile ID from the result json, cast to str, and collect
+        tmp_id = str(result["meta"]["native-id"].split(".")[2])
+        tile_ids.append(tmp_id)
+    tile_ids = np.array(tile_ids)
+    # Find the indices that match the requested tile ID
+    tile_id_indices = np.where(tile_ids == tile_id)
+    # Keep only the results at those indices
+    return [results[i] for i in tile_id_indices[0]]
+
+
 # Filter earthaccess results based on cloud cover threshold
 def hls_cc_filter(results, cc_threshold):
     """
```
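The native-id parsing relies on HLS granule ids of the form `HLS.{product}.{tile}.{datetime}.v2.0`, so index 2 of the dot-split is the MGRS tile. A quick illustration (the id value is a hypothetical example):

```python
native_id = "HLS.L30.T49RGQ.2024001T030136.v2.0"  # hypothetical native-id
tile = native_id.split(".")[2]
print(tile)  # T49RGQ
```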
```diff
@@ -4,7 +4,8 @@
 HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
 Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
 Contact: lpdaac@usgs.gov
-Last Updated: 2024-09-18
+Editor: Hong Xie
+Last Updated: 2025-01-06
 ===============================================================================
 """
```
```diff
@@ -53,6 +54,23 @@ def parse_arguments():
         " ",
     )

+    # (Add) clip: whether to clip to the ROI; defaults to False (no clipping)
+    parser.add_argument(
+        "-clip",
+        choices=["True", "False"],
+        required=False,
+        help="(Optional) If provided, the script will clip the output to the ROI.",
+        default="False",
+    )
+
+    # (Add) tile: HLS tile ID
+    parser.add_argument(
+        "-tile",
+        type=str,
+        required=False,
+        help="(Optional) Tile ID for spatial subset. If provided, the script will search for the tile ID based on the ROI.",
+    )
+
     # dir: Directory to save the files to
     parser.add_argument(
         "-dir",
```
```diff
@@ -117,7 +135,7 @@ def parse_arguments():
         choices=["True", "False"],
         required=False,
         help="Flag to apply scale factor to layers before exporting output files. This is generally unnecessary as most applications will scale automatically.",
-        default="False",
+        default="True",
     )

     # of: output file format
```
```diff
@@ -221,6 +239,14 @@ def format_dates(start, end):
     return dates


+def format_tile_id(tile_id):
+    """
+    (Add) Normalize the tile_id argument by stripping stray quotes.
+    """
+    tile_id = tile_id.strip("'").strip('"')
+    return str(tile_id)
+
+
 def format_cloud_cover(cc):
     try:
         cc = int(cc.strip("'").strip('"'))
```
```diff
@@ -464,6 +490,10 @@ def main():
     roi, vl = format_roi(args.roi)
     logging.info("Region of Interest formatted successfully")

+    # (Add) Format the clip argument
+    clip = str_to_bool(args.clip)
+    logging.info(f"Clip to ROI: {clip}")
+
     # Set Output Directory
     if args.dir is not None:
         output_dir = os.path.normpath(args.dir.strip("'").strip('"')) + os.sep
```
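Because argparse receives the literal strings "True"/"False" (see the `-clip` choices above), `str_to_bool` has to map them to booleans. Its body is not part of this diff; a plausible minimal sketch:

```python
def str_to_bool(value: str) -> bool:
    # Hypothetical sketch: map the argparse strings "True"/"False" to booleans.
    return str(value).strip("'").strip('"').lower() == "true"
```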
```diff
@@ -485,6 +515,10 @@ def main():
     cc = format_cloud_cover(args.cc)
     logging.info(f"Cloud Cover Filter <= {cc}")

+    # (Add) Format the Tile ID argument
+    tile = format_tile_id(args.tile)
+    logging.info(f"Tile ID: {tile}")
+
     # Quality Filtering
     qf = str_to_bool(args.qf)
     logging.info(f"Quality Filtering: {qf}")
```
```diff
@@ -533,20 +567,27 @@ def main():
     else:
         logging.info("Searching for data...")
         results_urls = hls_search(
-            roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc
+            roi=vl, band_dict=band_dict, dates=dates, cloud_cover=cc, tile_id=tile
         )
         logging.info(f"Writing search results to {results_urls_file}")
         with open(results_urls_file, "w") as file:
             json.dump(results_urls, file)

+    results_count = len(results_urls)
     total_assets = sum(len(sublist) for sublist in results_urls)

+    filter_descriptions = []
     if cc:
-        logging.info(
-            f"{len(results_urls)} granules remain after cloud filtering. {total_assets} assets will be processed."
-        )
+        filter_descriptions.append("cloud")
+    if tile:
+        filter_descriptions.append("tile")
+    if filter_descriptions:
+        filter_log = f"{results_count} granules remain after {' and '.join(filter_descriptions)} filtering. {total_assets} assets will be processed."
     else:
-        logging.info(f"{total_assets} assets will be processed.")
+        filter_log = (
+            f"{results_count} granules remain. {total_assets} assets will be processed."
+        )
+    logging.info(filter_log)

     # Confirm Processing
     if not confirm_action("Do you want to proceed with processing? (y/n)"):
```
```diff
@@ -588,11 +629,12 @@ def main():
             dask.delayed(process_granule)(
                 granule_url,
                 roi=roi,
+                clip=clip,
                 quality_filter=qf,
                 scale=scale,
                 output_dir=cog_dir,
                 band_dict=band_dict,
-                bit_nums=[0, 1, 2, 3, 4, 5],
+                bit_nums=[1, 3],
                 chunk_size=chunk_size,
             )
             for granule_url in results_urls
```
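Per the bit definitions in `process_granule`'s docstring, `bit_nums=[1, 3]` now masks only Cloud and Cloud shadow pixels; the previous default `[0, 1, 2, 3, 4, 5]` also discarded cirrus, cloud-adjacent, snow/ice, and water pixels.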
```diff
@@ -612,8 +654,8 @@ def main():
         logging.info("Timeseries Dataset Created. Removing Temporary Files...")
         shutil.rmtree(cog_dir)

-    # Add: Archive the downloaded imagery by DOY
-    logging.info("Starting to archive the downloaded imagery by DOY date.")
+    # (Add) Archive the downloaded imagery by DOY
+    logging.info("Starting to archive the downloaded HLS imagery by DOY date.")
     files_collection(output_dir)
     logging.info("Archiving complete!")
```
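`files_collection` itself is not shown in this diff. A hypothetical sketch of DOY-based archiving consistent with the HLS naming scheme; the folder layout and parsing are assumptions, not the repo's actual implementation:

```python
import os
import shutil

def files_collection(output_dir):
    # Hypothetical sketch: move each output into a per-DOY folder such as
    # "2024001", parsed from names like HLS.L30.T49RGQ.2024001T030136.v2.0.RED.tif
    for name in os.listdir(output_dir):
        if not name.startswith("HLS."):
            continue  # skip logs, json, and other non-imagery files
        doy = name.split(".")[3].split("T")[0]  # "2024001" = year + day of year
        dest = os.path.join(output_dir, doy)
        os.makedirs(dest, exist_ok=True)
        shutil.move(os.path.join(output_dir, name), os.path.join(dest, name))
```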
README.md
|
|||||||
|
|
||||||
### 1.2 配置环境变量
|
### 1.2 配置环境变量
|
||||||
|
|
||||||
- 为了在控制台中直接使用conda命令, 需要将安装的相关目录配置到Path环境变量中。
|
- 为了在控制台中直接使用conda命令,需要将安装的相关目录配置到Path环境变量中。
|
||||||
|
|
||||||
```
|
```
|
||||||
D:\program\miniforge3
|
D:\program\miniforge3
|
||||||
@ -98,9 +98,11 @@ mamba activate lpdaac_windows
|
|||||||
- 需要注意的是,密码中最好不要出 `@/#/$/%` 等符号,爬取时可能会出错。
|
- 需要注意的是,密码中最好不要出 `@/#/$/%` 等符号,爬取时可能会出错。
|
||||||
- 单个用户每秒限制最多100次请求,参考自:https://forum.earthdata.nasa.gov/viewtopic.php?t=3734
|
- 单个用户每秒限制最多100次请求,参考自:https://forum.earthdata.nasa.gov/viewtopic.php?t=3734
|
||||||
|
|
||||||
### 3.2 爬取云端数据并在内存中进行预处理
|
### 3.2 脚本可用参数
|
||||||
|
|
||||||
- `-roi`:感兴趣区,需要按照 **左下右上** 的逆时针顺序设置点坐标
|
- `-roi`:感兴趣区,需要按照 **左下右上** 的逆时针顺序设置点坐标
|
||||||
|
- `-clip`:是否对影像进行裁剪,默认 `False`
|
||||||
|
- `-tile`:HLS影像瓦片ID,例如 `T49RGQ`
|
||||||
- `-dir`:输出目录,必须是已存在的目录
|
- `-dir`:输出目录,必须是已存在的目录
|
||||||
- `-start`:开始时间,格式为 `YYYY-MM-DD`
|
- `-start`:开始时间,格式为 `YYYY-MM-DD`
|
||||||
- `-end`:结束时间,格式为 `YYYY-MM-DD`
|
- `-end`:结束时间,格式为 `YYYY-MM-DD`
|
||||||
@ -110,14 +112,28 @@ mamba activate lpdaac_windows
|
|||||||
- `-qf`:是否使用质量波段过滤云/云阴影像元,默认 `True`
|
- `-qf`:是否使用质量波段过滤云/云阴影像元,默认 `True`
|
||||||
- `-scale`:是否对影像使用缩放因子,默认 `True`
|
- `-scale`:是否对影像使用缩放因子,默认 `True`
|
||||||
|
|
||||||
- 爬取所有光谱波段
|
### 3.3 爬取云端数据并在内存中进行预处理示例
|
||||||
|
|
||||||
|
- 爬取 L30 与 S30 的核心光谱波段:仅按照感兴趣区,瓦片ID,起止时间以及产品名称筛选影像,不进行云量筛选影像,对影像进行去云掩膜处理
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python .\\HLS_SuPER\\HLS_SuPER.py -roi '113.10114,30.62845,114.24349,31.59081' -dir .\\data\\HLS\\L30 -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands COASTAL-AEROSOL,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,CIRRUS,TIR1,TIR2,Fmask -cc 70 -qf True -scale True
|
python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\ALL -start 2024-01-01 -end 2024-01-31 -prod both -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,Fmask -scale True
|
||||||
```
|
```
|
||||||
|
|
||||||
- 仅爬取必要的核心波段
|
- 爬取 L30 的所有波段:按照感兴趣区,瓦片ID,起止时间以及产品名称筛选影像,过滤云量小于70% 的影像,对影像进行去云掩膜处理
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
python .\\HLS_SuPER\\HLS_SuPER.py -roi '113.10114,30.62845,114.24349,31.59081' -dir .\\data\\HLS\\L30\\subset -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2 -cc 70 -qf True -scale True
|
python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\subset -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands COASTAL-AEROSOL,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,CIRRUS,TIR1,TIR2,Fmask -cc 70 -scale True
|
||||||
|
```
|
||||||
|
|
||||||
|
- 仅爬取 L30 的热红外波段:仅按照感兴趣区,瓦片ID,起止时间以及产品名称筛选影像,不进行云量筛选影像,对影像进行去云掩膜处理
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\TIR -start 2024-01-01 -end 2024-01-31 -prod HLSL30 -bands TIR1,TIR2,Fmask -scale True
|
||||||
|
```
|
||||||
|
|
||||||
|
- 【测试用】不进行云量筛选,直接爬取 L30 2024 年暑期光谱波段与热红外波段
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python .\\HLS_SuPER\\HLS_SuPER.py -roi '112.9834,30.5286,114.32373,31.64448' -tile T49RGQ -dir .\\data\\HLS\\L30\\subset\\2024 -start 2024-06-01 -end 2024-08-31 -prod HLSL30 -bands BLUE,GREEN,RED,NIR1,SWIR1,SWIR2,TIR1,TIR2,Fmask -scale True
|
||||||
```
|
```
|