refactor(HLS_SuPER): 修复HLS数据下载与处理代码存在的问题,并优化代码结构

- 统一调整所有文件的导入顺序,清理冗余导入项
- 修复format_roi函数中误用convex_hull获取顶点的问题,改用exterior.coords
- 修正dask并行任务中的参数传递错误
- 新增非交互运行模式支持,优化confirm_action函数
- 添加无搜索结果时提前退出的检查逻辑
- 优化日志配置,新增netrc认证支持
- 修复裸露的except语句,改用明确的Exception捕获
- 更新所有文件的最后更新日期
This commit is contained in:
谢泓 2026-05-18 14:06:06 +08:00
parent cc0791f1ba
commit 339755a42c
4 changed files with 101 additions and 52 deletions

View File

@ -3,24 +3,24 @@
===============================================================================
HLS Processing and Exporting Reformatted Data (HLS_PER)
This module contains functions to conduct subsetting and quality filtering of
This module contains functions to conduct subsetting and quality filtering of
search results.
-------------------------------------------------------------------------------
Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
Editor: Hong Xie
Last Updated: 2025-03-30
Last Updated: 2026-05-17
===============================================================================
"""
import logging
import os
import sys
import logging
import numpy as np
from datetime import datetime as dt
import xarray as xr
import rioxarray as rxr
import dask.distributed
import numpy as np
import rioxarray as rxr
import xarray as xr
def create_output_name(url, band_dict):
@ -181,7 +181,6 @@ def process_granule(
os.path.isfile(f"{output_dir}/{create_output_name(url, band_dict)}")
for url in granule_urls
):
# First Handle Quality Layer
# (Add) 简化原有的冗余处理, 仅处理质量层, 并最后移除质量层下载url
if quality_filter:

View File

@ -3,21 +3,23 @@
===============================================================================
This module contains functions related to searching and preprocessing HLS data.
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Authors: Mahsa Jami, Cole Krehbiel, and Erik Bolch
Contact: lpdaac@usgs.gov
Contact: lpdaac@usgs.gov
Editor: Hong Xie
Last Updated: 2025-10-16
Last Updated: 2026-05-17
===============================================================================
"""
# Import necessary packages
import os
import logging
import os
import sys
from pathlib import Path
import numpy as np
import earthaccess
import geopandas as gpd
import numpy as np
from shapely.geometry import box
from shapely.geometry.polygon import orient
@ -77,7 +79,7 @@ def format_roi(roi: Path):
roi = gpd.GeoDataFrame(geometry=[box(*bbox)], crs="EPSG:4326")
roi["geometry"] = roi["geometry"].apply(ensure_ccw)
vertices_list = list(roi.geometry[0].convex_hull.coords)
vertices_list = list(roi.geometry[0].exterior.coords)
return (roi, vertices_list)

View File

@ -1,11 +1,11 @@
# -*- coding: utf-8 -*-
"""
===============================================================================
HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
Contact: lpdaac@usgs.gov
Editor: Hong Xie
Last Updated: 2025-10-16
Last Updated: 2026-05-17
===============================================================================
"""
@ -14,22 +14,25 @@ Last Updated: 2025-10-16
# TODO Improve behavior around deletion of cogs when a netcdf is requested
# TODO Add ZARR as output option
from HLS_PER import process_granule, create_timeseries_dataset
from HLS_Su import hls_search, format_roi
from utils.common_utils import setup_dask_environment
import os
import sys
import argparse
import shutil
import logging
import time
import json
import logging
import os
import shutil
import sys
import time
from datetime import datetime as dt
import earthaccess
import dask.distributed
import earthaccess
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from HLS_PER import create_timeseries_dataset, process_granule
from HLS_Su import format_roi, hls_search
from utils.common_utils import setup_dask_environment
def parse_arguments():
"""
@ -349,6 +352,21 @@ def confirm_action(prompt):
"""
Prompts the user to confirm an action.
"""
non_interactive = not sys.stdin.isatty() or os.environ.get(
"HLS_SUPER_NON_INTERACTIVE", ""
).lower() in {"1", "true", "yes", "y"}
if non_interactive:
prompt_l = prompt.lower()
if "use the existing results file" in prompt_l:
return False
if "overwrite the existing results file" in prompt_l:
return True
if "proceed with processing" in prompt_l:
return True
if "temporary directory" in prompt_l:
return True
return True
while True:
response = input(prompt).lower()
if response in ["y", "yes"]:
@ -515,6 +533,10 @@ def main():
)
logging.info(filter_log)
if results_count == 0:
logging.warning("No data found matching the search criteria. Exiting.")
sys.exit("No data found. Processing aborted.")
# Confirm Processing
if not confirm_action("Do you want to proceed with processing? (y/n)"):
sys.exit("Processing aborted.")
@ -553,7 +575,7 @@ def main():
logging.info("Processing...")
tasks = [
dask.delayed(process_granule)(
granule_url,
granule_urls,
roi=roi,
clip=clip,
quality_filter=qf,
@ -563,15 +585,14 @@ def main():
bit_nums=[1, 3],
chunk_size=chunk_size,
)
for granule_url in results_urls
for granule_urls in results_urls
]
dask.compute(*tasks)
# Create Timeseries Dataset if NC4
if args.of == "NC4":
logging.info("Creating timeseries dataset...")
create_timeseries_dataset(
cog_dir, output_type=args.of, output_dir=output_dir)
create_timeseries_dataset(cog_dir, output_type=args.of, output_dir=output_dir)
# Close Dask Client
client.close()
@ -588,7 +609,7 @@ def main():
# End Timer
total_time = time.time() - start_time
logging.info(f"Processing complete. Total time: {round(total_time, 2)}s, ")
logging.info(f"Processing complete. Total time: {round(total_time, 2)}s.")
if __name__ == "__main__":

View File

@ -9,25 +9,25 @@ Last Updated: 2025-09-11
===============================================================================
"""
import os
import sys
import glob
import json
import logging
import os
import sys
from datetime import datetime
import earthaccess
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
from affine import Affine
from osgeo import gdal, gdal_array
from shapely import box
import xarray as xr
from rasterio.enums import Resampling
from rasterio.merge import merge
from rioxarray.merge import merge_arrays
from rioxarray import open_rasterio
import geopandas as gpd
import matplotlib.pyplot as plt
from rioxarray.merge import merge_arrays
from shapely import box
gdal.UseExceptions()
@ -229,16 +229,32 @@ def setup_dask_environment():
"""
Passes RIO environment variables to dask workers for authentication.
"""
import os
import rasterio
cookie_file_path = os.path.expanduser("~/cookies.txt")
candidate_cookie_paths = [
os.environ.get("EARTHDATA_COOKIE_FILE"),
os.path.expanduser("~/.urs_cookies"),
os.path.expanduser("~/.cookies"),
os.path.expanduser("~/cookies.txt"),
]
cookie_file_path = next(
(p for p in candidate_cookie_paths if p and os.path.exists(p)),
os.path.expanduser("~/cookies.txt"),
)
netrc_path = os.environ.get("EARTHDATA_NETRC_FILE") or os.path.expanduser(
"~/.netrc"
)
enable_netrc = "YES" if os.path.exists(netrc_path) else "NO"
global env
gdal_config = {
"GDAL_HTTP_UNSAFESSL": "YES",
"GDAL_HTTP_COOKIEFILE": cookie_file_path,
"GDAL_HTTP_COOKIEJAR": cookie_file_path,
"GDAL_HTTP_NETRC": enable_netrc,
"GDAL_HTTP_NETRC_FILE": netrc_path,
"GDAL_DISABLE_READDIR_ON_OPEN": "YES",
"CPL_VSIL_CURL_ALLOWED_EXTENSIONS": "TIF",
"GDAL_HTTP_MAX_RETRY": "10",
@ -250,25 +266,34 @@ def setup_dask_environment():
env.__enter__()
def setup_logging(log_file: str = "dask_worker.log"):
def setup_logging(log_file: str = None):
"""
在Dask工作进程中设置logging
设置logging
Parameters
----------
log_file : str, optional
日志文件路径, by default "dask_worker.log"
日志文件路径, by default None
"""
logging.basicConfig(
level=logging.INFO,
format="%(levelname)s:%(asctime)s ||| %(message)s",
handlers=[
logging.StreamHandler(sys.stdout),
logging.FileHandler(log_file),
],
)
if log_file is None:
logging.basicConfig(
level=logging.INFO,
format="%(levelname)s:%(asctime)s ||| %(message)s",
handlers=[logging.StreamHandler(sys.stdout)],
encoding="utf-8", # Python 3.9+ 支持此参数
)
else:
logging.basicConfig(
level=logging.INFO,
format="%(levelname)s:%(asctime)s ||| %(message)s",
handlers=[
logging.StreamHandler(sys.stdout),
logging.FileHandler(log_file, encoding="utf-8"),
],
encoding="utf-8", # Python 3.9+ 支持此参数
)
def load_band_as_arr(org_tif_path, band_num=1):
@ -386,7 +411,7 @@ def array_to_raster(
except AttributeError:
# For backwards compatibility with older version of GDAL
rast = gdal.Open(gdal_array.GetArrayFilename(data))
except:
except Exception:
rast = gdal_array.OpenArray(data)
rast.SetGeoTransform(transform)
rast.SetProjection(wkt)
@ -479,7 +504,7 @@ def clip_roi_image(
raster = open_rasterio(file_path)
try:
doy = os.path.basename(file_path).split(".")[3]
except Exception as e:
except Exception:
doy = None
if doy:
raster.attrs["DOY"] = doy
@ -815,6 +840,8 @@ def plot(data, title=None, cmap="gray"):
title (str): 标题
cmap (str): 颜色映射
"""
import matplotlib.pyplot as plt
plt.imshow(data)
plt.title(title)
plt.axis("off") # 关闭坐标轴