feat: Adjust the data retrieval approach to uniformly use bounding-polygon searches, reducing the amount of data that has to be processed.

谢泓 2025-10-20 08:48:38 +08:00
parent c90432a815
commit 62b2370fd7
4 changed files with 111 additions and 101 deletions
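In earthaccess terms, the change replaces bounding_box= queries (built from a GeoDataFrame's total_bounds) with polygon= queries built from the ROI file's outer ring. A minimal sketch of the idea, assuming the 49REL GeoJSON referenced by these scripts; this is an illustration of the approach, not code from the repository:

import earthaccess
import geopandas as gpd
from shapely.geometry.polygon import orient

# Reduce the ROI to a single counterclockwise polygon, roughly what the new
# format_roi() helper introduced in this commit does.
roi = gpd.read_file("./data/vectors/49REL.geojson").to_crs("EPSG:4326")
geom = orient(roi.union_all().convex_hull, sign=1.0)
vertices = list(geom.exterior.coords)  # closed (lon, lat) ring, CCW order

# Before: the layer's rectangular bounds
# results = earthaccess.search_data(short_name=["MOD11A1"],
#                                   bounding_box=tuple(roi.total_bounds), ...)

# After: the bounding polygon itself
results = earthaccess.search_data(
    short_name=["MOD11A1"],
    polygon=vertices,
    temporal=("2024-03-01T00:00:00", "2024-10-31T23:59:59"),
)

Since CMR intersects granule footprints with the submitted polygon rather than with its bounding rectangle, granules that only touch the rectangle's empty corners are no longer returned, which is where the reduction in downloaded and processed data comes from.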

View File

@@ -6,7 +6,7 @@ For example, MCD43A3, MCD43A4, MOD11A1.
 -------------------------------------------------------------------------------
 Authors: Hong Xie
-Last Updated: 2025-09-11
+Last Updated: 2025-10-16
 ===============================================================================
 """
@@ -15,6 +15,7 @@ import sys
 import json
 import time
 import logging
+from pathlib import Path
 import earthaccess
 import rioxarray as rxr
 import dask.distributed
@@ -236,7 +237,7 @@ def process_granule(
 def main(
-    region: list,
+    region_file: Path,
     asset_name: str,
     modis_tile_id: str,
     years: list,
@@ -245,7 +246,7 @@ def main(
     target_crs: str,
     output_root_dir: str,
 ):
-    bbox = tuple(list(region.total_bounds))
+    region = gpd.read_file(region_file)
     results_urls = []
     output_dir = os.path.join(output_root_dir, asset_name)
     os.makedirs(output_dir, exist_ok=True)
@@ -260,7 +261,7 @@ def main(
         )
         year_temporal = (f"{year}-{dates[0]}T00:00:00", f"{year}-{dates[1]}T23:59:59")
         year_results = earthdata_search(
-            [asset_name], year_temporal, bbox, modis_tile_id
+            [asset_name], year_temporal, region_file, modis_tile_id
         )
         results_urls.extend(year_results)
         with open(year_results_file, "w") as f:
@@ -327,7 +328,7 @@ if __name__ == "__main__":
     earthaccess.login(persist=True)
     # region = gpd.read_file("./data/vectors/wuling_guanqu_polygon.geojson")
     tile_id = "49REL"
-    region = gpd.read_file(f"./data/vectors/{tile_id}.geojson")
+    region_file = f"./data/vectors/{tile_id}.geojson"
    target_crs = "EPSG:32649"
    asset_name = "MOD11A1"
    # asset_name = "MCD43A3"
@@ -338,7 +339,7 @@ if __name__ == "__main__":
     dates = ("03-01", "10-31")
     output_root_dir = ".\\data\\MODIS\\"
     main(
-        region,
+        region_file,
         asset_name,
         modis_tile_id,
         years,

View File

@@ -29,13 +29,14 @@ The RTC-S1 data product is produced by the OPERA project team at NASA's Jet Propulsion Laboratory (JPL)
 -------------------------------------------------------------------------------
 Authors: Hong Xie
-Last Updated: 2025-03-10
+Last Updated: 2025-10-20
 ===============================================================================
 """
 import os
 import sys
 import glob
+from pathlib import Path
 import json
 import time
 from datetime import datetime
@@ -141,7 +142,7 @@ def convert_to_db(image: xr.DataArray) -> xr.DataArray:
 def process_sar_granule(
     url_basenames: list[str],
-    output_dir: str,
+    output_dir: Path,
     date: str,
     tile_id: str,
     roi: list,
@@ -156,7 +157,7 @@ def process_sar_granule(
     ----------
     url_basenames : list[str]
         List of file basenames from the RTC-S1 download URLs, used for file matching
-    output_dir : str
+    output_dir : Path
         Output root directory
     date : str
         Date, formatted as YYYY%j
@@ -243,7 +244,7 @@ def process_sar_granule(
 def process_sar_static_granule(
-    url_basenames: list[str], output_dir: str, tile_id: str, roi: list, clip=True
+    url_basenames: list[str], output_dir: Path, tile_id: str, roi: list, clip=True
 ) -> bool:
     """
     Preprocessing of OPERA RTC-S1-STATIC static data
@@ -254,7 +255,7 @@ def process_sar_static_granule(
     ----------
     url_basenames : list[str]
         List of file basenames from the RTC-S1-STATIC download URLs, used for file matching
-    output_dir: str
+    output_dir: Path
         Output root directory
     tile_id: str
         tile id
@@ -327,12 +328,12 @@ def process_sar_static_granule(
 def main(
     asset_name: list[str],
-    region: list,
+    region_file: Path,
     years: list[str | int],
-    output_dir: str,
+    output_dir: Path,
     tile_id: str,
 ):
-    bbox = tuple(list(region.total_bounds))
+    region = read_file(region_file)
     # Example file name: OPERA_L2_RTC-S1_T040-083952-IW1_20240605T102012Z_20240612T053743Z_S1A_30_v1.0_VH.tif
     static_name = ["OPERA_L2_RTC-S1-STATIC_V1"]
     sar_output_dir = os.path.join(output_dir, "RTC_S1")
@@ -353,7 +354,7 @@ def main(
             year_results_dir, f"RTC_S1_{tile_id}_{year}_results_urls.json"
         )
         year_temporal = (f"{year}-06-01T00:00:00", f"{year}-08-31T23:59:59")
-        year_results = earthdata_search(asset_name, year_temporal, roi=bbox)
+        year_results = earthdata_search(asset_name, year_temporal, region_file=region_file)
         results_urls.extend(year_results)
         with open(year_results_file, "w") as f:
             json.dump(year_results, f)
@@ -361,7 +362,7 @@ def main(
     with open(results_urls_file, "w") as f:
         json.dump(results_urls, f)
-    static_granule_urls = earthdata_search(static_name, roi=bbox)
+    static_granule_urls = earthdata_search(static_name, region_file=region_file)
     static_url_basenames = [
         os.path.basename(url) for sublist in static_granule_urls for url in sublist
     ]
@@ -454,10 +455,11 @@ if __name__ == "__main__":
     earthaccess.login(persist=True)
     asset_name = ["OPERA_L2_RTC-S1_V1"]
     tile_id = "49REL"
-    region = read_file(f".\\data\\vectors\\{tile_id}.geojson")
+    # region = read_file(f".\\data\\vectors\\{tile_id}.geojson")
+    region_file = Path(f".\\data\\vectors\\{tile_id}.geojson")
     # region = read_file("./data/vectors/wuling_guanqu_polygon.geojson")
     # years = ["2024", "2023", "2022"]
     years = ["2024"]
-    output_dir = ".\\data\\SAR"
-    os.makedirs(output_dir, exist_ok=True)
-    main(asset_name, region, years, output_dir, tile_id)
+    output_dir = Path(".\\data\\SAR")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    main(asset_name, region_file, years, output_dir, tile_id)
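For reference, a sketch of how the reworked RTC-S1 entry point is driven, mirroring the updated __main__ block above (same values as in this commit, with forward-slash paths for portability); main() now receives the ROI file path and builds the search polygon itself via earthdata_search(..., region_file=...):

from pathlib import Path

asset_name = ["OPERA_L2_RTC-S1_V1"]
tile_id = "49REL"
region_file = Path("./data/vectors") / f"{tile_id}.geojson"  # vector ROI, read inside main()
output_dir = Path("./data/SAR")
output_dir.mkdir(parents=True, exist_ok=True)  # Path.mkdir replaces os.makedirs

main(asset_name, region_file, years=["2024"], output_dir=output_dir, tile_id=tile_id)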

View File

@@ -7,19 +7,85 @@ This module contains functions related to searching and preprocessing HLS data.
 Authors: Mahsa Jami, Cole Krehbiel, and Erik Bolch
 Contact: lpdaac@usgs.gov
 Editor: Hong Xie
-Last Updated: 2025-02-20
+Last Updated: 2025-10-16
 ===============================================================================
 """

 # Import necessary packages
+import os
+import logging
+from pathlib import Path
 import numpy as np
 import earthaccess
+import geopandas as gpd
+from shapely.geometry import box
+from shapely.geometry.polygon import orient
+
+
+def ensure_ccw(geom):
+    """
+    Ensure the exterior ring of the polygon is counterclockwise.
+    """
+    if geom.exterior.is_ccw:
+        return geom  # Already counterclockwise
+    else:
+        return orient(geom, sign=1.0)  # Make it counterclockwise
+
+
+def format_roi(roi: Path):
+    """
+    Determines if submitted ROI is a file or bbox coordinates.
+    If a file, opens a GeoJSON or shapefile and creates a list of polygon vertices in the correct order. If the file has multiple polygons it will use a unary union convex hull of the external bounds.
+    If bbox coordinates, creates a geodataframe with a single Polygon geometry.
+    Returns a geopandas dataframe for clipping and a list of vertices for searching.
+    """
+    if os.path.isfile(roi):  # and roi.endswith(("geojson", "shp")):
+        try:
+            # Open ROI if file
+            roi = gpd.read_file(roi)
+            # Check if ROI is in Geographic CRS, if not, convert to it
+            if not roi.crs.is_geographic:
+                roi = roi.to_crs("EPSG:4326")
+                logging.info(
+                    "Note: ROI submitted is being converted to Geographic CRS (EPSG:4326)"
+                )
+            # (Add) Support multiple geometry types by merging a MultiPolygon into a single Polygon
+            if len(roi) > 1 or roi.geometry[0].geom_type == "MultiPolygon":
+                # Merge all Polygon geometries and create external boundary
+                logging.info(
+                    "Multiple polygons detected. Creating single geometry of external coordinates."
+                )
+                single_geometry = roi.union_all().convex_hull
+                roi = gpd.GeoDataFrame(geometry=[single_geometry], crs=roi.crs)
+            roi["geometry"] = roi["geometry"].apply(ensure_ccw)
+            # List Vertices in correct order for search
+            # (Add) Use the outer boundary coordinates to simplify what is submitted to the search; only counterclockwise order is supported
+            vertices_list = list(roi.geometry[0].exterior.coords)
+        except (FileNotFoundError, ValueError):
+            sys.exit(
+                f"The GeoJSON/shapefile is either not valid or could not be found.\nPlease double check the name and provide the absolute path to the file or make sure that it is located in {os.getcwd()}"
+            )
+    else:
+        # If bbox coordinates are submitted
+        bbox = tuple(map(float, roi.strip("'\"").split(",")))
+        print(bbox)
+        # Convert bbox to a geodataframe for clipping
+        roi = gpd.GeoDataFrame(geometry=[box(*bbox)], crs="EPSG:4326")
+        roi["geometry"] = roi["geometry"].apply(ensure_ccw)
+        vertices_list = list(roi.geometry[0].convex_hull.coords)
+    return (roi, vertices_list)
+
+
 def earthdata_search(
     asset_name: list,
     dates: tuple = None,
-    roi: list = None,
+    region_file: Path = None,
     tile_id: str = None,
     hours: tuple = None,
     log=False,
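A brief usage sketch of the relocated helper (the GeoJSON path is the one used by the other scripts in this commit):

from pathlib import Path
from HLS_Su import format_roi

# Returns the ROI as a GeoDataFrame (kept for clipping) plus a closed,
# counterclockwise (lon, lat) vertex list (submitted to the CMR polygon search).
roi_gdf, vertices_list = format_roi(Path("./data/vectors/49REL.geojson"))
print(len(vertices_list))  # number of vertices sent with the search request

A comma-separated "W,S,E,N" bbox string is still accepted and is converted into a single counterclockwise rectangle before its vertices are extracted.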
@@ -30,28 +96,32 @@ def earthdata_search(
     For example:
         - MODIS: MCD43A3, MCD43A4, MOD11A1, MOD11A2, ...
         - SMAP: SPL3SMP_E, SPL4SMGP, ...
-        - DEM: NASADEM_HGT, NASADEM_SC, ...
+        - GPM: GPM_3IMERGDL, ...
+        - DEM: NASADEM_HGT, NASADEM_SC, ALOS_PSR_RTC_HIGH, ALOS_PSR_RTC_LOW, ...
     """
     # Search for data
-    if dates and not roi:
-        # Global datasets do not require an roi parameter
+    if not region_file:
+        # Global datasets do not require an roi parameter, e.g., SMAP, GPM
         results = earthaccess.search_data(
             short_name=list(asset_name),
             temporal=dates,
         )
-    elif roi and not dates:
-        # For non-time-series data, e.g., DEM
-        results = earthaccess.search_data(
-            short_name=list(asset_name),
-            bounding_box=roi,
-        )
     else:
-        results = earthaccess.search_data(
-            short_name=list(asset_name),
-            bounding_box=roi,
-            temporal=dates,
-        )
+        # Search for data using the bounding polygon of the roi
+        roi, vertices_list = format_roi(region_file)
+        if not dates:
+            # For non-time-series data, e.g., DEM
+            results = earthaccess.search_data(
+                short_name=list(asset_name),
+                polygon=vertices_list,
+            )
+        else:
+            results = earthaccess.search_data(
+                short_name=list(asset_name),
+                polygon=vertices_list,
+                temporal=dates,
+            )
     # Filter granules by tile ID
     if tile_id:
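A hedged sketch of the three call patterns the reworked function now distinguishes; the collection short names come from the docstring above, and the temporal range and GeoJSON path from the RTC-S1 script in this commit:

from pathlib import Path
# Assuming this module is HLS_Su, as the import in the HLS prep script suggests:
# from HLS_Su import earthdata_search

region_file = Path("./data/vectors/49REL.geojson")
summer_2024 = ("2024-06-01T00:00:00", "2024-08-31T23:59:59")

# 1. Global collections (SMAP, GPM): no ROI, temporal filter only.
smap_results = earthdata_search(["SPL4SMGP"], dates=summer_2024)

# 2. Time-series collections restricted to granules intersecting the ROI polygon.
sar_results = earthdata_search(
    ["OPERA_L2_RTC-S1_V1"], dates=summer_2024, region_file=region_file
)

# 3. Non-temporal collections (e.g. DEMs): polygon only.
dem_results = earthdata_search(["NASADEM_HGT"], region_file=region_file)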

View File

@@ -5,7 +5,7 @@ HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
 Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
 Contact: lpdaac@usgs.gov
 Editor: Hong Xie
-Last Updated: 2025-10-13
+Last Updated: 2025-10-16
 ===============================================================================
 """
@@ -15,7 +15,7 @@ Last Updated: 2025-10-13
 # TODO Add ZARR as output option
 from HLS_PER import process_granule, create_timeseries_dataset
-from HLS_Su import hls_search
+from HLS_Su import hls_search, format_roi
 from utils.common_utils import setup_dask_environment
 import os
 import sys
@@ -26,9 +26,6 @@ import time
 import json
 from datetime import datetime as dt
 import earthaccess
-from shapely.geometry import box
-from shapely.geometry.polygon import orient
-import geopandas as gpd
 import dask.distributed

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -169,66 +166,6 @@ def parse_arguments():
     return parser.parse_args()


-def ensure_ccw(geom):
-    """
-    Ensure the exterior ring of the polygon is counterclockwise.
-    """
-    if geom.exterior.is_ccw:
-        return geom  # Already counterclockwise
-    else:
-        return orient(geom, sign=1.0)  # Make it counterclockwise
-
-
-def format_roi(roi):
-    """
-    Determines if submitted ROI is a file or bbox coordinates.
-    If a file, opens a GeoJSON or shapefile and creates a list of polygon vertices in the correct order. If the file has multiple polygons it will use a unary union convex hull of the external bounds.
-    If bbox coordinates, creates a geodataframe with a single Polygon geometry.
-    Returns a geopandas dataframe for clipping and a list of vertices for searching.
-    """
-    if os.path.isfile(roi):  # and roi.endswith(("geojson", "shp")):
-        try:
-            # Open ROI if file
-            roi = gpd.read_file(roi)
-            # Check if ROI is in Geographic CRS, if not, convert to it
-            if not roi.crs.is_geographic:
-                roi = roi.to_crs("EPSG:4326")
-                logging.info(
-                    "Note: ROI submitted is being converted to Geographic CRS (EPSG:4326)"
-                )
-            # (Add) Support multiple geometry types by merging a MultiPolygon into a single Polygon
-            if len(roi) > 1 or roi.geometry[0].geom_type == "MultiPolygon":
-                # Merge all Polygon geometries and create external boundary
-                logging.info(
-                    "Multiple polygons detected. Creating single geometry of external coordinates."
-                )
-                single_geometry = roi.union_all().convex_hull
-                roi = gpd.GeoDataFrame(geometry=[single_geometry], crs=roi.crs)
-            roi['geometry'] = roi['geometry'].apply(ensure_ccw)
-            # List Vertices in correct order for search
-            # (Add) Use the outer boundary coordinates to simplify what is submitted to the search; the search only supports counterclockwise order
-            vertices_list = list(roi.geometry[0].exterior.coords)
-        except (FileNotFoundError, ValueError):
-            sys.exit(
-                f"The GeoJSON/shapefile is either not valid or could not be found.\nPlease double check the name and provide the absolute path to the file or make sure that it is located in {os.getcwd()}"
-            )
-    else:
-        # If bbox coordinates are submitted
-        bbox = tuple(map(float, roi.strip("'\"").split(",")))
-        print(bbox)
-        # Convert bbox to a geodataframe for clipping
-        roi = gpd.GeoDataFrame(geometry=[box(*bbox)], crs="EPSG:4326")
-        roi['geometry'] = roi['geometry'].apply(ensure_ccw)
-        vertices_list = list(roi.geometry[0].convex_hull.coords)
-    return (roi, vertices_list)
-
-
 def format_dates(start, end):
     # Strip Quotes
     start = start.strip("'").strip('"')