feat: 调整数据检索方式, 统一使用外包Polygon检索以减少所需处理的数据量.

This commit is contained in:
谢泓 2025-10-20 08:48:38 +08:00
parent c90432a815
commit 62b2370fd7
4 changed files with 111 additions and 101 deletions

View File

@ -6,7 +6,7 @@ For example, MCD43A3, MCD43A4, MOD11A1.
-------------------------------------------------------------------------------
Authors: Hong Xie
Last Updated: 2025-09-11
Last Updated: 2025-10-16
===============================================================================
"""
@ -15,6 +15,7 @@ import sys
import json
import time
import logging
from pathlib import Path
import earthaccess
import rioxarray as rxr
import dask.distributed
@ -236,7 +237,7 @@ def process_granule(
def main(
region: list,
region_file: Path,
asset_name: str,
modis_tile_id: str,
years: list,
@ -245,7 +246,7 @@ def main(
target_crs: str,
output_root_dir: str,
):
bbox = tuple(list(region.total_bounds))
region = gpd.read_file(region_file)
results_urls = []
output_dir = os.path.join(output_root_dir, asset_name)
os.makedirs(output_dir, exist_ok=True)
@ -260,7 +261,7 @@ def main(
)
year_temporal = (f"{year}-{dates[0]}T00:00:00", f"{year}-{dates[1]}T23:59:59")
year_results = earthdata_search(
[asset_name], year_temporal, bbox, modis_tile_id
[asset_name], year_temporal, region_file, modis_tile_id
)
results_urls.extend(year_results)
with open(year_results_file, "w") as f:
@ -327,7 +328,7 @@ if __name__ == "__main__":
earthaccess.login(persist=True)
# region = gpd.read_file("./data/vectors/wuling_guanqu_polygon.geojson")
tile_id = "49REL"
region = gpd.read_file(f"./data/vectors/{tile_id}.geojson")
region_file = f"./data/vectors/{tile_id}.geojson"
target_crs = "EPSG:32649"
asset_name = "MOD11A1"
# asset_name = "MCD43A3"
@ -338,7 +339,7 @@ if __name__ == "__main__":
dates = ("03-01", "10-31")
output_root_dir = ".\\data\\MODIS\\"
main(
region,
region_file,
asset_name,
modis_tile_id,
years,

View File

@ -29,13 +29,14 @@ RTC-S1 数据产品由美国宇航局喷气推进实验室 (JPL) OPERA 项目组
-------------------------------------------------------------------------------
Authors: Hong Xie
Last Updated: 2025-03-10
Last Updated: 2025-10-20
===============================================================================
"""
import os
import sys
import glob
from pathlib import Path
import json
import time
from datetime import datetime
@ -141,7 +142,7 @@ def convert_to_db(image: xr.DataArray) -> xr.DataArray:
def process_sar_granule(
url_basenames: list[str],
output_dir: str,
output_dir: Path,
date: str,
tile_id: str,
roi: list,
@ -156,7 +157,7 @@ def process_sar_granule(
----------
url_basenames : list[str]
用于文件匹配的 RTC-S1 数据下载链接的文件名列表
output_dir : str
output_dir : Path
输出根目录
date : str
日期, 格式为 YYYY%j
@ -243,7 +244,7 @@ def process_sar_granule(
def process_sar_static_granule(
url_basenames: list[str], output_dir: str, tile_id: str, roi: list, clip=True
url_basenames: list[str], output_dir: Path, tile_id: str, roi: list, clip=True
) -> bool:
"""
OPERA RTC-S1-STATIC 静态数据预处理
@ -254,7 +255,7 @@ def process_sar_static_granule(
----------
url_basenames : list[str]
用于文件匹配的 RTC-S1-STATIC 数据下载链接的文件名列表
output_dir: str
output_dir: Path
输出根目录
tile_id: str
tile id
@ -327,12 +328,12 @@ def process_sar_static_granule(
def main(
asset_name: list[str],
region: list,
region_file: Path,
years: list[str | int],
output_dir: str,
output_dir: Path,
tile_id: str,
):
bbox = tuple(list(region.total_bounds))
region = read_file(region_file)
# 示例文件名称: OPERA_L2_RTC-S1_T040-083952-IW1_20240605T102012Z_20240612T053743Z_S1A_30_v1.0_VH.tif
static_name = ["OPERA_L2_RTC-S1-STATIC_V1"]
sar_output_dir = os.path.join(output_dir, "RTC_S1")
@ -353,7 +354,7 @@ def main(
year_results_dir, f"RTC_S1_{tile_id}_{year}_results_urls.json"
)
year_temporal = (f"{year}-06-01T00:00:00", f"{year}-08-31T23:59:59")
year_results = earthdata_search(asset_name, year_temporal, roi=bbox)
year_results = earthdata_search(asset_name, year_temporal, region_file=region_file)
results_urls.extend(year_results)
with open(year_results_file, "w") as f:
json.dump(year_results, f)
@ -361,7 +362,7 @@ def main(
with open(results_urls_file, "w") as f:
json.dump(results_urls, f)
static_granule_urls = earthdata_search(static_name, roi=bbox)
static_granule_urls = earthdata_search(static_name, region_file=region_file)
static_url_basenames = [
os.path.basename(url) for sublist in static_granule_urls for url in sublist
]
@ -454,10 +455,11 @@ if __name__ == "__main__":
earthaccess.login(persist=True)
asset_name = ["OPERA_L2_RTC-S1_V1"]
tile_id = "49REL"
region = read_file(f".\\data\\vectors\\{tile_id}.geojson")
# region = read_file(f".\\data\\vectors\\{tile_id}.geojson")
region_file = Path(f".\\data\\vectors\\{tile_id}.geojson")
# region = read_file("./data/vectors/wuling_guanqu_polygon.geojson")
# years = ["2024", "2023", "2022"]
years = ["2024"]
output_dir = ".\\data\\SAR"
os.makedirs(output_dir, exist_ok=True)
main(asset_name, region, years, output_dir, tile_id)
output_dir = Path(".\\data\\SAR")
output_dir.mkdir(parents=True, exist_ok=True)
main(asset_name, region_file, years, output_dir, tile_id)

View File

@ -7,19 +7,85 @@ This module contains functions related to searching and preprocessing HLS data.
Authors: Mahsa Jami, Cole Krehbiel, and Erik Bolch
Contact: lpdaac@usgs.gov
Editor: Hong Xie
Last Updated: 2025-02-20
Last Updated: 2025-10-16
===============================================================================
"""
# Import necessary packages
import os
import logging
from pathlib import Path
import numpy as np
import earthaccess
import geopandas as gpd
from shapely.geometry import box
from shapely.geometry.polygon import orient
def ensure_ccw(geom):
"""
Ensure the exterior ring of the polygon is counterclockwise.
"""
if geom.exterior.is_ccw:
return geom # Already counterclockwise
else:
return orient(geom, sign=1.0) # Make it counterclockwise
def format_roi(roi: Path):
"""
Determines if submitted ROI is a file or bbox coordinates.
If a file, opens a GeoJSON or shapefile and creates a list of polygon vertices in the correct order. If the file has multiple polygons it will use a unary union convex hull of the external bounds.
If bbox coordinates, creates a geodataframe with a single Polygon geometry.
Returns a geopandas dataframe for clipping and a list of vertices for searching.
"""
if os.path.isfile(roi): # and roi.endswith(("geojson", "shp")):
try:
# Open ROI if file
roi = gpd.read_file(roi)
# Check if ROI is in Geographic CRS, if not, convert to it
if not roi.crs.is_geographic:
roi = roi.to_crs("EPSG:4326")
logging.info(
"Note: ROI submitted is being converted to Geographic CRS (EPSG:4326)"
)
# (Add) 添加对多种几何图形类型的支持, 将MultiPolygon合并为Polygon
if len(roi) > 1 or roi.geometry[0].geom_type == "MultiPolygon":
# Merge all Polygon geometries and create external boundary
logging.info(
"Multiple polygons detected. Creating single geometry of external coordinates."
)
single_geometry = roi.union_all().convex_hull
roi = gpd.GeoDataFrame(geometry=[single_geometry], crs=roi.crs)
roi["geometry"] = roi["geometry"].apply(ensure_ccw)
# List Vertices in correct order for search
# (Add) 使用外包矩形坐标简化提交检索时使用的坐标, 仅支持逆时针
vertices_list = list(roi.geometry[0].exterior.coords)
except (FileNotFoundError, ValueError):
sys.exit(
f"The GeoJSON/shapefile is either not valid or could not be found.\nPlease double check the name and provide the absolute path to the file or make sure that it is located in {os.getcwd()}"
)
else:
# If bbox coordinates are submitted
bbox = tuple(map(float, roi.strip("'\"").split(",")))
print(bbox)
# Convert bbox to a geodataframe for clipping
roi = gpd.GeoDataFrame(geometry=[box(*bbox)], crs="EPSG:4326")
roi["geometry"] = roi["geometry"].apply(ensure_ccw)
vertices_list = list(roi.geometry[0].convex_hull.coords)
return (roi, vertices_list)
def earthdata_search(
asset_name: list,
dates: tuple = None,
roi: list = None,
region_file: Path = None,
tile_id: str = None,
hours: tuple = None,
log=False,
@ -30,26 +96,30 @@ def earthdata_search(
For example:
- MODIS: MCD43A3, MCD43A4, MOD11A1, MOD11A2, ...
- SMAP: SPL3SMP_E, SPL4SMGP, ...
- DEM: NASADEM_HGT, NASADEM_SC, ...
- GPM: GPM_3IMERGDL, ...
- DEM: NASADEM_HGT, NASADEM_SC, ALOS_PSR_RTC_HIGH, ALOS_PSR_RTC_LOW, ...
"""
# Search for data
if dates and not roi:
# 全球范围的数据集不需要roi参数
if not region_file:
# 全球范围的数据集不需要roi参数, 如 SMAP, GPM
results = earthaccess.search_data(
short_name=list(asset_name),
temporal=dates,
)
elif roi and not dates:
else:
# 基于 roi 的外包多边形进行数据检索
roi, vertices_list = format_roi(region_file)
if not dates:
# 适用非时间序列数据, 如 DEM
results = earthaccess.search_data(
short_name=list(asset_name),
bounding_box=roi,
polygon=vertices_list,
)
else:
results = earthaccess.search_data(
short_name=list(asset_name),
bounding_box=roi,
polygon=vertices_list,
temporal=dates,
)

View File

@ -5,7 +5,7 @@ HLS Subsetting, Processing, and Exporting Reformatted Data Prep Script
Authors: Cole Krehbiel, Mahsa Jami, and Erik Bolch
Contact: lpdaac@usgs.gov
Editor: Hong Xie
Last Updated: 2025-10-13
Last Updated: 2025-10-16
===============================================================================
"""
@ -15,7 +15,7 @@ Last Updated: 2025-10-13
# TODO Add ZARR as output option
from HLS_PER import process_granule, create_timeseries_dataset
from HLS_Su import hls_search
from HLS_Su import hls_search, format_roi
from utils.common_utils import setup_dask_environment
import os
import sys
@ -26,9 +26,6 @@ import time
import json
from datetime import datetime as dt
import earthaccess
from shapely.geometry import box
from shapely.geometry.polygon import orient
import geopandas as gpd
import dask.distributed
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@ -169,66 +166,6 @@ def parse_arguments():
return parser.parse_args()
def ensure_ccw(geom):
"""
Ensure the exterior ring of the polygon is counterclockwise.
"""
if geom.exterior.is_ccw:
return geom # Already counterclockwise
else:
return orient(geom, sign=1.0) # Make it counterclockwise
def format_roi(roi):
"""
Determines if submitted ROI is a file or bbox coordinates.
If a file, opens a GeoJSON or shapefile and creates a list of polygon vertices in the correct order. If the file has multiple polygons it will use a unary union convex hull of the external bounds.
If bbox coordinates, creates a geodataframe with a single Polygon geometry.
Returns a geopandas dataframe for clipping and a list of vertices for searching.
"""
if os.path.isfile(roi): # and roi.endswith(("geojson", "shp")):
try:
# Open ROI if file
roi = gpd.read_file(roi)
# Check if ROI is in Geographic CRS, if not, convert to it
if not roi.crs.is_geographic:
roi = roi.to_crs("EPSG:4326")
logging.info(
"Note: ROI submitted is being converted to Geographic CRS (EPSG:4326)"
)
# (Add) 添加对多种几何图形类型的支持, 将MultiPolygon合并为Polygon
if len(roi) > 1 or roi.geometry[0].geom_type == "MultiPolygon":
# Merge all Polygon geometries and create external boundary
logging.info(
"Multiple polygons detected. Creating single geometry of external coordinates."
)
single_geometry = roi.union_all().convex_hull
roi = gpd.GeoDataFrame(geometry=[single_geometry], crs=roi.crs)
roi['geometry'] = roi['geometry'].apply(ensure_ccw)
# List Vertices in correct order for search
# (Add) 使用外包矩形坐标简化提交检索时使用的坐标, 检索时仅支持逆时针
vertices_list = list(roi.geometry[0].exterior.coords)
except (FileNotFoundError, ValueError):
sys.exit(
f"The GeoJSON/shapefile is either not valid or could not be found.\nPlease double check the name and provide the absolute path to the file or make sure that it is located in {os.getcwd()}"
)
else:
# If bbox coordinates are submitted
bbox = tuple(map(float, roi.strip("'\"").split(",")))
print(bbox)
# Convert bbox to a geodataframe for clipping
roi = gpd.GeoDataFrame(geometry=[box(*bbox)], crs="EPSG:4326")
roi['geometry'] = roi['geometry'].apply(ensure_ccw)
vertices_list = list(roi.geometry[0].convex_hull.coords)
return (roi, vertices_list)
def format_dates(start, end):
# Strip Quotes
start = start.strip("'").strip('"')