324 lines
11 KiB
Python

# -*- coding: utf-8 -*-
"""
===============================================================================
This module contains functions related to preprocessing MODIS data.
For example, MCD43A3, MCD43A4, MOD11A1.
-------------------------------------------------------------------------------
Authors: Hong Xie
Last Updated: 2025-07-15
===============================================================================
"""
import os
import sys
import json
import time
import logging
import earthaccess
import rioxarray as rxr
import dask.distributed
import geopandas as gpd
# Resolve the project root directory dynamically so project-local packages can be imported
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_root)
from utils.common_utils import clip_image, reproject_image, setup_dask_environment
from HLS_SuPER.HLS_Su import earthdata_search
def open_mcd43a4(file_path):
    """
    Open a MODIS MCD43A4 EOS-HDF4 file and return its six core reflectance bands.

    Only the nadir reflectance layers for MODIS bands 1, 2, 3, 4, 6 and 7 are
    loaded (band 5 is intentionally skipped). They are renamed to colour names
    and returned in the order BLUE, GREEN, RED, NIR, SWIR1, SWIR2.

    Args:
        file_path: Path of the MCD43A4 HDF4 file.

    Returns:
        The object produced by ``rioxarray.open_rasterio`` for the selected
        variables, with the ``band`` dimension squeezed away and the variables
        renamed and reordered.
    """
    # Original layer name -> colour name.
    # MODIS band numbering is NOT in wavelength order: band 1 is red
    # (620-670 nm) and band 3 is blue (459-479 nm).
    band_names = {
        "Nadir_Reflectance_Band1": "RED",
        "Nadir_Reflectance_Band2": "NIR",
        "Nadir_Reflectance_Band3": "BLUE",
        "Nadir_Reflectance_Band4": "GREEN",
        "Nadir_Reflectance_Band6": "SWIR1",
        "Nadir_Reflectance_Band7": "SWIR2",
    }
    # Output order of the data variables.
    sorted_bands = ["BLUE", "GREEN", "RED", "NIR", "SWIR1", "SWIR2"]

    # Load only the layers listed above.
    mcd43a4_bands = rxr.open_rasterio(
        file_path, variable=list(band_names), masked=True
    )
    mcd43a4_bands = mcd43a4_bands.squeeze("band", drop=True).rename(band_names)

    # Indexing with a list of names selects the data variables in that order.
    return mcd43a4_bands[sorted_bands]
def open_mcd43a3(file_path):
    """
    Open a MODIS MCD43A3 EOS-HDF4 file and return six black-sky albedo layers.

    Args:
        file_path: Path of the MCD43A3 HDF4 file.

    Returns:
        The object produced by ``rioxarray.open_rasterio`` for the selected
        ``Albedo_BSA`` layers (bands 1, 2, 3, 4, 6, 7), with the ``band``
        dimension squeezed away. Variable names are left as loaded.
    """
    # Only these six layers are requested from the file.
    albedo_variables = [
        "MOD_Grid_BRDF:Albedo_BSA_Band1",
        "MOD_Grid_BRDF:Albedo_BSA_Band2",
        "MOD_Grid_BRDF:Albedo_BSA_Band3",
        "MOD_Grid_BRDF:Albedo_BSA_Band4",
        "MOD_Grid_BRDF:Albedo_BSA_Band6",
        "MOD_Grid_BRDF:Albedo_BSA_Band7",
    ]
    opened = rxr.open_rasterio(file_path, variable=albedo_variables, masked=True)
    return opened.squeeze("band", drop=True)
def open_mod11a1(file_path):
    """
    Open a MODIS MOD11A1 EOS-HDF4 file and return the daytime temperature layer.

    Args:
        file_path: Path of the MOD11A1 HDF4 file.

    Returns:
        The object produced by ``rioxarray.open_rasterio`` for ``LST_Day_1km``,
        with the ``band`` dimension squeezed away and the variable renamed
        to ``LST``.
    """
    # Original layer name -> output name; only the daytime layer is extracted.
    rename_map = {"LST_Day_1km": "LST"}

    opened = rxr.open_rasterio(file_path, variable=list(rename_map), masked=True)
    squeezed = opened.squeeze("band", drop=True)
    return squeezed.rename(rename_map)
def open_modis(file_path, prod_name):
    """
    Open a MODIS EOS-HDF4 file with the reader that matches its product.

    Args:
        file_path: Path of the HDF4 file.
        prod_name: Product short name; one of "MOD11A1", "MCD43A3", "MCD43A4".

    Returns:
        Whatever the product-specific reader returns.

    Raises:
        ValueError: If ``prod_name`` is not one of the supported products.
    """
    # Reject unsupported products up front.
    if prod_name not in ("MOD11A1", "MCD43A3", "MCD43A4"):
        raise ValueError(f"Unknown MODIS product: {prod_name}.")

    if prod_name == "MOD11A1":
        return open_mod11a1(file_path)
    if prod_name == "MCD43A3":
        return open_mcd43a3(file_path)
    return open_mcd43a4(file_path)
def process_modis(download_file, prod_name, roi, clip=True, scale=True, target_crs=None, output_file=None):
    """
    Preprocess one MODIS HDF file: clip, reproject, scale, and write a GeoTIFF.

    Args:
        download_file: Path of the downloaded HDF4 file.
        prod_name: Product short name ("MOD11A1", "MCD43A3" or "MCD43A4").
        roi: Region of interest handed to ``clip_image``; ``None`` disables
            clipping.
        clip: Whether to clip to ``roi``.
        scale: Whether to convert raw values to physical values. When False
            the raw values are written and the product's scale factor is
            recorded in the ``scale_factor`` attribute instead.
        target_crs: CRS to reproject to; ``None`` keeps the source CRS.
        output_file: Path of the GeoTIFF to write.

    Raises:
        ValueError: If ``prod_name`` is not a supported product (raised by
            ``open_modis``).
    """
    # Multipliers that turn the stored values into physical values.
    scale_factors = {"MOD11A1": 0.02, "MCD43A3": 0.001, "MCD43A4": 0.0001}

    modis = open_modis(download_file, prod_name)
    should_clip = roi is not None and clip

    if should_clip:
        modis = clip_image(modis, roi)

    if target_crs is not None:
        modis = reproject_image(modis, target_crs)
        # Clip once more after reprojection.
        if should_clip:
            modis = clip_image(modis, roi)

    if scale:
        # The arithmetic below drops attributes and CRS information, so both
        # are captured first and restored afterwards.
        org_attrs = modis.attrs
        org_crs = modis.rio.crs

        if prod_name == "MOD11A1":
            # Temperature is stored in Kelvin with factor 0.02; convert to Celsius.
            modis = modis * scale_factors[prod_name] - 273.15
        elif prod_name in scale_factors:
            modis = modis * scale_factors[prod_name]

        modis.attrs = org_attrs.copy()
        # Without a reprojection target, fall back to the CRS the data had
        # before scaling instead of writing ``None``.
        restored_crs = target_crs if target_crs is not None else org_crs
        modis.rio.write_crs(restored_crs, inplace=True)
        # Values are already physical, so no further scaling applies.
        modis.attrs["scale_factor"] = 1
    elif prod_name in scale_factors:
        modis.attrs["scale_factor"] = scale_factors[prod_name]

    modis.rio.to_raster(output_file, compress="DEFLATE")
def process_granule(
    granule_urls,
    roi,
    clip,
    scale,
    output_dir,
    target_crs="EPSG:4326",
    tile_id=None,
    max_retries=3,
):
    """
    Download one MODIS granule and preprocess it into a GeoTIFF.

    The product name and acquisition date are parsed from the file name of the
    first URL (e.g. ``MCD43A4.A2024001.h27v05.061.2024010140610.hdf``). The HDF
    is stored under ``<output_dir>/HDF`` and the GeoTIFF under
    ``<output_dir>/TIF`` (MOD11A1, MCD43A3) or ``<output_dir>/TIF/<date>``
    (all other products). Nothing is done when the GeoTIFF already exists.

    Args:
        granule_urls: Sequence of URLs belonging to one granule.
        roi: Region of interest forwarded to ``process_modis``.
        clip: Whether to clip to ``roi``.
        scale: Whether to scale raw values to physical values.
        output_dir: Directory that receives the ``HDF`` and ``TIF`` folders.
        target_crs: CRS forwarded to ``process_modis``.
        tile_id: Tile identifier embedded in the output file name.
        max_retries: How many times a granule is re-downloaded and
            re-processed after a processing failure before giving up.

    Returns:
        None. Failures are logged rather than raised.
    """
    # Only the first basicConfig call in a process takes effect; repeating it
    # here makes sure a process that was never configured still gets a handler.
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s:%(asctime)s ||| %(message)s",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    download_hdf_name = os.path.basename(granule_urls[0])
    # File name layout: <product>.A<date>.<tile>...; drop the leading "A".
    file_name_part = download_hdf_name.split(".")
    prod_name = file_name_part[0]
    date = file_name_part[1][1:]

    download_path = os.path.join(output_dir, "HDF")
    os.makedirs(download_path, exist_ok=True)
    download_file = os.path.join(download_path, download_hdf_name)

    suffix_by_product = {"MOD11A1": "LST", "MCD43A3": "Albedo", "MCD43A4": "NBRDF"}
    if prod_name in suffix_by_product:
        out_tif_name = (
            f"MODIS.{prod_name}.{tile_id}.{date}.{suffix_by_product[prod_name]}.tif"
        )
    else:
        out_tif_name = download_hdf_name.replace(".hdf", ".tif")

    # MOD11A1 (daytime temperature) and MCD43A3 (albedo) are not archived per
    # date; other products get one sub-folder per date.
    if prod_name in ("MOD11A1", "MCD43A3"):
        output_path = os.path.join(output_dir, "TIF")
    else:
        output_path = os.path.join(output_dir, "TIF", date)
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, out_tif_name)

    if os.path.isfile(output_file):
        logging.warning(f"{output_file} already exists. Skipping.")
        return

    total_attempts = max(int(max_retries), 0) + 1
    for attempt in range(1, total_attempts + 1):
        # Step 1: download the HDF file unless it is already on disk.
        if not os.path.isfile(download_file):
            try:
                earthaccess.download(granule_urls, download_path)
            except Exception as e:
                logging.error(f"Error downloading {download_file}: {e}")
                return
        else:
            logging.warning(f"{download_file} already exists. Skipping.")

        # Step 2: process the HDF file.
        try:
            process_modis(download_file, prod_name, roi, clip, scale, target_crs, output_file)
        except Exception as e:
            logging.error(
                f"Error processing {download_file} "
                f"(attempt {attempt}/{total_attempts}): {e}"
            )
            # Treat the download as corrupted, and drop any partially written
            # GeoTIFF so a later run does not mistake it for a finished one.
            for stale_file in (download_file, output_file):
                try:
                    os.remove(stale_file)
                except FileNotFoundError:
                    pass
                except OSError as remove_error:
                    logging.warning(f"Could not remove {stale_file}: {remove_error}")
            if attempt < total_attempts:
                logging.info(f"Removed corrupted file {download_file}. Retrying download.")
            continue

        logging.info(f"Processed {output_file} successfully.")
        return

    logging.error(f"Giving up on {download_file} after {total_attempts} attempt(s).")
def main(
    region: "gpd.GeoDataFrame",
    asset_name: str,
    modis_tile_id: str,
    years: list,
    dates: tuple[str, str],
    tile_id: str,
    output_root_dir: str,
):
    """
    Search, download and preprocess MODIS granules for several years.

    For every year the granule URLs are searched with ``earthdata_search`` and
    saved as JSON. The granules are then processed year by year as dask
    delayed ``process_granule`` tasks.

    Args:
        region: Region of interest; its ``total_bounds`` give the search
            bounding box and it is passed on as the clipping ROI.
        asset_name: MODIS product short name, e.g. "MCD43A4".
        modis_tile_id: MODIS grid tile used for the search, e.g. "h27v06".
        years: Years to process, as strings (they are used as folder names).
        dates: ``(start, end)`` month-day strings applied to every year,
            e.g. ``("03-01", "10-31")``.
        tile_id: Tile identifier used in output and log file names.
        output_root_dir: Root directory; results go to
            ``<output_root_dir>/<asset_name>``.
    """
    bbox = tuple(list(region.total_bounds))
    output_dir = os.path.join(output_root_dir, asset_name)
    os.makedirs(output_dir, exist_ok=True)
    results_urls_file = os.path.join(
        output_dir, f"{asset_name}_{modis_tile_id}_results_urls.json"
    )

    def year_paths(year):
        # Folder of one year and the JSON file holding its granule URLs.
        year_dir = os.path.join(output_dir, year)
        urls_file = os.path.join(
            year_dir, f"{asset_name}_{modis_tile_id}_{year}_results_urls.json"
        )
        return year_dir, urls_file

    # Phase 1: search each year and persist the URL lists.
    results_urls = []
    for year in years:
        year_results_dir, year_results_file = year_paths(year)
        os.makedirs(year_results_dir, exist_ok=True)
        year_temporal = (f"{year}-{dates[0]}T00:00:00", f"{year}-{dates[1]}T23:59:59")
        year_results = earthdata_search(
            [asset_name], year_temporal, bbox, modis_tile_id
        )
        results_urls.extend(year_results)
        with open(year_results_file, "w") as f:
            json.dump(year_results, f)

    with open(results_urls_file, "w") as f:
        json.dump(results_urls, f)

    # Configure logging; only the first configuration takes effect, later
    # nested basicConfig calls are no-ops.
    logging.basicConfig(
        level=logging.INFO,  # record INFO and above
        format="%(levelname)s:%(asctime)s ||| %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),  # console
            logging.FileHandler(
                # os.path.join keeps the log inside output_dir on every OS.
                os.path.join(output_dir, f"{asset_name}_{tile_id}_SuPER.log")
            ),  # log file
        ],
    )

    # Phase 2: process the granules year by year on a local dask cluster.
    client = dask.distributed.Client(timeout=60, memory_limit="8GB")
    try:
        client.run(setup_dask_environment)
        all_start_time = time.time()
        for year in years:
            year_results_dir, year_results_file = year_paths(year)
            with open(year_results_file) as f:
                year_results = json.load(f)
            # NOTE(review): the futures returned by scatter are not used;
            # the call is kept as in the existing workflow.
            client.scatter(year_results)

            start_time = time.time()
            logging.info(f"Start {year}...")
            tasks = [
                dask.delayed(process_granule)(
                    granule_url,
                    roi=region,
                    clip=True,
                    scale=True,
                    output_dir=year_results_dir,
                    target_crs="EPSG:32649",
                    tile_id=tile_id,
                )
                for granule_url in year_results
            ]
            dask.compute(*tasks)

            total_time = time.time() - start_time
            logging.info(
                f"{year} MODIS {asset_name} Downloading complete and proccessed. Total time: {total_time} seconds"
            )
    finally:
        # Always shut the client down, even when a year fails.
        client.close()

    all_total_time = time.time() - all_start_time
    logging.info(
        f"All MODIS {asset_name} Downloading complete and proccessed. Total time: {all_total_time} seconds"
    )
if __name__ == "__main__":
    # Authenticate through earthaccess before any search or download.
    earthaccess.login(persist=True)

    # Tile whose GeoJSON outline is used as the region of interest.
    # Alternative region file: "./data/vectors/wuling_guanqu_polygon.geojson"
    tile_id = "49REL"
    region = gpd.read_file(f"./data/vectors/{tile_id}.geojson")

    # Product to process; other supported values: "MOD11A1", "MCD43A3".
    asset_name = "MCD43A4"
    # Example granule file name: MCD43A4.A2024001.h27v05.061.2024010140610.hdf
    modis_tile_id = "h27v06"

    # Years and the month-day window applied within each year.
    years = ["2024", "2023", "2022"]
    dates = ("03-01", "10-31")

    output_root_dir = ".\\data\\MODIS\\"

    main(
        region=region,
        asset_name=asset_name,
        modis_tile_id=modis_tile_id,
        years=years,
        dates=dates,
        tile_id=tile_id,
        output_root_dir=output_root_dir,
    )