# -*- coding: utf-8 -*- """ =============================================================================== This module contains functions related to preprocessing MODIS data. For example, MCD43A3, MCD43A4, MOD11A1. ------------------------------------------------------------------------------- Authors: Hong Xie Last Updated: 2025-07-15 =============================================================================== """ import os import sys import json import time import logging import earthaccess import rioxarray as rxr import dask.distributed import geopandas as gpd # 动态获取项目根目录路径 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(project_root) from utils.common_utils import clip_image, reproject_image, setup_dask_environment from HLS_SuPER.HLS_Su import earthdata_search def open_mcd43a4(file_path): """ Open MODIS MCD43A4 EOS-HDF4 file. """ # MCD43A4 影像中有 14 个波段, 仅需加载其中的 6 个核心波段 # 排除其他波段, 如第 5 个波段的水汽波段 bands = [ "Nadir_Reflectance_Band1", "Nadir_Reflectance_Band2", "Nadir_Reflectance_Band3", "Nadir_Reflectance_Band4", "Nadir_Reflectance_Band6", "Nadir_Reflectance_Band7", ] # 原始波段名称到波段名称的映射 bands_mapping = { "Nadir_Reflectance_Band1": "BLUE", "Nadir_Reflectance_Band2": "NIR", "Nadir_Reflectance_Band3": "RED", "Nadir_Reflectance_Band4": "GREEN", "Nadir_Reflectance_Band6": "SWIR1", "Nadir_Reflectance_Band7": "SWIR2", } # 波段顺序 sorted_bands = ["BLUE", "GREEN", "RED", "NIR", "SWIR1", "SWIR2"] # 打开 MODIS MCD43A4 HDF4 文件 mcd43a4_bands = ( rxr.open_rasterio(file_path, variable=bands, masked=True) .squeeze("band", drop=True) .rename(bands_mapping) ) # 对波段进行排序, 这是 xarray 的特殊写法, 可以直接对 Data variables 进行排序 mcd43a4_bands = mcd43a4_bands[sorted_bands] return mcd43a4_bands def open_mcd43a3(file_path): """ Open MODIS MCD43A3 EOS-HDF4 file. """ # MCD43A3 影像中有 14 个波段, 仅需加载其中的 6 个核心波段 bands = [ "MOD_Grid_BRDF:Albedo_BSA_Band1", "MOD_Grid_BRDF:Albedo_BSA_Band2", "MOD_Grid_BRDF:Albedo_BSA_Band3", "MOD_Grid_BRDF:Albedo_BSA_Band4", "MOD_Grid_BRDF:Albedo_BSA_Band6", "MOD_Grid_BRDF:Albedo_BSA_Band7", ] mcd43a3_bands = rxr.open_rasterio(file_path, variable=bands, masked=True).squeeze( "band", drop=True ) return mcd43a3_bands def open_mod11a1(file_path): """ Open MODIS MOD11A1 EOS-HDF4 file. """ # MOD11A1 影像中有 12 个波段, 仅提取出日间温度波段 bands = ["LST_Day_1km"] # 原始波段名称到波段名称的映射 bands_mapping = {"LST_Day_1km": "LST"} # 打开 MODIS MOD11A1 HDF4 文件 mod11a1_bands = ( rxr.open_rasterio(file_path, variable=bands, masked=True) .squeeze("band", drop=True) .rename(bands_mapping) ) return mod11a1_bands def open_modis(file_path, prod_name): """ Open MODIS EOS-HDF4 file. """ if prod_name == "MOD11A1": return open_mod11a1(file_path) elif prod_name == "MCD43A3": return open_mcd43a3(file_path) elif prod_name == "MCD43A4": return open_mcd43a4(file_path) else: raise ValueError(f"Unknown MODIS product: {prod_name}.") def process_modis(download_file, prod_name, roi, clip=True, scale=True, target_crs=None, output_file=None): """ 对 MODIS 数据进行预处理, 包括裁剪, 重投影和缩放. """ modis = open_modis(download_file, prod_name) if roi is not None and clip: modis = clip_image(modis, roi) if target_crs is not None: modis = reproject_image(modis, target_crs) # 重投影后再裁剪一次 if roi is not None and clip: modis = clip_image(modis, roi) if scale: # 缩放计算后会丢源属性和坐标系, 需要先备份源数据属性信息 org_attrs = modis.attrs if prod_name == "MOD11A1": # MOD11A1 数据的温度波段单位为 K, 缩放因子为 0.02, 需要转换为摄氏度 modis = modis * 0.02 - 273.15 elif prod_name == "MCD43A3": modis = modis * 0.001 elif prod_name == "MCD43A4": modis = modis * 0.0001 # 恢复源数据属性信息 modis.attrs = org_attrs.copy() modis.rio.write_crs(target_crs, inplace=True) modis.attrs["scale_factor"] = 1 else: if prod_name == "MOD11A1": modis.attrs["scale_factor"] = 0.02 elif prod_name == "MCD43A3": modis.attrs["scale_factor"] = 0.001 elif prod_name == "MCD43A4": modis.attrs["scale_factor"] = 0.0001 modis.rio.to_raster(output_file, compress="DEFLATE") return def process_granule( granule_urls, roi, clip, scale, output_dir, target_crs="EPSG:4326", tile_id=None, ): logging.basicConfig( level=logging.INFO, format="%(levelname)s:%(asctime)s ||| %(message)s", handlers=[logging.StreamHandler(sys.stdout)], ) download_hdf_name = os.path.basename(granule_urls[0]) # 获取名称与日期 file_name_part = download_hdf_name.split(".") date = file_name_part[1][1:] prod_name = file_name_part[0] download_path = os.path.join(output_dir, "HDF") os.makedirs(download_path, exist_ok=True) download_file = os.path.join(download_path, download_hdf_name) if prod_name == "MOD11A1": out_tif_name = f"MODIS.{prod_name}.{tile_id}.{date}.LST.tif" elif prod_name == "MCD43A3": out_tif_name = f"MODIS.{prod_name}.{tile_id}.{date}.Albedo.tif" elif prod_name == "MCD43A4": out_tif_name = f"MODIS.{prod_name}.{tile_id}.{date}.NBRDF.tif" else: out_tif_name = download_hdf_name.replace(".hdf", ".tif") # 除 MCD43A4 需用于光谱指数计算外, MOD11A1 日间温度与 MCD43A4 反照率无需再按日期归档 if prod_name == "MOD11A1" or prod_name == "MCD43A3": output_path = os.path.join(output_dir, "TIF") else: output_path = os.path.join(output_dir, "TIF", date) os.makedirs(output_path, exist_ok=True) output_file = os.path.join(output_path, out_tif_name) if not os.path.isfile(output_file): # Step1: 下载 HDF 文件 if not os.path.isfile(download_file): try: earthaccess.download(granule_urls, download_path) except Exception as e: logging.error(f"Error downloading {download_file}: {e}") return else: logging.warning(f"{download_file} already exists. Skipping.") # Step2: 处理 HDF 文件 try: process_modis(download_file, prod_name, roi, clip, scale, target_crs, output_file) except Exception as e: os.remove(download_file) logging.info(f"Removed corrupted file {download_file}. Retrying download.") process_granule(granule_urls, roi, clip, scale, output_dir, target_crs, tile_id) logging.info(f"Processed {output_file} successfully.") else: logging.warning(f"{output_file} already exists. Skipping.") def main( region: list, asset_name: str, modis_tile_id: str, years: list, dates: tuple[str, str], tile_id: str, output_root_dir: str, ): bbox = tuple(list(region.total_bounds)) results_urls = [] output_dir = os.path.join(output_root_dir, asset_name) os.makedirs(output_dir, exist_ok=True) results_urls_file = os.path.join( output_dir, f"{asset_name}_{modis_tile_id}_results_urls.json" ) for year in years: year_results_dir = os.path.join(output_dir, year) os.makedirs(year_results_dir, exist_ok=True) year_results_file = os.path.join( year_results_dir, f"{asset_name}_{modis_tile_id}_{year}_results_urls.json" ) year_temporal = (f"{year}-{dates[0]}T00:00:00", f"{year}-{dates[1]}T23:59:59") year_results = earthdata_search( [asset_name], year_temporal, bbox, modis_tile_id ) results_urls.extend(year_results) with open(year_results_file, "w") as f: json.dump(year_results, f) with open(results_urls_file, "w") as f: json.dump(results_urls, f) # 配置日志, 首次配置生效, 后续嵌套配置无效 logging.basicConfig( level=logging.INFO, # 级别为INFO及以上的日志会被记录 format="%(levelname)s:%(asctime)s ||| %(message)s", handlers=[ logging.StreamHandler(sys.stdout), # 输出到控制台 logging.FileHandler( f"{output_dir}\\{asset_name}_{tile_id}_SuPER.log" ), # 输出到日志文件 ], ) client = dask.distributed.Client(timeout=60, memory_limit="8GB") client.run(setup_dask_environment) all_start_time = time.time() for year in years: year_results_dir = os.path.join(output_dir, year) year_results_file = os.path.join( year_results_dir, f"{asset_name}_{modis_tile_id}_{year}_results_urls.json" ) year_results = json.load(open(year_results_file)) client.scatter(year_results) start_time = time.time() logging.info(f"Start {year}...") tasks = [ dask.delayed(process_granule)( granule_url, roi=region, clip=True, scale=True, output_dir=year_results_dir, target_crs="EPSG:32649", tile_id=tile_id, ) for granule_url in year_results ] dask.compute(*tasks) total_time = time.time() - start_time logging.info( f"{year} MODIS {asset_name} Downloading complete and proccessed. Total time: {total_time} seconds" ) client.close() all_total_time = time.time() - all_start_time logging.info( f"All MODIS {asset_name} Downloading complete and proccessed. Total time: {all_total_time} seconds" ) if __name__ == "__main__": earthaccess.login(persist=True) # region = gpd.read_file("./data/vectors/wuling_guanqu_polygon.geojson") tile_id = "49REL" region = gpd.read_file(f"./data/vectors/{tile_id}.geojson") # asset_name = "MOD11A1" # asset_name = "MCD43A3" asset_name = "MCD43A4" modis_tile_id = "h27v06" # 示例文件名称: MCD43A4.A2024001.h27v05.061.2024010140610.hdf years = ["2024", "2023", "2022"] dates = ("03-01", "10-31") output_root_dir = ".\\data\\MODIS\\" main(region, asset_name, modis_tile_id, years, dates, tile_id, output_root_dir)