feat: improve the download and archiving logic; archive imagery only after downloads complete.

谢泓 2025-01-04 16:13:46 +08:00
parent 7ecd0378f1
commit 6d16e32022
3 changed files with 36 additions and 14 deletions

View File

@@ -21,7 +21,6 @@ import xarray as xr
 import rioxarray as rxr
 import dask.distributed

 def create_output_name(url, band_dict):
     """
     Uses HLS default naming scheme to generate an output name with common band names.
@@ -30,9 +29,6 @@ def create_output_name(url, band_dict):
     # Get Necessary Strings
     prod = url.split("/")[4].split(".")[0]
     asset = url.split("/")[-1].split(".")[-2]
-    # Add: grab the image DOY for later archiving
-    time = url.split("/")[-1].split(".")[3]
-    file_doy = time[:8]
     # Hard-coded one-off for Fmask name in case it is not in the band_dict but is needed for masking
     if asset == "Fmask":
@@ -43,7 +39,7 @@
         output_name = (
             f"{'.'.join(url.split('/')[-1].split('.')[:-2])}.{key}.subset.tif"
         )
-    return [output_name, file_doy]
+    return output_name

 def open_hls(url, roi=None, scale=True, chunk_size=dict(band=1, x=512, y=512)):
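For reference, a minimal sketch of how this naming scheme takes an HLS URL apart, using a hypothetical granule URL and band_dict (neither is taken from this repo):

    url = (
        "https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/"
        "HLSS30.020/HLS.S30.T10TEK.2020273T190109.v2.0/"
        "HLS.S30.T10TEK.2020273T190109.v2.0.B04.tif"
    )
    prod = url.split("/")[4].split(".")[0]     # "HLSS30"
    asset = url.split("/")[-1].split(".")[-2]  # "B04"
    # With band_dict = {"HLSS30": {"B04": "RED"}}, the function now returns the single
    # string "HLS.S30.T10TEK.2020273T190109.v2.0.RED.subset.tif" rather than a
    # [output_name, file_doy] pair.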
@@ -115,9 +111,7 @@ def process_granule(
     # Check if all Outputs Exist for a Granule
     if not all(
-        os.path.isfile(
-            f"{output_dir}/{create_output_name(url, band_dict)[1]}/{create_output_name(url, band_dict)[0]}"
-        )
+        os.path.isfile(f"{output_dir}/{create_output_name(url, band_dict)}")
         for url in granule_urls
     ):
@@ -131,12 +125,7 @@ def process_granule(
     )
     # Check if File exists in Output Directory
-    output_name = create_output_name(quality_url, band_dict)[0]
-    # Add: archive same-day imagery into a subdirectory named by image DOY
-    file_doy = create_output_name(quality_url, band_dict)[1]
-    output_dir = f"{output_dir}/{file_doy}"
-    if not os.path.isdir(output_dir):
-        os.makedirs(output_dir)
+    output_name = create_output_name(quality_url, band_dict)
     output_file = f"{output_dir}/{output_name}"
     # Open Quality Layer
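A quick worked example of the DOY slice this commit moves out of the download path, assuming the HLS timestamp field format YYYYDDDTHHMMSS (the sample value is hypothetical):

    time_field = "2020273T190109"  # url.split("/")[-1].split(".")[3]
    time_field[:8]  # "2020273T" -- the old slice, which kept the trailing "T"
    time_field[:7]  # "2020273"  -- year + day-of-year, as used by files_collection below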

View File

@@ -408,6 +408,33 @@ def setup_dask_environment():
     env.__enter__()
+def files_collection(output_dir):
+    """
+    Archive downloaded HLS imagery into subdirectories named by the date in each filename.
+    """
+    # List every file in the output directory
+    files = os.listdir(output_dir)
+    # Walk through the files
+    for file in files:
+        # Only archive GeoTIFF outputs
+        if file.endswith(".tif"):
+            # Extract the acquisition date (YYYYDDD) from the filename
+            doy = file.split(".")[3][:7]
+            # Build the target directory path
+            target_dir = os.path.join(output_dir, doy)
+            # Create the target directory if it does not exist
+            if not os.path.exists(target_dir):
+                os.makedirs(target_dir)
+            # Move the file into the target directory
+            source_path = os.path.join(output_dir, file)
+            target_path = os.path.join(target_dir, file)
+            shutil.move(source_path, target_path)
 def main():
     """
     Main function to run the HLS SuPER script.
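To illustrate the archiving behavior, a minimal usage sketch under an assumed layout (the temporary directory and sample filename are hypothetical; files_collection is the function added above):

    import os
    import tempfile

    out = tempfile.mkdtemp()
    open(os.path.join(out, "HLS.S30.T10TEK.2020273T190109.v2.0.RED.subset.tif"), "w").close()
    files_collection(out)
    print(sorted(os.listdir(out)))  # ['2020273']
    print(os.listdir(os.path.join(out, "2020273")))
    # ['HLS.S30.T10TEK.2020273T190109.v2.0.RED.subset.tif']

Re-running it is harmless: the YYYYDDD subdirectories do not end in ".tif", so they are skipped on later passes.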
@@ -585,6 +612,11 @@ def main():
     logging.info("Timeseries Dataset Created. Removing Temporary Files...")
     shutil.rmtree(cog_dir)
+    # Add: archive downloaded imagery by DOY
+    logging.info("Archiving downloaded imagery by DOY...")
+    files_collection(output_dir)
+    logging.info("Archiving complete!")
     # End Timer
     total_time = time.time() - start_time
     logging.info(f"Processing complete. Total time: {round(total_time,2)}s, ")

View File

@@ -96,6 +96,7 @@ mamba activate lpdaac_windows
 - Adapted from the NASA example demo: https://github.com/nasa/LPDAAC-Data-Resources/blob/main/setup/setup_instructions_python.md
 - The first time you run the download command you will be prompted for a username and password; you can obtain these by registering at [Earthdata](https://urs.earthdata.nasa.gov/).
 - Note that the password should preferably avoid symbols such as `@/#/$/%`, which can cause errors during download.
+- A single user is limited to at most 100 requests per second (see https://forum.earthdata.nasa.gov/viewtopic.php?t=3734); a simple client-side throttle sketch follows this diff.
 ### 3.2 Download cloud data and preprocess it in memory
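As a rough illustration of staying under that cap, a minimal client-side throttle sketch (the class and the hypothetical download() call are assumptions, not an Earthdata-provided mechanism):

    import time

    class RateLimiter:
        """Naive throttle: allow at most max_per_sec calls per second."""

        def __init__(self, max_per_sec=100):
            self.min_interval = 1.0 / max_per_sec
            self.last = 0.0

        def wait(self):
            now = time.monotonic()
            sleep_for = self.last + self.min_interval - now
            if sleep_for > 0:
                time.sleep(sleep_for)
            self.last = time.monotonic()

    limiter = RateLimiter(max_per_sec=100)
    # for url in granule_urls:  # hypothetical URL list
    #     limiter.wait()
    #     download(url)         # hypothetical download call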