feat: optimize download and archiving logic; archive only after downloads complete.
parent 7ecd0378f1 · commit 6d16e32022
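In outline, the change removes per-file DOY bookkeeping from `create_output_name` and defers all archiving to a new `files_collection` pass that runs once downloads finish. Below is a minimal sketch of the resulting control flow with stand-in bodies; the real `process_granule` and `files_collection` are in the diff that follows, and everything else here is a hypothetical placeholder:

```python
# Minimal sketch of the reordered pipeline (stand-in bodies; hypothetical data).

def process_granule(granule_urls, output_dir):
    # Stand-in for the real downloader/subsetter in this script.
    print(f"downloading {len(granule_urls)} assets into {output_dir}")

def files_collection(output_dir):
    # Stand-in for the archiver added by this commit.
    print(f"archiving files in {output_dir} by DOY")

output_dir = "hls_outputs"                           # hypothetical path
granules = [["B04.tif", "Fmask.tif"], ["B04.tif"]]   # hypothetical URL lists

# Step 1: download every granule into a flat output directory.
for urls in granules:
    process_granule(urls, output_dir)

# Step 2: archive once, only after all downloads have finished.
files_collection(output_dir)
```

Archiving as a single pass at the end avoids the mid-download directory rewrites that the old code performed inside `process_granule`.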
@@ -21,7 +21,6 @@ import xarray as xr
 import rioxarray as rxr
 import dask.distributed
-
 
 def create_output_name(url, band_dict):
     """
     Uses HLS default naming scheme to generate an output name with common band names.
@@ -30,9 +29,6 @@ def create_output_name(url, band_dict):
     # Get Necessary Strings
     prod = url.split("/")[4].split(".")[0]
     asset = url.split("/")[-1].split(".")[-2]
-    # Add: get the image DOY for later archiving
-    time = url.split("/")[-1].split(".")[3]
-    file_doy = time[:8]
     # Hard-coded one off for Fmask name in case it is not in the band_dict but is needed for masking
     # Translation: hard-code a Fmask name in case it is not in band_dict but is needed for masking
     if asset == "Fmask":
@@ -43,7 +39,7 @@ def create_output_name(url, band_dict):
                 output_name = (
                     f"{'.'.join(url.split('/')[-1].split('.')[:-2])}.{key}.subset.tif"
                 )
-    return [output_name, file_doy]
+    return output_name
 
 
 def open_hls(url, roi=None, scale=True, chunk_size=dict(band=1, x=512, y=512)):
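To make the slicing in `create_output_name` concrete, the self-contained sketch below walks a made-up HLS v2.0 asset URL through the same indexing; the URL and the `RED` → `B04` band mapping are illustrative assumptions, not values from this repo:

```python
# Hypothetical HLS v2.0 asset URL, following the LP DAAC naming scheme.
url = (
    "https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/"
    "HLSS30.020/HLS.S30.T15TUH.2020273T163901.v2.0/"
    "HLS.S30.T15TUH.2020273T163901.v2.0.B04.tif"
)

prod = url.split("/")[4].split(".")[0]     # "HLSS30" (product collection)
asset = url.split("/")[-1].split(".")[-2]  # "B04"    (band code)
stem = ".".join(url.split("/")[-1].split(".")[:-2])
# stem == "HLS.S30.T15TUH.2020273T163901.v2.0"

# With a band_dict mapping like {"RED": "B04"}, the output name becomes:
print(f"{stem}.RED.subset.tif")
# HLS.S30.T15TUH.2020273T163901.v2.0.RED.subset.tif
```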
@@ -115,9 +111,7 @@ def process_granule(
 
     # Check if all Outputs Exist for a Granule
     if not all(
-        os.path.isfile(
-            f"{output_dir}/{create_output_name(url, band_dict)[1]}/{create_output_name(url, band_dict)[0]}"
-        )
+        os.path.isfile(f"{output_dir}/{create_output_name(url, band_dict)}")
         for url in granule_urls
     ):
 
@@ -131,12 +125,7 @@ def process_granule(
         )
 
         # Check if File exists in Output Directory
-        output_name = create_output_name(quality_url, band_dict)[0]
-        # Add: archive same-day imagery into a subdirectory named after the image DOY
-        file_doy = create_output_name(quality_url, band_dict)[1]
-        output_dir = f"{output_dir}/{file_doy}"
-        if not os.path.isdir(output_dir):
-            os.makedirs(output_dir)
+        output_name = create_output_name(quality_url, band_dict)
         output_file = f"{output_dir}/{output_name}"
 
         # Open Quality Layer
@@ -408,6 +408,33 @@ def setup_dask_environment():
     env.__enter__()
 
 
+def files_collection(output_dir):
+    """
+    Archive downloaded HLS imagery into subdirectories named after the date in each filename.
+    """
+
+    # List all files in the output directory
+    files = os.listdir(output_dir)
+    # Iterate over the files
+    for file in files:
+        # Only handle GeoTIFF outputs
+        if file.endswith(".tif"):
+            # Extract the acquisition date (YYYYDDD) from the filename
+            doy = file.split(".")[3][:7]
+
+            # Build the target directory path
+            target_dir = os.path.join(output_dir, doy)
+
+            # Create the target directory if it does not exist
+            if not os.path.exists(target_dir):
+                os.makedirs(target_dir)
+
+            # Move the file into the target directory
+            source_path = os.path.join(output_dir, file)
+            target_path = os.path.join(target_dir, file)
+            shutil.move(source_path, target_path)
+
+
 def main():
     """
     Main function to run the HLS SuPER script.
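As a quick sanity check on the indexing inside `files_collection`, the snippet below shows how a subset output name yields its `YYYYDDD` archive key (the filename is a hypothetical example of what `create_output_name` produces):

```python
# Hypothetical output filename as produced by create_output_name.
file = "HLS.S30.T15TUH.2020273T163901.v2.0.RED.subset.tif"

# split(".")[3] is the acquisition timestamp "2020273T163901";
# its first seven characters are the year + day-of-year key.
doy = file.split(".")[3][:7]
print(doy)  # "2020273" -> the file lands in <output_dir>/2020273/
```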
@@ -585,6 +612,11 @@ def main():
         logging.info("Timeseries Dataset Created. Removing Temporary Files...")
         shutil.rmtree(cog_dir)
 
+    # Add: archive downloaded imagery by DOY
+    logging.info("Archiving downloaded imagery by DOY date.")
+    files_collection(output_dir)
+    logging.info("Archiving complete!")
+
     # End Timer
     total_time = time.time() - start_time
     logging.info(f"Processing complete. Total time: {round(total_time,2)}s, ")
@@ -96,6 +96,7 @@ mamba activate lpdaac_windows
 - Based on the example demo from the NASA site: https://github.com/nasa/LPDAAC-Data-Resources/blob/main/setup/setup_instructions_python.md
 - The first time the download command is run, you will be prompted for a username and password, which can be obtained by registering with [Earthdata](https://urs.earthdata.nasa.gov/).
 - Note that the password should preferably not contain symbols such as `@/#/$/%`, which may cause download errors.
+- A single user is limited to at most 100 requests per second; see: https://forum.earthdata.nasa.gov/viewtopic.php?t=3734
 
 ### 3.2 Download cloud data and preprocess it in memory
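On the Earthdata login mentioned in the bullets above: credentials can also be kept in a `~/.netrc` file (`_netrc` on Windows) so the script is not prompted interactively on each run. A minimal sketch using only the standard library; the username/password values are placeholders, and the prompting behavior of this particular script is an assumption:

```python
import netrc
from pathlib import Path

# Write a minimal Earthdata Login entry (placeholder credentials).
netrc_path = Path.home() / ".netrc"  # use "_netrc" on Windows
entry = "machine urs.earthdata.nasa.gov login YOUR_USERNAME password YOUR_PASSWORD\n"
if not netrc_path.exists():
    netrc_path.write_text(entry)
    netrc_path.chmod(0o600)  # the file must not be world-readable

# Verify that the entry parses.
auth = netrc.netrc(netrc_path).authenticators("urs.earthdata.nasa.gov")
print(auth[0] if auth else "no Earthdata entry found")
```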
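On the 100-requests-per-second limit in the last bullet: if downloads are parallelized, a small client-side throttle keeps the request rate under the cap. This is a generic sketch, not code from this repository:

```python
import threading
import time

class RateLimiter:
    """Allow at most `max_calls` acquisitions per rolling second."""

    def __init__(self, max_calls=100):
        self.max_calls = max_calls
        self.lock = threading.Lock()
        self.calls = []  # timestamps of recent acquisitions

    def acquire(self):
        while True:
            with self.lock:
                now = time.monotonic()
                # Drop timestamps older than one second.
                self.calls = [t for t in self.calls if now - t < 1.0]
                if len(self.calls) < self.max_calls:
                    self.calls.append(now)
                    return
            time.sleep(0.01)  # back off briefly and retry

limiter = RateLimiter(max_calls=100)
# Call limiter.acquire() before each HTTP request to the LP DAAC endpoint.
limiter.acquire()
```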