""" 湖南省水文站点数据爬取 1. 处理爬取到的站点分布数据 2. 逐站点下载站点数据 3. 数据清洗进行数据解密 4. 检查数据完整性 数据示例: 站点位置信息: { "STCD": "61100800", "STNM": "东安 ", "ADDVCD": "431122000000000", "DRNA": 689, "FRGRD": "2", "LGTD": 111.288333, "LTTD": 26.402222, "RVNM": "紫溪 ", "STLC": "永州市东安县白牙市镇蒋家村 ", "STTP": "ZQ", "HNNM": "湘江 " }, 站点观测数据: { "_id": "eac8a911a751d75d6f67e4ba", "CRPGRWPRD": null, "CRPTY": null, "EXKEY": "@", "HITRSN": null, "SLM10": 19.9, "SLM100": null, "SLM20": 22.1, "SLM30": null, "SLM40": 24.7, "SLM60": null, "SLM80": null, "SLMMMT": null, "SRLSLM": null, "STCD": "61107700", "TM": "2024-01-31T12:00:00.000Z", "VTAVSLM": null, "AVG": "22.2", "XDSD": "91.17" }, """ import os import sys import csv import glob import pandas as pd import requests import logging import time from datetime import datetime, timedelta sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) class getInsituData: """ 获取湖南省水文站点数据 """ def __init__(self, output_dir: str, start_date: str, end_date: str) -> None: """ 初始化 Parameters ---------- output_dir : str 保存根路径 start_date : str 起始日期, 如 "2024-01-01" end_date : str 结束日期, 如 "2024-01-31" """ self.output_dir = output_dir self.start_date = start_date self.end_date = end_date def get_data_from_url(self, target_url: str) -> list[dict]: """ 获取湖南省水文站点数据 Parameters ---------- target_url : str 目标URL Returns ------- result_list : list[dict] 站点数据列表 """ # 模拟请求头 headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", "Accept": "application/json, text/plain, */*", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9", "Host": "58.20.42.94:9090", "Origin": "http://yzt.hnswkcj.com:9090", "Proxy-Connection": "keep-alive", "Referer": "http://yzt.hnswkcj.com:9090/", } # 模拟请求 result_list = [] try: with requests.get(target_url, headers=headers, timeout=10) as response: result_list = response.json()["data"] except Exception as e: result_list = [] return result_list # 使用POST请求获取数据 def get_data_from_post(self) -> list[dict]: """ 获取湖南省水文站点数据 Parameters ---------- target_url : str 目标URL data : dict POST请求参数 Returns ------- result_list : list[dict] 站点数据列表 """ # 模拟请求头 headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", "Accept": "application/json, text/plain, */*", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9", "Host": "58.20.42.94:9090", "Origin": "http://yzt.hnswkcj.com:9090", "Proxy-Connection": "keep-alive", "Referer": "http://yzt.hnswkcj.com:9090/", } # POST 请求参数 (仅筛选墒情站点) data = { "sw": { "isQuery": "true", "czlb": "水文", "xzbjz": [], "ytfl": [], "czdj": [], "zwlb": [], "swjs": [], }, "swei": { "isQuery": "false", "xzbjz": [], "ytfl": [], "czdj": [], "zwlb": [], "swjs": [], }, "js": { "isQuery": "false", "xzbjz": [], "ytfl": [], "czdj": [], "zwlb": [], "swjs": [], }, "zf": { "isQuery": "false", "xzbjz": [], "ytfl": [], "czdj": [], "zwlb": [], "swjs": [], }, "ns": { "isQuery": "false", "xzbjz": [], "ytfl": [], "czdj": [], "zwlb": [], "swjs": [], }, "swen": { "isQuery": "false", "xzbjz": [], "ytfl": [], "czdj": [], "zwlb": [], "swjs": [], }, "sq": {"isQuery": "true", "trlx": []}, "sz": {"isQuery": "false"}, "dxs": {"isQuery": "false"}, "slz": {"isQuery": "false"}, "stll": {"isQuery": "false"}, "rhpwk": {"isQuery": "false"}, "nhjc": {"isQuery": "false"}, "adcd": "43", } # 模拟请求 target_url = "http://58.20.42.94:9090/api/core/zwnew/zwxxylbMap" result_list = [] try: with requests.post( 

    # Fetch data with a POST request
    def get_data_from_post(self) -> list[dict]:
        """Fetch the province-wide station overview with a POST request.

        Returns
        -------
        result_list : list[dict]
            Station records; empty list on failure.
        """
        # Browser-like request headers (mimic the web front end)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Host": "58.20.42.94:9090",
            "Origin": "http://yzt.hnswkcj.com:9090",
            "Proxy-Connection": "keep-alive",
            "Referer": "http://yzt.hnswkcj.com:9090/",
        }
        # POST payload (select hydrology and soil moisture stations only)
        data = {
            "sw": {
                "isQuery": "true",
                "czlb": "水文",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "swei": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "js": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "zf": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "ns": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "swen": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "sq": {"isQuery": "true", "trlx": []},
            "sz": {"isQuery": "false"},
            "dxs": {"isQuery": "false"},
            "slz": {"isQuery": "false"},
            "stll": {"isQuery": "false"},
            "rhpwk": {"isQuery": "false"},
            "nhjc": {"isQuery": "false"},
            "adcd": "43",
        }
        # Issue the request; the payload is sent as a JSON body, since the
        # nested dicts would be mangled by form encoding
        target_url = "http://58.20.42.94:9090/api/core/zwnew/zwxxylbMap"
        result_list = []
        try:
            with requests.post(
                target_url, headers=headers, json=data, timeout=10
            ) as response:
                result_list = response.json()["data"]
        except Exception as e:
            logging.warning(f"Request to {target_url} failed: {e}")
            result_list = []
        return result_list

    def get_insitu_list(self) -> list[dict]:
        """Fetch and store the province-wide soil moisture station list.

        Returns
        -------
        clean_insitu_list : list[dict]
            Cleaned station records.
        """
        logging.info("Fetching the Hunan Province soil moisture station list...")
        output_file_path = os.path.join(self.output_dir, "hunan_insitu_position.csv")
        if os.path.exists(output_file_path):
            logging.info("Station list already exists, reading it from file...")
            with open(output_file_path, "r", encoding="gbk") as csvfile:
                clean_insitu_list = list(csv.DictReader(csvfile))
        else:
            target_url = (
                "http://58.20.42.94:9090/api/core/db/mm_soil/list/0/10000000/qb"
            )
            insitu_list = self.get_data_from_url(target_url)[0]["data"]
            clean_insitu_list = []
            for insitu in insitu_list:
                insitu["STNM"] = insitu.pop("STNM").strip()
                insitu["STCD"] = str(insitu.pop("STCD"))
                insitu["ADDVCD"] = str(insitu.pop("ADDVCD"))
                insitu["DRNA"] = insitu.pop("DRNA")
                insitu["FRGRD"] = insitu.pop("FRGRD")
                insitu["lng"] = insitu.pop("LGTD")
                insitu["lat"] = insitu.pop("LTTD")
                insitu["RVNM"] = insitu.pop("RVNM").strip()
                insitu["STLC"] = insitu.pop("STLC").strip()
                insitu["STTP"] = insitu.pop("STTP")
                insitu["HNNM"] = insitu.pop("HNNM").strip()
                clean_insitu_list.append(insitu)
            # Save the cleaned station list to a CSV file
            with open(output_file_path, "w", newline="", encoding="gbk") as csvfile:
                # Create the CSV writer
                field_names = clean_insitu_list[0].keys()
                csvwriter = csv.DictWriter(csvfile, fieldnames=field_names)
                # Write the header row
                csvwriter.writeheader()
                # Write the data rows
                for insitu in clean_insitu_list:
                    csvwriter.writerow(insitu)
        logging.info(
            f"{len(clean_insitu_list)} station records stored in CSV file: {output_file_path}."
        )
        return clean_insitu_list

    def clean_insitu_data(
        self, insitu_name: str, insitu_data_list: list[dict], output_path: str
    ) -> None:
        """Clean and store the observation data of a single station.

        Parameters
        ----------
        insitu_name : str
            Station name.
        insitu_data_list : list[dict]
            Raw observation records of the station.
        output_path : str
            Output file path.
        """
        clean_data_list = []
        for data in insitu_data_list:
            clean_data = {}
            # Observation timestamps are given in UTC
            clean_data["TIME"] = (
                datetime.strptime(data.pop("TM"), "%Y-%m-%dT%H:%M:%S.%fZ")
            ).strftime("%Y-%m-%dT%H:%M:%S")
            clean_data["SLM10"] = data.pop("SLM10")
            clean_data["SLM20"] = data.pop("SLM20")
            clean_data["SLM40"] = data.pop("SLM40")
            clean_data["AVG"] = data.pop("AVG")
            clean_data["XDSD"] = data.pop("XDSD")
            clean_data_list.append(clean_data)
        with open(output_path, "w", newline="", encoding="gbk") as csvfile:
            field_names = clean_data_list[0].keys()
            csvwriter = csv.DictWriter(csvfile, fieldnames=field_names)
            csvwriter.writeheader()
            # Sort by the TIME column before writing
            clean_data_list.sort(key=lambda x: x["TIME"])
            csvwriter.writerows(clean_data_list)
        logging.info(
            f"Soil moisture data of station {insitu_name} saved to: {output_path}"
        )
        return
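
    # Layout of the cleaned per-station CSV written by clean_insitu_data
    # (values taken from the observation record in the module docstring):
    #
    #   TIME,SLM10,SLM20,SLM40,AVG,XDSD
    #   2024-01-31T12:00:00,19.9,22.1,24.7,22.2,91.17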

    def get_insitu_data(
        self,
        insitu_name: str,
        insitu_id: str,
        output_file_path: str,
    ) -> None:
        """Fetch and store one station's observations for the configured date range.

        On the web page a single query may not span more than 3 months,
        otherwise the front end rejects it; requests sent directly to the
        API are not limited.

        Parameters
        ----------
        insitu_name : str
            Station name.
        insitu_id : str
            Station code.
        output_file_path : str
            Output file path.
        """
        # Observation timestamps are UTC while the query range is local time;
        # widen the range by one day on each side to keep the data complete
        real_start_date = (
            datetime.strptime(self.start_date, "%Y-%m-%d") - timedelta(days=1)
        ).strftime("%Y-%m-%d")
        real_end_date = (
            datetime.strptime(self.end_date, "%Y-%m-%d") + timedelta(days=1)
        ).strftime("%Y-%m-%d")
        target_url = f"http://58.20.42.94:9090/api/core/soil/soildrTM/{insitu_id}/{real_start_date}%2000:00/{real_end_date}%2023:59"
        result_list = self.get_data_from_url(target_url)
        if len(result_list) != 0:
            self.clean_insitu_data(insitu_name, result_list, output_file_path)
        else:
            logging.warning(f"No soil moisture data found for station {insitu_name}.")
        return

    def save_all_insitu_data(self) -> None:
        """Fetch and store every station's observations for the configured date range."""
        start_time = time.time()
        insitu_list = self.get_insitu_list()
        for insitu in insitu_list:
            insitu_id = insitu["STCD"]
            insitu_name = insitu["STNM"]
            insitu_output_dir = os.path.join(self.output_dir, str(insitu_name))
            os.makedirs(insitu_output_dir, exist_ok=True)
            output_file_path = os.path.join(
                insitu_output_dir,
                f"{insitu_name}_{self.start_date}_{self.end_date}.csv",
            )
            if os.path.exists(output_file_path):
                continue
            self.get_insitu_data(insitu_name, insitu_id, output_file_path)
        total_time = time.time() - start_time
        logging.info(f"Data download finished. Total time: {total_time} seconds")


class checkInsituData:
    """Check the completeness of the downloaded station data."""

    def __init__(
        self, output_dir: str, year: int, start_date: str, end_date: str
    ) -> None:
        self.output_dir = output_dir
        self.year = year
        self.start_date = start_date
        self.end_date = end_date
        self.start_dt = datetime.strptime(self.start_date, "%Y-%m-%d")
        self.end_dt = datetime.strptime(self.end_date, "%Y-%m-%d")
        self.all_insitu_file = os.path.join(output_dir, "hunan_insitu_position.csv")

    def check_all_insitu_data(self) -> None:
        """Check the completeness of every station's data.

        1. Check whether the station has a data file.
        2. Check whether the data cover the whole date range.
        3. Check whether each day has at least 3 observations.
        4. Mark the completeness level accordingly
           (LEVEL: 0 = no data, 1 = partial, 2 = complete).
        """
        rows = []
        with open(self.all_insitu_file, "r", encoding="gbk") as csvfile:
            rows = list(csv.DictReader(csvfile))

        # Length of the date range and the minimum number of records required
        day_num = (self.end_dt - self.start_dt).days + 1
        min_data_num = day_num * 3

        # Full date range, used to detect missing days
        full_dates = pd.date_range(start=self.start_dt, end=self.end_dt, freq="D")

        for row in rows:
            insitu_name = row["STNM"]
            insitu_files = glob.glob(
                os.path.join(
                    self.output_dir,
                    str(insitu_name),
                    f"*_{self.start_date}_{self.end_date}.csv",
                )
            )
            # New field recording the data completeness level
            level_field = f"LEVEL_{self.year}"
            row[level_field] = 0  # default: no data
            if len(insitu_files) == 0:
                continue

            insitu_df = pd.read_csv(insitu_files[0], parse_dates=["TIME"])
            cleaned_data = self.clean_data(insitu_df)

            # Save the cleaned data (UTC and local-time versions)
            base_path = insitu_files[0].replace(".csv", "")
            # UTC version
            cleaned_data.to_csv(f"{base_path}_clean_UTC.csv", index=False)

            # Check 1: does the total record count meet the minimum requirement?
            if len(cleaned_data) == 0:
                continue
            row[level_field] = 1  # partial data
            if len(cleaned_data) < min_data_num:
                continue

            # Check 2: is every day complete (at least 3 observations)?
            daily_counts = cleaned_data.set_index("TIME").resample("D").size()
            # Days missing entirely and days with fewer than 3 observations
            missing_dates = full_dates.difference(daily_counts.index)
            insufficient_days = daily_counts[daily_counts < 3]

            # if missing_dates.empty or insufficient_days.empty:
            if missing_dates.empty:
                row[level_field] = 2  # complete data
                # Local-time (UTC+8) version
                cleaned_data_UTC8 = cleaned_data.copy()
                cleaned_data_UTC8["TIME"] = cleaned_data_UTC8["TIME"] + pd.Timedelta(
                    hours=8
                )
                cleaned_data_UTC8.to_csv(f"{base_path}_clean.csv", index=False)

        # Write the augmented rows back to a new file
        output_file = self.all_insitu_file.replace(".csv", "_checked.csv")
        with open(output_file, "w", encoding="gbk", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)

        # Summarise the LEVEL values
        levels = [row[level_field] for row in rows]
        nodata_count = levels.count(0)
        partial_count = levels.count(1)
        complete_count = levels.count(2)
        logging.info(
            f"Completeness check finished for {self.start_date} - {self.end_date}: "
            f"{complete_count} stations with complete data, {partial_count} with "
            f"partial data, {nodata_count} without data."
        )
        return
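
    # Minimal sketch of the daily-coverage check above, on a toy DatetimeIndex
    # (illustrative timestamps only):
    #
    #   times = pd.to_datetime(["2024-01-01 00:00", "2024-01-01 06:00",
    #                           "2024-01-01 12:00", "2024-01-02 08:00"])
    #   daily = pd.Series(1, index=times).resample("D").size()
    #   # -> 3 observations on 2024-01-01 and 1 on 2024-01-02, so
    #   #    2024-01-02 would land in insufficient_days (< 3 observations)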

    def clean_data(self, data: pd.DataFrame):
        """Decode the raw observation records.

        Records fetched directly from the API are obfuscated ("encrypted"),
        so they have to be cleaned and decoded.
        """
        # By inspection, UTC 00:00, 06:00, 08:00, 12:00, 16:00 and 18:00 are
        # the real observation times
        data["REAL_SLM10"] = data["SLM10"].where(
            data["TIME"].dt.hour.isin([0, 6, 8, 12, 16, 18])
        )
        # Keep only rows where SLM10 equals REAL_SLM10, then drop the helper column
        data_cleaned = data[data["SLM10"] == data["REAL_SLM10"]].drop(
            columns=["REAL_SLM10"]
        )
        # Drop records outside the requested time range
        data_cleaned = data_cleaned[
            (data_cleaned["TIME"] >= self.start_dt)
            & (data_cleaned["TIME"] < self.end_dt + timedelta(days=1))
        ]
        # Handle duplicated timestamps and filter extreme-value noise
        drop_index = []
        for i in range(len(data_cleaned)):
            current_time = data_cleaned.iloc[i]["TIME"]
            current_value = data_cleaned.iloc[i]["SLM10"]
            if i == 0:
                if len(data_cleaned) > 1:
                    next_value = data_cleaned.iloc[i + 1]["SLM10"]
                    if abs(current_value - next_value) > current_value:
                        drop_index.append(i)
            elif i == len(data_cleaned) - 1:
                previous_value = data_cleaned.iloc[i - 1]["SLM10"]
                if abs(current_value - previous_value) > current_value:
                    drop_index.append(i)
            else:
                previous_value = data_cleaned.iloc[i - 1]["SLM10"]
                next_value = data_cleaned.iloc[i + 1]["SLM10"]
                previous_time = data_cleaned.iloc[i - 1]["TIME"]
                next_time = data_cleaned.iloc[i + 1]["TIME"]
                if current_time == next_time:
                    # Decide which record is noise only the first time a
                    # duplicated timestamp is met
                    if current_value > next_value:
                        if (current_value - next_value) >= next_value * 0.6:
                            if (previous_value - next_value) >= next_value * 0.6:
                                # The next record is a low-value outlier
                                drop_index.append(i + 1)
                            elif (
                                current_value - previous_value
                            ) >= previous_value * 0.6:
                                # The current record is a high-value outlier
                                drop_index.append(i)
                        else:
                            # The current record is a high-value outlier
                            drop_index.append(i)
                    elif current_value < next_value:
                        if (next_value - current_value) >= current_value * 0.6:
                            if (previous_value - current_value) >= current_value * 0.6:
                                # The current record is a low-value outlier
                                drop_index.append(i)
                            elif (next_value - previous_value) >= current_value * 0.6:
                                # The next record is a high-value outlier
                                drop_index.append(i + 1)
                        else:
                            # The next record is a high-value outlier
                            drop_index.append(i + 1)
                elif current_time == previous_time:
                    # Same timestamp as the previous record: already handled, skip
                    continue
                else:
                    # Outlier filtering when the timestamps are not duplicated
                    if current_value < previous_value and current_value < next_value:
                        # Local minimum
                        threshold = current_value * 0.6
                        if (previous_value - current_value) >= threshold or (
                            next_value - current_value
                        ) >= threshold:
                            drop_index.append(i)
                    elif current_value > previous_value and current_value > next_value:
                        # Local maximum
                        if (current_value - previous_value) >= previous_value or (
                            current_value - next_value
                        ) >= next_value:
                            drop_index.append(i)
        data_cleaned = data_cleaned.drop(data_cleaned.index[drop_index])
        return data_cleaned


def main(
    output_dir: str,
    year: str | int = 2024,
    start_date: str = "01-01",
    end_date: str | None = None,
) -> None:
    start_date = f"{year}-{start_date}"
    end_date = f"{year}-{end_date}" if end_date is not None else start_date
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s:%(asctime)s ||| %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(
                os.path.join(output_dir, f"insitu_SuPER_{start_date}_{end_date}.log")
            ),
        ],
    )
    getInsituData(output_dir, start_date, end_date).save_all_insitu_data()
    checkInsituData(output_dir, year, start_date, end_date).check_all_insitu_data()


if __name__ == "__main__":
    output_dir = ".\\data\\vectors\\HUNAN_INSITUS"
    os.makedirs(output_dir, exist_ok=True)
    year = 2024
    start_date = "01-01"
    end_date = "12-31"
    main(output_dir, year, start_date, end_date)

    # start_date = "2024-01-01"
    # end_date = "2024-01-03"
    # target_url = f"http://58.20.42.94:9090/api/core/soil/soildrTM/61400400/{start_date}%2000:00/{end_date}%2000:00"
    # results = get_data_from_url(target_url)
    # print(results)
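
    # Minimal sketch of the hour-based decoding used in checkInsituData.clean_data,
    # applied to a toy DataFrame (illustrative values; the 13:00 row stands in
    # for an obfuscated record and is discarded):
    # toy = pd.DataFrame(
    #     {
    #         "TIME": pd.to_datetime(["2024-01-31 12:00", "2024-01-31 13:00"]),
    #         "SLM10": [19.9, 35.0],
    #     }
    # )
    # real = toy[toy["TIME"].dt.hour.isin([0, 6, 8, 12, 16, 18])]
    # print(real)  # keeps only the 12:00 record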