"""
|
|
湖南省水文站点数据爬取
|
|
|
|
1. 处理爬取到的站点分布数据
|
|
2. 逐站点下载站点数据
|
|
3. 数据清洗进行数据解密
|
|
4. 检查数据完整性
|
|
|
|
数据示例:
|
|
站点位置信息:
|
|
{
|
|
"STCD": "61100800",
|
|
"STNM": "东安 ",
|
|
"ADDVCD": "431122000000000",
|
|
"DRNA": 689,
|
|
"FRGRD": "2",
|
|
"LGTD": 111.288333,
|
|
"LTTD": 26.402222,
|
|
"RVNM": "紫溪 ",
|
|
"STLC": "永州市东安县白牙市镇蒋家村 ",
|
|
"STTP": "ZQ",
|
|
"HNNM": "湘江 "
|
|
},
|
|
站点观测数据:
|
|
{
|
|
"_id": "eac8a911a751d75d6f67e4ba",
|
|
"CRPGRWPRD": null,
|
|
"CRPTY": null,
|
|
"EXKEY": "@",
|
|
"HITRSN": null,
|
|
"SLM10": 19.9,
|
|
"SLM100": null,
|
|
"SLM20": 22.1,
|
|
"SLM30": null,
|
|
"SLM40": 24.7,
|
|
"SLM60": null,
|
|
"SLM80": null,
|
|
"SLMMMT": null,
|
|
"SRLSLM": null,
|
|
"STCD": "61107700",
|
|
"TM": "2024-01-31T12:00:00.000Z",
|
|
"VTAVSLM": null,
|
|
"AVG": "22.2",
|
|
"XDSD": "91.17"
|
|
},
|
|
"""
|
|
|
|
import os
import sys
import csv
import glob
import pandas as pd
import requests
import logging
import time
from datetime import datetime, timedelta

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

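
# Output layout produced by the classes below (a sketch; names follow the code in
# save_all_insitu_data / check_all_insitu_data):
#
#   <output_dir>/hunan_insitu_position.csv              station list
#   <output_dir>/hunan_insitu_position_checked.csv      station list + LEVEL_<year> flags
#   <output_dir>/<STNM>/<STNM>_<start>_<end>.csv            raw observations
#   <output_dir>/<STNM>/<STNM>_<start>_<end>_clean_UTC.csv  cleaned, UTC timestamps
#   <output_dir>/<STNM>/<STNM>_<start>_<end>_clean.csv      cleaned, UTC+8 timestamps
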
class getInsituData:
    """
    Fetch soil-moisture station data for Hunan Province.
    """

    def __init__(self, output_dir: str, start_date: str, end_date: str) -> None:
        """
        Initialize.

        Parameters
        ----------
        output_dir : str
            Root directory for saved output
        start_date : str
            Start date, e.g. "2024-01-01"
        end_date : str
            End date, e.g. "2024-01-31"
        """

        self.output_dir = output_dir
        self.start_date = start_date
        self.end_date = end_date

    def get_data_from_url(self, target_url: str) -> list[dict]:
        """
        Fetch Hunan hydrological station data with a GET request.

        Parameters
        ----------
        target_url : str
            Target URL

        Returns
        -------
        result_list : list[dict]
            List of station data records (empty on failure)
        """

        # Spoofed browser request headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Host": "58.20.42.94:9090",
            "Origin": "http://yzt.hnswkcj.com:9090",
            "Proxy-Connection": "keep-alive",
            "Referer": "http://yzt.hnswkcj.com:9090/",
        }
        # Issue the request
        result_list = []
        try:
            with requests.get(target_url, headers=headers, timeout=10) as response:
                result_list = response.json()["data"]
        except Exception as e:
            logging.warning(f"Request to {target_url} failed: {e}")
            result_list = []
        return result_list

    # Fetch data with a POST request
    def get_data_from_post(self) -> list[dict]:
        """
        Fetch the Hunan hydrological station list with a POST request.

        Parameters
        ----------
        None

        Returns
        -------
        result_list : list[dict]
            List of station data records (empty on failure)
        """

        # Spoofed browser request headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Host": "58.20.42.94:9090",
            "Origin": "http://yzt.hnswkcj.com:9090",
            "Proxy-Connection": "keep-alive",
            "Referer": "http://yzt.hnswkcj.com:9090/",
        }
        # POST payload (select soil-moisture stations only)
        data = {
            "sw": {
                "isQuery": "true",
                "czlb": "水文",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "swei": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "js": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "zf": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "ns": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "swen": {
                "isQuery": "false",
                "xzbjz": [],
                "ytfl": [],
                "czdj": [],
                "zwlb": [],
                "swjs": [],
            },
            "sq": {"isQuery": "true", "trlx": []},
            "sz": {"isQuery": "false"},
            "dxs": {"isQuery": "false"},
            "slz": {"isQuery": "false"},
            "stll": {"isQuery": "false"},
            "rhpwk": {"isQuery": "false"},
            "nhjc": {"isQuery": "false"},
            "adcd": "43",
        }
        # Issue the request; the nested payload is sent as a JSON body
        target_url = "http://58.20.42.94:9090/api/core/zwnew/zwxxylbMap"
        result_list = []
        try:
            with requests.post(
                target_url, headers=headers, json=data, timeout=10
            ) as response:
                result_list = response.json()["data"]
        except Exception as e:
            logging.warning(f"Request to {target_url} failed: {e}")
            result_list = []
        return result_list

    def get_insitu_list(self) -> list[dict]:
        """
        Fetch and store the list of soil-moisture stations for the whole of Hunan.

        Parameters
        ----------
        None

        Returns
        -------
        insitu_list : list[dict]
            Cleaned list of station records
        """

        logging.info("Fetching the soil-moisture station list for Hunan Province...")
        output_file_path = os.path.join(self.output_dir, "hunan_insitu_position.csv")
        if os.path.exists(output_file_path):
            logging.info("Station list already exists, reading it...")
            with open(output_file_path, "r", encoding="gbk") as csvfile:
                clean_insitu_list = list(csv.DictReader(csvfile))
        else:
            target_url = (
                "http://58.20.42.94:9090/api/core/db/mm_soil/list/0/10000000/qb"
            )
            insitu_list = self.get_data_from_url(target_url)[0]["data"]

            clean_insitu_list = []
            for insitu in insitu_list:
                insitu["STNM"] = insitu.pop("STNM").strip()
                insitu["STCD"] = str(insitu.pop("STCD"))
                insitu["ADDVCD"] = str(insitu.pop("ADDVCD"))
                insitu["DRNA"] = insitu.pop("DRNA")
                insitu["FRGRD"] = insitu.pop("FRGRD")
                insitu["lng"] = insitu.pop("LGTD")
                insitu["lat"] = insitu.pop("LTTD")
                insitu["RVNM"] = insitu.pop("RVNM").strip()
                insitu["STLC"] = insitu.pop("STLC").strip()
                insitu["STTP"] = insitu.pop("STTP")
                insitu["HNNM"] = insitu.pop("HNNM").strip()
                clean_insitu_list.append(insitu)

            # Save the cleaned records to a CSV file
            with open(output_file_path, "w", newline="", encoding="gbk") as csvfile:
                # Create the CSV writer
                field_names = clean_insitu_list[0].keys()
                csvwriter = csv.DictWriter(csvfile, fieldnames=field_names)
                # Write the header row
                csvwriter.writeheader()
                # Write the data rows
                for insitu in clean_insitu_list:
                    csvwriter.writerow(insitu)
            logging.info(
                f"{len(clean_insitu_list)} station records written to CSV: {output_file_path}."
            )
        return clean_insitu_list

    def clean_insitu_data(
        self, insitu_name: str, insitu_data_list: list[dict], output_path: str
    ) -> None:
        """
        Clean and store the observation data of one station.

        Parameters
        ----------
        insitu_name : str
            Station name
        insitu_data_list : list[dict]
            List of raw observation records
        output_path : str
            Output file path
        """

        clean_insitu_data = []
        for data in insitu_data_list:
            clean_data = {}
            # Observation timestamps are in UTC
            clean_data["TIME"] = (
                datetime.strptime(data.pop("TM"), "%Y-%m-%dT%H:%M:%S.%fZ")
            ).strftime("%Y-%m-%dT%H:%M:%S")
            clean_data["SLM10"] = data.pop("SLM10")
            clean_data["SLM20"] = data.pop("SLM20")
            clean_data["SLM40"] = data.pop("SLM40")
            clean_data["AVG"] = data.pop("AVG")
            clean_data["XDSD"] = data.pop("XDSD")
            clean_insitu_data.append(clean_data)

        with open(output_path, "w", newline="", encoding="gbk") as csvfile:
            field_names = clean_insitu_data[0].keys()
            csvwriter = csv.DictWriter(csvfile, fieldnames=field_names)
            csvwriter.writeheader()
            # Sort by the TIME column before writing
            clean_insitu_data.sort(key=lambda x: x["TIME"])
            csvwriter.writerows(clean_insitu_data)
        logging.info(
            f"Soil-moisture data for station {insitu_name} saved to: {output_path}"
        )
        return

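    # Resulting per-station CSV layout (a sketch; the sample values are taken from
    # the observation record shown in the module docstring):
    #
    #   TIME,SLM10,SLM20,SLM40,AVG,XDSD
    #   2024-01-31T12:00:00,19.9,22.1,24.7,22.2,91.17
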
    def get_insitu_data(
        self,
        insitu_name: str,
        insitu_id: str,
        output_file_path: str,
    ) -> None:
        """
        Fetch and store one station's observations for the configured date range.

        The web frontend rejects query ranges longer than 3 months,
        but requests sent directly to the API are not blocked.

        Parameters
        ----------
        insitu_name : str
            Station name
        insitu_id : str
            Station code
        output_file_path : str
            Output file path
        """

        # Observation timestamps are UTC while the request range is local time;
        # widen the request window by one day on each side to keep the data complete.
        real_start_date = (
            datetime.strptime(self.start_date, "%Y-%m-%d") - timedelta(days=1)
        ).strftime("%Y-%m-%d")
        real_end_date = (
            datetime.strptime(self.end_date, "%Y-%m-%d") + timedelta(days=1)
        ).strftime("%Y-%m-%d")
        target_url = f"http://58.20.42.94:9090/api/core/soil/soildrTM/{insitu_id}/{real_start_date}%2000:00/{real_end_date}%2023:59"
        result_list = self.get_data_from_url(target_url)
        if len(result_list) != 0:
            self.clean_insitu_data(insitu_name, result_list, output_file_path)
        else:
            logging.warning(f"No soil-moisture data found for station {insitu_name}.")
        return

    def save_all_insitu_data(self) -> None:
        """
        Fetch and store observations for every station in the configured date range.
        """
        start_time = time.time()
        insitu_list = self.get_insitu_list()
        for insitu in insitu_list:
            insitu_id = insitu["STCD"]
            insitu_name = insitu["STNM"]
            insitu_output_dir = os.path.join(self.output_dir, str(insitu_name))
            os.makedirs(insitu_output_dir, exist_ok=True)
            output_file_path = os.path.join(
                insitu_output_dir,
                f"{insitu_name}_{self.start_date}_{self.end_date}.csv",
            )
            if os.path.exists(output_file_path):
                continue
            self.get_insitu_data(insitu_name, insitu_id, output_file_path)

        total_time = time.time() - start_time
        logging.info(f"Data download finished. Total time: {total_time} seconds")

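    # Usage sketch (kept as a comment, mirroring the example at the bottom of this
    # file): fetch one month of data for all stations into the default output folder.
    #
    #   fetcher = getInsituData(".\\data\\vectors\\HUNAN_INSITUS", "2024-01-01", "2024-01-31")
    #   fetcher.save_all_insitu_data()

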
class checkInsituData:
    """
    Check station data completeness.
    """

    def __init__(
        self, output_dir: str, year: int, start_date: str, end_date: str
    ) -> None:
        self.output_dir = output_dir
        self.year = year
        self.start_date = start_date
        self.end_date = end_date
        self.start_dt = datetime.strptime(self.start_date, "%Y-%m-%d")
        self.end_dt = datetime.strptime(self.end_date, "%Y-%m-%d")
        self.all_insitu_file = os.path.join(output_dir, "hunan_insitu_position.csv")

    def check_all_insitu_data(self) -> None:
        """
        Check data completeness for every station.

        1. Check whether the station has any data file.
        2. Check whether the data cover the full date range.
        3. Check whether each day has at least 3 observations.
        4. Mark each station with a completeness level
           (LEVEL, 0 = no data, 1 = partial, 2 = complete).
        """

        rows = []
        with open(self.all_insitu_file, "r", encoding="gbk") as csvfile:
            rows = list(csv.DictReader(csvfile))

        # Date span and minimum number of required records
        day_num = (self.end_dt - self.start_dt).days + 1
        min_data_num = day_num * 3
        # Full date range (used to detect missing days)
        full_dates = pd.date_range(start=self.start_dt, end=self.end_dt, freq="D")

        for row in rows:
            insitu_name = row["STNM"]
            insitu_files = glob.glob(
                os.path.join(
                    self.output_dir,
                    str(insitu_name),
                    f"*_{self.start_date}_{self.end_date}.csv",
                )
            )
            # New field recording data completeness
            level_field = f"LEVEL_{self.year}"
            row[level_field] = 0  # default: no data
            if len(insitu_files) == 0:
                continue
            insitu_df = pd.read_csv(insitu_files[0], parse_dates=["TIME"])
            cleaned_data = self.clean_data(insitu_df)
            # Save the cleaned data (local-time and UTC versions)
            base_path = insitu_files[0].replace(".csv", "")
            # UTC version
            cleaned_data.to_csv(f"{base_path}_clean_UTC.csv", index=False)

            # Check 1: does the total record count meet the minimum?
            if len(cleaned_data) == 0:
                continue
            row[level_field] = 1  # partial data
            if len(cleaned_data) < min_data_num:
                continue
            # Check 2: is every day complete (at least 3 observations)?
            daily_counts = cleaned_data.set_index("TIME").resample("D").size()

            # The data must cover every date and have at least 3 observations per day
            missing_dates = full_dates.difference(daily_counts.index)
            insufficient_days = daily_counts[daily_counts < 3]
            # if missing_dates.empty or insufficient_days.empty:
            if missing_dates.empty:
                row[level_field] = 2  # complete data
                # Local-time (UTC+8) version
                cleaned_data_UTC8 = cleaned_data.copy()
                cleaned_data_UTC8["TIME"] = cleaned_data_UTC8["TIME"] + pd.Timedelta(
                    hours=8
                )
                cleaned_data_UTC8.to_csv(f"{base_path}_clean.csv", index=False)

        # Write the annotated station list back to a new file
        output_file = self.all_insitu_file.replace(".csv", "_checked.csv")
        with open(output_file, "w", encoding="gbk", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)

        # Collect the LEVEL values
        levels = [row[level_field] for row in rows]
        nodata_count = levels.count(0)
        partial_count = levels.count(1)
        complete_count = levels.count(2)
        logging.info(
            f"Completeness check finished for {self.start_date} - {self.end_date}: "
            f"{complete_count} stations complete, {partial_count} partial, "
            f"{nodata_count} without data."
        )
        return

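    # Sketch of how the checked station list might be consumed afterwards
    # (assumption: year=2024, so the flag column is named LEVEL_2024):
    #
    #   checked = pd.read_csv("hunan_insitu_position_checked.csv", encoding="gbk")
    #   complete_stations = checked[checked["LEVEL_2024"] == 2]["STNM"].tolist()
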
    def clean_data(self, data: pd.DataFrame):
        """
        Records fetched directly from the API are obfuscated;
        clean them and recover the real observations.
        """
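        # Worked illustration of the noise rules below (made-up values, not from the
        # original code): for three consecutive readings 20.1, 55.0, 21.3 at distinct
        # times, 55.0 is a local maximum and 55.0 - 20.1 = 34.9 >= 20.1, so the middle
        # row is dropped as an extreme-high spike; for readings 20.1, 7.5, 21.3 the
        # middle row is a local minimum and 20.1 - 7.5 = 12.6 >= 0.6 * 7.5 = 4.5,
        # so it is dropped as well.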
        # Empirically, UTC hours 00:00, 06:00, 08:00, 12:00, 16:00 and 18:00
        # are the real observation times
        data["REAL_SLM10"] = data["SLM10"].where(
            data["TIME"].dt.hour.isin([0, 6, 8, 12, 16, 18])
        )
        # Keep only rows where SLM10 equals REAL_SLM10, then drop the helper column
        data_cleaned = data[data["SLM10"] == data["REAL_SLM10"]].drop(
            columns=["REAL_SLM10"]
        )
        # Drop records outside the requested time range
        data_cleaned = data_cleaned[
            (data_cleaned["TIME"] >= self.start_dt)
            & (data_cleaned["TIME"] < self.end_dt + timedelta(days=1))
        ]
        # Handle duplicated timestamps and filter extreme-value noise
        if len(data_cleaned) < 2:
            # Nothing to compare against
            return data_cleaned
        drop_index = []
        # Iterate over every row so the first/last-row branches below are reachable
        for i in range(len(data_cleaned)):
            current_time = data_cleaned.iloc[i]["TIME"]
            current_value = data_cleaned.iloc[i]["SLM10"]
            if i == 0:
                next_value = data_cleaned.iloc[i + 1]["SLM10"]
                if abs(current_value - next_value) > current_value:
                    drop_index.append(i)
            elif i == len(data_cleaned) - 1:
                previous_value = data_cleaned.iloc[i - 1]["SLM10"]
                if abs(current_value - previous_value) > current_value:
                    drop_index.append(i)
            else:
                previous_value = data_cleaned.iloc[i - 1]["SLM10"]
                next_value = data_cleaned.iloc[i + 1]["SLM10"]
                previous_time = data_cleaned.iloc[i - 1]["TIME"]
                next_time = data_cleaned.iloc[i + 1]["TIME"]
                if current_time == next_time:
                    # Judge noise only the first time a duplicated timestamp is seen
                    if current_value > next_value:
                        if (current_value - next_value) >= next_value * 0.6:
                            if (previous_value - next_value) >= next_value * 0.6:
                                # The next record is extreme-low noise
                                drop_index.append(i + 1)
                            elif (
                                current_value - previous_value
                            ) >= previous_value * 0.6:
                                # The current record is extreme-high noise
                                drop_index.append(i)
                        else:
                            # The current record is extreme-high noise
                            drop_index.append(i)
                    elif current_value < next_value:
                        if (next_value - current_value) >= current_value * 0.6:
                            if (previous_value - current_value) >= current_value * 0.6:
                                # The current record is extreme-low noise
                                drop_index.append(i)
                            elif (next_value - previous_value) >= current_value * 0.6:
                                # The next record is extreme-high noise
                                drop_index.append(i + 1)
                        else:
                            # The next record is extreme-high noise
                            drop_index.append(i + 1)
                elif current_time == previous_time:
                    # Already handled when the previous row was visited
                    continue
                else:
                    # Handle noise when the observation time is not duplicated
                    if current_value < previous_value and current_value < next_value:
                        # Local minimum
                        threshold = current_value * 0.6
                        if (previous_value - current_value) >= threshold or (
                            next_value - current_value
                        ) >= threshold:
                            drop_index.append(i)
                    elif current_value > previous_value and current_value > next_value:
                        # Local maximum
                        if (current_value - previous_value) >= previous_value or (
                            current_value - next_value
                        ) >= next_value:
                            drop_index.append(i)
        data_cleaned = data_cleaned.drop(data_cleaned.index[drop_index])
        return data_cleaned

def main(
    output_dir: str,
    year: str | int = 2024,
    start_date: str = "01-01",
    end_date: str | None = None,
) -> None:
    start_date = f"{year}-{start_date}"
    end_date = f"{year}-{end_date}" if end_date is not None else start_date
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s:%(asctime)s ||| %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(
                os.path.join(output_dir, f"insitu_SuPER_{start_date}_{end_date}.log")
            ),
        ],
    )
    getInsituData(output_dir, start_date, end_date).save_all_insitu_data()
    checkInsituData(output_dir, year, start_date, end_date).check_all_insitu_data()

if __name__ == "__main__":
    output_dir = ".\\data\\vectors\\HUNAN_INSITUS"
    os.makedirs(output_dir, exist_ok=True)
    year = 2024
    start_date = "01-01"
    end_date = "12-31"
    main(output_dir, year, start_date, end_date)
    # start_date = "2024-01-01"
    # end_date = "2024-01-03"
    # target_url = f"http://58.20.42.94:9090/api/core/soil/soildrTM/61400400/{start_date}%2000:00/{end_date}%2000:00"
    # results = get_data_from_url(target_url)
    # print(results)