
"""
湖南省水文站点数据爬取
1. 处理爬取到的站点分布数据
2. 逐站点下载站点数据
3. 数据清洗进行数据解密
4. 检查数据完整性
数据示例:
站点位置信息:
{
"STCD": "61100800",
"STNM": "东安 ",
"ADDVCD": "431122000000000",
"DRNA": 689,
"FRGRD": "2",
"LGTD": 111.288333,
"LTTD": 26.402222,
"RVNM": "紫溪 ",
"STLC": "永州市东安县白牙市镇蒋家村 ",
"STTP": "ZQ",
"HNNM": "湘江 "
},
站点观测数据:
{
"_id": "eac8a911a751d75d6f67e4ba",
"CRPGRWPRD": null,
"CRPTY": null,
"EXKEY": "@",
"HITRSN": null,
"SLM10": 19.9,
"SLM100": null,
"SLM20": 22.1,
"SLM30": null,
"SLM40": 24.7,
"SLM60": null,
"SLM80": null,
"SLMMMT": null,
"SRLSLM": null,
"STCD": "61107700",
"TM": "2024-01-31T12:00:00.000Z",
"VTAVSLM": null,
"AVG": "22.2",
"XDSD": "91.17"
},
"""
import os
import sys
import csv
import glob
import pandas as pd
import requests
import logging
import time
from datetime import datetime, timedelta
# Dynamically resolve the project root directory and add it to sys.path
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(project_root)
class getInsituData:
"""
获取湖南省水文站点数据
"""
def __init__(self, output_dir: str, start_date: str, end_date: str) -> None:
"""
初始化
Parameters
----------
output_dir : str
保存根路径
start_date : str
起始日期, 如 "2024-01-01"
end_date : str
结束日期, 如 "2024-01-31"
"""
self.output_dir = output_dir
self.start_date = start_date
self.end_date = end_date
def get_data_from_url(self, target_url: str) -> list[dict]:
"""
获取湖南省水文站点数据
Parameters
----------
target_url : str
目标URL
Returns
-------
result_list : list[dict]
站点数据列表
"""
        # Browser-like request headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Host": "58.20.42.94:9090",
"Origin": "http://yzt.hnswkcj.com:9090",
"Proxy-Connection": "keep-alive",
"Referer": "http://yzt.hnswkcj.com:9090/",
}
        # Send the request; fall back to an empty list on any request or parsing error
        result_list = []
        try:
            with requests.get(target_url, headers=headers, timeout=10) as response:
                result_list = response.json()["data"]
        except Exception as e:
            logging.warning(f"Request failed for {target_url}: {e}")
            result_list = []
        return result_list
    # Fetch the station list with a POST request
    def get_data_from_post(self) -> list[dict]:
        """
        Fetch the Hunan Province station list from the map endpoint via POST.

        Returns
        -------
        result_list : list[dict]
            List of station data records (empty on failure)
        """
        # Browser-like request headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Host": "58.20.42.94:9090",
"Origin": "http://yzt.hnswkcj.com:9090",
"Proxy-Connection": "keep-alive",
"Referer": "http://yzt.hnswkcj.com:9090/",
}
        # POST request payload (selects soil moisture stations only)
data = {
"sw": {
"isQuery": "true",
"czlb": "水文",
"xzbjz": [],
"ytfl": [],
"czdj": [],
"zwlb": [],
"swjs": [],
},
"swei": {
"isQuery": "false",
"xzbjz": [],
"ytfl": [],
"czdj": [],
"zwlb": [],
"swjs": [],
},
"js": {
"isQuery": "false",
"xzbjz": [],
"ytfl": [],
"czdj": [],
"zwlb": [],
"swjs": [],
},
"zf": {
"isQuery": "false",
"xzbjz": [],
"ytfl": [],
"czdj": [],
"zwlb": [],
"swjs": [],
},
"ns": {
"isQuery": "false",
"xzbjz": [],
"ytfl": [],
"czdj": [],
"zwlb": [],
"swjs": [],
},
"swen": {
"isQuery": "false",
"xzbjz": [],
"ytfl": [],
"czdj": [],
"zwlb": [],
"swjs": [],
},
"sq": {"isQuery": "true", "trlx": []},
"sz": {"isQuery": "false"},
"dxs": {"isQuery": "false"},
"slz": {"isQuery": "false"},
"stll": {"isQuery": "false"},
"rhpwk": {"isQuery": "false"},
"nhjc": {"isQuery": "false"},
"adcd": "43",
}
        # Send the request; pass the nested payload as JSON (form-encoding it with
        # data= would mangle the nested dicts) and fall back to an empty list on error
        target_url = "http://58.20.42.94:9090/api/core/zwnew/zwxxylbMap"
        result_list = []
        try:
            with requests.post(
                target_url, headers=headers, json=data, timeout=10
            ) as response:
                result_list = response.json()["data"]
        except Exception as e:
            logging.warning(f"Request failed for {target_url}: {e}")
            result_list = []
        return result_list
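
    # Note: get_data_from_post() is not called by save_all_insitu_data(); the station
    # list used there comes from get_data_from_url() in get_insitu_list(). A minimal,
    # hypothetical call (the exact shape of the returned records depends on the server)
    # would be:
    #   downloader = getInsituData(output_dir, "2024-01-01", "2024-01-31")
    #   stations = downloader.get_data_from_post()  # [] if the request fails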
def get_insitu_list(self) -> list[dict]:
"""
获取并存储湖南省全省土壤墒情站点信息
Parameters
----------
None
Returns
-------
insitu_list : list[dict]
数据清洗后的站点信息列表
"""
logging.info("正在获取湖南省全省土壤墒情站点信息...")
output_file_path = os.path.join(self.output_dir, "hunan_insitu_position.csv")
if os.path.exists(output_file_path):
logging.info("湖南省土壤墒情站点信息已存在, 正在读取...")
with open(output_file_path, "r", encoding="gbk") as csvfile:
clean_insitu_list = list(csv.DictReader(csvfile))
else:
target_url = (
"http://58.20.42.94:9090/api/core/db/mm_soil/list/0/10000000/qb"
)
insitu_list = self.get_data_from_url(target_url)[0]["data"]
clean_insitu_list = []
for insitu in insitu_list:
insitu["STNM"] = insitu.pop("STNM").strip()
insitu["STCD"] = str(insitu.pop("STCD"))
insitu["ADDVCD"] = str(insitu.pop("ADDVCD"))
insitu["DRNA"] = insitu.pop("DRNA")
insitu["FRGRD"] = insitu.pop("FRGRD")
insitu["lng"] = insitu.pop("LGTD")
insitu["lat"] = insitu.pop("LTTD")
insitu["RVNM"] = insitu.pop("RVNM").strip()
insitu["STLC"] = insitu.pop("STLC").strip()
insitu["STTP"] = insitu.pop("STTP")
insitu["HNNM"] = insitu.pop("HNNM").strip()
clean_insitu_list.append(insitu)
            # Save the cleaned records to a CSV file
            with open(output_file_path, "w", newline="", encoding="gbk") as csvfile:
                # Create a dict-based CSV writer keyed on the cleaned field names
                field_names = clean_insitu_list[0].keys()
                csvwriter = csv.DictWriter(csvfile, fieldnames=field_names)
                # Write the header row
                csvwriter.writeheader()
                # Write the data rows
                for insitu in clean_insitu_list:
                    csvwriter.writerow(insitu)
            logging.info(
                f"Saved {len(clean_insitu_list)} station records to CSV: {output_file_path}."
            )
return clean_insitu_list
def clean_insitu_data(
self, insitu_name: str, insitu_data_list: list[dict], output_path: str
) -> None:
"""
清洗并存储指定站点的观测数据
Parameters
----------
insitu_name : str
站点名称
insitu_data_list : list[dict]
站点数据列表
output_path : str
输出文件路径
"""
clean_insitu_data = []
for data in insitu_data_list:
clean_data = {}
            # Observation timestamps from the API are in UTC
clean_data["TIME"] = (
datetime.strptime(data.pop("TM"), "%Y-%m-%dT%H:%M:%S.%fZ")
).strftime("%Y-%m-%dT%H:%M:%S")
clean_data["SLM10"] = data.pop("SLM10")
clean_data["SLM20"] = data.pop("SLM20")
clean_data["SLM40"] = data.pop("SLM40")
clean_data["AVG"] = data.pop("AVG")
clean_data["XDSD"] = data.pop("XDSD")
clean_insitu_data.append(clean_data)
        with open(output_path, "w", newline="", encoding="gbk") as csvfile:
            field_names = clean_insitu_data[0].keys()
            csvwriter = csv.DictWriter(csvfile, fieldnames=field_names)
            csvwriter.writeheader()
            # Sort by the TIME column before writing
            clean_insitu_data.sort(key=lambda x: x["TIME"])
            csvwriter.writerows(clean_insitu_data)
logging.info(f"{insitu_name} 站土壤墒情数据已保存至:{output_path}")
return
def get_insitu_data(
self,
insitu_name: str,
insitu_id: str,
output_file_path: str,
) -> None:
"""
获取并存储指定站点指定时间范围内的观测数据
网站页面中请求时间不能超过3个月, 否则会被前端拦截报错
但直接交互不会被拦截
Parameters
----------
insitu_name : str
站点名称
insitu_id : str
站点编号
output_file_path : str
输出文件路径
"""
        # Observation timestamps are UTC while the request window is local time;
        # widen the window by one day on each side so records near the range
        # boundaries are not lost
real_start_date = (
datetime.strptime(self.start_date, "%Y-%m-%d") - timedelta(days=1)
).strftime("%Y-%m-%d")
real_end_date = (
datetime.strptime(self.end_date, "%Y-%m-%d") + timedelta(days=1)
).strftime("%Y-%m-%d")
target_url = f"http://58.20.42.94:9090/api/core/soil/soildrTM/{insitu_id}/{real_start_date}%2000:00/{real_end_date}%2023:59"
result_list = self.get_data_from_url(target_url)
if len(result_list) != 0:
self.clean_insitu_data(insitu_name, result_list, output_file_path)
else:
logging.warning(f"暂未查询到 {insitu_name} 站土壤墒情数据.")
return
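
    # Worked example of the one-day widening above (local time in Hunan is UTC+8):
    # a record stamped 2024-01-31T23:00Z corresponds to 2024-02-01 07:00 local time,
    # so a request window ending at 2024-01-31 23:59 local could miss it. Extending
    # the window to 2024-02-01 23:59 keeps it, and checkInsituData.clean_data() later
    # trims anything outside [start_date, end_date] in UTC.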
def save_all_insitu_data(self) -> None:
"""
获取并存储所有站点指定时间范围内的观测数据
"""
start_time = time.time()
insitu_list = self.get_insitu_list()
for insitu in insitu_list:
insitu_id = insitu["STCD"]
insitu_name = insitu["STNM"]
insitu_output_dir = os.path.join(self.output_dir, str(insitu_name))
os.makedirs(insitu_output_dir, exist_ok=True)
output_file_path = os.path.join(
insitu_output_dir,
f"{insitu_name}_{self.start_date}_{self.end_date}.csv",
)
if os.path.exists(output_file_path):
continue
self.get_insitu_data(insitu_name, insitu_id, output_file_path)
total_time = time.time() - start_time
logging.info(f"数据获取完毕. Total time: {total_time} seconds")
class checkInsituData:
"""
检查站点数据完整性
"""
def __init__(self, output_dir: str, year: int, start_date: str, end_date: str) -> None:
self.output_dir = output_dir
self.year = year
self.start_date = start_date
self.end_date = end_date
self.start_dt = datetime.strptime(self.start_date, "%Y-%m-%d")
self.end_dt = datetime.strptime(self.end_date, "%Y-%m-%d")
self.all_insitu_file = os.path.join(output_dir, "hunan_insitu_position.csv")
def check_all_insitu_data(self) -> None:
"""
检查所有站点数据完整性
1. 检查站点是否有数据文件
2. 检查数据时间范围是否覆盖全部日期
3. 检查每日数据是否至少有3次观测
4. 根据检查结果标记数据完整性等级 (LEVEL, 0=无数据, 1=部分完整, 2=完整)
"""
rows = []
with open(self.all_insitu_file, "r", encoding="gbk") as csvfile:
rows = list(csv.DictReader(csvfile))
        # Number of days in the range and the minimum required record count
        day_num = (self.end_dt - self.start_dt).days + 1
        min_data_num = day_num * 3
        # Full date range (used to detect missing dates)
        full_dates = pd.date_range(start=self.start_dt, end=self.end_dt, freq="D")
for row in rows:
insitu_name = row["STNM"]
insitu_files = glob.glob(
os.path.join(
self.output_dir,
str(insitu_name),
f"*_{self.start_date}_{self.end_date}.csv",
)
)
            # New column recording this year's data completeness
            level_field = f"LEVEL_{self.year}"
            row[level_field] = 0  # Default: no data
if len(insitu_files) == 0:
continue
insitu_df = pd.read_csv(insitu_files[0], parse_dates=["TIME"])
cleaned_data = self.clean_data(insitu_df)
            # Save the cleaned data (UTC and local-time versions)
            base_path = insitu_files[0].replace(".csv", "")
            # UTC version
            cleaned_data.to_csv(f"{base_path}_clean_UTC.csv", index=False)
            # Check 1: does the total record count meet the minimum requirement?
if len(cleaned_data) == 0:
continue
            row[level_field] = 1  # Mark as partial data
if len(cleaned_data) < min_data_num:
continue
            # Check 2: is each day complete (at least 3 observations)?
            daily_counts = cleaned_data.set_index("TIME").resample("D").size()
            # Check whether every date is covered and each day has at least 3 observations
            missing_dates = full_dates.difference(daily_counts.index)
            insufficient_days = daily_counts[daily_counts < 3]
            # if missing_dates.empty or insufficient_days.empty:
            if missing_dates.empty:
                row[level_field] = 2  # Mark as complete data
            # Local-time (UTC+8) version
cleaned_data_UTC8 = cleaned_data.copy()
cleaned_data_UTC8["TIME"] = cleaned_data_UTC8["TIME"] + pd.Timedelta(hours=8)
cleaned_data_UTC8.to_csv(f"{base_path}_clean.csv", index=False)
        # Write the updated rows (now including the LEVEL column) back to a new file
output_file = self.all_insitu_file.replace(".csv", "_checked.csv")
with open(output_file, "w", encoding="gbk", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=rows[0].keys())
writer.writeheader()
writer.writerows(rows)
        # Tally the completeness levels
levels = [row[level_field] for row in rows]
nodata_count = levels.count(0)
partial_count = levels.count(1)
complete_count = levels.count(2)
        logging.info(
            f"Completeness check finished for {self.start_date} - {self.end_date}: "
            f"{complete_count} complete stations, {partial_count} partial, {nodata_count} with no data."
        )
return
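
    # For reference, the check above writes hunan_insitu_position_checked.csv: the
    # original station-list columns plus one LEVEL_<year> column. A hypothetical row
    # (fields taken from the sample record in the module docstring, level value
    # illustrative only) might look like:
    #   STNM,STCD,ADDVCD,...,LEVEL_2024
    #   东安,61100800,431122000000000,...,2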
    def clean_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Records fetched directly from the API are obfuscated ("encrypted") with
        filler values, so the data must be cleaned to recover the real observations.
        """
        # Empirically, records at UTC hours 00, 06, 08, 12, 16 and 18 are the real observations
        data["REAL_SLM10"] = data["SLM10"].where(
            data["TIME"].dt.hour.isin([0, 6, 8, 12, 16, 18])
        )
        # Keep only rows where SLM10 equals REAL_SLM10, then drop the helper column
        data_cleaned = data[data["SLM10"] == data["REAL_SLM10"]].drop(
            columns=["REAL_SLM10"]
        )
        # Drop records outside the requested time range
        data_cleaned = data_cleaned[
            (data_cleaned["TIME"] >= self.start_dt)
            & (data_cleaned["TIME"] < self.end_dt + timedelta(days=1))
        ]
        # Handle duplicated timestamps and filter extreme-value noise
        drop_index = []
        for i in range(len(data_cleaned)):
            current_time = data_cleaned.iloc[i]["TIME"]
            current_value = data_cleaned.iloc[i]["SLM10"]
            if i == 0 and len(data_cleaned) > 1:
                # First record: compare against the next value only
                next_value = data_cleaned.iloc[i + 1]["SLM10"]
                if abs(current_value - next_value) > current_value:
                    drop_index.append(i)
            elif i == len(data_cleaned) - 1:
                # Last record: compare against the previous value only
                previous_value = data_cleaned.iloc[i - 1]["SLM10"]
                if abs(current_value - previous_value) > current_value:
                    drop_index.append(i)
            else:
                previous_value = data_cleaned.iloc[i - 1]["SLM10"]
                next_value = data_cleaned.iloc[i + 1]["SLM10"]
                previous_time = data_cleaned.iloc[i - 1]["TIME"]
                next_time = data_cleaned.iloc[i + 1]["TIME"]
                if current_time == next_time:
                    # Decide which record is noise the first time a duplicated timestamp is seen
                    if current_value > next_value:
                        if (current_value - next_value) >= next_value * 0.6:
                            if (previous_value - next_value) >= next_value * 0.6:
                                # The next record is a low-value spike
                                drop_index.append(i + 1)
                            elif (
                                current_value - previous_value
                            ) >= previous_value * 0.6:
                                # The current record is a high-value spike
                                drop_index.append(i)
                            else:
                                # The current record is a high-value spike
                                drop_index.append(i)
                    elif current_value < next_value:
                        if (next_value - current_value) >= current_value * 0.6:
                            if (previous_value - current_value) >= current_value * 0.6:
                                # The current record is a low-value spike
                                drop_index.append(i)
                            elif (next_value - previous_value) >= current_value * 0.6:
                                # The next record is a high-value spike
                                drop_index.append(i + 1)
                            else:
                                # The next record is a high-value spike
                                drop_index.append(i + 1)
                elif current_time == previous_time:
                    # Same timestamp as the previous record: already handled above, skip
                    continue
                else:
                    # Handle noise spikes when timestamps are not duplicated
                    if current_value < previous_value and current_value < next_value:
                        # Local minimum
                        threshold = current_value * 0.6
                        if (previous_value - current_value) >= threshold or (
                            next_value - current_value
                        ) >= threshold:
                            drop_index.append(i)
                    elif current_value > previous_value and current_value > next_value:
                        # Local maximum
                        if (current_value - previous_value) >= previous_value or (
                            current_value - next_value
                        ) >= next_value:
                            drop_index.append(i)
        data_cleaned = data_cleaned.drop(data_cleaned.index[drop_index])
        return data_cleaned
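
# A compact illustration of the noise rules in clean_data() (hypothetical values,
# not real observations):
#   * Local minimum: readings 20.0, 7.0, 21.0 at distinct times give a threshold of
#     7.0 * 0.6 = 4.2, and 20.0 - 7.0 = 13.0 >= 4.2, so the middle record is dropped.
#   * Local maximum: readings 20.0, 45.0, 21.0 give 45.0 - 20.0 = 25.0 >= 20.0 (the
#     previous value), so the middle record is dropped, while 20.0, 30.0, 21.0 is kept
#     because neither jump reaches the size of its neighbouring value.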
def main(
output_dir: str,
year: str | int = 2024,
start_date: str = "01-01",
    end_date: str | None = None,
) -> None:
start_date = f"{year}-{start_date}"
end_date = f"{year}-{end_date}" if end_date is not None else start_date
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s:%(asctime)s ||| %(message)s",
        handlers=[
            logging.StreamHandler(sys.stdout),
            logging.FileHandler(
                os.path.join(output_dir, f"insitu_SuPER_{start_date}_{end_date}.log")
            ),
        ],
    )
getInsituData(output_dir, start_date, end_date).save_all_insitu_data()
checkInsituData(output_dir, year, start_date, end_date).check_all_insitu_data()
if __name__ == "__main__":
output_dir = ".\\data\\vectors\\HUNAN_INSITUS"
os.makedirs(output_dir, exist_ok=True)
year = 2024
start_date = "01-01"
end_date = "12-31"
main(output_dir, year, start_date, end_date)
# start_date = "2024-01-01"
# end_date = "2024-01-03"
# target_url = f"http://58.20.42.94:9090/api/core/soil/soildrTM/61400400/{start_date}%2000:00/{end_date}%2000:00"
# results = get_data_from_url(target_url)
# print(results)