xhong f608e3afab feat(DataV_SuPER): 重构为支持省-市-县三级行政区划解析
- 将单一城市名称参数改为省、市、县三级参数,提升解析准确性
- 实现层级查找策略:全国->省->市->县,并添加参数校验
- 更新输出文件命名规则,使用"省_市"或"省_市_县"格式
- 完善函数文档字符串和错误提示信息
2026-04-13 13:30:34 +08:00

297 lines
9.5 KiB
Python

"""
访问阿里云 DataV 下载的行政区边界数据保存为原始 JSON, 有部分字段与GDAL不兼容, 导致无法
直接读取, 需要先清洗再保存为 GeoJSON.
- 官方地址: https://datav.aliyun.com/portal/school/atlas/area_selector
- Step1: 按"省-市-县名称"解析为行政区划代码 (adcode);
- Step2: 将 DataV 原始数据保存为 `省_市.json` 或 `省_市_县.json` 文件;
- Step3: 移除不兼容 GDAL/GeoPandas 的属性字段 (parent, center, centroid, acroutes) 以及其余非标量属性;
- Step4: 将清洗后的结果写出为 `省_市.geojson` 或 `省_市_县.geojson` 文件.
-------------------------------------------------------------------------------
Authors: CVEO Team
Last Updated: 2026-04-13
===============================================================================
"""
import json
import re
from pathlib import Path
from typing import Optional
import requests
def _validate_region_params(province: str, city: str, county: Optional[str]) -> None:
"""
Validate that province and city are non-empty.
Raises
------
ValueError
If province or city is None or empty string.
"""
if not province or not province.strip():
raise ValueError("province (省) cannot be empty")
if not city or not city.strip():
raise ValueError("city (市) cannot be empty")
def get_datav_json(accode: str) -> dict:
    """
    Download the raw boundary JSON for one DataV code and return it decoded.

    Parameters
    ----------
    accode : str
        Code inserted into the path-style endpoint, e.g. "420100" or
        "420100_full".

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via ``raise_for_status``).
    """
    resp = requests.get(
        f"https://geo.datav.aliyun.com/areas_v3/bound/{accode}.json",
        timeout=15,
    )
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    resp.raise_for_status()
    payload = resp.json()
    return payload
def fetch_and_save_geojson(accode: str, region_name: str, out_dir: Path) -> Path:
    """
    Fetch raw DataV data, save it as ``.json``, then save a cleaned ``.geojson``.

    The raw payload is written unchanged first. The cleaned copy keeps every
    feature but reduces its ``properties`` to scalar values, because the
    nested ones cannot be read by GDAL/GeoPandas.

    Parameters
    ----------
    accode : str
        Administrative division code, e.g. "420100" or "420100_full".
    region_name : str
        Base name of both output files (``{region_name}.json`` and
        ``{region_name}.geojson``).
    out_dir : Path
        Output directory; created (with parents) when missing.

    Returns
    -------
    Path
        Path of the cleaned ``.geojson`` file.
    """
    # Properties that are always removed from the cleaned output.
    dropped_keys = ("parent", "center", "centroid", "acroutes")

    def sanitize_properties(props: Optional[dict]) -> dict:
        # GeoJSON allows "properties": null, so treat a missing/None mapping
        # as empty instead of failing on .items().
        if not props:
            return {}
        return {
            key: value
            for key, value in props.items()
            if key not in dropped_keys
            # Keep scalars only; any other nested structure is discarded.
            and (value is None or isinstance(value, (str, int, float, bool)))
        }

    raw_data = get_datav_json(accode)

    out_dir_path = Path(out_dir)
    out_dir_path.mkdir(parents=True, exist_ok=True)

    # 1) Raw, untouched payload.
    raw_json_path = out_dir_path / f"{region_name}.json"
    with raw_json_path.open("w", encoding="utf-8") as f:
        json.dump(raw_data, f, ensure_ascii=False)

    # 2) Cleaned copy. New dicts are built for the top level and for each
    #    feature, so raw_data is never mutated and no deep copy is needed.
    cleaned = dict(raw_data)
    features = raw_data.get("features")
    if features:
        cleaned["features"] = [
            {**feature, "properties": sanitize_properties(feature.get("properties"))}
            for feature in features
        ]

    # ensure_ascii=False keeps Chinese names readable in the UTF-8 output.
    out_path = out_dir_path / f"{region_name}.geojson"
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(cleaned, f, ensure_ascii=False)
    return out_path
def _normalize_name(name: str) -> str:
name = name.strip()
# 简单去除常见后缀提高匹配鲁棒性
for suffix in ("", "", "地区", "", "自治州", "自治县", "特别行政区"):
if name.endswith(suffix):
name = name[: -len(suffix)]
return name
def _name_matches_exact(target: str, candidate: str) -> bool:
    """Return True when both names are equal after ``_normalize_name``."""
    wanted = _normalize_name(target)
    found = _normalize_name(candidate)
    return wanted == found
def _fetch_sub_features(adcode: str) -> Optional[list]:
    """Download ``{adcode}_full.json`` and return its feature list, or None on failure."""
    url = f"https://geo.datav.aliyun.com/areas_v3/bound/{adcode}_full.json"
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network error, HTTP error status, or a body that is not JSON:
        # the caller treats all of these as "cannot resolve".
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get("features") or []


def _match_adcode(features: list, target: str, levels: tuple) -> Optional[str]:
    """Return the 6-digit adcode of the first feature in *levels* named like *target*."""
    for feature in features:
        props = feature.get("properties") or {}
        if props.get("level") not in levels:
            continue
        code = str(props.get("adcode", ""))
        if not re.fullmatch(r"\d{6}", code):
            continue
        if _name_matches_exact(target, props.get("name") or ""):
            return code
    return None


def resolve_adcode_by_name(
    province: str,
    city: str,
    county: Optional[str] = None,
    prefer_full: bool = False,
) -> Optional[str]:
    """
    Resolve a DataV administrative division code from province/city/county names.

    The lookup walks down level by level: the nationwide dataset yields the
    province code, the province dataset yields the city code, and (only when
    ``county`` is given) the city dataset yields the county code. Province and
    city are mandatory; county is optional.

    Parameters
    ----------
    province : str
        Province name, e.g. "湖北省". Must not be empty.
    city : str
        City name, e.g. "武汉市". Must not be empty.
    county : str, optional
        County/district name, e.g. "江岸区". Empty means "stop at city level".
    prefer_full : bool, optional
        When resolving to city level, return the code with the ``_full``
        suffix (e.g. "420100_full"). Ignored when ``county`` is given.
        Default False.

    Returns
    -------
    str or None
        The code, e.g. "420100" or "420100_full"; None when a name cannot be
        matched or a required dataset cannot be downloaded.

    Raises
    ------
    ValueError
        If province or city is empty.
    """
    _validate_region_params(province, city, county)

    # Step 1: province, from the nationwide dataset.
    national = _fetch_sub_features("100000")
    if national is None:
        return None
    pcode = _match_adcode(national, province, ("province",))
    if not pcode:
        return None

    # Step 2: city, from the province dataset.
    provincial = _fetch_sub_features(pcode)
    ccode = None
    if provincial is not None:
        ccode = _match_adcode(provincial, city, ("city", "district"))
    # Municipality case (province and city share a name, e.g. "北京市"):
    # reuse the province code, but only when no child matched the city name.
    # Checking the children first keeps pairs such as "吉林省"/"吉林市", whose
    # normalized names coincide, from resolving to the province.
    if ccode is None and _name_matches_exact(province, city):
        ccode = pcode
    if not ccode:
        return None

    # Step 3: county, from the city dataset (only when requested).
    if county:
        children = _fetch_sub_features(ccode)
        if children is None:
            return None
        # The level filter also accepts "city" (kept from the previous
        # implementation, which noted that county-level cities may be
        # tagged "city"). A county code is returned as-is, without "_full".
        return _match_adcode(children, county, ("district", "city"))

    return f"{ccode}_full" if prefer_full else ccode
def fetch_and_save_geojson_by_name(
    province: str,
    city: str,
    county: Optional[str],
    out_dir: Path,
    prefer_full: bool = False,
) -> Path:
    """
    Resolve the adcode from region names, then download and save the GeoJSON.

    Output files are named ``{province}_{city}`` or, when a county is given,
    ``{province}_{city}_{county}``.

    Parameters
    ----------
    province : str
        Province name, e.g. "湖北省". Must not be empty.
    city : str
        City name, e.g. "武汉市". Must not be empty.
    county : str, optional
        County/district name, e.g. "江岸区". Empty means city level only.
    out_dir : Path
        Directory that receives the output files.
    prefer_full : bool, optional
        Forwarded to ``resolve_adcode_by_name``. Default False.

    Returns
    -------
    Path
        Path of the saved GeoJSON file.

    Raises
    ------
    ValueError
        If province or city is empty, or no adcode could be resolved.
    """
    adcode = resolve_adcode_by_name(province, city, county, prefer_full=prefer_full)
    if adcode:
        # The county part is appended to the file name only when provided.
        county_part = f"_{county}" if county else ""
        return fetch_and_save_geojson(adcode, f"{province}_{city}{county_part}", out_dir)
    raise ValueError(
        f"无法通过名称解析到行政区划代码: province={province}, city={city}, county={county}"
    )
if __name__ == "__main__":
    # Example 1: city-level boundary only (province + city)
    # out = fetch_and_save_geojson_by_name("湖北省", "武汉市", None, out_dir=Path("./data/vectors/"))

    # Example 2: Shiyan with prefer_full=True (resolves to the "_full" code)
    # out = fetch_and_save_geojson_by_name(
    #     "湖北省", "十堰市", None, out_dir=Path("./data/vectors/"), prefer_full=True
    # )

    # Example 3: county-level boundary (province + city + county).
    # prefer_full is passed through but has no effect once a county is given.
    out = fetch_and_save_geojson_by_name(
        province="湖北省",
        city="十堰市",
        county="郧西县",
        out_dir=Path("./data/vectors/"),
        prefer_full=True,
    )
    print(f"Saved raw JSON and GeoJSON: {out}.")